diff --git a/include/ircd/gpt/pipe/desc.h b/include/ircd/gpt/pipe/desc.h index 1fd13b3ad..9c68ea9ce 100644 --- a/include/ircd/gpt/pipe/desc.h +++ b/include/ircd/gpt/pipe/desc.h @@ -20,14 +20,14 @@ struct ircd::gpt::pipe::desc pipe::code *code; cl::data - master, - state, // qry/key/val projection (tokens * embed * 3 * float) - accum, // accumulator (tokens * embed * float) - logit, // result logit vector (50257 * float) - logexp, // outputs distribution (50257 * float) - logsm, // outputs distribution (50257 * float) - ctrl, // control page - opts; // options page + state, // [root] projection (layers * tokens * embed * 3 * float) + master, // [root] single allocation for additional buffers: + accum, // [-sub] accumulator (tokens * embed * float) + logit, // [-sub] result logit vector (50257 * float) + logexp, // [-sub] outputs distribution (50257 * float) + logsm, // [-sub] outputs distribution (50257 * float) + ctrl, // [root] control page + opts; // [root] options page cl::kern lm_embed, @@ -46,6 +46,9 @@ struct ircd::gpt::pipe::desc struct ircd::gpt::pipe::desc::layer { + cl::data + state; // [-sub] qry/key/val projection (tokens * embed * 3 * float) + cl::kern negative, positive, diff --git a/include/ircd/gpt/pipe/exec.h b/include/ircd/gpt/pipe/exec.h index c8c4ccf3c..3cec74cb8 100644 --- a/include/ircd/gpt/pipe/exec.h +++ b/include/ircd/gpt/pipe/exec.h @@ -37,6 +37,8 @@ struct ircd::gpt::pipe::exec recv_ctrl; // Set when receiving the control page. cl::kern::range + range_full, + range_last, range_lm_embed, // Dimension range of the lm_embed kernel. range_negative, // Dimension range of a layer kernel. range_positive, // Dimension range of a layer kernel. diff --git a/include/ircd/gpt/task/opts.h b/include/ircd/gpt/task/opts.h index a6fef5c73..bff1872ad 100644 --- a/include/ircd/gpt/task/opts.h +++ b/include/ircd/gpt/task/opts.h @@ -51,6 +51,9 @@ struct ircd_gpt_opts /// Embedding vector elements uint embed_elems; + /// Cross-attention dimension + uint attn_rank; + /// Attention unit fcon width multiple uint attn_mult; diff --git a/ircd/gpt.cc b/ircd/gpt.cc index 9c67d0dd2..64c844b85 100644 --- a/ircd/gpt.cc +++ b/ircd/gpt.cc @@ -317,6 +317,10 @@ noexcept { 768U } +,attn_rank +{ + 12U +} ,attn_mult { 3U diff --git a/ircd/gpt_gpu.cl b/ircd/gpt_gpu.cl index 2ae09489b..07a4aafd2 100644 --- a/ircd/gpt_gpu.cl +++ b/ircd/gpt_gpu.cl @@ -159,11 +159,13 @@ inline void __attribute__((flatten, always_inline)) ircd_gpt_attn_self_samax(__global const struct ircd_gpt_ctrl *const ctrl, __constant const struct ircd_gpt_opts *const opts, - __local float self[][12]) + __local float self[][12], + const uint wn) { const uint + gn = get_global_size(0), li = get_local_id(0), - wn = get_num_groups(0); + ln = get_local_size(0); struct ircd_math_samax samax = { @@ -201,14 +203,16 @@ ircd_gpt_attn_self(__global const struct ircd_gpt_ctrl *const ctrl, gn = get_global_size(0), li = get_local_id(0), ln = get_local_size(0), - wi = get_group_id(0), - wn = get_num_groups(0), - ti = li % 12, - ki = li / 12; + wi = get_global_offset(0) / ln + get_group_id(0), + wn = ctrl->tokens.count, + ti = li % opts->attn_rank, + ki = li / opts->attn_rank, + kn = ln / opts->attn_rank; // Low-rank mask - if(li < 12) + if(li < opts->attn_rank) { + // For each token for(uint i = 0; i < wn; ++i) { // Left-attention mask @@ -219,7 +223,8 @@ ircd_gpt_attn_self(__global const struct ircd_gpt_ctrl *const ctrl, } float4 acc = 0.0f; - for(uint k = 0; k < 64/4; ++k) + __attribute__((opencl_unroll_hint)) + for(uint k = 0; k < kn; ++k) { float4 qry = token[wi].qry.attn[li][k], @@ -236,7 +241,7 @@ ircd_gpt_attn_self(__global const struct ircd_gpt_ctrl *const ctrl, } // Three-piece softmax - ircd_gpt_attn_self_samax(ctrl, opts, self); + ircd_gpt_attn_self_samax(ctrl, opts, self, wn); } // Propagate to full width for value dot prod. @@ -244,7 +249,7 @@ ircd_gpt_attn_self(__global const struct ircd_gpt_ctrl *const ctrl, float4 acc = 0.0f; __attribute__((opencl_unroll_hint)) - for(uint i = 0; i < wn; ++i) + for(uint i = 0; i < wi; ++i) { const float4 attn = self[i][ti], @@ -420,7 +425,7 @@ inline void __attribute__((always_inline)) _ircd_gpt_lm_embed(__global const struct ircd_gpt_ctrl *const ctrl, __constant const struct ircd_gpt_opts *const opts, - __global union ircd_gpt_tokenv *const restrict out, + __global union ircd_gpt_tokenv *const restrict accum, __global const union ircd_gpt_tokenv *const restrict pos, __global const union ircd_gpt_tokenv *const restrict vocab, const uint out_idx, @@ -435,7 +440,7 @@ _ircd_gpt_lm_embed(__global const struct ircd_gpt_ctrl *const ctrl, wte = vocab[token].word[word_idx], wpe = pos[tok_idx].word[word_idx]; - out[out_idx].word[word_idx] = wte + wpe; + accum[out_idx].word[word_idx] = wte + wpe; } __kernel void @@ -451,9 +456,7 @@ ircd_gpt_lm_embed(__global const struct ircd_gpt_ctrl *const ctrl, ln = get_local_size(0), wi = get_global_offset(0) / ln + get_group_id(0); - for(uint i = 0; i < ctrl->tokens.count; ++i) - if(i % wn == wi) - _ircd_gpt_lm_embed(ctrl, opts, accum, pos, vocab, i, i, li); + _ircd_gpt_lm_embed(ctrl, opts, accum, pos, vocab, wi, wi, li); } __kernel void diff --git a/ircd/gpt_pipe.cc b/ircd/gpt_pipe.cc index 6302da489..392bb9b5a 100644 --- a/ircd/gpt_pipe.cc +++ b/ircd/gpt_pipe.cc @@ -224,49 +224,71 @@ ircd::gpt::pipe::exec::exec(task &task, ,send_opts { reinterpret_cast(task.opts), - release? sizeof(gpt::opts): 0 + release? + sizeof(gpt::opts): + 0 } ,send_ctrl { reinterpret_cast(task.ctrl), - release? sizeof(gpt::ctrl): 0 + release? + sizeof(gpt::ctrl): + 0 } ,send_coil { reinterpret_cast(gpt::model::default_model), - release && desc->model->invalid? (sizeof(gpt::model::block) * 12 + sizeof(gpt::model::norm)): 0 + release && desc->model->invalid? + (sizeof(gpt::model::block) * 12 + sizeof(gpt::model::norm)): + 0 } ,send_head { reinterpret_cast(&gpt::model::default_model->word), - release && desc->model->invalid? sizeof(gpt::model::embed): 0 + release && desc->model->invalid? + sizeof(gpt::model::embed): + 0 } ,recv_ctrl { reinterpret_cast(task.ctrl), - acquire? sizeof(gpt::ctrl): 0 + acquire? + sizeof(gpt::ctrl): + 0 } -,range_lm_embed -{ - { std::min(tokens, 12UL) * 192UL, 0, }, - { 192UL, 0, }, -} -,range_negative +,range_full { { tokens * 192UL, 0, }, { 192UL, 0, }, } -,range_positive -{ - { tokens * 192UL, 0, }, - { 192UL, 0, }, -} -,range_lm_norm +,range_last { { 1 * 192UL, 0 }, { 192UL, 0 }, { (tokens - 1) * 192UL, 0 }, } +,range_lm_embed +{ + release? + range_full: + range_last +} +,range_negative +{ + release? + range_full: + range_last +} +,range_positive +{ + release? + range_full: + range_last +} +,range_lm_norm +{ + range_last +} ,range_lm_logit { { 786 * 64UL, 0 }, // align_up(50257) / 64 @@ -453,30 +475,27 @@ ircd::gpt::pipe::desc::desc(pipe::code &code, { &code } +,state +{ + 0 + + 12 * 512 * 3 * 768 * sizeof(float), + mutable_buffer{}, +} ,master { 0 - + 512 * 3 * 768 * sizeof(float) + 512 * 768 * sizeof(float) + 65536 * sizeof(float) + 65536 * sizeof(float) + 65536 * sizeof(float) ,mutable_buffer{} } -,state -{ - master, - { - 512 * 3 * 768 * sizeof(float), - off_t(0), - }, -} ,accum { master, { 512 * 768 * sizeof(float), - state.offset() + off_t(state.size()), + off_t(0), }, } ,logit @@ -613,13 +632,21 @@ ircd::gpt::pipe::desc::desc(pipe::code &code, ircd::gpt::pipe::desc::layer::layer(pipe::desc &desc, const int laynum) -:negative +:state +{ + desc.state, + { + 512 * 3 * 768 * sizeof(float), + laynum * 512 * 3 * 768 * sizeof(float), + } +} +,negative { *desc.code, "ircd_gpt_attn_fcon", desc.ctrl, desc.opts, - desc.state, + state, desc.accum, desc.model->decode->block[laynum].attn.norm.bias.param, desc.model->decode->block[laynum].attn.norm.weight.param, @@ -633,7 +660,7 @@ ircd::gpt::pipe::desc::layer::layer(pipe::desc &desc, desc.ctrl, desc.opts, desc.accum, - desc.state, + state, desc.model->decode->block[laynum].attn.proj.bias.param, desc.model->decode->block[laynum].attn.proj.weight.param, desc.model->decode->block[laynum].ffnn.norm.bias.param,