diff --git a/include/ircd/gpt/model.h b/include/ircd/gpt/model.h
index 59ebaeb42..b196e2d25 100644
--- a/include/ircd/gpt/model.h
+++ b/include/ircd/gpt/model.h
@@ -33,9 +33,6 @@ struct ircd::gpt::model::attn
 	attn_bias alignas(align) [2304],
 	attn_weight alignas(align) [768][2304];
 
-	bool
-	bias alignas(align) [1024][1024];
-
 	float
 	proj_bias alignas(align) [768],
 	proj_weight alignas(align) [768][768];
diff --git a/include/ircd/gpt/pipe/model.h b/include/ircd/gpt/pipe/model.h
index d003150cb..ce62bd671 100644
--- a/include/ircd/gpt/pipe/model.h
+++ b/include/ircd/gpt/pipe/model.h
@@ -60,9 +60,6 @@ struct ircd::gpt::pipe::model::attn
 	fcon,
 	proj;
 
-	cl::data
-	mask;
-
 	attn(cl::data *, const off_t, const gpt::model::norm &, const gpt::model::attn &);
 	attn(cl::data *, const off_t, gpt::model::norm &, gpt::model::attn &);
 };
diff --git a/include/ircd/gpt/token.h b/include/ircd/gpt/token.h
index 01ec99059..1ec4d358c 100644
--- a/include/ircd/gpt/token.h
+++ b/include/ircd/gpt/token.h
@@ -11,12 +11,6 @@
 #pragma once
 #define HAVE_IRCD_GPT_TOKEN_H
 
-struct ircd_gpt_attn_mask
-{
-	bool
-	token[1024];
-};
-
 union ircd_gpt_token
 {
 	float
diff --git a/ircd/gpt_cpu.cc b/ircd/gpt_cpu.cc
index db1b627e9..a338b9369 100644
--- a/ircd/gpt_cpu.cc
+++ b/ircd/gpt_cpu.cc
@@ -28,7 +28,7 @@ namespace ircd::gpt
 	static void norm(f32x4 *, const f32x4 *, const f32x4 *, const f32x4 *, const f32);
 	static void vals(float (&)[12][1024][64], const float (&)[12][1024][1024], const float (&)[3][1024][12][64], const size_t);
 	static void pare(float (&)[12][1024][1024], const float (&)[3][1024][12][64], const size_t);
-	static void mask(float (&)[12][1024][1024], const float (&)[12][1024][1024], const bool (&)[1024][1024], const size_t);
+	static void mask(float (&)[12][1024][1024], const float (&)[12][1024][1024], const size_t);
 	static void smax(float (&)[12][1024][1024], const float (&)[12][1024][1024], const size_t);
 	static void attn(float (&)[3][1024][12][64], const float *const, const size_t, const model::decoder &, const uint layer);
 	static void ffnn(float *, const float *, const model::decoder &, const uint layer);
@@ -173,7 +173,7 @@ ircd::gpt::coil(float *__restrict__ accum,
 
 		attn(qkv, accum, tokens, decoder, i);
 		pare(state, qkv, tokens);
-		mask(state, state, layer.attn.bias, tokens);
+		mask(state, state, tokens);
 		smax(state, state, tokens);
 		vals(attns, state, qkv, tokens);
 
@@ -287,7 +287,6 @@ ircd::gpt::pare(float (&__restrict__ out)[12][1024][1024],
 void
 ircd::gpt::mask(float (&__restrict__ out)[12][1024][1024],
                 const float (&__restrict__ in)[12][1024][1024],
-                const bool (&__restrict__ bias)[1024][1024],
                 const size_t num)
 {
 	static const float masked
@@ -299,7 +298,7 @@ ircd::gpt::mask(float (&__restrict__ out)[12][1024][1024],
 	for(uint j(0); j < 12; ++j)
 		for(uint k(0); k < num; ++k)
 			for(uint l(0); l < num; ++l)
-				out[j][k][l] = bias[k][l]? in[j][k][l]: masked;
+				out[j][k][l] = (k < l)? masked: in[j][k][l];
 }
 
 void
diff --git a/ircd/gpt_gpu.cl b/ircd/gpt_gpu.cl
index e97dc466f..fedcd0afb 100644
--- a/ircd/gpt_gpu.cl
+++ b/ircd/gpt_gpu.cl
@@ -197,8 +197,7 @@ ircd_gpt_attn_self(__global const struct ircd_gpt_ctrl *const ctrl,
                    __constant const struct ircd_gpt_opts *const opts,
                    __local union ircd_gpt_tokenv *const restrict out,
                    __local float self[][12],
-                   __global const struct ircd_gpt_attn_qkvv *const restrict token,
-                   __global const struct ircd_gpt_attn_mask *const restrict mask) // [1024][1024],
+                   __global const struct ircd_gpt_attn_qkvv *const restrict token)
 {
 	const uint
 	gi = get_global_id(0),
@@ -215,7 +214,8 @@ ircd_gpt_attn_self(__global const struct ircd_gpt_ctrl *const ctrl,
 	{
 		for(uint i = 0; i < wn; ++i)
 		{
-			if(!mask[wi].token[i])
+			// Left-attention mask
+			if(wi < i)
 			{
 				self[i][li] = -10000.0f;
 				continue;
 			}
@@ -288,7 +288,6 @@ ircd_gpt_coil(__global const struct ircd_gpt_ctrl *const ctrl,
               __constant const struct ircd_gpt_opts *const opts,
               __global union ircd_gpt_tokenv *const restrict accum,
               __global const struct ircd_gpt_attn_qkvv *const restrict state,
-              __global const struct ircd_gpt_attn_mask *const restrict mask, // [1024][1024],
               __global const float4 *const restrict attn_proj_bias,
               __global const float4 *const restrict attn_proj_weight,
               __global const float4 *const restrict ffnn_norm_bias,
@@ -324,8 +323,7 @@ ircd_gpt_coil(__global const struct ircd_gpt_ctrl *const ctrl,
 		opts,
 		&buf1,
 		buf.attn_self,
-		state,
-		mask
+		state
 	);
 
 	barrier(CLK_LOCAL_MEM_FENCE);
diff --git a/ircd/gpt_model.cc b/ircd/gpt_model.cc
index 9042022e5..d8d7f9038 100644
--- a/ircd/gpt_model.cc
+++ b/ircd/gpt_model.cc
@@ -29,8 +29,7 @@ namespace ircd::gpt::model
 	init_h_attn_attn_weight(decoder &, const string_view &, const size_t &, const json::array &),
 	init_h_attn_attn_bias(decoder &, const string_view &, const size_t &, const json::array &),
 	init_h_attn_proj_weight(decoder &, const string_view &, const size_t &, const json::array &),
-	init_h_attn_proj_bias(decoder &, const string_view &, const size_t &, const json::array &),
-	init_h_attn_bias(decoder &, const string_view &, const size_t &, const json::array &);
+	init_h_attn_proj_bias(decoder &, const string_view &, const size_t &, const json::array &);
 
 	static bool init_dataset(const string_view &);
 	static bool init_from_cache(const string_view &);
@@ -74,7 +73,6 @@ ircd::gpt::model::manifest_h
 	{ "h.%u.attn.c_attn.bias.json", init_h_attn_attn_bias, },
 	{ "h.%u.attn.c_proj.weight.json", init_h_attn_proj_weight, },
 	{ "h.%u.attn.c_proj.bias.json", init_h_attn_proj_bias },
-	{ "h.%u.attn.bias.json", init_h_attn_bias, },
 };
 
 decltype(ircd::gpt::model::manifest)
@@ -243,7 +241,7 @@ ircd::gpt::model::init_from_json(const string_view &cache_path,
 	// Load the transformer files by layer
 	const size_t layers {12};
 	for(size_t i(0); i < layers; ++i)
-		for(size_t j(0); j < 13; ++j)
+		for(size_t j(0); j < 12; ++j)
 			init_from_json_handle(*decoder, manifest_h[j], i);
 
 	const const_buffer src
@@ -645,35 +643,3 @@ ircd::gpt::model::init_h_attn_proj_bias(decoder &d,
 
 	always_assert(i == sizeof(d.layer[layer].attn.proj_bias) / sizeof(float));
 }
-
-void
-ircd::gpt::model::init_h_attn_bias(decoder &d,
-                                   const string_view &name,
-                                   const size_t &layer,
-                                   const json::array &mat)
-{
-	for(const json::array dim0 : mat)
-	{
-		for(const json::array dim1 : dim0)
-		{
-			size_t k(0);
-			for(const json::array dim2 : dim1)
-			{
-				size_t l(0);
-				for(const auto &elem : dim2)
-				{
-					always_assert(elem == "1.0" || elem == "0.0");
-					d.layer[layer].attn.bias[k][l++] = startswith(elem, '1');
-				}
-
-				++k;
-			}
-
-			always_assert
-			(
-				k == sizeof(d.layer[layer].attn.bias)
-				/ sizeof(d.layer[layer].attn.bias[0])
-			);
-		}
-	}
-}
diff --git a/ircd/gpt_pipe.cc b/ircd/gpt_pipe.cc
index bea19a88f..87ceceb25 100644
--- a/ircd/gpt_pipe.cc
+++ b/ircd/gpt_pipe.cc
@@ -609,7 +609,6 @@ ircd::gpt::pipe::desc::layer::layer(pipe::desc &desc,
 		desc.opts,
 		desc.accum,
 		desc.state,
-		desc.model->decode->block[laynum].attn.mask,
 		desc.model->decode->block[laynum].attn.proj.bias.param,
 		desc.model->decode->block[laynum].attn.proj.weight.param,
 		desc.model->decode->block[laynum].ffnn.norm.bias.param,
@@ -1120,18 +1119,10 @@ ircd::gpt::pipe::model::attn::attn(cl::data *const master,
 ,proj
 {
 	master,
-	offset + off_t(sizeof(norm) + sizeof(attn.attn_bias) + sizeof(attn.attn_weight) + sizeof(attn.bias)),
+	offset + off_t(sizeof(norm) + sizeof(attn.attn_bias) + sizeof(attn.attn_weight)),
 	mutable_buffer{attn.proj_bias},
 	mutable_buffer{attn.proj_weight},
 }
-,mask
-{
-	master[0],
-	{
-		sizeof(attn.bias),
-		offset + off_t(sizeof(norm) + sizeof(attn.attn_bias) + sizeof(attn.attn_weight)),
-	},
-}
 {
 	always_assert
 	(
@@ -1139,7 +1130,6 @@
 		==
 		ircd::data(const_buffer{norm.bias}) +
 		sizeof(norm) +
-		sizeof(attn.bias) +
 		sizeof(attn.attn_bias) +
 		sizeof(attn.attn_weight) +
 		ircd::size(const_buffer{attn.proj_bias})
@@ -1167,18 +1157,10 @@ ircd::gpt::pipe::model::attn::attn(cl::data *const master,
 ,proj
 {
 	master,
-	offset + off_t(sizeof(norm) + sizeof(attn.attn_bias) + sizeof(attn.attn_weight) + sizeof(attn.bias)),
+	offset + off_t(sizeof(norm) + sizeof(attn.attn_bias) + sizeof(attn.attn_weight)),
 	const_buffer{attn.proj_bias},
 	const_buffer{attn.proj_weight},
 }
-,mask
-{
-	master[0],
-	{
-		sizeof(attn.bias),
-		offset + off_t(sizeof(norm) + sizeof(attn.attn_bias) + sizeof(attn.attn_weight)),
-	},
-}
 {
 	always_assert
 	(
@@ -1186,7 +1168,6 @@
 		==
 		ircd::data(const_buffer{norm.bias}) +
 		sizeof(norm) +
-		sizeof(attn.bias) +
 		sizeof(attn.attn_bias) +
 		sizeof(attn.attn_weight) +
 		ircd::size(const_buffer{attn.proj_bias})
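
Note on the change (not part of the patch): the deleted attn.bias tensor is GPT-2's constant lower-triangular causal mask, so whether position k may attend to position l depends only on the two indices. That is why every site above can drop the stored [1024][1024] table and test the indices directly, as the new kernels do with the left-attention check. The standalone C++ sketch below illustrates that equivalence; the 16-token context size is a hypothetical stand-in for the model's 1024.

#include <cassert>
#include <cstddef>

int main()
{
	// Hypothetical small context window standing in for the real 1024 tokens.
	constexpr std::size_t ctx {16};

	// What the removed h.%u.attn.bias.json loader used to materialize:
	// a lower-triangular table, true where a token may attend (self and past).
	static bool bias[ctx][ctx];
	for(std::size_t k(0); k < ctx; ++k)
		for(std::size_t l(0); l < ctx; ++l)
			bias[k][l] = l <= k;

	// The patch replaces the table lookup with the same test on the indices.
	static const float masked {-10000.0f};
	for(std::size_t k(0); k < ctx; ++k)
		for(std::size_t l(0); l < ctx; ++l)
		{
			const float score {1.0f};                          // placeholder attention score
			const float via_table {bias[k][l]? score: masked}; // old: stored mask
			const float via_index {(k < l)? masked: score};    // new: computed mask
			assert(via_table == via_index);
		}

	return 0;
}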