mirror of
https://github.com/matrix-construct/construct
synced 2024-11-16 15:00:51 +01:00
ircd::gpt::model: Optimize left-attention mask.
This commit is contained in:
parent
9ca95591ff
commit
ce9abfb321
7 changed files with 11 additions and 79 deletions
|
@ -33,9 +33,6 @@ struct ircd::gpt::model::attn
|
|||
attn_bias alignas(align) [2304],
|
||||
attn_weight alignas(align) [768][2304];
|
||||
|
||||
bool
|
||||
bias alignas(align) [1024][1024];
|
||||
|
||||
float
|
||||
proj_bias alignas(align) [768],
|
||||
proj_weight alignas(align) [768][768];
|
||||
|
|
|
@ -60,9 +60,6 @@ struct ircd::gpt::pipe::model::attn
|
|||
fcon,
|
||||
proj;
|
||||
|
||||
cl::data
|
||||
mask;
|
||||
|
||||
attn(cl::data *, const off_t, const gpt::model::norm &, const gpt::model::attn &);
|
||||
attn(cl::data *, const off_t, gpt::model::norm &, gpt::model::attn &);
|
||||
};
|
||||
|
|
|
@ -11,12 +11,6 @@
|
|||
#pragma once
|
||||
#define HAVE_IRCD_GPT_TOKEN_H
|
||||
|
||||
struct ircd_gpt_attn_mask
|
||||
{
|
||||
bool
|
||||
token[1024];
|
||||
};
|
||||
|
||||
union ircd_gpt_token
|
||||
{
|
||||
float
|
||||
|
|
|
@ -28,7 +28,7 @@ namespace ircd::gpt
|
|||
static void norm(f32x4 *, const f32x4 *, const f32x4 *, const f32x4 *, const f32);
|
||||
static void vals(float (&)[12][1024][64], const float (&)[12][1024][1024], const float (&)[3][1024][12][64], const size_t);
|
||||
static void pare(float (&)[12][1024][1024], const float (&)[3][1024][12][64], const size_t);
|
||||
static void mask(float (&)[12][1024][1024], const float (&)[12][1024][1024], const bool (&)[1024][1024], const size_t);
|
||||
static void mask(float (&)[12][1024][1024], const float (&)[12][1024][1024], const size_t);
|
||||
static void smax(float (&)[12][1024][1024], const float (&)[12][1024][1024], const size_t);
|
||||
static void attn(float (&)[3][1024][12][64], const float *const, const size_t, const model::decoder &, const uint layer);
|
||||
static void ffnn(float *, const float *, const model::decoder &, const uint layer);
|
||||
|
@ -173,7 +173,7 @@ ircd::gpt::coil(float *__restrict__ accum,
|
|||
|
||||
attn(qkv, accum, tokens, decoder, i);
|
||||
pare(state, qkv, tokens);
|
||||
mask(state, state, layer.attn.bias, tokens);
|
||||
mask(state, state, tokens);
|
||||
smax(state, state, tokens);
|
||||
vals(attns, state, qkv, tokens);
|
||||
|
||||
|
@ -287,7 +287,6 @@ ircd::gpt::pare(float (&__restrict__ out)[12][1024][1024],
|
|||
void
|
||||
ircd::gpt::mask(float (&__restrict__ out)[12][1024][1024],
|
||||
const float (&__restrict__ in)[12][1024][1024],
|
||||
const bool (&__restrict__ bias)[1024][1024],
|
||||
const size_t num)
|
||||
{
|
||||
static const float masked
|
||||
|
@ -299,7 +298,7 @@ ircd::gpt::mask(float (&__restrict__ out)[12][1024][1024],
|
|||
for(uint j(0); j < 12; ++j)
|
||||
for(uint k(0); k < num; ++k)
|
||||
for(uint l(0); l < num; ++l)
|
||||
out[j][k][l] = bias[k][l]? in[j][k][l]: masked;
|
||||
out[j][k][l] = (k < l)? in[j][k][l]: masked;
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
@ -197,8 +197,7 @@ ircd_gpt_attn_self(__global const struct ircd_gpt_ctrl *const ctrl,
|
|||
__constant const struct ircd_gpt_opts *const opts,
|
||||
__local union ircd_gpt_tokenv *const restrict out,
|
||||
__local float self[][12],
|
||||
__global const struct ircd_gpt_attn_qkvv *const restrict token,
|
||||
__global const struct ircd_gpt_attn_mask *const restrict mask) // [1024][1024],
|
||||
__global const struct ircd_gpt_attn_qkvv *const restrict token)
|
||||
{
|
||||
const uint
|
||||
gi = get_global_id(0),
|
||||
|
@ -215,7 +214,8 @@ ircd_gpt_attn_self(__global const struct ircd_gpt_ctrl *const ctrl,
|
|||
{
|
||||
for(uint i = 0; i < wn; ++i)
|
||||
{
|
||||
if(!mask[wi].token[i])
|
||||
// Left-attention mask
|
||||
if(wi < i)
|
||||
{
|
||||
self[i][li] = -10000.0f;
|
||||
continue;
|
||||
|
@ -288,7 +288,6 @@ ircd_gpt_coil(__global const struct ircd_gpt_ctrl *const ctrl,
|
|||
__constant const struct ircd_gpt_opts *const opts,
|
||||
__global union ircd_gpt_tokenv *const restrict accum,
|
||||
__global const struct ircd_gpt_attn_qkvv *const restrict state,
|
||||
__global const struct ircd_gpt_attn_mask *const restrict mask, // [1024][1024],
|
||||
__global const float4 *const restrict attn_proj_bias,
|
||||
__global const float4 *const restrict attn_proj_weight,
|
||||
__global const float4 *const restrict ffnn_norm_bias,
|
||||
|
@ -324,8 +323,7 @@ ircd_gpt_coil(__global const struct ircd_gpt_ctrl *const ctrl,
|
|||
opts,
|
||||
&buf1,
|
||||
buf.attn_self,
|
||||
state,
|
||||
mask
|
||||
state
|
||||
);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
|
|
@ -29,8 +29,7 @@ namespace ircd::gpt::model
|
|||
init_h_attn_attn_weight(decoder &, const string_view &, const size_t &, const json::array &),
|
||||
init_h_attn_attn_bias(decoder &, const string_view &, const size_t &, const json::array &),
|
||||
init_h_attn_proj_weight(decoder &, const string_view &, const size_t &, const json::array &),
|
||||
init_h_attn_proj_bias(decoder &, const string_view &, const size_t &, const json::array &),
|
||||
init_h_attn_bias(decoder &, const string_view &, const size_t &, const json::array &);
|
||||
init_h_attn_proj_bias(decoder &, const string_view &, const size_t &, const json::array &);
|
||||
|
||||
static bool init_dataset(const string_view &);
|
||||
static bool init_from_cache(const string_view &);
|
||||
|
@ -74,7 +73,6 @@ ircd::gpt::model::manifest_h
|
|||
{ "h.%u.attn.c_attn.bias.json", init_h_attn_attn_bias, },
|
||||
{ "h.%u.attn.c_proj.weight.json", init_h_attn_proj_weight, },
|
||||
{ "h.%u.attn.c_proj.bias.json", init_h_attn_proj_bias },
|
||||
{ "h.%u.attn.bias.json", init_h_attn_bias, },
|
||||
};
|
||||
|
||||
decltype(ircd::gpt::model::manifest)
|
||||
|
@ -243,7 +241,7 @@ ircd::gpt::model::init_from_json(const string_view &cache_path,
|
|||
// Load the transformer files by layer
|
||||
const size_t layers {12};
|
||||
for(size_t i(0); i < layers; ++i)
|
||||
for(size_t j(0); j < 13; ++j)
|
||||
for(size_t j(0); j < 12; ++j)
|
||||
init_from_json_handle(*decoder, manifest_h[j], i);
|
||||
|
||||
const const_buffer src
|
||||
|
@ -645,35 +643,3 @@ ircd::gpt::model::init_h_attn_proj_bias(decoder &d,
|
|||
|
||||
always_assert(i == sizeof(d.layer[layer].attn.proj_bias) / sizeof(float));
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::model::init_h_attn_bias(decoder &d,
|
||||
const string_view &name,
|
||||
const size_t &layer,
|
||||
const json::array &mat)
|
||||
{
|
||||
for(const json::array dim0 : mat)
|
||||
{
|
||||
for(const json::array dim1 : dim0)
|
||||
{
|
||||
size_t k(0);
|
||||
for(const json::array dim2 : dim1)
|
||||
{
|
||||
size_t l(0);
|
||||
for(const auto &elem : dim2)
|
||||
{
|
||||
always_assert(elem == "1.0" || elem == "0.0");
|
||||
d.layer[layer].attn.bias[k][l++] = startswith(elem, '1');
|
||||
}
|
||||
|
||||
++k;
|
||||
}
|
||||
|
||||
always_assert
|
||||
(
|
||||
k == sizeof(d.layer[layer].attn.bias)
|
||||
/ sizeof(d.layer[layer].attn.bias[0])
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -609,7 +609,6 @@ ircd::gpt::pipe::desc::layer::layer(pipe::desc &desc,
|
|||
desc.opts,
|
||||
desc.accum,
|
||||
desc.state,
|
||||
desc.model->decode->block[laynum].attn.mask,
|
||||
desc.model->decode->block[laynum].attn.proj.bias.param,
|
||||
desc.model->decode->block[laynum].attn.proj.weight.param,
|
||||
desc.model->decode->block[laynum].ffnn.norm.bias.param,
|
||||
|
@ -1120,18 +1119,10 @@ ircd::gpt::pipe::model::attn::attn(cl::data *const master,
|
|||
,proj
|
||||
{
|
||||
master,
|
||||
offset + off_t(sizeof(norm) + sizeof(attn.attn_bias) + sizeof(attn.attn_weight) + sizeof(attn.bias)),
|
||||
offset + off_t(sizeof(norm) + sizeof(attn.attn_bias) + sizeof(attn.attn_weight)),
|
||||
mutable_buffer{attn.proj_bias},
|
||||
mutable_buffer{attn.proj_weight},
|
||||
}
|
||||
,mask
|
||||
{
|
||||
master[0],
|
||||
{
|
||||
sizeof(attn.bias),
|
||||
offset + off_t(sizeof(norm) + sizeof(attn.attn_bias) + sizeof(attn.attn_weight)),
|
||||
},
|
||||
}
|
||||
{
|
||||
always_assert
|
||||
(
|
||||
|
@ -1139,7 +1130,6 @@ ircd::gpt::pipe::model::attn::attn(cl::data *const master,
|
|||
==
|
||||
ircd::data(const_buffer{norm.bias}) +
|
||||
sizeof(norm) +
|
||||
sizeof(attn.bias) +
|
||||
sizeof(attn.attn_bias) +
|
||||
sizeof(attn.attn_weight) +
|
||||
ircd::size(const_buffer{attn.proj_bias})
|
||||
|
@ -1167,18 +1157,10 @@ ircd::gpt::pipe::model::attn::attn(cl::data *const master,
|
|||
,proj
|
||||
{
|
||||
master,
|
||||
offset + off_t(sizeof(norm) + sizeof(attn.attn_bias) + sizeof(attn.attn_weight) + sizeof(attn.bias)),
|
||||
offset + off_t(sizeof(norm) + sizeof(attn.attn_bias) + sizeof(attn.attn_weight)),
|
||||
const_buffer{attn.proj_bias},
|
||||
const_buffer{attn.proj_weight},
|
||||
}
|
||||
,mask
|
||||
{
|
||||
master[0],
|
||||
{
|
||||
sizeof(attn.bias),
|
||||
offset + off_t(sizeof(norm) + sizeof(attn.attn_bias) + sizeof(attn.attn_weight)),
|
||||
},
|
||||
}
|
||||
{
|
||||
always_assert
|
||||
(
|
||||
|
@ -1186,7 +1168,6 @@ ircd::gpt::pipe::model::attn::attn(cl::data *const master,
|
|||
==
|
||||
ircd::data(const_buffer{norm.bias}) +
|
||||
sizeof(norm) +
|
||||
sizeof(attn.bias) +
|
||||
sizeof(attn.attn_bias) +
|
||||
sizeof(attn.attn_weight) +
|
||||
ircd::size(const_buffer{attn.proj_bias})
|
||||
|
|
Loading…
Reference in a new issue