0
0
Fork 0
mirror of https://github.com/matrix-construct/construct synced 2024-06-20 10:58:20 +02:00

ircd::gpt: Transformer Pipe.

This commit is contained in:
Jason Volk 2021-03-29 18:22:42 -07:00
parent 5e52f6b97b
commit 29e74ec9e1
7 changed files with 1526 additions and 0 deletions

1
.gitignore vendored
View file

@@ -4,6 +4,7 @@ Makefile
*.o
*.so
*.lo
*.clo
*.la
*.orig
*.log

View file

@@ -0,0 +1,103 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_PIPE_CTRL_H
// Host<->device control block; exactly one page-aligned 4 KiB page is
// uploaded before a pipeline pass and read back after (see pipe::exec).
struct ctor_ctrl
{
// Call status: host sets -1 to dispatch; device sets 1 on success and a
// negative value on error (see gpt::transform / ctor_lmamax).
long call;

// Program counter / step cursor.
// NOTE(review): not referenced by any kernel in this commit — confirm use.
ulong pc;

// Number of valid entries in body.token[].
ulong tokens;

// Sanity/versioning marker.
ulong magic;

// Pads the header so body starts at a fixed 1 KiB offset.
uchar pad[1024 - 32];

// 3 KiB tail: the token id stream for the pipeline, overlaid with an
// error string when call indicates failure (see gpt::transform).
union
{
char str[3072];
ushort token[1536];
}
body;
}
__attribute__((aligned(4096)));
// Options page sent read-only to the device; currently opaque padding
// reserved for future per-pass parameters.
struct ctor_opts
{
uchar pad[4096];
}
__attribute__((aligned(4096)));
// Enforce the exact one-page layout on the host side; OpenCL C has no
// static_assert, so these are compiled out for the device.
// (Merged the two identical #ifndef guards into one.)
#ifndef __OPENCL_C_VERSION__
static_assert(sizeof(struct ctor_ctrl) == 4096);
static_assert(sizeof(struct ctor_opts) == 4096);
#endif
// Device-side (OpenCL C) tensor views; hidden from C++ translation units.
#ifndef __cplusplus
// One token embedding: 768 floats, also viewable as 12 heads x 64 dims.
union token
{
float
word[768],
attn[12][64];
};

// Vectorized (float4) view of a token embedding.
union tokenv
{
float4
word[768/4],
attn[12][64/4];
};

// Query/key/value triple for one token.
struct qkv
{
union token
qry,
key,
val;
};

// Vectorized query/key/value triple.
struct qkvv
{
union tokenv
qry,
key,
val;
};

// One row of the causal mask: token[i] true = attending to i is allowed.
struct attn_mask
{
bool
token[1024];
};

// Per-token working buffer with overlapping views of the intermediates
// used by the attention and FFN kernels.
union aperature
{
float
word[768],
fcon[2304],
proj[3][768],
qkv[3][12][64],
attn[12][64];
};

// Vectorized (float4) aperature.
union aperaturev
{
float4
word[768/4],
fcon[2304/4],
proj[3][768/4],
qkv[3][12][64/4],
attn[12][64/4];
};
#endif

View file

@@ -0,0 +1,80 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_PIPE_MODEL_H
// Device-resident copy of the GPT model: decoder stack plus embeddings.
struct ircd::gpt::pipe::model
{
struct tensor;
struct norm;
struct proj;
struct fcon;
struct attn;
struct ffnn;
struct block;
struct decoder;
struct language;

// 12-layer transformer stack
std::unique_ptr<model::decoder> decode;
// position + token embeddings
std::unique_ptr<model::language> embed;

model(const gpt::model::decoder &, const gpt::model::embed &);
~model() noexcept;
};
// Bias + weight pair on the device; either standalone buffers, or
// sub-buffers carved out of a master allocation at a given offset.
struct ircd::gpt::pipe::model::tensor
{
cl::data bias, weight;

tensor(const const_buffer &bias, const const_buffer &weight);
tensor(cl::data &, const off_t, const const_buffer &bias, const const_buffer &weight);
};
// Attention-unit weights: pre-norm, q/k/v fully-connected, output
// projection, and the causal mask buffer; all alias a master buffer.
struct ircd::gpt::pipe::model::attn
{
tensor norm, fcon, proj;
cl::data mask;

attn(cl::data &, const off_t, const gpt::model::norm &, const gpt::model::attn &);
};
// Feed-forward-unit weights: pre-norm, expansion (fcon) and projection;
// all alias a master buffer at fixed offsets.
struct ircd::gpt::pipe::model::ffnn
{
tensor norm, fcon, proj;

ffnn(cl::data &, const off_t, const gpt::model::norm &, const gpt::model::ffnn &);
};
// One transformer layer: a single master device buffer holding the whole
// layer's weights, with attn/ffnn views aliasing into it.
struct ircd::gpt::pipe::model::block
{
cl::data master;

model::attn attn;
model::ffnn ffnn;

block(const gpt::model::block &, const size_t);
};
// The full 12-layer decoder stack plus the final layer-norm tensor.
struct ircd::gpt::pipe::model::decoder
{
model::block block[12];
tensor norm;

decoder(const gpt::model::decoder &);
~decoder() noexcept;
};
// Embedding tables on the device: positional (pos) and token (wte) vectors.
struct ircd::gpt::pipe::model::language
{
cl::data pos, token;

language(const gpt::model::embed &);
~language() noexcept;
};

View file

@@ -0,0 +1,96 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_PIPE_H
// Transformer pipeline: OpenCL dispatch state shared across passes.
namespace ircd::gpt::pipe
{
struct model;
struct code;
struct desc;
struct exec;
struct bank;

// Lazily-initialized global singletons (constructed by init()).
extern model *default_model;
extern code *default_code;
extern desc *default_desc;

void init(), fini() noexcept;
} // removed stray ';' after namespace closing brace
#include "model.h"
#include "ctrl.h"
// Compiled OpenCL program handle for the kernels in gpt_cl.cl.
struct ircd::gpt::pipe::code
:cl::code
{
// Build options passed to the OpenCL compiler (see gpt_pipe.cc).
static const string_view compile_opts;

code();
~code() noexcept;
};
// Descriptor binding one compiled program to one model: owns the i/o
// buffers and the kernel objects with their arguments pre-set.
struct ircd::gpt::pipe::desc
{
struct layer;

pipe::model *model;
pipe::code *code;

cl::data opts;   // input: ctor_opts page
cl::data ctrl;   // input/output: ctor_ctrl page
cl::data state;  // per-layer q/k/v work area
cl::data xattn;  // self-attention output staging
cl::data accum;  // residual-stream accumulator
cl::data logit;  // lm-head output scores

cl::kern anode;  // embedding front-end
std::unique_ptr<struct desc::layer> layer[12];
cl::kern cathode; // final layer-norm
cl::kern lmhead;  // logits
cl::kern lmamax;  // argmax token selection

desc(pipe::code &, pipe::model &);
};
// Kernels for one transformer layer, executed in order: attention fcon
// ("negative"), self-attention, then projection + ffnn ("positive").
struct ircd::gpt::pipe::desc::layer
{
cl::kern negative;
cl::kern selfattn;
cl::kern positive;

layer(pipe::desc &, const int);
};
// One full forward pass: builds the transfer + kernel execution graph.
// Work is issued by the constructor; see gpt::transform for usage.
struct ircd::gpt::pipe::exec
{
pipe::desc *desc;

mutable_buffer out_ctrl;        // receive the ctrl page from the device
const_buffer in_ctrl, in_opts;  // send the ctrl/opts pages to the device

cl::kern::range range_anode;
cl::kern::range range_coil;
cl::kern::range range_negative;
cl::kern::range range_selfattn;
cl::kern::range range_positive;
cl::kern::range range_cathode;
cl::kern::range range_lmhead;
cl::kern::range range_lmamax;

cl::exec send[2];      // upload opts + ctrl
cl::exec tail[1];      // embedding pass
cl::exec coil[12 * 3]; // 12 layers x 3 kernels
cl::exec head[3];      // final norm, logits, selection
cl::exec recv[1];      // download ctrl

exec(ctor_ctrl &, const ctor_opts &);
~exec() noexcept;
};

View file

@@ -218,6 +218,10 @@ if OPENCL
libircd_la_SOURCES += cl.cc
endif
libircd_la_SOURCES += gpt.cc
libircd_la_SOURCES += gpt_pipe.cc
if OPENCL
BUILT_SOURCES += gpt_cl.clo
endif
libircd_la_SOURCES += gpt_model.cc
libircd_la_SOURCES += gpt_vocab.cc
libircd_la_SOURCES += openssl.cc
@@ -348,3 +352,6 @@ endif
# LLVM PGO text to binary for -fprofile-use
default.profdata:
-$(LLVM_PROFDATA) merge -output=default.profdata default.proftext
# Compile the OpenCL kernels into an object for linking/embedding.
# NOTE(review): the compiler is hardcoded as clang++-11 — consider a
# configure-substituted variable so other toolchain versions can build this.
gpt_cl.clo: gpt_cl.cl
clang++-11 -std=CL1.1 -c -pipe -Xclang -finclude-default-header -o gpt_cl.clo $^

559
ircd/gpt_cl.cl Normal file
View file

@@ -0,0 +1,559 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
// Broadcast out[0]'s value into all ln slots of the local buffer by
// doubling strides: each pass copies the low half onto the next half.
// Every work-item of the group must call this (barrier inside the loop).
inline void
ctor_local_bcast_ldr(__local float4 *const out,
const uint ln,
const uint li)
{
for(uint stride = 1; stride < ln; stride <<= 1)
{
if(li < stride)
out[li + stride] = out[li];
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// Parallel tree-sum over ln local slots; the total lands in out[0].
// Every work-item of the group must call this (barrier inside the loop).
inline void
ctor_local_reduce_add_ldr(__local float4 *const out,
const uint ln,
const uint li)
{
for(uint stride = ln >> 1; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
if(li < stride)
out[li] += out[li + stride];
}
}
// Parallel tree-max over ln local slots; the maximum lands in out[0].
// Every work-item of the group must call this (barrier inside the loop).
inline void
ctor_local_reduce_max_ldr(__local float *const out,
const uint ln,
const uint li)
{
for(uint stride = ln >> 1; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
if(li < stride)
out[li] = max(out[li], out[li + stride]);
}
}
// Parallel argmax: tree-reduce (value, index) pairs so that best[0]/idx[0]
// hold the winning score and its index. Used by ctor_lmamax.
inline void
ctor_local_reduce_tournament_ldr(__local float *const best,
__local ushort *const idx,
const uint ln,
const uint li)
{
for(uint stride = ln >> 1; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
if(li < stride && best[li] < best[li + stride])
{
best[li] = best[li + stride];
idx[li] = idx[li + stride];
}
}
}
// Mean over all num*4 scalars of in[]; the scalar mean ends up replicated
// in every lane of every out[i]. Uses out[] itself as reduction scratch.
inline void
ctor_mean(__local float4 *const restrict out,
__local const float4 *const restrict in,
const uint num,
const uint i)
{
out[i] = in[i];
ctor_local_reduce_add_ldr(out, num, i);
// Horizontal add; only slot 0 holds the full vector sum at this point.
float numerator = 0.0f;
float4 numeratorv = out[i];
for(uint k = 0; k < 4; ++k)
numerator += numeratorv[k];
out[i] = numerator;
// Broadcast slot 0's total to every slot, then divide by element count.
ctor_local_bcast_ldr(out, num, i);
numeratorv = out[i];
out[i] = numeratorv / (num * 4);
}
// Layer normalization: out = (in - mean(in)) / sqrt(var(in) + epsilon).
// tmp is group-shared scratch; all num work-items must participate.
inline void
ctor_norm(__local float4 *const out,
__local const float4 *const in,
__local float4 *const restrict tmp,
const uint num,
const uint i)
{
// mean(in) -> tmp (replicated into every slot)
ctor_mean(tmp, in, num, i);
const float4
sub_mean = in[i] - tmp[i];
// variance = mean of squared deviations -> out
tmp[i] = pow(sub_mean, 2);
ctor_mean(out, tmp, num, i);
const float4
epsilon = 0.00001f,
s = sqrt(out[i] + epsilon);
out[i] = sub_mean / s;
}
// Apply the learned layer-norm affine: out = in * weight + bias.
inline void
ctor_norm_fmad(__local float4 *const out,
__local const float4 *const in,
__global const float4 *const restrict bias,
__global const float4 *const restrict weight,
const uint i)
{
out[i] = in[i] * weight[i] + bias[i];
}
// Matrix * Vector Multiply/Accumulate
//
// out[i] = bias[i] + column i of (weight^T * in); width and height are in
// float4 units, with the height dimension walked as `tiles` interleaved
// segments. Weights are laid out row-major in scalar rows of `width` float4s.
inline void
ctor_sgemv(__local float4 *const restrict out,
__local const float4 *const restrict in,
__global const float4 *const restrict bias,
__global const float4 *const restrict weight,
const uint width,
const uint height,
const uint tiles,
const uint i)
{
const uint seg = height / tiles;
float4 acc = bias[i];
for(uint j = 0; j < seg; ++j)
for(uint t = 0; t < tiles; ++t)
for(uint k = 0; k < 4; ++k)
{
const uint
jidx = t * seg + j,      // input float4 index
kidx = jidx * 4 + k,     // scalar row index
widx = kidx * width + i; // row-major weight element
acc += weight[widx] * in[jidx][k];
}
out[i] = acc;
}
// Gaussian Error Linear Unit, tanh approximation:
// gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
// The operation order matches the unrolled form exactly.
inline void
ctor_gelu(__local float4 *const out,
          __local const float4 *const in_,
          const uint i)
{
	const float4 x = in_[i];
	const float4 t = tanh((((0.044715f * x) * x) + 1.0f) * 0.7978845608f * x);
	out[i] = ((t + 1.0f) * x) * 0.5f;
}
//
// core
//
// Attention front half for one token per work-group: pre-norm the token,
// then the 768->2304 q/k/v fully-connected, exporting the three projections.
__kernel void
ctor_attn_fcon(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union aperaturev *const restrict out,
__global const union tokenv *const restrict in,
__global const float4 *const restrict norm_bias,
__global const float4 *const restrict norm_weight,
__global const float4 *const restrict fcon_bias,
__global const float4 *const restrict fcon_weight)
{
const uint
gi = get_global_id(0),
gn = get_global_size(0),
li = get_local_id(0),
ln = get_local_size(0),
wi = get_group_id(0),
wn = get_num_groups(0);
__local union aperaturev token;
__local float4 tmp[768/4];
// Each work-item fetches one float4 of this group's token.
token.word[li] = in[wi].word[li];
// Layer re-normalization
ctor_norm(token.word, token.word, tmp, ln, li);
ctor_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
// Fully connected; 2304 outputs produced in three ln-wide passes.
for(uint i = 0; i < 3; ++i)
ctor_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 2304/4, 768/4, 4, i * ln + li);
// Export queries, keys, and values.
for(uint i = 0; i < 3; ++i)
out[wi].proj[i][li] = token.proj[i][li];
}
// Attention back half: project the self-attention output (768->768) and
// add the result into the residual accumulator for this token.
__kernel void
ctor_attn_proj(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union tokenv *const restrict accum,
__global const union tokenv *const restrict xattn,
__global const float4 *const restrict proj_bias,
__global const float4 *const restrict proj_weight)
{
const uint
gi = get_global_id(0),
gn = get_global_size(0),
li = get_local_id(0),
ln = get_local_size(0),
wi = get_group_id(0),
wn = get_num_groups(0);
__local float4
in[768/4],
out[768/4];
// Fetch
in[li] = xattn[wi].word[li];
// Projection
ctor_sgemv(out, in, proj_bias, proj_weight, 768/4, 768/4, 1, li);
// Accumulation; end of layer
accum[wi].word[li] += out[li];
}
// Feed-forward unit for one token per work-group: pre-norm, 768->3072
// expansion, GELU, 3072->768 projection, residual add.
__kernel void
ctor_ffnn(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union tokenv *const restrict accum,
__global const float4 *const restrict norm_bias,
__global const float4 *const restrict norm_weight,
__global const float4 *const restrict fcon_bias,
__global const float4 *const restrict fcon_weight,
__global const float4 *const restrict proj_bias,
__global const float4 *const restrict proj_weight)
{
const uint
gi = get_global_id(0),
gn = get_global_size(0),
li = get_local_id(0),
ln = get_local_size(0),
wi = get_group_id(0),
wn = get_num_groups(0);
__local union aperaturev token;
__local float4 tmp[768/4];
// Fetch local copy of the global accumulator. We operate on a cached
// copy as input, and add our output to the global upon completion.
token.word[li] = accum[wi].word[li];
// Layer re-normalization
ctor_norm(token.word, token.word, tmp, ln, li);
ctor_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
// Fully connected; 3072 outputs produced in four ln-wide passes.
// NOTE(review): aperature.fcon is declared with 2304 floats but indices
// here reach 4*ln+li float4s (3072 floats) — confirm the union sizing.
for(uint i = 0; i < 4; ++i)
ctor_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 3072/4, 768/4, 4, i * ln + li);
// Gaussian Error Linear Unit
for(uint i = 0; i < 4; ++i)
ctor_gelu(token.fcon, token.fcon, i * ln + li);
// Projection
ctor_sgemv(tmp, token.fcon, proj_bias, proj_weight, 768/4, 3072/4, 4, li);
// Accumulation; end of layer
accum[wi].word[li] += tmp[li];
}
// Layer back half ("positive" pass): attention projection followed by the
// feed-forward unit, both accumulating into the residual stream.
__kernel void
ctor_backend(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union tokenv *const restrict accum,
__global const union tokenv *const restrict xattn,
__global const float4 *const restrict attn_proj_bias,
__global const float4 *const restrict attn_proj_weight,
__global const float4 *const restrict ffnn_norm_bias,
__global const float4 *const restrict ffnn_norm_weight,
__global const float4 *const restrict ffnn_fcon_bias,
__global const float4 *const restrict ffnn_fcon_weight,
__global const float4 *const restrict ffnn_proj_bias,
__global const float4 *const restrict ffnn_proj_weight)
{
ctor_attn_proj
(
ctrl,
opts,
accum,
xattn,
attn_proj_bias,
attn_proj_weight
);
ctor_ffnn
(
ctrl,
opts,
accum,
ffnn_norm_bias,
ffnn_norm_weight,
ffnn_fcon_bias,
ffnn_fcon_weight,
ffnn_proj_bias,
ffnn_proj_weight
);
}
//
// ctrl
//
// Masked softmax self-attention: one destination token per work-group
// (wi), one head per work-item (li), attending over all wn tokens.
// NOTE(review): self.attn is [12][32] but is indexed [li][i] with i < wn
// and li up to the local size — confirm the dispatch geometry keeps
// li < 12 and wn <= 32 here.
__kernel void
ctor_attn_self(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union tokenv *const restrict out,
__global const struct qkvv *const restrict token,
__global const struct attn_mask *const restrict mask) // [1024][1024],
{
__local struct
{
float
attn[12][32];
}
self;
const uint
gi = get_global_id(0),
gn = get_global_size(0),
li = get_local_id(0),
ln = get_local_size(0),
wi = get_group_id(0),
wn = get_num_groups(0);
// Masked-out positions get a large negative score (-> ~0 after exp).
for(uint i = 0; i < wn; ++i)
if(mask[wi].token[i])
self.attn[li][i] = 0.0f;
else
self.attn[li][i] = -10000.0f;
// Raw scores: dot(query of wi, key of i) per head.
for(uint i = 0; i < wn; ++i)
if(mask[wi].token[i])
for(uint j = 0; j < 64/4; ++j)
{
float4
qry = token[wi].qry.attn[li][j],
key = token[i].key.attn[li][j],
res = qry * key;
for(uint k = 0; k < 4; ++k)
self.attn[li][i] += res[k];
}
// Scale by sqrt(head dim) = sqrt(64) = 8.
for(uint i = 0; i < wn; ++i)
if(mask[wi].token[i])
self.attn[li][i] /= 8.0f;
// Softmax: exponentiate then normalize by the row sum.
for(uint i = 0; i < wn; ++i)
self.attn[li][i] = exp(self.attn[li][i]);
float4 vacc = 0.0f;
for(uint i = 0; i < wn; ++i)
vacc[i % 4] += self.attn[li][i];
float acc = 0.0f;
for(uint i = 0; i < 4; ++i)
acc += vacc[i];
for(uint i = 0; i < wn; ++i)
self.attn[li][i] /= acc;
// Weighted sum of value vectors.
for(uint j = 0; j < 64/4; ++j)
out[wi].attn[li][j] = 0.0f;
for(uint i = 0; i < wn; ++i)
for(uint j = 0; j < 64/4; ++j)
out[wi].attn[li][j] += token[i].val.attn[li][j] * self.attn[li][i];
}
//
// leads
//
// Embedding variant 0: one token per group, one float4 per work-item;
// accum = token embedding (wte) + positional embedding (wpe).
__kernel void
ctor_anode0(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union tokenv *const restrict accum,
__global const union tokenv *const restrict pos,
__global const union tokenv *const restrict vocab)
{
const uint
li = get_local_id(0),
wi = get_group_id(0);
const ushort
token = ctrl->body.token[wi];
const float4
wte = vocab[token].word[li],
wpe = pos[wi].word[li];
accum[wi].word[li] = wte + wpe;
}
// Embedding variant 1: one float4 lane per work-item, looping over all
// tokens in the control block serially.
__kernel void
ctor_anode1(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union tokenv *const restrict accum,
__global const union tokenv *const restrict pos,
__global const union tokenv *const restrict vocab)
{
const uint
li = get_local_id(0);
for(uint i = 0; i < ctrl->tokens; ++i)
{
const ushort
token = ctrl->body.token[i];
const float4
wte = vocab[token].word[li],
wpe = pos[i].word[li];
accum[i].word[li] = wte + wpe;
}
}
// Embedding variant 2 (the one bound by pipe::desc): one token per global
// work-item, looping over the 768-float embedding serially.
__kernel void
ctor_anode2(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union tokenv *const restrict accum,
__global const union tokenv *const restrict pos,
__global const union tokenv *const restrict vocab)
{
const uint
gi = get_global_id(0);
const ushort
token = ctrl->body.token[gi];
for(uint i = 0; i < 768/4; ++i)
{
const float4
wte = vocab[token].word[i],
wpe = pos[gi].word[i];
accum[gi].word[i] = wte + wpe;
}
}
// Final layer-norm over one selected token (chosen via the global offset,
// see exec::range_cathode); result is written to accum slot 0 for lmhead.
__kernel void
ctor_cathode(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union tokenv *const restrict accum,
__global const float4 *const restrict norm_bias,
__global const float4 *const restrict norm_weight)
{
const uint
li = get_local_id(0),
ln = get_local_size(0),
wi = get_global_offset(0) / ln + get_group_id(0);
__local union tokenv
token, tmp;
token.word[li] = accum[wi].word[li];
// Final re-normalization
ctor_norm(token.word, token.word, tmp.word, ln, li);
ctor_norm_fmad(token.word, token.word, norm_bias, norm_weight, li);
// NOTE(review): overwrites accum[0] (not accum[wi]) as lmhead's input.
accum[0].word[li] = token.word[li];
}
// Language-model head: logit[gi] = dot(accum[0], token embedding gi),
// i.e. score every vocabulary entry against the normalized final token.
__kernel void
ctor_lmhead(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global float *const restrict logit,
__global const union tokenv *const restrict accum,
__global const union tokenv *const restrict token)
{
const uint
gi = get_global_id(0);
float4 acc = 0.0f;
for(uint j = 0; j < 768/4; ++j)
{
const float4
in = accum[0].word[j],
vocab = token[gi].word[j],
res = vocab * in;
acc += res;
}
// Horizontal add of the vector accumulator.
float res = 0.0f;
for(uint k = 0; k < 4; ++k)
res += acc[k];
logit[gi] = res;
}
// Greedy token selection: block-parallel argmax over the 50257 logits.
// Each work-item linearly scans a 262-logit slice (262 * 192 >= 50257),
// then a tournament reduction picks the winner; work-item 0 appends it to
// the token stream and flips ctrl->call from -1 to 1 (success).
__kernel void
ctor_lmamax(__global struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global const float *const restrict logit)
{
const uint
gi = get_global_id(0),
gn = get_global_size(0),
li = get_local_id(0),
ln = get_local_size(0),
wi = get_group_id(0),
wn = get_num_groups(0),
tn = 262,
ti = tn * li;
__local ushort idx[192];
__local float best[192];
// Per-work-item linear argmax over its slice, clamped to the vocab size.
idx[li] = ti;
for(uint j = ti + 1; j < ti + tn && j < 50257; ++j)
if(logit[j] > logit[idx[li]])
idx[li] = j;
best[li] = logit[idx[li]];
ctor_local_reduce_tournament_ldr(best, idx, ln, li);
// Only act when the host marked this pass dispatched (call == -1).
if(li == 0 && ctrl->call == -1)
ctrl->body.token[ctrl->tokens++] = idx[li];
if(li == 0 && ctrl->call == -1)
ctrl->call = 1;
#ifdef RB_DEBUG
if(li == 0 && ctrl->call == 1)
if(ctrl->tokens < 2)
ctrl->call = -2;
#endif
}

680
ircd/gpt_pipe.cc Normal file
View file

@@ -0,0 +1,680 @@
// Tensor Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#include <ircd/gpt/pipe/pipe.h>
// Host-side driver for the transformer pipeline (OpenCL dispatch).
namespace ircd::gpt
{
void transform(ctor_ctrl &, const ctor_opts &);
}
namespace ircd::gpt::pipe
{
// Per-kernel execution option sets shared by every pass.
static ircd::cl::exec::opts negative_opts, positive_opts, selfattn_opts, cathode_opts, anode_opts,
lmhead_opts, lmamax_opts;
extern const ircd::run::changed handle_quit;
}
// Global singleton definitions; constructed lazily by pipe::init().
decltype(ircd::gpt::pipe::default_model)
ircd::gpt::pipe::default_model;
decltype(ircd::gpt::pipe::default_code)
ircd::gpt::pipe::default_code;
decltype(ircd::gpt::pipe::default_desc)
ircd::gpt::pipe::default_desc;
// Tear the pipeline down when the runlevel transitions to QUIT.
decltype(ircd::gpt::pipe::handle_quit)
ircd::gpt::pipe::handle_quit
{
run::level::QUIT, pipe::fini
};
// Construct the global pipeline: upload the model weights, build the
// OpenCL program, and bind the kernel descriptor. Requires the host-side
// gpt::model to be loaded already.
void
ircd::gpt::pipe::init()
{
const auto &default_model
{
*gpt::model::default_model
};
assert(!pipe::default_model);
// NOTE(review): raw new matched by delete in fini(); these leak if a
// later construction throws — consider unique_ptr ownership.
pipe::default_model = new pipe::model
{
default_model, default_model.word
};
pipe::default_code = new pipe::code
{
};
pipe::default_desc = new pipe::desc
{
*pipe::default_code, *pipe::default_model
};
}
// Destroy the pipeline singletons in reverse order of init(); deleting a
// null pointer is a no-op so this is safe to call when never initialized.
void
ircd::gpt::pipe::fini()
noexcept
{
	const auto reset
	{
		[](auto *&ptr)
		{
			delete ptr;
			ptr = nullptr;
		}
	};

	reset(default_desc);
	reset(default_code);
	reset(default_model);
}
//
// pipe
//
// Run one full forward pass over the tokens in ctrl, appending the
// selected next token on success; throws with the device's error string
// otherwise.
void
ircd::gpt::transform(ctor_ctrl &ctrl,
const ctor_opts &opts)
{
// First use lazily constructs the device pipeline.
if(unlikely(!pipe::default_model))
pipe::init();
// -1 marks the request dispatched; ctor_lmamax flips it to 1 on success.
ctrl.call = -1;
// Temporary object: the exec constructor/destructor pair issues and
// completes the entire transfer + kernel graph.
pipe::exec
{
ctrl, opts
};
// A non-positive call after completion carries an error in body.str.
if(unlikely(ctrl.call <= 0))
throw error
{
"hyper (#%d) :%s",
abs(ctrl.call),
ctrl.body.str,
};
}
//
// pipe::exec
//
// Build the whole execution graph for one pass over the default
// descriptor: upload pages, embedding, 12x3 layer kernels, head kernels,
// then read the control page back.
ircd::gpt::pipe::exec::exec(ctor_ctrl &ctrl,
const ctor_opts &opts)
:desc
{
default_desc
}
// Host-side buffer views of the control/options pages for DMA.
,out_ctrl
{
reinterpret_cast<char *>(&ctrl), sizeof(ctor_ctrl)
}
,in_ctrl
{
reinterpret_cast<const char *>(&ctrl), sizeof(ctor_ctrl)
}
,in_opts
{
reinterpret_cast<const char *>(&opts), sizeof(ctor_opts)
}
// NDRanges: one work-item per token for the embedding; one 192-wide
// work-group per token for the layer kernels.
,range_anode
{
{ ctrl.tokens, 0, },
{ 1, 0, },
}
,range_coil
{
{ ctrl.tokens * 192UL, 0, },
{ 192UL, 0, },
}
,range_negative
{
range_coil
}
,range_selfattn
{
range_coil
}
,range_positive
{
range_coil
}
// Single group; the global offset selects the last token's group.
,range_cathode
{
{ 1 * 192UL, 0 },
{ 192UL, 0 },
{ (ctrl.tokens - 1) * 192UL, 0 },
}
,range_lmhead
{
{ 262 * 192UL, 0 }, // align_up(50257) / 192
{ 192UL, 0 },
}
,range_lmamax
{
{ 1 * 192UL, 0 },
{ 192UL, 0 },
}
// Upload the options and control pages.
,send
{
{ desc->opts, in_opts },
{ desc->ctrl, in_ctrl },
}
// Embedding pass.
,tail
{
{ desc->anode, range_anode, anode_opts },
}
// 12 layers x (attn fcon, self-attention, projection + ffnn).
,coil
{
{ desc->layer[0x00]->negative, range_negative, negative_opts },
{ desc->layer[0x00]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x00]->positive, range_positive, positive_opts },
{ desc->layer[0x01]->negative, range_negative, negative_opts },
{ desc->layer[0x01]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x01]->positive, range_positive, positive_opts },
{ desc->layer[0x02]->negative, range_negative, negative_opts },
{ desc->layer[0x02]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x02]->positive, range_positive, positive_opts },
{ desc->layer[0x03]->negative, range_negative, negative_opts },
{ desc->layer[0x03]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x03]->positive, range_positive, positive_opts },
{ desc->layer[0x04]->negative, range_negative, negative_opts },
{ desc->layer[0x04]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x04]->positive, range_positive, positive_opts },
{ desc->layer[0x05]->negative, range_negative, negative_opts },
{ desc->layer[0x05]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x05]->positive, range_positive, positive_opts },
{ desc->layer[0x06]->negative, range_negative, negative_opts },
{ desc->layer[0x06]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x06]->positive, range_positive, positive_opts },
{ desc->layer[0x07]->negative, range_negative, negative_opts },
{ desc->layer[0x07]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x07]->positive, range_positive, positive_opts },
{ desc->layer[0x08]->negative, range_negative, negative_opts },
{ desc->layer[0x08]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x08]->positive, range_positive, positive_opts },
{ desc->layer[0x09]->negative, range_negative, negative_opts },
{ desc->layer[0x09]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x09]->positive, range_positive, positive_opts },
{ desc->layer[0x0a]->negative, range_negative, negative_opts },
{ desc->layer[0x0a]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x0a]->positive, range_positive, positive_opts },
{ desc->layer[0x0b]->negative, range_negative, negative_opts },
{ desc->layer[0x0b]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x0b]->positive, range_positive, positive_opts },
}
// Final norm, logits, and greedy token selection.
,head
{
{ desc->cathode, range_cathode, cathode_opts },
{ desc->lmhead, range_lmhead, lmhead_opts },
{ desc->lmamax, range_lmamax, lmamax_opts },
}
// Read the control page (status + appended token) back.
,recv
{
{ desc->ctrl, out_ctrl },
}
{
}

ircd::gpt::pipe::exec::~exec()
noexcept
{
}
//
// code
//
// OpenCL build options for gpt_cl.cl; fast/relaxed math is acceptable for
// this workload.
decltype(ircd::gpt::pipe::code::compile_opts)
ircd::gpt::pipe::code::compile_opts
{
" -cl-strict-aliasing"
" -cl-no-signed-zeros"
" -cl-finite-math-only"
" -cl-unsafe-math-optimizations"
" -cl-fast-relaxed-math"
//" -cl-mad-enable"
//" -cl-single-precision-constant"
//" -cl-fp32-correctly-rounded-divide-sqrt"
};

// Read the kernel image and build the cl program.
ircd::gpt::pipe::code::code()
:cl::code{[]
{
// NOTE(review): default-constructed fd — no path is opened here, so
// fs::read() has no meaningful source; confirm where the compiled
// gpt_cl.clo image is supposed to come from.
const fs::fd fd
{
};
const std::string read
{
fs::read(fd)
};
const string_view bin
{
read
};
const vector_view<const string_view> bins
(
&bin, 1
);
return cl::code
{
bins, compile_opts
};
}()}
{
}

ircd::gpt::pipe::code::~code()
noexcept
{
}
//
// pipe::desc
//
// Allocate the pipeline's device buffers and bind every kernel with its
// arguments against the given program and model.
ircd::gpt::pipe::desc::desc(pipe::code &code,
pipe::model &model)
:model
{
&model
}
,code
{
&code
}
// One page each for the options (read-only) and control (read-write).
,opts
{
4_KiB,
const_buffer{}
}
,ctrl
{
4_KiB,
mutable_buffer{}
}
// Work areas sized for up to 32 tokens of 768 floats (x3 for q/k/v).
,state
{
32 * 3 * 768 * sizeof(float),
mutable_buffer{}
}
,xattn
{
32 * 1 * 768 * sizeof(float),
mutable_buffer{}
}
,accum
{
32 * 768 * sizeof(float),
mutable_buffer{}
}
// Logit scores; 65536 >= vocab size 50257.
,logit
{
65536 * sizeof(float),
mutable_buffer{}
}
,anode
{
code,
"ctor_anode2",
ctrl,
opts,
accum,
model.embed->pos,
model.embed->token,
}
,layer
{
std::make_unique<struct desc::layer>(*this, 0x00),
std::make_unique<struct desc::layer>(*this, 0x01),
std::make_unique<struct desc::layer>(*this, 0x02),
std::make_unique<struct desc::layer>(*this, 0x03),
std::make_unique<struct desc::layer>(*this, 0x04),
std::make_unique<struct desc::layer>(*this, 0x05),
std::make_unique<struct desc::layer>(*this, 0x06),
std::make_unique<struct desc::layer>(*this, 0x07),
std::make_unique<struct desc::layer>(*this, 0x08),
std::make_unique<struct desc::layer>(*this, 0x09),
std::make_unique<struct desc::layer>(*this, 0x0a),
std::make_unique<struct desc::layer>(*this, 0x0b),
}
,cathode
{
code,
"ctor_cathode",
ctrl,
opts,
accum,
model.decode->norm.bias,
model.decode->norm.weight,
}
,lmhead
{
code,
"ctor_lmhead",
ctrl,
opts,
logit,
accum,
model.embed->token,
}
,lmamax
{
code,
"ctor_lmamax",
ctrl,
opts,
logit,
}
{
}
//
// pipe::desc::layer
//
// Bind the three kernels for one layer against that layer's weights:
// negative = pre-norm + q/k/v fcon, selfattn = masked attention,
// positive = attention projection + feed-forward (ctor_backend).
ircd::gpt::pipe::desc::layer::layer(pipe::desc &desc,
const int laynum)
:negative
{
*desc.code,
"ctor_attn_fcon",
desc.ctrl,
desc.opts,
desc.state,
desc.accum,
desc.model->decode->block[laynum].attn.norm.bias,
desc.model->decode->block[laynum].attn.norm.weight,
desc.model->decode->block[laynum].attn.fcon.bias,
desc.model->decode->block[laynum].attn.fcon.weight,
}
,selfattn
{
*desc.code,
"ctor_attn_self",
desc.ctrl,
desc.opts,
desc.xattn,
desc.state,
desc.model->decode->block[laynum].attn.mask,
}
,positive
{
*desc.code,
"ctor_backend",
desc.ctrl,
desc.opts,
desc.accum,
desc.xattn,
desc.model->decode->block[laynum].attn.proj.bias,
desc.model->decode->block[laynum].attn.proj.weight,
desc.model->decode->block[laynum].ffnn.norm.bias,
desc.model->decode->block[laynum].ffnn.norm.weight,
desc.model->decode->block[laynum].ffnn.fcon.bias,
desc.model->decode->block[laynum].ffnn.fcon.weight,
desc.model->decode->block[laynum].ffnn.proj.bias,
desc.model->decode->block[laynum].ffnn.proj.weight,
}
{
}
///////////////////////////////////////////////////////////////////////////////
//
// model
//
//
// pipe::model::model
//
// Upload the whole model: the decoder stack and the embedding tables.
ircd::gpt::pipe::model::model(const gpt::model::decoder &decoder,
const gpt::model::embed &embed)
:decode
{
std::make_unique<model::decoder>(decoder)
}
,embed
{
std::make_unique<model::language>(embed)
}
{
}

ircd::gpt::pipe::model::~model()
noexcept
{
}
//
// pipe::model::language
//
// Upload the positional and token embedding tables as device buffers.
ircd::gpt::pipe::model::language::language(const gpt::model::embed &embed)
:pos
{
sizeof(embed.pos),
const_buffer{embed.pos}
}
,token
{
sizeof(embed.token),
const_buffer{embed.token}
}
{
}

ircd::gpt::pipe::model::language::~language()
noexcept
{
}
//
// pipe::model::decoder
//
// Upload the 12 transformer layers and the final layer-norm tensors.
ircd::gpt::pipe::model::decoder::decoder(const gpt::model::decoder &decoder)
:block
{
{ decoder.layer[0x00], 0x00, },
{ decoder.layer[0x01], 0x01, },
{ decoder.layer[0x02], 0x02, },
{ decoder.layer[0x03], 0x03, },
{ decoder.layer[0x04], 0x04, },
{ decoder.layer[0x05], 0x05, },
{ decoder.layer[0x06], 0x06, },
{ decoder.layer[0x07], 0x07, },
{ decoder.layer[0x08], 0x08, },
{ decoder.layer[0x09], 0x09, },
{ decoder.layer[0x0a], 0x0a, },
{ decoder.layer[0x0b], 0x0b, },
}
,norm
{
const_buffer{decoder.f.bias},
const_buffer{decoder.f.weight},
}
{
}

ircd::gpt::pipe::model::decoder::~decoder()
noexcept
{
}
//
// pipe::model::block
//
// Upload one whole layer into a single master buffer; the attn and ffnn
// views alias into it at offsets matching the host struct layout.
ircd::gpt::pipe::model::block::block(const gpt::model::block &block,
const size_t layer)
:master
{
sizeof(block), const_buffer
{
reinterpret_cast<const char *>(&block), sizeof(block)
}
}
// Attention unit starts at offset 0 (ln1 precedes attn in the layout).
,attn
{
master,
0,
block.ln1,
block.attn,
}
// Feed-forward unit follows the attention unit.
,ffnn
{
master,
sizeof(block.ln1) + sizeof(block.attn),
block.ln2,
block.ffnn,
}
{
}
//
// pipe::model::ffnn
//
// Carve the feed-forward unit's tensors out of the layer's master buffer
// as sub-buffers at sequential offsets: norm, fcon, proj.
ircd::gpt::pipe::model::ffnn::ffnn(cl::data &master,
const off_t offset,
const gpt::model::norm &norm,
const gpt::model::ffnn &ffnn)
:norm
{
master,
offset,
const_buffer{norm.bias},
const_buffer{norm.weight},
}
,fcon
{
master,
offset + off_t(sizeof(norm)),
const_buffer{ffnn.fc_bias},
const_buffer{ffnn.fc_weight},
}
,proj
{
master,
offset + off_t(sizeof(norm) + sizeof(ffnn.fc_bias) + sizeof(ffnn.fc_weight)),
const_buffer{ffnn.proj_bias},
const_buffer{ffnn.proj_weight},
}
{
// Verify the host structs are contiguous in the order the sub-buffer
// offsets above assume.
always_assert
(
ircd::data(const_buffer{ffnn.proj_weight})
==
ircd::data(const_buffer{norm.bias}) +
sizeof(norm) +
sizeof(ffnn.fc_bias) +
sizeof(ffnn.fc_weight) +
ircd::size(const_buffer{ffnn.proj_bias})
);
}
//
// pipe::model::attn
//
// Carve the attention unit's tensors out of the layer's master buffer:
// norm, q/k/v fcon, then the mask (attn.bias) and the projection after it.
ircd::gpt::pipe::model::attn::attn(cl::data &master,
const off_t offset,
const gpt::model::norm &norm,
const gpt::model::attn &attn)
:norm
{
master,
offset,
const_buffer{norm.bias},
const_buffer{norm.weight},
}
,fcon
{
master,
offset + off_t(sizeof(norm)),
const_buffer{attn.attn_bias},
const_buffer{attn.attn_weight},
}
// proj sits after the mask (attn.bias) in the host layout.
,proj
{
master,
offset + off_t(sizeof(norm) + sizeof(attn.attn_bias) + sizeof(attn.attn_weight) + sizeof(attn.bias)),
const_buffer{attn.proj_bias},
const_buffer{attn.proj_weight},
}
,mask
{
master,
{
sizeof(attn.bias),
offset + off_t(sizeof(norm) + sizeof(attn.attn_bias) + sizeof(attn.attn_weight)),
},
}
{
// Verify the host structs are contiguous in the order assumed above.
always_assert
(
ircd::data(const_buffer{attn.proj_weight})
==
ircd::data(const_buffer{norm.bias}) +
sizeof(norm) +
sizeof(attn.bias) +
sizeof(attn.attn_bias) +
sizeof(attn.attn_weight) +
ircd::size(const_buffer{attn.proj_bias})
);
}
//
// pipe::model::tensor
//
// Standalone tensor: allocate and upload bias and weight as independent
// device buffers.
ircd::gpt::pipe::model::tensor::tensor(const const_buffer &bias,
const const_buffer &weight)
:bias
{
ircd::size(bias),
bias,
}
,weight
{
ircd::size(weight),
weight,
}
{
}

// Aliasing tensor: create bias/weight as sub-buffers of a master buffer,
// with the weight immediately following the bias at the given offset.
ircd::gpt::pipe::model::tensor::tensor(cl::data &master,
const off_t offset,
const const_buffer &bias,
const const_buffer &weight)
:bias
{
master,
{
ircd::size(bias), // size
offset, // offset
},
}
,weight
{
master,
{
ircd::size(weight), // size
offset + ircd::size(bias), // offset
}
}
{
}