mirror of
https://github.com/matrix-construct/construct
synced 2025-01-05 20:34:29 +01:00
1012 lines
28 KiB
Common Lisp
1012 lines
28 KiB
Common Lisp
// Matrix Construct
|
|
//
|
|
// Copyright (C) Matrix Construct Developers, Authors & Contributors
|
|
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
|
|
//
|
|
// Permission to use, copy, modify, and/or distribute this software for any
|
|
// purpose with or without fee is hereby granted, provided that the above
|
|
// copyright notice and this permission notice is present in all copies. The
|
|
// full license for this software is available in the LICENSE file.
|
|
|
|
#include <ircd/simt/simt.h>
|
|
#include <ircd/gpt/token.h>
|
|
#include <ircd/gpt/opts.h>
|
|
#include <ircd/gpt/ctrl.h>
|
|
|
|
inline void
|
|
__attribute__((always_inline))
|
|
ircd_gpt_norm_fmad(__local float4 *const out,
|
|
__local const float4 *const in,
|
|
__global const float4 *const restrict bias,
|
|
__global const float4 *const restrict weight,
|
|
const uint i)
|
|
{
|
|
out[i] = in[i] * weight[i] + bias[i];
|
|
}
|
|
|
|
/// Gaussian Error Linear Unit
|
|
inline void
|
|
__attribute__((always_inline))
|
|
ircd_gpt_ffnn_gelu(__local float4 *const out,
|
|
__local const float4 *const in_,
|
|
const uint i)
|
|
{
|
|
const float4
|
|
in = in_[i];
|
|
|
|
float4 a;
|
|
a = 0.044715f;
|
|
a *= in;
|
|
a *= in;
|
|
a += 1.0f;
|
|
a *= 0.7978845608f;
|
|
a *= in;
|
|
|
|
a = tanh(a);
|
|
a += 1.0f;
|
|
a *= in;
|
|
a *= 0.5f;
|
|
|
|
out[i] = a;
|
|
}
|
|
|
|
// Matrix * Vector Multiply/Accumulate
|
|
inline float4
|
|
__attribute__((flatten, always_inline))
|
|
ircd_gpt_tmul_dot(__local const float4 *const restrict in,
|
|
__global const float4 *const restrict bias,
|
|
__global const float4 *const restrict weight,
|
|
const uint width,
|
|
const uint height,
|
|
const uint col,
|
|
const uint i,
|
|
const uint j)
|
|
{
|
|
const uint
|
|
li = get_local_id(0),
|
|
ln = get_local_size(0),
|
|
lanes = 4;
|
|
|
|
float4
|
|
acc = 0.0f;
|
|
|
|
for(uint k = 0; k < lanes; ++k)
|
|
{
|
|
const uint
|
|
row = j * lanes + k,
|
|
cell = row * width + col;
|
|
|
|
acc += in[j][k] * weight[cell];
|
|
}
|
|
|
|
return acc;
|
|
}
|
|
|
|
// Matrix * Vector Multiply/Accumulate
|
|
inline void
|
|
__attribute__((flatten, always_inline))
|
|
ircd_gpt_tmul(__local float4 *const restrict out,
|
|
__local const float4 *const restrict in,
|
|
__global const float4 *const restrict bias,
|
|
__global const float4 *const restrict weight,
|
|
const uint width,
|
|
const uint height,
|
|
const uint segs)
|
|
{
|
|
const uint
|
|
li = get_local_id(0),
|
|
ln = get_local_size(0);
|
|
|
|
__attribute__((opencl_unroll_hint))
|
|
for(uint i = 0; i < segs; ++i)
|
|
{
|
|
const uint
|
|
col = i * ln + li;
|
|
|
|
out[col] = bias[col];
|
|
}
|
|
|
|
for(uint i = 0; i < segs; ++i)
|
|
{
|
|
const uint
|
|
col = i * ln + li;
|
|
|
|
for(uint j = 0; j < height; ++j)
|
|
out[col] += ircd_gpt_tmul_dot(in, bias, weight, width, height, col, i, j);
|
|
}
|
|
}
|
|
|
|
inline void
|
|
__attribute__((flatten, always_inline))
|
|
ircd_gpt_ffnn_fcon(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__local union ircd_gpt_ffnn_aperaturev *const restrict out,
|
|
__local const union ircd_gpt_tokenv *const in,
|
|
__global const float4 *const restrict bias,
|
|
__global const float4 *const restrict weight)
|
|
{
|
|
const uint
|
|
li = get_local_id(0),
|
|
ln = get_local_size(0),
|
|
width = opts->ffnn_width,
|
|
height = opts->ffnn_height,
|
|
tiles = opts->ffnn_mult;
|
|
|
|
ircd_gpt_tmul(out->fcon, in->word, bias, weight, width, height, tiles);
|
|
|
|
for(uint i = 0; i < tiles; ++i)
|
|
ircd_gpt_ffnn_gelu(out->fcon, out->fcon, i * ln + li);
|
|
}
|
|
|
|
inline void
|
|
__attribute__((flatten, always_inline))
|
|
ircd_gpt_ffnn(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__local union ircd_gpt_tokenv *const restrict token,
|
|
__local union ircd_gpt_ffnn_aperaturev *const restrict buf,
|
|
__local union ircd_gpt_ffnn_aperaturev *const restrict tmp0,
|
|
__local union ircd_gpt_tokenv *const restrict tmp1,
|
|
__global const float4 *const restrict norm_bias,
|
|
__global const float4 *const restrict norm_weight,
|
|
__global const float4 *const restrict fcon_bias,
|
|
__global const float4 *const restrict fcon_weight,
|
|
__global const float4 *const restrict proj_bias,
|
|
__global const float4 *const restrict proj_weight)
|
|
{
|
|
const uint
|
|
li = get_local_id(0),
|
|
ln = get_local_size(0),
|
|
wi = get_global_offset(0) / ln + get_group_id(0),
|
|
width = opts->ffnn_width,
|
|
height = opts->ffnn_height;
|
|
|
|
// Layer re-normalization
|
|
ircd_simt_math_norm_f4lldr(token->word, token->word, buf->word);
|
|
ircd_gpt_norm_fmad(token->word, token->word, norm_bias, norm_weight, li);
|
|
|
|
// ln's writes are still pending but fcon reads results across threads.
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
// Fully connected
|
|
ircd_gpt_ffnn_fcon(ctrl, opts, buf, token, fcon_bias, fcon_weight);
|
|
|
|
// fcon's writes are still pending but proj reads results across threads.
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
// Projection
|
|
ircd_gpt_tmul(token->word, buf->fcon, proj_bias, proj_weight, height, width, 1);
|
|
}
|
|
|
|
inline void
|
|
__attribute__((flatten, always_inline))
|
|
ircd_gpt_attn_self_samax(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__local float self[][12],
|
|
const uint wn)
|
|
{
|
|
const uint
|
|
gn = get_global_size(0),
|
|
li = get_local_id(0),
|
|
ln = get_local_size(0);
|
|
|
|
struct ircd_math_samax samax =
|
|
{
|
|
.mu = -10000.0f,
|
|
.sum = 0.0f,
|
|
};
|
|
|
|
for(uint i = 0; i < wn; ++i)
|
|
samax.mu = max(samax.mu, self[i][li]);
|
|
|
|
for(uint i = 0; i < wn; ++i)
|
|
self[i][li] = exp(self[i][li] - samax.mu);
|
|
|
|
__attribute__((opencl_unroll_hint))
|
|
for(uint i = 0; i < wn; ++i)
|
|
samax.sum += self[i][li];
|
|
|
|
samax.lambda = 1.0f / samax.sum;
|
|
|
|
__attribute__((opencl_unroll_hint))
|
|
for(uint i = 0; i < wn; ++i)
|
|
self[i][li] *= samax.lambda;
|
|
}
|
|
|
|
inline void
|
|
__attribute__((flatten, always_inline))
|
|
ircd_gpt_attn_self(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__local union ircd_gpt_tokenv *const restrict out,
|
|
__local float self[][12],
|
|
__global const struct ircd_gpt_attn_qkvv *const restrict token)
|
|
{
|
|
const uint
|
|
gi = get_global_id(0),
|
|
gn = get_global_size(0),
|
|
li = get_local_id(0),
|
|
ln = get_local_size(0),
|
|
wi = get_global_offset(0) / ln + get_group_id(0),
|
|
wn = ctrl->tokens.count,
|
|
ti = li % opts->attn_rank,
|
|
ki = li / opts->attn_rank,
|
|
kn = ln / opts->attn_rank;
|
|
|
|
// Low-rank mask
|
|
if(li < opts->attn_rank)
|
|
{
|
|
// For each token
|
|
for(uint i = 0; i < wn; ++i)
|
|
{
|
|
// Left-attention mask
|
|
if(wi < i)
|
|
{
|
|
self[i][li] = -10000.0f;
|
|
continue;
|
|
}
|
|
|
|
float4 acc = 0.0f;
|
|
__attribute__((opencl_unroll_hint))
|
|
for(uint k = 0; k < kn; ++k)
|
|
{
|
|
float4
|
|
qry = token[wi].qry.attn[li][k],
|
|
key = token[i].key.attn[li][k];
|
|
|
|
acc += qry * key;
|
|
}
|
|
|
|
const float
|
|
sum = ircd_simt_reduce_add_f4(acc),
|
|
res = sum / 8.0f;
|
|
|
|
self[i][li] = res;
|
|
}
|
|
|
|
// Three-piece softmax
|
|
ircd_gpt_attn_self_samax(ctrl, opts, self, wn);
|
|
}
|
|
|
|
// Propagate to full width for value dot prod.
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
float4 acc = 0.0f;
|
|
__attribute__((opencl_unroll_hint))
|
|
for(uint i = 0; i < wi; ++i)
|
|
{
|
|
const float4
|
|
attn = self[i][ti],
|
|
val = token[i].val.attn[ti][ki];
|
|
|
|
acc += attn * val;
|
|
}
|
|
|
|
out->attn[ti][ki] = acc;
|
|
}
|
|
|
|
inline void
|
|
__attribute__((flatten, always_inline))
|
|
ircd_gpt_attn_proj(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__local union ircd_gpt_tokenv *const out,
|
|
__local const union ircd_gpt_tokenv *const xattn,
|
|
__global const float4 *const restrict bias,
|
|
__global const float4 *const restrict weight)
|
|
{
|
|
const uint
|
|
ln = get_local_size(0),
|
|
wi = get_global_offset(0) / ln + get_group_id(0),
|
|
width = opts->attn_height, // same
|
|
height = opts->attn_height;
|
|
|
|
// Projection
|
|
ircd_gpt_tmul(out->word, xattn->word, bias, weight, width, height, 1);
|
|
}
|
|
|
|
__kernel void
|
|
__attribute__((flatten))
|
|
ircd_gpt_coil(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global union ircd_gpt_tokenv *const restrict accum,
|
|
__global const struct ircd_gpt_attn_qkvv *const restrict state,
|
|
__global const float4 *const restrict attn_proj_bias,
|
|
__global const float4 *const restrict attn_proj_weight,
|
|
__global const float4 *const restrict ffnn_norm_bias,
|
|
__global const float4 *const restrict ffnn_norm_weight,
|
|
__global const float4 *const restrict ffnn_fcon_bias,
|
|
__global const float4 *const restrict ffnn_fcon_weight,
|
|
__global const float4 *const restrict ffnn_proj_bias,
|
|
__global const float4 *const restrict ffnn_proj_weight)
|
|
{
|
|
const uint
|
|
li = get_local_id(0),
|
|
ln = get_local_size(0),
|
|
wi = get_global_offset(0) / ln + get_group_id(0);
|
|
|
|
__local union ircd_gpt_tokenv
|
|
buf1, buf0;
|
|
|
|
__local union
|
|
{
|
|
union ircd_gpt_ffnn_aperaturev
|
|
ffnn_fcon[2];
|
|
|
|
float
|
|
attn_self[512][12];
|
|
}
|
|
buf;
|
|
|
|
// Self-attention backend; this computes the self-attention result now
|
|
// that keys and values are globally visible across tokens.
|
|
ircd_gpt_attn_self
|
|
(
|
|
ctrl,
|
|
opts,
|
|
&buf1,
|
|
buf.attn_self,
|
|
state
|
|
);
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
// Project result of self-attention.
|
|
ircd_gpt_attn_proj
|
|
(
|
|
ctrl,
|
|
opts,
|
|
&buf0,
|
|
&buf1,
|
|
attn_proj_bias,
|
|
attn_proj_weight
|
|
);
|
|
|
|
// Frontend accumulation
|
|
{
|
|
const float4
|
|
attn = buf0.word[li],
|
|
resid = accum[wi].word[li];
|
|
|
|
buf0.word[li] += resid;
|
|
accum[wi].word[li] += attn;
|
|
}
|
|
|
|
// Backend mlp; layer-norm acquires any pending writes, no fence required.
|
|
ircd_gpt_ffnn
|
|
(
|
|
ctrl,
|
|
opts,
|
|
&buf0,
|
|
buf.ffnn_fcon + 0,
|
|
buf.ffnn_fcon + 1,
|
|
&buf1,
|
|
ffnn_norm_bias,
|
|
ffnn_norm_weight,
|
|
ffnn_fcon_bias,
|
|
ffnn_fcon_weight,
|
|
ffnn_proj_bias,
|
|
ffnn_proj_weight
|
|
);
|
|
|
|
// Backend accumulation
|
|
{
|
|
const float4
|
|
ffnn = buf0.word[li],
|
|
resid = accum[wi].word[li],
|
|
result = ffnn + resid;
|
|
|
|
accum[wi].word[li] = result;
|
|
}
|
|
}
|
|
|
|
__kernel void
|
|
__attribute__((flatten))
|
|
ircd_gpt_attn_fcon(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global union ircd_gpt_attn_aperaturev *const restrict state,
|
|
__global const union ircd_gpt_tokenv *const restrict accum,
|
|
__global const float4 *const restrict norm_bias,
|
|
__global const float4 *const restrict norm_weight,
|
|
__global const float4 *const restrict fcon_bias,
|
|
__global const float4 *const restrict fcon_weight)
|
|
{
|
|
const uint
|
|
li = get_local_id(0),
|
|
ln = get_local_size(0),
|
|
wi = get_global_offset(0) / ln + get_group_id(0),
|
|
width = opts->attn_width,
|
|
height = opts->attn_height,
|
|
tiles = opts->attn_mult;
|
|
|
|
__local union ircd_gpt_attn_aperaturev
|
|
token;
|
|
|
|
__local float4
|
|
tmp[768/4];
|
|
|
|
token.word[li] = accum[wi].word[li];
|
|
|
|
// Layer re-normalization
|
|
ircd_simt_math_norm_f4lldr(token.word, token.word, tmp);
|
|
ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
|
|
|
|
// Ln's writes are still pending; fcon requires results across threads.
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
// Fully connected
|
|
ircd_gpt_tmul(token.fcon, tmp, fcon_bias, fcon_weight, width, height, tiles);
|
|
|
|
// Export queries, keys, and values.
|
|
for(uint i = 0; i < tiles; ++i)
|
|
state[wi].proj[i][li] = token.proj[i][li];
|
|
}
|
|
|
|
//
|
|
// frontend
|
|
//
|
|
|
|
inline void
|
|
__attribute__((always_inline))
|
|
_ircd_gpt_lm_embed(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global union ircd_gpt_tokenv *const restrict accum,
|
|
__global const union ircd_gpt_tokenv *const restrict pos,
|
|
__global const union ircd_gpt_tokenv *const restrict vocab,
|
|
const uint out_idx,
|
|
const uint tok_idx,
|
|
const uint word_idx)
|
|
{
|
|
const ushort
|
|
ring_idx = (ctrl->tokens.head + tok_idx) % opts->buffer_tokens,
|
|
token = ctrl->token[ring_idx];
|
|
|
|
const float4
|
|
wte = vocab[token].word[word_idx],
|
|
wpe = pos[tok_idx].word[word_idx];
|
|
|
|
accum[out_idx].word[word_idx] = wte + wpe;
|
|
}
|
|
|
|
__kernel void
|
|
__attribute__((flatten))
|
|
ircd_gpt_lm_embed(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global union ircd_gpt_tokenv *const restrict accum,
|
|
__global const union ircd_gpt_tokenv *const restrict pos,
|
|
__global const union ircd_gpt_tokenv *const restrict vocab)
|
|
{
|
|
const uint
|
|
li = get_local_id(0),
|
|
ln = get_local_size(0),
|
|
wi = get_global_offset(0) / ln + get_group_id(0);
|
|
|
|
_ircd_gpt_lm_embed(ctrl, opts, accum, pos, vocab, wi, wi, li);
|
|
}
|
|
|
|
__kernel void
|
|
__attribute__((flatten))
|
|
ircd_gpt_lm_norm(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global union ircd_gpt_tokenv *const restrict accum,
|
|
__global const float4 *const restrict norm_bias,
|
|
__global const float4 *const restrict norm_weight)
|
|
{
|
|
const uint
|
|
li = get_local_id(0),
|
|
ln = get_local_size(0),
|
|
wi = get_global_offset(0) / ln + get_group_id(0);
|
|
|
|
__local union ircd_gpt_tokenv
|
|
token, tmp;
|
|
|
|
token.word[li] = accum[wi].word[li];
|
|
|
|
// Final re-normalization
|
|
ircd_simt_math_norm_f4lldr(token.word, token.word, tmp.word);
|
|
ircd_gpt_norm_fmad(token.word, token.word, norm_bias, norm_weight, li);
|
|
|
|
accum[wi].word[li] = token.word[li];
|
|
}
|
|
|
|
__kernel void
|
|
__attribute__((flatten))
|
|
ircd_gpt_lm_logit(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global float *const restrict logit,
|
|
__global const union ircd_gpt_tokenv *const restrict accum,
|
|
__global const union ircd_gpt_tokenv *const restrict token)
|
|
{
|
|
const uint
|
|
gi = get_global_id(0),
|
|
ti = ctrl->tokens.count - 1,
|
|
words = opts->embed_width;
|
|
|
|
float4 acc = 0.0f;
|
|
__attribute__((opencl_unroll_hint))
|
|
for(uint j = 0; j < words; ++j)
|
|
{
|
|
const float4
|
|
in = accum[ti].word[j],
|
|
vocab = token[gi].word[j];
|
|
|
|
acc += vocab * in;
|
|
}
|
|
|
|
const float
|
|
ret = ircd_simt_reduce_add_f4(acc);
|
|
|
|
if(gi < opts->logits)
|
|
logit[gi] = ret;
|
|
else
|
|
logit[gi] = -10000.0f;
|
|
}
|
|
|
|
__kernel void
|
|
__attribute__((flatten))
|
|
ircd_gpt_lm_logsm(__global struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global float4 *const restrict logsm,
|
|
__global float4 *const restrict logexp,
|
|
__global const float4 *const restrict logit)
|
|
{
|
|
const uint
|
|
li = get_local_id(0),
|
|
ln = get_local_size(0),
|
|
logits = opts->logits,
|
|
logits_alignup = logits + (ln - (logits % ln)),
|
|
tn = logits_alignup / ln / 4,
|
|
ti = tn * li;
|
|
|
|
__local float share[256];
|
|
__local float4 share4[256];
|
|
|
|
share4[li] = -10000.0f;
|
|
for(uint i = ti; i < ti + tn; ++i)
|
|
share4[li] = max(share4[li], logit[i]);
|
|
|
|
share[li] = -10000.0f;
|
|
for(uint k = 0; k < 4; ++k)
|
|
share[li] = max(share[li], share4[li][k]);
|
|
|
|
ircd_simt_reduce_max_flldr(share);
|
|
|
|
if(li == 0)
|
|
share4[li] = ctrl->samax.mu = share[li];
|
|
|
|
ircd_simt_broadcast_f4lldr(share4);
|
|
|
|
const float4
|
|
mu = share4[li];
|
|
|
|
share4[li] = 0.0f;
|
|
for(uint i = ti; i < ti + tn; ++i)
|
|
{
|
|
const float4
|
|
reg = logit[i] - mu;
|
|
|
|
float4 res;
|
|
for(uint k = 0; k < 4; ++k)
|
|
if(i * 4 + k < logits)
|
|
res[k] = exp(reg[k]);
|
|
else
|
|
res[k] = 0.0f;
|
|
|
|
share4[li] += res;
|
|
logexp[i] = res;
|
|
}
|
|
|
|
ircd_simt_reduce_add_f4lldr(share4);
|
|
|
|
if(li == 0)
|
|
{
|
|
const float
|
|
sum = ircd_simt_reduce_add_f4(share4[li]);
|
|
|
|
share4[li][0] = ctrl->samax.sum = sum;
|
|
share4[li][1] = ctrl->samax.lambda = 1.0f / sum;
|
|
}
|
|
|
|
ircd_simt_broadcast_f4lldr(share4);
|
|
|
|
const float4
|
|
sum = share4[li][0],
|
|
lambda = share4[li][1];
|
|
|
|
for(uint i = ti; i < ti + tn; ++i)
|
|
logsm[i] = logexp[i] * lambda;
|
|
}
|
|
|
|
inline void
|
|
__attribute__((always_inline))
|
|
ircd_gpt_lm_result_top(__global struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__local const ushort *const restrict idx,
|
|
__global const float *const restrict logsm,
|
|
__global const float *const restrict logit,
|
|
const uint i)
|
|
{
|
|
const ushort
|
|
token = idx[i];
|
|
|
|
const float
|
|
samax = logsm[token];
|
|
|
|
ctrl->top[i].token = token;
|
|
ctrl->top[i].samax = samax;
|
|
}
|
|
|
|
inline void
|
|
__attribute__((always_inline))
|
|
ircd_gpt_lm_result_label(__global struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__local const ushort *const restrict idx,
|
|
__global const float *const restrict logsm,
|
|
__global const float *const restrict logit,
|
|
const uint i)
|
|
{
|
|
__global struct ircd_gpt_ctrl_label
|
|
*const label = ctrl->label + i;
|
|
|
|
const ushort
|
|
token = label->token,
|
|
sum_sel = ctrl->epic.cycle % 3;
|
|
|
|
const float
|
|
samax = logsm[token],
|
|
mean_div = ctrl->epic.cycle + 1.0f;
|
|
|
|
const float
|
|
loss = 0.0f - log(samax),
|
|
loss_sum = label->loss.sum[0] + label->loss.sum[1] + label->loss.sum[2] + loss,
|
|
loss_mean = loss_sum / mean_div;
|
|
|
|
const float
|
|
perp = (1.0f - samax) * native_log2(opts->logits),
|
|
perp_sum = label->perp.sum[0] + label->perp.sum[1] + label->perp.sum[2] + perp,
|
|
perp_mean = perp_sum / mean_div;
|
|
|
|
label->samax = samax;
|
|
|
|
label->loss.last = loss;
|
|
label->loss.sum[sum_sel] += loss;
|
|
label->loss.mean = loss_mean;
|
|
|
|
label->perp.last = perp;
|
|
label->perp.sum[sum_sel] += perp;
|
|
label->perp.mean = perp_mean;
|
|
}
|
|
|
|
inline void
|
|
__attribute__((always_inline))
|
|
ircd_gpt_lm_result_select(__global struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__local const ushort *const restrict idx,
|
|
__global const float *const restrict logsm,
|
|
__global const float *const restrict logexp,
|
|
__global const float *const restrict logit)
|
|
{
|
|
const bool
|
|
buffer_full = ctrl->tokens.count >= opts->buffer_tokens;
|
|
|
|
const ulong
|
|
rnd = ircd_simt_rand_xoshiro256pg(ctrl->rand),
|
|
ent_k = max(opts->top_k, 1U),
|
|
ent_p = max(1U, min(opts->top_p, 100U));
|
|
|
|
const float
|
|
thresh = (rnd % ent_p) / 100.0f;
|
|
|
|
ushort select = 0;
|
|
float smacc = 0.0f;
|
|
for(; select < opts->top_k; ++select)
|
|
if((smacc += logsm[idx[select]]) > thresh)
|
|
break;
|
|
|
|
const ushort
|
|
token = idx[select],
|
|
dest = (ctrl->tokens.head + ctrl->tokens.count) % opts->buffer_tokens,
|
|
tokens = min(ctrl->tokens.count + 1, opts->buffer_tokens),
|
|
head = buffer_full?
|
|
(ctrl->tokens.head + 1) % opts->buffer_tokens: ctrl->tokens.head;
|
|
|
|
ctrl->tokens.head = head;
|
|
ctrl->tokens.count = tokens;
|
|
ctrl->token[dest] = token;
|
|
}
|
|
|
|
inline void
|
|
__attribute__((always_inline))
|
|
ircd_gpt_leave(__global struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
const uint li)
|
|
{
|
|
if(ctrl->epic.cycle + 1 >= opts->limit)
|
|
ctrl->epic.epoch += 1;
|
|
|
|
ctrl->epic.cycle += 1;
|
|
ctrl->magic = 0xC7012C70U;
|
|
}
|
|
|
|
__kernel void
|
|
__attribute__((flatten))
|
|
ircd_gpt_lm_select(__global struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global const float *const restrict logsm,
|
|
__global const float *const restrict logit)
|
|
{
|
|
const uint
|
|
li = get_local_id(0),
|
|
ln = get_local_size(0),
|
|
tn = opts->logits / ln,
|
|
ti = tn * li;
|
|
|
|
__local ushort idx[256];
|
|
|
|
idx[li] = ti;
|
|
for(uint j = ti + 1; j < ti + tn; ++j)
|
|
if(logsm[j] > logsm[idx[li]])
|
|
idx[li] = j;
|
|
|
|
ircd_simt_sort_idx16_flldr(idx, logsm);
|
|
|
|
if(li < opts->top_n)
|
|
ircd_gpt_lm_result_top(ctrl, opts, idx, logsm, logexp, logit, li);
|
|
|
|
if(li < opts->labels)
|
|
ircd_gpt_lm_result_label(ctrl, opts, idx, logsm, logexp, logit, li);
|
|
|
|
// Writes to `idx` from the sort are still pending across threads.
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
// Mask for write-leader
|
|
if(li == 0)
|
|
ircd_gpt_lm_result_select(ctrl, opts, idx, logsm, logexp, logit);
|
|
|
|
if(li != 0)
|
|
return;
|
|
|
|
ircd_gpt_leave(ctrl, opts, li);
|
|
}
|
|
|
|
//
|
|
// backpropagations
|
|
//
|
|
|
|
inline void
|
|
__attribute__((always_inline))
|
|
ircd_gpt_prop_elem(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global float4 *const restrict param_,
|
|
__global float4 *const restrict exp_avg_,
|
|
__global float4 *const restrict exp_avg_sqr_)
|
|
{
|
|
const uint
|
|
li = get_local_id(0),
|
|
step = ctrl->epic.step;
|
|
|
|
const float4
|
|
param = param_[li],
|
|
grad = ctrl->label[0].loss.mean,
|
|
alpha[2] = { 1.0f - opts->beta[0], 1.0f - opts->beta[1], },
|
|
exp_avg = step? exp_avg_[li]: 0.0f,
|
|
exp_avg_sqr = step? exp_avg_sqr_[li]: 0.0f,
|
|
exp_avg_mul = exp_avg * opts->beta[0],
|
|
exp_avg_dot = exp_avg_mul + alpha[0] * grad,
|
|
exp_avg_sqr_mul = exp_avg_sqr * opts->beta[1],
|
|
exp_avg_sqr_dot = exp_avg_sqr_mul + alpha[1] * grad * grad,
|
|
denom = sqrt(exp_avg_sqr_dot) + opts->epsilon,
|
|
delta = opts->alpha * (exp_avg_dot / denom),
|
|
update = param - delta;
|
|
|
|
param_[li] = update;
|
|
exp_avg_[li] = exp_avg_dot;
|
|
exp_avg_sqr_[li] = exp_avg_sqr_dot;
|
|
}
|
|
|
|
__kernel void
|
|
__attribute__((flatten))
|
|
ircd_gpt_norm_prop(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global union ircd_gpt_tokenv *const restrict bias,
|
|
__global union ircd_gpt_tokenv *const restrict bias_m0,
|
|
__global union ircd_gpt_tokenv *const restrict bias_m1,
|
|
__global union ircd_gpt_tokenv *const restrict weight,
|
|
__global union ircd_gpt_tokenv *const restrict weight_m0,
|
|
__global union ircd_gpt_tokenv *const restrict weight_m1)
|
|
{
|
|
ircd_gpt_prop_elem
|
|
(
|
|
ctrl, opts,
|
|
bias->word,
|
|
bias_m0->word,
|
|
bias_m1->word
|
|
);
|
|
|
|
ircd_gpt_prop_elem
|
|
(
|
|
ctrl, opts,
|
|
weight->word,
|
|
weight_m0->word,
|
|
weight_m1->word
|
|
);
|
|
}
|
|
|
|
__kernel void
|
|
__attribute__((flatten))
|
|
ircd_gpt_coil_prop_attn(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global union ircd_gpt_tokenv *const restrict norm_bias,
|
|
__global union ircd_gpt_tokenv *const restrict norm_bias_m0,
|
|
__global union ircd_gpt_tokenv *const restrict norm_bias_m1,
|
|
__global union ircd_gpt_tokenv *const restrict norm_weight,
|
|
__global union ircd_gpt_tokenv *const restrict norm_weight_m0,
|
|
__global union ircd_gpt_tokenv *const restrict norm_weight_m1,
|
|
__global union ircd_gpt_attn_aperaturev *const restrict fcon_bias,
|
|
__global union ircd_gpt_attn_aperaturev *const restrict fcon_bias_m0,
|
|
__global union ircd_gpt_attn_aperaturev *const restrict fcon_bias_m1,
|
|
__global union ircd_gpt_attn_aperaturev *const restrict fcon_weight,
|
|
__global union ircd_gpt_attn_aperaturev *const restrict fcon_weight_m0,
|
|
__global union ircd_gpt_attn_aperaturev *const restrict fcon_weight_m1,
|
|
__global union ircd_gpt_tokenv *const restrict proj_bias,
|
|
__global union ircd_gpt_tokenv *const restrict proj_bias_m0,
|
|
__global union ircd_gpt_tokenv *const restrict proj_bias_m1,
|
|
__global union ircd_gpt_tokenv *const restrict proj_weight,
|
|
__global union ircd_gpt_tokenv *const restrict proj_weight_m0,
|
|
__global union ircd_gpt_tokenv *const restrict proj_weight_m1)
|
|
{
|
|
ircd_gpt_norm_prop
|
|
(
|
|
ctrl, opts,
|
|
norm_bias,
|
|
norm_bias_m0,
|
|
norm_bias_m1,
|
|
norm_weight,
|
|
norm_weight_m0,
|
|
norm_weight_m1
|
|
);
|
|
|
|
for(uint j = 0; j < 3; ++j)
|
|
ircd_gpt_prop_elem
|
|
(
|
|
ctrl, opts,
|
|
fcon_bias->proj[j],
|
|
fcon_bias_m0->proj[j],
|
|
fcon_bias_m1->proj[j]
|
|
);
|
|
|
|
for(uint i = 0; i < 768; ++i)
|
|
for(uint j = 0; j < 3; ++j)
|
|
ircd_gpt_prop_elem
|
|
(
|
|
ctrl, opts,
|
|
fcon_weight[i].proj[j],
|
|
fcon_weight_m0[i].proj[j],
|
|
fcon_weight_m1[i].proj[j]
|
|
);
|
|
|
|
ircd_gpt_prop_elem
|
|
(
|
|
ctrl, opts,
|
|
proj_bias->word,
|
|
proj_bias_m0->word,
|
|
proj_bias_m1->word
|
|
);
|
|
|
|
for(uint i = 0; i < 768; ++i)
|
|
ircd_gpt_prop_elem
|
|
(
|
|
ctrl, opts,
|
|
proj_weight[i].word,
|
|
proj_weight_m0[i].word,
|
|
proj_weight_m1[i].word
|
|
);
|
|
}
|
|
|
|
__kernel void
|
|
__attribute__((flatten))
|
|
ircd_gpt_coil_prop_ffnn(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global union ircd_gpt_tokenv *const restrict norm_bias,
|
|
__global union ircd_gpt_tokenv *const restrict norm_bias_m0,
|
|
__global union ircd_gpt_tokenv *const restrict norm_bias_m1,
|
|
__global union ircd_gpt_tokenv *const restrict norm_weight,
|
|
__global union ircd_gpt_tokenv *const restrict norm_weight_m0,
|
|
__global union ircd_gpt_tokenv *const restrict norm_weight_m1,
|
|
__global union ircd_gpt_ffnn_aperaturev *const restrict fcon_bias,
|
|
__global union ircd_gpt_ffnn_aperaturev *const restrict fcon_bias_m0,
|
|
__global union ircd_gpt_ffnn_aperaturev *const restrict fcon_bias_m1,
|
|
__global union ircd_gpt_ffnn_aperaturev *const restrict fcon_weight,
|
|
__global union ircd_gpt_ffnn_aperaturev *const restrict fcon_weight_m0,
|
|
__global union ircd_gpt_ffnn_aperaturev *const restrict fcon_weight_m1,
|
|
__global union ircd_gpt_tokenv *const restrict proj_bias,
|
|
__global union ircd_gpt_tokenv *const restrict proj_bias_m0,
|
|
__global union ircd_gpt_tokenv *const restrict proj_bias_m1,
|
|
__global union ircd_gpt_tokenv *const restrict proj_weight,
|
|
__global union ircd_gpt_tokenv *const restrict proj_weight_m0,
|
|
__global union ircd_gpt_tokenv *const restrict proj_weight_m1)
|
|
{
|
|
ircd_gpt_norm_prop
|
|
(
|
|
ctrl, opts,
|
|
norm_bias,
|
|
norm_bias_m0,
|
|
norm_bias_m1,
|
|
norm_weight,
|
|
norm_weight_m0,
|
|
norm_weight_m1
|
|
);
|
|
|
|
for(uint j = 0; j < 4; ++j)
|
|
ircd_gpt_prop_elem
|
|
(
|
|
ctrl, opts,
|
|
fcon_bias->proj[j],
|
|
fcon_bias_m0->proj[j],
|
|
fcon_bias_m1->proj[j]
|
|
);
|
|
|
|
for(uint i = 0; i < 768; ++i)
|
|
for(uint j = 0; j < 4; ++j)
|
|
ircd_gpt_prop_elem
|
|
(
|
|
ctrl, opts,
|
|
fcon_weight[i].proj[j],
|
|
fcon_weight_m0[i].proj[j],
|
|
fcon_weight_m1[i].proj[j]
|
|
);
|
|
|
|
ircd_gpt_prop_elem
|
|
(
|
|
ctrl, opts,
|
|
proj_bias->word,
|
|
proj_bias_m0->word,
|
|
proj_bias_m1->word
|
|
);
|
|
|
|
for(uint i = 0; i < 3072; ++i)
|
|
ircd_gpt_prop_elem
|
|
(
|
|
ctrl, opts,
|
|
proj_weight[i].word,
|
|
proj_weight_m0[i].word,
|
|
proj_weight_m1[i].word
|
|
);
|
|
}
|
|
|
|
__kernel void
|
|
__attribute__((flatten))
|
|
ircd_gpt_lm_embed_prop(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global union ircd_gpt_tokenv *const restrict pos,
|
|
__global union ircd_gpt_tokenv *const restrict pos_m0,
|
|
__global union ircd_gpt_tokenv *const restrict pos_m1,
|
|
__global union ircd_gpt_tokenv *const restrict token,
|
|
__global union ircd_gpt_tokenv *const restrict token_m0,
|
|
__global union ircd_gpt_tokenv *const restrict token_m1)
|
|
{
|
|
const uint
|
|
gn = get_global_size(0),
|
|
ln = get_local_size(0),
|
|
wi = get_global_offset(0) / ln + get_group_id(0),
|
|
wn = ctrl->tokens.count,
|
|
cn = opts->context_tokens / wn,
|
|
ci = cn * wi,
|
|
tn = opts->logits / wn,
|
|
ti = tn * wi;
|
|
|
|
for(uint i = ci; i < ci + cn; ++i)
|
|
ircd_gpt_prop_elem
|
|
(
|
|
ctrl, opts,
|
|
pos[i].word,
|
|
pos_m0[i].word,
|
|
pos_m1[i].word
|
|
);
|
|
|
|
for(uint i = ti; i < ti + tn; ++i)
|
|
ircd_gpt_prop_elem
|
|
(
|
|
ctrl, opts,
|
|
token[i].word,
|
|
token_m0[i].word,
|
|
token_m1[i].word
|
|
);
|
|
}
|