// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
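
// The kernels below appear to target a GPT-2 "small" geometry: the hard-coded
// sizes seen throughout (768-wide hidden state, 3072-wide FFNN, 2304-wide QKV
// projection, 12 heads of 64 dims, 50257 logits) all match that model family.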
inline void
ircd_gpt_norm_fmad(__local float4 *const out,
                   __local const float4 *const in,
                   __global const float4 *const restrict bias,
                   __global const float4 *const restrict weight,
                   const uint i)
{
    out[i] = in[i] * weight[i] + bias[i];
}
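
// Each work-item i below produces one float4 (four output columns) of the
// vector-matrix product out = in^T * W + bias, reading the whole input vector:
// `width` is the number of float4 column groups, `height` the number of float4
// input elements, and `tiles` only changes the traversal order of the input
// rows (presumably for locality); the accumulated sum is the same either way.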
// Matrix * Vector Multiply/Accumulate
inline void
ircd_gpt_sgemv(__local float4 *const restrict out,
               __local const float4 *const restrict in,
               __global const float4 *const restrict bias,
               __global const float4 *const restrict weight,
               const uint width,
               const uint height,
               const uint tiles,
               const uint i)
{
    const uint seg = height / tiles;

    float4 acc = bias[i];
    for(uint j = 0; j < seg; ++j)
        for(uint t = 0; t < tiles; ++t)
            for(uint k = 0; k < 4; ++k)
            {
                const uint
                jidx = t * seg + j,
                kidx = jidx * 4 + k,
                widx = kidx * width + i;

                acc += weight[widx] * in[jidx][k];
            }

    out[i] = acc;
}
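
// Tanh approximation of the Gaussian Error Linear Unit, evaluated per lane:
//
//   gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
//
// where 0.7978845608f below is sqrt(2/pi).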
inline void
ircd_gpt_gelu(__local float4 *const out,
              __local const float4 *const in_,
              const uint i)
{
    float4 a,
    in = in_[i];

    a = 0.044715f;
    a *= in;
    a *= in;
    a += 1.0f;
    a *= 0.7978845608f;
    a *= in;
    a = tanh(a);
    a += 1.0f;
    a *= in;
    a *= 0.5f;

    out[i] = a;
}
//
// core
//
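
// Feed-forward block of one transformer layer: pre-normalize the residual
// stream, expand 768 -> 3072 through the fully-connected weights, apply GELU,
// project 3072 -> 768, then add the result back into the global accumulator.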
__kernel void
ircd_gpt_ffnn(__global const struct ircd_gpt_task *const ctrl,
              __constant const struct ircd_gpt_opts *const opts,
              __global union ircd_gpt_tokenv *const restrict accum,
              __global const float4 *const restrict norm_bias,
              __global const float4 *const restrict norm_weight,
              __global const float4 *const restrict fcon_bias,
              __global const float4 *const restrict fcon_weight,
              __global const float4 *const restrict proj_bias,
              __global const float4 *const restrict proj_weight)
{
    const uint
    gi = get_global_id(0),
    gn = get_global_size(0),
    li = get_local_id(0),
    ln = get_local_size(0),
    wi = get_group_id(0),
    wn = get_num_groups(0);

    __local union ircd_gpt_aperaturev token;
    __local float4 tmp[768/4];

    // Fetch local copy of the global accumulator. We operate on a cached
    // copy as input, and add our output to the global upon completion.
    token.word[li] = accum[wi].word[li];

    // Layer re-normalization
    ircd_simt_math_norm_f4lldr(token.word, token.word, tmp, ln, li);
    ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);

    // Fully connected
    for(uint i = 0; i < 4; ++i)
        ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 3072/4, 768/4, 4, i * ln + li);

    // Gaussian Error Linear Unit
    for(uint i = 0; i < 4; ++i)
        ircd_gpt_gelu(token.fcon, token.fcon, i * ln + li);

    // Projection
    ircd_gpt_sgemv(tmp, token.fcon, proj_bias, proj_weight, 768/4, 3072/4, 4, li);

    // Accumulation; end of layer
    accum[wi].word[li] += tmp[li];
}
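
// Output projection for the attention block: the per-head attention results
// gathered in xattn are projected 768 -> 768 and added back into the residual
// accumulator for this token (work-group).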
__kernel void
ircd_gpt_attn_proj(__global const struct ircd_gpt_task *const ctrl,
                   __constant const struct ircd_gpt_opts *const opts,
                   __global union ircd_gpt_tokenv *const restrict accum,
                   __local const union ircd_gpt_tokenv *const restrict xattn,
                   __global const float4 *const restrict proj_bias,
                   __global const float4 *const restrict proj_weight)
{
    const uint
    gi = get_global_id(0),
    gn = get_global_size(0),
    li = get_local_id(0),
    ln = get_local_size(0),
    wi = get_group_id(0),
    wn = get_num_groups(0);

    __local float4
    in[768/4],
    out[768/4];

    // Fetch
    in[li] = xattn->word[li];

    // Need this here if xattn is __local
    barrier(CLK_LOCAL_MEM_FENCE);

    // Projection
    ircd_gpt_sgemv(out, in, proj_bias, proj_weight, 768/4, 768/4, 1, li);

    // Accumulation; end of layer
    accum[wi].word[li] += out[li];
}
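
// Masked self-attention for one token (work-group wi), one head per work-item
// (li in [0,12)): raw scores are q.k sums over the 64-dim head, scaled by
// 1/8 = 1/sqrt(64); masked-out positions are pinned to -10000 so they vanish
// under exp(); the scores are then softmax-normalized over all wn positions
// and used to take a weighted sum of the value vectors.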
__kernel void
ircd_gpt_attn_self(__global const struct ircd_gpt_task *const ctrl,
                   __constant const struct ircd_gpt_opts *const opts,
                   __local union ircd_gpt_tokenv *const restrict out,
                   __global const struct ircd_gpt_qkvv *const restrict token,
                   __global const struct ircd_gpt_attn_mask *const restrict mask) // [1024][1024],
{
    const uint
    gi = get_global_id(0),
    gn = get_global_size(0),
    li = get_local_id(0),
    ln = get_local_size(0),
    wi = get_group_id(0),
    wn = get_num_groups(0);

    __local union
    {
        float
        attn[12][96];
    }
    self;

    for(uint i = 0; i < wn; ++i)
        if(mask[wi].token[i])
            self.attn[li][i] = 0.0f;
        else
            self.attn[li][i] = -10000.0f;

    for(uint i = 0; i < wn; ++i)
        if(mask[wi].token[i])
            for(uint j = 0; j < 64/4; ++j)
            {
                float4
                qry = token[wi].qry.attn[li][j],
                key = token[i].key.attn[li][j],
                res = qry * key;

                for(uint k = 0; k < 4; ++k)
                    self.attn[li][i] += res[k];
            }

    for(uint i = 0; i < wn; ++i)
        if(mask[wi].token[i])
            self.attn[li][i] /= 8.0f;

    for(uint i = 0; i < wn; ++i)
        self.attn[li][i] = exp(self.attn[li][i]);

    float4 vacc = 0.0f;
    for(uint i = 0; i < wn; ++i)
        vacc[i % 4] += self.attn[li][i];

    float acc = 0.0f;
    for(uint i = 0; i < 4; ++i)
        acc += vacc[i];

    for(uint i = 0; i < wn; ++i)
        self.attn[li][i] /= acc;

    for(uint j = 0; j < 64/4; ++j)
        out->attn[li][j] = 0.0f;

    for(uint i = 0; i < wn; ++i)
        for(uint j = 0; j < 64/4; ++j)
            out->attn[li][j] += token[i].val.attn[li][j] * self.attn[li][i];
}
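
// Attention input projection: pre-normalize the token, then a fully-connected
// 768 -> 2304 step produces the concatenated query/key/value vectors, which
// are exported per token for the self-attention kernel above.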
__kernel void
ircd_gpt_attn_fcon(__global const struct ircd_gpt_task *const ctrl,
                   __constant const struct ircd_gpt_opts *const opts,
                   __global union ircd_gpt_aperaturev *const restrict out,
                   __global const union ircd_gpt_tokenv *const restrict in,
                   __global const float4 *const restrict norm_bias,
                   __global const float4 *const restrict norm_weight,
                   __global const float4 *const restrict fcon_bias,
                   __global const float4 *const restrict fcon_weight)
{
    const uint
    gi = get_global_id(0),
    gn = get_global_size(0),
    li = get_local_id(0),
    ln = get_local_size(0),
    wi = get_group_id(0),
    wn = get_num_groups(0);

    __local union ircd_gpt_aperaturev token;
    __local float4 tmp[768/4];

    token.word[li] = in[wi].word[li];

    // Layer re-normalization
    ircd_simt_math_norm_f4lldr(token.word, token.word, tmp, ln, li);
    ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);

    // Fully connected
    for(uint i = 0; i < 3; ++i)
        ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 2304/4, 768/4, 4, i * ln + li);

    // Export queries, keys, and values.
    for(uint i = 0; i < 3; ++i)
        out[wi].proj[i][li] = token.proj[i][li];
}
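
// One full transformer layer for one token: masked self-attention, attention
// output projection (with residual accumulation), then the feed-forward block
// (with residual accumulation).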
__kernel void
ircd_gpt_coil(__global const struct ircd_gpt_task *const ctrl,
              __constant const struct ircd_gpt_opts *const opts,
              __global union ircd_gpt_tokenv *const restrict accum,
              __global const struct ircd_gpt_qkvv *const restrict state,
              __global const struct ircd_gpt_attn_mask *const restrict mask, // [1024][1024],
              __global const float4 *const restrict attn_proj_bias,
              __global const float4 *const restrict attn_proj_weight,
              __global const float4 *const restrict ffnn_norm_bias,
              __global const float4 *const restrict ffnn_norm_weight,
              __global const float4 *const restrict ffnn_fcon_bias,
              __global const float4 *const restrict ffnn_fcon_weight,
              __global const float4 *const restrict ffnn_proj_bias,
              __global const float4 *const restrict ffnn_proj_weight)
{
    __local union ircd_gpt_tokenv value;

    ircd_gpt_attn_self
    (
        ctrl,
        opts,
        &value,
        state,
        mask
    );

    ircd_gpt_attn_proj
    (
        ctrl,
        opts,
        accum,
        &value,
        attn_proj_bias,
        attn_proj_weight
    );

    ircd_gpt_ffnn
    (
        ctrl,
        opts,
        accum,
        ffnn_norm_bias,
        ffnn_norm_weight,
        ffnn_fcon_bias,
        ffnn_fcon_weight,
        ffnn_proj_bias,
        ffnn_proj_weight
    );
}
//
// frontend
//
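
// Input embedding for one token: the token id is read from the context ring
// buffer at (head + tok_idx) % buffer_tokens, and the output word is the sum
// of its token embedding (wte) and the position embedding (wpe) for tok_idx.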
inline void
_ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
                   __constant const struct ircd_gpt_opts *const opts,
                   __global union ircd_gpt_tokenv *const restrict out,
                   __global const union ircd_gpt_tokenv *const restrict pos,
                   __global const union ircd_gpt_tokenv *const restrict vocab,
                   const uint out_idx,
                   const uint tok_idx,
                   const uint word_idx)
{
    const ushort
    token = ctrl->token[(ctrl->head + tok_idx) % opts->buffer_tokens];

    const float4
    wte = vocab[token].word[word_idx],
    wpe = pos[tok_idx].word[word_idx];

    out[out_idx].word[word_idx] = wte + wpe;
}
__kernel void
ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
                  __constant const struct ircd_gpt_opts *const opts,
                  __global union ircd_gpt_tokenv *const restrict accum,
                  __global const union ircd_gpt_tokenv *const restrict pos,
                  __global const union ircd_gpt_tokenv *const restrict vocab)
{
    const uint
    li = get_local_id(0);

    for(uint i = 0; i < ctrl->tokens; ++i)
        _ircd_gpt_lm_embed(ctrl, opts, accum, pos, vocab, i, i, li);
}
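
// Final layer normalization: a single token vector (apparently selected by
// the host via the global work offset) is re-normalized, scaled and biased,
// and the result is written to accum[0], where ircd_gpt_lm_logit reads it.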
__kernel void
ircd_gpt_lm_norm(__global const struct ircd_gpt_task *const ctrl,
                 __constant const struct ircd_gpt_opts *const opts,
                 __global union ircd_gpt_tokenv *const restrict accum,
                 __global const float4 *const restrict norm_bias,
                 __global const float4 *const restrict norm_weight)
{
    const uint
    li = get_local_id(0),
    ln = get_local_size(0),
    wi = get_global_offset(0) / ln + get_group_id(0);

    __local union ircd_gpt_tokenv
    token, tmp;

    token.word[li] = accum[wi].word[li];

    // Final re-normalization
    ircd_simt_math_norm_f4lldr(token.word, token.word, tmp.word, ln, li);
    ircd_gpt_norm_fmad(token.word, token.word, norm_bias, norm_weight, li);

    accum[0].word[li] = token.word[li];
}
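
// One logit per work-item: the score for vocabulary entry gi is the dot
// product of the normalized final hidden state (accum[0]) with that entry's
// 768-wide row of the `token` table, presumably the same (tied) embedding
// table used by the frontend, as in GPT-2.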
__kernel void
ircd_gpt_lm_logit(__global const struct ircd_gpt_task *const ctrl,
                  __constant const struct ircd_gpt_opts *const opts,
                  __global float *const restrict logit,
                  __global const union ircd_gpt_tokenv *const restrict accum,
                  __global const union ircd_gpt_tokenv *const restrict token)
{
    const uint
    gi = get_global_id(0);

    float4 acc = 0.0f;
    for(uint j = 0; j < 768/4; ++j)
    {
        const float4
        in = accum[0].word[j],
        vocab = token[gi].word[j],
        res = vocab * in;

        acc += res;
    }

    float res = 0.0f;
    for(uint k = 0; k < 4; ++k)
        res += acc[k];

    logit[gi] = res;
}
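
// Epilogue run by lane 0 only: ctrl->call behaves as a hypercall/status word
// read back by the host; a nominal completion is signalled once the cycle
// counter reaches opts->limit, otherwise the counter simply advances.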
inline void
ircd_gpt_leave(__global struct ircd_gpt_task *const ctrl,
               __constant const struct ircd_gpt_opts *const opts,
               const uint li)
{
    // No action for other threads right now
    if(li != 0)
        return;

    // Run debug checks and assertions.
    #ifdef RB_DEBUG
    if(li == 0 && ctrl->call == IRCD_GPT_ECOMPLETE)
        if(ctrl->tokens < 2)
            ctrl->call = IRCD_GPT_ETOKENS;
    #endif

    // If the call value has been set to something other than default we
    // do nothing else here.
    if(ctrl->call != IRCD_GPT_ECOMPLETE)
        return;

    // On the last cycle, with no prior call or error code set, indicate
    // a nominal exit condition.
    if(ctrl->cycle + 1 >= opts->limit)
    {
        ctrl->call = IRCD_GPT_ACCEPT;
        ctrl->epoch += 1;
    }

    ctrl->cycle += 1;
}
inline void
ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl,
                   __constant const struct ircd_gpt_opts *const opts,
                   const uint li,
                   __local const ushort *const restrict idx)
{
    // To read from cells other than idx[0] we need this barrier.
    if(opts->top_k > 1)
        barrier(CLK_LOCAL_MEM_FENCE);

    // No action for other threads right now
    if(li != 0)
        return;

    // When the hypercall code is already set, bail here.
    if(ctrl->call != IRCD_GPT_ECOMPLETE)
        return;

    const bool
    buffer_full = ctrl->tokens >= opts->buffer_tokens;

    const ulong
    rnd = ircd_simt_rand_xoshiro256pg(ctrl->rand),
    sel = rnd % max(opts->top_k, 1U);

    const ushort
    token = idx[sel],
    token_idx = (ctrl->head + ctrl->tokens) % opts->buffer_tokens;

    ctrl->token[token_idx] = token;

    if(buffer_full)
        ctrl->head = (ctrl->head + 1) % opts->buffer_tokens;
    else
        ctrl->tokens++;
}
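
// Token selection: each of the (apparently 192) work-items scans its own
// 262-wide stripe of the 50257 logits for a local argmax (192 * 262 covers
// the whole vocabulary), the per-lane winners are sorted by logit, and the
// result/epilogue helpers above finish the cycle.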
__kernel void
ircd_gpt_lm_select(__global struct ircd_gpt_task *const ctrl,
                   __constant const struct ircd_gpt_opts *const opts,
                   __global const float *const restrict logit)
{
    const uint
    gi = get_global_id(0),
    gn = get_global_size(0),
    li = get_local_id(0),
    ln = get_local_size(0),
    wi = get_group_id(0),
    wn = get_num_groups(0),
    tn = 262,
    ti = tn * li;

    __local ushort idx[192];

    idx[li] = ti;
    for(uint j = ti + 1; j < ti + tn && j < 50257; ++j)
        if(logit[j] > logit[idx[li]])
            idx[li] = j;

    ircd_simt_sort_idx16_flldr(idx, logit, ln, li);
    ircd_gpt_lm_result(ctrl, opts, li, idx);
    ircd_gpt_leave(ctrl, opts, li);
}