// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

inline void
ircd_gpt_norm_fmad(__local float4 *const out,
                   __local const float4 *const in,
                   __global const float4 *const restrict bias,
                   __global const float4 *const restrict weight,
                   const uint i)
{
	out[i] = in[i] * weight[i] + bias[i];
}

// Matrix * Vector Multiply/Accumulate
inline void
ircd_gpt_sgemv(__local float4 *const restrict out,
               __local const float4 *const restrict in,
               __global const float4 *const restrict bias,
               __global const float4 *const restrict weight,
               const uint width,
               const uint height,
               const uint tiles,
               const uint i)
{
	const uint seg = height / tiles;

	float4 acc = bias[i];
	for(uint j = 0; j < seg; ++j)
		for(uint t = 0; t < tiles; ++t)
			for(uint k = 0; k < 4; ++k)
			{
				const uint
				jidx = t * seg + j,
				kidx = jidx * 4 + k,
				widx = kidx * width + i;

				acc += weight[widx] * in[jidx][k];
			}

	out[i] = acc;
}
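
// Gaussian Error Linear Unit. The routine below unrolls the tanh-based
// approximation commonly used for GPT-2 over one float4 lane of the
// hidden state:
//
//   gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
//
// with 0.7978845608f standing in for sqrt(2/pi).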
inline void
ircd_gpt_gelu(__local float4 *const out,
              __local const float4 *const in_,
              const uint i)
{
	float4 a,
	in = in_[i];

	a = 0.044715f;
	a *= in;
	a *= in;
	a += 1.0f;
	a *= 0.7978845608f;
	a *= in;

	a = tanh(a);
	a += 1.0f;
	a *= in;
	a *= 0.5f;

	out[i] = a;
}

//
// core
//

__kernel void
ircd_gpt_ffnn(__global const struct ircd_gpt_task *const ctrl,
              __constant const struct ircd_gpt_opts *const opts,
              __global union ircd_gpt_tokenv *const restrict accum,
              __global const float4 *const restrict norm_bias,
              __global const float4 *const restrict norm_weight,
              __global const float4 *const restrict fcon_bias,
              __global const float4 *const restrict fcon_weight,
              __global const float4 *const restrict proj_bias,
              __global const float4 *const restrict proj_weight)
{
	const uint
	gi = get_global_id(0),
	gn = get_global_size(0),
	li = get_local_id(0),
	ln = get_local_size(0),
	wi = get_group_id(0),
	wn = get_num_groups(0);

	__local union ircd_gpt_aperaturev token;
	__local float4 tmp[768/4];

	// Fetch local copy of the global accumulator. We operate on a cached
	// copy as input, and add our output to the global upon completion.
	token.word[li] = accum[wi].word[li];

	// Layer re-normalization
	ircd_simt_math_norm_f4lldr(token.word, token.word, tmp, ln, li);
	ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);

	// Fully connected
	for(uint i = 0; i < 4; ++i)
		ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 3072/4, 768/4, 4, i * ln + li);

	// Gaussian Error Linear Unit
	for(uint i = 0; i < 4; ++i)
		ircd_gpt_gelu(token.fcon, token.fcon, i * ln + li);

	// Projection
	ircd_gpt_sgemv(tmp, token.fcon, proj_bias, proj_weight, 768/4, 3072/4, 4, li);

	// Accumulation; end of layer
	accum[wi].word[li] += tmp[li];
}

__kernel void
ircd_gpt_attn_proj(__global const struct ircd_gpt_task *const ctrl,
                   __constant const struct ircd_gpt_opts *const opts,
                   __global union ircd_gpt_tokenv *const restrict accum,
                   __local const union ircd_gpt_tokenv *const restrict xattn,
                   __global const float4 *const restrict proj_bias,
                   __global const float4 *const restrict proj_weight)
{
	const uint
	gi = get_global_id(0),
	gn = get_global_size(0),
	li = get_local_id(0),
	ln = get_local_size(0),
	wi = get_group_id(0),
	wn = get_num_groups(0);

	__local float4
	in[768/4],
	out[768/4];

	// Fetch
	in[li] = xattn->word[li];

	// Need this here if xattn is __local
	barrier(CLK_LOCAL_MEM_FENCE);

	// Projection
	ircd_gpt_sgemv(out, in, proj_bias, proj_weight, 768/4, 768/4, 1, li);

	// Accumulation; end of layer
	accum[wi].word[li] += out[li];
}
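
// Scaled dot-product self-attention over the token context. For the head
// selected by the local id and for every unmasked position, the kernel
// below accumulates the query/key dot product, divides by 8 (the square
// root of the 64-element per-head width), softmaxes across positions, and
// mixes the value vectors by the resulting weights:
//
//   attn = softmax(Q * K^T / sqrt(d_head)) * V
//
// Masked positions are seeded with -10000.0f so they underflow to zero
// under exp() and contribute nothing to the softmax.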
__kernel void
ircd_gpt_attn_self(__global const struct ircd_gpt_task *const ctrl,
                   __constant const struct ircd_gpt_opts *const opts,
                   __local union ircd_gpt_tokenv *const restrict out,
                   __global const struct ircd_gpt_qkvv *const restrict token,
                   __global const struct ircd_gpt_attn_mask *const restrict mask)  // [1024][1024]
{
	const uint
	gi = get_global_id(0),
	gn = get_global_size(0),
	li = get_local_id(0),
	ln = get_local_size(0),
	wi = get_group_id(0),
	wn = get_num_groups(0);

	__local union
	{
		float attn[12][96];
	}
	self;

	for(uint i = 0; i < wn; ++i)
		if(mask[wi].token[i])
			self.attn[li][i] = 0.0f;
		else
			self.attn[li][i] = -10000.0f;

	for(uint i = 0; i < wn; ++i)
		if(mask[wi].token[i])
			for(uint j = 0; j < 64/4; ++j)
			{
				float4
				qry = token[wi].qry.attn[li][j],
				key = token[i].key.attn[li][j],
				res = qry * key;

				for(uint k = 0; k < 4; ++k)
					self.attn[li][i] += res[k];
			}

	for(uint i = 0; i < wn; ++i)
		if(mask[wi].token[i])
			self.attn[li][i] /= 8.0f;

	for(uint i = 0; i < wn; ++i)
		self.attn[li][i] = exp(self.attn[li][i]);

	float4 vacc = 0.0f;
	for(uint i = 0; i < wn; ++i)
		vacc[i % 4] += self.attn[li][i];

	float acc = 0.0f;
	for(uint i = 0; i < 4; ++i)
		acc += vacc[i];

	for(uint i = 0; i < wn; ++i)
		self.attn[li][i] /= acc;

	for(uint j = 0; j < 64/4; ++j)
		out->attn[li][j] = 0.0f;

	for(uint i = 0; i < wn; ++i)
		for(uint j = 0; j < 64/4; ++j)
			out->attn[li][j] += token[i].val.attn[li][j] * self.attn[li][i];
}

__kernel void
ircd_gpt_attn_fcon(__global const struct ircd_gpt_task *const ctrl,
                   __constant const struct ircd_gpt_opts *const opts,
                   __global union ircd_gpt_aperaturev *const restrict out,
                   __global const union ircd_gpt_tokenv *const restrict in,
                   __global const float4 *const restrict norm_bias,
                   __global const float4 *const restrict norm_weight,
                   __global const float4 *const restrict fcon_bias,
                   __global const float4 *const restrict fcon_weight)
{
	const uint
	gi = get_global_id(0),
	gn = get_global_size(0),
	li = get_local_id(0),
	ln = get_local_size(0),
	wi = get_group_id(0),
	wn = get_num_groups(0);

	__local union ircd_gpt_aperaturev token;
	__local float4 tmp[768/4];

	token.word[li] = in[wi].word[li];

	// Layer re-normalization
	ircd_simt_math_norm_f4lldr(token.word, token.word, tmp, ln, li);
	ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);

	// Fully connected
	for(uint i = 0; i < 3; ++i)
		ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 2304/4, 768/4, 4, i * ln + li);

	// Export queries, keys, and values.
	for(uint i = 0; i < 3; ++i)
		out[wi].proj[i][li] = token.proj[i][li];
}

__kernel void
ircd_gpt_coil(__global const struct ircd_gpt_task *const ctrl,
              __constant const struct ircd_gpt_opts *const opts,
              __global union ircd_gpt_tokenv *const restrict accum,
              __global const struct ircd_gpt_qkvv *const restrict state,
              __global const struct ircd_gpt_attn_mask *const restrict mask,  // [1024][1024]
              __global const float4 *const restrict attn_proj_bias,
              __global const float4 *const restrict attn_proj_weight,
              __global const float4 *const restrict ffnn_norm_bias,
              __global const float4 *const restrict ffnn_norm_weight,
              __global const float4 *const restrict ffnn_fcon_bias,
              __global const float4 *const restrict ffnn_fcon_weight,
              __global const float4 *const restrict ffnn_proj_bias,
              __global const float4 *const restrict ffnn_proj_weight)
{
	__local union ircd_gpt_tokenv value;

	ircd_gpt_attn_self
	(
		ctrl,
		opts,
		&value,
		state,
		mask
	);

	ircd_gpt_attn_proj
	(
		ctrl,
		opts,
		accum,
		&value,
		attn_proj_bias,
		attn_proj_weight
	);

	ircd_gpt_ffnn
	(
		ctrl,
		opts,
		accum,
		ffnn_norm_bias,
		ffnn_norm_weight,
		ffnn_fcon_bias,
		ffnn_fcon_weight,
		ffnn_proj_bias,
		ffnn_proj_weight
	);
}

//
// frontend
//
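
// Input embedding. The helpers below form the standard GPT input
// representation by summing the token (wte) and position (wpe) embeddings
// for each context slot:
//
//   h0[t] = wte[token[t]] + wpe[t]
//
// The context itself lives in a ring buffer indexed from ctrl->head modulo
// opts->buffer_tokens.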
inline void
_ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
                   __constant const struct ircd_gpt_opts *const opts,
                   __global union ircd_gpt_tokenv *const restrict out,
                   __global const union ircd_gpt_tokenv *const restrict pos,
                   __global const union ircd_gpt_tokenv *const restrict vocab,
                   const uint out_idx,
                   const uint tok_idx,
                   const uint word_idx)
{
	const ushort
	token = ctrl->token[(ctrl->head + tok_idx) % opts->buffer_tokens];

	const float4
	wte = vocab[token].word[word_idx],
	wpe = pos[tok_idx].word[word_idx];

	out[out_idx].word[word_idx] = wte + wpe;
}

__kernel void
ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
                  __constant const struct ircd_gpt_opts *const opts,
                  __global union ircd_gpt_tokenv *const restrict accum,
                  __global const union ircd_gpt_tokenv *const restrict pos,
                  __global const union ircd_gpt_tokenv *const restrict vocab)
{
	const uint li = get_local_id(0);

	for(uint i = 0; i < ctrl->tokens; ++i)
		_ircd_gpt_lm_embed(ctrl, opts, accum, pos, vocab, i, i, li);
}

__kernel void
ircd_gpt_lm_norm(__global const struct ircd_gpt_task *const ctrl,
                 __constant const struct ircd_gpt_opts *const opts,
                 __global union ircd_gpt_tokenv *const restrict accum,
                 __global const float4 *const restrict norm_bias,
                 __global const float4 *const restrict norm_weight)
{
	const uint
	li = get_local_id(0),
	ln = get_local_size(0),
	wi = get_global_offset(0) / ln + get_group_id(0);

	__local union ircd_gpt_tokenv
	token, tmp;

	token.word[li] = accum[wi].word[li];

	// Final re-normalization
	ircd_simt_math_norm_f4lldr(token.word, token.word, tmp.word, ln, li);
	ircd_gpt_norm_fmad(token.word, token.word, norm_bias, norm_weight, li);

	accum[0].word[li] = token.word[li];
}

__kernel void
ircd_gpt_lm_logit(__global const struct ircd_gpt_task *const ctrl,
                  __constant const struct ircd_gpt_opts *const opts,
                  __global float *const restrict logit,
                  __global const union ircd_gpt_tokenv *const restrict accum,
                  __global const union ircd_gpt_tokenv *const restrict token)
{
	const uint gi = get_global_id(0);

	float4 acc = 0.0f;
	for(uint j = 0; j < 768/4; ++j)
	{
		const float4
		in = accum[0].word[j],
		vocab = token[gi].word[j],
		res = vocab * in;

		acc += res;
	}

	float res = 0.0f;
	for(uint k = 0; k < 4; ++k)
		res += acc[k];

	logit[gi] = res;
}

inline void
ircd_gpt_leave(__global struct ircd_gpt_task *const ctrl,
               __constant const struct ircd_gpt_opts *const opts,
               const uint li)
{
	// No action for other threads right now
	if(li != 0)
		return;

	// Run debug checks and assertions.
	#ifdef RB_DEBUG
	if(li == 0 && ctrl->call == IRCD_GPT_ECOMPLETE)
		if(ctrl->tokens < 2)
			ctrl->call = IRCD_GPT_ETOKENS;
	#endif

	// If the call value has been set to something other than default we
	// do nothing else here.
	if(ctrl->call != IRCD_GPT_ECOMPLETE)
		return;

	// On the last cycle, with no prior call or error code set, indicate
	// a nominal exit condition.
	if(ctrl->cycle + 1 >= opts->limit)
	{
		ctrl->call = IRCD_GPT_ACCEPT;
		ctrl->epoch += 1;
	}

	ctrl->cycle += 1;
}

inline void
ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl,
                   __constant const struct ircd_gpt_opts *const opts,
                   const uint li,
                   __local const ushort *const restrict idx)
{
	// To read from cells other than idx[0] we need this barrier.
	if(opts->top_k > 1)
		barrier(CLK_LOCAL_MEM_FENCE);

	// No action for other threads right now
	if(li != 0)
		return;

	// When the hypercall code is already set, bail here.
	if(ctrl->call != IRCD_GPT_ECOMPLETE)
		return;

	const bool
	buffer_full = ctrl->tokens >= opts->buffer_tokens;

	const ulong
	rnd = ircd_simt_rand_xoshiro256pg(ctrl->rand),
	sel = rnd % max(opts->top_k, 1U);

	const ushort
	token = idx[sel],
	token_idx = (ctrl->head + ctrl->tokens) % opts->buffer_tokens;

	ctrl->token[token_idx] = token;

	if(buffer_full)
		ctrl->head = (ctrl->head + 1) % opts->buffer_tokens;
	else
		ctrl->tokens++;
}

__kernel void
ircd_gpt_lm_select(__global struct ircd_gpt_task *const ctrl,
                   __constant const struct ircd_gpt_opts *const opts,
                   __global const float *const restrict logit)
{
	const uint
	gi = get_global_id(0),
	gn = get_global_size(0),
	li = get_local_id(0),
	ln = get_local_size(0),
	wi = get_group_id(0),
	wn = get_num_groups(0),
	tn = 262,
	ti = tn * li;

	__local ushort idx[192];

	idx[li] = ti;
	for(uint j = ti + 1; j < ti + tn && j < 50257; ++j)
		if(logit[j] > logit[idx[li]])
			idx[li] = j;

	ircd_simt_sort_idx16_flldr(idx, logit, ln, li);
	ircd_gpt_lm_result(ctrl, opts, li, idx);
	ircd_gpt_leave(ctrl, opts, li);
}
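
//
// Dispatch note (inferred from the dataflow above; the authoritative enqueue
// sequence lives in the host code): ircd_gpt_lm_embed seeds the accumulator
// with wte + wpe for the active context; each transformer layer is then
// expected to run ircd_gpt_attn_fcon to produce queries, keys and values,
// followed by ircd_gpt_coil for self-attention, projection and the FFNN;
// finally ircd_gpt_lm_norm applies the last layer norm and stores the result
// at accum[0], ircd_gpt_lm_logit scores that vector against the 50257
// vocabulary embeddings (one work-item per logit), and ircd_gpt_lm_select
// reduces the logits to per-thread maxima, sorts them, and samples the next
// token uniformly from the top_k best.
//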