// Matrix Construct // // Copyright (C) Matrix Construct Developers, Authors & Contributors // Copyright (C) 2016-2021 Jason Volk // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice is present in all copies. The // full license for this software is available in the LICENSE file. inline void ctor_local_bcast_ldr(__local float4 *const out, const uint ln, const uint li) { for(uint stride = 1; stride < ln; stride <<= 1) { if(li < stride) out[li + stride] = out[li]; barrier(CLK_LOCAL_MEM_FENCE); } } inline void ctor_local_reduce_add_ldr(__local float4 *const out, const uint ln, const uint li) { for(uint stride = ln >> 1; stride > 0; stride >>= 1) { barrier(CLK_LOCAL_MEM_FENCE); if(li < stride) out[li] += out[li + stride]; } } inline void ctor_local_reduce_max_ldr(__local float *const out, const uint ln, const uint li) { for(uint stride = ln >> 1; stride > 0; stride >>= 1) { barrier(CLK_LOCAL_MEM_FENCE); if(li < stride) out[li] = max(out[li], out[li + stride]); } } inline void ctor_local_reduce_tournament_ldr(__local float *const best, __local ushort *const idx, const uint ln, const uint li) { for(uint stride = ln >> 1; stride > 0; stride >>= 1) { barrier(CLK_LOCAL_MEM_FENCE); if(li < stride && best[li] < best[li + stride]) { best[li] = best[li + stride]; idx[li] = idx[li + stride]; } } } inline void ctor_mean(__local float4 *const restrict out, __local const float4 *const restrict in, const uint num, const uint i) { out[i] = in[i]; ctor_local_reduce_add_ldr(out, num, i); float numerator = 0.0f; float4 numeratorv = out[i]; for(uint k = 0; k < 4; ++k) numerator += numeratorv[k]; out[i] = numerator; ctor_local_bcast_ldr(out, num, i); numeratorv = out[i]; out[i] = numeratorv / (num * 4); } inline void ctor_norm(__local float4 *const out, __local const float4 *const in, __local float4 *const restrict tmp, const uint num, const uint i) { ctor_mean(tmp, in, num, i); const float4 sub_mean = in[i] - tmp[i]; tmp[i] = pow(sub_mean, 2); ctor_mean(out, tmp, num, i); const float4 epsilon = 0.00001f, s = sqrt(out[i] + epsilon); out[i] = sub_mean / s; } inline void ctor_norm_fmad(__local float4 *const out, __local const float4 *const in, __global const float4 *const restrict bias, __global const float4 *const restrict weight, const uint i) { out[i] = in[i] * weight[i] + bias[i]; } // Matrix * Vector Multiply/Accumulate inline void ctor_sgemv(__local float4 *const restrict out, __local const float4 *const restrict in, __global const float4 *const restrict bias, __global const float4 *const restrict weight, const uint width, const uint height, const uint tiles, const uint i) { const uint seg = height / tiles; float4 acc = bias[i]; for(uint j = 0; j < seg; ++j) for(uint t = 0; t < tiles; ++t) for(uint k = 0; k < 4; ++k) { const uint jidx = t * seg + j, kidx = jidx * 4 + k, widx = kidx * width + i; acc += weight[widx] * in[jidx][k]; } out[i] = acc; } inline void ctor_gelu(__local float4 *const out, __local const float4 *const in_, const uint i) { float4 a, in = in_[i]; a = 0.044715f; a *= in; a *= in; a += 1.0f; a *= 0.7978845608f; a *= in; a = tanh(a); a += 1.0f; a *= in; a *= 0.5f; out[i] = a; } // // core // __kernel void ctor_attn_fcon(__global const struct ctor_ctrl *const ctrl, __constant const struct ctor_opts *const opts, __global union aperaturev *const restrict out, __global const union tokenv *const restrict in, __global const float4 *const restrict norm_bias, __global const float4 *const restrict norm_weight, __global const float4 *const restrict fcon_bias, __global const float4 *const restrict fcon_weight) { const uint gi = get_global_id(0), gn = get_global_size(0), li = get_local_id(0), ln = get_local_size(0), wi = get_group_id(0), wn = get_num_groups(0); __local union aperaturev token; __local float4 tmp[768/4]; token.word[li] = in[wi].word[li]; // Layer re-normalization ctor_norm(token.word, token.word, tmp, ln, li); ctor_norm_fmad(tmp, token.word, norm_bias, norm_weight, li); // Fully connected for(uint i = 0; i < 3; ++i) ctor_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 2304/4, 768/4, 4, i * ln + li); // Export queries, keys, and values. for(uint i = 0; i < 3; ++i) out[wi].proj[i][li] = token.proj[i][li]; } __kernel void ctor_attn_proj(__global const struct ctor_ctrl *const ctrl, __constant const struct ctor_opts *const opts, __global union tokenv *const restrict accum, __global const union tokenv *const restrict xattn, __global const float4 *const restrict proj_bias, __global const float4 *const restrict proj_weight) { const uint gi = get_global_id(0), gn = get_global_size(0), li = get_local_id(0), ln = get_local_size(0), wi = get_group_id(0), wn = get_num_groups(0); __local float4 in[768/4], out[768/4]; // Fetch in[li] = xattn[wi].word[li]; // Projection ctor_sgemv(out, in, proj_bias, proj_weight, 768/4, 768/4, 1, li); // Accumulation; end of layer accum[wi].word[li] += out[li]; } __kernel void ctor_ffnn(__global const struct ctor_ctrl *const ctrl, __constant const struct ctor_opts *const opts, __global union tokenv *const restrict accum, __global const float4 *const restrict norm_bias, __global const float4 *const restrict norm_weight, __global const float4 *const restrict fcon_bias, __global const float4 *const restrict fcon_weight, __global const float4 *const restrict proj_bias, __global const float4 *const restrict proj_weight) { const uint gi = get_global_id(0), gn = get_global_size(0), li = get_local_id(0), ln = get_local_size(0), wi = get_group_id(0), wn = get_num_groups(0); __local union aperaturev token; __local float4 tmp[768/4]; // Fetch local copy of the global accumulator. We operate on a cached // copy as input, and add our output to the global upon completion. token.word[li] = accum[wi].word[li]; // Layer re-normalization ctor_norm(token.word, token.word, tmp, ln, li); ctor_norm_fmad(tmp, token.word, norm_bias, norm_weight, li); // Fully connected for(uint i = 0; i < 4; ++i) ctor_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 3072/4, 768/4, 4, i * ln + li); // Gaussian Error Linear Unit for(uint i = 0; i < 4; ++i) ctor_gelu(token.fcon, token.fcon, i * ln + li); // Projection ctor_sgemv(tmp, token.fcon, proj_bias, proj_weight, 768/4, 3072/4, 4, li); // Accumulation; end of layer accum[wi].word[li] += tmp[li]; } __kernel void ctor_backend(__global const struct ctor_ctrl *const ctrl, __constant const struct ctor_opts *const opts, __global union tokenv *const restrict accum, __global const union tokenv *const restrict xattn, __global const float4 *const restrict attn_proj_bias, __global const float4 *const restrict attn_proj_weight, __global const float4 *const restrict ffnn_norm_bias, __global const float4 *const restrict ffnn_norm_weight, __global const float4 *const restrict ffnn_fcon_bias, __global const float4 *const restrict ffnn_fcon_weight, __global const float4 *const restrict ffnn_proj_bias, __global const float4 *const restrict ffnn_proj_weight) { ctor_attn_proj ( ctrl, opts, accum, xattn, attn_proj_bias, attn_proj_weight ); ctor_ffnn ( ctrl, opts, accum, ffnn_norm_bias, ffnn_norm_weight, ffnn_fcon_bias, ffnn_fcon_weight, ffnn_proj_bias, ffnn_proj_weight ); } // // ctrl // __kernel void ctor_attn_self(__global const struct ctor_ctrl *const ctrl, __constant const struct ctor_opts *const opts, __global union tokenv *const restrict out, __global const struct qkvv *const restrict token, __global const struct attn_mask *const restrict mask) // [1024][1024], { __local struct { float attn[12][32]; } self; const uint gi = get_global_id(0), gn = get_global_size(0), li = get_local_id(0), ln = get_local_size(0), wi = get_group_id(0), wn = get_num_groups(0); for(uint i = 0; i < wn; ++i) if(mask[wi].token[i]) self.attn[li][i] = 0.0f; else self.attn[li][i] = -10000.0f; for(uint i = 0; i < wn; ++i) if(mask[wi].token[i]) for(uint j = 0; j < 64/4; ++j) { float4 qry = token[wi].qry.attn[li][j], key = token[i].key.attn[li][j], res = qry * key; for(uint k = 0; k < 4; ++k) self.attn[li][i] += res[k]; } for(uint i = 0; i < wn; ++i) if(mask[wi].token[i]) self.attn[li][i] /= 8.0f; for(uint i = 0; i < wn; ++i) self.attn[li][i] = exp(self.attn[li][i]); float4 vacc = 0.0f; for(uint i = 0; i < wn; ++i) vacc[i % 4] += self.attn[li][i]; float acc = 0.0f; for(uint i = 0; i < 4; ++i) acc += vacc[i]; for(uint i = 0; i < wn; ++i) self.attn[li][i] /= acc; for(uint j = 0; j < 64/4; ++j) out[wi].attn[li][j] = 0.0f; for(uint i = 0; i < wn; ++i) for(uint j = 0; j < 64/4; ++j) out[wi].attn[li][j] += token[i].val.attn[li][j] * self.attn[li][i]; } // // leads // __kernel void ctor_anode0(__global const struct ctor_ctrl *const ctrl, __constant const struct ctor_opts *const opts, __global union tokenv *const restrict accum, __global const union tokenv *const restrict pos, __global const union tokenv *const restrict vocab) { const uint li = get_local_id(0), wi = get_group_id(0); const ushort token = ctrl->body.token[wi]; const float4 wte = vocab[token].word[li], wpe = pos[wi].word[li]; accum[wi].word[li] = wte + wpe; } __kernel void ctor_anode1(__global const struct ctor_ctrl *const ctrl, __constant const struct ctor_opts *const opts, __global union tokenv *const restrict accum, __global const union tokenv *const restrict pos, __global const union tokenv *const restrict vocab) { const uint li = get_local_id(0); for(uint i = 0; i < ctrl->tokens; ++i) { const ushort token = ctrl->body.token[i]; const float4 wte = vocab[token].word[li], wpe = pos[i].word[li]; accum[i].word[li] = wte + wpe; } } __kernel void ctor_anode2(__global const struct ctor_ctrl *const ctrl, __constant const struct ctor_opts *const opts, __global union tokenv *const restrict accum, __global const union tokenv *const restrict pos, __global const union tokenv *const restrict vocab) { const uint gi = get_global_id(0); const ushort token = ctrl->body.token[gi]; for(uint i = 0; i < 768/4; ++i) { const float4 wte = vocab[token].word[i], wpe = pos[gi].word[i]; accum[gi].word[i] = wte + wpe; } } __kernel void ctor_cathode(__global const struct ctor_ctrl *const ctrl, __constant const struct ctor_opts *const opts, __global union tokenv *const restrict accum, __global const float4 *const restrict norm_bias, __global const float4 *const restrict norm_weight) { const uint li = get_local_id(0), ln = get_local_size(0), wi = get_global_offset(0) / ln + get_group_id(0); __local union tokenv token, tmp; token.word[li] = accum[wi].word[li]; // Final re-normalization ctor_norm(token.word, token.word, tmp.word, ln, li); ctor_norm_fmad(token.word, token.word, norm_bias, norm_weight, li); accum[0].word[li] = token.word[li]; } __kernel void ctor_lmhead(__global const struct ctor_ctrl *const ctrl, __constant const struct ctor_opts *const opts, __global float *const restrict logit, __global const union tokenv *const restrict accum, __global const union tokenv *const restrict token) { const uint gi = get_global_id(0); float4 acc = 0.0f; for(uint j = 0; j < 768/4; ++j) { const float4 in = accum[0].word[j], vocab = token[gi].word[j], res = vocab * in; acc += res; } float res = 0.0f; for(uint k = 0; k < 4; ++k) res += acc[k]; logit[gi] = res; } __kernel void ctor_lmamax(__global struct ctor_ctrl *const ctrl, __constant const struct ctor_opts *const opts, __global const float *const restrict logit) { const uint gi = get_global_id(0), gn = get_global_size(0), li = get_local_id(0), ln = get_local_size(0), wi = get_group_id(0), wn = get_num_groups(0), tn = 262, ti = tn * li; __local ushort idx[192]; __local float best[192]; idx[li] = ti; for(uint j = ti + 1; j < ti + tn && j < 50257; ++j) if(logit[j] > logit[idx[li]]) idx[li] = j; best[li] = logit[idx[li]]; ctor_local_reduce_tournament_ldr(best, idx, ln, li); if(li == 0 && ctrl->call == -1) ctrl->body.token[ctrl->tokens++] = idx[li]; if(li == 0 && ctrl->call == -1) ctrl->call = 1; #ifdef RB_DEBUG if(li == 0 && ctrl->call == 1) if(ctrl->tokens < 2) ctrl->call = -2; #endif }