mirror of
https://github.com/matrix-construct/construct
synced 2024-12-30 17:34:04 +01:00
1504 lines
41 KiB
Common Lisp
1504 lines
41 KiB
Common Lisp
// Matrix Construct
|
|
//
|
|
// Copyright (C) Matrix Construct Developers, Authors & Contributors
|
|
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
|
|
//
|
|
// Permission to use, copy, modify, and/or distribute this software for any
|
|
// purpose with or without fee is hereby granted, provided that the above
|
|
// copyright notice and this permission notice is present in all copies. The
|
|
// full license for this software is available in the LICENSE file.
|
|
|
|
//#pragma OPENCL EXTENSION cl_amd_device_attribute_query : enable
|
|
//#pragma OPENCL EXTENSION cl_khr_byte_addressable_store :enable
|
|
//#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
|
|
//#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
|
|
//#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
|
|
//#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
|
|
|
|
#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
|
|
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
|
|
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
|
|
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
|
|
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
|
|
|
|
#pragma clang fp exceptions(ignore)
|
|
#pragma clang fp reassociate(on)
|
|
#pragma clang fp contract(fast)
|
|
|
|
#include <ircd/config.h>
|
|
#include <clc/clc.h>
|
|
|
|
#define __region __attribute__((address_space(0x02)))
|
|
|
|
#if !defined(assume)
|
|
#define assume(x) __builtin_assume(x)
|
|
#endif
|
|
|
|
#if defined(__SPIR)
|
|
#define restrict
|
|
#elif defined(__cplusplus)
|
|
#define restrict __restrict
|
|
#endif
|
|
|
|
#if __OPENCL_VERSION__ < 120
|
|
#define static __attribute__((internal_linkage))
|
|
#else
|
|
#define static __constant static
|
|
#endif
|
|
|
|
#pragma clang attribute push(__attribute__((always_inline)), apply_to = function)
|
|
#pragma clang attribute push(__attribute__((internal_linkage)), apply_to = function)
|
|
#include <ircd/simt/simt.h>
|
|
#include <ircd/gpt/vector.h>
|
|
#include <ircd/gpt/opts.h>
|
|
#include <ircd/gpt/ctrl.h>
|
|
#pragma clang attribute pop
|
|
#pragma clang attribute pop
|
|
|
|
#if __OPENCL_VERSION__ >= 120
|
|
#undef static
|
|
#endif
|
|
|
|
#include <ircd/gpt/gpu.h>
|
|
|
|
//
|
|
// head
|
|
//
|
|
|
|
__kernel void
|
|
__attribute__((visibility("protected")))
|
|
__attribute__((reqd_work_group_size(192, 1, 1)))
|
|
__attribute__((amdgpu_flat_work_group_size(192, 192)))
|
|
ircd_gpt_alloc(__global const void *const restrict model,
|
|
__global void *const restrict master,
|
|
__constant const void *const opts,
|
|
__global void *const restrict ctrl,
|
|
__global void *const restrict frame0,
|
|
__global void *const restrict frame1,
|
|
__global void *const restrict frame2,
|
|
__global void *const restrict frame3,
|
|
__global void *const restrict frame4,
|
|
__global void *const restrict frame5,
|
|
__global void *const restrict frame6,
|
|
__global void *const restrict frame7)
|
|
{
|
|
}
|
|
|
|
__kernel void
|
|
__attribute__((visibility("protected")))
|
|
__attribute__((reqd_work_group_size(192, 1, 1)))
|
|
__attribute__((amdgpu_flat_work_group_size(192, 192)))
|
|
ircd_gpt_enter(__global const void *const restrict model,
|
|
__global void *const restrict state,
|
|
__global void *const restrict master,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global struct ircd_gpt_ctrl *const restrict ctrl)
|
|
{
|
|
const ushort
|
|
gi = get_global_id(0),
|
|
li = get_local_id(0),
|
|
ln = get_local_size(0),
|
|
cycle = ctrl->clk.cycle;
|
|
|
|
if(li == 0)
|
|
;//ctrl->prof.entered = __builtin_readcyclecounter();
|
|
}
|
|
|
|
__kernel void
|
|
__attribute__((vec_type_hint(float4)))
|
|
__attribute__((reqd_work_group_size(192, 1, 1)))
|
|
__attribute__((amdgpu_flat_work_group_size(192, 192)))
|
|
ircd_gpt_lm_embed(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global ircd_gpt_vectorv *const restrict accum,
|
|
__global const ircd_gpt_vectorv *const restrict pos,
|
|
__global const ircd_gpt_vectorv *const restrict vocab)
|
|
{
|
|
const ushort
|
|
li = get_local_id(0),
|
|
ln = get_local_size(0);
|
|
|
|
const uint
|
|
wo = get_global_offset(0);
|
|
|
|
assume(ln == 192);
|
|
assume(wo % ln == 0);
|
|
|
|
const ushort
|
|
wi = wo / ln + get_group_id(0);
|
|
|
|
_ircd_gpt_lm_embed(ctrl, opts, accum, pos, vocab, wi, wi, li);
|
|
}
|
|
|
|
static void
|
|
_ircd_gpt_lm_embed(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global ircd_gpt_vectorv *const restrict accum,
|
|
__global const ircd_gpt_vectorv *const restrict pos,
|
|
__global const ircd_gpt_vectorv *const restrict vocab,
|
|
const ushort out_idx,
|
|
const ushort tok_idx,
|
|
const ushort elem_idx)
|
|
{
|
|
const ushort
|
|
token = ctrl->token[tok_idx];
|
|
|
|
const float4
|
|
wpe = pos[tok_idx].elem[elem_idx],
|
|
wte = vocab[token].elem[elem_idx],
|
|
res = wte + wpe;
|
|
|
|
accum[out_idx].elem[elem_idx] = res;
|
|
}
|
|
|
|
//
|
|
// Frontside
|
|
//
|
|
|
|
void
|
|
ircd_gpt_ffnn_fcon_tmul(__constant const struct ircd_gpt_opts *const opts,
|
|
__local ircd_gpt_ffnn_aperaturev *const restrict out,
|
|
__local const ircd_gpt_vectorv *const restrict in,
|
|
__global const ircd_gpt_ffnn_aperaturev *const restrict bias,
|
|
__global const ircd_gpt_ffnn_aperaturev *const restrict weight,
|
|
const uint li)
|
|
{
|
|
const uint
|
|
lanes = 4,
|
|
segs = ircd_gpt_ffnn_segs,
|
|
height = ircd_gpt_vector_elems / lanes;
|
|
|
|
assume(height > 0);
|
|
assume(height % lanes == 0);
|
|
|
|
for(uint x = 0; x < segs; ++x)
|
|
out->proj[x][li] = bias->proj[x][li];
|
|
|
|
for(uint y = 0; y < height; ++y)
|
|
for(uint k = 0; k < lanes; ++k)
|
|
for(uint x = 0; x < segs; ++x)
|
|
{
|
|
const uint
|
|
row = y * lanes + k;
|
|
|
|
out->proj[x][li] += in->elem[y][k] * weight[row].proj[x][li];
|
|
}
|
|
}
|
|
|
|
void
|
|
ircd_gpt_ffnn_fcon(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__local ircd_gpt_ffnn_aperaturev *const restrict out,
|
|
__local const ircd_gpt_vectorv *const restrict in,
|
|
__global const ircd_gpt_ffnn_aperaturev *const restrict bias,
|
|
__global const ircd_gpt_ffnn_aperaturev *const restrict weight,
|
|
const uint ln,
|
|
const uint li)
|
|
{
|
|
const uint
|
|
segs = ircd_gpt_ffnn_segs;
|
|
|
|
// Fully connected
|
|
ircd_gpt_ffnn_fcon_tmul
|
|
(
|
|
opts,
|
|
out,
|
|
in,
|
|
bias,
|
|
weight,
|
|
li
|
|
);
|
|
|
|
for(uint i = 0; i < segs; ++i)
|
|
ircd_gpt_ffnn_gelu(out, out, i * ln + li);
|
|
}
|
|
|
|
void
|
|
ircd_gpt_ffnn_proj_tmul(__constant const struct ircd_gpt_opts *const opts,
|
|
__local ircd_gpt_vectorv *const restrict out,
|
|
__local const ircd_gpt_ffnn_aperaturev *const restrict in,
|
|
__global const ircd_gpt_vectorv *const restrict bias,
|
|
__global const ircd_gpt_vectorv *const restrict weight,
|
|
const uint li)
|
|
{
|
|
const uint
|
|
lanes = 4,
|
|
height = ircd_gpt_ffnn_fcon_elems / lanes;
|
|
|
|
assume(height > 0);
|
|
assume(height % lanes == 0);
|
|
|
|
out->elem[li] = bias->elem[li];
|
|
|
|
for(uint y = 0; y < height; ++y)
|
|
for(uint k = 0; k < lanes; ++k)
|
|
{
|
|
const uint
|
|
row = y * lanes + k;
|
|
|
|
out->elem[li] += in->fcon[y][k] * weight[row].elem[li];
|
|
}
|
|
}
|
|
|
|
void
|
|
ircd_gpt_ffnn(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__local ircd_gpt_vectorv *const restrict token,
|
|
__local ircd_gpt_ffnn_aperaturev *const restrict buf,
|
|
__local ircd_gpt_vectorv *const restrict tmp,
|
|
__global const ircd_gpt_vectorv *const restrict norm_bias,
|
|
__global const ircd_gpt_vectorv *const restrict norm_weight,
|
|
__global const ircd_gpt_ffnn_aperaturev *const restrict fcon_bias,
|
|
__global const ircd_gpt_ffnn_aperaturev *const restrict fcon_weight,
|
|
__global const ircd_gpt_vectorv *const restrict proj_bias,
|
|
__global const ircd_gpt_vectorv *const restrict proj_weight,
|
|
const uint ln,
|
|
const uint li)
|
|
{
|
|
// Layer re-normalization
|
|
ircd_gpt_norm(token, token, tmp, norm_bias, norm_weight, ln, li);
|
|
|
|
// ln's writes are still pending but fcon reads results across threads.
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
// Fully connected
|
|
ircd_gpt_ffnn_fcon
|
|
(
|
|
ctrl,
|
|
opts,
|
|
buf,
|
|
token,
|
|
fcon_bias,
|
|
fcon_weight,
|
|
ln,
|
|
li
|
|
);
|
|
|
|
// fcon's writes are still pending but proj reads results across threads.
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
// Projection
|
|
ircd_gpt_ffnn_proj_tmul
|
|
(
|
|
opts,
|
|
token,
|
|
buf,
|
|
proj_bias,
|
|
proj_weight,
|
|
li
|
|
);
|
|
}
|
|
|
|
static void
|
|
ircd_gpt_attn_self_samax(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__local float self[restrict][12],
|
|
const uint ln,
|
|
const uint li,
|
|
const uint wn,
|
|
const uint wi)
|
|
{
|
|
struct ircd_math_samax samax =
|
|
{
|
|
.mu = -10000.0f,
|
|
.sum = 0.0f,
|
|
};
|
|
|
|
__attribute__((opencl_unroll_hint))
|
|
for(uint i = 0; i < wn; ++i)
|
|
samax.mu = max(samax.mu, self[i][li]);
|
|
|
|
__attribute__((opencl_unroll_hint))
|
|
for(uint i = 0; i < wn; ++i)
|
|
self[i][li] -= samax.mu;
|
|
|
|
for(uint i = 0; i < wn; ++i)
|
|
self[i][li] = native_exp(self[i][li]);
|
|
|
|
__attribute__((opencl_unroll_hint))
|
|
for(uint i = 0; i < wn; ++i)
|
|
samax.sum += self[i][li];
|
|
|
|
samax.sum += FLT_EPSILON;
|
|
samax.lambda = 1.0f / samax.sum;
|
|
|
|
__attribute__((opencl_unroll_hint))
|
|
for(uint i = 0; i < wn; ++i)
|
|
self[i][li] *= samax.lambda;
|
|
}
|
|
|
|
static void
|
|
ircd_gpt_attn_self_keys(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__local float self[restrict][ircd_gpt_attn_rank],
|
|
__global const ircd_gpt_attn_qkvv *const restrict token,
|
|
const uint ln,
|
|
const uint li,
|
|
const uint wi,
|
|
const uint kn,
|
|
const uint i)
|
|
{
|
|
assume(i < wi);
|
|
|
|
self[i][li] = 0.0f;
|
|
|
|
__attribute__((opencl_unroll_hint))
|
|
for(uint k = 0; k < kn; ++k)
|
|
{
|
|
float4
|
|
qry = token[wi].qry.attn[li][k],
|
|
key = token[i].key.attn[li][k],
|
|
res = qry * key;
|
|
|
|
self[i][li] += ircd_simt_reduce_add_f4(res);
|
|
}
|
|
|
|
self[i][li] /= 8.0f;
|
|
}
|
|
|
|
static void
|
|
ircd_gpt_attn_self_vals(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__local ircd_gpt_vectorv *const restrict out,
|
|
__local const float self[restrict][ircd_gpt_attn_rank],
|
|
__global const ircd_gpt_attn_qkvv *const restrict token,
|
|
const uint li,
|
|
const uint wi,
|
|
const uint ki,
|
|
const uint ti)
|
|
{
|
|
out->attn[ti][ki] = 0.0f;
|
|
|
|
__attribute__((opencl_unroll_hint))
|
|
for(uint i = 0; i < wi; ++i)
|
|
{
|
|
const float4
|
|
val = token[i].val.attn[ti][ki],
|
|
attn = self[i][ti],
|
|
res = attn * val;
|
|
|
|
out->attn[ti][ki] += res;
|
|
}
|
|
}
|
|
|
|
static void
|
|
ircd_gpt_attn_self(__global struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__local ircd_gpt_vectorv *const restrict out,
|
|
__local float self[restrict][ircd_gpt_attn_rank],
|
|
__global float attns[restrict][ircd_gpt_attn_rank],
|
|
__global const ircd_gpt_attn_qkvv *const restrict token,
|
|
const uint ln,
|
|
const uint li,
|
|
const uint wi)
|
|
{
|
|
//assume(opts->attn_rank == sizeof(self[0]) / sizeof(float));
|
|
assume(opts->attn_rank == ircd_gpt_attn_rank);
|
|
assume(ctrl->count < ircd_gpt_context_tokens);
|
|
assume(ctrl->tokens <= ircd_gpt_context_tokens);
|
|
assume(ctrl->tokens > wi);
|
|
assume(ctrl->tokens > 0);
|
|
|
|
const uint
|
|
wn = ctrl->tokens,
|
|
kn = ln / opts->attn_rank,
|
|
ki = li / opts->attn_rank,
|
|
ti = li % opts->attn_rank;
|
|
|
|
// Low-rank mask
|
|
if(li < opts->attn_rank)
|
|
{
|
|
// Left attention
|
|
uint i;
|
|
for(i = 0; i < wi; ++i)
|
|
ircd_gpt_attn_self_keys(ctrl, opts, self, token, ln, li, wi, kn, i);
|
|
|
|
// Future mask
|
|
__attribute__((opencl_unroll_hint))
|
|
while(i < wn)
|
|
self[i++][li] = -10000.0f;
|
|
|
|
// Three-piece softmax
|
|
ircd_gpt_attn_self_samax(ctrl, opts, self, ln, li, wn, wi);
|
|
}
|
|
|
|
// Propagate to full width for value dot prod.
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
ircd_gpt_attn_self_vals(ctrl, opts, out, self, token, li, wi, ki, ti);
|
|
|
|
// Save softmax results for later analysis/observation.
|
|
if(li < opts->attn_rank)
|
|
{
|
|
__attribute__((opencl_unroll_hint))
|
|
for(uint i = 0; i < wn; ++i)
|
|
attns[i][li] = self[i][li];
|
|
}
|
|
}
|
|
|
|
static void
|
|
ircd_gpt_attn_proj_tmul(__constant const struct ircd_gpt_opts *const opts,
|
|
__local ircd_gpt_vectorv *const restrict out,
|
|
__local const ircd_gpt_vectorv *const restrict in,
|
|
__global const ircd_gpt_vectorv *const restrict bias,
|
|
__global const ircd_gpt_vectorv *const restrict weight,
|
|
const uint li)
|
|
{
|
|
const uint
|
|
lanes = 4,
|
|
height = ircd_gpt_vector_elems / 4;
|
|
|
|
assume(height > 0);
|
|
assume(height % lanes == 0);
|
|
|
|
out->elem[li] = bias->elem[li];
|
|
|
|
for(uint y = 0; y < height; ++y)
|
|
for(uint k = 0; k < lanes; ++k)
|
|
{
|
|
const uint
|
|
row = y * lanes + k;
|
|
|
|
const float4
|
|
a = in->elem[y][k],
|
|
b = weight[row].elem[li];
|
|
|
|
out->elem[li] += a * b;
|
|
}
|
|
}
|
|
|
|
__kernel void
|
|
__attribute__((vec_type_hint(float4)))
|
|
__attribute__((reqd_work_group_size(192, 1, 1)))
|
|
__attribute__((amdgpu_flat_work_group_size(192, 192)))
|
|
__attribute__((visibility("protected")))
|
|
ircd_gpt_coil(__global struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__private const uint layer,
|
|
__global ircd_gpt_vectorv *const restrict accum,
|
|
__global float attns[restrict][ircd_gpt_attn_rank],
|
|
__global const ircd_gpt_attn_qkvv *const restrict state,
|
|
__global const ircd_gpt_vectorv *const restrict attn_proj_bias,
|
|
__global const ircd_gpt_vectorv *const restrict attn_proj_weight,
|
|
__global const ircd_gpt_vectorv *const restrict ffnn_norm_bias,
|
|
__global const ircd_gpt_vectorv *const restrict ffnn_norm_weight,
|
|
__global const ircd_gpt_ffnn_aperaturev *const restrict ffnn_fcon_bias,
|
|
__global const ircd_gpt_ffnn_aperaturev *const restrict ffnn_fcon_weight,
|
|
__global const ircd_gpt_vectorv *const restrict ffnn_proj_bias,
|
|
__global const ircd_gpt_vectorv *const restrict ffnn_proj_weight)
|
|
{
|
|
const uint
|
|
li = get_local_id(0),
|
|
ln = get_local_size(0),
|
|
wo = get_global_offset(0),
|
|
wi = wo / ln + get_group_id(0);
|
|
|
|
assume(ln == 192);
|
|
assume(wo % ln == 0);
|
|
|
|
__local union
|
|
{
|
|
float
|
|
attn_self[ircd_gpt_context_tokens][ircd_gpt_attn_rank];
|
|
|
|
ircd_gpt_ffnn_aperaturev
|
|
ffnn_fcon[2];
|
|
|
|
ircd_gpt_vectorv
|
|
vector[8];
|
|
}
|
|
buf;
|
|
|
|
__local ircd_gpt_vectorv
|
|
buf0, buf1,
|
|
*const restrict attn_self = &buf1,
|
|
*const restrict token = &buf0,
|
|
*const restrict tmp = &buf1;
|
|
|
|
// Self-attention backend; this computes the self-attention result now
|
|
// that keys and values are globally visible across tokens.
|
|
ircd_gpt_attn_self
|
|
(
|
|
ctrl,
|
|
opts,
|
|
attn_self,
|
|
buf.attn_self,
|
|
attns,
|
|
state,
|
|
ln,
|
|
li,
|
|
wi
|
|
);
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
// Project result of self-attention.
|
|
ircd_gpt_attn_proj_tmul
|
|
(
|
|
opts,
|
|
token,
|
|
attn_self,
|
|
attn_proj_bias,
|
|
attn_proj_weight,
|
|
li
|
|
);
|
|
|
|
// Frontend accumulation
|
|
{
|
|
const float4
|
|
attn = token->elem[li],
|
|
resid = accum[wi].elem[li],
|
|
result = resid + attn;
|
|
|
|
token->elem[li] = result;
|
|
accum[wi].elem[li] = result;
|
|
}
|
|
|
|
// Backend mlp; layer-norm acquires any pending writes, no fence required.
|
|
ircd_gpt_ffnn
|
|
(
|
|
ctrl,
|
|
opts,
|
|
token,
|
|
buf.ffnn_fcon,
|
|
tmp,
|
|
ffnn_norm_bias,
|
|
ffnn_norm_weight,
|
|
ffnn_fcon_bias,
|
|
ffnn_fcon_weight,
|
|
ffnn_proj_bias,
|
|
ffnn_proj_weight,
|
|
ln,
|
|
li
|
|
);
|
|
|
|
// Backend accumulation
|
|
{
|
|
const float4
|
|
ffnn = token->elem[li],
|
|
resid = accum[wi].elem[li],
|
|
result = resid + ffnn;
|
|
|
|
accum[wi].elem[li] = result;
|
|
}
|
|
}
|
|
|
|
static void
|
|
ircd_gpt_attn_fcon_tmul(__constant const struct ircd_gpt_opts *const opts,
|
|
__local ircd_gpt_attn_aperaturev *const restrict out,
|
|
__local const ircd_gpt_vectorv *const restrict in,
|
|
__global const ircd_gpt_attn_aperaturev *const restrict bias,
|
|
__global const ircd_gpt_attn_aperaturev *const restrict weight,
|
|
const uint ln,
|
|
const uint li)
|
|
{
|
|
const uint
|
|
lanes = 4,
|
|
segs = ircd_gpt_attn_segs,
|
|
height = ircd_gpt_vector_elems / lanes;
|
|
|
|
assume(height > 0);
|
|
assume(height % segs == 0);
|
|
assume(height % lanes == 0);
|
|
|
|
for(uint x = 0; x < segs; ++x)
|
|
out->proj[x][li] = bias->proj[x][li];
|
|
|
|
for(uint y = 0; y < height; ++y)
|
|
for(uint k = 0; k < lanes; ++k)
|
|
for(uint x = 0; x < segs; ++x)
|
|
{
|
|
const uint
|
|
row = y * lanes + k;
|
|
|
|
const float4
|
|
a = in->elem[y][k],
|
|
b = weight[row].proj[x][li];
|
|
|
|
out->proj[x][li] += a * b;
|
|
}
|
|
}
|
|
|
|
__kernel void
|
|
__attribute__((vec_type_hint(float4)))
|
|
__attribute__((reqd_work_group_size(192, 1, 1)))
|
|
__attribute__((amdgpu_flat_work_group_size(192, 192)))
|
|
ircd_gpt_attn_fcon(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__private const uint layer,
|
|
__global ircd_gpt_attn_aperaturev *const restrict state,
|
|
__global const ircd_gpt_vectorv *const restrict accum,
|
|
__global const ircd_gpt_vectorv *const restrict norm_bias,
|
|
__global const ircd_gpt_vectorv *const restrict norm_weight,
|
|
__global const ircd_gpt_attn_aperaturev *const restrict fcon_bias,
|
|
__global const ircd_gpt_attn_aperaturev *const restrict fcon_weight)
|
|
{
|
|
const uint
|
|
li = get_local_id(0),
|
|
ln = get_local_size(0),
|
|
wo = get_global_offset(0),
|
|
wi = wo / ln + get_group_id(0),
|
|
segs = ircd_gpt_attn_segs;
|
|
|
|
assume(ln == 192);
|
|
assume(wo % ln == 0);
|
|
|
|
__local ircd_gpt_attn_aperaturev
|
|
attn;
|
|
|
|
__local ircd_gpt_vectorv
|
|
token, *const restrict tmp = attn.vector;
|
|
|
|
token.elem[li] = accum[wi].elem[li];
|
|
|
|
// Layer re-normalization
|
|
ircd_gpt_norm(&token, &token, tmp, norm_bias, norm_weight, ln, li);
|
|
|
|
// Ln's writes are still pending; fcon requires results across threads.
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
// Fully connected
|
|
ircd_gpt_attn_fcon_tmul
|
|
(
|
|
opts,
|
|
&attn,
|
|
&token,
|
|
fcon_bias,
|
|
fcon_weight,
|
|
ln,
|
|
li
|
|
);
|
|
|
|
// Export queries, keys, and values.
|
|
for(uint x = 0; x < segs; ++x)
|
|
state[wi].proj[x][li] = attn.proj[x][li];
|
|
}
|
|
|
|
__kernel void
|
|
__attribute__((vec_type_hint(float4)))
|
|
__attribute__((reqd_work_group_size(192, 1, 1)))
|
|
__attribute__((amdgpu_flat_work_group_size(192, 192)))
|
|
ircd_gpt_lm_norm(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global ircd_gpt_vectorv *const restrict accum,
|
|
__global const ircd_gpt_vectorv *const restrict norm_bias,
|
|
__global const ircd_gpt_vectorv *const restrict norm_weight)
|
|
{
|
|
const uint
|
|
li = get_local_id(0),
|
|
ln = get_local_size(0),
|
|
wo = get_global_offset(0),
|
|
wi = wo / ln + get_group_id(0);
|
|
|
|
assume(ln == 192);
|
|
assume(wo % ln == 0);
|
|
|
|
__local ircd_gpt_vectorv
|
|
tmp, token;
|
|
|
|
token.elem[li] = accum[wi].elem[li];
|
|
|
|
// Final re-normalization
|
|
ircd_gpt_norm(&token, &token, &tmp, norm_bias, norm_weight, ln, li);
|
|
|
|
accum[wi].elem[li] = token.elem[li];
|
|
}
|
|
|
|
__kernel void
|
|
ircd_gpt_lm_logit(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global float *const restrict logit,
|
|
__global const ircd_gpt_vectorv *const restrict accum,
|
|
__global const ircd_gpt_vectorv *const restrict pos,
|
|
__global const ircd_gpt_vectorv *const restrict vocab)
|
|
{
|
|
const uint
|
|
gi = get_global_id(0),
|
|
wi = ctrl->count - 1;
|
|
|
|
assume(opts->embed_width == 192);
|
|
assume(opts->logits <= 65536);
|
|
|
|
if(gi >= opts->logits)
|
|
{
|
|
logit[gi] = -10000.0f;
|
|
return;
|
|
}
|
|
|
|
float acc = 0.0f;
|
|
for(uint j = 0; j < opts->embed_width; ++j)
|
|
{
|
|
const float4
|
|
token = vocab[gi].elem[j],
|
|
in = accum[wi].elem[j],
|
|
wpe = pos[wi].elem[j],
|
|
res = in * token + wpe;
|
|
|
|
acc += ircd_simt_reduce_add_f4(res);
|
|
}
|
|
|
|
logit[gi] = acc;
|
|
}
|
|
|
|
__kernel void
|
|
__attribute__((reqd_work_group_size(256, 1, 1)))
|
|
__attribute__((amdgpu_flat_work_group_size(256, 256)))
|
|
ircd_gpt_lm_logsm(__global struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global float logit[restrict 65536])
|
|
{
|
|
const uint
|
|
li = get_local_id(0),
|
|
ln = get_local_size(0),
|
|
//wo = get_global_offset(0),
|
|
//wi = wo / ln + get_group_id(0),
|
|
wn = 50432,
|
|
tn = wn / ln,
|
|
start = tn * li,
|
|
stop = min(start + tn, opts->logits);
|
|
|
|
__local float
|
|
mu[256], sum[256], lambda[256];
|
|
|
|
__local struct ircd_math_samax
|
|
samax;
|
|
|
|
assume(ln == 256);
|
|
|
|
mu[li] = -10000.0f;
|
|
__attribute__((opencl_unroll_hint))
|
|
for(uint ti = start; ti < stop; ++ti)
|
|
mu[li] = max(mu[li], logit[ti]);
|
|
|
|
ircd_simt_reduce_max_flldr(mu, ln, li);
|
|
|
|
if(li == 0)
|
|
samax.mu = mu[li];
|
|
|
|
sum[li] = 0.0f;
|
|
for(uint ti = start; ti < stop; ++ti)
|
|
{
|
|
const float
|
|
sub = logit[ti] - samax.mu,
|
|
res = native_exp(sub);
|
|
|
|
sum[li] += res;
|
|
}
|
|
|
|
ircd_simt_reduce_add_flldr(sum, ln, li);
|
|
|
|
if(li == 0)
|
|
sum[li] += FLT_EPSILON,
|
|
samax.sum = sum[li],
|
|
samax.lambda = lambda[li] = 1.0f / sum[li];
|
|
|
|
ircd_simt_broadcast_flldr(lambda, ln, li);
|
|
|
|
for(uint ti = start; ti < stop; ++ti)
|
|
{
|
|
const float
|
|
sub = logit[ti] - samax.mu,
|
|
res = lambda[li] * native_exp(sub);
|
|
|
|
logit[ti] = res;
|
|
}
|
|
}
|
|
|
|
void
|
|
ircd_gpt_lm_result_top(__local struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__local const ushort *const restrict idx,
|
|
__global const float *const restrict logsm,
|
|
const uint i)
|
|
{
|
|
const ushort
|
|
token = idx[i];
|
|
|
|
const float
|
|
samax = logsm[token] + FLT_EPSILON;
|
|
|
|
ctrl->top[i].token = token;
|
|
ctrl->top[i].samax = samax;
|
|
}
|
|
|
|
void
|
|
ircd_gpt_lm_result_label_mean(__local struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__local struct ircd_math_mean *const mean,
|
|
const float last)
|
|
{
|
|
const uint
|
|
div = mean->div + 1,
|
|
sum_sel = mean->div % 4;
|
|
|
|
const float
|
|
sum = mean->sum[0] + mean->sum[1] + mean->sum[2] + mean->sum[3] + last,
|
|
res = sum / div;
|
|
|
|
mean->sum[sum_sel] += last;
|
|
mean->div = div;
|
|
mean->last = last;
|
|
mean->mean = res;
|
|
}
|
|
|
|
void
|
|
ircd_gpt_lm_result_label(__local struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__local struct ircd_gpt_ctrl_label *const label,
|
|
__global const float *const restrict logsm)
|
|
{
|
|
const ushort
|
|
token = label->logit.token;
|
|
|
|
const float
|
|
samax = logsm[token] + FLT_EPSILON,
|
|
loss = 0.0f - native_log(samax),
|
|
ppl = (1.0f - samax) * native_log2(opts->logits);
|
|
|
|
label->logit.samax = samax;
|
|
ircd_gpt_lm_result_label_mean(ctrl, opts, &label->loss, loss);
|
|
ircd_gpt_lm_result_label_mean(ctrl, opts, &label->ppl, ppl);
|
|
}
|
|
|
|
ushort
|
|
ircd_gpt_lm_result_select(__local struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__local const ushort *const restrict idx,
|
|
__global const float *const restrict logsm)
|
|
{
|
|
const ulong
|
|
ent_k = max(opts->top_k, 1U) - 1,
|
|
rnd = ircd_simt_rand_xoshiro256pl(ctrl->rand);
|
|
|
|
const float
|
|
ent_p = min(max(opts->top_p, 0.0f), 1.0f),
|
|
thresh = ent_p;
|
|
|
|
float acc = 1.0f;
|
|
ushort select = 0;
|
|
for(; select < ent_k; ++select)
|
|
if((acc -= logsm[idx[select]]) < thresh)
|
|
break;
|
|
|
|
const ushort
|
|
token = idx[select];
|
|
|
|
return token;
|
|
}
|
|
|
|
static ushort
|
|
ircd_gpt_lm_result(__local struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__local const ushort *const restrict idx,
|
|
__global const float *const restrict logsm)
|
|
{
|
|
const ushort
|
|
token = ircd_gpt_lm_result_select(ctrl, opts, idx, logsm);
|
|
|
|
// Update the dynamic result label.
|
|
ctrl->select.logit.token = token;
|
|
ircd_gpt_lm_result_label(ctrl, opts, &ctrl->select, logsm);
|
|
|
|
// Update the dynamic target label.
|
|
ctrl->target.logit.token = ctrl->count < ctrl->tokens?
|
|
ctrl->token[ctrl->count]:
|
|
ctrl->select.logit.token;
|
|
|
|
ircd_gpt_lm_result_label(ctrl, opts, &ctrl->target, logsm);
|
|
|
|
const bool
|
|
hit = ctrl->select.logit.token == ctrl->target.logit.token;
|
|
|
|
// Update the token context.
|
|
if(ctrl->count == ctrl->tokens)
|
|
{
|
|
ctrl->token[ctrl->count] = ctrl->select.logit.token;
|
|
ctrl->tokens++;
|
|
}
|
|
|
|
ctrl->miss += !hit;
|
|
ctrl->hit += hit;
|
|
ctrl->count++;
|
|
return token;
|
|
}
|
|
|
|
static void
|
|
ircd_gpt_lm_result_attns(__local struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global const float *const restrict attns,
|
|
const uint ln,
|
|
const uint li)
|
|
{
|
|
const uint
|
|
layer = li / opts->layers,
|
|
head = li % opts->attn_rank,
|
|
base = layer * opts->attn_self_elems;
|
|
|
|
uint best = 0;
|
|
float bestv = 10000.0f;
|
|
for(uint i = 0; i < ctrl->count; ++i)
|
|
{
|
|
const uint
|
|
bx = (((i + 1) * i) / 2) * opts->attn_rank,
|
|
idx = base + bx + i * 12 + head;
|
|
|
|
if(attns[idx] < bestv)
|
|
bestv = attns[idx],
|
|
best = i;
|
|
}
|
|
|
|
ctrl->attn[layer][head] = best;
|
|
}
|
|
|
|
__kernel void
|
|
__attribute__((reqd_work_group_size(256, 1, 1)))
|
|
__attribute__((amdgpu_flat_work_group_size(256, 256)))
|
|
__attribute__((visibility("protected")))
|
|
ircd_gpt_lm_select(__global struct ircd_gpt_ctrl *const restrict ctrl_,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global const float logsm[restrict 65536],
|
|
__global const float *const restrict attns)
|
|
{
|
|
const uint
|
|
li = get_local_id(0),
|
|
ln = get_local_size(0),
|
|
logits_pad = ln - (opts->logits % ln),
|
|
tn = (opts->logits + logits_pad) / ln,
|
|
start = tn * li,
|
|
stop = min(start + tn, opts->logits);
|
|
|
|
__local ushort idx[256];
|
|
__local struct ircd_gpt_ctrl ctrl;
|
|
__private event_t event[1];
|
|
|
|
assume(ln == 256);
|
|
assume(start < stop);
|
|
|
|
event[0] = async_work_group_copy
|
|
(
|
|
(__local char16 *)&ctrl,
|
|
(__global const char16 *)ctrl_,
|
|
sizeof(struct ircd_gpt_ctrl) / sizeof(char16),
|
|
0
|
|
);
|
|
|
|
idx[li] = start;
|
|
for(uint j = start + 1; j < stop; ++j)
|
|
if(logsm[j] > logsm[idx[li]])
|
|
idx[li] = j;
|
|
|
|
ircd_simt_sort_idx16_flldr(idx, logsm, ln, li);
|
|
wait_group_events(1, event);
|
|
|
|
if(ctrl.count >= opts->buffer_tokens)
|
|
return;
|
|
|
|
if(li < opts->top_n)
|
|
ircd_gpt_lm_result_top(&ctrl, opts, idx, logsm, li);
|
|
|
|
if(li < opts->labels)
|
|
ircd_gpt_lm_result_label(&ctrl, opts, ctrl.label + li, logsm);
|
|
|
|
if(li < opts->layers * opts->attn_rank)
|
|
ircd_gpt_lm_result_attns(&ctrl, opts, attns, ln, li);
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
if(li == 0)
|
|
ircd_gpt_lm_result(&ctrl, opts, idx, logsm);
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
event[0] = async_work_group_copy
|
|
(
|
|
(__global char16 *)ctrl_,
|
|
(__local const char16 *)&ctrl,
|
|
sizeof(struct ircd_gpt_ctrl) / sizeof(char16),
|
|
0
|
|
);
|
|
|
|
wait_group_events(1, event);
|
|
}
|
|
|
|
__kernel void
|
|
__attribute__((visibility("protected")))
|
|
__attribute__((reqd_work_group_size(256, 1, 1)))
|
|
__attribute__((amdgpu_flat_work_group_size(256, 256)))
|
|
ircd_gpt_leave(__global const void *const restrict model,
|
|
__global void *const restrict state,
|
|
__global void *const restrict master,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global struct ircd_gpt_ctrl *const ctrl_,
|
|
__global struct ircd_gpt_ctrl *const frame)
|
|
{
|
|
const ushort
|
|
li = get_local_id(0),
|
|
ln = get_local_size(0);
|
|
|
|
assume(ln == 256);
|
|
|
|
__local struct ircd_gpt_ctrl _ctrl;
|
|
__local struct ircd_gpt_ctrl *const ctrl = &_ctrl;
|
|
|
|
if(li == 0)
|
|
*ctrl = *ctrl_;
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
if(li == 0 && ctrl->accept < 0)
|
|
ircd_gpt_accept(ctrl, opts);
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
const uint
|
|
batch_size = opts->batch_size,
|
|
samps = opts->training_steps + opts->validation_steps + opts->testing_steps,
|
|
steps = samps / batch_size;
|
|
|
|
const bool
|
|
accepting = ctrl->accept >= 0,
|
|
cycling = !accepting,
|
|
sampling = accepting,
|
|
stepping = sampling && (ctrl->clk.samp + 1) >= batch_size,
|
|
epoching = stepping && (ctrl->clk.step + 1) >= steps;
|
|
|
|
if(li == 0)
|
|
;//ctrl->prof.finished = __builtin_readcyclecounter();
|
|
|
|
if(li == 0)
|
|
*frame = *ctrl;
|
|
|
|
if(!accepting && li == 0)
|
|
{
|
|
ctrl->clk.cycle += cycling;
|
|
ctrl->clk.samp += sampling;
|
|
ctrl->clk.step += stepping;
|
|
ctrl->clk.epoch += epoching;
|
|
}
|
|
}
|
|
|
|
void
|
|
ircd_gpt_accept(__local struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts)
|
|
{
|
|
const bool
|
|
unlimited = opts->limit < 0;
|
|
|
|
const uint
|
|
batch_size = opts->batch_size,
|
|
samps = opts->training_steps + opts->validation_steps + opts->testing_steps,
|
|
steps = samps / batch_size,
|
|
limit_ = opts->limit,
|
|
unproc = ctrl->tokens - ctrl->count;
|
|
|
|
const int
|
|
limit = min(limit_?: unproc, opts->context_tokens),
|
|
cycle_remain = limit - (ctrl->clk.cycle + 1), // cycle not yet incr
|
|
token_remain = opts->context_tokens - ctrl->count, // but count already incr
|
|
remain_ = min(cycle_remain, token_remain),
|
|
accept_ = ircd_gpt_accept_check(ctrl, opts),
|
|
accept_abs = abs(accept_),
|
|
remain = accept_ < 0 && accept_abs < remain_? accept_abs: remain_,
|
|
_accept = accept_ >= 0? accept_: -remain;
|
|
|
|
const bool
|
|
accepting = _accept >= 0,
|
|
dispatching = _accept < 0,
|
|
limiting = remain <= 0;
|
|
|
|
const int
|
|
accept_num = 4,
|
|
accept = limiting? accept_num: _accept,
|
|
dispatch = accept >= 0? 0: remain;
|
|
|
|
ctrl->accept = accept;
|
|
ctrl->dispatch = dispatch;
|
|
|
|
ctrl->magic = 0xC7012C70UL;
|
|
}
|
|
|
|
int
|
|
ircd_gpt_accept_check(__local struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts)
|
|
{
|
|
int best = 8;
|
|
for(uint i = 0; i < 4; ++i)
|
|
{
|
|
const int
|
|
remain = ircd_gpt_accept_match(ctrl, opts, i);
|
|
|
|
if(remain == 0)
|
|
return i;
|
|
|
|
if(remain < best)
|
|
best = remain;
|
|
}
|
|
|
|
return -best;
|
|
}
|
|
|
|
uint
|
|
ircd_gpt_accept_match(__local struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
const uint i)
|
|
{
|
|
const uint
|
|
len = ircd_gpt_accept_len(ctrl, opts, i),
|
|
n = min(ctrl->count, len),
|
|
maxlen = 8;
|
|
|
|
uint ret = len?: maxlen;
|
|
for(uint j = 1; j <= n; ++j)
|
|
{
|
|
uint match = 0;
|
|
for(; match < j; ++match)
|
|
{
|
|
const uint
|
|
accept = opts->accept[i][match],
|
|
token = ctrl->token[ctrl->count - j + match];
|
|
|
|
if(token != accept)
|
|
break;
|
|
}
|
|
|
|
if(match >= j)
|
|
if(!(ret = len - match))
|
|
break;
|
|
}
|
|
|
|
ret = max(ret, ctrl->tokens - ctrl->count);
|
|
ret = min(ret, maxlen);
|
|
return ret;
|
|
}
|
|
|
|
uint
|
|
ircd_gpt_accept_len(__local struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
const uint i)
|
|
{
|
|
uint len = 0;
|
|
for(; len < 8; ++len)
|
|
if(opts->accept[i][len] == (ushort)-1U)
|
|
break;
|
|
|
|
return len;
|
|
}
|
|
|
|
//
|
|
// backside
|
|
//
|
|
|
|
void
|
|
__attribute__((always_inline))
|
|
ircd_gpt_prop_elem(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global float4 *const restrict param_,
|
|
__global float4 *const restrict exp_avg_,
|
|
__global float4 *const restrict exp_avg_sqr_)
|
|
{
|
|
const uint
|
|
li = get_local_id(0),
|
|
ts = ctrl->clk.step;
|
|
|
|
const float4
|
|
param = param_[li],
|
|
grad = ctrl->label[0].loss.mean,
|
|
alpha[2] = { 1.0f - opts->beta[0], 1.0f - opts->beta[1], },
|
|
exp_avg = ts? exp_avg_[li]: 0.0f,
|
|
exp_avg_sqr = ts? exp_avg_sqr_[li]: 0.0f,
|
|
exp_avg_mul = exp_avg * opts->beta[0],
|
|
exp_avg_dot = exp_avg_mul + alpha[0] * grad,
|
|
exp_avg_sqr_mul = exp_avg_sqr * opts->beta[1],
|
|
exp_avg_sqr_dot = exp_avg_sqr_mul + alpha[1] * grad * grad,
|
|
denom = native_sqrt(exp_avg_sqr_dot) + FLT_EPSILON,
|
|
delta = opts->alpha * (exp_avg_dot / denom),
|
|
update = param - delta;
|
|
|
|
param_[li] = param + FLT_EPSILON;
|
|
exp_avg_[li] = exp_avg + FLT_EPSILON;
|
|
exp_avg_sqr_[li] = exp_avg_sqr + FLT_EPSILON;
|
|
|
|
//param_[li] = update;
|
|
//exp_avg_[li] = exp_avg_dot;
|
|
//exp_avg_sqr_[li] = exp_avg_sqr_dot;
|
|
}
|
|
|
|
//
|
|
// backpropagations
|
|
//
|
|
|
|
__kernel void
|
|
__attribute__((always_inline))
|
|
//__attribute__((vec_type_hint(float4)))
|
|
//__attribute__((reqd_work_group_size(192, 1, 1)))
|
|
//__attribute__((amdgpu_flat_work_group_size(192, 192)))
|
|
ircd_gpt_norm_prop(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global ircd_gpt_vectorv *const restrict bias,
|
|
__global ircd_gpt_vectorv *const restrict bias_m0,
|
|
__global ircd_gpt_vectorv *const restrict bias_m1,
|
|
__global ircd_gpt_vectorv *const restrict weight,
|
|
__global ircd_gpt_vectorv *const restrict weight_m0,
|
|
__global ircd_gpt_vectorv *const restrict weight_m1)
|
|
{
|
|
ircd_gpt_prop_elem
|
|
(
|
|
ctrl, opts,
|
|
bias->elem,
|
|
bias_m0->elem,
|
|
bias_m1->elem
|
|
);
|
|
|
|
ircd_gpt_prop_elem
|
|
(
|
|
ctrl, opts,
|
|
weight->elem,
|
|
weight_m0->elem,
|
|
weight_m1->elem
|
|
);
|
|
}
|
|
|
|
__kernel void
|
|
//__attribute__((vec_type_hint(float4)))
|
|
//__attribute__((reqd_work_group_size(192, 1, 1)))
|
|
//__attribute__((amdgpu_flat_work_group_size(192, 192)))
|
|
ircd_gpt_coil_prop_attn(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global ircd_gpt_vectorv *const restrict norm_bias,
|
|
__global ircd_gpt_vectorv *const restrict norm_bias_m0,
|
|
__global ircd_gpt_vectorv *const restrict norm_bias_m1,
|
|
__global ircd_gpt_vectorv *const restrict norm_weight,
|
|
__global ircd_gpt_vectorv *const restrict norm_weight_m0,
|
|
__global ircd_gpt_vectorv *const restrict norm_weight_m1,
|
|
__global ircd_gpt_attn_aperaturev *const restrict fcon_bias,
|
|
__global ircd_gpt_attn_aperaturev *const restrict fcon_bias_m0,
|
|
__global ircd_gpt_attn_aperaturev *const restrict fcon_bias_m1,
|
|
__global ircd_gpt_attn_aperaturev *const restrict fcon_weight,
|
|
__global ircd_gpt_attn_aperaturev *const restrict fcon_weight_m0,
|
|
__global ircd_gpt_attn_aperaturev *const restrict fcon_weight_m1,
|
|
__global ircd_gpt_vectorv *const restrict proj_bias,
|
|
__global ircd_gpt_vectorv *const restrict proj_bias_m0,
|
|
__global ircd_gpt_vectorv *const restrict proj_bias_m1,
|
|
__global ircd_gpt_vectorv *const restrict proj_weight,
|
|
__global ircd_gpt_vectorv *const restrict proj_weight_m0,
|
|
__global ircd_gpt_vectorv *const restrict proj_weight_m1)
|
|
{
|
|
const uint
|
|
fcon_height = opts->embed_elems,
|
|
proj_height = opts->embed_elems,
|
|
segs = 3;
|
|
|
|
ircd_gpt_norm_prop
|
|
(
|
|
ctrl, opts,
|
|
norm_bias,
|
|
norm_bias_m0,
|
|
norm_bias_m1,
|
|
norm_weight,
|
|
norm_weight_m0,
|
|
norm_weight_m1
|
|
);
|
|
|
|
for(uint j = 0; j < segs; ++j)
|
|
ircd_gpt_prop_elem
|
|
(
|
|
ctrl, opts,
|
|
fcon_bias->proj[j],
|
|
fcon_bias_m0->proj[j],
|
|
fcon_bias_m1->proj[j]
|
|
);
|
|
|
|
for(uint i = 0; i < fcon_height; ++i)
|
|
for(uint j = 0; j < segs; ++j)
|
|
ircd_gpt_prop_elem
|
|
(
|
|
ctrl, opts,
|
|
fcon_weight[i].proj[j],
|
|
fcon_weight_m0[i].proj[j],
|
|
fcon_weight_m1[i].proj[j]
|
|
);
|
|
|
|
ircd_gpt_prop_elem
|
|
(
|
|
ctrl, opts,
|
|
proj_bias->elem,
|
|
proj_bias_m0->elem,
|
|
proj_bias_m1->elem
|
|
);
|
|
|
|
for(uint i = 0; i < proj_height; ++i)
|
|
ircd_gpt_prop_elem
|
|
(
|
|
ctrl, opts,
|
|
proj_weight[i].elem,
|
|
proj_weight_m0[i].elem,
|
|
proj_weight_m1[i].elem
|
|
);
|
|
}
|
|
|
|
__kernel void
|
|
//__attribute__((vec_type_hint(float4)))
|
|
//__attribute__((reqd_work_group_size(192, 1, 1)))
|
|
//__attribute__((amdgpu_flat_work_group_size(192, 192)))
|
|
ircd_gpt_coil_prop_ffnn(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global ircd_gpt_vectorv *const restrict norm_bias,
|
|
__global ircd_gpt_vectorv *const restrict norm_bias_m0,
|
|
__global ircd_gpt_vectorv *const restrict norm_bias_m1,
|
|
__global ircd_gpt_vectorv *const restrict norm_weight,
|
|
__global ircd_gpt_vectorv *const restrict norm_weight_m0,
|
|
__global ircd_gpt_vectorv *const restrict norm_weight_m1,
|
|
__global ircd_gpt_ffnn_aperaturev *const restrict fcon_bias,
|
|
__global ircd_gpt_ffnn_aperaturev *const restrict fcon_bias_m0,
|
|
__global ircd_gpt_ffnn_aperaturev *const restrict fcon_bias_m1,
|
|
__global ircd_gpt_ffnn_aperaturev *const restrict fcon_weight,
|
|
__global ircd_gpt_ffnn_aperaturev *const restrict fcon_weight_m0,
|
|
__global ircd_gpt_ffnn_aperaturev *const restrict fcon_weight_m1,
|
|
__global ircd_gpt_vectorv *const restrict proj_bias,
|
|
__global ircd_gpt_vectorv *const restrict proj_bias_m0,
|
|
__global ircd_gpt_vectorv *const restrict proj_bias_m1,
|
|
__global ircd_gpt_vectorv *const restrict proj_weight,
|
|
__global ircd_gpt_vectorv *const restrict proj_weight_m0,
|
|
__global ircd_gpt_vectorv *const restrict proj_weight_m1)
|
|
{
|
|
const uint
|
|
fcon_height = opts->embed_elems,
|
|
proj_height = opts->ffnn_elems,
|
|
segs = 4;
|
|
|
|
ircd_gpt_norm_prop
|
|
(
|
|
ctrl, opts,
|
|
norm_bias,
|
|
norm_bias_m0,
|
|
norm_bias_m1,
|
|
norm_weight,
|
|
norm_weight_m0,
|
|
norm_weight_m1
|
|
);
|
|
|
|
for(uint j = 0; j < segs; ++j)
|
|
ircd_gpt_prop_elem
|
|
(
|
|
ctrl, opts,
|
|
fcon_bias->proj[j],
|
|
fcon_bias_m0->proj[j],
|
|
fcon_bias_m1->proj[j]
|
|
);
|
|
|
|
for(uint i = 0; i < fcon_height; ++i)
|
|
for(uint j = 0; j < segs; ++j)
|
|
ircd_gpt_prop_elem
|
|
(
|
|
ctrl, opts,
|
|
fcon_weight[i].proj[j],
|
|
fcon_weight_m0[i].proj[j],
|
|
fcon_weight_m1[i].proj[j]
|
|
);
|
|
|
|
ircd_gpt_prop_elem
|
|
(
|
|
ctrl, opts,
|
|
proj_bias->elem,
|
|
proj_bias_m0->elem,
|
|
proj_bias_m1->elem
|
|
);
|
|
|
|
for(uint i = 0; i < proj_height; ++i)
|
|
ircd_gpt_prop_elem
|
|
(
|
|
ctrl, opts,
|
|
proj_weight[i].elem,
|
|
proj_weight_m0[i].elem,
|
|
proj_weight_m1[i].elem
|
|
);
|
|
}
|
|
|
|
__kernel void
|
|
//__attribute__((vec_type_hint(float4)))
|
|
//__attribute__((reqd_work_group_size(192, 1, 1)))
|
|
//__attribute__((amdgpu_flat_work_group_size(192, 192)))
|
|
ircd_gpt_lm_embed_prop(__global const struct ircd_gpt_ctrl *const ctrl,
|
|
__constant const struct ircd_gpt_opts *const opts,
|
|
__global ircd_gpt_vectorv *const restrict pos,
|
|
__global ircd_gpt_vectorv *const restrict pos_m0,
|
|
__global ircd_gpt_vectorv *const restrict pos_m1,
|
|
__global ircd_gpt_vectorv *const restrict token,
|
|
__global ircd_gpt_vectorv *const restrict token_m0,
|
|
__global ircd_gpt_vectorv *const restrict token_m1)
|
|
{
|
|
const uint
|
|
ln = get_local_size(0),
|
|
wi = get_global_offset(0) / ln + get_group_id(0),
|
|
wn = ctrl->count,
|
|
cn = opts->context_tokens / wn,
|
|
ci = cn * wi,
|
|
tn = opts->logits / wn,
|
|
ti = tn * wi;
|
|
|
|
for(uint i = ci; i < ci + cn; ++i)
|
|
ircd_gpt_prop_elem
|
|
(
|
|
ctrl, opts,
|
|
pos[i].elem,
|
|
pos_m0[i].elem,
|
|
pos_m1[i].elem
|
|
);
|
|
|
|
for(uint i = ti; i < ti + tn; ++i)
|
|
ircd_gpt_prop_elem
|
|
(
|
|
ctrl, opts,
|
|
token[i].elem,
|
|
token_m0[i].elem,
|
|
token_m1[i].elem
|
|
);
|
|
}
|
|
|
|
/// Gaussian Error Linear Unit
|
|
void
|
|
ircd_gpt_ffnn_gelu(__local ircd_gpt_ffnn_aperaturev *const out,
|
|
__local const ircd_gpt_ffnn_aperaturev *const in_,
|
|
const uint i)
|
|
{
|
|
const float4
|
|
in = in_->fcon[i];
|
|
|
|
float4 a;
|
|
a = 0.044715f;
|
|
a *= in;
|
|
a *= in;
|
|
a += 1.0f;
|
|
a *= 0.7978845608f;
|
|
a *= in;
|
|
|
|
a = tanh(a);
|
|
a += 1.0f;
|
|
a *= in;
|
|
a *= 0.5f;
|
|
|
|
out->fcon[i] = a;
|
|
}
|
|
|
|
void
|
|
ircd_gpt_norm(__local ircd_gpt_vectorv *const out,
|
|
__local const ircd_gpt_vectorv *const in,
|
|
__local ircd_gpt_vectorv *const restrict tmp,
|
|
__global const ircd_gpt_vectorv *const restrict bias,
|
|
__global const ircd_gpt_vectorv *const restrict weight,
|
|
const uint ln,
|
|
const uint li)
|
|
{
|
|
// Layer re-normalization
|
|
ircd_simt_math_norm_f4lldr(out->elem, in->elem, tmp->elem, ln, li);
|
|
ircd_gpt_norm_fmad(out, out, bias, weight, li);
|
|
}
|
|
|
|
void
|
|
ircd_gpt_norm_fmad(__local ircd_gpt_vectorv *const out,
|
|
__local const ircd_gpt_vectorv *const in,
|
|
__global const ircd_gpt_vectorv *const restrict bias,
|
|
__global const ircd_gpt_vectorv *const restrict weight,
|
|
const uint i)
|
|
{
|
|
out->elem[i] = in->elem[i] * weight->elem[i] + bias->elem[i];
|
|
}
|