mirror of https://github.com/matrix-construct/construct synced 2024-06-11 06:28:55 +02:00

ircd::gpt::pipe: Correctness; compute loss, statistics; pipeline optimize.

This commit is contained in:
Jason Volk 2021-04-10 19:28:23 -07:00
parent 0a6be0efed
commit 9c062d9c3f
13 changed files with 867 additions and 444 deletions

View file

@ -22,6 +22,8 @@ namespace ircd::gpt::model
constexpr auto align {64};
extern const decoder *default_model;
extern string_view default_dataset;
extern std::vector<json::object> default_data;
}
/// Attention aperature

View file

@ -17,7 +17,7 @@
/// This structure and its mutable companion in `task.h` determine the outcome
/// of the next execution cycle; options are immutable to device software but
/// may be changed by the host between execution cycles if desired.
///
///
struct ircd_gpt_opts
{
/// Specifies the nominal halting condition based on a sequence of tokens.
@ -89,6 +89,82 @@ struct ircd_gpt_opts
#endif
;
/// Embedding vector elements
uint embed_elems
#ifdef __cplusplus
{
768
}
#endif
;
/// Attention unit width multiple
uint attn_elems
#ifdef __cplusplus
{
embed_elems * 3
}
#endif
;
/// FFNN unit width multiple
uint ffnn_elems
#ifdef __cplusplus
{
embed_elems * 4
}
#endif
;
uint embed_width
#ifdef __cplusplus
{
embed_elems / 4
}
#endif
;
uint attn_width
#ifdef __cplusplus
{
attn_elems / 4
}
#endif
;
uint attn_height
#ifdef __cplusplus
{
embed_elems / 4
}
#endif
;
uint ffnn_width
#ifdef __cplusplus
{
ffnn_elems / 4
}
#endif
;
uint ffnn_height
#ifdef __cplusplus
{
embed_elems / 4
}
#endif
;
/// Specifies the size of the logit vector (the token vocabulary).
uint logits
#ifdef __cplusplus
{
50257
}
#endif
;
/// Seed for the task's PRNG.
ulong seed
#ifdef __cplusplus
@ -97,6 +173,32 @@ struct ircd_gpt_opts
}
#endif
;
/// Training steps
ulong training_steps
#ifdef __cplusplus
{
250000
}
#endif
;
/// Validation steps
ulong validation_steps
#ifdef __cplusplus
{
5000
}
#endif
;
ushort label
#ifdef __cplusplus
{
198
}
#endif
;
}
__attribute__((aligned(4096)));
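
All of the new geometry fields derive from the 768-element GPT-2 small embedding, and the *_width/*_height members count float4 lanes rather than scalars. A standalone sketch of that arithmetic, using the default values initialized above (the constant names here are illustrative only):

// Derived kernel geometry for the default GPT-2 small shape, in float4
// lanes; mirrors the member initializers above.
constexpr unsigned embed_elems = 768;
constexpr unsigned attn_elems  = embed_elems * 3;   // 2304 = qry + key + val
constexpr unsigned ffnn_elems  = embed_elems * 4;   // 3072 = fc expansion

constexpr unsigned embed_width = embed_elems / 4;   // 192 float4s
constexpr unsigned attn_width  = attn_elems  / 4;   // 576 float4s
constexpr unsigned ffnn_width  = ffnn_elems  / 4;   // 768 float4s
constexpr unsigned attn_height = embed_elems / 4;   // 192 float4s
constexpr unsigned ffnn_height = embed_elems / 4;   // 192 float4s

static_assert(embed_width == 192 && attn_width == 576 && ffnn_width == 768);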

View file

@ -22,7 +22,9 @@ struct ircd::gpt::pipe::desc
cl::data
state, // qry/key/val projection (tokens * embed * 3 * float)
accum, // accumulator (tokens * embed * float)
logit, // result output vector (50257 * float)
logit, // result logit vector (50257 * float)
logexp, // outputs distribution (50257 * float)
logsm, // outputs distribution (50257 * float)
ctrl, // control page
opts; // options page
@ -30,6 +32,7 @@ struct ircd::gpt::pipe::desc
lm_embed,
lm_norm,
lm_logit,
lm_logsm,
lm_select;
std::unique_ptr<struct desc::layer>

View file

@ -40,6 +40,7 @@ struct ircd::gpt::pipe::exec
range_positive, // Dimension range of a layer kernel.
range_lm_norm, // Dimension range of the final norm kernel.
range_lm_logit, // Dimension range of the language logit kernel.
range_lm_logsm, // Dimension range of the language statistic kernel.
range_lm_select; // Dimension range of the language token kernel.
cl::exec
@ -48,7 +49,8 @@ struct ircd::gpt::pipe::exec
lm_embed, // Compute token and positional embeddings.
coil[12 * 2], // Pass over all layers.
lm_norm, // Final normalization.
lm_logit, // Compute logit result vector.
lm_logit, // Compute language logits.
lm_logsm, // Statistics on the logits.
lm_select, // Select next token.
acquire_ctrl; // Acquire the control page.

View file

@ -71,6 +71,42 @@ struct ircd_gpt_task
/// State counters for the accept/error sequence codes.
uint accept_seq[4], error_seq[4];
/// Loss for last token of last cycle
float loss;
/// Sum loss over all cycles
float loss_sum;
/// Average loss over all cycles
float loss_mean;
/// Perplexity score for last token of last cycle
float perp;
/// Perplexity sum over all cycles
float perp_sum;
/// Perplexity mean over context
float perp_mean;
/// Logit softmax mu
float samax_mu;
/// Logit softmax sum
float samax_sum;
/// Logit softmax lambda
float samax_lambda;
/// Certainty difference score for last token of last cycle
float cert;
/// Certainty sum over all cycles
float cert_sum;
/// Certainty mean over context
float cert_mean;
/// The token buffer starts at offset 2048 and continues to the end of
/// the page; options specify the size of the tokens buffer in tokens.
/// Additional pages must be attached for larger buffer sizes.
@ -112,5 +148,6 @@ enum ircd::gpt::task::status
};
static_assert(sizeof(struct ircd_gpt_task) == 4096);
static_assert(offsetof(struct ircd_gpt_task, token) == 2048);
static_assert(std::is_standard_layout<struct ircd_gpt_task>::value);
#endif
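
These counters are filled in by the selection kernel each cycle, so the host can report model quality directly from the acquired control page. A minimal host-side sketch of reading them (the printf formatting is illustrative; the field names are those declared above):

#include <cstdio>

// Illustrative host-side dump of the statistics above, given an acquired
// control page `ctrl` of type struct ircd_gpt_task.
static void dump_stats(const struct ircd_gpt_task &ctrl)
{
    std::printf("loss %6.2f (mean %6.2f) perp %5.1f%% (mean %5.1f%%) cert %5.1f%% (mean %5.1f%%)\n",
                ctrl.loss, ctrl.loss_mean,
                ctrl.perp, ctrl.perp_mean,
                ctrl.cert, ctrl.cert_mean);
}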

View file

@ -9,9 +9,14 @@
// full license for this software is available in the LICENSE file.
#pragma once
#ifdef __OPENCL_C_VERSION__
#define HAVE_IRCD_GPT_TOKEN_H
struct ircd_gpt_attn_mask
{
bool
token[1024];
};
union ircd_gpt_token
{
float
@ -19,36 +24,37 @@ union ircd_gpt_token
attn[12][64];
};
#ifdef __OPENCL_C_VERSION__
union ircd_gpt_tokenv
{
float4
word[768/4],
attn[12][64/4];
union ircd_gpt_token
token;
};
#endif
struct ircd_gpt_attn_qkv
{
union ircd_gpt_token
qry,
key,
val;
};
struct ircd_gpt_qkv
#ifdef __OPENCL_C_VERSION__
struct ircd_gpt_attn_qkvv
{
union ircd_gpt_tokenv
qry,
key,
val;
};
#endif
struct ircd_gpt_qkvv
{
union ircd_gpt_tokenv
qry,
key,
val;
};
struct ircd_gpt_attn_mask
{
bool
token[1024];
};
union ircd_gpt_aperature
union ircd_gpt_attn_aperature
{
float
word[768],
@ -56,9 +62,13 @@ union ircd_gpt_aperature
proj[3][768],
qkv[3][12][64],
attn[12][64];
union ircd_gpt_token
token[3];
};
union ircd_gpt_aperaturev
#ifdef __OPENCL_C_VERSION__
union ircd_gpt_attn_aperaturev
{
float4
word[768/4],
@ -66,6 +76,32 @@ union ircd_gpt_aperaturev
proj[3][768/4],
qkv[3][12][64/4],
attn[12][64/4];
union ircd_gpt_tokenv
token[3];
};
#endif
union ircd_gpt_ffnn_aperature
{
float
word[768],
fcon[3072],
proj[4][768];
union ircd_gpt_token
token[4];
};
#ifdef __OPENCL_C_VERSION__
union ircd_gpt_ffnn_aperaturev
{
float4
word[768/4],
fcon[3072/4],
proj[4][768/4];
union ircd_gpt_tokenv
token[4];
};
#endif
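
The aperature unions are alternative views over one contiguous block: the attention aperature's 2304 floats can be read as a flat fully-connected output, as three 768-wide qry/key/val projections, or as 3×12 heads of 64, while the *v variants re-slice the identical layout into float4s for the device. A standalone sketch of that aliasing (the union name here is illustrative):

// Illustrative only: the same 2304 floats viewed three ways, mirroring
// ircd_gpt_attn_aperature (3 * 768 == 3 * 12 * 64 == 2304 scalars).
union attn_aperature_sketch
{
    float fcon[2304];       // flat fully-connected output
    float proj[3][768];     // qry / key / val projections
    float qkv[3][12][64];   // per-head view of each projection
};

static_assert(sizeof(attn_aperature_sketch) == 2304 * sizeof(float));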

View file

@ -19,14 +19,17 @@ ircd_simt_math_mean_f4lldr(__local float4 *const restrict out,
out[i] = in[i];
ircd_simt_reduce_add_f4lldr(out, num, i);
float numerator = 0.0f;
float4 numeratorv = out[i];
for(uint k = 0; k < 4; ++k)
numerator += numeratorv[k];
if(i == 0)
{
float numerator = 0.0f;
float4 numeratorv = out[i];
for(uint k = 0; k < 4; ++k)
numerator += numeratorv[k];
out[i] = numerator;
}
out[i] = numerator;
ircd_simt_broadcast_f4lldr(out, num, i);
numeratorv = out[i];
const float4 numeratorv = out[i];
out[i] = numeratorv / (num * 4);
}

View file

@ -28,15 +28,16 @@ ircd_simt_reduce_add_f4lldr(__local float4 *const buf,
/// the greatest value is placed in index [0], the rest of the buffer is
/// trashed.
inline void
ircd_simt_reduce_max_f4lldr(__local float *const buf,
const uint ln,
const uint li)
ircd_simt_reduce_max_flldr(__local float *const buf,
const uint ln,
const uint li)
{
for(uint stride = ln >> 1; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
if(li < stride)
buf[li] = max(buf[li], buf[li + stride]);
if(buf[li] < buf[li + stride])
buf[li] = buf[li + stride];
}
}

View file

@ -8,17 +8,14 @@
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#include <ircd/gpt/pipe/pipe.h>
namespace ircd::gpt
{
template<class T>
static void fmma(T *out, const T *in, const T *bias, const T *weight, const math::fmma_opts &);
static void gelu(f32x4 &, const f32x4 &);
static void gelu(f32x4 *, const f32x4 *);
static void norm(f32x4 *, const f32x4 *, const f32x4 *, const f32x4 *, const f32);
static void fmma4(f32x4 *, const f32x4 *, const f32x4 *, const f32x4 *);
static void fmma3(f32x4 *, const f32x4 *, const f32x4 *, const f32x4 *);
static void fmma2(f32x4 *, const f32x4 *, const f32x4 *, const f32x4 *, const size_t);
static void fmma1(f32x4 *, const f32x4 *, const f32x4 *, const f32x4 *);
static void vals(float (&)[12][1024][64], const float (&)[12][1024][1024], const float (&)[3][1024][12][64], const size_t);
static void pare(float (&)[12][1024][1024], const float (&)[3][1024][12][64], const size_t);
static void mask(float (&)[12][1024][1024], const float (&)[12][1024][1024], const bool (&)[1024][1024], const size_t);
@ -84,6 +81,7 @@ ircd::gpt::generate(const vector_view<u16> &out,
auto &errc(ctrl.error_seq);
auto &accc(ctrl.accept_seq);
ctrl.tokens = in.size();
ctrl.head = 0;
const size_t tmax
{
@ -112,25 +110,34 @@ ircd::gpt::generate(const vector_view<u16> &out,
else
embed(data(dst), in[j], j, opts);
#if RB_DEBUG
static char dbuf[512] {0};
char report[1536] {0};
char tmbuf[1][64] {{0}};
const size_t report_size = snprintf
(
report, sizeof(report),
"%-2u -- %-3u [%5u] --- --- %s 0 0 | %8s",
"%-4u %4u %4u:%-4u %1u%1u [ %6.2fL %6.2f%% ] %6.2fL %5.1f%% %s",
ctrl.epoch,
ctrl.cycle,
j,
ctrl.tokens,
ctrl.token[j],
vocab::debug(dbuf, ctrl.token[j]).c_str(),
pretty(tmbuf[0], milliseconds(ctrl.elapsed), 1).c_str()
0,
0,
0.0,
0.0,
0.0,
0.0,
vocab::debug(dbuf, in[j]).c_str()
);
log::info
log::logf
{
log, "%s",
log, log::level::DEBUG,
"%s",
string_view{report, report_size}
};
#endif
}
uint64_t cycles(0);
@ -192,24 +199,30 @@ ircd::gpt::generate(const vector_view<u16> &out,
const size_t report_size = snprintf
(
report, sizeof(report),
"%-2u %-2u %-3u %-3u %-3u [%5u] a:%u e:%u %s %8s %8s | %8s",
j,
"%4u:%-4u %4u:%-4u %1u%1u [ %4.1f%% %6.2f%% %5.2fL ] %5.1f%% %5.1f%% %4.1fL %s %04x %8s %8s | %8s",
j + in.size(),
ctrl.tokens,
ctrl.cycle,
ctrl.epoch,
out[j],
ctrl.cycle,
accc[0] + accc[1] + accc[2],
errc[0] + errc[1] + errc[2],
ctrl.cert_mean < 100.0? ctrl.cert_mean: NAN,
ctrl.perp_mean < 100.0? ctrl.perp_mean: NAN,
ctrl.loss_mean < 100.0? ctrl.loss_mean: NAN,
ctrl.cert < 100.0? ctrl.cert: NAN,
ctrl.perp < 100.0? ctrl.perp: NAN,
ctrl.loss < 100.0? ctrl.loss: NAN,
vocab::debug(dbuf, out[j]).c_str(),
out[j],
pretty(tmbuf[0], milliseconds(last_time / bsz), 1).c_str(),
pretty(tmbuf[1], si(cycles / bsz), 1).c_str(),
pretty(tmbuf[2], milliseconds(ctrl.elapsed), 1).c_str()
);
log::info
log::logf
{
log, "%s",
log, log::level::DEBUG,
"%s",
string_view{report, report_size}
};
}
@ -375,41 +388,19 @@ ircd::gpt::coil(float *__restrict__ accum,
a[j][k * 64 + l] = attns[k][j][l];
}
static const math::fmma_opts fmma_opts
{
768, 768, 2U
};
for(uint j(0); j < tokens; ++j)
fmma2((f32x4 *)(accum + j * 768), (const f32x4 *)(a[j]), (const f32x4 *)layer.attn.proj_bias, (const f32x4 *)layer.attn.proj_weight, tokens);
fmma((f32x4 *)(accum + j * 768), (const f32x4 *)(a[j]), (const f32x4 *)layer.attn.proj_bias, (const f32x4 *)layer.attn.proj_weight, fmma_opts);
for(uint j(0); j < tokens; ++j)
ffnn(accum + j * 768, accum + j * 768, decoder, i);
}
}
void
ircd::gpt::ffnn(float *const out,
const float *const in,
const model::decoder &decoder,
const uint laynum)
{
constexpr float ln2_epsilon
{
0.00001
};
const auto &layer
{
decoder.layer[laynum]
};
static float
buf alignas(64) [768],
buf2 alignas(64) [3072];
memset(buf2, 0x0, sizeof(buf2));
norm((f32x4 *)buf, (const f32x4 *)in, (const f32x4 *)layer.ln2.bias, (const f32x4 *)layer.ln2.weight, ln2_epsilon);
fmma3((f32x4 *)buf2, (const f32x4 *)buf, (const f32x4 *)layer.ffnn.fc_bias, (const f32x4 *)layer.ffnn.fc_weight);
gelu((f32x4 *)buf2, (const f32x4 *)buf2);
fmma4((f32x4 *)out, (const f32x4 *)buf2, (const f32x4 *)layer.ffnn.proj_bias, (const f32x4 *)layer.ffnn.proj_weight);
}
void
ircd::gpt::ctrl(float (&__restrict__ out)[3][1024][12][64],
const float *const __restrict__ in,
@ -440,8 +431,13 @@ ircd::gpt::ctrl(float (&__restrict__ out)[3][1024][12][64],
norm((f32x4 *)buf, (const f32x4 *)(in + i * 768), (const f32x4 *)layer.ln1.bias, (const f32x4 *)layer.ln1.weight, ln1_epsilon);
static const math::fmma_opts fmma_opts
{
768, 2304, 2U,
};
memset(proj, 0x0, sizeof(proj));
fmma1((f32x4 *)proj, (const f32x4 *)buf, (const f32x4 *)layer.attn.attn_bias, (const f32x4 *)layer.attn.attn_weight);
fmma((f32x4 *)proj, (const f32x4 *)buf, (const f32x4 *)layer.attn.attn_bias, (const f32x4 *)layer.attn.attn_weight, fmma_opts);
#pragma clang loop unroll (disable)
for(uint j(0); j < 12; ++j)
@ -548,6 +544,43 @@ ircd::gpt::vals(float (&__restrict__ out)[12][1024][64],
out[j][k][m] += in[j][k][l] * val[l][j][m];
}
void
ircd::gpt::ffnn(float *const out,
const float *const in,
const model::decoder &decoder,
const uint laynum)
{
static const math::fmma_opts fmma3_opts
{
768, 3072, 2U,
};
static const math::fmma_opts fmma4_opts
{
3072, 768, 2U,
};
constexpr float ln2_epsilon
{
0.00001
};
const auto &layer
{
decoder.layer[laynum]
};
static float
buf alignas(64) [768],
buf2 alignas(64) [3072];
memset(buf2, 0x0, sizeof(buf2));
norm((f32x4 *)buf, (const f32x4 *)in, (const f32x4 *)layer.ln2.bias, (const f32x4 *)layer.ln2.weight, ln2_epsilon);
fmma((f32x4 *)buf2, (const f32x4 *)buf, (const f32x4 *)layer.ffnn.fc_bias, (const f32x4 *)layer.ffnn.fc_weight, fmma3_opts);
gelu((f32x4 *)buf2, (const f32x4 *)buf2);
fmma((f32x4 *)out, (const f32x4 *)buf2, (const f32x4 *)layer.ffnn.proj_bias, (const f32x4 *)layer.ffnn.proj_weight, fmma4_opts);
}
void
ircd::gpt::norm(f32x4 *const __restrict__ out,
const f32x4 *const __restrict__ in,
@ -567,143 +600,18 @@ ircd::gpt::norm(f32x4 *const __restrict__ out,
out[j] = out[j] * weight[j] + bias[j];
}
template<class T>
void
ircd::gpt::fmma1(f32x4 *const __restrict__ out,
const f32x4 *const __restrict__ in,
const f32x4 *const __restrict__ bias,
const f32x4 *const __restrict__ weight)
ircd::gpt::fmma(T *const __restrict__ out,
const T *const __restrict__ in,
const T *const __restrict__ bias,
const T *const __restrict__ weight,
const math::fmma_opts &opts)
{
constexpr uint width
{
2304
};
for(uint i(0); i < opts.rows / simd::lanes<T>(); ++i)
out[i] += bias[i];
constexpr uint height
{
768
};
constexpr uint lanes
{
simd::lanes<f32x4>()
};
for(uint j(0); j < width / lanes; ++j)
out[j] += bias[j];
static const math::fmma_opts opts
{
width,
height,
2U,
'y',
};
math::fmma<opts>(out, in, weight);
}
void
ircd::gpt::fmma2(f32x4 *const __restrict__ out,
const f32x4 *const __restrict__ in,
const f32x4 *const __restrict__ bias,
const f32x4 *const __restrict__ weight,
const size_t num)
{
constexpr uint width
{
768
};
constexpr uint height
{
768
};
constexpr uint lanes
{
simd::lanes<f32x4>()
};
for(uint j(0); j < width / lanes; ++j)
out[j] += bias[j];
static const math::fmma_opts opts
{
width,
height,
2U,
};
math::fmma<opts>(out, in, weight);
}
void
ircd::gpt::fmma3(f32x4 *const __restrict__ out,
const f32x4 *const __restrict__ in,
const f32x4 *const __restrict__ bias,
const f32x4 *const __restrict__ weight)
{
constexpr uint width
{
3072
};
constexpr uint height
{
768
};
constexpr uint lanes
{
simd::lanes<f32x4>()
};
for(uint j(0); j < width / lanes; ++j)
out[j] += bias[j];
static const math::fmma_opts opts
{
width,
height,
2U,
'y',
};
math::fmma<opts>(out, in, weight);
}
void
ircd::gpt::fmma4(f32x4 *const __restrict__ out,
const f32x4 *const __restrict__ in,
const f32x4 *const __restrict__ bias,
const f32x4 *const __restrict__ weight)
{
constexpr uint width
{
3072
};
constexpr uint height
{
768
};
constexpr uint lanes
{
simd::lanes<f32x4>()
};
for(uint j(0); j < height / lanes; ++j)
out[j] += bias[j];
static const math::fmma_opts opts
{
width,
height,
2U,
};
math::fmma<opts>(out, in, weight);
math::fmma(out, in, weight, opts);
}
void

View file

@ -10,8 +10,8 @@
inline void
ircd_gpt_norm_fmad(__local float4 *const out,
__local const float4 *const in,
ircd_gpt_norm_fmad(__local float4 *const restrict out,
__local const float4 *const restrict in,
__global const float4 *const restrict bias,
__global const float4 *const restrict weight,
const uint i)
@ -27,35 +27,26 @@ ircd_gpt_sgemv(__local float4 *const restrict out,
__global const float4 *const restrict weight,
const uint width,
const uint height,
const uint tiles,
const uint i)
{
const uint seg = height / tiles;
float4 acc = bias[i];
for(uint j = 0; j < seg; ++j)
for(uint t = 0; t < tiles; ++t)
for(uint k = 0; k < 4; ++k)
{
const uint
jidx = t * seg + j,
kidx = jidx * 4 + k,
widx = kidx * width + i;
acc += weight[widx] * in[jidx][k];
}
for(uint j = 0; j < height; ++j)
for(uint k = 0; k < 4; ++k)
acc += in[j][k] * weight[width * (j * 4 + k) + i];
out[i] = acc;
}
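
The simplification above removes the tiling parameter: every work-item i now walks the full input and accumulates one float4 column of the output, with the weight stored row-major at `width` float4 columns per scalar input element. A scalar host-side reference of the same product, with width/height counting scalars rather than float4s (a sketch, not the device code):

// Scalar reference of the matrix-vector product in ircd_gpt_sgemv above;
// here width/height count scalars (4x the kernel's float4 arguments).
static void sgemv_ref(float *out, const float *in,
                      const float *bias, const float *weight,
                      unsigned width, unsigned height)
{
    for(unsigned i = 0; i < width; ++i)
    {
        float acc = bias[i];
        for(unsigned j = 0; j < height; ++j)
            acc += in[j] * weight[j * width + i];  // row-major, width columns
        out[i] = acc;
    }
}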
/// Gaussian Error Linear Unit
inline void
ircd_gpt_gelu(__local float4 *const out,
__local const float4 *const in_,
const uint i)
ircd_gpt_ffnn_gelu(__local float4 *const out,
__local const float4 *const in_,
const uint i)
{
float4 a,
const float4
in = in_[i];
float4 a;
a = 0.044715f;
a *= in;
a *= in;
@ -71,14 +62,35 @@ ircd_gpt_gelu(__local float4 *const out,
out[i] = a;
}
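
The renamed ircd_gpt_ffnn_gelu appears to evaluate the usual tanh approximation of the Gaussian Error Linear Unit, GELU(x) ≈ 0.5·x·(1 + tanh(√(2/π)·(x + 0.044715·x³))); the 0.044715 constant above is the cubic coefficient of that approximation. A scalar reference for comparison (a sketch, not the kernel):

#include <cmath>

// Scalar reference of the tanh-approximated GELU computed above.
static float gelu_ref(const float x)
{
    const float k = 0.7978845608f;                        // sqrt(2 / pi)
    const float inner = k * (x + 0.044715f * x * x * x);
    return 0.5f * x * (1.0f + std::tanh(inner));
}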
//
// core
//
inline void
__attribute__((always_inline))
ircd_gpt_ffnn_fcon(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__local union ircd_gpt_ffnn_aperaturev *const restrict out,
__local const union ircd_gpt_tokenv *const in,
__global const float4 *const restrict bias,
__global const float4 *const restrict weight)
{
const uint
li = get_local_id(0),
ln = get_local_size(0),
width = opts->ffnn_width,
height = opts->ffnn_height;
__kernel void
for(uint i = 0; i < 4; ++i)
ircd_gpt_sgemv(out->fcon, in->word, bias, weight, width, height, i * ln + li);
for(uint i = 0; i < 4; ++i)
ircd_gpt_ffnn_gelu(out->fcon, out->fcon, i * ln + li);
}
inline void
__attribute__((always_inline))
ircd_gpt_ffnn(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__global union ircd_gpt_tokenv *const restrict accum,
__local union ircd_gpt_tokenv *const restrict token,
__local union ircd_gpt_tokenv *const restrict tmp,
__local union ircd_gpt_ffnn_aperaturev *const restrict buf,
__global const float4 *const restrict norm_bias,
__global const float4 *const restrict norm_weight,
__global const float4 *const restrict fcon_bias,
@ -92,72 +104,34 @@ ircd_gpt_ffnn(__global const struct ircd_gpt_task *const ctrl,
li = get_local_id(0),
ln = get_local_size(0),
wi = get_group_id(0),
wn = get_num_groups(0);
__local union ircd_gpt_aperaturev token;
__local float4 tmp[768/4];
// Fetch local copy of the global accumulator. We operate on a cached
// copy as input, and add our output to the global upon completion.
token.word[li] = accum[wi].word[li];
wn = get_num_groups(0),
width = opts->ffnn_width,
height = opts->ffnn_height;
// Layer re-normalization
ircd_simt_math_norm_f4lldr(token.word, token.word, tmp, ln, li);
ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
ircd_simt_math_norm_f4lldr(token->word, token->word, buf->word, ln, li);
ircd_gpt_norm_fmad(tmp->word, token->word, norm_bias, norm_weight, li);
// ln's writes are still pending but fcon reads results across threads.
barrier(CLK_LOCAL_MEM_FENCE);
// Fully connected
for(uint i = 0; i < 4; ++i)
ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 3072/4, 768/4, 4, i * ln + li);
ircd_gpt_ffnn_fcon(ctrl, opts, buf, tmp, fcon_bias, fcon_weight);
// Gaussian Error Linear Unit
for(uint i = 0; i < 4; ++i)
ircd_gpt_gelu(token.fcon, token.fcon, i * ln + li);
// Projection
ircd_gpt_sgemv(tmp, token.fcon, proj_bias, proj_weight, 768/4, 3072/4, 4, li);
// Accumulation; end of layer
accum[wi].word[li] += tmp[li];
}
__kernel void
ircd_gpt_attn_proj(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__global union ircd_gpt_tokenv *const restrict accum,
__local const union ircd_gpt_tokenv *const restrict xattn,
__global const float4 *const restrict proj_bias,
__global const float4 *const restrict proj_weight)
{
const uint
gi = get_global_id(0),
gn = get_global_size(0),
li = get_local_id(0),
ln = get_local_size(0),
wi = get_group_id(0),
wn = get_num_groups(0);
__local float4
in[768/4],
out[768/4];
// Fetch
in[li] = xattn->word[li];
// Need this here if xattn is __local
// fcon's writes are still pending but proj reads results across threads.
barrier(CLK_LOCAL_MEM_FENCE);
// Projection
ircd_gpt_sgemv(out, in, proj_bias, proj_weight, 768/4, 768/4, 1, li);
// Accumulation; end of layer
accum[wi].word[li] += out[li];
ircd_gpt_sgemv(token->word, buf->fcon, proj_bias, proj_weight, height, width, li);
}
__kernel void
inline void
__attribute__((always_inline))
ircd_gpt_attn_self(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__local union ircd_gpt_tokenv *const restrict out,
__global const struct ircd_gpt_qkvv *const restrict token,
__local union ircd_gpt_tokenv *const restrict tmp,
__global const struct ircd_gpt_attn_qkvv *const restrict token,
__global const struct ircd_gpt_attn_mask *const restrict mask) // [1024][1024],
{
const uint
@ -168,18 +142,13 @@ ircd_gpt_attn_self(__global const struct ircd_gpt_task *const ctrl,
wi = get_group_id(0),
wn = get_num_groups(0);
__local union
{
float
attn[12][96];
}
self;
__local union ircd_gpt_token *const restrict self = &tmp->token;
for(uint i = 0; i < wn; ++i)
if(mask[wi].token[i])
self.attn[li][i] = 0.0f;
self->attn[li][i] = 0.0f;
else
self.attn[li][i] = -10000.0f;
self->attn[li][i] = -10000.0f;
for(uint i = 0; i < wn; ++i)
if(mask[wi].token[i])
@ -190,40 +159,153 @@ ircd_gpt_attn_self(__global const struct ircd_gpt_task *const ctrl,
key = token[i].key.attn[li][j],
res = qry * key;
for(uint k = 0; k < 4; ++k)
self.attn[li][i] += res[k];
self->attn[li][i] += res[k];
}
for(uint i = 0; i < wn; ++i)
if(mask[wi].token[i])
self.attn[li][i] /= 8.0f;
self->attn[li][i] /= 8.0f;
float mu = -10000.0f;
for(uint i = 0; i < wn; ++i)
mu = max(mu, self->attn[li][i]);
for(uint i = 0; i < wn; ++i)
self.attn[li][i] = exp(self.attn[li][i]);
self->attn[li][i] = exp(self->attn[li][i] - mu);
float4 vacc = 0.0f;
float sum = 0.0f;
for(uint i = 0; i < wn; ++i)
vacc[i % 4] += self.attn[li][i];
float acc = 0.0f;
for(uint i = 0; i < 4; ++i)
acc += vacc[i];
sum += self->attn[li][i];
const float lambda = 1.0f / sum;
for(uint i = 0; i < wn; ++i)
self.attn[li][i] /= acc;
self->attn[li][i] *= lambda;
for(uint j = 0; j < 64/4; ++j)
out->attn[li][j] = 0.0f;
for(uint i = 0; i < wn; ++i)
for(uint j = 0; j < 64/4; ++j)
out->attn[li][j] += token[i].val.attn[li][j] * self.attn[li][i];
out->attn[li][j] += token[i].val.attn[li][j] * self->attn[li][i];
}
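
Per head, the self-attention backend now applies the max-shifted softmax: scores are masked, scaled by 1/8 (1/√64 for the 64-wide head), shifted by their maximum mu before exponentiation, and scaled by lambda = 1/sum when weighting the values. A scalar single-row reference of the same computation (a sketch with illustrative names; n is the number of visible tokens, at most 1024):

#include <cmath>

// Scalar reference of one attention row for one 64-wide head, mirroring the
// masked, max-shifted softmax in ircd_gpt_attn_self above.
static void attn_row_ref(float (&out)[64],
                         const float (&qry)[64],
                         const float (*key)[64],   // [n][64]
                         const float (*val)[64],   // [n][64]
                         const bool *mask,         // causal mask row
                         const unsigned n)         // visible tokens (<= 1024)
{
    float attn[1024], mu = -10000.0f;
    for(unsigned i = 0; i < n; ++i)
    {
        float score = -10000.0f;
        if(mask[i])
        {
            score = 0.0f;
            for(unsigned k = 0; k < 64; ++k)
                score += qry[k] * key[i][k];
            score /= 8.0f;                          // 1 / sqrt(64)
        }
        attn[i] = score;
        mu = std::fmax(mu, score);
    }

    float sum = 0.0f;
    for(unsigned i = 0; i < n; ++i)
        sum += attn[i] = std::exp(attn[i] - mu);    // shift by max before exp

    const float lambda = 1.0f / sum;
    for(unsigned k = 0; k < 64; ++k)
        out[k] = 0.0f;

    for(unsigned i = 0; i < n; ++i)
        for(unsigned k = 0; k < 64; ++k)
            out[k] += val[i][k] * attn[i] * lambda;
}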
inline void
__attribute__((always_inline))
ircd_gpt_attn_proj(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__local union ircd_gpt_tokenv *const out,
__local const union ircd_gpt_tokenv *const xattn,
__global const float4 *const restrict bias,
__global const float4 *const restrict weight)
{
const uint
gi = get_global_id(0),
gn = get_global_size(0),
li = get_local_id(0),
ln = get_local_size(0),
wi = get_group_id(0),
wn = get_num_groups(0),
height = opts->attn_height,
width = opts->attn_height; // same
// Projection
ircd_gpt_sgemv(out->word, xattn->word, bias, weight, height, width, li);
}
__kernel void
ircd_gpt_coil(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__global union ircd_gpt_tokenv *const restrict accum,
__global const struct ircd_gpt_attn_qkvv *const restrict state,
__global const struct ircd_gpt_attn_mask *const restrict mask, // [1024][1024],
__global const float4 *const restrict attn_proj_bias,
__global const float4 *const restrict attn_proj_weight,
__global const float4 *const restrict ffnn_norm_bias,
__global const float4 *const restrict ffnn_norm_weight,
__global const float4 *const restrict ffnn_fcon_bias,
__global const float4 *const restrict ffnn_fcon_weight,
__global const float4 *const restrict ffnn_proj_bias,
__global const float4 *const restrict ffnn_proj_weight)
{
const uint
li = get_local_id(0),
wi = get_group_id(0);
__local union ircd_gpt_ffnn_aperaturev
ffnn_fcon;
__local union ircd_gpt_tokenv
buf0, buf1;
// Self-attention backend; this computes the self-attention result now
// that keys and values are globally visible across tokens.
ircd_gpt_attn_self
(
ctrl,
opts,
&buf1,
&buf0,
state,
mask
);
// Self-attention's writes are pending on each thread but each proj
// call requires results from all threads for input to the matmul.
barrier(CLK_LOCAL_MEM_FENCE);
// Project result of self-attention.
ircd_gpt_attn_proj
(
ctrl,
opts,
&buf0,
&buf1,
attn_proj_bias,
attn_proj_weight
);
// Frontend accumulation
{
const float4
attn = buf0.word[li],
resid = accum[wi].word[li];
buf0.word[li] += resid;
accum[wi].word[li] += attn;
}
// Backend mlp; layer-norm acquires any pending writes, no fence required.
ircd_gpt_ffnn
(
ctrl,
opts,
&buf0,
&buf1,
&ffnn_fcon,
ffnn_norm_bias,
ffnn_norm_weight,
ffnn_fcon_bias,
ffnn_fcon_weight,
ffnn_proj_bias,
ffnn_proj_weight
);
// Backend accumulation
{
const float4
ffnn = buf0.word[li],
resid = accum[wi].word[li],
result = ffnn + resid;
accum[wi].word[li] = result;
}
}
__kernel void
ircd_gpt_attn_fcon(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__global union ircd_gpt_aperaturev *const restrict out,
__global const union ircd_gpt_tokenv *const restrict in,
__global union ircd_gpt_attn_aperaturev *const restrict state,
__global const union ircd_gpt_tokenv *const restrict accum,
__global const float4 *const restrict norm_bias,
__global const float4 *const restrict norm_weight,
__global const float4 *const restrict fcon_bias,
@ -235,74 +317,32 @@ ircd_gpt_attn_fcon(__global const struct ircd_gpt_task *const ctrl,
li = get_local_id(0),
ln = get_local_size(0),
wi = get_group_id(0),
wn = get_num_groups(0);
wn = get_num_groups(0),
width = opts->attn_width,
height = opts->attn_height;
__local union ircd_gpt_aperaturev token;
__local float4 tmp[768/4];
__local union ircd_gpt_attn_aperaturev
token;
token.word[li] = in[wi].word[li];
__local float4
tmp[768/4];
token.word[li] = accum[wi].word[li];
// Layer re-normalization
ircd_simt_math_norm_f4lldr(token.word, token.word, tmp, ln, li);
ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
// Ln's writes are still pending; fcon requires results across threads.
barrier(CLK_LOCAL_MEM_FENCE);
// Fully connected
for(uint i = 0; i < 3; ++i)
ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 2304/4, 768/4, 4, i * ln + li);
ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, width, height, i * ln + li);
// Export queries, keys, and values.
for(uint i = 0; i < 3; ++i)
out[wi].proj[i][li] = token.proj[i][li];
}
__kernel void
ircd_gpt_coil(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__global union ircd_gpt_tokenv *const restrict accum,
__global const struct ircd_gpt_qkvv *const restrict state,
__global const struct ircd_gpt_attn_mask *const restrict mask, // [1024][1024],
__global const float4 *const restrict attn_proj_bias,
__global const float4 *const restrict attn_proj_weight,
__global const float4 *const restrict ffnn_norm_bias,
__global const float4 *const restrict ffnn_norm_weight,
__global const float4 *const restrict ffnn_fcon_bias,
__global const float4 *const restrict ffnn_fcon_weight,
__global const float4 *const restrict ffnn_proj_bias,
__global const float4 *const restrict ffnn_proj_weight)
{
__local union ircd_gpt_tokenv value;
ircd_gpt_attn_self
(
ctrl,
opts,
&value,
state,
mask
);
ircd_gpt_attn_proj
(
ctrl,
opts,
accum,
&value,
attn_proj_bias,
attn_proj_weight
);
ircd_gpt_ffnn
(
ctrl,
opts,
accum,
ffnn_norm_bias,
ffnn_norm_weight,
ffnn_fcon_bias,
ffnn_fcon_weight,
ffnn_proj_bias,
ffnn_proj_weight
);
state[wi].proj[i][li] = token.proj[i][li];
}
//
@ -320,7 +360,8 @@ _ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
const uint word_idx)
{
const ushort
token = ctrl->token[(ctrl->head + tok_idx) % opts->buffer_tokens];
ring_idx = (ctrl->head + tok_idx) % opts->buffer_tokens,
token = ctrl->token[ring_idx];
const float4
wte = vocab[token].word[word_idx],
@ -362,9 +403,9 @@ ircd_gpt_lm_norm(__global const struct ircd_gpt_task *const ctrl,
// Final re-normalization
ircd_simt_math_norm_f4lldr(token.word, token.word, tmp.word, ln, li);
ircd_gpt_norm_fmad(token.word, token.word, norm_bias, norm_weight, li);
ircd_gpt_norm_fmad(tmp.word, token.word, norm_bias, norm_weight, li);
accum[0].word[li] = token.word[li];
accum[wi].word[li] = tmp.word[li];
}
__kernel void
@ -375,31 +416,122 @@ ircd_gpt_lm_logit(__global const struct ircd_gpt_task *const ctrl,
__global const union ircd_gpt_tokenv *const restrict token)
{
const uint
gi = get_global_id(0);
gi = get_global_id(0),
ti = ctrl->tokens - 1,
words = opts->embed_width;
float4 acc = 0.0f;
for(uint j = 0; j < 768/4; ++j)
__attribute__((opencl_unroll_hint))
for(uint j = 0; j < words; ++j)
{
const float4
in = accum[0].word[j],
vocab = token[gi].word[j],
res = vocab * in;
in = accum[ti].word[j],
vocab = token[gi].word[j];
acc += res;
acc += vocab * in;
}
float res = 0.0f;
for(uint k = 0; k < 4; ++k)
res += acc[k];
logit[gi] = res;
if(gi < opts->logits)
logit[gi] = res;
else
logit[gi] = -10000.0f;
}
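
lm_logit now reads the final hidden state of the last token (accum[tokens - 1]) rather than slot 0, dots it with every vocabulary embedding (the tied wte matrix), and pins any index at or beyond opts->logits to -10000 so the padded tail of the aligned-up global range can never be selected. A scalar reference of a single logit (a sketch; names illustrative):

// Reference of one language-model logit: the dot product of the last token's
// normalized hidden state with the embedding of vocabulary entry `v` (tied wte).
static float lm_logit_ref(const float (&hidden)[768],
                          const float (&wte_v)[768])
{
    float acc = 0.0f;
    for(unsigned j = 0; j < 768; ++j)
        acc += hidden[j] * wte_v[j];
    return acc;
}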
__kernel void
ircd_gpt_lm_logsm(__global struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__global float4 *const restrict logsm,
__global float4 *const restrict logexp,
__global const float4 *const restrict logit)
{
const uint
gi = get_global_id(0),
li = get_local_id(0),
ln = get_local_size(0),
logits = opts->logits,
logits_alignup = logits + (ln - (logits % ln)),
tn = logits_alignup / ln / 4,
ti = tn * li;
__local float share[256];
__local float4 share4[256];
share4[li] = -10000.0f;
for(uint i = ti; i < ti + tn; ++i)
share4[li] = max(share4[li], logit[i]);
share[li] = -10000.0f;
for(uint k = 0; k < 4; ++k)
share[li] = max(share[li], share4[li][k]);
ircd_simt_reduce_max_flldr(share, ln, li);
if(li == 0)
share4[li] = ctrl->samax_mu = share[li];
ircd_simt_broadcast_f4lldr(share4, ln, li);
const float4
mu = share4[li];
share4[li] = 0.0f;
for(uint i = ti; i < ti + tn; ++i)
{
const float4
reg = logit[i] - mu,
res = exp(reg);
for(uint k = 0; k < 4; ++k)
if(i * 4 + k < logits)
share4[li][k] += res[k];
for(uint k = 0; k < 4; ++k)
if(i * 4 + k < logits)
logexp[i][k] = res[k];
else
logexp[i][k] = 0.0f;
}
ircd_simt_reduce_add_f4lldr(share4, ln, li);
if(li == 0)
{
float sum = 0.0f;
for(uint k = 0; k < 4; ++k)
sum += share4[li][k];
share4[li][0] = ctrl->samax_sum = sum;
share4[li][1] = ctrl->samax_lambda = 1.0f / sum;
}
ircd_simt_broadcast_f4lldr(share4, ln, li);
const float4
sum = share4[li][0],
lambda = share4[li][1];
for(uint i = ti; i < ti + tn; ++i)
for(uint k = 0; k < 4; ++k)
if(i * 4 + k < logits)
logsm[i] = logexp[i] * lambda;
else
logsm[i] = 0.0f;
}
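
The new lm_logsm kernel turns the raw logits into a distribution with the max-shifted (numerically safe) softmax, and stashes the intermediates mu (maximum logit), sum, and lambda = 1/sum into the control page for the loss computation downstream. A single-threaded reference of the same result (a sketch, not the work-group version):

#include <cmath>

// Single-threaded reference of the safe softmax computed by ircd_gpt_lm_logsm;
// logit, logexp and logsm each hold `logits` floats (50257 by default).
static void logsm_ref(float *logsm, float *logexp,
                      const float *logit, const unsigned logits)
{
    float mu = -10000.0f;
    for(unsigned i = 0; i < logits; ++i)
        mu = std::fmax(mu, logit[i]);                 // ctrl->samax_mu

    float sum = 0.0f;
    for(unsigned i = 0; i < logits; ++i)
        sum += logexp[i] = std::exp(logit[i] - mu);   // ctrl->samax_sum

    const float lambda = 1.0f / sum;                  // ctrl->samax_lambda
    for(unsigned i = 0; i < logits; ++i)
        logsm[i] = logexp[i] * lambda;
}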
inline void
__attribute__((always_inline))
ircd_gpt_leave(__global struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
const uint li)
{
// If the call value has been set to something other than default we
// do nothing else here.
if(ctrl->call != IRCD_GPT_ECOMPLETE)
return;
// No action for other threads right now
if(li != 0)
return;
@ -411,11 +543,6 @@ ircd_gpt_leave(__global struct ircd_gpt_task *const ctrl,
ctrl->call = IRCD_GPT_ETOKENS;
#endif
// If the call value has been set to something other than default we
// do nothing else here.
if(ctrl->call != IRCD_GPT_ECOMPLETE)
return;
// On the last cycle, with no prior call or error code set, indicate
// a nominal exit condition.
if(ctrl->cycle + 1 >= opts->limit)
@ -425,48 +552,83 @@ ircd_gpt_leave(__global struct ircd_gpt_task *const ctrl,
}
ctrl->cycle += 1;
ctrl->magic = 0xC7012C70U;
}
inline void
__attribute__((always_inline))
ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
const uint li,
__local const ushort *const restrict idx)
__local const ushort *const restrict idx,
__global const float *const restrict logsm,
__global const float *const restrict logexp,
__global const float *const restrict logit)
{
// When the hypercall code is already set, bail here.
if(ctrl->call != IRCD_GPT_ECOMPLETE)
return;
// To read from cells other than idx[0] we need this barrier.
if(opts->top_k > 1)
barrier(CLK_LOCAL_MEM_FENCE);
// No action for other threads right now
// Mask for write-leader
if(li != 0)
return;
// When the hypercall code is already set, bail here.
if(ctrl->call != IRCD_GPT_ECOMPLETE)
return;
const bool
buffer_full = ctrl->tokens >= opts->buffer_tokens;
const ulong
rnd = ircd_simt_rand_xoshiro256pg(ctrl->rand),
sel = rnd % max(opts->top_k, 1U);
rnd = opts->top_k > 1?
ircd_simt_rand_xoshiro256pg(ctrl->rand): 1UL;
const ushort
token = idx[sel],
token_idx = (ctrl->head + ctrl->tokens) % opts->buffer_tokens;
entro = max(opts->top_k, 1U),
select = rnd % entro,
token = idx[select],
dest = (ctrl->head + ctrl->tokens) % opts->buffer_tokens,
tokens = min(ctrl->tokens + 1, opts->buffer_tokens),
head = buffer_full?
(ctrl->head + 1) % opts->buffer_tokens: ctrl->head;
ctrl->token[token_idx] = token;
const ushort
next_select = select + 1,
next_token = idx[next_select];
if(buffer_full)
ctrl->head = (ctrl->head + 1) % opts->buffer_tokens;
else
ctrl->tokens++;
const float
test_lsm = logexp[opts->label] * ctrl->samax_lambda,
loss = 0.0f - log(test_lsm * ctrl->samax_lambda),
perp = logsm[token] * 100.0f,
cert = ((logsm[token] - logsm[next_token]) / logsm[token]) * 100.0f,
loss_sum = ctrl->loss_sum + loss,
perp_sum = ctrl->perp_sum + perp,
cert_sum = ctrl->cert_sum + cert,
mean_div = ctrl->epoch + 1.0f,
loss_mean = loss_sum / mean_div,
perp_mean = perp_sum / mean_div,
cert_mean = cert_sum / mean_div;
ctrl->loss = loss;
ctrl->loss_sum = loss_sum;
ctrl->loss_mean = loss_mean;
ctrl->perp = perp;
ctrl->perp_sum = perp_sum;
ctrl->perp_mean = perp_mean;
ctrl->cert = cert;
ctrl->cert_sum = cert_sum;
ctrl->cert_mean = cert_mean;
ctrl->head = head;
ctrl->tokens = tokens;
ctrl->token[dest] = token;
}
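
Before appending the selected token to the ring buffer, lm_result now derives the per-cycle quality figures: loss as the negative log of the probability assigned to the training label, the perplexity figure as the selected token's softmax probability in percent, and certainty as the relative margin between the top choice and the runner-up, each with a running sum and mean over epochs. A hedged scalar sketch of that bookkeeping, written in the conventional cross-entropy form (field names follow ircd_gpt_task; p[] is the softmax distribution):

#include <cmath>

// Sketch of the statistics update in ircd_gpt_lm_result; p[] is the softmax
// (logsm), label the training target, token the selection, next the runner-up.
static void result_stats_ref(struct ircd_gpt_task &ctrl, const float *p,
                             const unsigned label, const unsigned token,
                             const unsigned next)
{
    const float loss = 0.0f - std::log(p[label]);                  // cross-entropy vs label
    const float perp = p[token] * 100.0f;                          // confidence in the pick
    const float cert = ((p[token] - p[next]) / p[token]) * 100.0f; // margin over runner-up
    const float div = ctrl.epoch + 1.0f;                           // cycles contributing

    ctrl.loss = loss;  ctrl.loss_sum += loss;  ctrl.loss_mean = ctrl.loss_sum / div;
    ctrl.perp = perp;  ctrl.perp_sum += perp;  ctrl.perp_mean = ctrl.perp_sum / div;
    ctrl.cert = cert;  ctrl.cert_sum += cert;  ctrl.cert_mean = ctrl.cert_sum / div;
}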
__kernel void
ircd_gpt_lm_select(__global struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__global const float *const restrict logsm,
__global const float *const restrict logexp,
__global const float *const restrict logit)
{
const uint
@ -476,17 +638,17 @@ ircd_gpt_lm_select(__global struct ircd_gpt_task *const ctrl,
ln = get_local_size(0),
wi = get_group_id(0),
wn = get_num_groups(0),
tn = 262,
tn = opts->logits / ln,
ti = tn * li;
__local ushort idx[192];
__local ushort idx[256];
idx[li] = ti;
for(uint j = ti + 1; j < ti + tn && j < 50257; ++j)
if(logit[j] > logit[idx[li]])
for(uint j = ti + 1; j < ti + tn; ++j)
if(logsm[j] > logsm[idx[li]])
idx[li] = j;
ircd_simt_sort_idx16_flldr(idx, logit, ln, li);
ircd_gpt_lm_result(ctrl, opts, li, idx);
ircd_simt_sort_idx16_flldr(idx, logsm, ln, li);
ircd_gpt_lm_result(ctrl, opts, li, idx, logsm, logexp, logit);
ircd_gpt_leave(ctrl, opts, li);
}

View file

@ -32,20 +32,25 @@ namespace ircd::gpt::model
init_h_attn_proj_bias(decoder &, const string_view &, const size_t &, const json::array &),
init_h_attn_bias(decoder &, const string_view &, const size_t &, const json::array &);
static bool init_dataset(const string_view &);
static bool init_from_cache(const string_view &);
static void init_from_json_handle(decoder &, const init_handler &, const size_t &);
static void init_from_json(const string_view &, const string_view &);
static void init() noexcept;
static void init(), fini() noexcept;
extern const init_handler
manifest[],
manifest_h[],
manifest_td[];
manifest_h[];
extern conf::item<std::string> path;
extern conf::item<std::string> cache_path;
extern conf::item<std::string>
path,
cache_path,
dataset_path;
static fs::map
default_model_shm,
default_dataset_shm;
static fs::map default_model_shm;
static std::unique_ptr<decoder> default_model_res;
}
@ -76,14 +81,6 @@ ircd::gpt::model::manifest
{ "wte.weight.json", init_wte_weight },
};
decltype(ircd::gpt::model::manifest_td)
ircd::gpt::model::manifest_td
{
{ "test.jsonl", nullptr, },
{ "valid.jsonl", nullptr, },
{ "train.jsonl", nullptr, },
};
decltype(ircd::gpt::model::cache_path)
ircd::gpt::model::cache_path
{
@ -91,6 +88,13 @@ ircd::gpt::model::cache_path
{ "default", "model.cache.localhost" },
};
decltype(ircd::gpt::model::dataset_path)
ircd::gpt::model::dataset_path
{
{ "name", "ircd.gpt.model.dataset.path" },
{ "default", string_view{} },
};
decltype(ircd::gpt::model::path)
ircd::gpt::model::path
{
@ -104,15 +108,35 @@ ircd::gpt::model::path
decltype(ircd::gpt::model::default_model)
ircd::gpt::model::default_model;
decltype(ircd::gpt::model::default_dataset)
ircd::gpt::model::default_dataset;
decltype(ircd::gpt::model::default_data)
ircd::gpt::model::default_data;
void
ircd::gpt::model::init()
noexcept
{
if(!model::path)
return;
if(!init_from_cache(model::cache_path))
init_from_json(model::cache_path, model::path);
if(model::dataset_path)
init_dataset(model::dataset_path);
}
void
ircd::gpt::model::fini()
noexcept
{
default_model = nullptr;
default_model_shm = {};
default_dataset = nullptr;
default_data.clear();
default_dataset_shm = {};
}
bool
@ -170,10 +194,9 @@ ircd::gpt::model::init_from_json(const string_view &cache_path,
const string_view &model_path)
{
util::timer stopwatch;
auto decoder
{
std::make_unique<model::decoder>()
};
auto decoder(std::make_unique<model::decoder>());
memset(decoder.get(), 0x0, sizeof(model::decoder));
// Load the top level files, vocab etc
for(size_t i(0); i < 4; ++i)
@ -274,6 +297,55 @@ ircd::gpt::model::init_from_json_handle(decoder &d,
};
}
bool
ircd::gpt::model::init_dataset(const string_view &path)
{
if(!fs::is_reg(path))
return false;
const auto size
{
fs::size(path)
};
const fs::fd fd
{
path
};
fs::map::opts map_opts;
map_opts.huge2mb = true;
default_dataset_shm = fs::map
{
fd, map_opts, size
};
default_dataset = string_view
(
default_dataset_shm
);
size_t checkpoint(0);
default_data.resize(260000); //TODO: XXX
ircd::tokens(default_dataset, '\n', [&checkpoint]
(const string_view &line)
{
default_data.at(checkpoint++) = line;
});
char pbuf[48];
log::info
{
log, "dataset(%p) mapped `%s' %s @%lu",
data(default_dataset_shm),
path,
pretty(pbuf, iec(size)),
checkpoint,
};
return true;
}
void
ircd::gpt::model::init_wpe_weight(decoder &d,
const string_view &name,

View file

@ -32,7 +32,7 @@ decltype(ircd::gpt::pipe::flush_cycles)
ircd::gpt::pipe::flush_cycles
{
{ "name", "ircd.gpt.pipe.flush" },
{ "default", 0L, },
{ "default", 1L, },
};
decltype(ircd::gpt::pipe::default_model)
@ -73,20 +73,40 @@ ircd::gpt::pipe::init()
{
*pipe::default_code, *pipe::default_model
};
log::debug
{
log, "Pipe initialized from model:%p data:%p code:%p desc:%p",
&default_model,
pipe::default_model,
pipe::default_code,
pipe::default_desc,
};
}
void
ircd::gpt::pipe::fini()
noexcept
{
delete default_desc;
default_desc = nullptr;
const auto pending
{
cl::work::list.size()
};
delete default_code;
default_code = nullptr;
if(pending)
{
log::warning
{
log, "Waiting for %zu pending tasks to leave the pipe...",
pending,
};
delete default_model;
default_model = nullptr;
cl::sync();
}
delete default_desc; default_desc = nullptr;
delete default_code; default_code = nullptr;
delete default_model; default_model = nullptr;
}
//
@ -96,26 +116,29 @@ noexcept
void
ircd::gpt::generate(task &task)
{
if(unlikely(!pipe::default_model))
pipe::init();
assert(pipe::default_model);
assert(task.opts);
const auto &opts
{
*task.opts
};
assert(task.ctrl);
auto &ctrl
{
*task.ctrl
};
ctrl.cycle = 0;
ctrl.call = IRCD_GPT_ECOMPLETE;
ctrl.host_tsc = prof::cycles();
size_t cycle(ctrl.cycle);
const size_t tokens(ctrl.tokens);
volatile const size_t tokens(ctrl.tokens);
volatile const auto epoch(ctrl.epoch);
volatile size_t cycle(ctrl.cycle);
std::deque<pipe::exec> list;
for(; cycle < opts.limit; ++cycle)
for(; cycle < opts.limit && run::level == run::level::RUN; ++cycle)
{
// When the release/acquire bits are set the control pages are sent
// and received; only set on first and last iterations of this loop.
@ -137,7 +160,7 @@ ircd::gpt::generate(task &task)
pipe::flush_cycles
// Skip flushing on cycles already performing IO or waiting.
&& !rel && !acq && list.size() <= pipe::queue_cycles
&& !acq && list.size() <= pipe::queue_cycles
// The configuration item can specify an interval greater than
// one between flushes.
@ -167,6 +190,8 @@ ircd::gpt::generate(task &task)
// Wait for all unfinished
list.clear();
assert(ctrl.magic == 0xC7012C70);
// Interp error codes
if(unlikely(ctrl.call <= 0))
throw error
@ -176,7 +201,7 @@ ircd::gpt::generate(task &task)
reflect(ctrl.call),
};
always_assert(ctrl.cycle == cycle);
assert(ctrl.cycle == cycle);
}
void
@ -199,7 +224,7 @@ ircd::gpt::pipe::profile_dumplog(pipe::exec &exec)
log::logf
{
log, log::level::DEBUG,
"coil:%-2lu %8s %8s %8s %8s\n",
"coil:%-2lu %8s %8s %8s %8s",
i,
util::pretty(tmbuf[0], si(pro[0]), 1),
util::pretty(tmbuf[1], si(pro[1]), 1),
@ -259,13 +284,18 @@ ircd::gpt::pipe::exec::exec(task &task,
}
,range_lm_logit
{
{ 262 * 192UL, 0 }, // align_up(50257) / 192
{ 192UL, 0 },
{ 786 * 64UL, 0 }, // align_up(50257) / 64
{ 64UL, 0 },
}
,range_lm_logsm
{
{ 1 * 256UL, 0 },
{ 256UL, 0 },
}
,range_lm_select
{
{ 1 * 192UL, 0 },
{ 192UL, 0 },
{ 1 * 256UL, 0 },
{ 256UL, 0 },
}
,release_opts
{
@ -314,6 +344,10 @@ ircd::gpt::pipe::exec::exec(task &task,
{
desc->lm_logit, range_lm_logit, lmhead_opts
}
,lm_logsm
{
desc->lm_logsm, range_lm_logsm, lmhead_opts
}
,lm_select
{
desc->lm_select, range_lm_select, lmamax_opts
@ -342,8 +376,8 @@ ircd::gpt::pipe::code::compile_opts
" -cl-finite-math-only"
" -cl-unsafe-math-optimizations"
" -cl-fast-relaxed-math"
//" -cl-mad-enable"
//" -cl-single-precision-constant"
" -cl-mad-enable"
" -cl-single-precision-constant"
//" -cl-fp32-correctly-rounded-divide-sqrt"
};
@ -412,6 +446,16 @@ ircd::gpt::pipe::desc::desc(pipe::code &code,
65536 * sizeof(float),
mutable_buffer{}
}
,logexp
{
65536 * sizeof(float),
mutable_buffer{}
}
,logsm
{
65536 * sizeof(float),
mutable_buffer{}
}
,ctrl
{
sizeof(struct ircd_gpt_task),
@ -452,12 +496,24 @@ ircd::gpt::pipe::desc::desc(pipe::code &code,
accum,
model.embed->token,
}
,lm_logsm
{
code,
"ircd_gpt_lm_logsm",
ctrl,
opts,
logsm,
logexp,
logit,
}
,lm_select
{
code,
"ircd_gpt_lm_select",
ctrl,
opts,
logsm,
logexp,
logit,
}
,layer

View file

@ -17336,6 +17336,7 @@ console_cmd__gpt__raw(opt &out, const string_view &line)
};
out
<< text
<< output
<< std::endl;
return true;
@ -17347,6 +17348,14 @@ console_cmd__gpt(opt &out, const string_view &line)
return console_cmd__gpt__raw(out, line);
}
bool
console_cmd__gpt__pipe__reset(opt &out, const string_view &line)
{
gpt::pipe::fini();
gpt::pipe::init();
return true;
}
bool
console_cmd__gpt__token(opt &out, const string_view &line)
{
@ -17368,3 +17377,33 @@ console_cmd__gpt__token(opt &out, const string_view &line)
return true;
}
bool
console_cmd__gpt__data(opt &out, const string_view &line)
{
const params param{line, " ",
{
"id"
}};
const auto idx
{
param.at<uint>("id")
};
const json::object step
{
gpt::model::default_data.at(idx)
};
out
<< "#" << step["id"]
<< " | " << step["length"]
<< " " << (step["ended"]? "ended"_sv: string_view{})
<< std::endl
<< std::endl
<< step["text"]
<< std::endl;
return true;
}