mirror of
https://github.com/matrix-construct/construct
synced 2024-12-26 07:23:53 +01:00
ircd::gpt::pipe: Correctness; compute loss, statistics; pipeline optimize.
This commit is contained in:
parent
0a6be0efed
commit
9c062d9c3f
13 changed files with 867 additions and 444 deletions
|
@ -22,6 +22,8 @@ namespace ircd::gpt::model
|
|||
|
||||
constexpr auto align {64};
|
||||
extern const decoder *default_model;
|
||||
extern string_view default_dataset;
|
||||
extern std::vector<json::object> default_data;
|
||||
}
|
||||
|
||||
/// Attention aperature
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
/// This stucture and its mutable companion in `task.h` determine the outcome
|
||||
/// of the next execution cycle; options are immutable to device software but
|
||||
/// may be changed by the host between executions cycles if desired.
|
||||
///
|
||||
///
|
||||
struct ircd_gpt_opts
|
||||
{
|
||||
/// Specifies the nominal halting condition based on a sequence of tokens.
|
||||
|
@ -89,6 +89,82 @@ struct ircd_gpt_opts
|
|||
#endif
|
||||
;
|
||||
|
||||
/// Embedding vector elements
|
||||
uint embed_elems
|
||||
#ifdef __cplusplus
|
||||
{
|
||||
768
|
||||
}
|
||||
#endif
|
||||
;
|
||||
|
||||
/// Attention unit width multiple
|
||||
uint attn_elems
|
||||
#ifdef __cplusplus
|
||||
{
|
||||
embed_elems * 3
|
||||
}
|
||||
#endif
|
||||
;
|
||||
|
||||
/// FFNN unit width multiple
|
||||
uint ffnn_elems
|
||||
#ifdef __cplusplus
|
||||
{
|
||||
embed_elems * 4
|
||||
}
|
||||
#endif
|
||||
;
|
||||
|
||||
uint embed_width
|
||||
#ifdef __cplusplus
|
||||
{
|
||||
embed_elems / 4
|
||||
}
|
||||
#endif
|
||||
;
|
||||
|
||||
uint attn_width
|
||||
#ifdef __cplusplus
|
||||
{
|
||||
attn_elems / 4
|
||||
}
|
||||
#endif
|
||||
;
|
||||
|
||||
uint attn_height
|
||||
#ifdef __cplusplus
|
||||
{
|
||||
embed_elems / 4
|
||||
}
|
||||
#endif
|
||||
;
|
||||
|
||||
uint ffnn_width
|
||||
#ifdef __cplusplus
|
||||
{
|
||||
ffnn_elems / 4
|
||||
}
|
||||
#endif
|
||||
;
|
||||
|
||||
uint ffnn_height
|
||||
#ifdef __cplusplus
|
||||
{
|
||||
embed_elems / 4
|
||||
}
|
||||
#endif
|
||||
;
|
||||
|
||||
/// Specifies the token context size in tokens.
|
||||
uint logits
|
||||
#ifdef __cplusplus
|
||||
{
|
||||
50257
|
||||
}
|
||||
#endif
|
||||
;
|
||||
|
||||
/// Seed for the task's PRNG.
|
||||
ulong seed
|
||||
#ifdef __cplusplus
|
||||
|
@ -97,6 +173,32 @@ struct ircd_gpt_opts
|
|||
}
|
||||
#endif
|
||||
;
|
||||
|
||||
/// Training steps
|
||||
ulong training_steps
|
||||
#ifdef __cplusplus
|
||||
{
|
||||
250000
|
||||
}
|
||||
#endif
|
||||
;
|
||||
|
||||
/// Validation steps
|
||||
ulong validation_steps
|
||||
#ifdef __cplusplus
|
||||
{
|
||||
5000
|
||||
}
|
||||
#endif
|
||||
;
|
||||
|
||||
ushort label
|
||||
#ifdef __cplusplus
|
||||
{
|
||||
198
|
||||
}
|
||||
#endif
|
||||
;
|
||||
}
|
||||
__attribute__((aligned(4096)));
|
||||
|
||||
|
|
|
@ -22,7 +22,9 @@ struct ircd::gpt::pipe::desc
|
|||
cl::data
|
||||
state, // qry/key/val projection (tokens * embed * 3 * float)
|
||||
accum, // accumulator (tokens * embed * float)
|
||||
logit, // result output vector (50257 * float)
|
||||
logit, // result logit vector (50257 * float)
|
||||
logexp, // outputs distribution (50257 * float)
|
||||
logsm, // outputs distribution (50257 * float)
|
||||
ctrl, // control page
|
||||
opts; // options page
|
||||
|
||||
|
@ -30,6 +32,7 @@ struct ircd::gpt::pipe::desc
|
|||
lm_embed,
|
||||
lm_norm,
|
||||
lm_logit,
|
||||
lm_logsm,
|
||||
lm_select;
|
||||
|
||||
std::unique_ptr<struct desc::layer>
|
||||
|
|
|
@ -40,6 +40,7 @@ struct ircd::gpt::pipe::exec
|
|||
range_positive, // Dimension range of a layer kernel.
|
||||
range_lm_norm, // Dimension range of the final norm kernel.
|
||||
range_lm_logit, // Dimension range of the language logit kernel.
|
||||
range_lm_logsm, // Dimension range of the language statistic kernel.
|
||||
range_lm_select; // Dimension range of the language token kernel.
|
||||
|
||||
cl::exec
|
||||
|
@ -48,7 +49,8 @@ struct ircd::gpt::pipe::exec
|
|||
lm_embed, // Compute token and positional embeddings.
|
||||
coil[12 * 2], // Pass over all layers.
|
||||
lm_norm, // Final normalization.
|
||||
lm_logit, // Compute logit result vector.
|
||||
lm_logit, // Compute language logits.
|
||||
lm_logsm, // Statistics on the logits.
|
||||
lm_select, // Select next token.
|
||||
acquire_ctrl; // Acquire the control page.
|
||||
|
||||
|
|
|
@ -71,6 +71,42 @@ struct ircd_gpt_task
|
|||
/// State counters for the accept/error sequence codes.
|
||||
uint accept_seq[4], error_seq[4];
|
||||
|
||||
/// Loss for last token of last cycle
|
||||
float loss;
|
||||
|
||||
/// Sum loss over all cycles
|
||||
float loss_sum;
|
||||
|
||||
/// Average loss over all cycles
|
||||
float loss_mean;
|
||||
|
||||
/// Perplexity score for last token of last cycle
|
||||
float perp;
|
||||
|
||||
/// Perplexity sum over all cycles
|
||||
float perp_sum;
|
||||
|
||||
/// Perplexity mean over context
|
||||
float perp_mean;
|
||||
|
||||
/// Logit softmax mu
|
||||
float samax_mu;
|
||||
|
||||
/// Logit softmax sum
|
||||
float samax_sum;
|
||||
|
||||
/// Logit softmax lambda
|
||||
float samax_lambda;
|
||||
|
||||
/// Certainty difference score for last token of last cycle
|
||||
float cert;
|
||||
|
||||
/// Certainty sum over all cycles
|
||||
float cert_sum;
|
||||
|
||||
/// Certainty mean over context
|
||||
float cert_mean;
|
||||
|
||||
/// The token buffer starts at offset 2048 and continues to the end of
|
||||
/// the page; options specify the size of the tokens buffer in tokens.
|
||||
/// Additional pages must be attached for larger buffer sizes.
|
||||
|
@ -112,5 +148,6 @@ enum ircd::gpt::task::status
|
|||
};
|
||||
|
||||
static_assert(sizeof(struct ircd_gpt_task) == 4096);
|
||||
static_assert(offsetof(struct ircd_gpt_task, token) == 2048);
|
||||
static_assert(std::is_standard_layout<struct ircd_gpt_task>::value);
|
||||
#endif
|
||||
|
|
|
@ -9,9 +9,14 @@
|
|||
// full license for this software is available in the LICENSE file.
|
||||
|
||||
#pragma once
|
||||
#ifdef __OPENCL_C_VERSION__
|
||||
#define HAVE_IRCD_GPT_TOKEN_H
|
||||
|
||||
struct ircd_gpt_attn_mask
|
||||
{
|
||||
bool
|
||||
token[1024];
|
||||
};
|
||||
|
||||
union ircd_gpt_token
|
||||
{
|
||||
float
|
||||
|
@ -19,36 +24,37 @@ union ircd_gpt_token
|
|||
attn[12][64];
|
||||
};
|
||||
|
||||
#ifdef __OPENCL_C_VERSION__
|
||||
union ircd_gpt_tokenv
|
||||
{
|
||||
float4
|
||||
word[768/4],
|
||||
attn[12][64/4];
|
||||
|
||||
union ircd_gpt_token
|
||||
token;
|
||||
};
|
||||
#endif
|
||||
|
||||
struct ircd_gpt_attn_qkv
|
||||
{
|
||||
union ircd_gpt_token
|
||||
qry,
|
||||
key,
|
||||
val;
|
||||
};
|
||||
|
||||
struct ircd_gpt_qkv
|
||||
#ifdef __OPENCL_C_VERSION__
|
||||
struct ircd_gpt_attn_qkvv
|
||||
{
|
||||
union ircd_gpt_tokenv
|
||||
qry,
|
||||
key,
|
||||
val;
|
||||
};
|
||||
#endif
|
||||
|
||||
struct ircd_gpt_qkvv
|
||||
{
|
||||
union ircd_gpt_tokenv
|
||||
qry,
|
||||
key,
|
||||
val;
|
||||
};
|
||||
|
||||
struct ircd_gpt_attn_mask
|
||||
{
|
||||
bool
|
||||
token[1024];
|
||||
};
|
||||
|
||||
union ircd_gpt_aperature
|
||||
union ircd_gpt_attn_aperature
|
||||
{
|
||||
float
|
||||
word[768],
|
||||
|
@ -56,9 +62,13 @@ union ircd_gpt_aperature
|
|||
proj[3][768],
|
||||
qkv[3][12][64],
|
||||
attn[12][64];
|
||||
|
||||
union ircd_gpt_token
|
||||
token[3];
|
||||
};
|
||||
|
||||
union ircd_gpt_aperaturev
|
||||
#ifdef __OPENCL_C_VERSION__
|
||||
union ircd_gpt_attn_aperaturev
|
||||
{
|
||||
float4
|
||||
word[768/4],
|
||||
|
@ -66,6 +76,32 @@ union ircd_gpt_aperaturev
|
|||
proj[3][768/4],
|
||||
qkv[3][12][64/4],
|
||||
attn[12][64/4];
|
||||
|
||||
union ircd_gpt_tokenv
|
||||
token[3];
|
||||
};
|
||||
#endif
|
||||
|
||||
union ircd_gpt_ffnn_aperature
|
||||
{
|
||||
float
|
||||
word[768],
|
||||
fcon[3072],
|
||||
proj[4][768];
|
||||
|
||||
union ircd_gpt_token
|
||||
token[4];
|
||||
};
|
||||
|
||||
#ifdef __OPENCL_C_VERSION__
|
||||
union ircd_gpt_ffnn_aperaturev
|
||||
{
|
||||
float4
|
||||
word[768/4],
|
||||
fcon[3072/4],
|
||||
proj[4][768/4];
|
||||
|
||||
union ircd_gpt_tokenv
|
||||
token[4];
|
||||
};
|
||||
#endif
|
||||
|
|
|
@ -19,14 +19,17 @@ ircd_simt_math_mean_f4lldr(__local float4 *const restrict out,
|
|||
out[i] = in[i];
|
||||
ircd_simt_reduce_add_f4lldr(out, num, i);
|
||||
|
||||
float numerator = 0.0f;
|
||||
float4 numeratorv = out[i];
|
||||
for(uint k = 0; k < 4; ++k)
|
||||
numerator += numeratorv[k];
|
||||
if(i == 0)
|
||||
{
|
||||
float numerator = 0.0f;
|
||||
float4 numeratorv = out[i];
|
||||
for(uint k = 0; k < 4; ++k)
|
||||
numerator += numeratorv[k];
|
||||
|
||||
out[i] = numerator;
|
||||
}
|
||||
|
||||
out[i] = numerator;
|
||||
ircd_simt_broadcast_f4lldr(out, num, i);
|
||||
|
||||
numeratorv = out[i];
|
||||
const float4 numeratorv = out[i];
|
||||
out[i] = numeratorv / (num * 4);
|
||||
}
|
||||
|
|
|
@ -28,15 +28,16 @@ ircd_simt_reduce_add_f4lldr(__local float4 *const buf,
|
|||
/// the greatest value is placed in index [0], the rest of the buffer is
|
||||
/// trashed.
|
||||
inline void
|
||||
ircd_simt_reduce_max_f4lldr(__local float *const buf,
|
||||
const uint ln,
|
||||
const uint li)
|
||||
ircd_simt_reduce_max_flldr(__local float *const buf,
|
||||
const uint ln,
|
||||
const uint li)
|
||||
{
|
||||
for(uint stride = ln >> 1; stride > 0; stride >>= 1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if(li < stride)
|
||||
buf[li] = max(buf[li], buf[li + stride]);
|
||||
if(buf[li] < buf[li + stride])
|
||||
buf[li] = buf[li + stride];
|
||||
}
|
||||
}
|
||||
|
|
270
ircd/gpt.cc
270
ircd/gpt.cc
|
@ -8,17 +8,14 @@
|
|||
// copyright notice and this permission notice is present in all copies. The
|
||||
// full license for this software is available in the LICENSE file.
|
||||
|
||||
#include <ircd/gpt/pipe/pipe.h>
|
||||
|
||||
namespace ircd::gpt
|
||||
{
|
||||
template<class T>
|
||||
static void fmma(T *out, const T *in, const T *bias, const T *weight, const math::fmma_opts &);
|
||||
|
||||
static void gelu(f32x4 &, const f32x4 &);
|
||||
static void gelu(f32x4 *, const f32x4 *);
|
||||
static void norm(f32x4 *, const f32x4 *, const f32x4 *, const f32x4 *, const f32);
|
||||
static void fmma4(f32x4 *, const f32x4 *, const f32x4 *, const f32x4 *);
|
||||
static void fmma3(f32x4 *, const f32x4 *, const f32x4 *, const f32x4 *);
|
||||
static void fmma2(f32x4 *, const f32x4 *, const f32x4 *, const f32x4 *, const size_t);
|
||||
static void fmma1(f32x4 *, const f32x4 *, const f32x4 *, const f32x4 *);
|
||||
static void vals(float (&)[12][1024][64], const float (&)[12][1024][1024], const float (&)[3][1024][12][64], const size_t);
|
||||
static void pare(float (&)[12][1024][1024], const float (&)[3][1024][12][64], const size_t);
|
||||
static void mask(float (&)[12][1024][1024], const float (&)[12][1024][1024], const bool (&)[1024][1024], const size_t);
|
||||
|
@ -84,6 +81,7 @@ ircd::gpt::generate(const vector_view<u16> &out,
|
|||
auto &errc(ctrl.error_seq);
|
||||
auto &accc(ctrl.accept_seq);
|
||||
ctrl.tokens = in.size();
|
||||
ctrl.head = 0;
|
||||
|
||||
const size_t tmax
|
||||
{
|
||||
|
@ -112,25 +110,34 @@ ircd::gpt::generate(const vector_view<u16> &out,
|
|||
else
|
||||
embed(data(dst), in[j], j, opts);
|
||||
|
||||
#if RB_DEBUG
|
||||
static char dbuf[512] {0};
|
||||
char report[1536] {0};
|
||||
char tmbuf[1][64] {{0}};
|
||||
const size_t report_size = snprintf
|
||||
(
|
||||
report, sizeof(report),
|
||||
"%-2u -- %-3u [%5u] --- --- %s 0 0 | %8s",
|
||||
"%-4u %4u %4u:%-4u %1u%1u [ %6.2fL %6.2f%% ] %6.2fL %5.1f%% %s",
|
||||
ctrl.epoch,
|
||||
ctrl.cycle,
|
||||
j,
|
||||
ctrl.tokens,
|
||||
ctrl.token[j],
|
||||
vocab::debug(dbuf, ctrl.token[j]).c_str(),
|
||||
pretty(tmbuf[0], milliseconds(ctrl.elapsed), 1).c_str()
|
||||
0,
|
||||
0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
vocab::debug(dbuf, in[j]).c_str()
|
||||
);
|
||||
|
||||
log::info
|
||||
log::logf
|
||||
{
|
||||
log, "%s",
|
||||
log, log::level::DEBUG,
|
||||
"%s",
|
||||
string_view{report, report_size}
|
||||
};
|
||||
#endif
|
||||
}
|
||||
|
||||
uint64_t cycles(0);
|
||||
|
@ -192,24 +199,30 @@ ircd::gpt::generate(const vector_view<u16> &out,
|
|||
const size_t report_size = snprintf
|
||||
(
|
||||
report, sizeof(report),
|
||||
"%-2u %-2u %-3u %-3u %-3u [%5u] a:%u e:%u %s %8s %8s | %8s",
|
||||
j,
|
||||
"%4u:%-4u %4u:%-4u %1u%1u [ %4.1f%% %6.2f%% %5.2fL ] %5.1f%% %5.1f%% %4.1fL %s %04x %8s %8s | %8s",
|
||||
j + in.size(),
|
||||
ctrl.tokens,
|
||||
ctrl.cycle,
|
||||
ctrl.epoch,
|
||||
out[j],
|
||||
ctrl.cycle,
|
||||
accc[0] + accc[1] + accc[2],
|
||||
errc[0] + errc[1] + errc[2],
|
||||
ctrl.cert_mean < 100.0? ctrl.cert_mean: NAN,
|
||||
ctrl.perp_mean < 100.0? ctrl.perp_mean: NAN,
|
||||
ctrl.loss_mean < 100.0? ctrl.loss_mean: NAN,
|
||||
ctrl.cert < 100.0? ctrl.cert: NAN,
|
||||
ctrl.perp < 100.0? ctrl.perp: NAN,
|
||||
ctrl.loss < 100.0? ctrl.loss: NAN,
|
||||
vocab::debug(dbuf, out[j]).c_str(),
|
||||
out[j],
|
||||
pretty(tmbuf[0], milliseconds(last_time / bsz), 1).c_str(),
|
||||
pretty(tmbuf[1], si(cycles / bsz), 1).c_str(),
|
||||
pretty(tmbuf[2], milliseconds(ctrl.elapsed), 1).c_str()
|
||||
);
|
||||
|
||||
log::info
|
||||
log::logf
|
||||
{
|
||||
log, "%s",
|
||||
log, log::level::DEBUG,
|
||||
"%s",
|
||||
string_view{report, report_size}
|
||||
};
|
||||
}
|
||||
|
@ -375,41 +388,19 @@ ircd::gpt::coil(float *__restrict__ accum,
|
|||
a[j][k * 64 + l] = attns[k][j][l];
|
||||
}
|
||||
|
||||
static const math::fmma_opts fmma_opts
|
||||
{
|
||||
768, 768, 2U
|
||||
};
|
||||
|
||||
for(uint j(0); j < tokens; ++j)
|
||||
fmma2((f32x4 *)(accum + j * 768), (const f32x4 *)(a[j]), (const f32x4 *)layer.attn.proj_bias, (const f32x4 *)layer.attn.proj_weight, tokens);
|
||||
fmma((f32x4 *)(accum + j * 768), (const f32x4 *)(a[j]), (const f32x4 *)layer.attn.proj_bias, (const f32x4 *)layer.attn.proj_weight, fmma_opts);
|
||||
|
||||
for(uint j(0); j < tokens; ++j)
|
||||
ffnn(accum + j * 768, accum + j * 768, decoder, i);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::ffnn(float *const out,
|
||||
const float *const in,
|
||||
const model::decoder &decoder,
|
||||
const uint laynum)
|
||||
{
|
||||
constexpr float ln2_epsilon
|
||||
{
|
||||
0.00001
|
||||
};
|
||||
|
||||
const auto &layer
|
||||
{
|
||||
decoder.layer[laynum]
|
||||
};
|
||||
|
||||
static float
|
||||
buf alignas(64) [768],
|
||||
buf2 alignas(64) [3072];
|
||||
|
||||
memset(buf2, 0x0, sizeof(buf2));
|
||||
norm((f32x4 *)buf, (const f32x4 *)in, (const f32x4 *)layer.ln2.bias, (const f32x4 *)layer.ln2.weight, ln2_epsilon);
|
||||
fmma3((f32x4 *)buf2, (const f32x4 *)buf, (const f32x4 *)layer.ffnn.fc_bias, (const f32x4 *)layer.ffnn.fc_weight);
|
||||
gelu((f32x4 *)buf2, (const f32x4 *)buf2);
|
||||
fmma4((f32x4 *)out, (const f32x4 *)buf2, (const f32x4 *)layer.ffnn.proj_bias, (const f32x4 *)layer.ffnn.proj_weight);
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::ctrl(float (&__restrict__ out)[3][1024][12][64],
|
||||
const float *const __restrict__ in,
|
||||
|
@ -440,8 +431,13 @@ ircd::gpt::ctrl(float (&__restrict__ out)[3][1024][12][64],
|
|||
|
||||
norm((f32x4 *)buf, (const f32x4 *)(in + i * 768), (const f32x4 *)layer.ln1.bias, (const f32x4 *)layer.ln1.weight, ln1_epsilon);
|
||||
|
||||
static const math::fmma_opts fmma_opts
|
||||
{
|
||||
768, 2304, 2U,
|
||||
};
|
||||
|
||||
memset(proj, 0x0, sizeof(proj));
|
||||
fmma1((f32x4 *)proj, (const f32x4 *)buf, (const f32x4 *)layer.attn.attn_bias, (const f32x4 *)layer.attn.attn_weight);
|
||||
fmma((f32x4 *)proj, (const f32x4 *)buf, (const f32x4 *)layer.attn.attn_bias, (const f32x4 *)layer.attn.attn_weight, fmma_opts);
|
||||
|
||||
#pragma clang loop unroll (disable)
|
||||
for(uint j(0); j < 12; ++j)
|
||||
|
@ -548,6 +544,43 @@ ircd::gpt::vals(float (&__restrict__ out)[12][1024][64],
|
|||
out[j][k][m] += in[j][k][l] * val[l][j][m];
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::ffnn(float *const out,
|
||||
const float *const in,
|
||||
const model::decoder &decoder,
|
||||
const uint laynum)
|
||||
{
|
||||
static const math::fmma_opts fmma3_opts
|
||||
{
|
||||
768, 3072, 2U,
|
||||
};
|
||||
|
||||
static const math::fmma_opts fmma4_opts
|
||||
{
|
||||
3072, 768, 2U,
|
||||
};
|
||||
|
||||
constexpr float ln2_epsilon
|
||||
{
|
||||
0.00001
|
||||
};
|
||||
|
||||
const auto &layer
|
||||
{
|
||||
decoder.layer[laynum]
|
||||
};
|
||||
|
||||
static float
|
||||
buf alignas(64) [768],
|
||||
buf2 alignas(64) [3072];
|
||||
|
||||
memset(buf2, 0x0, sizeof(buf2));
|
||||
norm((f32x4 *)buf, (const f32x4 *)in, (const f32x4 *)layer.ln2.bias, (const f32x4 *)layer.ln2.weight, ln2_epsilon);
|
||||
fmma((f32x4 *)buf2, (const f32x4 *)buf, (const f32x4 *)layer.ffnn.fc_bias, (const f32x4 *)layer.ffnn.fc_weight, fmma3_opts);
|
||||
gelu((f32x4 *)buf2, (const f32x4 *)buf2);
|
||||
fmma((f32x4 *)out, (const f32x4 *)buf2, (const f32x4 *)layer.ffnn.proj_bias, (const f32x4 *)layer.ffnn.proj_weight, fmma4_opts);
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::norm(f32x4 *const __restrict__ out,
|
||||
const f32x4 *const __restrict__ in,
|
||||
|
@ -567,143 +600,18 @@ ircd::gpt::norm(f32x4 *const __restrict__ out,
|
|||
out[j] = out[j] * weight[j] + bias[j];
|
||||
}
|
||||
|
||||
template<class T>
|
||||
void
|
||||
ircd::gpt::fmma1(f32x4 *const __restrict__ out,
|
||||
const f32x4 *const __restrict__ in,
|
||||
const f32x4 *const __restrict__ bias,
|
||||
const f32x4 *const __restrict__ weight)
|
||||
ircd::gpt::fmma(T *const __restrict__ out,
|
||||
const T *const __restrict__ in,
|
||||
const T *const __restrict__ bias,
|
||||
const T *const __restrict__ weight,
|
||||
const math::fmma_opts &opts)
|
||||
{
|
||||
constexpr uint width
|
||||
{
|
||||
2304
|
||||
};
|
||||
for(uint i(0); i < opts.rows / simd::lanes<T>(); ++i)
|
||||
out[i] += bias[i];
|
||||
|
||||
constexpr uint height
|
||||
{
|
||||
768
|
||||
};
|
||||
|
||||
constexpr uint lanes
|
||||
{
|
||||
simd::lanes<f32x4>()
|
||||
};
|
||||
|
||||
for(uint j(0); j < width / lanes; ++j)
|
||||
out[j] += bias[j];
|
||||
|
||||
static const math::fmma_opts opts
|
||||
{
|
||||
width,
|
||||
height,
|
||||
2U,
|
||||
'y',
|
||||
};
|
||||
|
||||
math::fmma<opts>(out, in, weight);
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::fmma2(f32x4 *const __restrict__ out,
|
||||
const f32x4 *const __restrict__ in,
|
||||
const f32x4 *const __restrict__ bias,
|
||||
const f32x4 *const __restrict__ weight,
|
||||
const size_t num)
|
||||
{
|
||||
constexpr uint width
|
||||
{
|
||||
768
|
||||
};
|
||||
|
||||
constexpr uint height
|
||||
{
|
||||
768
|
||||
};
|
||||
|
||||
constexpr uint lanes
|
||||
{
|
||||
simd::lanes<f32x4>()
|
||||
};
|
||||
|
||||
for(uint j(0); j < width / lanes; ++j)
|
||||
out[j] += bias[j];
|
||||
|
||||
static const math::fmma_opts opts
|
||||
{
|
||||
width,
|
||||
height,
|
||||
2U,
|
||||
};
|
||||
|
||||
math::fmma<opts>(out, in, weight);
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::fmma3(f32x4 *const __restrict__ out,
|
||||
const f32x4 *const __restrict__ in,
|
||||
const f32x4 *const __restrict__ bias,
|
||||
const f32x4 *const __restrict__ weight)
|
||||
{
|
||||
constexpr uint width
|
||||
{
|
||||
3072
|
||||
};
|
||||
|
||||
constexpr uint height
|
||||
{
|
||||
768
|
||||
};
|
||||
|
||||
constexpr uint lanes
|
||||
{
|
||||
simd::lanes<f32x4>()
|
||||
};
|
||||
|
||||
for(uint j(0); j < width / lanes; ++j)
|
||||
out[j] += bias[j];
|
||||
|
||||
static const math::fmma_opts opts
|
||||
{
|
||||
width,
|
||||
height,
|
||||
2U,
|
||||
'y',
|
||||
};
|
||||
|
||||
math::fmma<opts>(out, in, weight);
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::fmma4(f32x4 *const __restrict__ out,
|
||||
const f32x4 *const __restrict__ in,
|
||||
const f32x4 *const __restrict__ bias,
|
||||
const f32x4 *const __restrict__ weight)
|
||||
{
|
||||
constexpr uint width
|
||||
{
|
||||
3072
|
||||
};
|
||||
|
||||
constexpr uint height
|
||||
{
|
||||
768
|
||||
};
|
||||
|
||||
constexpr uint lanes
|
||||
{
|
||||
simd::lanes<f32x4>()
|
||||
};
|
||||
|
||||
for(uint j(0); j < height / lanes; ++j)
|
||||
out[j] += bias[j];
|
||||
|
||||
static const math::fmma_opts opts
|
||||
{
|
||||
width,
|
||||
height,
|
||||
2U,
|
||||
};
|
||||
|
||||
math::fmma<opts>(out, in, weight);
|
||||
math::fmma(out, in, weight, opts);
|
||||
}
|
||||
|
||||
void
|
||||
|
|
544
ircd/gpt_cl.cl
544
ircd/gpt_cl.cl
|
@ -10,8 +10,8 @@
|
|||
|
||||
|
||||
inline void
|
||||
ircd_gpt_norm_fmad(__local float4 *const out,
|
||||
__local const float4 *const in,
|
||||
ircd_gpt_norm_fmad(__local float4 *const restrict out,
|
||||
__local const float4 *const restrict in,
|
||||
__global const float4 *const restrict bias,
|
||||
__global const float4 *const restrict weight,
|
||||
const uint i)
|
||||
|
@ -27,35 +27,26 @@ ircd_gpt_sgemv(__local float4 *const restrict out,
|
|||
__global const float4 *const restrict weight,
|
||||
const uint width,
|
||||
const uint height,
|
||||
const uint tiles,
|
||||
const uint i)
|
||||
{
|
||||
const uint seg = height / tiles;
|
||||
|
||||
float4 acc = bias[i];
|
||||
for(uint j = 0; j < seg; ++j)
|
||||
for(uint t = 0; t < tiles; ++t)
|
||||
for(uint k = 0; k < 4; ++k)
|
||||
{
|
||||
const uint
|
||||
jidx = t * seg + j,
|
||||
kidx = jidx * 4 + k,
|
||||
widx = kidx * width + i;
|
||||
|
||||
acc += weight[widx] * in[jidx][k];
|
||||
}
|
||||
for(uint j = 0; j < height; ++j)
|
||||
for(uint k = 0; k < 4; ++k)
|
||||
acc += in[j][k] * weight[width * (j * 4 + k) + i];
|
||||
|
||||
out[i] = acc;
|
||||
}
|
||||
|
||||
/// Gaussian Error Linear Unit
|
||||
inline void
|
||||
ircd_gpt_gelu(__local float4 *const out,
|
||||
__local const float4 *const in_,
|
||||
const uint i)
|
||||
ircd_gpt_ffnn_gelu(__local float4 *const out,
|
||||
__local const float4 *const in_,
|
||||
const uint i)
|
||||
{
|
||||
float4 a,
|
||||
const float4
|
||||
in = in_[i];
|
||||
|
||||
float4 a;
|
||||
a = 0.044715f;
|
||||
a *= in;
|
||||
a *= in;
|
||||
|
@ -71,14 +62,35 @@ ircd_gpt_gelu(__local float4 *const out,
|
|||
out[i] = a;
|
||||
}
|
||||
|
||||
//
|
||||
// core
|
||||
//
|
||||
inline void
|
||||
__attribute__((always_inline))
|
||||
ircd_gpt_ffnn_fcon(__global const struct ircd_gpt_task *const ctrl,
|
||||
__constant const struct ircd_gpt_opts *const opts,
|
||||
__local union ircd_gpt_ffnn_aperaturev *const restrict out,
|
||||
__local const union ircd_gpt_tokenv *const in,
|
||||
__global const float4 *const restrict bias,
|
||||
__global const float4 *const restrict weight)
|
||||
{
|
||||
const uint
|
||||
li = get_local_id(0),
|
||||
ln = get_local_size(0),
|
||||
width = opts->ffnn_width,
|
||||
height = opts->ffnn_height;
|
||||
|
||||
__kernel void
|
||||
for(uint i = 0; i < 4; ++i)
|
||||
ircd_gpt_sgemv(out->fcon, in->word, bias, weight, width, height, i * ln + li);
|
||||
|
||||
for(uint i = 0; i < 4; ++i)
|
||||
ircd_gpt_ffnn_gelu(out->fcon, out->fcon, i * ln + li);
|
||||
}
|
||||
|
||||
inline void
|
||||
__attribute__((always_inline))
|
||||
ircd_gpt_ffnn(__global const struct ircd_gpt_task *const ctrl,
|
||||
__constant const struct ircd_gpt_opts *const opts,
|
||||
__global union ircd_gpt_tokenv *const restrict accum,
|
||||
__local union ircd_gpt_tokenv *const restrict token,
|
||||
__local union ircd_gpt_tokenv *const restrict tmp,
|
||||
__local union ircd_gpt_ffnn_aperaturev *const restrict buf,
|
||||
__global const float4 *const restrict norm_bias,
|
||||
__global const float4 *const restrict norm_weight,
|
||||
__global const float4 *const restrict fcon_bias,
|
||||
|
@ -92,72 +104,34 @@ ircd_gpt_ffnn(__global const struct ircd_gpt_task *const ctrl,
|
|||
li = get_local_id(0),
|
||||
ln = get_local_size(0),
|
||||
wi = get_group_id(0),
|
||||
wn = get_num_groups(0);
|
||||
|
||||
__local union ircd_gpt_aperaturev token;
|
||||
__local float4 tmp[768/4];
|
||||
|
||||
// Fetch local copy of the global accumulator. We operate on a cached
|
||||
// copy as input, and add our output to the global upon completion.
|
||||
token.word[li] = accum[wi].word[li];
|
||||
wn = get_num_groups(0),
|
||||
width = opts->ffnn_width,
|
||||
height = opts->ffnn_height;
|
||||
|
||||
// Layer re-normalization
|
||||
ircd_simt_math_norm_f4lldr(token.word, token.word, tmp, ln, li);
|
||||
ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
|
||||
ircd_simt_math_norm_f4lldr(token->word, token->word, buf->word, ln, li);
|
||||
ircd_gpt_norm_fmad(tmp->word, token->word, norm_bias, norm_weight, li);
|
||||
|
||||
// ln's writes are still pending but fcon reads results across threads.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// Fully connected
|
||||
for(uint i = 0; i < 4; ++i)
|
||||
ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 3072/4, 768/4, 4, i * ln + li);
|
||||
ircd_gpt_ffnn_fcon(ctrl, opts, buf, tmp, fcon_bias, fcon_weight);
|
||||
|
||||
// Gaussian Error Linear Unit
|
||||
for(uint i = 0; i < 4; ++i)
|
||||
ircd_gpt_gelu(token.fcon, token.fcon, i * ln + li);
|
||||
|
||||
// Projection
|
||||
ircd_gpt_sgemv(tmp, token.fcon, proj_bias, proj_weight, 768/4, 3072/4, 4, li);
|
||||
|
||||
// Accumulation; end of layer
|
||||
accum[wi].word[li] += tmp[li];
|
||||
}
|
||||
|
||||
__kernel void
|
||||
ircd_gpt_attn_proj(__global const struct ircd_gpt_task *const ctrl,
|
||||
__constant const struct ircd_gpt_opts *const opts,
|
||||
__global union ircd_gpt_tokenv *const restrict accum,
|
||||
__local const union ircd_gpt_tokenv *const restrict xattn,
|
||||
__global const float4 *const restrict proj_bias,
|
||||
__global const float4 *const restrict proj_weight)
|
||||
{
|
||||
const uint
|
||||
gi = get_global_id(0),
|
||||
gn = get_global_size(0),
|
||||
li = get_local_id(0),
|
||||
ln = get_local_size(0),
|
||||
wi = get_group_id(0),
|
||||
wn = get_num_groups(0);
|
||||
|
||||
__local float4
|
||||
in[768/4],
|
||||
out[768/4];
|
||||
|
||||
// Fetch
|
||||
in[li] = xattn->word[li];
|
||||
|
||||
// Need this here if xattn is __local
|
||||
// fcon's writes are still pending but proj reads results across threads.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// Projection
|
||||
ircd_gpt_sgemv(out, in, proj_bias, proj_weight, 768/4, 768/4, 1, li);
|
||||
|
||||
// Accumulation; end of layer
|
||||
accum[wi].word[li] += out[li];
|
||||
ircd_gpt_sgemv(token->word, buf->fcon, proj_bias, proj_weight, height, width, li);
|
||||
}
|
||||
|
||||
__kernel void
|
||||
inline void
|
||||
__attribute__((always_inline))
|
||||
ircd_gpt_attn_self(__global const struct ircd_gpt_task *const ctrl,
|
||||
__constant const struct ircd_gpt_opts *const opts,
|
||||
__local union ircd_gpt_tokenv *const restrict out,
|
||||
__global const struct ircd_gpt_qkvv *const restrict token,
|
||||
__local union ircd_gpt_tokenv *const restrict tmp,
|
||||
__global const struct ircd_gpt_attn_qkvv *const restrict token,
|
||||
__global const struct ircd_gpt_attn_mask *const restrict mask) // [1024][1024],
|
||||
{
|
||||
const uint
|
||||
|
@ -168,18 +142,13 @@ ircd_gpt_attn_self(__global const struct ircd_gpt_task *const ctrl,
|
|||
wi = get_group_id(0),
|
||||
wn = get_num_groups(0);
|
||||
|
||||
__local union
|
||||
{
|
||||
float
|
||||
attn[12][96];
|
||||
}
|
||||
self;
|
||||
__local union ircd_gpt_token *const restrict self = &tmp->token;
|
||||
|
||||
for(uint i = 0; i < wn; ++i)
|
||||
if(mask[wi].token[i])
|
||||
self.attn[li][i] = 0.0f;
|
||||
self->attn[li][i] = 0.0f;
|
||||
else
|
||||
self.attn[li][i] = -10000.0f;
|
||||
self->attn[li][i] = -10000.0f;
|
||||
|
||||
for(uint i = 0; i < wn; ++i)
|
||||
if(mask[wi].token[i])
|
||||
|
@ -190,40 +159,153 @@ ircd_gpt_attn_self(__global const struct ircd_gpt_task *const ctrl,
|
|||
key = token[i].key.attn[li][j],
|
||||
res = qry * key;
|
||||
for(uint k = 0; k < 4; ++k)
|
||||
self.attn[li][i] += res[k];
|
||||
self->attn[li][i] += res[k];
|
||||
}
|
||||
|
||||
for(uint i = 0; i < wn; ++i)
|
||||
if(mask[wi].token[i])
|
||||
self.attn[li][i] /= 8.0f;
|
||||
self->attn[li][i] /= 8.0f;
|
||||
|
||||
float mu = -10000.0f;
|
||||
for(uint i = 0; i < wn; ++i)
|
||||
mu = max(mu, self->attn[li][i]);
|
||||
|
||||
for(uint i = 0; i < wn; ++i)
|
||||
self.attn[li][i] = exp(self.attn[li][i]);
|
||||
self->attn[li][i] = exp(self->attn[li][i] - mu);
|
||||
|
||||
float4 vacc = 0.0f;
|
||||
float sum = 0.0f;
|
||||
for(uint i = 0; i < wn; ++i)
|
||||
vacc[i % 4] += self.attn[li][i];
|
||||
|
||||
float acc = 0.0f;
|
||||
for(uint i = 0; i < 4; ++i)
|
||||
acc += vacc[i];
|
||||
sum += self->attn[li][i];
|
||||
|
||||
const float lambda = 1.0f / sum;
|
||||
for(uint i = 0; i < wn; ++i)
|
||||
self.attn[li][i] /= acc;
|
||||
self->attn[li][i] *= lambda;
|
||||
|
||||
for(uint j = 0; j < 64/4; ++j)
|
||||
out->attn[li][j] = 0.0f;
|
||||
|
||||
for(uint i = 0; i < wn; ++i)
|
||||
for(uint j = 0; j < 64/4; ++j)
|
||||
out->attn[li][j] += token[i].val.attn[li][j] * self.attn[li][i];
|
||||
out->attn[li][j] += token[i].val.attn[li][j] * self->attn[li][i];
|
||||
}
|
||||
|
||||
inline void
|
||||
__attribute__((always_inline))
|
||||
ircd_gpt_attn_proj(__global const struct ircd_gpt_task *const ctrl,
|
||||
__constant const struct ircd_gpt_opts *const opts,
|
||||
__local union ircd_gpt_tokenv *const out,
|
||||
__local const union ircd_gpt_tokenv *const xattn,
|
||||
__global const float4 *const restrict bias,
|
||||
__global const float4 *const restrict weight)
|
||||
{
|
||||
const uint
|
||||
gi = get_global_id(0),
|
||||
gn = get_global_size(0),
|
||||
li = get_local_id(0),
|
||||
ln = get_local_size(0),
|
||||
wi = get_group_id(0),
|
||||
wn = get_num_groups(0),
|
||||
height = opts->attn_height,
|
||||
width = opts->attn_height; // same
|
||||
|
||||
// Projection
|
||||
ircd_gpt_sgemv(out->word, xattn->word, bias, weight, height, width, li);
|
||||
}
|
||||
|
||||
__kernel void
|
||||
ircd_gpt_coil(__global const struct ircd_gpt_task *const ctrl,
|
||||
__constant const struct ircd_gpt_opts *const opts,
|
||||
__global union ircd_gpt_tokenv *const restrict accum,
|
||||
__global const struct ircd_gpt_attn_qkvv *const restrict state,
|
||||
__global const struct ircd_gpt_attn_mask *const restrict mask, // [1024][1024],
|
||||
__global const float4 *const restrict attn_proj_bias,
|
||||
__global const float4 *const restrict attn_proj_weight,
|
||||
__global const float4 *const restrict ffnn_norm_bias,
|
||||
__global const float4 *const restrict ffnn_norm_weight,
|
||||
__global const float4 *const restrict ffnn_fcon_bias,
|
||||
__global const float4 *const restrict ffnn_fcon_weight,
|
||||
__global const float4 *const restrict ffnn_proj_bias,
|
||||
__global const float4 *const restrict ffnn_proj_weight)
|
||||
{
|
||||
const uint
|
||||
li = get_local_id(0),
|
||||
wi = get_group_id(0);
|
||||
|
||||
__local union ircd_gpt_ffnn_aperaturev
|
||||
ffnn_fcon;
|
||||
|
||||
__local union ircd_gpt_tokenv
|
||||
buf0, buf1;
|
||||
|
||||
// Self-attention backend; this computes the self-attention result now
|
||||
// that keys and values are globally visible across tokens.
|
||||
ircd_gpt_attn_self
|
||||
(
|
||||
ctrl,
|
||||
opts,
|
||||
&buf1,
|
||||
&buf0,
|
||||
state,
|
||||
mask
|
||||
);
|
||||
|
||||
// Self-attention's writes are pending on each thread but each proj
|
||||
// call requires results from all threads for input to the matmul.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// Project result of self-attention.
|
||||
ircd_gpt_attn_proj
|
||||
(
|
||||
ctrl,
|
||||
opts,
|
||||
&buf0,
|
||||
&buf1,
|
||||
attn_proj_bias,
|
||||
attn_proj_weight
|
||||
);
|
||||
|
||||
// Frontend accumulation
|
||||
{
|
||||
const float4
|
||||
attn = buf0.word[li],
|
||||
resid = accum[wi].word[li];
|
||||
|
||||
buf0.word[li] += resid;
|
||||
accum[wi].word[li] += attn;
|
||||
}
|
||||
|
||||
// Backend mlp; layer-norm acquires any pending writes, no fence required.
|
||||
ircd_gpt_ffnn
|
||||
(
|
||||
ctrl,
|
||||
opts,
|
||||
&buf0,
|
||||
&buf1,
|
||||
&ffnn_fcon,
|
||||
ffnn_norm_bias,
|
||||
ffnn_norm_weight,
|
||||
ffnn_fcon_bias,
|
||||
ffnn_fcon_weight,
|
||||
ffnn_proj_bias,
|
||||
ffnn_proj_weight
|
||||
);
|
||||
|
||||
// Backend accumulation
|
||||
{
|
||||
const float4
|
||||
ffnn = buf0.word[li],
|
||||
resid = accum[wi].word[li],
|
||||
result = ffnn + resid;
|
||||
|
||||
accum[wi].word[li] = result;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void
|
||||
ircd_gpt_attn_fcon(__global const struct ircd_gpt_task *const ctrl,
|
||||
__constant const struct ircd_gpt_opts *const opts,
|
||||
__global union ircd_gpt_aperaturev *const restrict out,
|
||||
__global const union ircd_gpt_tokenv *const restrict in,
|
||||
__global union ircd_gpt_attn_aperaturev *const restrict state,
|
||||
__global const union ircd_gpt_tokenv *const restrict accum,
|
||||
__global const float4 *const restrict norm_bias,
|
||||
__global const float4 *const restrict norm_weight,
|
||||
__global const float4 *const restrict fcon_bias,
|
||||
|
@ -235,74 +317,32 @@ ircd_gpt_attn_fcon(__global const struct ircd_gpt_task *const ctrl,
|
|||
li = get_local_id(0),
|
||||
ln = get_local_size(0),
|
||||
wi = get_group_id(0),
|
||||
wn = get_num_groups(0);
|
||||
wn = get_num_groups(0),
|
||||
width = opts->attn_width,
|
||||
height = opts->attn_height;
|
||||
|
||||
__local union ircd_gpt_aperaturev token;
|
||||
__local float4 tmp[768/4];
|
||||
__local union ircd_gpt_attn_aperaturev
|
||||
token;
|
||||
|
||||
token.word[li] = in[wi].word[li];
|
||||
__local float4
|
||||
tmp[768/4];
|
||||
|
||||
token.word[li] = accum[wi].word[li];
|
||||
|
||||
// Layer re-normalization
|
||||
ircd_simt_math_norm_f4lldr(token.word, token.word, tmp, ln, li);
|
||||
ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
|
||||
|
||||
// Ln's writes are still pending; fcon requires results across threads.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// Fully connected
|
||||
for(uint i = 0; i < 3; ++i)
|
||||
ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 2304/4, 768/4, 4, i * ln + li);
|
||||
ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, width, height, i * ln + li);
|
||||
|
||||
// Export queries, keys, and values.
|
||||
for(uint i = 0; i < 3; ++i)
|
||||
out[wi].proj[i][li] = token.proj[i][li];
|
||||
}
|
||||
|
||||
__kernel void
|
||||
ircd_gpt_coil(__global const struct ircd_gpt_task *const ctrl,
|
||||
__constant const struct ircd_gpt_opts *const opts,
|
||||
__global union ircd_gpt_tokenv *const restrict accum,
|
||||
__global const struct ircd_gpt_qkvv *const restrict state,
|
||||
__global const struct ircd_gpt_attn_mask *const restrict mask, // [1024][1024],
|
||||
__global const float4 *const restrict attn_proj_bias,
|
||||
__global const float4 *const restrict attn_proj_weight,
|
||||
__global const float4 *const restrict ffnn_norm_bias,
|
||||
__global const float4 *const restrict ffnn_norm_weight,
|
||||
__global const float4 *const restrict ffnn_fcon_bias,
|
||||
__global const float4 *const restrict ffnn_fcon_weight,
|
||||
__global const float4 *const restrict ffnn_proj_bias,
|
||||
__global const float4 *const restrict ffnn_proj_weight)
|
||||
{
|
||||
__local union ircd_gpt_tokenv value;
|
||||
|
||||
ircd_gpt_attn_self
|
||||
(
|
||||
ctrl,
|
||||
opts,
|
||||
&value,
|
||||
state,
|
||||
mask
|
||||
);
|
||||
|
||||
ircd_gpt_attn_proj
|
||||
(
|
||||
ctrl,
|
||||
opts,
|
||||
accum,
|
||||
&value,
|
||||
attn_proj_bias,
|
||||
attn_proj_weight
|
||||
);
|
||||
|
||||
ircd_gpt_ffnn
|
||||
(
|
||||
ctrl,
|
||||
opts,
|
||||
accum,
|
||||
ffnn_norm_bias,
|
||||
ffnn_norm_weight,
|
||||
ffnn_fcon_bias,
|
||||
ffnn_fcon_weight,
|
||||
ffnn_proj_bias,
|
||||
ffnn_proj_weight
|
||||
);
|
||||
state[wi].proj[i][li] = token.proj[i][li];
|
||||
}
|
||||
|
||||
//
|
||||
|
@ -320,7 +360,8 @@ _ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
|
|||
const uint word_idx)
|
||||
{
|
||||
const ushort
|
||||
token = ctrl->token[(ctrl->head + tok_idx) % opts->buffer_tokens];
|
||||
ring_idx = (ctrl->head + tok_idx) % opts->buffer_tokens,
|
||||
token = ctrl->token[ring_idx];
|
||||
|
||||
const float4
|
||||
wte = vocab[token].word[word_idx],
|
||||
|
@ -362,9 +403,9 @@ ircd_gpt_lm_norm(__global const struct ircd_gpt_task *const ctrl,
|
|||
|
||||
// Final re-normalization
|
||||
ircd_simt_math_norm_f4lldr(token.word, token.word, tmp.word, ln, li);
|
||||
ircd_gpt_norm_fmad(token.word, token.word, norm_bias, norm_weight, li);
|
||||
ircd_gpt_norm_fmad(tmp.word, token.word, norm_bias, norm_weight, li);
|
||||
|
||||
accum[0].word[li] = token.word[li];
|
||||
accum[wi].word[li] = tmp.word[li];
|
||||
}
|
||||
|
||||
__kernel void
|
||||
|
@ -375,31 +416,122 @@ ircd_gpt_lm_logit(__global const struct ircd_gpt_task *const ctrl,
|
|||
__global const union ircd_gpt_tokenv *const restrict token)
|
||||
{
|
||||
const uint
|
||||
gi = get_global_id(0);
|
||||
gi = get_global_id(0),
|
||||
ti = ctrl->tokens - 1,
|
||||
words = opts->embed_width;
|
||||
|
||||
float4 acc = 0.0f;
|
||||
for(uint j = 0; j < 768/4; ++j)
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for(uint j = 0; j < words; ++j)
|
||||
{
|
||||
const float4
|
||||
in = accum[0].word[j],
|
||||
vocab = token[gi].word[j],
|
||||
res = vocab * in;
|
||||
in = accum[ti].word[j],
|
||||
vocab = token[gi].word[j];
|
||||
|
||||
acc += res;
|
||||
acc += vocab * in;
|
||||
}
|
||||
|
||||
float res = 0.0f;
|
||||
for(uint k = 0; k < 4; ++k)
|
||||
res += acc[k];
|
||||
|
||||
logit[gi] = res;
|
||||
if(gi < opts->logits)
|
||||
logit[gi] = res;
|
||||
else
|
||||
logit[gi] = -10000.0f;
|
||||
}
|
||||
|
||||
__kernel void
|
||||
ircd_gpt_lm_logsm(__global struct ircd_gpt_task *const ctrl,
|
||||
__constant const struct ircd_gpt_opts *const opts,
|
||||
__global float4 *const restrict logsm,
|
||||
__global float4 *const restrict logexp,
|
||||
__global const float4 *const restrict logit)
|
||||
{
|
||||
const uint
|
||||
gi = get_global_id(0),
|
||||
li = get_local_id(0),
|
||||
ln = get_local_size(0),
|
||||
logits = opts->logits,
|
||||
logits_alignup = logits + (ln - (logits % ln)),
|
||||
tn = logits_alignup / ln / 4,
|
||||
ti = tn * li;
|
||||
|
||||
__local float share[256];
|
||||
__local float4 share4[256];
|
||||
|
||||
share4[li] = -10000.0f;
|
||||
for(uint i = ti; i < ti + tn; ++i)
|
||||
share4[li] = max(share4[li], logit[i]);
|
||||
|
||||
share[li] = -10000.0f;
|
||||
for(uint k = 0; k < 4; ++k)
|
||||
share[li] = max(share[li], share4[li][k]);
|
||||
|
||||
ircd_simt_reduce_max_flldr(share, ln, li);
|
||||
|
||||
if(li == 0)
|
||||
share4[li] = ctrl->samax_mu = share[li];
|
||||
|
||||
ircd_simt_broadcast_f4lldr(share4, ln, li);
|
||||
|
||||
const float4
|
||||
mu = share4[li];
|
||||
|
||||
share4[li] = 0.0f;
|
||||
for(uint i = ti; i < ti + tn; ++i)
|
||||
{
|
||||
const float4
|
||||
reg = logit[i] - mu,
|
||||
res = exp(reg);
|
||||
|
||||
for(uint k = 0; k < 4; ++k)
|
||||
if(i * 4 + k < logits)
|
||||
share4[li][k] += res[k];
|
||||
|
||||
for(uint k = 0; k < 4; ++k)
|
||||
if(i * 4 + k < logits)
|
||||
logexp[i][k] = res[k];
|
||||
else
|
||||
logexp[i][k] = 0.0f;
|
||||
}
|
||||
|
||||
ircd_simt_reduce_add_f4lldr(share4, ln, li);
|
||||
|
||||
if(li == 0)
|
||||
{
|
||||
float sum = 0.0f;
|
||||
for(uint k = 0; k < 4; ++k)
|
||||
sum += share4[li][k];
|
||||
|
||||
share4[li][0] = ctrl->samax_sum = sum;
|
||||
share4[li][1] = ctrl->samax_lambda = 1.0f / sum;
|
||||
}
|
||||
|
||||
ircd_simt_broadcast_f4lldr(share4, ln, li);
|
||||
|
||||
const float4
|
||||
sum = share4[li][0],
|
||||
lambda = share4[li][1];
|
||||
for(uint i = ti; i < ti + tn; ++i)
|
||||
for(uint k = 0; k < 4; ++k)
|
||||
if(i * 4 + k < logits)
|
||||
logsm[i] = logexp[i] * lambda;
|
||||
else
|
||||
logsm[i] = 0.0f;
|
||||
}
|
||||
|
||||
inline void
|
||||
__attribute__((always_inline))
|
||||
ircd_gpt_leave(__global struct ircd_gpt_task *const ctrl,
|
||||
__constant const struct ircd_gpt_opts *const opts,
|
||||
const uint li)
|
||||
{
|
||||
// If the call value has been set to something other than default we
|
||||
// do nothing else here.
|
||||
if(ctrl->call != IRCD_GPT_ECOMPLETE)
|
||||
return;
|
||||
|
||||
// No action for other threads right now
|
||||
if(li != 0)
|
||||
return;
|
||||
|
@ -411,11 +543,6 @@ ircd_gpt_leave(__global struct ircd_gpt_task *const ctrl,
|
|||
ctrl->call = IRCD_GPT_ETOKENS;
|
||||
#endif
|
||||
|
||||
// If the call value has been set to something other than default we
|
||||
// do nothing else here.
|
||||
if(ctrl->call != IRCD_GPT_ECOMPLETE)
|
||||
return;
|
||||
|
||||
// On the last cycle, with no prior call or error code set, indicate
|
||||
// a nominal exit condition.
|
||||
if(ctrl->cycle + 1 >= opts->limit)
|
||||
|
@ -425,48 +552,83 @@ ircd_gpt_leave(__global struct ircd_gpt_task *const ctrl,
|
|||
}
|
||||
|
||||
ctrl->cycle += 1;
|
||||
ctrl->magic = 0xC7012C70U;
|
||||
}
|
||||
|
||||
inline void
|
||||
__attribute__((always_inline))
|
||||
ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl,
|
||||
__constant const struct ircd_gpt_opts *const opts,
|
||||
const uint li,
|
||||
__local const ushort *const restrict idx)
|
||||
__local const ushort *const restrict idx,
|
||||
__global const float *const restrict logsm,
|
||||
__global const float *const restrict logexp,
|
||||
__global const float *const restrict logit)
|
||||
{
|
||||
// When the hypercall code is already set, bail here.
|
||||
if(ctrl->call != IRCD_GPT_ECOMPLETE)
|
||||
return;
|
||||
|
||||
// To read from cells other than idx[0] we need this barrier.
|
||||
if(opts->top_k > 1)
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// No action for other threads right now
|
||||
// Mask for write-leader
|
||||
if(li != 0)
|
||||
return;
|
||||
|
||||
// When the hypercall code is already set, bail here.
|
||||
if(ctrl->call != IRCD_GPT_ECOMPLETE)
|
||||
return;
|
||||
|
||||
const bool
|
||||
buffer_full = ctrl->tokens >= opts->buffer_tokens;
|
||||
|
||||
const ulong
|
||||
rnd = ircd_simt_rand_xoshiro256pg(ctrl->rand),
|
||||
sel = rnd % max(opts->top_k, 1U);
|
||||
rnd = opts->top_k > 1?
|
||||
ircd_simt_rand_xoshiro256pg(ctrl->rand): 1UL;
|
||||
|
||||
const ushort
|
||||
token = idx[sel],
|
||||
token_idx = (ctrl->head + ctrl->tokens) % opts->buffer_tokens;
|
||||
entro = max(opts->top_k, 1U),
|
||||
select = rnd % entro,
|
||||
token = idx[select],
|
||||
dest = (ctrl->head + ctrl->tokens) % opts->buffer_tokens,
|
||||
tokens = min(ctrl->tokens + 1, opts->buffer_tokens),
|
||||
head = buffer_full?
|
||||
(ctrl->head + 1) % opts->buffer_tokens: ctrl->head;
|
||||
|
||||
ctrl->token[token_idx] = token;
|
||||
const ushort
|
||||
next_select = select + 1,
|
||||
next_token = idx[next_select];
|
||||
|
||||
if(buffer_full)
|
||||
ctrl->head = (ctrl->head + 1) % opts->buffer_tokens;
|
||||
else
|
||||
ctrl->tokens++;
|
||||
const float
|
||||
test_lsm = logexp[opts->label] * ctrl->samax_lambda,
|
||||
loss = 0.0f - log(test_lsm * ctrl->samax_lambda),
|
||||
perp = logsm[token] * 100.0f,
|
||||
cert = ((logsm[token] - logsm[next_token]) / logsm[token]) * 100.0f,
|
||||
loss_sum = ctrl->loss_sum + loss,
|
||||
perp_sum = ctrl->perp_sum + perp,
|
||||
cert_sum = ctrl->cert_sum + cert,
|
||||
mean_div = ctrl->epoch + 1.0f,
|
||||
loss_mean = loss_sum / mean_div,
|
||||
perp_mean = perp_sum / mean_div,
|
||||
cert_mean = cert_sum / mean_div;
|
||||
|
||||
ctrl->loss = loss;
|
||||
ctrl->loss_sum = loss_sum;
|
||||
ctrl->loss_mean = loss_mean;
|
||||
ctrl->perp = perp;
|
||||
ctrl->perp_sum = perp_sum;
|
||||
ctrl->perp_mean = perp_mean;
|
||||
ctrl->cert = cert;
|
||||
ctrl->cert_sum = cert_sum;
|
||||
ctrl->cert_mean = cert_mean;
|
||||
ctrl->head = head;
|
||||
ctrl->tokens = tokens;
|
||||
ctrl->token[dest] = token;
|
||||
}
|
||||
|
||||
__kernel void
|
||||
ircd_gpt_lm_select(__global struct ircd_gpt_task *const ctrl,
|
||||
__constant const struct ircd_gpt_opts *const opts,
|
||||
__global const float *const restrict logsm,
|
||||
__global const float *const restrict logexp,
|
||||
__global const float *const restrict logit)
|
||||
{
|
||||
const uint
|
||||
|
@ -476,17 +638,17 @@ ircd_gpt_lm_select(__global struct ircd_gpt_task *const ctrl,
|
|||
ln = get_local_size(0),
|
||||
wi = get_group_id(0),
|
||||
wn = get_num_groups(0),
|
||||
tn = 262,
|
||||
tn = opts->logits / ln,
|
||||
ti = tn * li;
|
||||
|
||||
__local ushort idx[192];
|
||||
__local ushort idx[256];
|
||||
|
||||
idx[li] = ti;
|
||||
for(uint j = ti + 1; j < ti + tn && j < 50257; ++j)
|
||||
if(logit[j] > logit[idx[li]])
|
||||
for(uint j = ti + 1; j < ti + tn; ++j)
|
||||
if(logsm[j] > logsm[idx[li]])
|
||||
idx[li] = j;
|
||||
|
||||
ircd_simt_sort_idx16_flldr(idx, logit, ln, li);
|
||||
ircd_gpt_lm_result(ctrl, opts, li, idx);
|
||||
ircd_simt_sort_idx16_flldr(idx, logsm, ln, li);
|
||||
ircd_gpt_lm_result(ctrl, opts, li, idx, logsm, logexp, logit);
|
||||
ircd_gpt_leave(ctrl, opts, li);
|
||||
}
|
||||
|
|
|
@ -32,20 +32,25 @@ namespace ircd::gpt::model
|
|||
init_h_attn_proj_bias(decoder &, const string_view &, const size_t &, const json::array &),
|
||||
init_h_attn_bias(decoder &, const string_view &, const size_t &, const json::array &);
|
||||
|
||||
static bool init_dataset(const string_view &);
|
||||
static bool init_from_cache(const string_view &);
|
||||
static void init_from_json_handle(decoder &, const init_handler &, const size_t &);
|
||||
static void init_from_json(const string_view &, const string_view &);
|
||||
static void init() noexcept;
|
||||
static void init(), fini() noexcept;
|
||||
|
||||
extern const init_handler
|
||||
manifest[],
|
||||
manifest_h[],
|
||||
manifest_td[];
|
||||
manifest_h[];
|
||||
|
||||
extern conf::item<std::string> path;
|
||||
extern conf::item<std::string> cache_path;
|
||||
extern conf::item<std::string>
|
||||
path,
|
||||
cache_path,
|
||||
dataset_path;
|
||||
|
||||
static fs::map
|
||||
default_model_shm,
|
||||
default_dataset_shm;
|
||||
|
||||
static fs::map default_model_shm;
|
||||
static std::unique_ptr<decoder> default_model_res;
|
||||
}
|
||||
|
||||
|
@ -76,14 +81,6 @@ ircd::gpt::model::manifest
|
|||
{ "wte.weight.json", init_wte_weight },
|
||||
};
|
||||
|
||||
decltype(ircd::gpt::model::manifest_td)
|
||||
ircd::gpt::model::manifest_td
|
||||
{
|
||||
{ "test.jsonl", nullptr, },
|
||||
{ "valid.jsonl", nullptr, },
|
||||
{ "train.jsonl", nullptr, },
|
||||
};
|
||||
|
||||
decltype(ircd::gpt::model::cache_path)
|
||||
ircd::gpt::model::cache_path
|
||||
{
|
||||
|
@ -91,6 +88,13 @@ ircd::gpt::model::cache_path
|
|||
{ "default", "model.cache.localhost" },
|
||||
};
|
||||
|
||||
decltype(ircd::gpt::model::dataset_path)
|
||||
ircd::gpt::model::dataset_path
|
||||
{
|
||||
{ "name", "ircd.gpt.model.dataset.path" },
|
||||
{ "default", string_view{} },
|
||||
};
|
||||
|
||||
decltype(ircd::gpt::model::path)
|
||||
ircd::gpt::model::path
|
||||
{
|
||||
|
@ -104,15 +108,35 @@ ircd::gpt::model::path
|
|||
decltype(ircd::gpt::model::default_model)
|
||||
ircd::gpt::model::default_model;
|
||||
|
||||
decltype(ircd::gpt::model::default_dataset)
|
||||
ircd::gpt::model::default_dataset;
|
||||
|
||||
decltype(ircd::gpt::model::default_data)
|
||||
ircd::gpt::model::default_data;
|
||||
|
||||
void
|
||||
ircd::gpt::model::init()
|
||||
noexcept
|
||||
{
|
||||
if(!model::path)
|
||||
return;
|
||||
|
||||
if(!init_from_cache(model::cache_path))
|
||||
init_from_json(model::cache_path, model::path);
|
||||
|
||||
if(model::dataset_path)
|
||||
init_dataset(model::dataset_path);
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::model::fini()
|
||||
noexcept
|
||||
{
|
||||
default_model = nullptr;
|
||||
default_model_shm = {};
|
||||
|
||||
default_dataset = nullptr;
|
||||
default_data.clear();
|
||||
default_dataset_shm = {};
|
||||
}
|
||||
|
||||
bool
|
||||
|
@ -170,10 +194,9 @@ ircd::gpt::model::init_from_json(const string_view &cache_path,
|
|||
const string_view &model_path)
|
||||
{
|
||||
util::timer stopwatch;
|
||||
auto decoder
|
||||
{
|
||||
std::make_unique<model::decoder>()
|
||||
};
|
||||
|
||||
auto decoder(std::make_unique<model::decoder>());
|
||||
memset(decoder.get(), 0x0, sizeof(model::decoder));
|
||||
|
||||
// Load the top level files, vocab etc
|
||||
for(size_t i(0); i < 4; ++i)
|
||||
|
@ -274,6 +297,55 @@ ircd::gpt::model::init_from_json_handle(decoder &d,
|
|||
};
|
||||
}
|
||||
|
||||
bool
|
||||
ircd::gpt::model::init_dataset(const string_view &path)
|
||||
{
|
||||
if(!fs::is_reg(path))
|
||||
return false;
|
||||
|
||||
const auto size
|
||||
{
|
||||
fs::size(path)
|
||||
};
|
||||
|
||||
const fs::fd fd
|
||||
{
|
||||
path
|
||||
};
|
||||
|
||||
fs::map::opts map_opts;
|
||||
map_opts.huge2mb = true;
|
||||
default_dataset_shm = fs::map
|
||||
{
|
||||
fd, map_opts, size
|
||||
};
|
||||
|
||||
default_dataset = string_view
|
||||
(
|
||||
default_dataset_shm
|
||||
);
|
||||
|
||||
size_t checkpoint(0);
|
||||
default_data.resize(260000); //TODO: XXX
|
||||
ircd::tokens(default_dataset, '\n', [&checkpoint]
|
||||
(const string_view &line)
|
||||
{
|
||||
default_data.at(checkpoint++) = line;
|
||||
});
|
||||
|
||||
char pbuf[48];
|
||||
log::info
|
||||
{
|
||||
log, "dataset(%p) mapped `%s' %s @%lu",
|
||||
data(default_dataset_shm),
|
||||
path,
|
||||
pretty(pbuf, iec(size)),
|
||||
checkpoint,
|
||||
};
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::model::init_wpe_weight(decoder &d,
|
||||
const string_view &name,
|
||||
|
|
|
@ -32,7 +32,7 @@ decltype(ircd::gpt::pipe::flush_cycles)
|
|||
ircd::gpt::pipe::flush_cycles
|
||||
{
|
||||
{ "name", "ircd.gpt.pipe.flush" },
|
||||
{ "default", 0L, },
|
||||
{ "default", 1L, },
|
||||
};
|
||||
|
||||
decltype(ircd::gpt::pipe::default_model)
|
||||
|
@ -73,20 +73,40 @@ ircd::gpt::pipe::init()
|
|||
{
|
||||
*pipe::default_code, *pipe::default_model
|
||||
};
|
||||
|
||||
log::debug
|
||||
{
|
||||
log, "Pipe initialized from model:%p data:%p code:%p desc:%p",
|
||||
&default_model,
|
||||
pipe::default_model,
|
||||
pipe::default_code,
|
||||
pipe::default_desc,
|
||||
};
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::pipe::fini()
|
||||
noexcept
|
||||
{
|
||||
delete default_desc;
|
||||
default_desc = nullptr;
|
||||
const auto pending
|
||||
{
|
||||
cl::work::list.size()
|
||||
};
|
||||
|
||||
delete default_code;
|
||||
default_code = nullptr;
|
||||
if(pending)
|
||||
{
|
||||
log::warning
|
||||
{
|
||||
log, "Waiting for %zu pending tasks to leave the pipe...",
|
||||
pending,
|
||||
};
|
||||
|
||||
delete default_model;
|
||||
default_model = nullptr;
|
||||
cl::sync();
|
||||
}
|
||||
|
||||
delete default_desc; default_desc = nullptr;
|
||||
delete default_code; default_code = nullptr;
|
||||
delete default_model; default_model = nullptr;
|
||||
}
|
||||
|
||||
//
|
||||
|
@ -96,26 +116,29 @@ noexcept
|
|||
void
|
||||
ircd::gpt::generate(task &task)
|
||||
{
|
||||
if(unlikely(!pipe::default_model))
|
||||
pipe::init();
|
||||
assert(pipe::default_model);
|
||||
|
||||
assert(task.opts);
|
||||
const auto &opts
|
||||
{
|
||||
*task.opts
|
||||
};
|
||||
|
||||
assert(task.ctrl);
|
||||
auto &ctrl
|
||||
{
|
||||
*task.ctrl
|
||||
};
|
||||
|
||||
ctrl.cycle = 0;
|
||||
ctrl.call = IRCD_GPT_ECOMPLETE;
|
||||
ctrl.host_tsc = prof::cycles();
|
||||
size_t cycle(ctrl.cycle);
|
||||
const size_t tokens(ctrl.tokens);
|
||||
volatile const size_t tokens(ctrl.tokens);
|
||||
volatile const auto epoch(ctrl.epoch);
|
||||
volatile size_t cycle(ctrl.cycle);
|
||||
|
||||
std::deque<pipe::exec> list;
|
||||
for(; cycle < opts.limit; ++cycle)
|
||||
for(; cycle < opts.limit && run::level == run::level::RUN; ++cycle)
|
||||
{
|
||||
// When the release/acquire bits are set the control pages are sent
|
||||
// and received; only set on first and last iterations of this loop.
|
||||
|
@ -137,7 +160,7 @@ ircd::gpt::generate(task &task)
|
|||
pipe::flush_cycles
|
||||
|
||||
// Skip flushing on cycles already performing IO or waiting.
|
||||
&& !rel && !acq && list.size() <= pipe::queue_cycles
|
||||
&& !acq && list.size() <= pipe::queue_cycles
|
||||
|
||||
// The configuration item can specify an interval greater than
|
||||
// one between flushes.
|
||||
|
@ -167,6 +190,8 @@ ircd::gpt::generate(task &task)
|
|||
// Wait for all unfinished
|
||||
list.clear();
|
||||
|
||||
assert(ctrl.magic == 0xC7012C70);
|
||||
|
||||
// Interp error codes
|
||||
if(unlikely(ctrl.call <= 0))
|
||||
throw error
|
||||
|
@ -176,7 +201,7 @@ ircd::gpt::generate(task &task)
|
|||
reflect(ctrl.call),
|
||||
};
|
||||
|
||||
always_assert(ctrl.cycle == cycle);
|
||||
assert(ctrl.cycle == cycle);
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -199,7 +224,7 @@ ircd::gpt::pipe::profile_dumplog(pipe::exec &exec)
|
|||
log::logf
|
||||
{
|
||||
log, log::level::DEBUG,
|
||||
"coil:%-2lu %8s %8s %8s %8s\n",
|
||||
"coil:%-2lu %8s %8s %8s %8s",
|
||||
i,
|
||||
util::pretty(tmbuf[0], si(pro[0]), 1),
|
||||
util::pretty(tmbuf[1], si(pro[1]), 1),
|
||||
|
@ -259,13 +284,18 @@ ircd::gpt::pipe::exec::exec(task &task,
|
|||
}
|
||||
,range_lm_logit
|
||||
{
|
||||
{ 262 * 192UL, 0 }, // align_up(50257) / 192
|
||||
{ 192UL, 0 },
|
||||
{ 786 * 64UL, 0 }, // align_up(50257) / 64
|
||||
{ 64UL, 0 },
|
||||
}
|
||||
,range_lm_logsm
|
||||
{
|
||||
{ 1 * 256UL, 0 },
|
||||
{ 256UL, 0 },
|
||||
}
|
||||
,range_lm_select
|
||||
{
|
||||
{ 1 * 192UL, 0 },
|
||||
{ 192UL, 0 },
|
||||
{ 1 * 256UL, 0 },
|
||||
{ 256UL, 0 },
|
||||
}
|
||||
,release_opts
|
||||
{
|
||||
|
@ -314,6 +344,10 @@ ircd::gpt::pipe::exec::exec(task &task,
|
|||
{
|
||||
desc->lm_logit, range_lm_logit, lmhead_opts
|
||||
}
|
||||
,lm_logsm
|
||||
{
|
||||
desc->lm_logsm, range_lm_logsm, lmhead_opts
|
||||
}
|
||||
,lm_select
|
||||
{
|
||||
desc->lm_select, range_lm_select, lmamax_opts
|
||||
|
@ -342,8 +376,8 @@ ircd::gpt::pipe::code::compile_opts
|
|||
" -cl-finite-math-only"
|
||||
" -cl-unsafe-math-optimizations"
|
||||
" -cl-fast-relaxed-math"
|
||||
//" -cl-mad-enable"
|
||||
//" -cl-single-precision-constant"
|
||||
" -cl-mad-enable"
|
||||
" -cl-single-precision-constant"
|
||||
//" -cl-fp32-correctly-rounded-divide-sqrt"
|
||||
};
|
||||
|
||||
|
@ -412,6 +446,16 @@ ircd::gpt::pipe::desc::desc(pipe::code &code,
|
|||
65536 * sizeof(float),
|
||||
mutable_buffer{}
|
||||
}
|
||||
,logexp
|
||||
{
|
||||
65536 * sizeof(float),
|
||||
mutable_buffer{}
|
||||
}
|
||||
,logsm
|
||||
{
|
||||
65536 * sizeof(float),
|
||||
mutable_buffer{}
|
||||
}
|
||||
,ctrl
|
||||
{
|
||||
sizeof(struct ircd_gpt_task),
|
||||
|
@ -452,12 +496,24 @@ ircd::gpt::pipe::desc::desc(pipe::code &code,
|
|||
accum,
|
||||
model.embed->token,
|
||||
}
|
||||
,lm_logsm
|
||||
{
|
||||
code,
|
||||
"ircd_gpt_lm_logsm",
|
||||
ctrl,
|
||||
opts,
|
||||
logsm,
|
||||
logexp,
|
||||
logit,
|
||||
}
|
||||
,lm_select
|
||||
{
|
||||
code,
|
||||
"ircd_gpt_lm_select",
|
||||
ctrl,
|
||||
opts,
|
||||
logsm,
|
||||
logexp,
|
||||
logit,
|
||||
}
|
||||
,layer
|
||||
|
|
|
@ -17336,6 +17336,7 @@ console_cmd__gpt__raw(opt &out, const string_view &line)
|
|||
};
|
||||
|
||||
out
|
||||
<< text
|
||||
<< output
|
||||
<< std::endl;
|
||||
return true;
|
||||
|
@ -17347,6 +17348,14 @@ console_cmd__gpt(opt &out, const string_view &line)
|
|||
return console_cmd__gpt__raw(out, line);
|
||||
}
|
||||
|
||||
bool
|
||||
console_cmd__gpt__pipe__reset(opt &out, const string_view &line)
|
||||
{
|
||||
gpt::pipe::fini();
|
||||
gpt::pipe::init();
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
console_cmd__gpt__token(opt &out, const string_view &line)
|
||||
{
|
||||
|
@ -17368,3 +17377,33 @@ console_cmd__gpt__token(opt &out, const string_view &line)
|
|||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
console_cmd__gpt__data(opt &out, const string_view &line)
|
||||
{
|
||||
const params param{line, " ",
|
||||
{
|
||||
"id"
|
||||
}};
|
||||
|
||||
const auto idx
|
||||
{
|
||||
param.at<uint>("id")
|
||||
};
|
||||
|
||||
const json::object step
|
||||
{
|
||||
gpt::model::default_data.at(idx)
|
||||
};
|
||||
|
||||
out
|
||||
<< "#" << step["id"]
|
||||
<< " | " << step["length"]
|
||||
<< " " << (step["ended"]? "ended"_sv: string_view{})
|
||||
<< std::endl
|
||||
<< std::endl
|
||||
<< step["text"]
|
||||
<< std::endl;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue