diff --git a/include/ircd/gpt/ctrl.h b/include/ircd/gpt/ctrl.h index dad122ae2..54f7eafa9 100644 --- a/include/ircd/gpt/ctrl.h +++ b/include/ircd/gpt/ctrl.h @@ -11,89 +11,71 @@ #pragma once #define HAVE_IRCD_GPT_CTRL_H -/// Epoch Precision Interrupt Controller -/// -struct ircd_gpt_ctrl_epic -{ - /// Accumulates the number of task cycles. The cycle counter is incremented - /// by device software after each repetition of the kernel pipeline to - /// produce one additional token. - ulong cycle; - - /// Accumulates the epoch count for the task. The counter is incremented - /// by one in device software before control returns back to the host. - /// Several cycles may occur during each epoch. - ulong epoch; - - /// Accumulates the training epoch count for the task. The counter is - /// incremented by one in device software for each backward propagation. - ulong step; - - /// Updated by the host with the value of the timestamp register as sampled - /// immediately before each transfer of control to the device. - ulong host_tsc; - - /// Accumulates time in microseconds elapsed for the task. - ulong elapsed; -}; - -/// Token Context Buffer (Control Block) -/// -struct ircd_gpt_ctrl_tokens -{ - /// Token ring head. Tokens in the ring extend behind the head for - /// `tokens`. The `head` value is automatically modulated by device - /// software to wrap around the ring. - uint head; - - /// Token counter. The counter indicates the number of valid tokens in - /// the context buffer. This value must not exceed the buffer size. - uint count; - - /// Accumulates the number of tokens produced by the task. Several tokens - /// may be produced each epoch, but currently only one token is produced - /// each cycle. - ulong produced; - - /// Accumulates the number tokens witnessed by the task. The number of - /// tokens in the context for each cycle is counted as witnessed. - ulong witnessed; -}; - -/// Target label register (abridged) -/// +/// Result logit control block. 
struct ircd_gpt_ctrl_logit { /// Vocabulary token. ushort token; /// Padding #0. - ushort _pad0; + ushort flag; /// Result logit softmax probability. float samax; }; -/// Target label register (full) -/// +/// Target label control block. Results for each target are registered +/// and state is updated each cycle. struct ircd_gpt_ctrl_label { - /// Vocabulary token. - ushort token; - - /// Padding #0. - ushort _pad0; - - /// Result logit softmax probability. - float samax; + /// Logit descriptor + struct ircd_gpt_ctrl_logit logit; /// Loss state struct ircd_math_mean loss; /// Perplexity state - struct ircd_math_mean perp; -} -__attribute__((aligned(64))); + struct ircd_math_mean ppl; +}; + +/// Master clock +struct ircd_gpt_ctrl_clk +{ + /// Master clock. The cycle count is incremented by one in device software + /// after each repetition of the kernels producing one additional token. + /// The cycle count resets to zero before the beginning of each sample. + uint cycle; + + /// Master clock. Sample consists of one or more cycles; sample count is + /// incremented by one in device software after every accept condition, + /// growing monotonically for the `step`; resets to zero each `step`. + uint samp; + + /// Master clock. Step (or timestep) consists of one or more samples. Step + /// count is incremented by one in device software after each backward + /// propagation. Step grows monotonically even across epochs. + uint step; + + /// Master clock. Epoch consists of one or more steps; epoch count is + /// incremented by one after every backward propagation. + uint epoch; +}; + +/// Profiling block +struct ircd_gpt_ctrl_prof +{ + /// Host timestamp sampled at last control page transfer to the device. + ulong released; + + /// Host timestamp sampled when this control page is acquired by the host. + ulong acquired; + + /// Device timestamp at beginning of cycle. + ulong entered; + + /// Device timestamp at end of cycle. 
+ ulong finished; +}; /// Task Control Page /// @@ -104,50 +86,94 @@ __attribute__((aligned(64))); /// struct ircd_gpt_ctrl { - /// Epoch counting & interrupt control block. - struct ircd_gpt_ctrl_epic epic; + /// Accept register. If >= 0 the cycle produced a token which satisfies the + /// indicated accept condition. + int accept; - /// Token context control block. Contains state for the token context - /// buffer; the buffer with the tokens themselves is elsewhere. - struct ircd_gpt_ctrl_tokens tokens; + /// Dispatch register. Device software wishes additional cycles to be + /// commanded by the host. Effectively minimum distance until next accept. + uint dispatch; - /// Top result summary from the softed result logit softmax vector. This - /// is updated each cycle by device software with extended statistics on - /// the top N results. - struct ircd_gpt_ctrl_logit top[16]; + /// Token counter. The counter indicates the number of valid tokens in + /// the context buffer. This value must not exceed the opts.buffer_size. + /// This value should not exceed the opts.context_size at least for now. + uint count; - /// Target label control block. Results for each target are registered - /// and state is updated each cycle. - struct ircd_gpt_ctrl_label label[4]; + /// Token counter. The counter indicates the number of valid tokens in + /// the context buffer. This value must not exceed the opts.buffer_size. + /// This value should not exceed the opts.context_size at least for now. + uint tokens; - /// Result logit vector softmax internal state. - struct ircd_math_samax samax; + /// Master clock. + struct ircd_gpt_ctrl_clk clk; + + /// Profiling related. + struct ircd_gpt_ctrl_prof prof; /// PRNG xoshiro256 internal state (note: see opts.h to seed the prng). ulong rand[4]; - /// Perform backprop TODO: XXX - bool prop; + /// Top result summary from the softed result logit softmax vector. 
This + /// is updated each cycle by device software with extended statistics on + /// the top N results. + struct ircd_gpt_ctrl_logit top[16] __attribute__((aligned(8))); - /// Header magic 0xC7012C70 - uint magic; + /// User label control block. Results for each target are registered + /// and state is updated each cycle; averaged for each step. + struct ircd_gpt_ctrl_label label[14] __attribute__((aligned(64))); - /// The token buffer starts at offset 2048 and continues to the end of - /// the page; options specify the size of the tokens buffer in tokens. - /// Additional pages must be attached for larger buffer sizes. - ushort token[] __attribute__((aligned(2048))); + /// Target result label; traces training token. + struct ircd_gpt_ctrl_label target __attribute__((aligned(64))); + + /// Selected result token label. + struct ircd_gpt_ctrl_label select __attribute__((aligned(64))); + + /// Incremented when the target is the selected token. + uint hit, miss; + + /// Attention summary; [layer][head] => [token]. Each value points to a + /// position in the token buffer. The top-scoring softmax result for each + /// head in each layer is attending to the token.at(value) for this cycle. + /// These values are completely updated every cycle. + ushort attn[12][12]; + + /// Header magic: host sets 0xDEADBEEF before release to device; device + /// sets 0xC7012C7012 before release to host. 
+ ulong magic; + + /// Token buffer + ushort token[1024] __attribute__((aligned(2048))); } __attribute__((aligned(4096))); -#ifdef __cplusplus +#if defined(__cplusplus) namespace ircd::gpt { using ctrl = struct ircd_gpt_ctrl; + using ctrl_clk = struct ircd_gpt_ctrl_clk; + using ctrl_prof = struct ircd_gpt_ctrl_prof; + using ctrl_logit = struct ircd_gpt_ctrl_logit; + using ctrl_label = struct ircd_gpt_ctrl_label; + + string_view debug_token_at(const mutable_buffer &, const opts &, const ctrl &, const uint, const uint fmt = -1U); + string_view debug_token(const mutable_buffer &, const opts &, const ctrl &, const uint fmt = -1U); + string_view debug_head(const mutable_buffer &, const opts &, const ctrl_clk &); + string_view debug_head(const mutable_buffer &, const opts &, const ctrl &); + string_view debug(const mutable_buffer &, const opts &, const ctrl_logit &, const uint fmt = 0); + string_view debug(const mutable_buffer &, const opts &, const ctrl_label &, const uint fmt = 0); + string_view debug(const mutable_buffer &, const opts &, const ctrl &); + + string_view debug_attn(const mutable_buffer &, const opts &, const ctrl &, const uint); + string_view debug_label(const mutable_buffer &, const opts &, const ctrl &, const uint, const uint fmt = 0); + string_view debug_top(const mutable_buffer &, const opts &, const ctrl &, const uint); } #endif -#ifdef __cplusplus -static_assert(sizeof(struct ircd_gpt_ctrl) == 4096); +#if defined(__cplusplus) +static_assert(sizeof(struct ircd_gpt_ctrl) % 4096 == 0); +#endif + +#if defined(__cplusplus) && defined(__GLIBCXX__) static_assert(offsetof(struct ircd_gpt_ctrl, token) == 2048); static_assert(std::is_standard_layout::value); #endif diff --git a/include/ircd/gpt/epoch.h b/include/ircd/gpt/epoch.h new file mode 100644 index 000000000..a4b4b9433 --- /dev/null +++ b/include/ircd/gpt/epoch.h @@ -0,0 +1,38 @@ +// Matrix Construct +// +// Copyright (C) Matrix Construct Developers, Authors & Contributors +// Copyright (C) 2016-2022 
Jason Volk +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice is present in all copies. The +// full license for this software is available in the LICENSE file. + +#pragma once +#define HAVE_IRCD_GPT_EPOCH_H + +/// Perform one task epoch on the device. +/// +struct ircd::gpt::epoch +{ + gpt::task &task; + pipe::desc &desc; + + const gpt::opts &opts; + gpt::ctrl &ctrl; + + const uint id; + const size_t start, stop; + f32 *moment[2]; + + pipe::prof profile; + + void profile_accumulate(const pipe::prof &); + + public: + bool done() const noexcept; + bool operator()(); + + epoch(gpt::task &); + ~epoch() noexcept; +}; diff --git a/include/ircd/gpt/gpt.h b/include/ircd/gpt/gpt.h index f3b91a5d4..cef5fe9ae 100644 --- a/include/ircd/gpt/gpt.h +++ b/include/ircd/gpt/gpt.h @@ -17,16 +17,36 @@ namespace ircd::gpt { IRCD_EXCEPTION(ircd::error, error) + struct samp; + struct step; + struct epoch; struct task; extern log::log log; } #include "vocab.h" -#include "model.h" #include "token.h" +#include "vector.h" +#include "model.h" #include "opts.h" #include "ctrl.h" -#include "task.h" #include "pipe/pipe.h" +#include "samp.h" +#include "step.h" +#include "epoch.h" +#include "task.h" #include "generate.h" + +namespace ircd::gpt +{ + void backprop(const opts &, const u32, const f32, model::decoder &, f32 *const __restrict__ [2]) noexcept; + + void log_debug(const opts &, const ctrl &); + void log_debug_token(const opts &, const ctrl &, const uint); + void log_debug_attns(const opts &, const ctrl &); + void log_debug_attns_top(const opts &, const ctrl &); + void log_debug_labels(const opts &, const ctrl &); + void log_debug_topn(const opts &, const ctrl &); + void log_debug_prof(const opts &, const ctrl &, const pipe::prof &); +} diff --git a/include/ircd/gpt/gpu.h b/include/ircd/gpt/gpu.h new file mode 100644 index 
000000000..6568d8418 --- /dev/null +++ b/include/ircd/gpt/gpu.h @@ -0,0 +1,290 @@ +// Matrix Construct +// +// Copyright (C) Matrix Construct Developers, Authors & Contributors +// Copyright (C) 2016-2022 Jason Volk +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice is present in all copies. The +// full license for this software is available in the LICENSE file. + +typedef union ircd_gpt_vector_f32x4 ircd_gpt_vectorv __attribute__((aligned(4096))); +typedef struct ircd_gpt_attn_qkv_f32x4 ircd_gpt_attn_qkvv __attribute__((aligned(4096))); +typedef union ircd_gpt_attn_aperature_f32x4 ircd_gpt_attn_aperaturev __attribute__((aligned(4096))); +typedef union ircd_gpt_ffnn_aperature_f32x4 ircd_gpt_ffnn_aperaturev __attribute__((aligned(4096))); + +// +// Frontside +// + +void +__attribute__((internal_linkage)) +ircd_gpt_norm_fmad(__local ircd_gpt_vectorv *out, + __local const ircd_gpt_vectorv *in, + __global const ircd_gpt_vectorv *bias, + __global const ircd_gpt_vectorv *weight, + const uint li); + +void +__attribute__((internal_linkage)) +ircd_gpt_norm(__local ircd_gpt_vectorv *out, + __local const ircd_gpt_vectorv *in, + __local ircd_gpt_vectorv *tmp, + __global const ircd_gpt_vectorv *bias, + __global const ircd_gpt_vectorv *weight, + const uint ln, + const uint li); + +void +__attribute__((internal_linkage)) +ircd_gpt_ffnn_gelu(__local ircd_gpt_ffnn_aperaturev *out, + __local const ircd_gpt_ffnn_aperaturev *in, + const uint i); + +void +__attribute__((internal_linkage)) +ircd_gpt_ffnn_fcon_tmul(__constant const struct ircd_gpt_opts *opts, + __local ircd_gpt_ffnn_aperaturev *out, + __local const ircd_gpt_vectorv *in, + __global const ircd_gpt_ffnn_aperaturev *bias, + __global const ircd_gpt_ffnn_aperaturev *weight, + const uint li); + +void +__attribute__((internal_linkage)) +ircd_gpt_ffnn_fcon(__global const struct 
ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __local ircd_gpt_ffnn_aperaturev *out, + __local const ircd_gpt_vectorv *in, + __global const ircd_gpt_ffnn_aperaturev *bias, + __global const ircd_gpt_ffnn_aperaturev *weight, + const uint ln, + const uint li); + +void +__attribute__((internal_linkage)) +ircd_gpt_ffnn_proj_tmul(__constant const struct ircd_gpt_opts *opts, + __local ircd_gpt_vectorv *out, + __local const ircd_gpt_ffnn_aperaturev *in, + __global const ircd_gpt_vectorv *bias, + __global const ircd_gpt_vectorv *weight, + const uint li); + +void +__attribute__((internal_linkage)) +ircd_gpt_ffnn(__global const struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __local ircd_gpt_vectorv *vector, + __local ircd_gpt_ffnn_aperaturev *buf, + __local ircd_gpt_vectorv *tmp, + __global const ircd_gpt_vectorv *norm_bias, + __global const ircd_gpt_vectorv *norm_weight, + __global const ircd_gpt_ffnn_aperaturev *fcon_bias, + __global const ircd_gpt_ffnn_aperaturev *fcon_weight, + __global const ircd_gpt_vectorv *proj_bias, + __global const ircd_gpt_vectorv *proj_weight, + const uint ln, + const uint li); + +void +__attribute__((internal_linkage)) +ircd_gpt_attn_fcon_tmul(__constant const struct ircd_gpt_opts *opts, + __local ircd_gpt_attn_aperaturev *out, + __local const ircd_gpt_vectorv *in, + __global const ircd_gpt_attn_aperaturev *bias, + __global const ircd_gpt_attn_aperaturev *weight, + const uint ln, + const uint li); + +__kernel void +__attribute__((visibility("protected"))) +ircd_gpt_attn_fcon(__global const struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __private const uint layer, + __global ircd_gpt_attn_aperaturev *state, + __global const ircd_gpt_vectorv *accum, + __global const ircd_gpt_vectorv *norm_bias, + __global const ircd_gpt_vectorv *norm_weight, + __global const ircd_gpt_attn_aperaturev *fcon_bias, + __global const ircd_gpt_attn_aperaturev *fcon_weight); + +// +// head 
+// + +void +__attribute__((internal_linkage)) +_ircd_gpt_lm_embed(__global const struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __global ircd_gpt_vectorv *accum, + __global const ircd_gpt_vectorv *pos, + __global const ircd_gpt_vectorv *vocab, + const ushort out_idx, + const ushort tok_idx, + const ushort word_idx); + +__kernel void +__attribute__((visibility("protected"))) +ircd_gpt_lm_embed(__global const struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __global ircd_gpt_vectorv *accum, + __global const ircd_gpt_vectorv *pos, + __global const ircd_gpt_vectorv *vocab); + +__kernel void +__attribute__((visibility("protected"))) +ircd_gpt_lm_norm(__global const struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __global ircd_gpt_vectorv *accum, + __global const ircd_gpt_vectorv *norm_bias, + __global const ircd_gpt_vectorv *norm_weight); + +__kernel void +__attribute__((visibility("protected"))) +ircd_gpt_lm_logit(__global const struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __global float *logit, + __global const ircd_gpt_vectorv *accum, + __global const ircd_gpt_vectorv *pos, + __global const ircd_gpt_vectorv *vocab); + +__kernel void +__attribute__((visibility("protected"))) +ircd_gpt_lm_logsm(__global struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __global float logit[65536]); + +void +__attribute__((internal_linkage)) +ircd_gpt_lm_result_top(__local struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __local const ushort *idx, + __global const float *logsm, + const uint i); + +void +__attribute__((internal_linkage)) +ircd_gpt_lm_result_label_mean(__local struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __local struct ircd_math_mean *mean, + const float last); + +void +__attribute__((internal_linkage)) +ircd_gpt_lm_result_label(__local struct ircd_gpt_ctrl *ctrl, + 
__constant const struct ircd_gpt_opts *opts, + __local struct ircd_gpt_ctrl_label *label, + __global const float *logsm); + +ushort +__attribute__((internal_linkage)) +ircd_gpt_lm_result_select(__local struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __local const ushort *idx, + __global const float *logsm); + +uint +__attribute__((internal_linkage)) +ircd_gpt_accept_len(__local struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + const uint i); + +uint +__attribute__((internal_linkage)) +ircd_gpt_accept_match(__local struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + const uint i); + +int +__attribute__((internal_linkage)) +ircd_gpt_accept_check(__local struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts); + +void +__attribute__((internal_linkage)) +ircd_gpt_accept(__local struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts); + +// +// backside +// + +void +__attribute__((internal_linkage)) +ircd_gpt_prop_elem(__global const struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __global float4 *param_, + __global float4 *exp_avg_, + __global float4 *exp_avg_sqr_); + +// +// backpropagations +// + +__kernel void +__attribute__((visibility("protected"))) +ircd_gpt_norm_prop(__global const struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __global ircd_gpt_vectorv *bias, + __global ircd_gpt_vectorv *bias_m0, + __global ircd_gpt_vectorv *bias_m1, + __global ircd_gpt_vectorv *weight, + __global ircd_gpt_vectorv *weight_m0, + __global ircd_gpt_vectorv *weight_m1); + +__kernel void +__attribute__((visibility("protected"))) +ircd_gpt_coil_prop_attn(__global const struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __global ircd_gpt_vectorv *norm_bias, + __global ircd_gpt_vectorv *norm_bias_m0, + __global ircd_gpt_vectorv *norm_bias_m1, + __global ircd_gpt_vectorv *norm_weight, + 
__global ircd_gpt_vectorv *norm_weight_m0, + __global ircd_gpt_vectorv *norm_weight_m1, + __global ircd_gpt_attn_aperaturev *fcon_bias, + __global ircd_gpt_attn_aperaturev *fcon_bias_m0, + __global ircd_gpt_attn_aperaturev *fcon_bias_m1, + __global ircd_gpt_attn_aperaturev *fcon_weight, + __global ircd_gpt_attn_aperaturev *fcon_weight_m0, + __global ircd_gpt_attn_aperaturev *fcon_weight_m1, + __global ircd_gpt_vectorv *proj_bias, + __global ircd_gpt_vectorv *proj_bias_m0, + __global ircd_gpt_vectorv *proj_bias_m1, + __global ircd_gpt_vectorv *proj_weight, + __global ircd_gpt_vectorv *proj_weight_m0, + __global ircd_gpt_vectorv *proj_weight_m1); + +__kernel void +__attribute__((visibility("protected"))) +ircd_gpt_coil_prop_ffnn(__global const struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __global ircd_gpt_vectorv *norm_bias, + __global ircd_gpt_vectorv *norm_bias_m0, + __global ircd_gpt_vectorv *norm_bias_m1, + __global ircd_gpt_vectorv *norm_weight, + __global ircd_gpt_vectorv *norm_weight_m0, + __global ircd_gpt_vectorv *norm_weight_m1, + __global ircd_gpt_ffnn_aperaturev *fcon_bias, + __global ircd_gpt_ffnn_aperaturev *fcon_bias_m0, + __global ircd_gpt_ffnn_aperaturev *fcon_bias_m1, + __global ircd_gpt_ffnn_aperaturev *fcon_weight, + __global ircd_gpt_ffnn_aperaturev *fcon_weight_m0, + __global ircd_gpt_ffnn_aperaturev *fcon_weight_m1, + __global ircd_gpt_vectorv *proj_bias, + __global ircd_gpt_vectorv *proj_bias_m0, + __global ircd_gpt_vectorv *proj_bias_m1, + __global ircd_gpt_vectorv *proj_weight, + __global ircd_gpt_vectorv *proj_weight_m0, + __global ircd_gpt_vectorv *proj_weight_m1); + +__kernel void +__attribute__((visibility("protected"))) +ircd_gpt_lm_embed_prop(__global const struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __global ircd_gpt_vectorv *pos, + __global ircd_gpt_vectorv *pos_m0, + __global ircd_gpt_vectorv *pos_m1, + __global ircd_gpt_vectorv *token, + __global ircd_gpt_vectorv 
*token_m0, + __global ircd_gpt_vectorv *token_m1); diff --git a/include/ircd/gpt/model.h b/include/ircd/gpt/model.h index b196e2d25..cbbd72cec 100644 --- a/include/ircd/gpt/model.h +++ b/include/ircd/gpt/model.h @@ -20,66 +20,105 @@ namespace ircd::gpt::model struct embed; struct decoder; - constexpr auto align {64}; + struct prop; + struct text; + extern decoder *default_model; + extern float *default_moment[2]; + extern float *default_checkpoint[3]; extern string_view default_dataset; extern std::vector default_data; + + constexpr auto alignment {4096}; } +/// Layer normalization +struct ircd::gpt::model::norm +{ + union ircd_gpt_vector + bias alignas(alignment), + weight alignas(alignment); +}; + /// Attention aperature struct ircd::gpt::model::attn { - float - attn_bias alignas(align) [2304], - attn_weight alignas(align) [768][2304]; + model::norm + norm; - float - proj_bias alignas(align) [768], - proj_weight alignas(align) [768][768]; + union ircd_gpt_attn_aperature + fcon_bias alignas(alignment), + fcon_weight alignas(alignment) [768]; + + union ircd_gpt_vector + proj_bias alignas(alignment), + proj_weight alignas(alignment) [768]; }; /// Feed-forward neural network struct ircd::gpt::model::ffnn { - float - fc_bias alignas(align) [3072], - fc_weight alignas(align) [768][3072]; + model::norm + norm; - float - proj_bias alignas(align) [768], - proj_weight alignas(align) [3072][768]; -}; + union ircd_gpt_ffnn_aperature + fcon_bias alignas(alignment), + fcon_weight alignas(alignment) [768]; -/// Layer normalization -struct ircd::gpt::model::norm -{ - float - bias alignas(align) [768], - weight alignas(align) [768]; + union ircd_gpt_vector + proj_bias alignas(alignment), + proj_weight alignas(alignment) [3072]; }; /// Transformer block struct ircd::gpt::model::block { - norm ln1; - model::attn attn; + model::attn + attn; - norm ln2; - model::ffnn ffnn; + model::ffnn + ffnn; }; /// Vocabulary embeddings struct ircd::gpt::model::embed { - float - pos alignas(align) 
[1024][768], - token alignas(align) [65536][768]; + model::norm + norm; + + union ircd_gpt_vector + pos alignas(alignment) [1024], + token alignas(alignment) [65536]; }; -struct ircd::gpt::model::decoder +/// Transformer decoder +struct alignas(ircd::gpt::model::alignment) +ircd::gpt::model::decoder { - block layer[12]; + model::block + layer[12]; - norm f; - embed word; + model::embed + embed; +}; + +struct ircd::gpt::model::prop +{ + static constexpr const char + *const ended {"ended"}, + *const id {"id"}, + *const length {"length"}, + *const text {"text"}; +}; + +struct ircd::gpt::model::text +:json::tuple +< + json::property, + json::property, + json::property, + json::property +> +{ + using super_type::tuple; }; diff --git a/include/ircd/gpt/opts.h b/include/ircd/gpt/opts.h index 3c4e64255..9f0b8ae85 100644 --- a/include/ircd/gpt/opts.h +++ b/include/ircd/gpt/opts.h @@ -11,6 +11,13 @@ #pragma once #define HAVE_IRCD_GPT_OPTS_H +#if defined(__cplusplus) +namespace ircd::gpt::model +{ + struct decoder; +} +#endif + /// Task Options Page /// /// The option block is directly shared with task software as constant data. @@ -20,30 +27,23 @@ /// struct ircd_gpt_opts { - #ifdef __cplusplus - ircd_gpt_opts(const ircd::gpt::model::decoder * = nullptr) noexcept; + #if defined(__cplusplus) + ircd_gpt_opts() noexcept; #endif - /// Reference to the model (currently not available in device software). - #ifndef __cplusplus - const void *model; - #else - const ircd::gpt::model::decoder *model; - #endif + // + // Frontside + // /// Seed for the task's PRNG. ulong seed; - /// Limit number of output tokens. Default of -1 is unlimited; the number - /// of tokens generated will be limited by other factors. - uint limit; - /// Flip random coins over the top k logits each round. Setting to 1 /// deterministically selects the top logit. uint top_k; /// Flip a random coin between 0 and top_p ( = 90 = 0.9) for logit select. 
- uint top_p; + float top_p; /// Registers the top n result logits in the ctrl block each cycle. uint top_n; @@ -51,59 +51,25 @@ struct ircd_gpt_opts /// Number of target labels to register results for in the ctrl block. uint labels; - /// Bitbar toggling various debug modes + /// Number of pages available after the control block for the frame log. + uint frames; + + /// Limit number of output tokens. Default of -1; other halting conditions + /// will be used. + uint limit; + + /// Bitbar toggling various debug modes. uint debug; - /// Specifies the token context size in tokens. - uint context_tokens; + /// Accepting condition codes. + ushort accept[4][8] __attribute__((aligned(4))); - /// Specifies the token buffer size in tokens. - uint buffer_tokens; + // + // Backside + // - /// Decoding layers. - uint layers; - - /// SIMD lane count. - uint lanes; - - /// Embedding vector elements - uint embed_elems; - - /// Cross-attention dimension - uint attn_rank; - - /// Attention unit fcon width multiple - uint attn_mult; - - /// (computed) MLP unit fcon width multiple - uint ffnn_mult; - - /// (computed) attention unit width multiple - uint attn_elems; - - /// FFNN unit width multiple - uint ffnn_elems; - - /// SIMD lane count - uint lanes; - - /// (computed) `embed_elems` / `lanes` - uint embed_width; - - /// (computed) Attention unit X dimension - uint attn_width; - - /// (computed) Attention unit Y dimension - uint attn_height; - - /// (computed) MLP backend X dimension - uint ffnn_width; - - /// (computed) MLP backend Y dimension - uint ffnn_height; - - /// Number of possible target n-grams. - uint logits; + /// Samples per step. + uint batch_size; /// Training steps uint training_steps; @@ -122,15 +88,90 @@ struct ircd_gpt_opts /// Denorm smoothing float epsilon; + + /// Tuning convergence rate + float lambda; + + // + // Model dimensions + // + + /// Number of possible target n-grams. + uint logits; + + /// Specifies the token buffer size in tokens. 
+ uint buffer_tokens; + + /// Specifies the token context size in tokens. + uint context_tokens; + + /// Decoding layers. + uint layers; + + /// SIMD lane count. + uint lanes; + + /// Embedding vector elements + uint embed_elems; + + /// (computed) `embed_elems` / `lanes` + uint embed_width; + + /// Cross-attention dimension + uint attn_rank; + + /// Attention unit fcon width multiple + uint attn_mult; + + /// (computed) attention unit width multiple + uint attn_elems; + + /// (computed) Attention unit X dimension + uint attn_fcon_width; + + /// (computed) Attention unit Y dimension + uint attn_fcon_height; + + /// (computed) Attention unit X dimension + uint attn_proj_width; + + /// (computed) Attention unit Y dimension + uint attn_proj_height; + + /// (computed) Packed attention array total element count + uint attn_self_elems; + + /// MLP unit fcon width multiple + uint ffnn_mult; + + /// (computed) FFNN unit width multiple + uint ffnn_elems; + + /// (computed) MLP backend X dimension + uint ffnn_fcon_width; + + /// (computed) MLP backend Y dimension + uint ffnn_fcon_height; + + /// (computed) MLP backend X dimension + uint ffnn_proj_width; + + /// (computed) MLP backend Y dimension + uint ffnn_proj_height; } __attribute__((aligned(4096))); -#ifdef __cplusplus +#if defined(__cplusplus) namespace ircd::gpt { using opts = ::ircd_gpt_opts; } +#endif +#if defined(__cplusplus) static_assert(sizeof(struct ircd_gpt_opts) == 4096); +#endif + +#if defined(__cplusplus) && defined(__GLIBCXX__) static_assert(std::is_standard_layout::value); #endif diff --git a/include/ircd/gpt/pipe/code.h b/include/ircd/gpt/pipe/code.h index c61020622..f8d7ce6f7 100644 --- a/include/ircd/gpt/pipe/code.h +++ b/include/ircd/gpt/pipe/code.h @@ -16,12 +16,15 @@ struct ircd::gpt::pipe::code :cl::code { static conf::item default_path; - static conf::item default_opts; + static conf::item default_compile_opts; + static conf::item default_link_opts; static conf::item cache_path; static string_view 
make_cache_path(const mutable_buffer &); - static cl::code from_cache(const string_view &opts, const string_view &path); - static cl::code from_source(const string_view &opts); + + static cl::code from_cache(); + static cl::code from_source(const string_view &comp_opts = {}, const string_view &link_opts = {}); + static cl::code from_bitcode(const string_view &link_opts = {}); void set_cache(const string_view &path); bool put_cache(); diff --git a/include/ircd/gpt/pipe/cycle.h b/include/ircd/gpt/pipe/cycle.h new file mode 100644 index 000000000..73da93d14 --- /dev/null +++ b/include/ircd/gpt/pipe/cycle.h @@ -0,0 +1,48 @@ +// Matrix Construct +// +// Copyright (C) Matrix Construct Developers, Authors & Contributors +// Copyright (C) 2016-2022 Jason Volk +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice is present in all copies. The +// full license for this software is available in the LICENSE file. + +#pragma once +#define HAVE_IRCD_GPT_PIPE_CYCLE_H + +namespace ircd::gpt::pipe +{ + const gpt::ctrl &acquire(cycle &); +} + +/// Perform one task cycle on the device. The cycle is an atomic unit of +/// generation producing one token. +/// +/// Constructions of this object enqueue device commands to complete an +/// additional cycle of the task as provided by `ctrl` and `opts`. +/// +/// Destructions of this object yield the ircd::ctx until those commands +/// are complete. 
+/// +struct ircd::gpt::pipe::cycle +{ + struct profile; + + static constexpr size_t stages + { + 4 + 3 + (12 * 2) + 4 + 2 + (12 * 2) + 1 + }; + + pipe::desc &desc; + uint tick; + uint count; + uint tokens; + uint cached; + uint frame; + pipe::range range; + std::array stage; + + cycle(gpt::samp &); + ~cycle() noexcept; +}; diff --git a/include/ircd/gpt/pipe/desc.h b/include/ircd/gpt/pipe/desc.h index b68147db6..0300d39e2 100644 --- a/include/ircd/gpt/pipe/desc.h +++ b/include/ircd/gpt/pipe/desc.h @@ -16,43 +16,62 @@ struct ircd::gpt::pipe::desc { struct layer; + // Model descriptor pipe::model *model; + + // Code descriptor pipe::code *code; + // Memories cl::data - state, // [root] projection (layers * tokens * embed * 3 * float) + opts, // [root] options page + ctrl, // [root] control page master, // [root] single allocation for additional buffers: + state, // [-sub] projection (layers * tokens * embed * 3 * float) accum, // [-sub] accumulator (tokens * embed * float) logit, // [-sub] result logit vector (50257 * float) - logsm, // [-sub] outputs distribution (50257 * float) - ctrl, // [root] control page - opts; // [root] options page + attns, // [-sub] result attention softmax + frame[8]; // [root] result stream + // Programs cl::kern + alloc, + enter, lm_embed, lm_norm, lm_logit, lm_logsm, lm_select, - lm_norm_backprop, - lm_embed_backprop; + lm_prop_embed, + lm_prop_norm, + leave[8]; - std::unique_ptr - layer[12]; + /// Coil pack + std::unique_ptr layer[12]; - desc(pipe::code &, pipe::model &); + /// Attention projection for first N tokens already contained in `state`. 
+ uint cached {0}; + + desc(const gpt::opts *const &, + gpt::ctrl *const &, + pipe::model &model, + pipe::code &code); }; +/// Pipe descriptor: coil layer struct ircd::gpt::pipe::desc::layer { cl::data - state; // [-sub] qry/key/val projection (tokens * embed * 3 * float) + state, // [-sub] qry/key/val projection (tokens * embed * 3 * float) + attns; // [-sub] attn softmax result (((tokens * tokens) / 2) * 12 * float) cl::kern - negative, - positive, - backattn, - backffnn; + attn, + ffnn, + prop_attn, + prop_ffnn; - layer(pipe::desc &, const int); + layer(pipe::desc &, + const gpt::opts *const &, + const uint laynum); }; diff --git a/include/ircd/gpt/pipe/exec.h b/include/ircd/gpt/pipe/exec.h deleted file mode 100644 index 3cec74cb8..000000000 --- a/include/ircd/gpt/pipe/exec.h +++ /dev/null @@ -1,65 +0,0 @@ -// Matrix Construct -// -// Copyright (C) Matrix Construct Developers, Authors & Contributors -// Copyright (C) 2016-2021 Jason Volk -// -// Permission to use, copy, modify, and/or distribute this software for any -// purpose with or without fee is hereby granted, provided that the above -// copyright notice and this permission notice is present in all copies. The -// full license for this software is available in the LICENSE file. - -#pragma once -#define HAVE_IRCD_GPT_PIPE_EXEC_H - -/// Perform one task cycle on the device. -/// -/// Constructions of this object enqueue device commands to complete an -/// additional epoch of the task as provided by `ctrl` and `opts`. -/// -/// Destructions of this object yield the ircd::ctx until those commands -/// are complete. -/// -/// Consecutive cycles on the device without stopping (a.k.a. pipelining) is -/// achieved by constructing several objects before following with destructions -/// i.e in a std::deque. -/// -struct ircd::gpt::pipe::exec -{ - pipe::desc *desc; - - const_buffer - send_opts, // Set when sending the options page. - send_ctrl, // Set when sending the control page. 
- send_coil, // Set when sending the updated model. - send_head; // Set when sending the updated model. - - mutable_buffer - recv_ctrl; // Set when receiving the control page. - - cl::kern::range - range_full, - range_last, - range_lm_embed, // Dimension range of the lm_embed kernel. - range_negative, // Dimension range of a layer kernel. - range_positive, // Dimension range of a layer kernel. - range_lm_norm, // Dimension range of the final norm kernel. - range_lm_logit, // Dimension range of the language logit kernel. - range_lm_logsm, // Dimension range of the language statistic kernel. - range_lm_select; // Dimension range of the language token kernel. - - cl::exec - release_opts, // Release the options page. - release_ctrl, // Release the control page. - release_coil, // Release updates to the model. - release_head, // Release updates to the model. - lm_embed, // Compute token and positional embeddings. - coil[12 * 2], // Pass over all layers. - lm_norm, // Final normalization. - lm_logit, // Compute language logits. - lm_logsm, // Statistics on the logits. - lm_select, // Select next token. - acquire_ctrl; // Acquire the control page. 
- - exec(task &, const size_t tokens, const bool rel, const bool acq); - ~exec() noexcept; -}; diff --git a/include/ircd/gpt/pipe/model.h b/include/ircd/gpt/pipe/model.h index ce62bd671..24dde174a 100644 --- a/include/ircd/gpt/pipe/model.h +++ b/include/ircd/gpt/pipe/model.h @@ -21,15 +21,15 @@ struct ircd::gpt::pipe::model struct attn; struct ffnn; struct block; + struct embed; struct decoder; - struct language; + const gpt::model::decoder *decode_const {nullptr}; + gpt::model::decoder *decode_mutable {nullptr}; std::unique_ptr decode; - std::unique_ptr embed; - bool invalid {false}; - model(const gpt::model::decoder &, const gpt::model::embed &); - model(gpt::model::decoder &, gpt::model::embed &); + model(const gpt::model::decoder &); + model(gpt::model::decoder &); ~model() noexcept; }; @@ -49,8 +49,8 @@ struct ircd::gpt::pipe::model::tensor bias, weight; - tensor(cl::data *, const off_t, const const_buffer &bias, const const_buffer &weight); - tensor(cl::data *, const off_t, const mutable_buffer &bias, const mutable_buffer &weight); + tensor(cl::data *, const off_t, const const_buffer &bias, const off_t, const const_buffer &weight); + tensor(cl::data *, const off_t, const mutable_buffer &bias, const off_t, const mutable_buffer &weight); }; struct ircd::gpt::pipe::model::attn @@ -60,8 +60,8 @@ struct ircd::gpt::pipe::model::attn fcon, proj; - attn(cl::data *, const off_t, const gpt::model::norm &, const gpt::model::attn &); - attn(cl::data *, const off_t, gpt::model::norm &, gpt::model::attn &); + attn(cl::data *, const off_t, const gpt::model::attn &); + attn(cl::data *, const off_t, gpt::model::attn &); }; struct ircd::gpt::pipe::model::ffnn @@ -71,40 +71,33 @@ struct ircd::gpt::pipe::model::ffnn fcon, proj; - ffnn(cl::data *, const off_t, const gpt::model::norm &, const gpt::model::ffnn &); - ffnn(cl::data *, const off_t, gpt::model::norm &, gpt::model::ffnn &); + ffnn(cl::data *, const off_t, const gpt::model::ffnn &); + ffnn(cl::data *, const off_t, 
gpt::model::ffnn &); }; struct ircd::gpt::pipe::model::block { - // Single layer memory roots - cl::data - master[3]; + model::attn + attn; - // Layer units - model::attn attn; - model::ffnn ffnn; + model::ffnn + ffnn; block(cl::data *, const off_t, const gpt::model::block &, const size_t); block(cl::data *, const off_t, gpt::model::block &, const size_t); - block(const gpt::model::block &, const size_t); - block(gpt::model::block &, const size_t); }; -struct ircd::gpt::pipe::model::language +struct ircd::gpt::pipe::model::embed { - cl::data - master[3]; + tensor + norm; matrix pos, token; - language(cl::data *, const off_t, const gpt::model::embed &); - language(cl::data *, const off_t, gpt::model::embed &); - language(const gpt::model::embed &); - language( gpt::model::embed &); - ~language() noexcept; + embed(cl::data *, const off_t, const gpt::model::embed &); + embed(cl::data *, const off_t, gpt::model::embed &); }; struct ircd::gpt::pipe::model::decoder @@ -115,10 +108,11 @@ struct ircd::gpt::pipe::model::decoder // Layer blocks model::block - block[12]; + layer[12]; - // Final norm - tensor norm; + // Language model head + model::embed + embed; decoder(const gpt::model::decoder &); decoder(gpt::model::decoder &); diff --git a/include/ircd/gpt/pipe/pipe.h b/include/ircd/gpt/pipe/pipe.h index 8b29965f1..c309721cd 100644 --- a/include/ircd/gpt/pipe/pipe.h +++ b/include/ircd/gpt/pipe/pipe.h @@ -13,21 +13,21 @@ namespace ircd::gpt::pipe { - struct model; struct code; + struct model; struct desc; - struct exec; + struct range; + struct cycle; + struct prof; - extern model *default_model; - extern code *default_code; - extern desc *default_desc; - - void generate(task &); + extern conf::item queue_cycles; void init(), fini() noexcept; }; -#include "model.h" #include "code.h" +#include "model.h" #include "desc.h" -#include "exec.h" +#include "range.h" +#include "cycle.h" +#include "prof.h" diff --git a/include/ircd/gpt/pipe/prof.h b/include/ircd/gpt/pipe/prof.h new 
file mode 100644 index 000000000..e3bb0a418 --- /dev/null +++ b/include/ircd/gpt/pipe/prof.h @@ -0,0 +1,56 @@ +// Matrix Construct +// +// Copyright (C) Matrix Construct Developers, Authors & Contributors +// Copyright (C) 2016-2022 Jason Volk +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice is present in all copies. The +// full license for this software is available in the LICENSE file. + +#pragma once +#define HAVE_IRCD_GPT_PIPE_PROF_H + +namespace ircd::gpt::pipe +{ + string_view debug(const mutable_buffer &, const prof &, const size_t &pos); + string_view debug(const mutable_buffer &, const prof &); +} + +/// Extract profiling information for a cycle. This object contains timing +/// state integers for each corresponding stage of the cycle. +/// +/// Default constructions initialize to zero and the state can also be used +/// as an accumulator. 
+struct ircd::gpt::pipe::prof +{ + static constexpr size_t stages + { + cycle::stages + }; + + static constexpr size_t phases + { + num_of() + }; + + using phase = cl::work::prof::phase; + using phase_array = cl::work::prof; + using stage_array = std::array; + using info_type = std::tuple; + using info_array = std::array; + using info_name_array = std::array; + + static info_name_array name; + static info_array info; + + private: + static bool init; + static void init_info(const pipe::cycle &); + + public: + stage_array ts; + + prof(const pipe::cycle &); + prof() noexcept; +}; diff --git a/include/ircd/gpt/pipe/range.h b/include/ircd/gpt/pipe/range.h new file mode 100644 index 000000000..3538c3101 --- /dev/null +++ b/include/ircd/gpt/pipe/range.h @@ -0,0 +1,39 @@ +// Matrix Construct +// +// Copyright (C) Matrix Construct Developers, Authors & Contributors +// Copyright (C) 2016-2022 Jason Volk +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice is present in all copies. The +// full license for this software is available in the LICENSE file. 
+ +#pragma once +#define HAVE_IRCD_GPT_PIPE_RANGE_H + +struct ircd::gpt::pipe::range +{ + cl::kern::range + _full, + _last, + alloc, + embed, + attn, + ffnn, + fffnn, + fnorm, + logit, + logsm, + select, + prop_embed, + prop_norm, + prop_attn, + prop_ffnn; + + range(const uint tick, + const uint count, + const uint tokens, + const uint cached, + const bool fwd, + const bool rev) noexcept; +}; diff --git a/include/ircd/gpt/samp.h b/include/ircd/gpt/samp.h new file mode 100644 index 000000000..a0c3a6e68 --- /dev/null +++ b/include/ircd/gpt/samp.h @@ -0,0 +1,49 @@ +// Matrix Construct +// +// Copyright (C) Matrix Construct Developers, Authors & Contributors +// Copyright (C) 2016-2022 Jason Volk +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice is present in all copies. The +// full license for this software is available in the LICENSE file. + +#pragma once +#define HAVE_IRCD_GPT_SAMP_H + +/// Perform one task sample on the device. The sample is a sequence of cycles +/// which generate tokens until satisfying a halting condition. The number of +/// cycles for the sample is limited by the size of the context buffer. 
+/// +struct ircd::gpt::samp +{ + gpt::step &step; + pipe::desc &desc; + + const gpt::opts &opts; + gpt::ctrl &ctrl; + + const uint id; + + int accept; + uint dispatch; + uint cycle; + uint tokens; + uint count; + + pipe::prof profile; + std::deque queue; + + public: + void profile_accumulate(const pipe::prof &); + bool retire(pipe::cycle &, const gpt::ctrl &); + bool evaluate(pipe::cycle &); + uint tokenize(); + + public: + bool done() const noexcept; + bool operator()(); + + samp(gpt::step &); + ~samp() noexcept; +}; diff --git a/include/ircd/gpt/step.h b/include/ircd/gpt/step.h new file mode 100644 index 000000000..3cadbf7a3 --- /dev/null +++ b/include/ircd/gpt/step.h @@ -0,0 +1,40 @@ +// Matrix Construct +// +// Copyright (C) Matrix Construct Developers, Authors & Contributors +// Copyright (C) 2016-2022 Jason Volk +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice is present in all copies. The +// full license for this software is available in the LICENSE file. + +#pragma once +#define HAVE_IRCD_GPT_STEP_H + +/// Perform one task step on the device. The step is a sequence of cycles +/// which generate tokens until satisfying a halting condition. The number of +/// cycles for the step is limited by the size of the context buffer. 
+/// +struct ircd::gpt::step +{ + gpt::epoch &epoch; + pipe::desc &desc; + + const gpt::opts &opts; + gpt::ctrl &ctrl; + + const uint id; + const uint start; + + pipe::prof profile; + + void profile_accumulate(const pipe::prof &); + bool backpropagate(); + + public: + bool done() const noexcept; + bool operator()(); + + step(gpt::epoch &); + ~step() noexcept; +}; diff --git a/include/ircd/gpt/task.h b/include/ircd/gpt/task.h index 0521100a3..243f77d65 100644 --- a/include/ircd/gpt/task.h +++ b/include/ircd/gpt/task.h @@ -12,6 +12,14 @@ #define HAVE_IRCD_GPT_TASK_H #ifdef __cplusplus +namespace ircd::gpt +{ + void seed(task &, const uint64_t &) noexcept; + void seed(task &) noexcept; + void clear(task &) noexcept; + void reset(task &) noexcept; +} + /// Task Context /// /// State for a task. @@ -22,27 +30,25 @@ struct ircd::gpt::task /// Reference to the attached options. const gpt::opts *opts {nullptr}; - /// Reference to control pages. + /// Reference to user's control block. gpt::ctrl *ctrl {nullptr}; - /// Current task status. - enum status status {'\0'}; + /// Pipe code + std::unique_ptr code; + + /// Pipe model + std::unique_ptr model; + + /// Pipe state + pipe::desc desc; + + public: + bool done() const noexcept; + bool operator()(); + + task(const gpt::opts * = nullptr, + gpt::ctrl * = nullptr); - task(const gpt::opts * = nullptr, gpt::ctrl * = nullptr); ~task() noexcept; }; - -/// The current status of a task is indicated with intelligible characters -enum ircd::gpt::task::status -:char -{ - QUEUED = 'Q', ///< Queued for execution. - RUNNING = 'R', ///< Currently being executed. - ACCEPT = 'A', ///< Execution completed successfully. - ERROR = 'E', ///< Execution did not complete successfully. 
-}; - -static_assert(sizeof(struct ircd_gpt_ctrl) == 4096); -static_assert(offsetof(struct ircd_gpt_ctrl, token) == 2048); -static_assert(std::is_standard_layout::value); #endif diff --git a/include/ircd/gpt/token.h b/include/ircd/gpt/token.h index 7cd963999..b928c8758 100644 --- a/include/ircd/gpt/token.h +++ b/include/ircd/gpt/token.h @@ -11,127 +11,67 @@ #pragma once #define HAVE_IRCD_GPT_TOKEN_H -union ircd_gpt_token +namespace ircd::gpt { - float - word[768], - attn[12][64]; + struct token; +} + +/// Token is just a 16-bit index into the vocabulary. This lightweight wrapper +/// convenience constructs from a string lookup or from a u16 directly. +class ircd::gpt::token +{ + uint16_t val; + + public: + operator const uint16_t &() const; + operator uint16_t &(); + + operator string_view() const; + + token(const_buffer &buf) noexcept; + token(const string_view &); + token(const uint16_t &) noexcept; }; -#ifdef __OPENCL_C_VERSION__ -union ircd_gpt_tokenv +static_assert(sizeof(ircd::gpt::token) == sizeof(uint16_t)); +static_assert(std::is_standard_layout::value); + +/// Direct construction; no lookup +inline +ircd::gpt::token::token(const uint16_t &val) +noexcept +:val{val} +{} + +/// Must resolve to one token or error thrown. 
+inline +ircd::gpt::token::token(const string_view &str) +:val{vocab::tokenize(str)} +{} + +/// Consumes input for one token off front of buf +inline +ircd::gpt::token::token(const_buffer &buf) +noexcept +:val{vocab::tokenize(buf)} +{} + +inline ircd::gpt::token::operator +string_view() +const { - float4 - word[768/4], - attn[12][64/4]; + return vocab::token[val]; +} - union ircd_gpt_token - token; -}; -#endif - -struct ircd_gpt_attn_qkv +inline ircd::gpt::token::operator +uint16_t &() { - union ircd_gpt_token - qry, - key, - val; -}; + return val; +} -#ifdef __OPENCL_C_VERSION__ -struct ircd_gpt_attn_qkvv +inline ircd::gpt::token::operator +const uint16_t &() +const { - union ircd_gpt_tokenv - qry, - key, - val; -}; -#endif - -union ircd_gpt_attn_aperature -{ - float - fcon[2304], - proj[3][768], - qkv[3][12][64]; - - union ircd_gpt_token - token[3]; -}; - -#ifdef __OPENCL_C_VERSION__ -union ircd_gpt_attn_aperaturev -{ - float4 - fcon[2304/4], - proj[3][768/4], - qkv[3][12][64/4]; - - union ircd_gpt_token_f32x4 - token[3]; -}; - -union ircd_gpt_attn_aperature_f32x8 -{ - float8 - fcon[2304/8], - proj[3][768/8], - qkv[3][12][64/8]; - - union ircd_gpt_token_f32x8 - token[3]; -}; - -union ircd_gpt_attn_aperature_f32x16 -{ - float16 - fcon[2304/16], - proj[3][768/16], - qkv[3][12][64/16]; - - union ircd_gpt_token_f32x16 - token[3]; -}; -#endif - -union ircd_gpt_ffnn_aperature -{ - float - fcon[3072], - proj[4][768]; - - union ircd_gpt_token - token[4]; -}; - -#ifdef __OPENCL_C_VERSION__ -union ircd_gpt_ffnn_aperaturev -{ - float4 - fcon[3072/4], - proj[4][768/4]; - - union ircd_gpt_token_f32x4 - token[4]; -}; - -union ircd_gpt_ffnn_aperature_f32x8 -{ - float8 - fcon[3072/8], - proj[4][768/8]; - - union ircd_gpt_token_f32x4 - token[4]; -}; - -union ircd_gpt_ffnn_aperature_f32x16 -{ - float16 - fcon[3072/16], - proj[4][768/16]; - - union ircd_gpt_token_f32x4 - token[4]; -}; -#endif + return val; +} diff --git a/include/ircd/gpt/vector.h b/include/ircd/gpt/vector.h new 
file mode 100644 index 000000000..fd97bf619 --- /dev/null +++ b/include/ircd/gpt/vector.h @@ -0,0 +1,147 @@ +// Matrix Construct +// +// Copyright (C) Matrix Construct Developers, Authors & Contributors +// Copyright (C) 2016-2022 Jason Volk +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice is present in all copies. The +// full license for this software is available in the LICENSE file. + +#pragma once +#define HAVE_IRCD_GPT_VECTOR_H + +#if !defined(__SIZEOF_FLOAT4__) && defined(__OPENCL_VERSION__) + #define __SIZEOF_FLOAT4__ 16 +#endif + +#if !defined(__SIZEOF_FLOAT8__) && defined(__OPENCL_VERSION__) + #define __SIZEOF_FLOAT8__ 32 +#endif + +#if !defined(__SIZEOF_FLOAT16__) && defined(__OPENCL_VERSION__) + #define __SIZEOF_FLOAT16__ 64 +#endif + +#ifndef __OPENCL_VERSION__ + #define __constant +#endif + +static __constant const uint +ircd_gpt_context_tokens = 512, // 1024, +ircd_gpt_vector_elems = 768, +ircd_gpt_attn_rank = 12, +ircd_gpt_attn_segs = 3, +ircd_gpt_ffnn_segs = 4; + +static __constant const uint +ircd_gpt_vector_attn_elems = ircd_gpt_vector_elems / ircd_gpt_attn_rank, +ircd_gpt_attn_fcon_elems = ircd_gpt_vector_elems * ircd_gpt_attn_segs, +ircd_gpt_ffnn_fcon_elems = ircd_gpt_vector_elems * ircd_gpt_ffnn_segs; + +// +// embed vector +// + +#if defined(__SIZEOF_FLOAT__) +union ircd_gpt_vector +{ + float + elem[ircd_gpt_vector_elems], + attn[ircd_gpt_attn_rank][ircd_gpt_vector_attn_elems]; +}; +#endif + +#if defined(__SIZEOF_FLOAT4__) +union ircd_gpt_vector_f32x4 +{ + float4 + elem[ircd_gpt_vector_elems / 4], + attn[ircd_gpt_attn_rank][ircd_gpt_vector_attn_elems / 4]; + + union ircd_gpt_vector + vector; +}; +#endif + +// +// attn qkv +// + +#if defined(__SIZEOF_FLOAT__) +struct ircd_gpt_attn_qkv +{ + union ircd_gpt_vector + qry, + key, + val; +}; +#endif + +#if defined(__SIZEOF_FLOAT4__) +struct 
ircd_gpt_attn_qkv_f32x4 +{ + union ircd_gpt_vector_f32x4 + qry, + key, + val; +}; +#endif + +// +// attn aperature +// + +#if defined(__SIZEOF_FLOAT__) +union ircd_gpt_attn_aperature +{ + float + fcon[ircd_gpt_attn_fcon_elems], + proj[ircd_gpt_attn_segs][ircd_gpt_vector_elems], + qkv[ircd_gpt_attn_segs][ircd_gpt_attn_rank][ircd_gpt_vector_attn_elems]; + + union ircd_gpt_vector + vector[ircd_gpt_attn_segs]; +}; +#endif + +#if defined(__SIZEOF_FLOAT4__) +union ircd_gpt_attn_aperature_f32x4 +{ + float4 + fcon[ircd_gpt_attn_fcon_elems / 4], + proj[ircd_gpt_attn_segs][ircd_gpt_vector_elems / 4], + qkv[ircd_gpt_attn_segs][ircd_gpt_attn_rank][ircd_gpt_vector_attn_elems / 4]; + + union ircd_gpt_vector_f32x4 + vector[ircd_gpt_attn_segs]; +}; +#endif + +// +// ffnn aperature +// + +#if defined(__SIZEOF_FLOAT__) +union ircd_gpt_ffnn_aperature +{ + float + fcon[ircd_gpt_ffnn_fcon_elems], + proj[ircd_gpt_ffnn_segs][ircd_gpt_vector_elems]; + + union ircd_gpt_vector + vector[ircd_gpt_ffnn_segs]; +}; +#endif + +#if defined(__SIZEOF_FLOAT4__) +union ircd_gpt_ffnn_aperature_f32x4 +{ + float4 + fcon[ircd_gpt_ffnn_fcon_elems / 4], + proj[ircd_gpt_ffnn_segs][ircd_gpt_vector_elems / 4]; + + union ircd_gpt_vector_f32x4 + vector[ircd_gpt_ffnn_segs]; +}; +#endif diff --git a/include/ircd/gpt/vocab.h b/include/ircd/gpt/vocab.h index 04b6fb8a4..ac314d797 100644 --- a/include/ircd/gpt/vocab.h +++ b/include/ircd/gpt/vocab.h @@ -34,11 +34,17 @@ namespace ircd::gpt::vocab merges_path; // Tokenize UTF-8 input string of any length into proper token values, - vector_view tokenize(const vector_view &out, const string_view &in); + vector_view tokenize(const vector_view &out, const string_view &in) noexcept; + + // Tokenize one token. The buffer is advanced consuming one token per call. + u16 tokenize(const_buffer &) noexcept; + + // Tokenize one token. Error thrown if input is not exactly one token. + u16 tokenize(const string_view &in); // Decode token values to build output text string. 
- string_view detokenize(const mutable_buffer &out, const vector_view &in); + string_view detokenize(const mutable_buffer &out, const vector_view &in) noexcept; // Other tools - string_view debug(const mutable_buffer &buf, const u16 token); + string_view debug(const mutable_buffer &buf, const u16 token, const uint fmt_msk = -1U); } diff --git a/ircd/Makefile.am b/ircd/Makefile.am index da5d53ea4..69c3057f6 100644 --- a/ircd/Makefile.am +++ b/ircd/Makefile.am @@ -53,6 +53,7 @@ if LTO if CLANG AM_CXXFLAGS += -fstrict-vtable-pointers AM_CXXFLAGS += -fwhole-program-vtables +#AM_LDFLAGS += -Wl,-plugin-opt,-pass-remarks='.*' endif endif @@ -87,22 +88,6 @@ AM_LDFLAGS += -Wl,--enable-runtime-pseudo-reloc AM_LDFLAGS += -export-symbols-regex '*' endif -ROCKSDB_SRC_CPPFLAGS =# -ROCKSDB_SRC_CPPFLAGS += -isystem $(top_srcdir)/deps/rocksdb/include -ROCKSDB_SRC_CPPFLAGS += -isystem $(top_srcdir)/deps/rocksdb - -GPT_FP_CXXFLAGS =# -GPT_FP_CXXFLAGS += -fno-math-errno -GPT_FP_CXXFLAGS += -fno-trapping-math -GPT_FP_CXXFLAGS += -ffinite-math-only -GPT_FP_CXXFLAGS += -fno-signed-zeros -GPT_FP_CXXFLAGS += -fassociative-math -GPT_FP_CXXFLAGS += -ffp-contract=fast -GPT_FP_CXXFLAGS += -freciprocal-math -if CLANG -GPT_FP_CXXFLAGS += -fdenormal-fp-math=positive-zero -endif - libircddir = @libdir@ libircd_LTLIBRARIES = libircd.la @@ -241,16 +226,12 @@ libircd_la_SOURCES += png.cc if OPENCL libircd_la_SOURCES += cl.cc endif -libircd_la_SOURCES += gpt.cc -libircd_la_SOURCES += gpt_pipe.cc -libircd_la_SOURCES += gpt_model.cc libircd_la_SOURCES += gpt_vocab.cc +libircd_la_SOURCES += gpt_model.cc +libircd_la_SOURCES += gpt_pipe_code.cc +libircd_la_SOURCES += gpt_pipe.cc libircd_la_SOURCES += gpt_cpu.cc -if OPENCL -if CLANG -BUILT_SOURCES += gpt_gpu.o -endif -endif +libircd_la_SOURCES += gpt.cc libircd_la_SOURCES += openssl.cc libircd_la_SOURCES += rfc1459.cc libircd_la_SOURCES += rfc3986.cc @@ -283,6 +264,23 @@ libircd_la_SOURCES += ircd.cc # Specific unit option composition # 
+ROCKSDB_SRC_CPPFLAGS =# +ROCKSDB_SRC_CPPFLAGS += -isystem $(top_srcdir)/deps/rocksdb/include +ROCKSDB_SRC_CPPFLAGS += -isystem $(top_srcdir)/deps/rocksdb + +GPT_FP_CXXFLAGS =# +GPT_FP_CXXFLAGS += -fno-math-errno +GPT_FP_CXXFLAGS += -fno-trapping-math +GPT_FP_CXXFLAGS += -ffinite-math-only +GPT_FP_CXXFLAGS += -fno-signed-zeros +GPT_FP_CXXFLAGS += -fassociative-math +GPT_FP_CXXFLAGS += -freciprocal-math +GPT_FP_CXXFLAGS += -ffp-contract=fast +if CLANG +GPT_FP_CXXFLAGS += -fdenormal-fp-math=positive-zero +GPT_FP_CXXFLAGS += -ffp-model=fast +endif + client.lo: AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS} ctx_x86_64.lo: AM_CPPFLAGS := -I$(top_srcdir)/include ctx.lo: AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS} @@ -307,7 +305,7 @@ endif if IOU fs_iou.lo: AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS} endif -gpt.lo: AM_CXXFLAGS := ${AM_CXXFLAGS} ${GPT_FP_CXXFLAGS} +gpt_cpu.lo: AM_CXXFLAGS := ${AM_CXXFLAGS} ${GPT_FP_CXXFLAGS} http.lo: AM_CPPFLAGS := ${SPIRIT_UNIT_CPPFLAGS} ${AM_CPPFLAGS} http.lo: AM_CXXFLAGS := ${SPIRIT_UNIT_CXXFLAGS} ${AM_CXXFLAGS} ios.lo: AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS} @@ -364,12 +362,118 @@ default.profdata: -$(LLVM_PROFDATA) merge -output=default.profdata default.proftext endif +############################################################################### +# +# Hardware Acceleration / sub-targets +# + if CLANG if OPENCL -gpt_gpu.ll: gpt_gpu.cl - $(CC) -S -emit-llvm -std=CL1.1 $(AM_CPPFLAGS) $(CPPFLAGS) $(DEFS) -Xclang -finclude-default-header -include "ircd/config.h" -o $@ -c $^ -gpt_gpu.o: gpt_gpu.ll - $(CC) -std=CL1.1 -o $@ -c $^ -endif -endif +GPU_CPPFLAGS =# +GPU_CPPFLAGS += -D__OPENCL_VERSION__=110 +GPU_CPPFLAGS += -Dcl_clang_storage_class_specifiers +GPU_CPPFLAGS += -DNOFP64 +GPU_CPPFLAGS += $(AM_CPPFLAGS) +GPU_CPPFLAGS += $(CPPFLAGS) +GPU_CPPFLAGS += $(DEFS) + +GPU_CFLAGS =# +GPU_CFLAGS += -std=cl1.1 +GPU_CFLAGS += -fident +GPU_CFLAGS += -fno-builtin +GPU_CFLAGS += -fverbose-asm +GPU_CFLAGS += 
-fno-discard-value-names +GPU_CFLAGS += -mllvm -propagate-attrs=true +GPU_CFLAGS += -O3 +GPU_CFLAGS += -fno-trapping-math +GPU_CFLAGS += -ffinite-math-only +GPU_CFLAGS += -fno-signed-zeros +GPU_CFLAGS += -ffp-contract=fast +#GPU_CFLAGS += -include "ircd/config.h" +#GPU_CFLAGS += -include "clc/clc.h" +#GPU_CFLAGS += -Wassume +#GPU_CFLAGS += -Rpass-analysis=".*" +#GPU_CFLAGS += -Rpass-missed=".*" + +GPU_LINKFLAGS =# +GPU_LINKFLAGS +=# + +GPU_OPTFLAGS =# +GPU_OPTFLAGS += -propagate-attrs=true +GPU_OPTFLAGS += -O3 +#GPU_OPTFLAGS += -opt-bisect-limit=-1 +#GPU_OPTFLAGS += -debug-pass=Arguments +#GPU_OPTFLAGS += -pass-remarks='.*' + +GPU_ASFLAGS =# +GPU_ASFLAGS += -fident +GPU_ASFLAGS += -fno-builtin +GPU_ASFLAGS += -mllvm -propagate-attrs=true +GPU_ASFLAGS += -mllvm -verify-machineinstrs +GPU_ASFLAGS += -O3 +GPU_ASFLAGS += -Rpass-analysis=asm-printer +#GPU_ASFLAGS += -Rpass-analysis=".*" +#GPU_ASFLAGS += -mllvm -debug-pass=Arguments +#GPU_ASFLAGS += -mllvm -pass-remarks='.*' + +# +# SPV +# + +BUILT_SOURCES += gpt_gpu.spv.bc +gpt_gpu.spv.bc: gpt_gpu.cl + clang-13 -target spir-- $(GPU_CPPFLAGS) -O0 -emit-llvm -o $@ -x cl -c $^ + +BUILT_SOURCES += gpt_gpu.spv +gpt_gpu.spv: gpt_gpu.spv.bc + llvm-spirv -o $@ $^ + +CLEANFILES += gpt_gpu.spv.cc +libircd_la_SOURCES += gpt_gpu.spv.cc +gpt_gpu.spv.cc: gpt_gpu.spv + xxd -i $^ $@ + +# +# R600 +# + +R600_TARGET = r600-- + +# +# R600 Saint Barthélemy +# + +R600_BARTS_CFLAGS = $(GPU_CFLAGS) +R600_BARTS_CFLAGS += -target $(R600_TARGET) +R600_BARTS_CFLAGS += -mcpu=barts +R600_BARTS_CFLAGS += -Xclang -mlink-bitcode-file -Xclang /usr/lib/clc/barts-r600--.bc + +BUILT_SOURCES += gpt_gpu.r600_barts.bc +gpt_gpu.r600_barts.bc: gpt_gpu.cl + $(CC) $(R600_BARTS_CFLAGS) $(GPU_CPPFLAGS) -emit-llvm -o $@ -x cl -c $^ + +#BUILT_SOURCES += gpt_gpu.r600_barts.link.bc +gpt_gpu.r600_barts.link.bc: gpt_gpu.r600_barts.bc + llvm-link-14 $(GPU_LINKFLAGS) -o $@ $^ /usr/lib/clc/barts-r600--.bc + opt-14 $(GPU_OPTFLAGS) -o $@ $@ + +#BUILT_SOURCES += 
gpt_gpu.r600_barts.s +gpt_gpu.r600_barts.s: gpt_gpu.r600_barts.link.bc + $(CC) -cc1 $(GPU_ASFLAGS) -triple $(R600_TARGET) -emit-obj -S -o $@ -x ir $^ + +#BUILT_SOURCES += gpt_gpu.r600_barts.o +gpt_gpu.r600_barts.o: gpt_gpu.r600_barts.link.bc + $(CC) -cc1 $(GPU_ASFLAGS) -triple $(R600_TARGET) -emit-obj -o $@ -x ir $^ + +CLEANFILES += gpt_gpu.r600_barts.bc.cc +libircd_la_SOURCES += gpt_gpu.r600_barts.bc.cc +gpt_gpu.r600_barts.bc.cc: gpt_gpu.r600_barts.bc + xxd -i $^ $@ + +# +# +# + +endif # OPENCL +endif # CLANG diff --git a/ircd/gpt.cc b/ircd/gpt.cc index 6cf022c7a..96be65de9 100644 --- a/ircd/gpt.cc +++ b/ircd/gpt.cc @@ -8,13 +8,6 @@ // copyright notice and this permission notice is present in all copies. The // full license for this software is available in the LICENSE file. -namespace ircd::gpt -{ - size_t backprop(task &, const f32, model::decoder &, f32 *const (&)[2], size_t = 0); - - void generate_debug(task &, const uint &, const uint &); -} - decltype(ircd::gpt::log) ircd::gpt::log { @@ -26,20 +19,21 @@ ircd::gpt::generate(const mutable_buffer &out, const string_view &in, task &task) { - u16 buf[2][1024]; + u16 input_buf[1024]; const auto input_tokens { - vocab::tokenize(buf[0], in) + gpt::vocab::tokenize(input_buf, in) }; + u16 output_buf[1024]; const auto output_tokens { - generate(buf[1], input_tokens, task) + generate(output_buf, input_tokens, task) }; const auto output { - vocab::detokenize(out, output_tokens) + gpt::vocab::detokenize(out, output_tokens) }; return output; @@ -50,193 +44,309 @@ ircd::gpt::generate(const vector_view &out, const vector_view &in, task &task) { - assert(task.ctrl); assert(task.opts); - - uint ret(0); - bool halt(false); - - const auto &opts(*task.opts); - auto &ctrl(*task.ctrl); - ctrl.tokens.count = 0; - ctrl.tokens.head = 0; - - uint j(0); - while(j < in.size() && ctrl.tokens.count < opts.buffer_tokens) - ctrl.token[ctrl.tokens.count++] = in[j++]; - - const size_t in_size + const auto &opts { - ctrl.tokens.count + 
*task.opts }; + assert(task.ctrl); + auto &ctrl + { + *task.ctrl + }; + + size_t in_i(0); + while(in_i < in.size() && ctrl.count < opts.buffer_tokens) + if(in[in_i] == 628) + { + ctrl.token[ctrl.count++] = 198; + ctrl.token[ctrl.count++] = 198; + in_i++; + } + else ctrl.token[ctrl.count++] = in[in_i++]; + generate(task); - for(uint i(0); i < ctrl.tokens.count && ret < out.size() && !halt; ++i) - { - const auto j - { - (i + ctrl.tokens.head) % opts.buffer_tokens - }; + size_t out_i(0); + for(; out_i < out.size() && in_i + out_i < ctrl.count; out_i++) + out[out_i] = ctrl.token[in_i + out_i]; - const auto tok - { - ctrl.token[j] - }; - - if(j >= in_size) - out[ret++] = tok; - - if(likely(~opts.debug & 0x01)) - continue; - - if(likely(~opts.debug & 0x02)) - if(j < in_size) - continue; - - generate_debug(task, j, in_size); - } - - ctx::interruption_point(); return vector_view { - out, ret + out, out_i }; } void ircd::gpt::generate(task &task) { - const auto &opts(*task.opts); - auto &ctrl(*task.ctrl); - - const size_t in_size + assert(task.opts); + const auto &opts { - ctrl.tokens.count + *task.opts }; - uint64_t cycles(0); - if(ctrl.prop) + assert(task.ctrl); + auto &ctrl { - static f32 *_momentum[2]; - if(!_momentum[0]) - { - _momentum[0] = new f32[sizeof(model::decoder) / 4] {0.0f}; - _momentum[1] = new f32[sizeof(model::decoder) / 4] {0.0f}; - } - - f32 *const momentum[2] - { - _momentum[0], _momentum[1], - }; - - const prof::scope_cycles task_cycles - { - cycles - }; - - backprop(task, ctrl.label[0].loss.mean, *model::default_model, momentum); - } - - if(ctrl.prop) - { - log::debug - { - log, "Backpropagation of %2.6f in %lu cycles.", - ctrl.label[0].loss.mean, - cycles, - }; - - ctrl.epic.epoch = 0; - ctrl.label[0].loss.mean = 0; - ctrl.label[0].loss.last = ctrl.label[0].loss.mean; - ctrl.label[0].perp.mean = 0; - ctrl.label[0].perp.last = ctrl.label[0].perp.mean; - ctrl.prop = false; - pipe::default_model->invalid = true; - return; - } - - cycles = 0; - 
util::timer stopwatch; - { - const prof::scope_cycles task_cycles - { - cycles - }; - - pipe::generate(task); - } - - const milliseconds last_time - { - stopwatch.at() + *task.ctrl }; - ctrl.epic.elapsed += last_time.count(); + gpt::epoch epoch + { + task + }; + + gpt::step step + { + epoch + }; + + gpt::samp samp + { + step + }; + + bool halt {false}; do + { + gpt::pipe::cycle cycle + { + samp + }; + + halt = !samp.evaluate(cycle); + } + while(!halt); +} + +// +// debug +// + +void +ircd::gpt::log_debug_prof(const opts &opts, + const ctrl &ctrl, + const pipe::prof &prof) +{ + static char + buf[2][512]; + + const auto head + { + debug_head(buf[0], opts, ctrl) + }; + + for(uint i(0); i < prof.stages; ++i) + { + if(!std::get<1>(prof.info[i])) + continue; + + log::logf + { + log, log::level::DEBUG, + "%s %2u: %s", + head, + i, + pipe::debug(buf[1], prof, i), + }; + } } void -ircd::gpt::generate_debug(task &task, - const uint &i, - const uint &in_size) +ircd::gpt::log_debug_topn(const opts &opts, + const ctrl &ctrl) { - const auto &opts(*task.opts); - auto &ctrl(*task.ctrl); + static char + buf[2][512]; - const auto j + const auto head { - (i + ctrl.tokens.head) % opts.buffer_tokens + debug_head(buf[0], opts, ctrl) }; - const auto tok + for(uint i(0); i < opts.top_n; ++i) + log::logf + { + log, log::level::DEBUG, + "%s %s", + head, + debug_top(buf[1], opts, ctrl, i), + }; +} + +void +ircd::gpt::log_debug_labels(const opts &opts, + const ctrl &ctrl) +{ + static char + buf[2][512]; + + const auto head { - ctrl.token[j] + debug_head(buf[0], opts, ctrl) }; - static char dbuf[512]; - static char report[1536]; - static char tmbuf[4][64]; - const size_t bsz(ctrl.tokens.count - in_size); - const size_t report_size = snprintf - ( - report, sizeof(report), - "%-3u %-4u %4lu:%-4lu %6.1f%% %5.1fP %6.3fL [%c%c%c] %5u %6.3fL %6.2fP %5.1f%% %s %04x %8s %8s | %8s", - j, - ctrl.tokens.count, - ctrl.epic.epoch, - ctrl.epic.cycle, - 0.0f, // cert - std::clamp(ctrl.label[0].perp.mean, 
0.0f, 100.0f), - std::clamp(ctrl.label[0].loss.mean, 0.0f, 99.99f), - ctrl.label[0].token == tok? '+': ' ', - ' ', // flag place - ' ', // flag place - ctrl.label[0].token, - std::clamp(ctrl.label[0].loss.last, 0.0f, 99.99f), - std::clamp(ctrl.label[0].perp.last, 0.0f, 100.0f), - 0.0f, // cert - vocab::debug(dbuf, tok).c_str(), - tok, - pretty(tmbuf[0], milliseconds(0ms / bsz), 1).c_str(), - pretty(tmbuf[1], si(0UL / bsz), 1).c_str(), - pretty(tmbuf[2], milliseconds(ctrl.epic.elapsed), 1).c_str() - ); + for(uint i(0); i < opts.labels; ++i) + log::logf + { + log, log::level::DEBUG, + "%s %s", + head, + debug_label(buf[1], opts, ctrl, i, 1), + }; +} + +void +ircd::gpt::log_debug_attns_top(const opts &opts, + const ctrl &ctrl) +{ + static char + buf[8][512]; + + const auto head + { + debug_head(buf[0], opts, ctrl) + }; + + std::map tokm; + for(uint i(0); i < opts.layers; ++i) + for(uint j(0); j < opts.attn_rank; ++j) + tokm[ctrl.attn[i][j]]++; + + std::vector> tok(begin(tokm), end(tokm)); + std::sort(begin(tok), end(tok), [&tokm] + (const auto &a, const auto &b) + { + return b.second < a.second; + }); + + for(const auto &[idx, score] : tok) + { + const auto barsz + { + std::min(score, std::min(80U, uint(sizeof(buf[2]) - 1))) + }; + + memset(buf[2], '|', barsz); + buf[2][barsz] = '\0'; + + log::logf + { + log, log::level::DEBUG, + "%s %s [%3u] %s %-3u", + head, + vocab::debug(buf[1], ctrl.token[idx], 1), + idx, + buf[2], + score, + }; + } +} + +void +ircd::gpt::log_debug_attns(const opts &opts, + const ctrl &ctrl) +{ + static char + buf[2][512]; + + const auto head + { + debug_head(buf[0], opts, ctrl) + }; + + for(uint i(0); i < ctrl.count; ++i) + log::logf + { + log, log::level::DEBUG, + "%s %s", + head, + debug_attn(buf[1], opts, ctrl, i), + }; +} + +void +ircd::gpt::log_debug_token(const opts &opts, + const ctrl &ctrl, + const uint i) +{ + static char + buf[2][512]; log::logf { log, log::level::DEBUG, - "%s", - string_view{report, report_size} + "%s %s", + 
debug_head(buf[0], opts, ctrl), + debug_token_at(buf[1], opts, ctrl, i), }; } +void +ircd::gpt::log_debug(const opts &opts, + const ctrl &ctrl) +{ + static char + buf[2][512]; + + log::logf + { + log, log::level::DEBUG, + "%s %s", + debug_head(buf[0], opts, ctrl), + debug(buf[1], opts, ctrl), + }; +} + +/////////////////////////////////////////////////////////////////////////////// // // gpt::task // +void +ircd::gpt::reset(task &task) +noexcept +{ + clear(task); + seed(task); +} + +void +ircd::gpt::clear(task &task) +noexcept +{ + assert(task.ctrl); + memset(task.ctrl, 0x0, sizeof(gpt::ctrl)); +} + +void +ircd::gpt::seed(task &task) +noexcept +{ + assert(task.opts); + seed(task, task.opts->seed); +} + +void +ircd::gpt::seed(task &task, + const uint64_t &val) +noexcept +{ + assert(task.ctrl); + task.ctrl->rand[0] = val; + task.ctrl->rand[1] = val; + task.ctrl->rand[2] = 65537; + task.ctrl->rand[3] = -1UL; +} + +// +// gpt::task::task +// + ircd::gpt::task::task(const gpt::opts *const opts, gpt::ctrl *const ctrl) +try :opts { opts @@ -245,121 +355,996 @@ ircd::gpt::task::task(const gpt::opts *const opts, { ctrl } -,frame +,code { - new gpt::ctrl[opts->frames] + std::make_unique() +} +,model +{ + std::make_unique + ( + *const_cast(gpt::model::default_model) + ) +} +,desc +{ + this->opts, + this->ctrl, + *this->model, + *this->code, } { - memset(ctrl, 0x0, sizeof(gpt::ctrl)); + assert(aligned(opts, size_t(cl::data::gart_page_size))); + assert(aligned(ctrl, size_t(cl::data::gart_page_size))); + seed(*this, this->opts->seed); } +catch(const std::exception &e) +{ + log::error + { + log, "Task ctor :%s", e.what() + }; + + throw; +} ircd::gpt::task::~task() noexcept { } +bool +ircd::gpt::task::operator()() +{ + gpt::epoch epoch + { + *this + }; + + while(!epoch()) + ctx::interruption_point(); + + return done(); +} + +bool +ircd::gpt::task::done() +const noexcept +{ + return false; +} + +/////////////////////////////////////////////////////////////////////////////// // -// 
gpt::opts +// epoch // -ircd_gpt_opts::ircd_gpt_opts(const ircd::gpt::model::decoder *const model) -noexcept -:model +namespace ircd::gpt { - model?: ircd::gpt::model::default_model + static thread_local u16 marker alignas(64) [1024]; } -,seed + +// +// epoch::epoch +// + +ircd::gpt::epoch::epoch(gpt::task &task) +:task +{ + task +} +,desc +{ + task.desc +} +,opts +{ + *task.opts +} +,ctrl +{ + *task.ctrl +} +,id +{ + ctrl.clk.epoch +} +,start +{ + 0 +} +,stop +{ + std::min(start + uint(opts.batch_size), gpt::model::default_data.size()) +} +,moment +{ + gpt::model::default_moment[0], + gpt::model::default_moment[1], +} +{ + assert(task.opts); + assert(task.ctrl); + + ctrl.clk.step = 0; +} + +ircd::gpt::epoch::~epoch() +noexcept +{ + if(opts.debug & 0x80000000U) + log_debug_prof(opts, ctrl, this->profile); +} + +bool +ircd::gpt::epoch::operator()() +{ + gpt::step step + { + *this + }; + + while(!step()) + ctx::interruption_point(); + + if(!step.backpropagate()) + throw error + { + "Failed to backprop." 
+ }; + + return done(); +} + +bool +ircd::gpt::epoch::done() +const noexcept +{ + return ctrl.clk.epoch != id; +} + +void +ircd::gpt::epoch::profile_accumulate(const pipe::prof &profile) +{ + for(size_t i(0); i < profile.ts.size(); ++i) + for(size_t j(0); j < profile.phases; ++j) + this->profile.ts[i][j] += profile.ts[i][j]; +} + +/////////////////////////////////////////////////////////////////////////////// +// +// step::step +// + +ircd::gpt::step::step(gpt::epoch &epoch) +:epoch +{ + epoch +} +,desc +{ + epoch.desc +} +,opts +{ + epoch.opts +} +,ctrl +{ + epoch.ctrl +} +,id +{ + ctrl.clk.step +} +,start +{ + ctrl.clk.step * opts.batch_size +} +{ + assert(opts.batch_size > 0); + + ctrl.clk.samp = 0; + ctrl.hit = 0; + ctrl.miss = 0; + ctrl.target.ppl = {{0}}; + ctrl.target.loss = {{0}}; + ctrl.select.ppl = {{0}}; + ctrl.select.loss = {{0}}; + + for(uint i(0); i < opts.labels; ++i) + { + ctrl.label[i].ppl = {{0}}; + ctrl.label[i].loss = {{0}}; + } +} + +ircd::gpt::step::~step() +noexcept +{ + if(opts.debug & 0x40000000U) + log_debug_prof(opts, ctrl, this->profile); +} + +bool +ircd::gpt::step::backpropagate() +{ + const auto hit + { + ctrl.target.logit.token == ctrl.select.logit.token + }; + + const auto select_loss_mean + { + ctrl.select.loss.mean + }; + + const auto target_loss_mean + { + ctrl.target.loss.mean + }; + + const auto loss_mean + { + (target_loss_mean + select_loss_mean) / 2.0f + }; + + static float mean_best { 10000.0f }, target_mean_best { 10000.0f }; + static ulong hit_best; + static bool tack, last_tack; + last_tack = tack; + + const auto loss + { + loss_mean + }; + + const bool improve_global + { + target_loss_mean < target_mean_best + }; + + const bool improve + { + improve_global + }; + + if(improve) + mean_best = loss, + target_mean_best = target_loss_mean, + hit_best = ctrl.hit; + else + tack = !tack; + + const auto grad + { + !tack? 
loss : -loss + }; + + const auto steps + { + (opts.training_steps + opts.validation_steps + opts.testing_steps) / opts.batch_size + }; + + const auto step + { + this->epoch.id * steps + this->id + }; + + log::logf + { + log, improve? log::level::INFO: log::level::ERROR, + "epoch:%u step:%u completed range[%u -> %zu] dsid:%u target:%-10.7f select:%-10.7f loss:%-10.7f [ %10.7f ] hit:%u miss:%u", + this->epoch.id, + step, + this->start, + this->start + opts.batch_size, + this->id * opts.batch_size + ctrl.clk.samp, + target_loss_mean, + select_loss_mean, + loss, + grad * opts.alpha, + ctrl.hit, + ctrl.miss, + }; + + if(!opts.alpha) + return true; + + if(!improve) + return false; + + cl::exec + { + desc.model->decode->master[0], std::memory_order_acq_rel + }; + + auto &model + { + *mutable_cast(desc.model->decode_const) + }; + + const mutable_buffer model_buffer + { + reinterpret_cast(&model), + sizeof(gpt::model::decoder) * 3 + }; + + const mutable_buffer checkpoint_buffer + { + reinterpret_cast(&model) + sizeof(gpt::model::decoder) * 3, + sizeof(gpt::model::decoder) * 3 + }; + + if(improve) + copy(checkpoint_buffer, model_buffer); + else + copy(model_buffer, checkpoint_buffer); + + ircd::timer stopwatch; + backprop(opts, step, grad, model, epoch.moment); + allocator::sync(model_buffer); + + char pbuf[1][32]; + log::logf + { + log, improve? 
log::level::DEBUG: log::level::ERROR, + "backpropagation step:%u lr:%-8.6f mean:%-10.7f$L hits:%-5u Tbest:%-10.7f$L Mbest:%-10.7f$L Hbest:%-5lu grad:{ %10.7f$L } %s", + step, + opts.alpha, + loss_mean, + ctrl.hit, + target_mean_best, + mean_best, + hit_best, + grad, + pretty(pbuf[0], stopwatch.at(), 1), + }; + + return true; +} + +bool +ircd::gpt::step::operator()() +{ + gpt::samp samp + { + *this + }; + + while(!samp()) + ctx::interruption_point(); + + return done(); +} + +bool +ircd::gpt::step::done() +const noexcept +{ + return ctrl.clk.step != id; +} + +void +ircd::gpt::step::profile_accumulate(const pipe::prof &profile) +{ + for(size_t i(0); i < profile.ts.size(); ++i) + for(size_t j(0); j < profile.phases; ++j) + this->profile.ts[i][j] += profile.ts[i][j]; + + epoch.profile_accumulate(profile); +} + +/////////////////////////////////////////////////////////////////////////////// +// +// samp::samp +// + +ircd::gpt::samp::samp(gpt::step &step) +:step +{ + step +} +,desc +{ + step.desc +} +,opts +{ + step.opts +} +,ctrl +{ + step.ctrl +} +,id +{ + ctrl.clk.samp +} +,accept +{ + -1 +} +,dispatch +{ + 1 +} +,cycle +{ + 0 +} +,tokens +{ + tokenize() +} +,count +{ + int(opts.limit) > 0? + tokens - opts.limit: + int(opts.limit) < 0? 
+ std::abs(int(opts.limit)): + tokens +} +{ + desc.cached = 0; + + ctrl.clk.cycle = cycle; + ctrl.dispatch = dispatch; + ctrl.accept = accept; + ctrl.count = count; + ctrl.tokens = tokens; + ctrl.magic = 0xDEADBEEF; + + for(uint i(0); i < opts.labels; ++i) + { + ctrl.label[i].ppl = {{0}}; + ctrl.label[i].loss = {{0}}; + } + + assert(ctrl.count > 0); + assert(ctrl.count < opts.context_tokens); + assert(ctrl.count <= ctrl.tokens); + + if(opts.debug & 0x01) + for(uint j(0); j < ctrl.count; ++j) + log_debug_token(opts, ctrl, j); +} + +ircd::gpt::samp::~samp() +noexcept +{ + if(run::level != run::level::RUN) + return; + + cl::exec + { + desc.ctrl, std::memory_order_acq_rel + }; + + if(opts.debug & 0x04) + log_debug(opts, ctrl); + + if(opts.debug & 0x40) + log_debug_labels(opts, ctrl); + + if(opts.debug & 0x20000000U) + log_debug_prof(opts, ctrl, this->profile); +} + +bool +ircd::gpt::samp::operator()() +{ + if(dispatch > 0) + { + ctx::interruption_point(); + queue.emplace_back(*this); + desc.cached = tokens; + tokens += count < tokens? 
0: 1; + ++cycle; + ++count; + --dispatch; + return false; + } + + while(!queue.empty()) + { + const unwind pop{[this] + { + queue.pop_front(); + }}; + + if(evaluate(queue.front())) + break; + } + + return done(); +} + +bool +ircd::gpt::samp::done() +const noexcept +{ + return accept >= 0; +} + +uint +ircd::gpt::samp::tokenize() +{ + const auto idx + { + step.start + ctrl.clk.samp + }; + + const gpt::model::text text + { + gpt::model::default_data.at(idx) + }; + + const json::string input + { + json::get<"text"_>(text) + }; + + thread_local char str_buf[16_KiB]; + const string_view str + { + json::unescape(str_buf, input) + }; + + assert(!empty(str)); + static const auto delim + { + "\n\n"_sv + }; + + const int phrases + ( + ircd::token_count(str, delim) + ); + + uint count(0); + int p(phrases); + assert(p >= 0); + + if(startswith(str, delim)) + { + ctrl.token[count++] = 198; + ctrl.token[count++] = 198; + } + + ircd::tokens(str, delim, [this, &count, &p, &phrases] + (const string_view &phrase) -> bool + { + assert(!empty(phrase)); + const vector_view buf + { + ctrl.token + count, opts.buffer_tokens - count + }; + + const auto in + { + gpt::vocab::tokenize(buf, phrase) + }; + + if(count + size(in) + 2 > opts.context_tokens) + return false; + + count += size(in); + ctrl.token[count++] = 198; + ctrl.token[count++] = 198; + + assert(p > 0); + marker[--p] = count; + return true; + }); + + for(assert(p >= 0); p < phrases; ++p) + if(marker[p] <= opts.context_tokens) + break; + + assert(p <= phrases); + count = marker[p]; + + for(uint i(count); i < opts.buffer_tokens; ++i) + ctrl.token[i] = 198; + + if(!endswith(str, delim)) + count -= 2; + + assert(count > 0); + assert(count <= opts.context_tokens); + return count; +} + +bool +ircd::gpt::samp::evaluate(pipe::cycle &cycle) +{ + cl::exec + { + desc.frame[cycle.frame], std::memory_order_consume + }; + + const auto &frame + { + acquire(cycle) + }; + + if(!retire(cycle, frame)) + return false; + + memcpy(&ctrl, &frame, 
sizeof(gpt::ctrl)); + + const uint + batch_size = opts.batch_size, + samps = opts.training_steps + opts.validation_steps + opts.testing_steps, + steps = samps / batch_size; + + const bool + accepting = accept >= 0, + cycling = !accepting, + sampling = accepting, + stepping = sampling && (frame.clk.samp + 1) >= batch_size, + epoching = stepping && (frame.clk.step + 1) >= steps; + + //ctrl[ctrl.count] = ctrl.select.logit.token; + //ctrl.count++; + + if(accepting) + { + ctrl.clk.cycle += cycling; + ctrl.clk.samp += sampling; + ctrl.clk.step += stepping; + ctrl.clk.epoch += epoching; + } + + return true; +} + +bool +ircd::gpt::samp::retire(pipe::cycle &cycle, + const gpt::ctrl &frame) +{ + assert(accept < 0); + accept = frame.accept; + dispatch = frame.dispatch; + + if(cl::profile_queue) + { + const pipe::prof profile + { + cycle + }; + + if(opts.debug & 0x10000000U) + log_debug_prof(opts, frame, profile); + + profile_accumulate(profile); + } + + if(opts.debug & 0x02) + log_debug(opts, frame); + + if(opts.debug & 0x20) + log_debug_labels(opts, frame); + + if(opts.debug & 0x10) + log_debug_topn(opts, frame); + + if(opts.debug & 0x200) + log_debug_attns_top(opts, frame); + + dispatch &= boolmask(ircd::run::level == run::level::RUN); + dispatch &= boolmask(!ctx::interruption_requested()); + dispatch &= boolmask(accept < 0); + const bool finished + { + dispatch == 0 + }; + + return finished; +} + +void +ircd::gpt::samp::profile_accumulate(const pipe::prof &profile) +{ + for(size_t i(0); i < profile.ts.size(); ++i) + for(size_t j(0); j < profile.phases; ++j) + this->profile.ts[i][j] += profile.ts[i][j]; + + step.profile_accumulate(profile); +} + +/////////////////////////////////////////////////////////////////////////////// +// +// ctrl +// + +ircd::string_view +ircd::gpt::debug_top(const mutable_buffer &out, + const opts &opts, + const ctrl &ctrl, + const uint i) +{ + thread_local char buf[2][256]; + + assert(opts.top_n > i); + const auto &top + { + ctrl.top[i] + }; + + 
return fmt::sprintf + { + out, "%s T%02d %s", + vocab::debug(buf[0], top.token, 1), + i, + debug(buf[1], opts, top), + }; +} + +ircd::string_view +ircd::gpt::debug_label(const mutable_buffer &out, + const opts &opts, + const ctrl &ctrl, + const uint i, + const uint fmt) +{ + thread_local char buf[2][256]; + + assert(opts.labels > i); + const auto &label + { + ctrl.label[i] + }; + + return fmt::sprintf + { + out, "%s L%02d %s", + vocab::debug(buf[0], label.logit.token, 1), + i, + debug(buf[1], opts, label, fmt), + }; +} + +ircd::string_view +ircd::gpt::debug_attn(const mutable_buffer &out, + const opts &opts, + const ctrl &ctrl, + const uint ti) +{ + thread_local char buf[4][256]; + assert(ti < ctrl.count); + + memset(buf[1], 0x0, sizeof(buf[1])); + for(uint i(0); i < opts.layers; ++i) + { + const auto f{[&](const auto &a) { return a == ti; }}; + if(std::none_of(ctrl.attn[i], ctrl.attn[i] + opts.attn_rank, f)) + continue; + + strlcat{buf[1], fmt::sprintf + { + buf[2], " %1x[", uint(i) + }}; + + for(uint j(0); j < opts.attn_rank; ++j) + if(ctrl.attn[i][j] == ti) + strlcat{buf[1], fmt::sprintf + { + buf[2], "%1x", uint(j) + }}; + + strlcat{buf[1], "]"_sv}; + } + + return fmt::sprintf + { + out, "%s [%3u] <-%s", + vocab::debug(buf[0], ctrl.token[ti], 1), + ti, + string_view{buf[1]}, + }; +} + +ircd::string_view +ircd::gpt::debug(const mutable_buffer &out, + const opts &opts, + const ctrl &ctrl) +{ + thread_local char + buf[8][128], + tmbuf[4][32]; + + int top_idx {-1}; + for(uint i(0); i < opts.top_n; ++i) + if(ctrl.top[i].token == ctrl.select.logit.token) + { + top_idx = i; + break; + } + + return fmt::sprintf + { + out, "%s %s %c T%02d %4u %6.2f%% %10.7f$L %c %s %s", + vocab::debug(buf[0], ctrl.select.logit.token, 1), + debug(buf[1], opts, ctrl.select), + ctrl.target.logit.token == ctrl.top[0].token? 
'=' : ' ', + top_idx, + ctrl.hit, + (ctrl.hit / float(ctrl.hit + ctrl.miss)) * 100.0f, + ctrl.target.loss.mean - ctrl.select.loss.mean, + ctrl.target.logit.token == ctrl.select.logit.token? '=' : ' ', + debug(buf[2], opts, ctrl.target), + vocab::debug(buf[3], ctrl.target.logit.token, 1), + }; +} + +ircd::string_view +ircd::gpt::debug(const mutable_buffer &out, + const opts &opts, + const ctrl_label &label, + const uint fmt) +{ + thread_local char buf[64], bar[128]; + + const auto diff + { + log2f(65536) - label.loss.mean + }; + + const auto pct + { + (diff / log2f(opts.logits)) * 100.0f + }; + + const auto barsz + { + std::min(uint(pct), std::min(66U, uint(sizeof(bar) - 1))) + }; + + memset(bar, '|', barsz); + bar[barsz] = '\0'; + + return fmt::sprintf + { + out, + fmt == 1? + "%s %10.7f$La %6.2f%% %s": + "%s %10.7f$La", + debug(buf, opts, label.logit, fmt), + label.loss.mean, + pct, + string_view{bar}, + }; +} + +ircd::string_view +ircd::gpt::debug(const mutable_buffer &out, + const opts &opts, + const ctrl_logit &logit, + const uint fmt) +{ + return fmt::sprintf + { + out, "%6.2f%% %10.7f$L %5.1f$P", + logit.samax * 100.0f, + +0.0f - logf(logit.samax), + (1.0f - logit.samax) * log2f(opts.logits), + }; +} + +ircd::string_view +ircd::gpt::debug_head(const mutable_buffer &out, + const opts &opts, + const ctrl &ctrl) +{ + thread_local char head[64]; + + return fmt::sprintf + { + out, "%s[%4u]-%1u", + debug_head(head, opts, ctrl.clk), + ctrl.count, + ctrl.dispatch, + }; +} + +ircd::string_view +ircd::gpt::debug_head(const mutable_buffer &out, + const opts &opts, + const ctrl_clk &clk) +{ + return fmt::sprintf + { + out, "%02u:%06u|%04u|%04u|%04u", + clk.epoch, + clk.step * opts.batch_size + clk.samp, + clk.step, + clk.samp, + clk.cycle, + }; +} + +ircd::string_view +ircd::gpt::debug_token(const mutable_buffer &out, + const opts &opts, + const ctrl &ctrl, + const uint fmt) +{ + assert(ctrl.count > 0); + const auto pos + { + ctrl.count - 1 + }; + + return 
debug_token_at(out, opts, ctrl, pos, fmt); +} + +ircd::string_view +ircd::gpt::debug_token_at(const mutable_buffer &out, + const opts &opts, + const ctrl &ctrl, + const uint i, + const uint fmt) +{ + const auto &token + { + ctrl.token[i] + }; + + return vocab::debug(out, token, fmt); +} + +/////////////////////////////////////////////////////////////////////////////// +// +// opts +// + +ircd_gpt_opts::ircd_gpt_opts() +noexcept +:seed { 1234567890UL } -,limit -{ - -1U -} ,top_k { - 2U + 16 } ,top_p { - 90U + 0.90f } ,top_n { - 16 + 0 } ,labels { 0 } +,frames +{ + 8 +} +,limit +{ + -1U +} ,debug { - 0x01 + 0x00 } -,context_tokens +,accept { - 1024U + { 198, 198, ushort(-1), }, + { 0, 0, 0, ushort(-1), }, + { ushort(-1), }, + { ushort(-1), }, } -,buffer_tokens +,batch_size { - 1024U -} -,embed_elems -{ - 768U -} -,attn_rank -{ - 12U -} -,attn_mult -{ - 3U -} -,ffnn_mult -{ - 4U -} -,attn_elems -{ - embed_elems * attn_mult -} -,ffnn_elems -{ - embed_elems * ffnn_mult -} -,lanes -{ - 4U -} -,layers -{ - 12 -} -,embed_width -{ - embed_elems / lanes -} -,attn_width -{ - attn_elems / lanes -} -,attn_height -{ - embed_elems / lanes -} -,ffnn_width -{ - ffnn_elems / lanes -} -,ffnn_height -{ - embed_elems / lanes -} -,logits -{ - 50257 + 32 } ,training_steps { @@ -375,7 +1360,7 @@ noexcept } ,alpha { - 0.001f + 0.00002 } ,beta { @@ -384,7 +1369,95 @@ noexcept } ,epsilon { - 0.000001 + 0.00001 +} +,lambda +{ + 0.5 +} +,logits +{ + 50256 +} +,buffer_tokens +{ + 1024 - 16 // XXX +} +,context_tokens +{ + 512 // 1024 +} +,layers +{ + 12 +} +,lanes +{ + 4 +} +,embed_elems +{ + 768 +} +,embed_width +{ + embed_elems / lanes +} +,attn_rank +{ + 12 +} +,attn_mult +{ + 3 +} +,attn_elems +{ + embed_elems * attn_mult +} +,attn_fcon_width +{ + attn_elems / lanes +} +,attn_fcon_height +{ + embed_elems / lanes +} +,attn_proj_width +{ + embed_elems / lanes +} +,attn_proj_height +{ + embed_elems / lanes +} +,attn_self_elems +{ + (uint(powl(context_tokens, 2)) / 2) * attn_rank +} +,ffnn_mult 
+{ + 4 +} +,ffnn_elems +{ + embed_elems * ffnn_mult +} +,ffnn_fcon_width +{ + ffnn_elems / lanes +} +,ffnn_fcon_height +{ + embed_elems / lanes +} +,ffnn_proj_width +{ + embed_elems / lanes +} +,ffnn_proj_height +{ + ffnn_elems / lanes } { } diff --git a/ircd/gpt_cpu.cc b/ircd/gpt_cpu.cc index a338b9369..d2ef62a1e 100644 --- a/ircd/gpt_cpu.cc +++ b/ircd/gpt_cpu.cc @@ -8,17 +8,21 @@ // copyright notice and this permission notice is present in all copies. The // full license for this software is available in the LICENSE file. +#pragma clang fp exceptions(ignore) +#pragma clang fp reassociate(on) +#pragma clang fp contract(fast) + namespace ircd::gpt { - static size_t adamw(f32x4 &, f32x4 &, f32x4 &, const f32, const f32, const f32, const f32, const u32, size_t); - static size_t adamw(task &, const f32, f32 *, const size_t, f32 *const (&)[2], const size_t); + static void adamw(const opts &, const u32, const f32, const uint, f32 *, f32 *, f32 *); - static size_t backprop(task &, const f32, model::norm &, f32 *const (&)[2], size_t); - static size_t backprop(task &, const f32, model::attn &, f32 *const (&)[2], size_t); - static size_t backprop(task &, const f32, model::ffnn &, f32 *const (&)[2], size_t); - static size_t backprop(task &, const f32, model::block &, f32 *const (&)[2], size_t); - static size_t backprop(task &, const f32, model::embed &, f32 *const (&)[2], size_t); - extern size_t backprop(task &, const f32, model::decoder &, f32 *const (&)[2], size_t = 0); + static void backprop(const opts &, const u32, const f32, model::norm &, model::norm &, model::norm &); + static void backprop(const opts &, const u32, const f32, model::attn &, model::attn &, model::attn &); + static void backprop(const opts &, const u32, const f32, model::ffnn &, model::ffnn &, model::ffnn &); + static void backprop(const opts &, const u32, const f32, model::block &, model::block &, model::block &); + static void backprop(const opts &, const u32, const f32, model::embed &, model::embed 
&, model::embed &); + static void backprop(const opts &, const u32, const f32, model::decoder &, model::decoder &, model::decoder &); + extern void backprop(const opts &, const u32, const f32, model::decoder &, f32 *const __restrict__ [2]) noexcept; template static void fmma(T *out, const T *in, const T *bias, const T *weight, const math::fmma_opts &); @@ -37,7 +41,7 @@ namespace ircd::gpt static void logits(float *, const float *, const model::decoder &); static void tail(float *, const float *, const model::decoder &); static u16 argmax(const float *, const opts &); - static void embed(float *, const u16 token, const u16 position, const opts &); + static void embed(float *, const u16 token, const u16 position, const model::decoder &); static f32 logit alignas(64) [65536], @@ -49,21 +53,20 @@ void ircd::gpt::embed(float *const out, const u16 token, const u16 position, - const opts &opts) + const model::decoder &model) { - assert(opts.model); const auto &wpe { - opts.model->word.pos[position] + model.embed.pos[position] }; const auto &wte { - opts.model->word.token[token] + model.embed.token[token] }; for(uint j(0); j < 768; ++j) - out[j] = wte[j] + wpe[j]; + out[j] = wte.elem[j] + wpe.elem[j]; } uint16_t @@ -117,7 +120,7 @@ ircd::gpt::tail(float *const __restrict__ logit, for(uint i(0); i < 768; ++i) buf[0][i] = state[i]; - norm((f32x4 *)buf[0], (const f32x4 *)state, (const f32x4 *)d.f.bias, (const f32x4 *)d.f.weight, lnf_epsilon); + norm((f32x4 *)buf[0], (const f32x4 *)state, (const f32x4 *)d.embed.norm.bias.elem, (const f32x4 *)d.embed.norm.weight.elem, lnf_epsilon); logits(logit, buf[0], d); //logitsmax(logit, logit, vocab::tokens); } @@ -132,7 +135,7 @@ ircd::gpt::logits(float *const __restrict__ out, for(uint j(0); j < vocab::tokens; ++j) for(uint k(0); k < 768; ++k) - out[j] += in[k] * d.word.token[j][k]; + out[j] += in[k] * d.embed.token[j].elem[k]; } [[gnu::noinline]] @@ -192,7 +195,7 @@ ircd::gpt::coil(float *__restrict__ accum, }; for(uint j(0); j < 
tokens; ++j) - fmma((f32x4 *)(accum + j * 768), (const f32x4 *)(a[j]), (const f32x4 *)layer.attn.proj_bias, (const f32x4 *)layer.attn.proj_weight, fmma_opts); + fmma((f32x4 *)(accum + j * 768), (const f32x4 *)(a[j]), (const f32x4 *)layer.attn.proj_bias.elem, (const f32x4 *)layer.attn.proj_weight, fmma_opts); for(uint j(0); j < tokens; ++j) ffnn(accum + j * 768, accum + j * 768, decoder, i); @@ -227,7 +230,7 @@ ircd::gpt::attn(float (&__restrict__ out)[3][1024][12][64], buf alignas(64) [768], proj alignas(64) [2304]; - norm((f32x4 *)buf, (const f32x4 *)(in + i * 768), (const f32x4 *)layer.ln1.bias, (const f32x4 *)layer.ln1.weight, ln1_epsilon); + norm((f32x4 *)buf, (const f32x4 *)(in + i * 768), (const f32x4 *)layer.attn.norm.bias.elem, (const f32x4 *)layer.attn.norm.weight.elem, ln1_epsilon); static const math::fmma_opts fmma_opts { @@ -235,7 +238,7 @@ ircd::gpt::attn(float (&__restrict__ out)[3][1024][12][64], }; memset(proj, 0x0, sizeof(proj)); - fmma((f32x4 *)proj, (const f32x4 *)buf, (const f32x4 *)layer.attn.attn_bias, (const f32x4 *)layer.attn.attn_weight, fmma_opts); + fmma((f32x4 *)proj, (const f32x4 *)buf, (const f32x4 *)layer.attn.fcon_bias.fcon, (const f32x4 *)layer.attn.fcon_weight, fmma_opts); #pragma clang loop unroll (disable) for(uint j(0); j < 12; ++j) @@ -372,10 +375,10 @@ ircd::gpt::ffnn(float *const out, buf2 alignas(64) [3072]; memset(buf2, 0x0, sizeof(buf2)); - norm((f32x4 *)buf, (const f32x4 *)in, (const f32x4 *)layer.ln2.bias, (const f32x4 *)layer.ln2.weight, ln2_epsilon); - fmma((f32x4 *)buf2, (const f32x4 *)buf, (const f32x4 *)layer.ffnn.fc_bias, (const f32x4 *)layer.ffnn.fc_weight, fmma3_opts); + norm((f32x4 *)buf, (const f32x4 *)in, (const f32x4 *)layer.ffnn.norm.bias.elem, (const f32x4 *)layer.ffnn.norm.weight.elem, ln2_epsilon); + fmma((f32x4 *)buf2, (const f32x4 *)buf, (const f32x4 *)layer.ffnn.fcon_bias.fcon, (const f32x4 *)layer.ffnn.fcon_weight, fmma3_opts); gelu((f32x4 *)buf2, (const f32x4 *)buf2); - fmma((f32x4 *)out, (const 
f32x4 *)buf2, (const f32x4 *)layer.ffnn.proj_bias, (const f32x4 *)layer.ffnn.proj_weight, fmma4_opts); + fmma((f32x4 *)out, (const f32x4 *)buf2, (const f32x4 *)layer.ffnn.proj_bias.elem, (const f32x4 *)layer.ffnn.proj_weight, fmma4_opts); } void @@ -431,213 +434,219 @@ ircd::gpt::gelu(f32x4 &out, // [[gnu::noinline]] -size_t -ircd::gpt::backprop(task &task, +void +ircd::gpt::backprop(const opts &opts, + const u32 step, const f32 grad, - model::decoder ¶m, - f32 *const (&moment)[2], - size_t off) + model::decoder &__restrict__ param, + f32 *const __restrict__ buf[2]) +noexcept { - for(uint i(0); i < 12; ++i) - off = backprop(task, grad, param.layer[i], moment, off); - - off = backprop(task, grad, param.f, moment, off); - off = backprop(task, grad, param.word, moment, off); - return off; -} - -size_t -ircd::gpt::backprop(task &task, - const f32 grad, - model::embed ¶m, - f32 *const (&moment)[2], - size_t off) -{ - assert(task.opts); - const auto &opts + model::decoder *const __restrict__ moment[2] { - *task.opts + reinterpret_cast(buf[0]), + reinterpret_cast(buf[1]), }; + backprop(opts, step, grad, param, *moment[0], *moment[1]); +} + +void +ircd::gpt::backprop(const opts &opts, + const u32 step, + const f32 grad, + model::decoder &__restrict__ param, + model::decoder &__restrict__ moment0, + model::decoder &__restrict__ moment1) +{ + fpe::errors_handle eh; + + assume(opts.attn_rank > 0); + assume(opts.layers > 0); + const auto eln + { + opts.layers - 1 // step % opts.layers + }; + + for(int i(opts.layers - 1); i >= int(opts.layers - eln - 1); --i) + { + assert(i >= 0 && i < int(opts.layers)); + backprop(opts, step, grad, param.layer[i], moment0.layer[i], moment1.layer[i]); + } + + backprop(opts, step, grad, param.embed, moment0.embed, moment1.embed); + + auto pending(eh.pending()); + eh.clear_pending(); + pending &= ~pending & FE_INEXACT; + if(unlikely(pending)) + fpe::throw_errors(pending); +} + +void +ircd::gpt::backprop(const opts &opts, + const u32 step, + const 
f32 grad, + model::embed &__restrict__ param, + model::embed &__restrict__ moment0, + model::embed &__restrict__ moment1) +{ + backprop(opts, step, grad, param.norm, moment0.norm, moment1.norm); + + assume(opts.context_tokens > 0); for(uint i(0); i < opts.context_tokens; ++i) - off = adamw(task, grad, param.pos[i], 768, moment, off); + adamw(opts, step, grad, 768, param.pos[i].elem, moment0.pos[i].elem, moment1.pos[i].elem); + assume(opts.logits > 0); for(uint i(0); i < opts.logits; ++i) - off = adamw(task, grad, param.token[i], 768, moment, off); - - return off; + adamw(opts, step, grad, 768, param.token[i].elem, moment0.token[i].elem, moment1.token[i].elem); } -size_t -ircd::gpt::backprop(task &task, +void +ircd::gpt::backprop(const opts &opts, + const u32 step, const f32 grad, - model::block ¶m, - f32 *const (&moment)[2], - size_t off) + model::block &__restrict__ param, + model::block &__restrict__ moment0, + model::block &__restrict__ moment1) { - off = backprop(task, grad, param.ln1, moment, off); - off = backprop(task, grad, param.attn, moment, off); - off = backprop(task, grad, param.ln2, moment, off); - off = backprop(task, grad, param.ffnn, moment, off); - return off; + backprop(opts, step, grad, param.attn.norm, moment0.attn.norm, moment1.attn.norm); + backprop(opts, step, grad, param.attn, moment0.attn, moment1.attn); + + backprop(opts, step, grad, param.ffnn.norm, moment0.ffnn.norm, moment1.ffnn.norm); + backprop(opts, step, grad, param.ffnn, moment0.ffnn, moment1.ffnn); } -size_t -ircd::gpt::backprop(task &task, +void +ircd::gpt::backprop(const opts &opts, + const u32 step, const f32 grad, - model::attn ¶m, - f32 *const (&moment)[2], - size_t off) + model::attn &__restrict__ param, + model::attn &__restrict__ moment0, + model::attn &__restrict__ moment1) { - off = adamw(task, grad, param.attn_bias, 2304, moment, off); + adamw(opts, step, grad, 2304, param.fcon_bias.fcon, moment0.fcon_bias.fcon, moment1.fcon_bias.fcon); for(uint i(0); i < 768; ++i) - 
off = adamw(task, grad, param.attn_weight[i], 2304, moment, off); + adamw(opts, step, grad, 2304, param.fcon_weight[i].fcon, moment0.fcon_weight[i].fcon, moment1.fcon_weight[i].fcon); - off = adamw(task, grad, param.proj_bias, 768, moment, off); + adamw(opts, step, grad, 768, param.proj_bias.elem, moment0.proj_bias.elem, moment1.proj_bias.elem); for(uint i(0); i < 768; ++i) - off = adamw(task, grad, param.proj_weight[i], 768, moment, off); - - return off; + adamw(opts, step, grad, 768, param.proj_weight[i].elem, moment0.proj_weight[i].elem, moment1.proj_weight[i].elem); } -size_t -ircd::gpt::backprop(task &task, +void +ircd::gpt::backprop(const opts &opts, + const u32 step, const f32 grad, - model::ffnn ¶m, - f32 *const (&moment)[2], - size_t off) + model::ffnn &__restrict__ param, + model::ffnn &__restrict__ moment0, + model::ffnn &__restrict__ moment1) { - off = adamw(task, grad, param.fc_bias, 3072, moment, off); + adamw(opts, step, grad, 3072, param.fcon_bias.fcon, moment0.fcon_bias.fcon, moment1.fcon_bias.fcon); for(uint i(0); i < 768; ++i) - off = adamw(task, grad, param.fc_weight[i], 3072, moment, off); + adamw(opts, step, grad, 3072, param.fcon_weight[i].fcon, moment0.fcon_weight[i].fcon, moment1.fcon_weight[i].fcon); - off = adamw(task, grad, param.proj_bias, 768, moment, off); + adamw(opts, step, grad, 768, param.proj_bias.elem, moment0.proj_bias.elem, moment1.proj_bias.elem); for(uint i(0); i < 3072; ++i) - off = adamw(task, grad, param.proj_weight[i], 768, moment, off); - - return off; + adamw(opts, step, grad, 768, param.proj_weight[i].elem, moment0.proj_weight[i].elem, moment1.proj_weight[i].elem); } -size_t -ircd::gpt::backprop(task &task, +void +ircd::gpt::backprop(const opts &opts, + const u32 step, const f32 grad, - model::norm ¶m, - f32 *const (&moment)[2], - size_t off) + model::norm &__restrict__ param, + model::norm &__restrict__ moment0, + model::norm &__restrict__ moment1) { - off = adamw(task, grad, param.bias, 768, moment, off); - off = 
adamw(task, grad, param.weight, 768, moment, off); - return off; + adamw(opts, step, grad, 768, param.bias.elem, moment0.bias.elem, moment1.bias.elem); + adamw(opts, step, grad, 768, param.weight.elem, moment0.weight.elem, moment1.weight.elem); } -[[gnu::noinline]] -size_t -ircd::gpt::adamw(task &task, - const f32 grad, - f32 *const p_, - const size_t num, - f32 *const (&__restrict__ m_)[2], - size_t off) +namespace ircd::gpt { - assert(task.opts); - const auto &opts + static f32x4 adamw_moment(const f32x4, const f32, const f32); + static f32x4 adamw_numer(const f32x4, const f32, const u32); + static f32x4 adamw_denom(const f32x4, const f32, const u32); + static f32x4 adamw_delta(const f32x4, const f32x4, const f32, const f32, const f32, const u32); + static void adamw(f32x4 &, f32x4 &, f32x4 &, const f32, const f32, const f32, const f32, const u32); + static void adamw(const opts &, const u32, const f32, const u32, f32 *, f32 *, f32 *); +} + +void +ircd::gpt::adamw(const opts &opts, + const u32 step, + const f32 grad, + const u32 num, + f32 *const __restrict__ param, + f32 *const __restrict__ moment0, + f32 *const __restrict__ moment1) +{ + f32x4 *const __restrict__ val[3] { - *task.opts + reinterpret_cast(param), + reinterpret_cast(moment0), + reinterpret_cast(moment1), }; - assert(task.ctrl); - auto &ctrl + const auto n { - *task.ctrl + num / 4 }; - f32x4 *const p[3] - { - reinterpret_cast(p_), - reinterpret_cast(m_[0]) + off, - reinterpret_cast(m_[1]) + off, - }; - - assert(num >= 4); - const uint n - { - uint(num) / 4 - }; - - // Assume loop body always taken w/o soundness; otherwise extra branch. 
- assert(n > 0); - uint i(0); do - { - off = adamw + assume(0 < n); + for(uint i(0); i < n; ++i) + adamw ( - p[0][i], - p[1][i], - p[2][i], + val[0][i], + val[1][i], + val[2][i], grad, opts.alpha, opts.beta[0], opts.beta[1], - ctrl.epic.step, - off + step + 1 ); - } - while(++i < n); - - return off; } -size_t +void ircd::gpt::adamw(f32x4 &__restrict__ param, f32x4 &__restrict__ moment0, f32x4 &__restrict__ moment1, - const f32 grad, - const f32 alpha, + const f32 grad_, + const f32 alpha_, const f32 beta0, const f32 beta1, - const u32 step, - const size_t off) + const u32 step) { - const f32x4 one + const f32 alpha { - 1.0f, 1.0f, 1.0f, 1.0f, + grad_ < 0? -alpha_ : alpha_ }; - const f32x4 a[2] + const f32 grad { - { one - beta0 }, - { one - beta1 }, + grad_ < 0? -grad_ : grad_ }; - const f32x4 avg_mul[2] + const f32 grad_grad { - { moment0 * beta0 }, - { moment1 * beta1 }, + grad * grad }; - const f32x4 avg_dot[2] + const f32x4 moment[] { - { avg_mul[0] + a[0] * grad }, - { avg_mul[1] + a[1] * grad * grad }, - }; - - const f32x4 bias[2] - { - { avg_dot[0] / (one - powf(beta0, step + 1)) }, - { avg_dot[1] / (one - powf(beta1, step + 1)) }, - }; - - const f32x4 denom - { - sqrtf(bias[1]) + 0.000001f // epsilon + adamw_moment(moment0, grad, beta0), + adamw_moment(moment1, grad_grad, beta1) }; const f32x4 delta { - alpha * (bias[0] / denom) + adamw_delta(moment[0], moment[1], alpha, beta0, beta1, step) }; const f32x4 update @@ -645,8 +654,168 @@ ircd::gpt::adamw(f32x4 &__restrict__ param, param - delta }; - moment0 = avg_dot[0]; - moment1 = avg_dot[1]; + if((false)) + for(uint i(0); i < 4; ++i) + printf("%-15p p[%11.8lf] m[%11.8lf][%11.8lf] g[%11.8lf] d[%11.8lf] p[%11.8lf] m[%11.8lf][%11.8lf]\n", + ((f32 *)¶m) + i, + param[i], + moment0[i], + moment1[i], + grad, + delta[i], + update[i], + moment[0][i], + moment[1][i]); + + assert(std::isnormal(update[0])); + assert(std::isnormal(update[1])); + assert(std::isnormal(update[2])); + assert(std::isnormal(update[3])); + + 
assert(std::isnormal(moment[0][0])); + assert(std::isnormal(moment[0][1])); + assert(std::isnormal(moment[0][2])); + assert(std::isnormal(moment[0][3])); + + assert(std::isnormal(moment[1][0])); + assert(std::isnormal(moment[1][1])); + assert(std::isnormal(moment[1][2])); + assert(std::isnormal(moment[1][3])); + param = update; - return off + 1; + //__builtin_nontemporal_store(update, ¶m); + + moment0 = moment[0]; + moment1 = moment[1]; + //__builtin_nontemporal_store(moment[0], &moment0); + //__builtin_nontemporal_store(moment[1], &moment1); +} + +ircd::f32x4 +ircd::gpt::adamw_delta(const f32x4 moment0, + const f32x4 moment1, + const f32 alpha, + const f32 beta0, + const f32 beta1, + const u32 step) +{ + static const f32 epsilon + { + FLT_EPSILON + }; + + const f32x4 denom + { + adamw_denom(moment1, beta1, step) + epsilon + }; + + const f32x4 decay + { + adamw_numer(moment0, beta0, step) + }; + + const f32x4 smooth + { + alpha * decay + }; + + assert(std::isnormal(denom[0])); + assert(std::isnormal(denom[1])); + assert(std::isnormal(denom[2])); + assert(std::isnormal(denom[3])); + const f32x4 delta + { + smooth / denom + }; + + return delta; +} + +ircd::f32x4 +ircd::gpt::adamw_denom(const f32x4 moment, + const f32 beta, + const u32 step) +{ + static const f32x4 one + { + 1.0f, 1.0f, 1.0f, 1.0f, + }; + + assert(step > 0); + const f32x4 decay + { + one - powf(beta, step) + }; + + assert(std::isnormal(decay[0])); + assert(std::isnormal(decay[1])); + assert(std::isnormal(decay[2])); + assert(std::isnormal(decay[3])); + const f32x4 bias + { + moment / decay + }; + + const f32x4 denom + { + sqrtf(bias) + }; + + return denom; +} + +ircd::f32x4 +ircd::gpt::adamw_numer(const f32x4 moment, + const f32 beta, + const u32 step) +{ + static const f32x4 one + { + 1.0f, 1.0f, 1.0f, 1.0f, + }; + + assert(step > 0); + const f32x4 decay + { + one - powf(beta, step) + }; + + assert(std::isnormal(decay[0])); + assert(std::isnormal(decay[1])); + assert(std::isnormal(decay[2])); + 
assert(std::isnormal(decay[3])); + const f32x4 bias + { + moment / decay + }; + + return bias; +} + +ircd::f32x4 +ircd::gpt::adamw_moment(const f32x4 moment, + const f32 grad, + const f32 beta) +{ + static const f32x4 one + { + 1.0f, 1.0f, 1.0f, 1.0f, + }; + + const f32x4 rate + { + one - beta + }; + + const f32x4 avg + { + moment * beta + }; + + const f32x4 dot + { + rate * grad + avg + }; + + return dot; } diff --git a/ircd/gpt_gpu.cl b/ircd/gpt_gpu.cl index 661f6b465..9104e82f9 100644 --- a/ircd/gpt_gpu.cl +++ b/ircd/gpt_gpu.cl @@ -8,31 +8,1458 @@ // copyright notice and this permission notice is present in all copies. The // full license for this software is available in the LICENSE file. +//#pragma OPENCL EXTENSION cl_amd_device_attribute_query : enable +//#pragma OPENCL EXTENSION cl_khr_byte_addressable_store :enable +//#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +//#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +//#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable +//#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable + +#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable + +#pragma clang fp exceptions(ignore) +#pragma clang fp reassociate(on) +#pragma clang fp contract(fast) + +#include +#include + +#define __region __attribute__((address_space(0x02))) + +#if !defined(assume) + #define assume(x) __builtin_assume(x) +#endif + +#if defined(__SPIR) + #define restrict +#elif defined(__cplusplus) + #define restrict __restrict +#endif + +#if __OPENCL_VERSION__ < 120 + #define static __attribute__((internal_linkage)) +#else + #define static __constant static +#endif + +#pragma clang attribute 
push(__attribute__((always_inline)), apply_to = function) +#pragma clang attribute push(__attribute__((internal_linkage)), apply_to = function) #include -#include +#include #include #include +#pragma clang attribute pop +#pragma clang attribute pop -inline void -__attribute__((always_inline)) -ircd_gpt_norm_fmad(__local float4 *const out, - __local const float4 *const in, - __global const float4 *const restrict bias, - __global const float4 *const restrict weight, - const uint i) +#if __OPENCL_VERSION__ >= 120 + #undef static +#endif + +#include + +// +// head +// + +__kernel void +__attribute__((visibility("protected"))) +__attribute__((reqd_work_group_size(192, 1, 1))) +__attribute__((amdgpu_flat_work_group_size(192, 192))) +ircd_gpt_alloc(__global const void *const restrict model, + __global void *const restrict master, + __constant const void *const opts, + __global void *const restrict ctrl, + __global void *const restrict frame0, + __global void *const restrict frame1, + __global void *const restrict frame2, + __global void *const restrict frame3, + __global void *const restrict frame4, + __global void *const restrict frame5, + __global void *const restrict frame6, + __global void *const restrict frame7) { - out[i] = in[i] * weight[i] + bias[i]; +} + +__kernel void +__attribute__((visibility("protected"))) +__attribute__((reqd_work_group_size(192, 1, 1))) +__attribute__((amdgpu_flat_work_group_size(192, 192))) +ircd_gpt_enter(__global const void *const restrict model, + __global void *const restrict state, + __global void *const restrict master, + __constant const struct ircd_gpt_opts *const opts, + __global struct ircd_gpt_ctrl *const restrict ctrl) +{ + const ushort + gi = get_global_id(0), + li = get_local_id(0), + ln = get_local_size(0), + cycle = ctrl->clk.cycle; + + if(li == 0) + ;//ctrl->prof.entered = __builtin_readcyclecounter(); +} + +__kernel void +__attribute__((vec_type_hint(float4))) +__attribute__((reqd_work_group_size(192, 1, 1))) 
+__attribute__((amdgpu_flat_work_group_size(192, 192))) +ircd_gpt_lm_embed(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global ircd_gpt_vectorv *const restrict accum, + __global const ircd_gpt_vectorv *const restrict pos, + __global const ircd_gpt_vectorv *const restrict vocab) +{ + const ushort + li = get_local_id(0), + ln = get_local_size(0); + + const uint + wo = get_global_offset(0); + + assume(ln == 192); + assume(wo % ln == 0); + + const ushort + wi = wo / ln + get_group_id(0); + + _ircd_gpt_lm_embed(ctrl, opts, accum, pos, vocab, wi, wi, li); +} + +static void +_ircd_gpt_lm_embed(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global ircd_gpt_vectorv *const restrict accum, + __global const ircd_gpt_vectorv *const restrict pos, + __global const ircd_gpt_vectorv *const restrict vocab, + const ushort out_idx, + const ushort tok_idx, + const ushort elem_idx) +{ + const ushort + token = ctrl->token[tok_idx]; + + const float4 + wpe = pos[tok_idx].elem[elem_idx], + wte = vocab[token].elem[elem_idx], + res = wte + wpe; + + accum[out_idx].elem[elem_idx] = res; +} + +// +// Frontside +// + +void +ircd_gpt_ffnn_fcon_tmul(__constant const struct ircd_gpt_opts *const opts, + __local ircd_gpt_ffnn_aperaturev *const restrict out, + __local const ircd_gpt_vectorv *const restrict in, + __global const ircd_gpt_ffnn_aperaturev *const restrict bias, + __global const ircd_gpt_ffnn_aperaturev *const restrict weight, + const uint li) +{ + const uint + lanes = 4, + segs = ircd_gpt_ffnn_segs, + height = ircd_gpt_vector_elems / lanes; + + assume(height > 0); + assume(height % lanes == 0); + + for(uint x = 0; x < segs; ++x) + out->proj[x][li] = bias->proj[x][li]; + + for(uint y = 0; y < height; ++y) + for(uint k = 0; k < lanes; ++k) + for(uint x = 0; x < segs; ++x) + { + const uint + row = y * lanes + k; + + out->proj[x][li] += in->elem[y][k] * 
weight[row].proj[x][li]; + } +} + +void +ircd_gpt_ffnn_fcon(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __local ircd_gpt_ffnn_aperaturev *const restrict out, + __local const ircd_gpt_vectorv *const restrict in, + __global const ircd_gpt_ffnn_aperaturev *const restrict bias, + __global const ircd_gpt_ffnn_aperaturev *const restrict weight, + const uint ln, + const uint li) +{ + const uint + segs = ircd_gpt_ffnn_segs; + + // Fully connected + ircd_gpt_ffnn_fcon_tmul + ( + opts, + out, + in, + bias, + weight, + li + ); + + for(uint i = 0; i < segs; ++i) + ircd_gpt_ffnn_gelu(out, out, i * ln + li); +} + +void +ircd_gpt_ffnn_proj_tmul(__constant const struct ircd_gpt_opts *const opts, + __local ircd_gpt_vectorv *const restrict out, + __local const ircd_gpt_ffnn_aperaturev *const restrict in, + __global const ircd_gpt_vectorv *const restrict bias, + __global const ircd_gpt_vectorv *const restrict weight, + const uint li) +{ + const uint + lanes = 4, + height = ircd_gpt_ffnn_fcon_elems / lanes; + + assume(height > 0); + assume(height % lanes == 0); + + out->elem[li] = bias->elem[li]; + + for(uint y = 0; y < height; ++y) + for(uint k = 0; k < lanes; ++k) + { + const uint + row = y * lanes + k; + + out->elem[li] += in->fcon[y][k] * weight[row].elem[li]; + } +} + +void +ircd_gpt_ffnn(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __local ircd_gpt_vectorv *const restrict token, + __local ircd_gpt_ffnn_aperaturev *const restrict buf, + __local ircd_gpt_vectorv *const restrict tmp, + __global const ircd_gpt_vectorv *const restrict norm_bias, + __global const ircd_gpt_vectorv *const restrict norm_weight, + __global const ircd_gpt_ffnn_aperaturev *const restrict fcon_bias, + __global const ircd_gpt_ffnn_aperaturev *const restrict fcon_weight, + __global const ircd_gpt_vectorv *const restrict proj_bias, + __global const ircd_gpt_vectorv *const restrict proj_weight, 
+ const uint ln, + const uint li) +{ + // Layer re-normalization + ircd_gpt_norm(token, token, tmp, norm_bias, norm_weight, ln, li); + + // ln's writes are still pending but fcon reads results across threads. + barrier(CLK_LOCAL_MEM_FENCE); + + // Fully connected + ircd_gpt_ffnn_fcon + ( + ctrl, + opts, + buf, + token, + fcon_bias, + fcon_weight, + ln, + li + ); + + // fcon's writes are still pending but proj reads results across threads. + barrier(CLK_LOCAL_MEM_FENCE); + + // Projection + ircd_gpt_ffnn_proj_tmul + ( + opts, + token, + buf, + proj_bias, + proj_weight, + li + ); +} + +static void +ircd_gpt_attn_self_samax(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __local float self[restrict][12], + const uint ln, + const uint li, + const uint wn, + const uint wi) +{ + struct ircd_math_samax samax = + { + .mu = -10000.0f, + .sum = 0.0f, + }; + + __attribute__((opencl_unroll_hint)) + for(uint i = 0; i < wn; ++i) + samax.mu = max(samax.mu, self[i][li]); + + __attribute__((opencl_unroll_hint)) + for(uint i = 0; i < wn; ++i) + self[i][li] -= samax.mu; + + for(uint i = 0; i < wn; ++i) + self[i][li] = native_exp(self[i][li]); + + __attribute__((opencl_unroll_hint)) + for(uint i = 0; i < wn; ++i) + samax.sum += self[i][li]; + + samax.sum += FLT_EPSILON; + samax.lambda = 1.0f / samax.sum; + + __attribute__((opencl_unroll_hint)) + for(uint i = 0; i < wn; ++i) + self[i][li] *= samax.lambda; +} + +static void +ircd_gpt_attn_self_keys(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __local float self[restrict][ircd_gpt_attn_rank], + __global const ircd_gpt_attn_qkvv *const restrict token, + const uint ln, + const uint li, + const uint wi, + const uint kn, + const uint i) +{ + assume(i < wi); + + self[i][li] = 0.0f; + + __attribute__((opencl_unroll_hint)) + for(uint k = 0; k < kn; ++k) + { + float4 + qry = token[wi].qry.attn[li][k], + key = 
token[i].key.attn[li][k], + res = qry * key; + + self[i][li] += ircd_simt_reduce_add_f4(res); + } + + self[i][li] /= 8.0f; +} + +static void +ircd_gpt_attn_self_vals(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __local ircd_gpt_vectorv *const restrict out, + __local const float self[restrict][ircd_gpt_attn_rank], + __global const ircd_gpt_attn_qkvv *const restrict token, + const uint li, + const uint wi, + const uint ki, + const uint ti) +{ + out->attn[ti][ki] = 0.0f; + + __attribute__((opencl_unroll_hint)) + for(uint i = 0; i < wi; ++i) + { + const float4 + val = token[i].val.attn[ti][ki], + attn = self[i][ti], + res = attn * val; + + out->attn[ti][ki] += res; + } +} + +static void +ircd_gpt_attn_self(__global struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __local ircd_gpt_vectorv *const restrict out, + __local float self[restrict][ircd_gpt_attn_rank], + __global float attns[restrict][ircd_gpt_attn_rank], + __global const ircd_gpt_attn_qkvv *const restrict token, + const uint ln, + const uint li, + const uint wi) +{ + //assume(opts->attn_rank == sizeof(self[0]) / sizeof(float)); + assume(opts->attn_rank == ircd_gpt_attn_rank); + assume(ctrl->count < ircd_gpt_context_tokens); + assume(ctrl->tokens <= ircd_gpt_context_tokens); + assume(ctrl->tokens > wi); + assume(ctrl->tokens > 0); + + const uint + wn = ctrl->tokens, + kn = ln / opts->attn_rank, + ki = li / opts->attn_rank, + ti = li % opts->attn_rank; + + // Low-rank mask + if(li < opts->attn_rank) + { + // Left attention + uint i; + for(i = 0; i < wi; ++i) + ircd_gpt_attn_self_keys(ctrl, opts, self, token, ln, li, wi, kn, i); + + // Future mask + __attribute__((opencl_unroll_hint)) + while(i < wn) + self[i++][li] = -10000.0f; + + // Three-piece softmax + ircd_gpt_attn_self_samax(ctrl, opts, self, ln, li, wn, wi); + } + + // Propagate to full width for value dot prod. 
+ barrier(CLK_LOCAL_MEM_FENCE); + ircd_gpt_attn_self_vals(ctrl, opts, out, self, token, li, wi, ki, ti); + + // Save softmax results for later analysis/observation. + if(li < opts->attn_rank) + { + __attribute__((opencl_unroll_hint)) + for(uint i = 0; i < wn; ++i) + attns[i][li] = self[i][li]; + } +} + +static void +ircd_gpt_attn_proj_tmul(__constant const struct ircd_gpt_opts *const opts, + __local ircd_gpt_vectorv *const restrict out, + __local const ircd_gpt_vectorv *const restrict in, + __global const ircd_gpt_vectorv *const restrict bias, + __global const ircd_gpt_vectorv *const restrict weight, + const uint li) +{ + const uint + lanes = 4, + height = ircd_gpt_vector_elems / 4; + + assume(height > 0); + assume(height % lanes == 0); + + out->elem[li] = bias->elem[li]; + + for(uint y = 0; y < height; ++y) + for(uint k = 0; k < lanes; ++k) + { + const uint + row = y * lanes + k; + + const float4 + a = in->elem[y][k], + b = weight[row].elem[li]; + + out->elem[li] += a * b; + } +} + +__kernel void +__attribute__((vec_type_hint(float4))) +__attribute__((reqd_work_group_size(192, 1, 1))) +__attribute__((amdgpu_flat_work_group_size(192, 192))) +__attribute__((visibility("protected"))) +ircd_gpt_coil(__global struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __private const uint layer, + __global ircd_gpt_vectorv *const restrict accum, + __global float attns[restrict][ircd_gpt_attn_rank], + __global const ircd_gpt_attn_qkvv *const restrict state, + __global const ircd_gpt_vectorv *const restrict attn_proj_bias, + __global const ircd_gpt_vectorv *const restrict attn_proj_weight, + __global const ircd_gpt_vectorv *const restrict ffnn_norm_bias, + __global const ircd_gpt_vectorv *const restrict ffnn_norm_weight, + __global const ircd_gpt_ffnn_aperaturev *const restrict ffnn_fcon_bias, + __global const ircd_gpt_ffnn_aperaturev *const restrict ffnn_fcon_weight, + __global const ircd_gpt_vectorv *const restrict ffnn_proj_bias, + 
__global const ircd_gpt_vectorv *const restrict ffnn_proj_weight) +{ + const uint + li = get_local_id(0), + ln = get_local_size(0), + wo = get_global_offset(0), + wi = wo / ln + get_group_id(0); + + assume(ln == 192); + assume(wo % ln == 0); + + __local union + { + float + attn_self[ircd_gpt_context_tokens][ircd_gpt_attn_rank]; + + ircd_gpt_ffnn_aperaturev + ffnn_fcon[2]; + + ircd_gpt_vectorv + vector[8]; + } + buf; + + __local ircd_gpt_vectorv + buf0, buf1, + *const restrict attn_self = &buf1, + *const restrict token = &buf0, + *const restrict tmp = &buf1; + + // Self-attention backend; this computes the self-attention result now + // that keys and values are globally visible across tokens. + ircd_gpt_attn_self + ( + ctrl, + opts, + attn_self, + buf.attn_self, + attns, + state, + ln, + li, + wi + ); + + barrier(CLK_LOCAL_MEM_FENCE); + + // Project result of self-attention. + ircd_gpt_attn_proj_tmul + ( + opts, + token, + attn_self, + attn_proj_bias, + attn_proj_weight, + li + ); + + // Frontend accumulation + { + const float4 + attn = token->elem[li], + resid = accum[wi].elem[li], + result = resid + attn; + + token->elem[li] = result; + accum[wi].elem[li] = result; + } + + // Backend mlp; layer-norm acquires any pending writes, no fence required. 
+ ircd_gpt_ffnn + ( + ctrl, + opts, + token, + buf.ffnn_fcon, + tmp, + ffnn_norm_bias, + ffnn_norm_weight, + ffnn_fcon_bias, + ffnn_fcon_weight, + ffnn_proj_bias, + ffnn_proj_weight, + ln, + li + ); + + // Backend accumulation + { + const float4 + ffnn = token->elem[li], + resid = accum[wi].elem[li], + result = resid + ffnn; + + accum[wi].elem[li] = result; + } +} + +static void +ircd_gpt_attn_fcon_tmul(__constant const struct ircd_gpt_opts *const opts, + __local ircd_gpt_attn_aperaturev *const restrict out, + __local const ircd_gpt_vectorv *const restrict in, + __global const ircd_gpt_attn_aperaturev *const restrict bias, + __global const ircd_gpt_attn_aperaturev *const restrict weight, + const uint ln, + const uint li) +{ + const uint + lanes = 4, + segs = ircd_gpt_attn_segs, + height = ircd_gpt_vector_elems / lanes; + + assume(height > 0); + assume(height % segs == 0); + assume(height % lanes == 0); + + for(uint x = 0; x < segs; ++x) + out->proj[x][li] = bias->proj[x][li]; + + for(uint y = 0; y < height; ++y) + for(uint k = 0; k < lanes; ++k) + for(uint x = 0; x < segs; ++x) + { + const uint + row = y * lanes + k; + + const float4 + a = in->elem[y][k], + b = weight[row].proj[x][li]; + + out->proj[x][li] += a * b; + } +} + +__kernel void +__attribute__((vec_type_hint(float4))) +__attribute__((reqd_work_group_size(192, 1, 1))) +__attribute__((amdgpu_flat_work_group_size(192, 192))) +ircd_gpt_attn_fcon(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __private const uint layer, + __global ircd_gpt_attn_aperaturev *const restrict state, + __global const ircd_gpt_vectorv *const restrict accum, + __global const ircd_gpt_vectorv *const restrict norm_bias, + __global const ircd_gpt_vectorv *const restrict norm_weight, + __global const ircd_gpt_attn_aperaturev *const restrict fcon_bias, + __global const ircd_gpt_attn_aperaturev *const restrict fcon_weight) +{ + const uint + li = get_local_id(0), + ln = 
get_local_size(0), + wo = get_global_offset(0), + wi = wo / ln + get_group_id(0), + segs = ircd_gpt_attn_segs; + + assume(ln == 192); + assume(wo % ln == 0); + + __local ircd_gpt_attn_aperaturev + attn; + + __local ircd_gpt_vectorv + token, *const restrict tmp = attn.vector; + + token.elem[li] = accum[wi].elem[li]; + + // Layer re-normalization + ircd_gpt_norm(&token, &token, tmp, norm_bias, norm_weight, ln, li); + + // Ln's writes are still pending; fcon requires results across threads. + barrier(CLK_LOCAL_MEM_FENCE); + + // Fully connected + ircd_gpt_attn_fcon_tmul + ( + opts, + &attn, + &token, + fcon_bias, + fcon_weight, + ln, + li + ); + + // Export queries, keys, and values. + for(uint x = 0; x < segs; ++x) + state[wi].proj[x][li] = attn.proj[x][li]; +} + +__kernel void +__attribute__((vec_type_hint(float4))) +__attribute__((reqd_work_group_size(192, 1, 1))) +__attribute__((amdgpu_flat_work_group_size(192, 192))) +ircd_gpt_lm_norm(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global ircd_gpt_vectorv *const restrict accum, + __global const ircd_gpt_vectorv *const restrict norm_bias, + __global const ircd_gpt_vectorv *const restrict norm_weight) +{ + const uint + li = get_local_id(0), + ln = get_local_size(0), + wo = get_global_offset(0), + wi = wo / ln + get_group_id(0); + + assume(ln == 192); + assume(wo % ln == 0); + + __local ircd_gpt_vectorv + tmp, token; + + token.elem[li] = accum[wi].elem[li]; + + // Final re-normalization + ircd_gpt_norm(&token, &token, &tmp, norm_bias, norm_weight, ln, li); + + accum[wi].elem[li] = token.elem[li]; +} + +__kernel void +ircd_gpt_lm_logit(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global float *const restrict logit, + __global const ircd_gpt_vectorv *const restrict accum, + __global const ircd_gpt_vectorv *const restrict pos, + __global const ircd_gpt_vectorv *const restrict vocab) +{ + const uint + 
gi = get_global_id(0), + wi = ctrl->count - 1; + + assume(opts->embed_width == 192); + assume(opts->logits <= 65536); + + if(gi >= opts->logits) + { + logit[gi] = -10000.0f; + return; + } + + float acc = 0.0f; + for(uint j = 0; j < opts->embed_width; ++j) + { + const float4 + token = vocab[gi].elem[j], + in = accum[wi].elem[j], + wpe = pos[wi].elem[j], + res = in * token + wpe; + + acc += ircd_simt_reduce_add_f4(res); + } + + logit[gi] = acc; +} + +__kernel void +__attribute__((reqd_work_group_size(256, 1, 1))) +__attribute__((amdgpu_flat_work_group_size(256, 256))) +ircd_gpt_lm_logsm(__global struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global float logit[restrict 65536]) +{ + const uint + li = get_local_id(0), + ln = get_local_size(0), + //wo = get_global_offset(0), + //wi = wo / ln + get_group_id(0), + wn = 50432, + tn = wn / ln, + start = tn * li, + stop = min(start + tn, opts->logits); + + __local float + mu[256], sum[256], lambda[256]; + + __local struct ircd_math_samax + samax; + + assume(ln == 256); + + mu[li] = -10000.0f; + __attribute__((opencl_unroll_hint)) + for(uint ti = start; ti < stop; ++ti) + mu[li] = max(mu[li], logit[ti]); + + ircd_simt_reduce_max_flldr(mu, ln, li); + + if(li == 0) + samax.mu = mu[li]; + + sum[li] = 0.0f; + for(uint ti = start; ti < stop; ++ti) + { + const float + sub = logit[ti] - samax.mu, + res = native_exp(sub); + + sum[li] += res; + } + + ircd_simt_reduce_add_flldr(sum, ln, li); + + if(li == 0) + sum[li] += FLT_EPSILON, + samax.sum = sum[li], + samax.lambda = lambda[li] = 1.0f / sum[li]; + + ircd_simt_broadcast_flldr(lambda, ln, li); + + for(uint ti = start; ti < stop; ++ti) + { + const float + sub = logit[ti] - samax.mu, + res = lambda[li] * native_exp(sub); + + logit[ti] = res; + } +} + +void +ircd_gpt_lm_result_top(__local struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __local const ushort *const restrict idx, + __global const float 
*const restrict logsm, + const uint i) +{ + const ushort + token = idx[i]; + + const float + samax = logsm[token] + FLT_EPSILON; + + ctrl->top[i].token = token; + ctrl->top[i].samax = samax; +} + +void +ircd_gpt_lm_result_label_mean(__local struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __local struct ircd_math_mean *const mean, + const float last) +{ + const uint + div = mean->div + 1, + sum_sel = mean->div % 4; + + const float + sum = mean->sum[0] + mean->sum[1] + mean->sum[2] + mean->sum[3] + last, + res = sum / div; + + mean->sum[sum_sel] += last; + mean->div = div; + mean->last = last; + mean->mean = res; +} + +void +ircd_gpt_lm_result_label(__local struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __local struct ircd_gpt_ctrl_label *const label, + __global const float *const restrict logsm) +{ + const ushort + token = label->logit.token; + + const float + samax = logsm[token] + FLT_EPSILON, + loss = 0.0f - native_log(samax), + ppl = (1.0f - samax) * native_log2(opts->logits); + + label->logit.samax = samax; + ircd_gpt_lm_result_label_mean(ctrl, opts, &label->loss, loss); + ircd_gpt_lm_result_label_mean(ctrl, opts, &label->ppl, ppl); +} + +ushort +ircd_gpt_lm_result_select(__local struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __local const ushort *const restrict idx, + __global const float *const restrict logsm) +{ + const ulong + ent_k = max(opts->top_k, 1U) - 1, + rnd = ircd_simt_rand_xoshiro256pl(ctrl->rand); + + const float + ent_p = min(max(opts->top_p, 0.0f), 1.0f), + thresh = ent_p; + + float acc = 1.0f; + ushort select = 0; + for(; select < ent_k; ++select) + if((acc -= logsm[idx[select]]) < thresh) + break; + + const ushort + token = idx[select]; + + return token; +} + +static ushort +ircd_gpt_lm_result(__local struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __local const ushort *const 
restrict idx, + __global const float *const restrict logsm) +{ + const ushort + token = ircd_gpt_lm_result_select(ctrl, opts, idx, logsm); + + // Update the dynamic result label. + ctrl->select.logit.token = token; + ircd_gpt_lm_result_label(ctrl, opts, &ctrl->select, logsm); + + // Update the dynamic target label. + ctrl->target.logit.token = ctrl->count < ctrl->tokens? + ctrl->token[ctrl->count]: + ctrl->select.logit.token; + + ircd_gpt_lm_result_label(ctrl, opts, &ctrl->target, logsm); + + const bool + hit = ctrl->select.logit.token == ctrl->target.logit.token; + + // Update the token context. + if(ctrl->count == ctrl->tokens) + { + ctrl->token[ctrl->count] = ctrl->select.logit.token; + ctrl->tokens++; + } + + ctrl->miss += !hit; + ctrl->hit += hit; + ctrl->count++; + return token; +} + +static void +ircd_gpt_lm_result_attns(__local struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global const float *const restrict attns, + const uint ln, + const uint li) +{ + const uint + layer = li / opts->layers, + head = li % opts->attn_rank, + base = layer * opts->attn_self_elems; + + uint best = 0; + float bestv = 10000.0f; + for(uint i = 0; i < ctrl->count; ++i) + { + const uint + bx = (((i + 1) * i) / 2) * opts->attn_rank, + idx = base + bx + i * 12 + head; + + if(attns[idx] < bestv) + bestv = attns[idx], + best = i; + } + + ctrl->attn[layer][head] = best; +} + +__kernel void +__attribute__((reqd_work_group_size(256, 1, 1))) +__attribute__((amdgpu_flat_work_group_size(256, 256))) +__attribute__((visibility("protected"))) +ircd_gpt_lm_select(__global struct ircd_gpt_ctrl *const restrict ctrl_, + __constant const struct ircd_gpt_opts *const opts, + __global const float logsm[restrict 65536], + __global const float *const restrict attns) +{ + const uint + li = get_local_id(0), + ln = get_local_size(0), + logits_pad = ln - (opts->logits % ln), + tn = (opts->logits + logits_pad) / ln, + start = tn * li, + stop = min(start + tn, 
opts->logits); + + __local ushort idx[256]; + __local struct ircd_gpt_ctrl ctrl; + __private event_t event[1]; + + assume(ln == 256); + assume(start < stop); + + event[0] = async_work_group_copy + ( + (__local char16 *)&ctrl, + (__global const char16 *)ctrl_, + sizeof(struct ircd_gpt_ctrl) / sizeof(char16), + 0 + ); + + idx[li] = start; + for(uint j = start + 1; j < stop; ++j) + if(logsm[j] > logsm[idx[li]]) + idx[li] = j; + + ircd_simt_sort_idx16_flldr(idx, logsm, ln, li); + wait_group_events(1, event); + + if(ctrl.count >= opts->buffer_tokens) + return; + + if(li < opts->top_n) + ircd_gpt_lm_result_top(&ctrl, opts, idx, logsm, li); + + if(li < opts->labels) + ircd_gpt_lm_result_label(&ctrl, opts, ctrl.label + li, logsm); + + if(li < opts->layers * opts->attn_rank) + ircd_gpt_lm_result_attns(&ctrl, opts, attns, ln, li); + + barrier(CLK_LOCAL_MEM_FENCE); + + if(li == 0) + ircd_gpt_lm_result(&ctrl, opts, idx, logsm); + + barrier(CLK_LOCAL_MEM_FENCE); + + event[0] = async_work_group_copy + ( + (__global char16 *)ctrl_, + (__local const char16 *)&ctrl, + sizeof(struct ircd_gpt_ctrl) / sizeof(char16), + 0 + ); + + wait_group_events(1, event); +} + +__kernel void +__attribute__((visibility("protected"))) +__attribute__((reqd_work_group_size(256, 1, 1))) +__attribute__((amdgpu_flat_work_group_size(256, 256))) +ircd_gpt_leave(__global const void *const restrict model, + __global void *const restrict state, + __global void *const restrict master, + __constant const struct ircd_gpt_opts *const opts, + __global struct ircd_gpt_ctrl *const ctrl_, + __global struct ircd_gpt_ctrl *const frame) +{ + const ushort + li = get_local_id(0), + ln = get_local_size(0); + + assume(ln == 256); + + __local struct ircd_gpt_ctrl _ctrl; + __local struct ircd_gpt_ctrl *const ctrl = &_ctrl; + + if(li == 0) + *ctrl = *ctrl_; + + barrier(CLK_LOCAL_MEM_FENCE); + + if(li == 0 && ctrl->accept < 0) + ircd_gpt_accept(ctrl, opts); + + barrier(CLK_LOCAL_MEM_FENCE); + + const uint + batch_size = 
opts->batch_size, + samps = opts->training_steps + opts->validation_steps + opts->testing_steps, + steps = samps / batch_size; + + const bool + accepting = ctrl->accept >= 0, + cycling = !accepting, + sampling = accepting, + stepping = sampling && (ctrl->clk.samp + 1) >= batch_size, + epoching = stepping && (ctrl->clk.step + 1) >= steps; + + if(li == 0) + ;//ctrl->prof.finished = __builtin_readcyclecounter(); + + if(li == 0) + *frame = *ctrl; + + if(!accepting && li == 0) + { + ctrl->clk.cycle += cycling; + ctrl->clk.samp += sampling; + ctrl->clk.step += stepping; + ctrl->clk.epoch += epoching; + } +} + +void +ircd_gpt_accept(__local struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts) +{ + const bool + unlimited = opts->limit == -1U; + + const uint + batch_size = opts->batch_size, + samps = opts->training_steps + opts->validation_steps + opts->testing_steps, + steps = samps / batch_size; + + const int + limit = min(opts->limit, opts->context_tokens), + cycle_remain = limit - (ctrl->clk.cycle + 1), // cycle not yet incr + token_remain = opts->context_tokens - ctrl->count, // but count already incr + remain_ = min(cycle_remain, token_remain), + accept_ = ircd_gpt_accept_check(ctrl, opts), + accept_abs = abs(accept_), + remain = accept_ < 0 && accept_abs < remain_? accept_abs: remain_, + _accept = accept_ >= 0? accept_: -remain; + + const bool + accepting = _accept >= 0, + dispatching = _accept < 0, + limiting = remain <= 0; + + const int + accept_num = 4, + accept = limiting? accept_num: _accept, + dispatch = accept >= 0? 
0: remain; + + ctrl->accept = accept; + ctrl->dispatch = dispatch; + + ctrl->magic = 0xC7012C70UL; +} + +int +ircd_gpt_accept_check(__local struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts) +{ + int best = 8; + for(uint i = 0; i < 4; ++i) + { + const int + remain = ircd_gpt_accept_match(ctrl, opts, i); + + if(remain == 0) + return i; + + if(remain < best) + best = remain; + } + + return -best; +} + +uint +ircd_gpt_accept_match(__local struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + const uint i) +{ + const uint + len = ircd_gpt_accept_len(ctrl, opts, i), + n = min(ctrl->count, len), + maxlen = 8; + + uint ret = len?: maxlen; + for(uint j = 1; j <= n; ++j) + { + uint match = 0; + for(; match < j; ++match) + { + const uint + accept = opts->accept[i][match], + token = ctrl->token[ctrl->count - j + match]; + + if(token != accept) + break; + } + + if(match >= j) + if(!(ret = len - match)) + break; + } + + ret = max(ret, ctrl->tokens - ctrl->count); + ret = min(ret, maxlen); + return ret; +} + +uint +ircd_gpt_accept_len(__local struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + const uint i) +{ + uint len = 0; + for(; len < 8; ++len) + if(opts->accept[i][len] == (ushort)-1U) + break; + + return len; +} + +// +// backside +// + +void +__attribute__((always_inline)) +ircd_gpt_prop_elem(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global float4 *const restrict param_, + __global float4 *const restrict exp_avg_, + __global float4 *const restrict exp_avg_sqr_) +{ + const uint + li = get_local_id(0), + ts = ctrl->clk.step; + + const float4 + param = param_[li], + grad = ctrl->label[0].loss.mean, + alpha[2] = { 1.0f - opts->beta[0], 1.0f - opts->beta[1], }, + exp_avg = ts? exp_avg_[li]: 0.0f, + exp_avg_sqr = ts? 
exp_avg_sqr_[li]: 0.0f, + exp_avg_mul = exp_avg * opts->beta[0], + exp_avg_dot = exp_avg_mul + alpha[0] * grad, + exp_avg_sqr_mul = exp_avg_sqr * opts->beta[1], + exp_avg_sqr_dot = exp_avg_sqr_mul + alpha[1] * grad * grad, + denom = native_sqrt(exp_avg_sqr_dot) + FLT_EPSILON, + delta = opts->alpha * (exp_avg_dot / denom), + update = param - delta; + + param_[li] = param + FLT_EPSILON; + exp_avg_[li] = exp_avg + FLT_EPSILON; + exp_avg_sqr_[li] = exp_avg_sqr + FLT_EPSILON; + + //param_[li] = update; + //exp_avg_[li] = exp_avg_dot; + //exp_avg_sqr_[li] = exp_avg_sqr_dot; +} + +// +// backpropagations +// + +__kernel void +__attribute__((always_inline)) +//__attribute__((vec_type_hint(float4))) +//__attribute__((reqd_work_group_size(192, 1, 1))) +//__attribute__((amdgpu_flat_work_group_size(192, 192))) +ircd_gpt_norm_prop(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global ircd_gpt_vectorv *const restrict bias, + __global ircd_gpt_vectorv *const restrict bias_m0, + __global ircd_gpt_vectorv *const restrict bias_m1, + __global ircd_gpt_vectorv *const restrict weight, + __global ircd_gpt_vectorv *const restrict weight_m0, + __global ircd_gpt_vectorv *const restrict weight_m1) +{ + ircd_gpt_prop_elem + ( + ctrl, opts, + bias->elem, + bias_m0->elem, + bias_m1->elem + ); + + ircd_gpt_prop_elem + ( + ctrl, opts, + weight->elem, + weight_m0->elem, + weight_m1->elem + ); +} + +__kernel void +//__attribute__((vec_type_hint(float4))) +//__attribute__((reqd_work_group_size(192, 1, 1))) +//__attribute__((amdgpu_flat_work_group_size(192, 192))) +ircd_gpt_coil_prop_attn(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global ircd_gpt_vectorv *const restrict norm_bias, + __global ircd_gpt_vectorv *const restrict norm_bias_m0, + __global ircd_gpt_vectorv *const restrict norm_bias_m1, + __global ircd_gpt_vectorv *const restrict norm_weight, + __global ircd_gpt_vectorv 
*const restrict norm_weight_m0, + __global ircd_gpt_vectorv *const restrict norm_weight_m1, + __global ircd_gpt_attn_aperaturev *const restrict fcon_bias, + __global ircd_gpt_attn_aperaturev *const restrict fcon_bias_m0, + __global ircd_gpt_attn_aperaturev *const restrict fcon_bias_m1, + __global ircd_gpt_attn_aperaturev *const restrict fcon_weight, + __global ircd_gpt_attn_aperaturev *const restrict fcon_weight_m0, + __global ircd_gpt_attn_aperaturev *const restrict fcon_weight_m1, + __global ircd_gpt_vectorv *const restrict proj_bias, + __global ircd_gpt_vectorv *const restrict proj_bias_m0, + __global ircd_gpt_vectorv *const restrict proj_bias_m1, + __global ircd_gpt_vectorv *const restrict proj_weight, + __global ircd_gpt_vectorv *const restrict proj_weight_m0, + __global ircd_gpt_vectorv *const restrict proj_weight_m1) +{ + const uint + fcon_height = opts->embed_elems, + proj_height = opts->embed_elems, + segs = 3; + + ircd_gpt_norm_prop + ( + ctrl, opts, + norm_bias, + norm_bias_m0, + norm_bias_m1, + norm_weight, + norm_weight_m0, + norm_weight_m1 + ); + + for(uint j = 0; j < segs; ++j) + ircd_gpt_prop_elem + ( + ctrl, opts, + fcon_bias->proj[j], + fcon_bias_m0->proj[j], + fcon_bias_m1->proj[j] + ); + + for(uint i = 0; i < fcon_height; ++i) + for(uint j = 0; j < segs; ++j) + ircd_gpt_prop_elem + ( + ctrl, opts, + fcon_weight[i].proj[j], + fcon_weight_m0[i].proj[j], + fcon_weight_m1[i].proj[j] + ); + + ircd_gpt_prop_elem + ( + ctrl, opts, + proj_bias->elem, + proj_bias_m0->elem, + proj_bias_m1->elem + ); + + for(uint i = 0; i < proj_height; ++i) + ircd_gpt_prop_elem + ( + ctrl, opts, + proj_weight[i].elem, + proj_weight_m0[i].elem, + proj_weight_m1[i].elem + ); +} + +__kernel void +//__attribute__((vec_type_hint(float4))) +//__attribute__((reqd_work_group_size(192, 1, 1))) +//__attribute__((amdgpu_flat_work_group_size(192, 192))) +ircd_gpt_coil_prop_ffnn(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + 
__global ircd_gpt_vectorv *const restrict norm_bias, + __global ircd_gpt_vectorv *const restrict norm_bias_m0, + __global ircd_gpt_vectorv *const restrict norm_bias_m1, + __global ircd_gpt_vectorv *const restrict norm_weight, + __global ircd_gpt_vectorv *const restrict norm_weight_m0, + __global ircd_gpt_vectorv *const restrict norm_weight_m1, + __global ircd_gpt_ffnn_aperaturev *const restrict fcon_bias, + __global ircd_gpt_ffnn_aperaturev *const restrict fcon_bias_m0, + __global ircd_gpt_ffnn_aperaturev *const restrict fcon_bias_m1, + __global ircd_gpt_ffnn_aperaturev *const restrict fcon_weight, + __global ircd_gpt_ffnn_aperaturev *const restrict fcon_weight_m0, + __global ircd_gpt_ffnn_aperaturev *const restrict fcon_weight_m1, + __global ircd_gpt_vectorv *const restrict proj_bias, + __global ircd_gpt_vectorv *const restrict proj_bias_m0, + __global ircd_gpt_vectorv *const restrict proj_bias_m1, + __global ircd_gpt_vectorv *const restrict proj_weight, + __global ircd_gpt_vectorv *const restrict proj_weight_m0, + __global ircd_gpt_vectorv *const restrict proj_weight_m1) +{ + const uint + fcon_height = opts->embed_elems, + proj_height = opts->ffnn_elems, + segs = 4; + + ircd_gpt_norm_prop + ( + ctrl, opts, + norm_bias, + norm_bias_m0, + norm_bias_m1, + norm_weight, + norm_weight_m0, + norm_weight_m1 + ); + + for(uint j = 0; j < segs; ++j) + ircd_gpt_prop_elem + ( + ctrl, opts, + fcon_bias->proj[j], + fcon_bias_m0->proj[j], + fcon_bias_m1->proj[j] + ); + + for(uint i = 0; i < fcon_height; ++i) + for(uint j = 0; j < segs; ++j) + ircd_gpt_prop_elem + ( + ctrl, opts, + fcon_weight[i].proj[j], + fcon_weight_m0[i].proj[j], + fcon_weight_m1[i].proj[j] + ); + + ircd_gpt_prop_elem + ( + ctrl, opts, + proj_bias->elem, + proj_bias_m0->elem, + proj_bias_m1->elem + ); + + for(uint i = 0; i < proj_height; ++i) + ircd_gpt_prop_elem + ( + ctrl, opts, + proj_weight[i].elem, + proj_weight_m0[i].elem, + proj_weight_m1[i].elem + ); +} + +__kernel void 
+//__attribute__((vec_type_hint(float4))) +//__attribute__((reqd_work_group_size(192, 1, 1))) +//__attribute__((amdgpu_flat_work_group_size(192, 192))) +ircd_gpt_lm_embed_prop(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global ircd_gpt_vectorv *const restrict pos, + __global ircd_gpt_vectorv *const restrict pos_m0, + __global ircd_gpt_vectorv *const restrict pos_m1, + __global ircd_gpt_vectorv *const restrict token, + __global ircd_gpt_vectorv *const restrict token_m0, + __global ircd_gpt_vectorv *const restrict token_m1) +{ + const uint + ln = get_local_size(0), + wi = get_global_offset(0) / ln + get_group_id(0), + wn = ctrl->count, + cn = opts->context_tokens / wn, + ci = cn * wi, + tn = opts->logits / wn, + ti = tn * wi; + + for(uint i = ci; i < ci + cn; ++i) + ircd_gpt_prop_elem + ( + ctrl, opts, + pos[i].elem, + pos_m0[i].elem, + pos_m1[i].elem + ); + + for(uint i = ti; i < ti + tn; ++i) + ircd_gpt_prop_elem + ( + ctrl, opts, + token[i].elem, + token_m0[i].elem, + token_m1[i].elem + ); } /// Gaussian Error Linear Unit -inline void -__attribute__((always_inline)) -ircd_gpt_ffnn_gelu(__local float4 *const out, - __local const float4 *const in_, +void +ircd_gpt_ffnn_gelu(__local ircd_gpt_ffnn_aperaturev *const out, + __local const ircd_gpt_ffnn_aperaturev *const in_, const uint i) { const float4 - in = in_[i]; + in = in_->fcon[i]; float4 a; a = 0.044715f; @@ -47,966 +1474,29 @@ ircd_gpt_ffnn_gelu(__local float4 *const out, a *= in; a *= 0.5f; - out[i] = a; + out->fcon[i] = a; } -// Matrix * Vector Multiply/Accumulate -inline float4 -__attribute__((flatten, always_inline)) -ircd_gpt_tmul_dot(__local const float4 *const restrict in, - __global const float4 *const restrict bias, - __global const float4 *const restrict weight, - const uint width, - const uint height, - const uint col, - const uint i, - const uint j) +void +ircd_gpt_norm(__local ircd_gpt_vectorv *const out, + __local const ircd_gpt_vectorv 
*const in, + __local ircd_gpt_vectorv *const restrict tmp, + __global const ircd_gpt_vectorv *const restrict bias, + __global const ircd_gpt_vectorv *const restrict weight, + const uint ln, + const uint li) { - const uint - li = get_local_id(0), - ln = get_local_size(0), - lanes = 4; - - float4 - acc = 0.0f; - - for(uint k = 0; k < lanes; ++k) - { - const uint - row = j * lanes + k, - cell = row * width + col; - - acc += in[j][k] * weight[cell]; - } - - return acc; -} - -// Matrix * Vector Multiply/Accumulate -inline void -__attribute__((flatten, always_inline)) -ircd_gpt_tmul(__local float4 *const restrict out, - __local const float4 *const restrict in, - __global const float4 *const restrict bias, - __global const float4 *const restrict weight, - const uint width, - const uint height, - const uint segs) -{ - const uint - li = get_local_id(0), - ln = get_local_size(0); - - __attribute__((opencl_unroll_hint)) - for(uint i = 0; i < segs; ++i) - { - const uint - col = i * ln + li; - - out[col] = bias[col]; - } - - for(uint i = 0; i < segs; ++i) - { - const uint - col = i * ln + li; - - for(uint j = 0; j < height; ++j) - out[col] += ircd_gpt_tmul_dot(in, bias, weight, width, height, col, i, j); - } -} - -inline void -__attribute__((flatten, always_inline)) -ircd_gpt_ffnn_fcon(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __local union ircd_gpt_ffnn_aperaturev *const restrict out, - __local const union ircd_gpt_tokenv *const in, - __global const float4 *const restrict bias, - __global const float4 *const restrict weight) -{ - const uint - li = get_local_id(0), - ln = get_local_size(0), - width = opts->ffnn_width, - height = opts->ffnn_height, - tiles = opts->ffnn_mult; - - ircd_gpt_tmul(out->fcon, in->word, bias, weight, width, height, tiles); - - for(uint i = 0; i < tiles; ++i) - ircd_gpt_ffnn_gelu(out->fcon, out->fcon, i * ln + li); -} - -inline void -__attribute__((flatten, always_inline)) 
-ircd_gpt_ffnn(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __local union ircd_gpt_tokenv *const restrict token, - __local union ircd_gpt_ffnn_aperaturev *const restrict buf, - __local union ircd_gpt_ffnn_aperaturev *const restrict tmp0, - __local union ircd_gpt_tokenv *const restrict tmp1, - __global const float4 *const restrict norm_bias, - __global const float4 *const restrict norm_weight, - __global const float4 *const restrict fcon_bias, - __global const float4 *const restrict fcon_weight, - __global const float4 *const restrict proj_bias, - __global const float4 *const restrict proj_weight) -{ - const uint - li = get_local_id(0), - ln = get_local_size(0), - wi = get_global_offset(0) / ln + get_group_id(0), - width = opts->ffnn_width, - height = opts->ffnn_height; - // Layer re-normalization - ircd_simt_math_norm_f4lldr(token->word, token->word, buf->word); - ircd_gpt_norm_fmad(token->word, token->word, norm_bias, norm_weight, li); - - // ln's writes are still pending but fcon reads results across threads. - barrier(CLK_LOCAL_MEM_FENCE); - - // Fully connected - ircd_gpt_ffnn_fcon(ctrl, opts, buf, token, fcon_bias, fcon_weight); - - // fcon's writes are still pending but proj reads results across threads. 
- barrier(CLK_LOCAL_MEM_FENCE); - - // Projection - ircd_gpt_tmul(token->word, buf->fcon, proj_bias, proj_weight, height, width, 1); + ircd_simt_math_norm_f4lldr(out->elem, in->elem, tmp->elem, ln, li); + ircd_gpt_norm_fmad(out, out, bias, weight, li); } -inline void -__attribute__((flatten, always_inline)) -ircd_gpt_attn_self_samax(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __local float self[][12], - const uint wn) +void +ircd_gpt_norm_fmad(__local ircd_gpt_vectorv *const out, + __local const ircd_gpt_vectorv *const in, + __global const ircd_gpt_vectorv *const restrict bias, + __global const ircd_gpt_vectorv *const restrict weight, + const uint i) { - const uint - gn = get_global_size(0), - li = get_local_id(0), - ln = get_local_size(0); - - struct ircd_math_samax samax = - { - .mu = -10000.0f, - .sum = 0.0f, - }; - - for(uint i = 0; i < wn; ++i) - samax.mu = max(samax.mu, self[i][li]); - - for(uint i = 0; i < wn; ++i) - self[i][li] = exp(self[i][li] - samax.mu); - - __attribute__((opencl_unroll_hint)) - for(uint i = 0; i < wn; ++i) - samax.sum += self[i][li]; - - samax.lambda = 1.0f / samax.sum; - - __attribute__((opencl_unroll_hint)) - for(uint i = 0; i < wn; ++i) - self[i][li] *= samax.lambda; -} - -inline void -__attribute__((flatten, always_inline)) -ircd_gpt_attn_self(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __local union ircd_gpt_tokenv *const restrict out, - __local float self[][12], - __global const struct ircd_gpt_attn_qkvv *const restrict token) -{ - const uint - gi = get_global_id(0), - gn = get_global_size(0), - li = get_local_id(0), - ln = get_local_size(0), - wi = get_global_offset(0) / ln + get_group_id(0), - wn = ctrl->tokens.count, - ti = li % opts->attn_rank, - ki = li / opts->attn_rank, - kn = ln / opts->attn_rank; - - // Low-rank mask - if(li < opts->attn_rank) - { - // For each token - for(uint i = 0; i < wn; ++i) - { - 
// Left-attention mask - if(wi < i) - { - self[i][li] = -10000.0f; - continue; - } - - float4 acc = 0.0f; - __attribute__((opencl_unroll_hint)) - for(uint k = 0; k < kn; ++k) - { - float4 - qry = token[wi].qry.attn[li][k], - key = token[i].key.attn[li][k]; - - acc += qry * key; - } - - const float - sum = ircd_simt_reduce_add_f4(acc), - res = sum / 8.0f; - - self[i][li] = res; - } - - // Three-piece softmax - ircd_gpt_attn_self_samax(ctrl, opts, self, wn); - } - - // Propagate to full width for value dot prod. - barrier(CLK_LOCAL_MEM_FENCE); - - float4 acc = 0.0f; - __attribute__((opencl_unroll_hint)) - for(uint i = 0; i < wi; ++i) - { - const float4 - attn = self[i][ti], - val = token[i].val.attn[ti][ki]; - - acc += attn * val; - } - - out->attn[ti][ki] = acc; -} - -inline void -__attribute__((flatten, always_inline)) -ircd_gpt_attn_proj(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __local union ircd_gpt_tokenv *const out, - __local const union ircd_gpt_tokenv *const xattn, - __global const float4 *const restrict bias, - __global const float4 *const restrict weight) -{ - const uint - ln = get_local_size(0), - wi = get_global_offset(0) / ln + get_group_id(0), - width = opts->attn_height, // same - height = opts->attn_height; - - // Projection - ircd_gpt_tmul(out->word, xattn->word, bias, weight, width, height, 1); -} - -__kernel void -__attribute__((flatten)) -ircd_gpt_coil(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global union ircd_gpt_tokenv *const restrict accum, - __global const struct ircd_gpt_attn_qkvv *const restrict state, - __global const float4 *const restrict attn_proj_bias, - __global const float4 *const restrict attn_proj_weight, - __global const float4 *const restrict ffnn_norm_bias, - __global const float4 *const restrict ffnn_norm_weight, - __global const float4 *const restrict ffnn_fcon_bias, - __global const float4 *const 
restrict ffnn_fcon_weight, - __global const float4 *const restrict ffnn_proj_bias, - __global const float4 *const restrict ffnn_proj_weight) -{ - const uint - li = get_local_id(0), - ln = get_local_size(0), - wi = get_global_offset(0) / ln + get_group_id(0); - - __local union ircd_gpt_tokenv - buf1, buf0; - - __local union - { - union ircd_gpt_ffnn_aperaturev - ffnn_fcon[2]; - - float - attn_self[512][12]; - } - buf; - - // Self-attention backend; this computes the self-attention result now - // that keys and values are globally visible across tokens. - ircd_gpt_attn_self - ( - ctrl, - opts, - &buf1, - buf.attn_self, - state - ); - - barrier(CLK_LOCAL_MEM_FENCE); - - // Project result of self-attention. - ircd_gpt_attn_proj - ( - ctrl, - opts, - &buf0, - &buf1, - attn_proj_bias, - attn_proj_weight - ); - - // Frontend accumulation - { - const float4 - attn = buf0.word[li], - resid = accum[wi].word[li]; - - buf0.word[li] += resid; - accum[wi].word[li] += attn; - } - - // Backend mlp; layer-norm acquires any pending writes, no fence required. 
- ircd_gpt_ffnn - ( - ctrl, - opts, - &buf0, - buf.ffnn_fcon + 0, - buf.ffnn_fcon + 1, - &buf1, - ffnn_norm_bias, - ffnn_norm_weight, - ffnn_fcon_bias, - ffnn_fcon_weight, - ffnn_proj_bias, - ffnn_proj_weight - ); - - // Backend accumulation - { - const float4 - ffnn = buf0.word[li], - resid = accum[wi].word[li], - result = ffnn + resid; - - accum[wi].word[li] = result; - } -} - -__kernel void -__attribute__((flatten)) -ircd_gpt_attn_fcon(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global union ircd_gpt_attn_aperaturev *const restrict state, - __global const union ircd_gpt_tokenv *const restrict accum, - __global const float4 *const restrict norm_bias, - __global const float4 *const restrict norm_weight, - __global const float4 *const restrict fcon_bias, - __global const float4 *const restrict fcon_weight) -{ - const uint - li = get_local_id(0), - ln = get_local_size(0), - wi = get_global_offset(0) / ln + get_group_id(0), - width = opts->attn_width, - height = opts->attn_height, - tiles = opts->attn_mult; - - __local union ircd_gpt_attn_aperaturev - token; - - __local float4 - tmp[768/4]; - - token.word[li] = accum[wi].word[li]; - - // Layer re-normalization - ircd_simt_math_norm_f4lldr(token.word, token.word, tmp); - ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li); - - // Ln's writes are still pending; fcon requires results across threads. - barrier(CLK_LOCAL_MEM_FENCE); - - // Fully connected - ircd_gpt_tmul(token.fcon, tmp, fcon_bias, fcon_weight, width, height, tiles); - - // Export queries, keys, and values. 
- for(uint i = 0; i < tiles; ++i) - state[wi].proj[i][li] = token.proj[i][li]; -} - -// -// frontend -// - -inline void -__attribute__((always_inline)) -_ircd_gpt_lm_embed(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global union ircd_gpt_tokenv *const restrict accum, - __global const union ircd_gpt_tokenv *const restrict pos, - __global const union ircd_gpt_tokenv *const restrict vocab, - const uint out_idx, - const uint tok_idx, - const uint word_idx) -{ - const ushort - ring_idx = (ctrl->tokens.head + tok_idx) % opts->buffer_tokens, - token = ctrl->token[ring_idx]; - - const float4 - wte = vocab[token].word[word_idx], - wpe = pos[tok_idx].word[word_idx]; - - accum[out_idx].word[word_idx] = wte + wpe; -} - -__kernel void -__attribute__((flatten)) -ircd_gpt_lm_embed(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global union ircd_gpt_tokenv *const restrict accum, - __global const union ircd_gpt_tokenv *const restrict pos, - __global const union ircd_gpt_tokenv *const restrict vocab) -{ - const uint - li = get_local_id(0), - ln = get_local_size(0), - wi = get_global_offset(0) / ln + get_group_id(0); - - _ircd_gpt_lm_embed(ctrl, opts, accum, pos, vocab, wi, wi, li); -} - -__kernel void -__attribute__((flatten)) -ircd_gpt_lm_norm(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global union ircd_gpt_tokenv *const restrict accum, - __global const float4 *const restrict norm_bias, - __global const float4 *const restrict norm_weight) -{ - const uint - li = get_local_id(0), - ln = get_local_size(0), - wi = get_global_offset(0) / ln + get_group_id(0); - - __local union ircd_gpt_tokenv - token, tmp; - - token.word[li] = accum[wi].word[li]; - - // Final re-normalization - ircd_simt_math_norm_f4lldr(token.word, token.word, tmp.word); - ircd_gpt_norm_fmad(token.word, token.word, norm_bias, norm_weight, 
li); - - accum[wi].word[li] = token.word[li]; -} - -__kernel void -__attribute__((flatten)) -ircd_gpt_lm_logit(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global float *const restrict logit, - __global const union ircd_gpt_tokenv *const restrict accum, - __global const union ircd_gpt_tokenv *const restrict token) -{ - const uint - gi = get_global_id(0), - ti = ctrl->tokens.count - 1, - words = opts->embed_width; - - float4 acc = 0.0f; - __attribute__((opencl_unroll_hint)) - for(uint j = 0; j < words; ++j) - { - const float4 - in = accum[ti].word[j], - vocab = token[gi].word[j]; - - acc += vocab * in; - } - - const float - ret = ircd_simt_reduce_add_f4(acc); - - if(gi < opts->logits) - logit[gi] = ret; - else - logit[gi] = -10000.0f; -} - -__kernel void -__attribute__((flatten)) -ircd_gpt_lm_logsm(__global struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global float4 *const restrict logsm, - __global float4 *const restrict logexp, - __global const float4 *const restrict logit) -{ - const uint - li = get_local_id(0), - ln = get_local_size(0), - logits = opts->logits, - logits_alignup = logits + (ln - (logits % ln)), - tn = logits_alignup / ln / 4, - ti = tn * li; - - __local float share[256]; - __local float4 share4[256]; - - share4[li] = -10000.0f; - for(uint i = ti; i < ti + tn; ++i) - share4[li] = max(share4[li], logit[i]); - - share[li] = -10000.0f; - for(uint k = 0; k < 4; ++k) - share[li] = max(share[li], share4[li][k]); - - ircd_simt_reduce_max_flldr(share); - - if(li == 0) - share4[li] = ctrl->samax.mu = share[li]; - - ircd_simt_broadcast_f4lldr(share4); - - const float4 - mu = share4[li]; - - share4[li] = 0.0f; - for(uint i = ti; i < ti + tn; ++i) - { - const float4 - reg = logit[i] - mu; - - float4 res; - for(uint k = 0; k < 4; ++k) - if(i * 4 + k < logits) - res[k] = exp(reg[k]); - else - res[k] = 0.0f; - - share4[li] += res; - logexp[i] = res; - } - - 
ircd_simt_reduce_add_f4lldr(share4); - - if(li == 0) - { - const float - sum = ircd_simt_reduce_add_f4(share4[li]); - - share4[li][0] = ctrl->samax.sum = sum; - share4[li][1] = ctrl->samax.lambda = 1.0f / sum; - } - - ircd_simt_broadcast_f4lldr(share4); - - const float4 - sum = share4[li][0], - lambda = share4[li][1]; - - for(uint i = ti; i < ti + tn; ++i) - logsm[i] = logexp[i] * lambda; -} - -inline void -__attribute__((always_inline)) -ircd_gpt_lm_result_top(__global struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __local const ushort *const restrict idx, - __global const float *const restrict logsm, - __global const float *const restrict logit, - const uint i) -{ - const ushort - token = idx[i]; - - const float - samax = logsm[token]; - - ctrl->top[i].token = token; - ctrl->top[i].samax = samax; -} - -inline void -__attribute__((always_inline)) -ircd_gpt_lm_result_label(__global struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __local const ushort *const restrict idx, - __global const float *const restrict logsm, - __global const float *const restrict logit, - const uint i) -{ - __global struct ircd_gpt_ctrl_label - *const label = ctrl->label + i; - - const ushort - token = label->token, - sum_sel = ctrl->epic.cycle % 3; - - const float - samax = logsm[token], - mean_div = ctrl->epic.cycle + 1.0f; - - const float - loss = 0.0f - log(samax), - loss_sum = label->loss.sum[0] + label->loss.sum[1] + label->loss.sum[2] + loss, - loss_mean = loss_sum / mean_div; - - const float - perp = (1.0f - samax) * native_log2(opts->logits), - perp_sum = label->perp.sum[0] + label->perp.sum[1] + label->perp.sum[2] + perp, - perp_mean = perp_sum / mean_div; - - label->samax = samax; - - label->loss.last = loss; - label->loss.sum[sum_sel] += loss; - label->loss.mean = loss_mean; - - label->perp.last = perp; - label->perp.sum[sum_sel] += perp; - label->perp.mean = perp_mean; -} - -inline void 
-__attribute__((always_inline)) -ircd_gpt_lm_result_select(__global struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __local const ushort *const restrict idx, - __global const float *const restrict logsm, - __global const float *const restrict logexp, - __global const float *const restrict logit) -{ - const bool - buffer_full = ctrl->tokens.count >= opts->buffer_tokens; - - const ulong - rnd = ircd_simt_rand_xoshiro256pg(ctrl->rand), - ent_k = max(opts->top_k, 1U), - ent_p = max(1U, min(opts->top_p, 100U)); - - const float - thresh = (rnd % ent_p) / 100.0f; - - ushort select = 0; - float smacc = 0.0f; - for(; select < opts->top_k; ++select) - if((smacc += logsm[idx[select]]) > thresh) - break; - - const ushort - token = idx[select], - dest = (ctrl->tokens.head + ctrl->tokens.count) % opts->buffer_tokens, - tokens = min(ctrl->tokens.count + 1, opts->buffer_tokens), - head = buffer_full? - (ctrl->tokens.head + 1) % opts->buffer_tokens: ctrl->tokens.head; - - ctrl->tokens.head = head; - ctrl->tokens.count = tokens; - ctrl->token[dest] = token; -} - -inline void -__attribute__((always_inline)) -ircd_gpt_leave(__global struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - const uint li) -{ - if(ctrl->epic.cycle + 1 >= opts->limit) - ctrl->epic.epoch += 1; - - ctrl->epic.cycle += 1; - ctrl->magic = 0xC7012C70U; -} - -__kernel void -__attribute__((flatten)) -ircd_gpt_lm_select(__global struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global const float *const restrict logsm, - __global const float *const restrict logit) -{ - const uint - li = get_local_id(0), - ln = get_local_size(0), - tn = opts->logits / ln, - ti = tn * li; - - __local ushort idx[256]; - - idx[li] = ti; - for(uint j = ti + 1; j < ti + tn; ++j) - if(logsm[j] > logsm[idx[li]]) - idx[li] = j; - - ircd_simt_sort_idx16_flldr(idx, logsm); - - if(li < opts->top_n) - ircd_gpt_lm_result_top(ctrl, 
opts, idx, logsm, logexp, logit, li); - - if(li < opts->labels) - ircd_gpt_lm_result_label(ctrl, opts, idx, logsm, logexp, logit, li); - - // Writes to `idx` from the sort are still pending across threads. - barrier(CLK_LOCAL_MEM_FENCE); - - // Mask for write-leader - if(li == 0) - ircd_gpt_lm_result_select(ctrl, opts, idx, logsm, logexp, logit); - - if(li != 0) - return; - - ircd_gpt_leave(ctrl, opts, li); -} - -// -// backpropagations -// - -inline void -__attribute__((always_inline)) -ircd_gpt_prop_elem(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global float4 *const restrict param_, - __global float4 *const restrict exp_avg_, - __global float4 *const restrict exp_avg_sqr_) -{ - const uint - li = get_local_id(0), - step = ctrl->epic.step; - - const float4 - param = param_[li], - grad = ctrl->label[0].loss.mean, - alpha[2] = { 1.0f - opts->beta[0], 1.0f - opts->beta[1], }, - exp_avg = step? exp_avg_[li]: 0.0f, - exp_avg_sqr = step? 
exp_avg_sqr_[li]: 0.0f, - exp_avg_mul = exp_avg * opts->beta[0], - exp_avg_dot = exp_avg_mul + alpha[0] * grad, - exp_avg_sqr_mul = exp_avg_sqr * opts->beta[1], - exp_avg_sqr_dot = exp_avg_sqr_mul + alpha[1] * grad * grad, - denom = sqrt(exp_avg_sqr_dot) + opts->epsilon, - delta = opts->alpha * (exp_avg_dot / denom), - update = param - delta; - - param_[li] = update; - exp_avg_[li] = exp_avg_dot; - exp_avg_sqr_[li] = exp_avg_sqr_dot; -} - -__kernel void -__attribute__((flatten)) -ircd_gpt_norm_prop(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global union ircd_gpt_tokenv *const restrict bias, - __global union ircd_gpt_tokenv *const restrict bias_m0, - __global union ircd_gpt_tokenv *const restrict bias_m1, - __global union ircd_gpt_tokenv *const restrict weight, - __global union ircd_gpt_tokenv *const restrict weight_m0, - __global union ircd_gpt_tokenv *const restrict weight_m1) -{ - ircd_gpt_prop_elem - ( - ctrl, opts, - bias->word, - bias_m0->word, - bias_m1->word - ); - - ircd_gpt_prop_elem - ( - ctrl, opts, - weight->word, - weight_m0->word, - weight_m1->word - ); -} - -__kernel void -__attribute__((flatten)) -ircd_gpt_coil_prop_attn(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global union ircd_gpt_tokenv *const restrict norm_bias, - __global union ircd_gpt_tokenv *const restrict norm_bias_m0, - __global union ircd_gpt_tokenv *const restrict norm_bias_m1, - __global union ircd_gpt_tokenv *const restrict norm_weight, - __global union ircd_gpt_tokenv *const restrict norm_weight_m0, - __global union ircd_gpt_tokenv *const restrict norm_weight_m1, - __global union ircd_gpt_attn_aperaturev *const restrict fcon_bias, - __global union ircd_gpt_attn_aperaturev *const restrict fcon_bias_m0, - __global union ircd_gpt_attn_aperaturev *const restrict fcon_bias_m1, - __global union ircd_gpt_attn_aperaturev *const restrict fcon_weight, - __global union 
ircd_gpt_attn_aperaturev *const restrict fcon_weight_m0, - __global union ircd_gpt_attn_aperaturev *const restrict fcon_weight_m1, - __global union ircd_gpt_tokenv *const restrict proj_bias, - __global union ircd_gpt_tokenv *const restrict proj_bias_m0, - __global union ircd_gpt_tokenv *const restrict proj_bias_m1, - __global union ircd_gpt_tokenv *const restrict proj_weight, - __global union ircd_gpt_tokenv *const restrict proj_weight_m0, - __global union ircd_gpt_tokenv *const restrict proj_weight_m1) -{ - ircd_gpt_norm_prop - ( - ctrl, opts, - norm_bias, - norm_bias_m0, - norm_bias_m1, - norm_weight, - norm_weight_m0, - norm_weight_m1 - ); - - for(uint j = 0; j < 3; ++j) - ircd_gpt_prop_elem - ( - ctrl, opts, - fcon_bias->proj[j], - fcon_bias_m0->proj[j], - fcon_bias_m1->proj[j] - ); - - for(uint i = 0; i < 768; ++i) - for(uint j = 0; j < 3; ++j) - ircd_gpt_prop_elem - ( - ctrl, opts, - fcon_weight[i].proj[j], - fcon_weight_m0[i].proj[j], - fcon_weight_m1[i].proj[j] - ); - - ircd_gpt_prop_elem - ( - ctrl, opts, - proj_bias->word, - proj_bias_m0->word, - proj_bias_m1->word - ); - - for(uint i = 0; i < 768; ++i) - ircd_gpt_prop_elem - ( - ctrl, opts, - proj_weight[i].word, - proj_weight_m0[i].word, - proj_weight_m1[i].word - ); -} - -__kernel void -__attribute__((flatten)) -ircd_gpt_coil_prop_ffnn(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global union ircd_gpt_tokenv *const restrict norm_bias, - __global union ircd_gpt_tokenv *const restrict norm_bias_m0, - __global union ircd_gpt_tokenv *const restrict norm_bias_m1, - __global union ircd_gpt_tokenv *const restrict norm_weight, - __global union ircd_gpt_tokenv *const restrict norm_weight_m0, - __global union ircd_gpt_tokenv *const restrict norm_weight_m1, - __global union ircd_gpt_ffnn_aperaturev *const restrict fcon_bias, - __global union ircd_gpt_ffnn_aperaturev *const restrict fcon_bias_m0, - __global union ircd_gpt_ffnn_aperaturev *const restrict 
fcon_bias_m1, - __global union ircd_gpt_ffnn_aperaturev *const restrict fcon_weight, - __global union ircd_gpt_ffnn_aperaturev *const restrict fcon_weight_m0, - __global union ircd_gpt_ffnn_aperaturev *const restrict fcon_weight_m1, - __global union ircd_gpt_tokenv *const restrict proj_bias, - __global union ircd_gpt_tokenv *const restrict proj_bias_m0, - __global union ircd_gpt_tokenv *const restrict proj_bias_m1, - __global union ircd_gpt_tokenv *const restrict proj_weight, - __global union ircd_gpt_tokenv *const restrict proj_weight_m0, - __global union ircd_gpt_tokenv *const restrict proj_weight_m1) -{ - ircd_gpt_norm_prop - ( - ctrl, opts, - norm_bias, - norm_bias_m0, - norm_bias_m1, - norm_weight, - norm_weight_m0, - norm_weight_m1 - ); - - for(uint j = 0; j < 4; ++j) - ircd_gpt_prop_elem - ( - ctrl, opts, - fcon_bias->proj[j], - fcon_bias_m0->proj[j], - fcon_bias_m1->proj[j] - ); - - for(uint i = 0; i < 768; ++i) - for(uint j = 0; j < 4; ++j) - ircd_gpt_prop_elem - ( - ctrl, opts, - fcon_weight[i].proj[j], - fcon_weight_m0[i].proj[j], - fcon_weight_m1[i].proj[j] - ); - - ircd_gpt_prop_elem - ( - ctrl, opts, - proj_bias->word, - proj_bias_m0->word, - proj_bias_m1->word - ); - - for(uint i = 0; i < 3072; ++i) - ircd_gpt_prop_elem - ( - ctrl, opts, - proj_weight[i].word, - proj_weight_m0[i].word, - proj_weight_m1[i].word - ); -} - -__kernel void -__attribute__((flatten)) -ircd_gpt_lm_embed_prop(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global union ircd_gpt_tokenv *const restrict pos, - __global union ircd_gpt_tokenv *const restrict pos_m0, - __global union ircd_gpt_tokenv *const restrict pos_m1, - __global union ircd_gpt_tokenv *const restrict token, - __global union ircd_gpt_tokenv *const restrict token_m0, - __global union ircd_gpt_tokenv *const restrict token_m1) -{ - const uint - gn = get_global_size(0), - ln = get_local_size(0), - wi = get_global_offset(0) / ln + get_group_id(0), - wn = 
ctrl->tokens.count, - cn = opts->context_tokens / wn, - ci = cn * wi, - tn = opts->logits / wn, - ti = tn * wi; - - for(uint i = ci; i < ci + cn; ++i) - ircd_gpt_prop_elem - ( - ctrl, opts, - pos[i].word, - pos_m0[i].word, - pos_m1[i].word - ); - - for(uint i = ti; i < ti + tn; ++i) - ircd_gpt_prop_elem - ( - ctrl, opts, - token[i].word, - token_m0[i].word, - token_m1[i].word - ); + out->elem[i] = in->elem[i] * weight->elem[i] + bias->elem[i]; } diff --git a/ircd/gpt_model.cc b/ircd/gpt_model.cc index d8d7f9038..65e27a0d0 100644 --- a/ircd/gpt_model.cc +++ b/ircd/gpt_model.cc @@ -54,10 +54,14 @@ namespace ircd::gpt::model static fs::map default_model_shm, default_dataset_shm; - - static std::unique_ptr default_model_res; } +constexpr const char +*const ircd::gpt::model::prop::ended, +*const ircd::gpt::model::prop::id, +*const ircd::gpt::model::prop::length, +*const ircd::gpt::model::prop::text; + decltype(ircd::gpt::model::manifest_h) ircd::gpt::model::manifest_h { @@ -102,7 +106,7 @@ decltype(ircd::gpt::model::cache_hugepage) ircd::gpt::model::cache_hugepage { { "name", "ircd.gpt.model.cache.hugepage" }, - { "default", true }, + { "default", false }, }; decltype(ircd::gpt::model::cache_path) @@ -132,6 +136,12 @@ ircd::gpt::model::path decltype(ircd::gpt::model::default_model) ircd::gpt::model::default_model; +decltype(ircd::gpt::model::default_moment) +ircd::gpt::model::default_moment; + +decltype(ircd::gpt::model::default_checkpoint) +ircd::gpt::model::default_checkpoint; + decltype(ircd::gpt::model::default_dataset) ircd::gpt::model::default_dataset; @@ -144,17 +154,31 @@ ircd::gpt::model::init() if(!model::path) return; - if(!init_from_cache(model::cache_path)) - init_from_json(model::cache_path, model::path); - if(model::dataset_path) init_dataset(model::dataset_path); + + if(likely(init_from_cache(model::cache_path))) + return; + + init_from_json(model::cache_path, model::path); + if(unlikely(!init_from_cache(model::cache_path))) + throw error + { + "Failed 
to find and/or initialize model." + }; } void ircd::gpt::model::fini() noexcept { + default_checkpoint[2] = nullptr; + default_checkpoint[1] = nullptr; + default_checkpoint[0] = nullptr; + + default_moment[1] = nullptr; + default_moment[0] = nullptr; + default_model = nullptr; default_model_shm = {}; @@ -169,18 +193,33 @@ ircd::gpt::model::init_from_cache(const string_view &cache_path) if(!fs::is_reg(cache_path)) return false; - const auto size + const auto file_size { fs::size(cache_path) }; - if(unlikely(size != sizeof(model::decoder))) + const auto decoder_size + { + sizeof(model::decoder) + }; + + const bool has_params + { + file_size >= decoder_size + }; + + const bool has_moments + { + file_size >= decoder_size * 6 + }; + + if(unlikely(!has_params)) throw error { - "Cached model `%s' size %zu differs from %zu.", + "Cached model `%s' size %zu insufficient for decoder size %zu.", cache_path, - size, - sizeof(model::decoder), + file_size, + decoder_size, }; const auto mode @@ -192,20 +231,41 @@ ircd::gpt::model::init_from_cache(const string_view &cache_path) const fs::fd fd { - cache_path, mode + cache_path, fs::fd::opts + { + .mode = mode, + }, }; - fs::map::opts map_opts + const bool map_moments { - mode + has_moments || cache_shared + }; + + if(!has_moments && map_moments) + { + fs::truncate(fd, decoder_size * 6); + fs::allocate(fd, decoder_size * 5, decoder_size); + } + + const auto map_size + { + map_moments? 
+ decoder_size * 6: + decoder_size + }; + + const fs::map::opts map_opts + { + .alignment = alignof(model::decoder), + .shared = bool(cache_shared), + .locked = bool(cache_locked), + .huge2mb = bool(cache_hugepage), }; - map_opts.locked = bool(cache_locked); - map_opts.shared = bool(cache_shared); - map_opts.huge2mb = bool(cache_hugepage); default_model_shm = fs::map { - fd, map_opts, sizeof(decoder) + fd, map_opts, map_size }; default_model = reinterpret_cast @@ -213,13 +273,28 @@ ircd::gpt::model::init_from_cache(const string_view &cache_path) data(default_model_shm) ); + if(map_moments) + { + default_moment[0] = reinterpret_cast(default_model + 1); + default_moment[1] = reinterpret_cast(default_model + 2); + default_checkpoint[0] = reinterpret_cast(default_model + 3); + default_checkpoint[1] = reinterpret_cast(default_model + 4); + default_checkpoint[2] = reinterpret_cast(default_model + 5); + } + + allocator::lock({(const char *)default_model, sizeof(decoder)}); + fs::prefetch(default_model_shm, sizeof(decoder)); + char pbuf[48]; log::info { - log, "model(%p) mapped cached model `%s' %s", + log, "model(%p) mapped cached model `%s' params:%b moments:%b align:%u %s", data(default_model_shm), cache_path, - pretty(pbuf, iec(size)), + has_params, + has_moments, + map_opts.alignment, + pretty(pbuf, iec(map_size)), }; return true; @@ -264,9 +339,6 @@ ircd::gpt::model::init_from_json(const string_view &cache_path, cache_path, stopwatch.pretty(pbuf[1]), }; - - default_model_res = std::move(decoder); - default_model = default_model_res.get(); } void @@ -363,6 +435,7 @@ ircd::gpt::model::init_dataset(const string_view &path) size_t checkpoint(0); default_data.resize(260000); //TODO: XXX + fs::prefetch(default_dataset_shm, size); ircd::tokens(default_dataset, '\n', [&checkpoint] (const string_view &line) { @@ -379,6 +452,7 @@ ircd::gpt::model::init_dataset(const string_view &path) checkpoint, }; + fs::evict(default_dataset_shm, size); return true; } @@ -393,9 +467,9 @@ 
ircd::gpt::model::init_wpe_weight(decoder &d, { size_t j(0); for(const auto &elem : vec) - d.word.pos[i][j++] = lex_cast(elem); + d.embed.pos[i].elem[j++] = lex_cast(elem); - always_assert(j == sizeof(d.word.pos[i]) / sizeof(float)); + always_assert(j == sizeof(d.embed.pos[i]) / sizeof(float)); ++i; } } @@ -411,9 +485,9 @@ ircd::gpt::model::init_wte_weight(decoder &d, { size_t j(0); for(const auto &elem : vec) - d.word.token[i][j++] = lex_cast(elem); + d.embed.token[i].elem[j++] = lex_cast(elem); - always_assert(j == sizeof(d.word.token[i]) / sizeof(float)); + always_assert(j == sizeof(d.embed.token[i]) / sizeof(float)); ++i; } } @@ -426,9 +500,9 @@ ircd::gpt::model::init_f_weight(decoder &d, { size_t i(0); for(const auto &elem : vec) - d.f.weight[i++] = lex_cast(elem); + d.embed.norm.weight.elem[i++] = lex_cast(elem); - always_assert(i == sizeof(d.f.weight) / sizeof(float)); + always_assert(i == sizeof(d.embed.norm.weight) / sizeof(float)); } void @@ -439,9 +513,9 @@ ircd::gpt::model::init_f_bias(decoder &d, { size_t i(0); for(const auto &elem : vec) - d.f.bias[i++] = lex_cast(elem); + d.embed.norm.bias.elem[i++] = lex_cast(elem); - always_assert(i == sizeof(d.f.bias) / sizeof(float)); + always_assert(i == sizeof(d.embed.norm.bias) / sizeof(float)); } void @@ -455,16 +529,16 @@ ircd::gpt::model::init_h_ffnn_fc_weight(decoder &d, { size_t j(0); for(const auto &elem : vec) - d.layer[layer].ffnn.fc_weight[i][j++] = lex_cast(elem); + d.layer[layer].ffnn.fcon_weight[i].fcon[j++] = lex_cast(elem); - always_assert(j == sizeof(d.layer[layer].ffnn.fc_weight[i]) / sizeof(float)); + always_assert(j == sizeof(d.layer[layer].ffnn.fcon_weight[i]) / sizeof(float)); ++i; } always_assert ( - i == sizeof(d.layer[layer].ffnn.fc_weight) - / sizeof(d.layer[layer].ffnn.fc_weight[0]) + i == sizeof(d.layer[layer].ffnn.fcon_weight) + / sizeof(d.layer[layer].ffnn.fcon_weight[0]) ); } @@ -476,9 +550,9 @@ ircd::gpt::model::init_h_ffnn_fc_bias(decoder &d, { size_t i(0); for(const auto &elem : 
vec) - d.layer[layer].ffnn.fc_bias[i++] = lex_cast(elem); + d.layer[layer].ffnn.fcon_bias.fcon[i++] = lex_cast(elem); - always_assert(i == sizeof(d.layer[layer].ffnn.fc_bias) / sizeof(float)); + always_assert(i == sizeof(d.layer[layer].ffnn.fcon_bias) / sizeof(float)); } void @@ -492,7 +566,7 @@ ircd::gpt::model::init_h_ffnn_proj_weight(decoder &d, { size_t j(0); for(const auto &elem : vec) - d.layer[layer].ffnn.proj_weight[i][j++] = lex_cast(elem); + d.layer[layer].ffnn.proj_weight[i].elem[j++] = lex_cast(elem); always_assert(j == sizeof(d.layer[layer].ffnn.proj_weight[i]) / sizeof(float)); ++i; @@ -513,7 +587,7 @@ ircd::gpt::model::init_h_ffnn_proj_bias(decoder &d, { size_t i(0); for(const auto &elem : vec) - d.layer[layer].ffnn.proj_bias[i++] = lex_cast(elem); + d.layer[layer].ffnn.proj_bias.elem[i++] = lex_cast(elem); always_assert(i == sizeof(d.layer[layer].ffnn.proj_bias) / sizeof(float)); } @@ -526,9 +600,9 @@ ircd::gpt::model::init_h_ln_1_weight(decoder &d, { size_t i(0); for(const auto &elem : vec) - d.layer[layer].ln1.weight[i++] = lex_cast(elem); + d.layer[layer].attn.norm.weight.elem[i++] = lex_cast(elem); - always_assert(i == sizeof(d.layer[layer].ln1.weight) / sizeof(float)); + always_assert(i == sizeof(d.layer[layer].attn.norm.weight) / sizeof(float)); } void @@ -539,9 +613,9 @@ ircd::gpt::model::init_h_ln_1_bias(decoder &d, { size_t i(0); for(const auto &elem : vec) - d.layer[layer].ln1.bias[i++] = lex_cast(elem); + d.layer[layer].attn.norm.bias.elem[i++] = lex_cast(elem); - always_assert(i == sizeof(d.layer[layer].ln1.bias) / sizeof(float)); + always_assert(i == sizeof(d.layer[layer].attn.norm.bias) / sizeof(float)); } void @@ -552,9 +626,9 @@ ircd::gpt::model::init_h_ln_2_weight(decoder &d, { size_t i(0); for(const auto &elem : vec) - d.layer[layer].ln2.weight[i++] = lex_cast(elem); + d.layer[layer].ffnn.norm.weight.elem[i++] = lex_cast(elem); - always_assert(i == sizeof(d.layer[layer].ln2.weight) / sizeof(float)); + always_assert(i == 
sizeof(d.layer[layer].ffnn.norm.weight) / sizeof(float)); } void @@ -565,9 +639,9 @@ ircd::gpt::model::init_h_ln_2_bias(decoder &d, { size_t i(0); for(const auto &elem : vec) - d.layer[layer].ln2.bias[i++] = lex_cast(elem); + d.layer[layer].ffnn.norm.bias.elem[i++] = lex_cast(elem); - always_assert(i == sizeof(d.layer[layer].ln2.bias) / sizeof(float)); + always_assert(i == sizeof(d.layer[layer].ffnn.norm.bias) / sizeof(float)); } void @@ -581,16 +655,16 @@ ircd::gpt::model::init_h_attn_attn_weight(decoder &d, { size_t j(0); for(const auto &elem : vec) - d.layer[layer].attn.attn_weight[i][j++] = lex_cast(elem); + d.layer[layer].attn.fcon_weight[i].fcon[j++] = lex_cast(elem); - always_assert(j == sizeof(d.layer[layer].attn.attn_weight[i]) / sizeof(float)); + always_assert(j == sizeof(d.layer[layer].attn.fcon_weight[i]) / sizeof(float)); ++i; } always_assert ( - i == sizeof(d.layer[layer].attn.attn_weight) - / sizeof(d.layer[layer].attn.attn_weight[0]) + i == sizeof(d.layer[layer].attn.fcon_weight) + / sizeof(d.layer[layer].attn.fcon_weight[0]) ); } @@ -602,9 +676,9 @@ ircd::gpt::model::init_h_attn_attn_bias(decoder &d, { size_t i(0); for(const auto &elem : vec) - d.layer[layer].attn.attn_bias[i++] = lex_cast(elem); + d.layer[layer].attn.fcon_bias.fcon[i++] = lex_cast(elem); - always_assert(i == sizeof(d.layer[layer].attn.attn_bias) / sizeof(float)); + always_assert(i == sizeof(d.layer[layer].attn.fcon_bias) / sizeof(float)); } void @@ -618,7 +692,7 @@ ircd::gpt::model::init_h_attn_proj_weight(decoder &d, { size_t j(0); for(const auto &elem : vec) - d.layer[layer].attn.proj_weight[i][j++] = lex_cast(elem); + d.layer[layer].attn.proj_weight[i].elem[j++] = lex_cast(elem); always_assert(j == sizeof(d.layer[layer].attn.proj_weight[i]) / sizeof(float)); ++i; @@ -639,7 +713,7 @@ ircd::gpt::model::init_h_attn_proj_bias(decoder &d, { size_t i(0); for(const auto &elem : vec) - d.layer[layer].attn.proj_bias[i++] = lex_cast(elem); + d.layer[layer].attn.proj_bias.elem[i++] = 
lex_cast(elem); always_assert(i == sizeof(d.layer[layer].attn.proj_bias) / sizeof(float)); } diff --git a/ircd/gpt_pipe.cc b/ircd/gpt_pipe.cc index ebd084f7f..fc1447097 100644 --- a/ircd/gpt_pipe.cc +++ b/ircd/gpt_pipe.cc @@ -10,15 +10,9 @@ namespace ircd::gpt::pipe { - static void profile_dumplog(pipe::exec &); + static void profile_dumplog(pipe::cycle &); - extern conf::item queue_cycles; extern const ircd::run::changed handle_quit; - - static ircd::cl::exec::opts - send_opts_opts, send_ctrl_opts, send_coil_opts, send_head_opts, - anode_opts, negative_opts, positive_opts, cathode_opts, - lmhead_opts, lmamax_opts, backprop_opts, recv_ctrl_opts; } decltype(ircd::gpt::pipe::queue_cycles) @@ -28,62 +22,19 @@ ircd::gpt::pipe::queue_cycles { "default", 1L, }, }; -decltype(ircd::gpt::pipe::default_model) -ircd::gpt::pipe::default_model; - -decltype(ircd::gpt::pipe::default_code) -ircd::gpt::pipe::default_code; - -decltype(ircd::gpt::pipe::default_desc) -ircd::gpt::pipe::default_desc; - decltype(ircd::gpt::pipe::handle_quit) ircd::gpt::pipe::handle_quit { run::level::QUIT, pipe::fini }; +[[gnu::visibility("hidden")]] void ircd::gpt::pipe::init() { - const gpt::model::decoder &default_model - { - *gpt::model::default_model - }; - - assert(!pipe::default_model); - pipe::default_model = new pipe::model - { - default_model, default_model.word - }; - - pipe::default_code = new pipe::code - { - - }; - - pipe::default_desc = new pipe::desc - { - *pipe::default_code, *pipe::default_model - }; - - //XXX - send_ctrl_opts.flush = true; - send_ctrl_opts.nice = 1; - lmamax_opts.flush = true; - lmamax_opts.nice = 2; - recv_ctrl_opts.flush = true; - - log::debug - { - log, "Pipe initialized from model:%p data:%p code:%p desc:%p", - &default_model, - pipe::default_model, - pipe::default_code, - pipe::default_desc, - }; } +[[using gnu: cold, visibility("hidden")]] void ircd::gpt::pipe::fini() noexcept @@ -101,372 +52,409 @@ noexcept }; cl::sync(); - - delete default_desc; default_desc = 
nullptr; - delete default_code; default_code = nullptr; - delete default_model; default_model = nullptr; + ctx::yield(); } // -// pipe +// pipe::prof // -void -ircd::gpt::pipe::generate(task &task) +ircd::string_view +ircd::gpt::pipe::debug(const mutable_buffer &buf, + const prof &p) { - assert(pipe::default_model); - - assert(task.opts); - const auto &opts - { - *task.opts - }; - - assert(task.ctrl); - auto &ctrl - { - *task.ctrl - }; - - ctrl.epic.cycle = 0; - ctrl.epic.host_tsc = prof::cycles(); - - const auto tokens(ctrl.tokens.count); - const auto epoch(ctrl.epic.epoch); - volatile auto cycle(ctrl.epic.cycle); - - std::deque list; - for(; cycle < opts.limit; ++cycle) - { - // When the release/acquire bits are set the control pages are sent - // and received; only set on first and last iterations of this loop. - const bool - rel(cycle == 0), - acq(cycle + 1 >= opts.limit || ctx::interruption_requested()); - - // Enqueue the cycle's commands - list.emplace_back - ( - task, tokens + cycle, rel, acq - ); - - if(ctx::interruption_requested()) - if(acq || termination(ctx::cur())) - break; - - // Enqueue consecutive repetitions of our kernel batch before waiting - // on the first; based on the configuration. XXX get from ircd::cl - if(list.size() <= pipe::queue_cycles) - continue; - - // Profiling branch - if((false)) + window_buffer window(buf); + for(uint i(0); i < p.stages; ++i) + window([&p, &i](auto buf) { - auto &ex(list.front()); - profile_dumplog(ex); - } + size_t ret(0); + ret += consume(buf, size(debug(buf, p, i))); + ret += consume(buf, copy(buf, '\n')); + return ret; + }); - // Destructing the front of the queue waits for completion by yielding - // this ircd::ctx. 
- list.pop_front(); - } - - // Wait for all unfinished - list.clear(); - - assert(ctrl.magic == 0xC7012C70); - assert(ctrl.epic.cycle == cycle || ctx::interruption_requested()); - this_ctx::interruption_point(); + return window.completed(); } -void -ircd::gpt::pipe::profile_dumplog(pipe::exec &exec) +ircd::string_view +ircd::gpt::pipe::debug(const mutable_buffer &buf, + const prof &p, + const size_t &i) { - constexpr size_t coils + using phase = prof::phase; + + assert(i < p.info.size()); + assert(i < p.ts.size()); + + char tbuf[4][32]; + return fmt::sprintf { - sizeof(exec.coil) / sizeof(cl::exec) + buf, "%-20s %04x [ %10s %10s %10s %10s ]", + std::get<0>(p.info[i]), + std::get<1>(p.info[i]), + pretty(tbuf[0], p.ts[i][phase::QUEUE], 1), + pretty(tbuf[1], p.ts[i][phase::SUBMIT], 1), + pretty(tbuf[2], p.ts[i][phase::START], 1), + pretty(tbuf[3], p.ts[i][phase::END], 1), }; +} - for(size_t i(0); i < coils; ++i) +// +// prof::prof +// + +decltype(ircd::gpt::pipe::prof::info) +ircd::gpt::pipe::prof::info; + +decltype(ircd::gpt::pipe::prof::name) +ircd::gpt::pipe::prof::name; + +[[gnu::visibility("hidden")]] +decltype(ircd::gpt::pipe::prof::init) +ircd::gpt::pipe::prof::init; + +ircd::gpt::pipe::prof::prof() +noexcept +{ + for(uint i(0); i < stages; ++i) + for(uint j(0); j < phases; ++j) + ts[i][j] = 0ns; +} + +ircd::gpt::pipe::prof::prof(const cycle &c) +{ + if(!std::exchange(init, true)) + init_info(c); + + if(!cl::profile_queue) + return; + + for(uint i(0); i < stages; ++i) { - exec.coil[i].wait(); - const auto &pro + const cl::work::prof p { - exec.coil[i].profile() + c.stage[i] }; - char tmbuf[4][32] {{0}}; - log::logf - { - log, log::level::DEBUG, - "coil:%-2lu %8s %8s %8s %8s", - i, - util::pretty(tmbuf[0], si(pro[0]), 1), - util::pretty(tmbuf[1], si(pro[1]), 1), - util::pretty(tmbuf[2], si(pro[2]), 1), - util::pretty(tmbuf[3], si(pro[3]), 1), - }; + ts[i][phase::QUEUE] = p[phase::SUBMIT] - p[phase::QUEUE]; + ts[i][phase::SUBMIT] = p[phase::START] - 
p[phase::SUBMIT]; + ts[i][phase::START] = p[phase::END] - p[phase::START]; + ts[i][phase::END] = p[phase::END] - p[phase::QUEUE]; } } +[[gnu::visibility("hidden")]] +void +ircd::gpt::pipe::prof::init_info(const cycle &c) +{ + static_assert + ( + name.size() >= stages + ); + + for(uint i(0); i < stages; ++i) + info[i] = info_type + { + c.stage[i].name(name[i]), + c.stage[i].type(), + }; +} + +/////////////////////////////////////////////////////////////////////////////// // -// pipe::exec +// pipe::cycle // -ircd::gpt::pipe::exec::exec(task &task, - const size_t tokens, - const bool release, - const bool acquire) +const ircd::gpt::ctrl & +ircd::gpt::pipe::acquire(cycle &cycle) +{ + // Some tail stages may not be active each cycle + const auto last_exec + { + std::find_if(std::rbegin(cycle.stage), std::rend(cycle.stage), [] + (const auto &work) + { + return work.handle; + }) + }; + + assert(last_exec != std::rend(cycle.stage)); + last_exec->wait(); + + const auto ctrl + { + reinterpret_cast(cycle.desc.frame[cycle.frame].ptr()) + }; + + assert(ctrl); + assert(ctrl->magic != 0xDEADBEEF); + assert(ctrl->magic == 0xC7012C70UL); + return *ctrl; +} + +// +// pipe::cycle::cycle +// + +ircd::gpt::pipe::cycle::cycle(gpt::samp &samp) :desc { - default_desc + samp.desc } -,send_opts +,tick { - reinterpret_cast(task.opts), - release? - sizeof(gpt::opts): - 0 + samp.cycle } -,send_ctrl +,count { - reinterpret_cast(task.ctrl), - release? - sizeof(gpt::ctrl): - 0 + samp.count } -,send_coil +,tokens { - reinterpret_cast(gpt::model::default_model), - release && desc->model->invalid? - (sizeof(gpt::model::block) * 12 + sizeof(gpt::model::norm)): - 0 + samp.tokens } -,send_head +,cached { - reinterpret_cast(&gpt::model::default_model->word), - release && desc->model->invalid? - sizeof(gpt::model::embed): - 0 + desc.cached } -,recv_ctrl +,frame { - reinterpret_cast(task.ctrl), - acquire? 
- sizeof(gpt::ctrl): - 0 + tick % samp.opts.frames } -,range_full +,range { - { tokens * 192UL, 0, }, - { 192UL, 0, }, + tick, + count, + tokens, + cached, + true, + false, } -,range_last +,stage +{ + cl::exec // data + { + desc.opts, std::memory_order_release + }, + cl::exec // data + { + desc.ctrl, std::memory_order_release + }, + cl::exec // data + { + desc.frame[frame], std::memory_order_release + }, + cl::exec // data + { + desc.model->decode->master[0], std::memory_order_release + }, + cl::exec // Initial kernel + { + desc.alloc, range.alloc, + }, + cl::exec // Initial cycle kernel + { + desc.enter, range.embed, + }, + cl::exec // Compute token and positional embeddings. + { + desc.lm_embed, range.embed, + }, + // Forward Pass + cl::exec { desc.layer[0x00]->attn, range.attn }, + cl::exec { desc.layer[0x00]->ffnn, range.ffnn }, + cl::exec { desc.layer[0x01]->attn, range.attn }, + cl::exec { desc.layer[0x01]->ffnn, range.ffnn }, + cl::exec { desc.layer[0x02]->attn, range.attn }, + cl::exec { desc.layer[0x02]->ffnn, range.ffnn }, + cl::exec { desc.layer[0x03]->attn, range.attn }, + cl::exec { desc.layer[0x03]->ffnn, range.ffnn }, + cl::exec { desc.layer[0x04]->attn, range.attn }, + cl::exec { desc.layer[0x04]->ffnn, range.ffnn }, + cl::exec { desc.layer[0x05]->attn, range.attn }, + cl::exec { desc.layer[0x05]->ffnn, range.ffnn }, + cl::exec { desc.layer[0x06]->attn, range.attn }, + cl::exec { desc.layer[0x06]->ffnn, range.ffnn }, + cl::exec { desc.layer[0x07]->attn, range.attn }, + cl::exec { desc.layer[0x07]->ffnn, range.ffnn }, + cl::exec { desc.layer[0x08]->attn, range.attn }, + cl::exec { desc.layer[0x08]->ffnn, range.ffnn }, + cl::exec { desc.layer[0x09]->attn, range.attn }, + cl::exec { desc.layer[0x09]->ffnn, range.ffnn }, + cl::exec { desc.layer[0x0a]->attn, range.attn }, + cl::exec { desc.layer[0x0a]->ffnn, range.ffnn }, + cl::exec { desc.layer[0x0b]->attn, range.attn }, + cl::exec { desc.layer[0x0b]->ffnn, range.fffnn }, + cl::exec // Final 
normalization. + { + desc.lm_norm, range.fnorm + }, + cl::exec // Compute language logits. + { + desc.lm_logit, range.logit + }, + cl::exec // Statistics on the logits. + { + desc.lm_logsm, range.logsm + }, + cl::exec // Select next token. + { + desc.lm_select, range.select + }, + cl::exec // Backpropagate + { + desc.lm_prop_embed, range.prop_embed + }, + cl::exec // Backpropagate + { + desc.lm_prop_norm, range.prop_norm + }, + // Backward Pass + cl::exec { desc.layer[0x0b]->prop_ffnn, range.prop_ffnn }, + cl::exec { desc.layer[0x0b]->prop_attn, range.prop_attn }, + cl::exec { desc.layer[0x0a]->prop_ffnn, range.prop_ffnn }, + cl::exec { desc.layer[0x0a]->prop_attn, range.prop_attn }, + cl::exec { desc.layer[0x09]->prop_ffnn, range.prop_ffnn }, + cl::exec { desc.layer[0x09]->prop_attn, range.prop_attn }, + cl::exec { desc.layer[0x08]->prop_ffnn, range.prop_ffnn }, + cl::exec { desc.layer[0x08]->prop_attn, range.prop_attn }, + cl::exec { desc.layer[0x07]->prop_ffnn, range.prop_ffnn }, + cl::exec { desc.layer[0x07]->prop_attn, range.prop_attn }, + cl::exec { desc.layer[0x06]->prop_ffnn, range.prop_ffnn }, + cl::exec { desc.layer[0x06]->prop_attn, range.prop_attn }, + cl::exec { desc.layer[0x05]->prop_ffnn, range.prop_ffnn }, + cl::exec { desc.layer[0x05]->prop_attn, range.prop_attn }, + cl::exec { desc.layer[0x04]->prop_ffnn, range.prop_ffnn }, + cl::exec { desc.layer[0x04]->prop_attn, range.prop_attn }, + cl::exec { desc.layer[0x03]->prop_ffnn, range.prop_ffnn }, + cl::exec { desc.layer[0x03]->prop_attn, range.prop_attn }, + cl::exec { desc.layer[0x02]->prop_ffnn, range.prop_ffnn }, + cl::exec { desc.layer[0x02]->prop_attn, range.prop_attn }, + cl::exec { desc.layer[0x01]->prop_ffnn, range.prop_ffnn }, + cl::exec { desc.layer[0x01]->prop_attn, range.prop_attn }, + cl::exec { desc.layer[0x00]->prop_ffnn, range.prop_ffnn }, + cl::exec { desc.layer[0x00]->prop_attn, range.prop_attn }, + cl::exec // Final kernel + { + desc.leave[frame], range.select + }, +} +{ +} + 
+ircd::gpt::pipe::cycle::~cycle() +noexcept +{ +} + +////////////////////////////////////////////////////////////////////////////// +// +// pipe::range +// + +ircd::gpt::pipe::range::range(const uint tick, + const uint count, + const uint tokens, + const uint cached, + const bool fwd, + const bool rev) +noexcept +:_full +{ + { (tokens - cached) * 192UL, 0 }, + { 192UL, 0 }, + { cached * 192UL, 0 }, +} +,_last { { 1 * 192UL, 0 }, { 192UL, 0 }, - { (tokens - 1) * 192UL, 0 }, + { (count - 1) * 192UL, 0 }, } -,range_lm_embed +,alloc { - release? - range_full: - range_last + { (tick == 0) * 192UL, 0 }, + { 192UL, 0 }, } -,range_negative +,embed { - release? - range_full: - range_last + fwd? + _full: + cl::kern::range{}, } -,range_positive +,attn { - release? - range_full: - range_last + fwd? + _full: + cl::kern::range{}, } -,range_lm_norm +,ffnn { - range_last + fwd? + _full: + cl::kern::range{}, } -,range_lm_logit +,fffnn { - { 786 * 64UL, 0 }, // align_up(50257) / 64 - { 64UL, 0 }, + fwd && tokens > count? + _full: + fwd? + _last: + cl::kern::range{}, } -,range_lm_logsm +,fnorm { - { 1 * 256UL, 0 }, - { 256UL, 0 }, + fwd? 
+ _last: + cl::kern::range{}, } -,range_lm_select +,logit // TODO: align_up(50257) / 64|256 { - { 1 * 256UL, 0 }, - { 256UL, 0 }, + { int(fwd) * 50432UL, 0 }, + { 64L, 0 }, } -,release_opts +,logsm { - desc->opts, send_opts, send_opts_opts, + { int(fwd) * 1 * 256UL, 0 }, + { 256UL, 0 }, } -,release_ctrl +,select { - desc->ctrl, send_ctrl, send_ctrl_opts + { int(fwd) * 1 * 256UL, 0 }, + { 256UL, 0 }, } -,release_coil +,prop_embed { - desc->model->decode->master[0], send_coil, send_coil_opts + { int(rev) * 0 * 192UL, 0 }, + { 192UL, 0 }, } -,release_head +,prop_norm { - desc->model->embed->master[0], send_head, send_head_opts + { int(rev) * 0 * 192UL, 0 }, + { 192UL, 0 }, } -,lm_embed +,prop_attn { - desc->lm_embed, range_lm_embed, anode_opts + { int(rev) * 0 * 192UL, 0 }, + { 192UL, 0 }, } -,coil +,prop_ffnn { - { desc->layer[0x00]->negative, range_negative, negative_opts }, - { desc->layer[0x00]->positive, range_positive, positive_opts }, - { desc->layer[0x01]->negative, range_negative, negative_opts }, - { desc->layer[0x01]->positive, range_positive, positive_opts }, - { desc->layer[0x02]->negative, range_negative, negative_opts }, - { desc->layer[0x02]->positive, range_positive, positive_opts }, - { desc->layer[0x03]->negative, range_negative, negative_opts }, - { desc->layer[0x03]->positive, range_positive, positive_opts }, - { desc->layer[0x04]->negative, range_negative, negative_opts }, - { desc->layer[0x04]->positive, range_positive, positive_opts }, - { desc->layer[0x05]->negative, range_negative, negative_opts }, - { desc->layer[0x05]->positive, range_positive, positive_opts }, - { desc->layer[0x06]->negative, range_negative, negative_opts }, - { desc->layer[0x06]->positive, range_positive, positive_opts }, - { desc->layer[0x07]->negative, range_negative, negative_opts }, - { desc->layer[0x07]->positive, range_positive, positive_opts }, - { desc->layer[0x08]->negative, range_negative, negative_opts }, - { desc->layer[0x08]->positive, range_positive, 
positive_opts }, - { desc->layer[0x09]->negative, range_negative, negative_opts }, - { desc->layer[0x09]->positive, range_positive, positive_opts }, - { desc->layer[0x0a]->negative, range_negative, negative_opts }, - { desc->layer[0x0a]->positive, range_positive, positive_opts }, - { desc->layer[0x0b]->negative, range_negative, negative_opts }, - { desc->layer[0x0b]->positive, range_positive, positive_opts }, + { int(rev) * 0 * 192UL, 0 }, + { 192UL, 0 }, } -,lm_norm -{ - desc->lm_norm, range_lm_norm, cathode_opts -} -,lm_logit -{ - desc->lm_logit, range_lm_logit, lmhead_opts -} -,lm_logsm -{ - desc->lm_logsm, range_lm_logsm, lmhead_opts -} -,lm_select -{ - desc->lm_select, range_lm_select, lmamax_opts -} -,acquire_ctrl -{ - desc->ctrl, recv_ctrl, recv_ctrl_opts -} -{ - if(release && desc->model->invalid) - desc->model->invalid = false; -} - -ircd::gpt::pipe::exec::~exec() -noexcept -{ -} - -// -// code -// - -decltype(ircd::gpt::pipe::code::default_path) -ircd::gpt::pipe::code::default_path -{ - { "name", "ircd.gpt.pipe.code.path" }, -}; - -decltype(ircd::gpt::pipe::code::default_opts) -ircd::gpt::pipe::code::default_opts -{ - { "name", "ircd.gpt.pipe.code.opts" }, - { "default", string_view - { - " -cl-strict-aliasing" - " -cl-no-signed-zeros" - " -cl-finite-math-only" - " -cl-unsafe-math-optimizations" - " -cl-fast-relaxed-math" - " -cl-mad-enable" - " -cl-single-precision-constant" - //" -cl-fp32-correctly-rounded-divide-sqrt" - - " -cl-kernel-arg-info" - }} -}; - -ircd::gpt::pipe::code::code() -:cl::code{[] -{ - const string_view code_path - { - default_path - }; - - const fs::fd fd - { - code_path - }; - - const std::string read - { - fs::read(fd) - }; - - const string_view bin - { - read - }; - - const vector_view bins - ( - &bin, 1 - ); - - const auto opts - { - fmt::snstringf - { - 4096, "%s -I%s", - string_view{default_opts}, - string_view{fs::base::include}, - } - }; - - return cl::code - { - bins, opts - }; -}()} -{ -} - -ircd::gpt::pipe::code::~code() 
-noexcept { } +/////////////////////////////////////////////////////////////////////////////// // // pipe::desc // -ircd::gpt::pipe::desc::desc(pipe::code &code, - pipe::model &model) +ircd::gpt::pipe::desc::desc(const gpt::opts *const &opt, + gpt::ctrl *const &ctrl_, + pipe::model &model, + pipe::code &code) :model { &model @@ -475,27 +463,44 @@ ircd::gpt::pipe::desc::desc(pipe::code &code, { &code } -,state +,opts { - 0 - + 12 * 512 * 3 * 768 * sizeof(float), - mutable_buffer{}, + const_buffer + { + reinterpret_cast(opt), + sizeof(gpt::opts) + }, +} +,ctrl +{ + const_buffer + { + reinterpret_cast(ctrl_), + sizeof(gpt::ctrl) + }, } ,master { 0 - + 512 * 768 * sizeof(float) + + opt->layers * opt->context_tokens * opt->attn_elems * sizeof(float) + + opt->context_tokens * opt->embed_elems * sizeof(float) + 65536 * sizeof(float) - + 65536 * sizeof(float) - + 65536 * sizeof(float) - ,mutable_buffer{} + + opt->layers * opt->attn_self_elems * sizeof(float) +} +,state +{ + master, + { + opt->layers * opt->context_tokens * opt->attn_elems * sizeof(float), + off_t(0), + } } ,accum { master, { - 512 * 768 * sizeof(float), - off_t(0), + opt->context_tokens * opt->embed_elems * sizeof(float), + state.offset() + off_t(state.size()), }, } ,logit @@ -506,23 +511,52 @@ ircd::gpt::pipe::desc::desc(pipe::code &code, accum.offset() + off_t(accum.size()), }, } -,logsm +,attns { master, { - 65536 * sizeof(float), - logit.offset() + off_t(logit.size()), - }, + opt->layers * opt->attn_self_elems * sizeof(float), + logit.offset() + off_t(logit.size()) + } } -,ctrl +,frame { - sizeof(gpt::ctrl), - mutable_buffer{} + // size, read, write, }, // idx + { sizeof(gpt::ctrl), true, false, }, // 0 + { sizeof(gpt::ctrl), true, false, }, // 1 + { sizeof(gpt::ctrl), true, false, }, // 2 + { sizeof(gpt::ctrl), true, false, }, // 3 + { sizeof(gpt::ctrl), true, false, }, // 4 + { sizeof(gpt::ctrl), true, false, }, // 5 + { sizeof(gpt::ctrl), true, false, }, // 6 + { sizeof(gpt::ctrl), true, false, }, 
// 7 } -,opts +,alloc { - sizeof(gpt::opts), - const_buffer{} + code, + "ircd_gpt_alloc", + model.decode->master[0], + master, + opts, + ctrl, + frame[0], + frame[1], + frame[2], + frame[3], + frame[4], + frame[5], + frame[6], + frame[7], +} +,enter +{ + code, + "ircd_gpt_enter", + model.decode->master[0], + state, + master, + opts, + ctrl, } ,lm_embed { @@ -531,8 +565,8 @@ ircd::gpt::pipe::desc::desc(pipe::code &code, ctrl, opts, accum, - model.embed->pos.param, - model.embed->token.param, + model.decode->embed.pos.param, + model.decode->embed.token.param, } ,lm_norm { @@ -541,8 +575,8 @@ ircd::gpt::pipe::desc::desc(pipe::code &code, ctrl, opts, accum, - model.decode->norm.bias.param, - model.decode->norm.weight.param, + model.decode->embed.norm.bias.param, + model.decode->embed.norm.weight.param, } ,lm_logit { @@ -552,7 +586,8 @@ ircd::gpt::pipe::desc::desc(pipe::code &code, opts, logit, accum, - model.embed->token.param, + model.decode->embed.pos.param, + model.decode->embed.token.param, } ,lm_logsm { @@ -560,7 +595,6 @@ ircd::gpt::pipe::desc::desc(pipe::code &code, "ircd_gpt_lm_logsm", ctrl, opts, - logsm, logit, } ,lm_select @@ -569,49 +603,132 @@ ircd::gpt::pipe::desc::desc(pipe::code &code, "ircd_gpt_lm_select", ctrl, opts, - logsm, logit, + attns, } -,lm_norm_backprop -{ - code, - "ircd_gpt_norm_prop", - ctrl, - opts, - model.decode->norm.bias.param, - model.decode->norm.bias.moment[0], - model.decode->norm.bias.moment[1], - model.decode->norm.weight.param, - model.decode->norm.weight.moment[0], - model.decode->norm.weight.moment[1], -} -,lm_embed_backprop +,lm_prop_embed { code, "ircd_gpt_lm_embed_prop", ctrl, opts, - model.embed->pos.param, - model.embed->pos.moment[0], - model.embed->pos.moment[1], - model.embed->token.param, - model.embed->token.moment[0], - model.embed->token.moment[1], + model.decode->embed.pos.param, + model.decode->embed.pos.moment[0], + model.decode->embed.pos.moment[1], + model.decode->embed.token.param, + 
model.decode->embed.token.moment[0], + model.decode->embed.token.moment[1], +} +,lm_prop_norm +{ + code, + "ircd_gpt_norm_prop", + ctrl, + opts, + model.decode->embed.norm.bias.param, + model.decode->embed.norm.bias.moment[0], + model.decode->embed.norm.bias.moment[1], + model.decode->embed.norm.weight.param, + model.decode->embed.norm.weight.moment[0], + model.decode->embed.norm.weight.moment[1], +} +,leave +{ + { + code, + "ircd_gpt_leave", + model.decode->master[0], + state, + master, + opts, + ctrl, + frame[0], + }, + { + code, + "ircd_gpt_leave", + model.decode->master[0], + state, + master, + opts, + ctrl, + frame[1], + }, + { + code, + "ircd_gpt_leave", + model.decode->master[0], + state, + master, + opts, + ctrl, + frame[2], + }, + { + code, + "ircd_gpt_leave", + model.decode->master[0], + state, + master, + opts, + ctrl, + frame[3], + }, + { + code, + "ircd_gpt_leave", + model.decode->master[0], + state, + master, + opts, + ctrl, + frame[4], + }, + { + code, + "ircd_gpt_leave", + model.decode->master[0], + state, + master, + opts, + ctrl, + frame[5], + }, + { + code, + "ircd_gpt_leave", + model.decode->master[0], + state, + master, + opts, + ctrl, + frame[6], + }, + { + code, + "ircd_gpt_leave", + model.decode->master[0], + state, + master, + opts, + ctrl, + frame[7], + }, } ,layer { - std::make_unique(*this, 0x00), - std::make_unique(*this, 0x01), - std::make_unique(*this, 0x02), - std::make_unique(*this, 0x03), - std::make_unique(*this, 0x04), - std::make_unique(*this, 0x05), - std::make_unique(*this, 0x06), - std::make_unique(*this, 0x07), - std::make_unique(*this, 0x08), - std::make_unique(*this, 0x09), - std::make_unique(*this, 0x0a), - std::make_unique(*this, 0x0b), + std::make_unique(*this, opt, 0x00), + std::make_unique(*this, opt, 0x01), + std::make_unique(*this, opt, 0x02), + std::make_unique(*this, opt, 0x03), + std::make_unique(*this, opt, 0x04), + std::make_unique(*this, opt, 0x05), + std::make_unique(*this, opt, 0x06), + 
std::make_unique(*this, opt, 0x07), + std::make_unique(*this, opt, 0x08), + std::make_unique(*this, opt, 0x09), + std::make_unique(*this, opt, 0x0a), + std::make_unique(*this, opt, 0x0b), } { } @@ -621,94 +738,106 @@ ircd::gpt::pipe::desc::desc(pipe::code &code, // ircd::gpt::pipe::desc::layer::layer(pipe::desc &desc, - const int laynum) + const gpt::opts *const &opts, + const uint laynum) :state { desc.state, { - 512 * 3 * 768 * sizeof(float), - laynum * 512 * 3 * 768 * sizeof(float), + opts->context_tokens * opts->attn_elems * sizeof(float), + laynum * opts->context_tokens * opts->attn_elems * sizeof(float), } } -,negative +,attns +{ + desc.attns, + { + opts->attn_self_elems * sizeof(float), + laynum * opts->attn_self_elems * sizeof(float), + } +} +,attn { *desc.code, "ircd_gpt_attn_fcon", desc.ctrl, desc.opts, + laynum, state, desc.accum, - desc.model->decode->block[laynum].attn.norm.bias.param, - desc.model->decode->block[laynum].attn.norm.weight.param, - desc.model->decode->block[laynum].attn.fcon.bias.param, - desc.model->decode->block[laynum].attn.fcon.weight.param, + desc.model->decode->layer[laynum].attn.norm.bias.param, + desc.model->decode->layer[laynum].attn.norm.weight.param, + desc.model->decode->layer[laynum].attn.fcon.bias.param, + desc.model->decode->layer[laynum].attn.fcon.weight.param, } -,positive +,ffnn { *desc.code, "ircd_gpt_coil", desc.ctrl, desc.opts, + laynum, desc.accum, + attns, state, - desc.model->decode->block[laynum].attn.proj.bias.param, - desc.model->decode->block[laynum].attn.proj.weight.param, - desc.model->decode->block[laynum].ffnn.norm.bias.param, - desc.model->decode->block[laynum].ffnn.norm.weight.param, - desc.model->decode->block[laynum].ffnn.fcon.bias.param, - desc.model->decode->block[laynum].ffnn.fcon.weight.param, - desc.model->decode->block[laynum].ffnn.proj.bias.param, - desc.model->decode->block[laynum].ffnn.proj.weight.param, + desc.model->decode->layer[laynum].attn.proj.bias.param, + 
desc.model->decode->layer[laynum].attn.proj.weight.param, + desc.model->decode->layer[laynum].ffnn.norm.bias.param, + desc.model->decode->layer[laynum].ffnn.norm.weight.param, + desc.model->decode->layer[laynum].ffnn.fcon.bias.param, + desc.model->decode->layer[laynum].ffnn.fcon.weight.param, + desc.model->decode->layer[laynum].ffnn.proj.bias.param, + desc.model->decode->layer[laynum].ffnn.proj.weight.param, } -,backattn +,prop_attn { *desc.code, "ircd_gpt_coil_prop_attn", desc.ctrl, desc.opts, - desc.model->decode->block[laynum].attn.norm.bias.param, - desc.model->decode->block[laynum].attn.norm.bias.moment[0], - desc.model->decode->block[laynum].attn.norm.bias.moment[1], - desc.model->decode->block[laynum].attn.norm.weight.param, - desc.model->decode->block[laynum].attn.norm.weight.moment[0], - desc.model->decode->block[laynum].attn.norm.weight.moment[1], - desc.model->decode->block[laynum].attn.fcon.bias.param, - desc.model->decode->block[laynum].attn.fcon.bias.moment[0], - desc.model->decode->block[laynum].attn.fcon.bias.moment[1], - desc.model->decode->block[laynum].attn.fcon.weight.param, - desc.model->decode->block[laynum].attn.fcon.weight.moment[0], - desc.model->decode->block[laynum].attn.fcon.weight.moment[1], - desc.model->decode->block[laynum].attn.proj.bias.param, - desc.model->decode->block[laynum].attn.proj.bias.moment[0], - desc.model->decode->block[laynum].attn.proj.bias.moment[1], - desc.model->decode->block[laynum].attn.proj.weight.param, - desc.model->decode->block[laynum].attn.proj.weight.moment[0], - desc.model->decode->block[laynum].attn.proj.weight.moment[1], + desc.model->decode->layer[laynum].attn.norm.bias.param, + desc.model->decode->layer[laynum].attn.norm.bias.moment[0], + desc.model->decode->layer[laynum].attn.norm.bias.moment[1], + desc.model->decode->layer[laynum].attn.norm.weight.param, + desc.model->decode->layer[laynum].attn.norm.weight.moment[0], + desc.model->decode->layer[laynum].attn.norm.weight.moment[1], + 
desc.model->decode->layer[laynum].attn.fcon.bias.param, + desc.model->decode->layer[laynum].attn.fcon.bias.moment[0], + desc.model->decode->layer[laynum].attn.fcon.bias.moment[1], + desc.model->decode->layer[laynum].attn.fcon.weight.param, + desc.model->decode->layer[laynum].attn.fcon.weight.moment[0], + desc.model->decode->layer[laynum].attn.fcon.weight.moment[1], + desc.model->decode->layer[laynum].attn.proj.bias.param, + desc.model->decode->layer[laynum].attn.proj.bias.moment[0], + desc.model->decode->layer[laynum].attn.proj.bias.moment[1], + desc.model->decode->layer[laynum].attn.proj.weight.param, + desc.model->decode->layer[laynum].attn.proj.weight.moment[0], + desc.model->decode->layer[laynum].attn.proj.weight.moment[1], } -,backffnn +,prop_ffnn { *desc.code, "ircd_gpt_coil_prop_ffnn", desc.ctrl, desc.opts, - desc.model->decode->block[laynum].ffnn.norm.bias.param, - desc.model->decode->block[laynum].ffnn.norm.bias.moment[0], - desc.model->decode->block[laynum].ffnn.norm.bias.moment[1], - desc.model->decode->block[laynum].ffnn.norm.weight.param, - desc.model->decode->block[laynum].ffnn.norm.weight.moment[0], - desc.model->decode->block[laynum].ffnn.norm.weight.moment[1], - desc.model->decode->block[laynum].ffnn.fcon.bias.param, - desc.model->decode->block[laynum].ffnn.fcon.bias.moment[0], - desc.model->decode->block[laynum].ffnn.fcon.bias.moment[1], - desc.model->decode->block[laynum].ffnn.fcon.weight.param, - desc.model->decode->block[laynum].ffnn.fcon.weight.moment[0], - desc.model->decode->block[laynum].ffnn.fcon.weight.moment[1], - desc.model->decode->block[laynum].ffnn.proj.bias.param, - desc.model->decode->block[laynum].ffnn.proj.bias.moment[0], - desc.model->decode->block[laynum].ffnn.proj.bias.moment[1], - desc.model->decode->block[laynum].ffnn.proj.weight.param, - desc.model->decode->block[laynum].ffnn.proj.weight.moment[0], - desc.model->decode->block[laynum].ffnn.proj.weight.moment[1], + desc.model->decode->layer[laynum].ffnn.norm.bias.param, + 
desc.model->decode->layer[laynum].ffnn.norm.bias.moment[0], + desc.model->decode->layer[laynum].ffnn.norm.bias.moment[1], + desc.model->decode->layer[laynum].ffnn.norm.weight.param, + desc.model->decode->layer[laynum].ffnn.norm.weight.moment[0], + desc.model->decode->layer[laynum].ffnn.norm.weight.moment[1], + desc.model->decode->layer[laynum].ffnn.fcon.bias.param, + desc.model->decode->layer[laynum].ffnn.fcon.bias.moment[0], + desc.model->decode->layer[laynum].ffnn.fcon.bias.moment[1], + desc.model->decode->layer[laynum].ffnn.fcon.weight.param, + desc.model->decode->layer[laynum].ffnn.fcon.weight.moment[0], + desc.model->decode->layer[laynum].ffnn.fcon.weight.moment[1], + desc.model->decode->layer[laynum].ffnn.proj.bias.param, + desc.model->decode->layer[laynum].ffnn.proj.bias.moment[0], + desc.model->decode->layer[laynum].ffnn.proj.bias.moment[1], + desc.model->decode->layer[laynum].ffnn.proj.weight.param, + desc.model->decode->layer[laynum].ffnn.proj.weight.moment[0], + desc.model->decode->layer[laynum].ffnn.proj.weight.moment[1], } { } @@ -722,29 +851,31 @@ ircd::gpt::pipe::desc::layer::layer(pipe::desc &desc, // pipe::model::model // -ircd::gpt::pipe::model::model(gpt::model::decoder &decoder, - gpt::model::embed &embed) -:decode +ircd::gpt::pipe::model::model(gpt::model::decoder &decoder) +:decode_const +{ + std::addressof(decoder) +} +,decode_mutable +{ + std::addressof(decoder) +} +,decode { std::make_unique(decoder) } -,embed -{ - std::make_unique(embed) -} { } -ircd::gpt::pipe::model::model(const gpt::model::decoder &decoder, - const gpt::model::embed &embed) -:decode +ircd::gpt::pipe::model::model(const gpt::model::decoder &decoder) +:decode_const +{ + std::addressof(decoder) +} +,decode { std::make_unique(decoder) } -,embed -{ - std::make_unique(embed) -} { } @@ -762,26 +893,30 @@ ircd::gpt::pipe::model::decoder::decoder(gpt::model::decoder &decoder) { // params { - sizeof(gpt::model::block) * 12 + sizeof(gpt::model::norm), mutable_buffer + 
mutable_buffer { - reinterpret_cast(decoder.layer), - sizeof(decoder.layer) + sizeof(decoder.f) + reinterpret_cast(&decoder) + sizeof(gpt::model::decoder) * 0, + sizeof(gpt::model::decoder) } }, - // first moment { - sizeof(gpt::model::block) * 12 + sizeof(gpt::model::norm), - mutable_buffer{} + mutable_buffer + { + reinterpret_cast(&decoder) + sizeof(gpt::model::decoder) * 1, + sizeof(gpt::model::decoder) + } }, - // second moment { - sizeof(gpt::model::block) * 12 + sizeof(gpt::model::norm), - mutable_buffer{} + mutable_buffer + { + reinterpret_cast(&decoder) + sizeof(gpt::model::decoder) * 2, + sizeof(gpt::model::decoder) + } }, } -,block +,layer { { master, sizeof(gpt::model::block) * 0x00, decoder.layer[0x00], 0x00, }, { master, sizeof(gpt::model::block) * 0x01, decoder.layer[0x01], 0x01, }, @@ -796,12 +931,11 @@ ircd::gpt::pipe::model::decoder::decoder(gpt::model::decoder &decoder) { master, sizeof(gpt::model::block) * 0x0a, decoder.layer[0x0a], 0x0a, }, { master, sizeof(gpt::model::block) * 0x0b, decoder.layer[0x0b], 0x0b, }, } -,norm +,embed { master, - off_t(sizeof(gpt::model::block) * 12), - mutable_buffer{decoder.f.bias}, - mutable_buffer{decoder.f.weight}, + off_t(offsetof(gpt::model::decoder, embed)), + decoder.embed, } { } @@ -811,34 +945,33 @@ ircd::gpt::pipe::model::decoder::decoder(const gpt::model::decoder &decoder) { // params { - sizeof(gpt::model::block) * 12 + sizeof(gpt::model::norm), const_buffer + const_buffer { - reinterpret_cast(decoder.layer), - sizeof(decoder.layer) + sizeof(decoder.f) + reinterpret_cast(&decoder), + sizeof(gpt::model::decoder) } }, } -,block +,layer { - { master, sizeof(gpt::model::block) * 0x00, decoder.layer[0x00], 0x00, }, - { master, sizeof(gpt::model::block) * 0x01, decoder.layer[0x01], 0x01, }, - { master, sizeof(gpt::model::block) * 0x02, decoder.layer[0x02], 0x02, }, - { master, sizeof(gpt::model::block) * 0x03, decoder.layer[0x03], 0x03, }, - { master, sizeof(gpt::model::block) * 0x04, decoder.layer[0x04], 
0x04, }, - { master, sizeof(gpt::model::block) * 0x05, decoder.layer[0x05], 0x05, }, - { master, sizeof(gpt::model::block) * 0x06, decoder.layer[0x06], 0x06, }, - { master, sizeof(gpt::model::block) * 0x07, decoder.layer[0x07], 0x07, }, - { master, sizeof(gpt::model::block) * 0x08, decoder.layer[0x08], 0x08, }, - { master, sizeof(gpt::model::block) * 0x09, decoder.layer[0x09], 0x09, }, - { master, sizeof(gpt::model::block) * 0x0a, decoder.layer[0x0a], 0x0a, }, - { master, sizeof(gpt::model::block) * 0x0b, decoder.layer[0x0b], 0x0b, }, + { master, off_t(offsetof(gpt::model::decoder, layer[0x00])), decoder.layer[0x00], 0x00, }, + { master, off_t(offsetof(gpt::model::decoder, layer[0x01])), decoder.layer[0x01], 0x01, }, + { master, off_t(offsetof(gpt::model::decoder, layer[0x02])), decoder.layer[0x02], 0x02, }, + { master, off_t(offsetof(gpt::model::decoder, layer[0x03])), decoder.layer[0x03], 0x03, }, + { master, off_t(offsetof(gpt::model::decoder, layer[0x04])), decoder.layer[0x04], 0x04, }, + { master, off_t(offsetof(gpt::model::decoder, layer[0x05])), decoder.layer[0x05], 0x05, }, + { master, off_t(offsetof(gpt::model::decoder, layer[0x06])), decoder.layer[0x06], 0x06, }, + { master, off_t(offsetof(gpt::model::decoder, layer[0x07])), decoder.layer[0x07], 0x07, }, + { master, off_t(offsetof(gpt::model::decoder, layer[0x08])), decoder.layer[0x08], 0x08, }, + { master, off_t(offsetof(gpt::model::decoder, layer[0x09])), decoder.layer[0x09], 0x09, }, + { master, off_t(offsetof(gpt::model::decoder, layer[0x0a])), decoder.layer[0x0a], 0x0a, }, + { master, off_t(offsetof(gpt::model::decoder, layer[0x0b])), decoder.layer[0x0b], 0x0b, }, } -,norm +,embed { master, - off_t(sizeof(gpt::model::block) * 12), - const_buffer{decoder.f.bias}, - const_buffer{decoder.f.weight}, + off_t(offsetof(gpt::model::decoder, embed)), + decoder.embed, } { } @@ -849,171 +982,65 @@ noexcept } // -// pipe::model::language +// pipe::model::embed // 
-ircd::gpt::pipe::model::language::language(gpt::model::embed &embed) -:master +ircd::gpt::pipe::model::embed::embed(cl::data *const master, + const off_t offset, + gpt::model::embed &embed) +:norm { - // params - { - sizeof(embed), mutable_buffer - { - reinterpret_cast(&embed), - sizeof(embed), - } - }, - - // first moment - { - sizeof(embed), mutable_buffer{}, - }, - - // second moment - { - sizeof(embed), mutable_buffer{}, - }, + master, + offset + off_t(offsetof(gpt::model::embed, norm)) + off_t(offsetof(gpt::model::norm, bias)), + mutable_buffer{embed.norm.bias.elem}, + offset + off_t(offsetof(gpt::model::embed, norm)) + off_t(offsetof(gpt::model::norm, weight)), + mutable_buffer{embed.norm.weight.elem}, } ,pos { - master, 0, mutable_buffer{embed.pos} + master, + offset + off_t(offsetof(gpt::model::embed, pos)), + mutable_buffer{embed.pos} } ,token { - master, sizeof(embed.pos), mutable_buffer{embed.token} + master, + offset + off_t(offsetof(gpt::model::embed, token)), + mutable_buffer{embed.token} } { } -ircd::gpt::pipe::model::language::language(const gpt::model::embed &embed) -:master +ircd::gpt::pipe::model::embed::embed(cl::data *const master, + const off_t offset, + const gpt::model::embed &embed) +:norm { - { - sizeof(embed), const_buffer - { - reinterpret_cast(&embed), - sizeof(embed), - } - }, + master, + offset + off_t(offsetof(gpt::model::embed, norm)) + off_t(offsetof(gpt::model::norm, bias)), + const_buffer{embed.norm.bias.elem}, + offset + off_t(offsetof(gpt::model::embed, norm)) + off_t(offsetof(gpt::model::norm, weight)), + const_buffer{embed.norm.weight.elem}, } ,pos { - master, 0, const_buffer{embed.pos} + master, + offset + off_t(offsetof(gpt::model::embed, pos)), + const_buffer{embed.pos} } ,token { - master, sizeof(embed.pos), const_buffer{embed.token} + master, + offset + off_t(offsetof(gpt::model::embed, token)), + const_buffer{embed.token} } { } -ircd::gpt::pipe::model::language::language(cl::data *const master, - const off_t offset, - 
gpt::model::embed &embed) -:pos -{ - master, offset, mutable_buffer{embed.pos} -} -,token -{ - master, offset + off_t(sizeof(embed.pos)), mutable_buffer{embed.token} -} -{ -} - -ircd::gpt::pipe::model::language::language(cl::data *const master, - const off_t offset, - const gpt::model::embed &embed) -:pos -{ - master, offset, const_buffer{embed.pos} -} -,token -{ - master, offset + off_t(sizeof(embed.pos)), const_buffer{embed.token} -} -{ -} - -ircd::gpt::pipe::model::language::~language() -noexcept -{ -} - // // pipe::model::block // -ircd::gpt::pipe::model::block::block(gpt::model::block &block, - const size_t layer) -:master -{ - // params - { - sizeof(block), mutable_buffer - { - reinterpret_cast(&block), sizeof(block) - } - }, - - // first moment - { - sizeof(block), - mutable_buffer{} - }, - - // second moment - { - sizeof(block), - mutable_buffer{} - }, -} -,attn -{ - master, - 0, - block.ln1, - block.attn, -} -,ffnn -{ - master, - off_t(sizeof(block.ln1) + sizeof(block.attn)), - block.ln2, - block.ffnn, -} -{ -} - -ircd::gpt::pipe::model::block::block(const gpt::model::block &block, - const size_t layer) -:master -{ - // params - { - sizeof(block), const_buffer - { - reinterpret_cast(&block), sizeof(block) - } - } -} -,attn -{ - master, - 0, - block.ln1, - block.attn, -} -,ffnn -{ - master, - off_t(sizeof(block.ln1) + sizeof(block.attn)), - block.ln2, - block.ffnn, -} -{ -} - ircd::gpt::pipe::model::block::block(cl::data *const master, const off_t offset, gpt::model::block &block, @@ -1021,15 +1048,13 @@ ircd::gpt::pipe::model::block::block(cl::data *const master, :attn { master, - offset, - block.ln1, + offset + off_t(offsetof(gpt::model::block, attn)), block.attn, } ,ffnn { master, - offset + off_t(sizeof(block.ln1) + sizeof(block.attn)), - block.ln2, + offset + off_t(offsetof(gpt::model::block, ffnn)), block.ffnn, } { @@ -1042,15 +1067,13 @@ ircd::gpt::pipe::model::block::block(cl::data *const master, :attn { master, - offset, - block.ln1, + offset + 
off_t(offsetof(gpt::model::block, attn)), block.attn, } ,ffnn { master, - offset + off_t(sizeof(block.ln1) + sizeof(block.attn)), - block.ln2, + offset + off_t(offsetof(gpt::model::block, ffnn)), block.ffnn, } { @@ -1062,78 +1085,62 @@ ircd::gpt::pipe::model::block::block(cl::data *const master, ircd::gpt::pipe::model::ffnn::ffnn(cl::data *const master, const off_t offset, - gpt::model::norm &norm, gpt::model::ffnn &ffnn) :norm { master, - offset, - mutable_buffer{norm.bias}, - mutable_buffer{norm.weight}, + offset + off_t(offsetof(gpt::model::ffnn, norm)) + off_t(offsetof(gpt::model::norm, bias)), + mutable_buffer{ffnn.norm.bias.elem}, + offset + off_t(offsetof(gpt::model::ffnn, norm)) + off_t(offsetof(gpt::model::norm, weight)), + mutable_buffer{ffnn.norm.weight.elem}, } ,fcon { master, - offset + off_t(sizeof(norm)), - mutable_buffer{ffnn.fc_bias}, - mutable_buffer{ffnn.fc_weight}, + offset + off_t(offsetof(gpt::model::ffnn, fcon_bias)), + mutable_buffer{ffnn.fcon_bias.fcon}, + offset + off_t(offsetof(gpt::model::ffnn, fcon_weight)), + mutable_buffer{ffnn.fcon_weight}, } ,proj { master, - offset + off_t(sizeof(norm) + sizeof(ffnn.fc_bias) + sizeof(ffnn.fc_weight)), - mutable_buffer{ffnn.proj_bias}, + offset + off_t(offsetof(gpt::model::ffnn, proj_bias)), + mutable_buffer{ffnn.proj_bias.elem}, + offset + off_t(offsetof(gpt::model::ffnn, proj_weight)), mutable_buffer{ffnn.proj_weight}, } { - always_assert - ( - ircd::data(const_buffer{ffnn.proj_weight}) - == - ircd::data(const_buffer{norm.bias}) + - sizeof(norm) + - sizeof(ffnn.fc_bias) + - sizeof(ffnn.fc_weight) + - ircd::size(const_buffer{ffnn.proj_bias}) - ); } ircd::gpt::pipe::model::ffnn::ffnn(cl::data *const master, const off_t offset, - const gpt::model::norm &norm, const gpt::model::ffnn &ffnn) :norm { master, - offset, - const_buffer{norm.bias}, - const_buffer{norm.weight}, + offset + off_t(offsetof(gpt::model::ffnn, norm)) + off_t(offsetof(gpt::model::norm, bias)), + const_buffer{ffnn.norm.bias.elem}, + 
offset + off_t(offsetof(gpt::model::ffnn, norm)) + off_t(offsetof(gpt::model::norm, weight)), + const_buffer{ffnn.norm.weight.elem}, } ,fcon { master, - offset + off_t(sizeof(norm)), - const_buffer{ffnn.fc_bias}, - const_buffer{ffnn.fc_weight}, + offset + off_t(offsetof(gpt::model::ffnn, fcon_bias)), + const_buffer{ffnn.fcon_bias.fcon}, + offset + off_t(offsetof(gpt::model::ffnn, fcon_weight)), + const_buffer{ffnn.fcon_weight}, } ,proj { master, - offset + off_t(sizeof(norm) + sizeof(ffnn.fc_bias) + sizeof(ffnn.fc_weight)), - const_buffer{ffnn.proj_bias}, + offset + off_t(offsetof(gpt::model::ffnn, proj_bias)), + const_buffer{ffnn.proj_bias.elem}, + offset + off_t(offsetof(gpt::model::ffnn, proj_weight)), const_buffer{ffnn.proj_weight}, } { - always_assert - ( - ircd::data(const_buffer{ffnn.proj_weight}) - == - ircd::data(const_buffer{norm.bias}) + - sizeof(norm) + - sizeof(ffnn.fc_bias) + - sizeof(ffnn.fc_weight) + - ircd::size(const_buffer{ffnn.proj_bias}) - ); } // @@ -1142,78 +1149,62 @@ ircd::gpt::pipe::model::ffnn::ffnn(cl::data *const master, ircd::gpt::pipe::model::attn::attn(cl::data *const master, const off_t offset, - gpt::model::norm &norm, gpt::model::attn &attn) :norm { master, - offset, - mutable_buffer{norm.bias}, - mutable_buffer{norm.weight}, + offset + off_t(offsetof(gpt::model::attn, norm)) + off_t(offsetof(gpt::model::norm, bias)), + mutable_buffer{attn.norm.bias.elem}, + offset + off_t(offsetof(gpt::model::attn, norm)) + off_t(offsetof(gpt::model::norm, weight)), + mutable_buffer{attn.norm.weight.elem}, } ,fcon { master, - offset + off_t(sizeof(norm)), - mutable_buffer{attn.attn_bias}, - mutable_buffer{attn.attn_weight}, + offset + off_t(offsetof(gpt::model::attn, fcon_bias)), + mutable_buffer{attn.fcon_bias.fcon}, + offset + off_t(offsetof(gpt::model::attn, fcon_weight)), + mutable_buffer{attn.fcon_weight}, } ,proj { master, - offset + off_t(sizeof(norm) + sizeof(attn.attn_bias) + sizeof(attn.attn_weight)), - mutable_buffer{attn.proj_bias}, + 
offset + off_t(offsetof(gpt::model::attn, proj_bias)), + mutable_buffer{attn.proj_bias.elem}, + offset + off_t(offsetof(gpt::model::attn, proj_weight)), mutable_buffer{attn.proj_weight}, } { - always_assert - ( - ircd::data(const_buffer{attn.proj_weight}) - == - ircd::data(const_buffer{norm.bias}) + - sizeof(norm) + - sizeof(attn.attn_bias) + - sizeof(attn.attn_weight) + - ircd::size(const_buffer{attn.proj_bias}) - ); } ircd::gpt::pipe::model::attn::attn(cl::data *const master, const off_t offset, - const gpt::model::norm &norm, const gpt::model::attn &attn) :norm { master, - offset, - const_buffer{norm.bias}, - const_buffer{norm.weight}, + offset + off_t(offsetof(gpt::model::attn, norm)) + off_t(offsetof(gpt::model::norm, bias)), + const_buffer{attn.norm.bias.elem}, + offset + off_t(offsetof(gpt::model::attn, norm)) + off_t(offsetof(gpt::model::norm, weight)), + const_buffer{attn.norm.weight.elem}, } ,fcon { master, - offset + off_t(sizeof(norm)), - const_buffer{attn.attn_bias}, - const_buffer{attn.attn_weight}, + offset + off_t(offsetof(gpt::model::attn, fcon_bias)), + const_buffer{attn.fcon_bias.fcon}, + offset + off_t(offsetof(gpt::model::attn, fcon_weight)), + const_buffer{attn.fcon_weight}, } ,proj { master, - offset + off_t(sizeof(norm) + sizeof(attn.attn_bias) + sizeof(attn.attn_weight)), - const_buffer{attn.proj_bias}, + offset + off_t(offsetof(gpt::model::attn, proj_bias)), + const_buffer{attn.proj_bias.elem}, + offset + off_t(offsetof(gpt::model::attn, proj_weight)), const_buffer{attn.proj_weight}, } { - always_assert - ( - ircd::data(const_buffer{attn.proj_weight}) - == - ircd::data(const_buffer{norm.bias}) + - sizeof(norm) + - sizeof(attn.attn_bias) + - sizeof(attn.attn_weight) + - ircd::size(const_buffer{attn.proj_bias}) - ); } // @@ -1221,38 +1212,40 @@ ircd::gpt::pipe::model::attn::attn(cl::data *const master, // ircd::gpt::pipe::model::tensor::tensor(cl::data *const master, - const off_t offset, + const off_t bias_offset, const mutable_buffer 
&bias, + const off_t weight_offset, const mutable_buffer &weight) :bias { master, - offset, + bias_offset, bias, } ,weight { master, - off_t(offset + ircd::size(bias)), + weight_offset, weight, } { } ircd::gpt::pipe::model::tensor::tensor(cl::data *const master, - const off_t offset, + const off_t bias_offset, const const_buffer &bias, + const off_t weight_offset, const const_buffer &weight) :bias { master, - offset, + bias_offset, bias, } ,weight { master, - off_t(offset + ircd::size(bias)), + weight_offset, weight, } { @@ -1269,7 +1262,7 @@ ircd::gpt::pipe::model::matrix::matrix(cl::data *const master, { master[0], { - ircd::size(param), + pad_to(ircd::size(param), 4096), offset, }, } @@ -1279,7 +1272,7 @@ ircd::gpt::pipe::model::matrix::matrix(cl::data *const master, { master[1], { - ircd::size(param), + pad_to(ircd::size(param), 4096), offset, }, }, @@ -1288,12 +1281,13 @@ ircd::gpt::pipe::model::matrix::matrix(cl::data *const master, { master[2], { - ircd::size(param), + pad_to(ircd::size(param), 4096), offset, }, }, } { + assert(aligned(offset, 4096)); } ircd::gpt::pipe::model::matrix::matrix(cl::data *const master, @@ -1303,8 +1297,8 @@ ircd::gpt::pipe::model::matrix::matrix(cl::data *const master, { master[0], { - ircd::size(param), // size - offset, // offset + pad_to(ircd::size(param), 4096), + offset, }, } { diff --git a/ircd/gpt_pipe_code.cc b/ircd/gpt_pipe_code.cc index e19ce75e5..723cabb58 100644 --- a/ircd/gpt_pipe_code.cc +++ b/ircd/gpt_pipe_code.cc @@ -152,10 +152,12 @@ ircd::gpt::pipe::code::set_cache(const string_view &path) } extern const uint8_t -gpt_gpu_r600_barts_bc[]; +gpt_gpu_r600_barts_bc[], +gpt_gpu_spv[]; extern const uint -gpt_gpu_r600_barts_bc_len; +gpt_gpu_r600_barts_bc_len, +gpt_gpu_spv_len; ircd::cl::code ircd::gpt::pipe::code::from_bitcode(const string_view &link_opts) @@ -164,6 +166,9 @@ ircd::gpt::pipe::code::from_bitcode(const string_view &link_opts) { reinterpret_cast(gpt_gpu_r600_barts_bc), gpt_gpu_r600_barts_bc_len + + 
//reinterpret_cast(gpt_gpu_spv), + //gpt_gpu_spv_len }; char pbuf[1][48]; diff --git a/ircd/gpt_vocab.cc b/ircd/gpt_vocab.cc index 425a22970..c5274843b 100644 --- a/ircd/gpt_vocab.cc +++ b/ircd/gpt_vocab.cc @@ -10,6 +10,7 @@ namespace ircd::gpt::vocab { + static u8x16 get_token(const u16); static u16 find_token(const u8x16); static u16 find_merge(const u8x16, const u8x16); static u16 bpe_score(u16 (&)[16], const u8x16 (&)[16][2], const uint); @@ -23,11 +24,11 @@ namespace ircd::gpt::vocab static u64x2 tokenize_block(u16x16 &, const u8x16, const i8x16) noexcept; static void init_tokens(), init_merges(); - [[gnu::visibility("internal")]] extern const char32_t charset[256]; } /// Remapping of single byte characters (Control (C0) and Basic Latin (ASCII)). +[[gnu::visibility("internal")]] decltype(ircd::gpt::vocab::charset) ircd::gpt::vocab::charset alignas(64) @@ -169,7 +170,8 @@ ircd::gpt::vocab::init_merges() ircd::string_view ircd::gpt::vocab::debug(const mutable_buffer &out, - const u16 idx) + const u16 idx, + const uint mask) { const auto *const token { @@ -177,13 +179,21 @@ ircd::gpt::vocab::debug(const mutable_buffer &out, }; thread_local char strbuf[2][512]; - return string_view{fmt::sprintf + return fmt::sprintf { - out, "%5u [%32s] %s", + out, "%5u %s%32s%s%s%s", idx, - simd::print_chr(strbuf[0], token[idx]), - simd::print_mem(strbuf[1], token[idx]), - }}; + mask & 0x1? + "[ "_sv: string_view{}, + mask & 0x1? + simd::print_chr(strbuf[0], token[idx]): string_view{}, + mask & 0x1? + " ]"_sv: string_view{}, + mask & 0x2? + " "_sv: string_view{}, + mask & 0x2? 
+ simd::print_mem(strbuf[1], token[idx]): string_view{}, + }; } // @@ -193,6 +203,7 @@ ircd::gpt::vocab::debug(const mutable_buffer &out, ircd::string_view ircd::gpt::vocab::detokenize(const mutable_buffer &out, const vector_view &in) +noexcept { size_t off(0); for(const u16 &id : in) @@ -228,9 +239,65 @@ ircd::gpt::vocab::detokenize(const mutable_buffer &out, // tokenize // +uint16_t +ircd::gpt::vocab::tokenize(const string_view &in) +{ + char str_buf[16]; + const string_view str + { + str_buf, copy(str_buf, in) + }; + + u16 buf[16]; + const auto out + { + tokenize(buf, str) + }; + + if(unlikely(out.size() != 1)) + throw error + { + "Input tokenizes to %zu tokens.", + out.size() + }; + + return buf[0]; +} + +uint16_t +ircd::gpt::vocab::tokenize(const_buffer &in) +noexcept +{ + char str_buf[16]; + const string_view str + { + str_buf, copy(str_buf, in) + }; + + u16 buf[16]; + const auto out + { + tokenize(buf, str) + }; + + const auto &tok + { + get_token(buf[0]) + }; + + const auto consumed + { + simd::strlen(tok) + }; + + consume(in, consumed); + return buf[0]; +} + ircd::vector_view ircd::gpt::vocab::tokenize(const vector_view &out, const string_view &in) +noexcept { using input_t = u8x16; using block_t = u16x16; @@ -801,13 +868,8 @@ ircd::gpt::vocab::bpe_score(u16 (&score)[16], ircd::u16 ircd::gpt::vocab::find_token(const u8x16 string) { - const auto *const __restrict__ token - { - reinterpret_cast(vocab::token) - }; - for(uint i(0); i < tokens; ++i) - if(simd::streq(string, token[i])) + if(simd::streq(string, get_token(i))) return i; return u16(-1U); @@ -835,3 +897,14 @@ ircd::gpt::vocab::find_merge(const u8x16 a, return u16(-1U); } + +ircd::u8x16 +ircd::gpt::vocab::get_token(const u16 idx) +{ + const auto *const __restrict__ token + { + reinterpret_cast(vocab::token) + }; + + return token[idx]; +}