diff --git a/include/ircd/gpt/generate.h b/include/ircd/gpt/generate.h index f2a681fcb..4bbb7e728 100644 --- a/include/ircd/gpt/generate.h +++ b/include/ircd/gpt/generate.h @@ -13,15 +13,16 @@ namespace ircd::gpt { + void + generate(task &); + vector_view generate(const vector_view &out, const vector_view &in, - const opts * = &default_opts, - task * = nullptr); + task &); string_view generate(const mutable_buffer &out, const string_view &in, - const opts * = &default_opts, - task * = nullptr); + task &); } diff --git a/include/ircd/gpt/gpt.h b/include/ircd/gpt/gpt.h index 3abf43bda..8fb24b765 100644 --- a/include/ircd/gpt/gpt.h +++ b/include/ircd/gpt/gpt.h @@ -20,63 +20,14 @@ namespace ircd::gpt struct opts; struct task; - extern const opts default_opts; extern log::log log; } +#include "hypercall.h" #include "vocab.h" #include "model.h" +#include "token.h" +#include "opts.h" #include "task.h" +#include "pipe/pipe.h" #include "generate.h" - -/// Primary Options -/// -/// Use this structure to configure and control specifics of the machine. -/// These settings are immutable for the operations. To maintain state between -/// calls see task.h -/// -struct ircd::gpt::opts -{ - /// Specifies the nominal halting condition based on the sequence of - /// tokens. Generation will complete when this sequence is witnessed. Set - /// tokens to -1 starting from the back to not match that token. Setting - /// all tokens to -1 will ignore this condition. - uint accept_code[3][3] - { - { 13, 198, -1U, }, - { 198, 198, -1U, }, - { -1U, -1U, -1U, }, - }; - - /// Specifies the exceptional halting condition based on the sequence of - /// tokens. By default, the three zeros represent three outputs of '!' - /// which is probably an error code; note that a true "!!!" is represented - /// by token number 10185. Set tokens to -1 starting from the back to - /// not match that token; generated output after errors is usually garbage. 
- uint error_code[3][3] - { - { 0, 0, 0, }, - { -1U, 0, 0, }, - { -1U, 0, 0, }, - }; - - /// Limit number of output tokens. Default of -1 is unlimited; the number - /// of tokens generated will be limited by other factors. - uint limit - { - -1U - }; - - /// Flip random coins over the top k logits each round. Setting to 1 - /// deterministically selects the top logit. - uint top_k - { - 2 - }; - - /// Pointer to the model - const model::decoder *model - { - model::default_model - }; -}; diff --git a/include/ircd/gpt/hypercall.h b/include/ircd/gpt/hypercall.h new file mode 100644 index 000000000..13fda2260 --- /dev/null +++ b/include/ircd/gpt/hypercall.h @@ -0,0 +1,34 @@ +// Matrix Construct +// +// Copyright (C) Matrix Construct Developers, Authors & Contributors +// Copyright (C) 2016-2021 Jason Volk +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice is present in all copies. The +// full license for this software is available in the LICENSE file. + +#pragma once +#define HAVE_IRCD_GPT_HYPERCALL_H + +/// Hypercalling code enumeration. +/// +/// Error codes are all negative values. Zero is also an error. +enum ircd_gpt_hypercall +{ + /// General nominal completion code; similar to EXIT_SUCCESS, etc. + IRCD_GPT_ACCEPT = 1, + + /// Failed or incomplete execution occurred. After an execution attempt + /// it indicates no execution likely took place. Device software never + /// sets this value; it is the initial value set by the host before + /// execution. 
+ IRCD_GPT_ECOMPLETE = 0, +}; + +#ifdef __cplusplus +namespace ircd::gpt +{ + string_view reflect(const enum ircd_gpt_hypercall) noexcept; +} +#endif diff --git a/include/ircd/gpt/model.h b/include/ircd/gpt/model.h index 5f6a0033d..cb2b8032c 100644 --- a/include/ircd/gpt/model.h +++ b/include/ircd/gpt/model.h @@ -67,7 +67,8 @@ struct ircd::gpt::model::block norm ln2; model::ffnn ffnn; -}; +} +__attribute__((packed)); /// Vocabulary embeddings struct ircd::gpt::model::embed diff --git a/include/ircd/gpt/opts.h b/include/ircd/gpt/opts.h new file mode 100644 index 000000000..0071c3a3a --- /dev/null +++ b/include/ircd/gpt/opts.h @@ -0,0 +1,123 @@ +// Matrix Construct +// +// Copyright (C) Matrix Construct Developers, Authors & Contributors +// Copyright (C) 2016-2021 Jason Volk +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice is present in all copies. The +// full license for this software is available in the LICENSE file. + +#pragma once +#define HAVE_IRCD_GPT_OPTS_H + +/// Task Options Page +/// +/// The option block is directly shared with task software as constant data. +/// This structure and its mutable companion in `task.h` determine the outcome +/// of the next execution cycle; options are immutable to device software but +/// may be changed by the host between execution cycles if desired. +/// +struct ircd_gpt_opts +{ + /// Specifies the nominal halting condition based on a sequence of tokens. + /// Generation will complete with success after one of these sequences is + /// witnessed. Set tokens to -1 starting from the back for shorter + /// sequences; zero-length sequences (all -1's) are never matched. 
+ uint accept_code[4][4] + #ifdef __cplusplus + { + { 13, 198, -1U, -1U, }, + { 198, 198, -1U, -1U, }, + { -1U, -1U, -1U, -1U, }, + { -1U, -1U, -1U, -1U, }, + } + #endif + ; + + /// Specifies the exceptional halting condition based on the sequence of + /// tokens. By default, the three zeros represent three outputs of '!' + /// which is probably an error; note that a true "!!!" is represented by + /// token number 10185. Set tokens to -1 starting from the back to not + /// match that token; generated output after errors is usually garbage. + uint error_code[4][4] + #ifdef __cplusplus + { + { 0, 0, 0, -1U, }, + { -1U, -1U, -1U, -1U, }, + { -1U, -1U, -1U, -1U, }, + { -1U, -1U, -1U, -1U, }, + } + #endif + ; + + /// Limit number of output tokens. Default of -1 is unlimited; the number + /// of tokens generated will be limited by other factors. + uint limit + #ifdef __cplusplus + { + 1 + } + #endif + ; + + /// Flip random coins over the top k logits each round. Setting to 1 + /// deterministically selects the top logit. + uint top_k + #ifdef __cplusplus + { + 2 + } + #endif + ; + + /// Specifies the token context size in tokens. + uint context_tokens + #ifdef __cplusplus + { + 1024 + } + #endif + ; + + /// Specifies the token buffer size in tokens. + uint buffer_tokens + #ifdef __cplusplus + { + 1024 + } + #endif + ; + + /// Seed for the task's PRNG. + ulong seed + #ifdef __cplusplus + { + 1234567890UL + } + #endif + ; +} +__attribute__((aligned(4096))); + +#ifdef __cplusplus +/// Generator Task Options. +/// +/// Parameters for a task. Options are constant and one instance can be shared +/// between multiple task instances. This structure extends the task options +/// page, starting a new page which is not visible to device software; C++ and +/// host pointers are available. 
+/// +struct ircd::gpt::opts +:ircd_gpt_opts +{ + /// Pointer to the model + const model::decoder *model + { + model::default_model + }; +}; + +static_assert(sizeof(struct ircd_gpt_opts) == 4096); +static_assert(std::is_standard_layout::value); +#endif diff --git a/include/ircd/gpt/pipe/code.h b/include/ircd/gpt/pipe/code.h new file mode 100644 index 000000000..4d21ed1d3 --- /dev/null +++ b/include/ircd/gpt/pipe/code.h @@ -0,0 +1,22 @@ +// Matrix Construct +// +// Copyright (C) Matrix Construct Developers, Authors & Contributors +// Copyright (C) 2016-2021 Jason Volk +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice is present in all copies. The +// full license for this software is available in the LICENSE file. + +#pragma once +#define HAVE_IRCD_GPT_PIPE_CODE_H + +/// Pipe code segment. +struct ircd::gpt::pipe::code +:cl::code +{ + static const string_view compile_opts; + + code(); + ~code() noexcept; +}; diff --git a/include/ircd/gpt/pipe/desc.h b/include/ircd/gpt/pipe/desc.h new file mode 100644 index 000000000..6863821fc --- /dev/null +++ b/include/ircd/gpt/pipe/desc.h @@ -0,0 +1,47 @@ +// Matrix Construct +// +// Copyright (C) Matrix Construct Developers, Authors & Contributors +// Copyright (C) 2016-2021 Jason Volk +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice is present in all copies. The +// full license for this software is available in the LICENSE file. 
+ +#pragma once +#define HAVE_IRCD_GPT_PIPE_DESC_H + +/// Pipe descriptor +struct ircd::gpt::pipe::desc +{ + struct layer; + + pipe::model *model; + pipe::code *code; + + cl::data + state, // qry/key/val projection (tokens * embed * 3 * float) + accum, // accumulator (tokens * embed * float) + logit, // result output vector (50257 * float) + ctrl, // control page + opts; // options page + + cl::kern + lm_embed, + lm_norm, + lm_logit, + lm_select; + + std::unique_ptr + layer[12]; + + desc(pipe::code &, pipe::model &); +}; + +struct ircd::gpt::pipe::desc::layer +{ + cl::kern negative; + cl::kern positive; + + layer(pipe::desc &, const int); +}; diff --git a/include/ircd/gpt/pipe/exec.h b/include/ircd/gpt/pipe/exec.h new file mode 100644 index 000000000..dee2213bf --- /dev/null +++ b/include/ircd/gpt/pipe/exec.h @@ -0,0 +1,57 @@ +// Matrix Construct +// +// Copyright (C) Matrix Construct Developers, Authors & Contributors +// Copyright (C) 2016-2021 Jason Volk +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice is present in all copies. The +// full license for this software is available in the LICENSE file. + +#pragma once +#define HAVE_IRCD_GPT_PIPE_EXEC_H + +/// Perform one task cycle on the device. +/// +/// Constructions of this object enqueue device commands to complete an +/// additional epoch of the task as provided by `ctrl` and `opts`. +/// +/// Destructions of this object yield the ircd::ctx until those commands +/// are complete. +/// +/// Consecutive cycles on the device without stopping (a.k.a. pipelining) is +/// achieved by constructing several objects before following with destructions +/// i.e in a std::deque. +/// +struct ircd::gpt::pipe::exec +{ + pipe::desc *desc; + + const_buffer + send_opts, // Set when sending the options page. + send_ctrl; // Set when sending the control page. 
+ + mutable_buffer + recv_ctrl; // Set when receiving the control page. + + cl::kern::range + range_lm_embed, // Dimension range of the lm_embed kernel. + range_negative, // Dimension range of a layer kernel. + range_positive, // Dimension range of a layer kernel. + range_lm_norm, // Dimension range of the final norm kernel. + range_lm_logit, // Dimension range of the language logit kernel. + range_lm_select; // Dimension range of the language token kernel. + + cl::exec + release_opts, // Release the options page. + release_ctrl, // Release the control page. + lm_embed, // Compute token and positional embeddings. + coil[12 * 2], // Pass over all layers. + lm_norm, // Final normalization. + lm_logit, // Compute logit result vector. + lm_select, // Select next token. + acquire_ctrl; // Acquire the control page. + + exec(task &, const size_t tokens, const bool rel, const bool acq); + ~exec() noexcept; +}; diff --git a/include/ircd/gpt/pipe/model.h b/include/ircd/gpt/pipe/model.h index 4d27f48a0..cdaef24ca 100644 --- a/include/ircd/gpt/pipe/model.h +++ b/include/ircd/gpt/pipe/model.h @@ -59,11 +59,13 @@ struct ircd::gpt::pipe::model::block model::attn attn; model::ffnn ffnn; + block(cl::data &, const off_t, const gpt::model::block &, const size_t); block(const gpt::model::block &, const size_t); }; struct ircd::gpt::pipe::model::decoder { + cl::data master; model::block block[12]; tensor norm; diff --git a/include/ircd/gpt/pipe/pipe.h b/include/ircd/gpt/pipe/pipe.h index fad88239e..b3d81dc07 100644 --- a/include/ircd/gpt/pipe/pipe.h +++ b/include/ircd/gpt/pipe/pipe.h @@ -17,7 +17,6 @@ namespace ircd::gpt::pipe struct code; struct desc; struct exec; - struct bank; extern model *default_model; extern code *default_code; @@ -27,70 +26,6 @@ namespace ircd::gpt::pipe }; #include "model.h" -#include "ctrl.h" - -struct ircd::gpt::pipe::code -:cl::code -{ - static const string_view compile_opts; - - code(); - ~code() noexcept; -}; - -struct ircd::gpt::pipe::desc -{ - struct 
layer; - - pipe::model *model; - pipe::code *code; - - cl::data opts; - cl::data ctrl; - cl::data state; - cl::data xattn; - cl::data accum; - cl::data logit; - cl::kern anode; - std::unique_ptr layer[12]; - cl::kern cathode; - cl::kern lmhead; - cl::kern lmamax; - - desc(pipe::code &, pipe::model &); -}; - -struct ircd::gpt::pipe::desc::layer -{ - cl::kern negative; - cl::kern selfattn; - cl::kern positive; - - layer(pipe::desc &, const int); -}; - -struct ircd::gpt::pipe::exec -{ - pipe::desc *desc; - - mutable_buffer out_ctrl; - const_buffer in_ctrl, in_opts; - - cl::kern::range range_anode; - cl::kern::range range_coil; - cl::kern::range range_negative; - cl::kern::range range_selfattn; - cl::kern::range range_positive; - cl::kern::range range_cathode; - cl::kern::range range_lmhead; - cl::kern::range range_lmamax; - - cl::exec send[2]; - cl::exec tail[1]; - cl::exec coil[12 * 3]; - cl::exec head[3]; - cl::exec recv[1]; - - exec(ctor_ctrl &, const ctor_opts &); - ~exec() noexcept; -}; +#include "code.h" +#include "desc.h" +#include "exec.h" diff --git a/include/ircd/gpt/task.h b/include/ircd/gpt/task.h index 8905abce2..6edcd728b 100644 --- a/include/ircd/gpt/task.h +++ b/include/ircd/gpt/task.h @@ -11,8 +11,77 @@ #pragma once #define HAVE_IRCD_GPT_TASK_H -/// Context to maintain state across calls. +/// Task Control Page /// +/// The control block is shared with our device software. Execution state is +/// maintained in the task control block across cycles. The control block is +/// the mutable state component for an execution; for the immutable component +/// also shared with device software see opts.h. +/// +struct ircd_gpt_task +{ + /// Header magic 0xC7012C70 + uint magic; + + /// Hypercall code set by our device software upon completion and control + /// transfer back to the host. Negative codes indicate errors, positive + /// codes are used for status and/or procedure calls; zero is also an error. + enum ircd_gpt_hypercall call; + + /// Token ring head. 
Tokens in the ring extend behind the head for + /// `tokens`. The `head` value is automatically modulated by device + /// software to wrap around the ring. + uint head; + + /// Token counter. The counter indicates the number of valid tokens in + /// the context buffer. This value must not exceed the buffer size. + uint tokens; + + /// Accumulates the number of task cycles. The cycle counter is incremented + /// by device software after each repetition of the kernel pipeline to + /// produce one additional token. + ulong cycle; + + /// Accumulates the epoch count for the task. The counter is incremented + /// by one in device software before control returns back to the host. + /// Several cycles may occur during each epoch. + ulong epoch; + + /// Accumulates the number of tokens produced by the task. Several tokens + /// may be produced each epoch, but currently only one token is produced + /// each cycle. + ulong produced; + + /// Accumulates the number of tokens witnessed by the task. The number of + /// tokens in the context for each cycle is counted as witnessed. + ulong witnessed; + + /// Accumulates time in microseconds elapsed for the task. + ulong elapsed; + + /// PRNG xoshiro256 state. This is the de facto random seed which can be + /// set before cycle entry by the host. It is updated by device software + /// when used. + ulong rand[4]; + + /// Updated by the host with the value of the timestamp register as sampled + /// immediately before each transfer of control to the device. + ulong host_tsc; + + /// State counters for the accept/error sequence codes. + uint accept_seq[4], error_seq[4]; + + /// The token buffer starts at offset 2048 and continues to the end of + /// the page; options specify the size of the tokens buffer in tokens. + /// Additional pages must be attached for larger buffer sizes. + ushort token[] __attribute__((aligned(2048))); +} +__attribute__((aligned(4096))); + +#ifdef __cplusplus +/// Task Context +/// +/// State for a task. 
struct ircd::gpt::task { enum status :char; @@ -20,36 +89,16 @@ struct ircd::gpt::task /// Reference to the attached options. const gpt::opts *opts {nullptr}; + /// Reference to control pages. + struct ircd_gpt_task *ctrl {nullptr}; + /// Current task status. enum status status {'\0'}; - /// State counters for the accept codes specified in the options. - uint8_t accept_seq[3] {0}; + task(const gpt::opts * = nullptr, + struct ircd_gpt_task * = nullptr); - /// State counters for the error codes specified in the options. - uint8_t error_seq[3] {0}; - - /// Accumulates the number of executions by the user. Each call to the - /// interface is an execution. - uint64_t epoch {0}; - - /// Accumulates the number of tokens produced by the task. Several tokens - /// may be produced each epoch. - uint64_t produced {0}; - - /// Accumulates the number tokens witnessed by the task. The number of - /// tokens in the context for each produced token is counted as witnessed. - uint64_t witnessed {0}; - - /// Accumulates the number of CPU reference cycles consumed by the task. - /// This counter does not reflect time when the task is queued or waiting - /// or offloaded to a co-processor/accelerator. - uint64_t cycles {0}; - - /// Accumulates the total time in milliseconds over all executions of the - /// task. This counter reflects total wall-clock time of all phases of - /// the execution. - milliseconds time {0ms}; + ~task() noexcept; }; /// The current status of a task is indicated with intelligible characters @@ -61,3 +110,7 @@ enum ircd::gpt::task::status ACCEPT = 'A', ///< Execution completed successfully. ERROR = 'E', ///< Execution did not complete successfully. 
}; + +static_assert(sizeof(struct ircd_gpt_task) == 4096); +static_assert(std::is_standard_layout::value); +#endif diff --git a/include/ircd/gpt/pipe/ctrl.h b/include/ircd/gpt/token.h similarity index 56% rename from include/ircd/gpt/pipe/ctrl.h rename to include/ircd/gpt/token.h index e1419a5b8..bc70b8614 100644 --- a/include/ircd/gpt/pipe/ctrl.h +++ b/include/ircd/gpt/token.h @@ -9,78 +9,46 @@ // full license for this software is available in the LICENSE file. #pragma once -#define HAVE_IRCD_GPT_PIPE_CTRL_H +#ifdef __OPENCL_C_VERSION__ +#define HAVE_IRCD_GPT_TOKEN_H -struct ctor_ctrl -{ - long call; - ulong pc; - ulong tokens; - ulong magic; - uchar pad[1024 - 32]; - - union - { - char str[3072]; - ushort token[1536]; - } - body; -} -__attribute__((aligned(4096))); - -struct ctor_opts -{ - uchar pad[4096]; -} -__attribute__((aligned(4096))); - -#ifndef __OPENCL_C_VERSION__ -static_assert(sizeof(struct ctor_ctrl) == 4096); -#endif - -#ifndef __OPENCL_C_VERSION__ -static_assert(sizeof(struct ctor_opts) == 4096); -#endif - -#ifndef __cplusplus - -union token +union ircd_gpt_token { float word[768], attn[12][64]; }; -union tokenv +union ircd_gpt_tokenv { float4 word[768/4], attn[12][64/4]; }; -struct qkv +struct ircd_gpt_qkv { - union token + union ircd_gpt_tokenv qry, key, val; }; -struct qkvv +struct ircd_gpt_qkvv { - union tokenv + union ircd_gpt_tokenv qry, key, val; }; -struct attn_mask +struct ircd_gpt_attn_mask { bool token[1024]; }; -union aperature +union ircd_gpt_aperature { float word[768], @@ -90,7 +58,7 @@ union aperature attn[12][64]; }; -union aperaturev +union ircd_gpt_aperaturev { float4 word[768/4], diff --git a/ircd/gpt.cc b/ircd/gpt.cc index 8d01098e2..6f99419ee 100644 --- a/ircd/gpt.cc +++ b/ircd/gpt.cc @@ -34,30 +34,22 @@ namespace ircd::gpt static f32 logit alignas(64) [65536], + embeds alignas(64) [1024 * 768], scratch alignas(64) [1024 * 768]; } -namespace ircd::gpt -{ - extern void transform(ctor_ctrl &, const ctor_opts &); -} - 
decltype(ircd::gpt::log) ircd::gpt::log { "gpt" }; -decltype(ircd::gpt::default_opts) -ircd::gpt::default_opts; - ircd::string_view ircd::gpt::generate(const mutable_buffer &out, const string_view &in, - const opts *opts, - task *task) + task &task) { - u16 buf[2][256]; + u16 buf[2][1024]; const auto input_tokens { vocab::tokenize(buf[0], in) @@ -65,7 +57,7 @@ ircd::gpt::generate(const mutable_buffer &out, const auto output_tokens { - generate(buf[1], input_tokens, opts, task) + generate(buf[1], input_tokens, task) }; const auto output @@ -79,13 +71,92 @@ ircd::gpt::generate(const mutable_buffer &out, ircd::vector_view ircd::gpt::generate(const vector_view &out, const vector_view &in, - const opts *opts, - task *task) + task &task) { + assert(task.ctrl); + assert(task.opts); + + uint ret(0); + bool halt(false); + + const auto &opts(*task.opts); + auto &ctrl(*task.ctrl); + auto &errc(ctrl.error_seq); + auto &accc(ctrl.accept_seq); + ctrl.tokens = in.size(); + + const size_t tmax + { + in.size() + opts.limit + }; + + const vector_view accum + { + gpt::scratch, tmax * 768 + }; + + const vector_view embeds + { + gpt::embeds, tmax * 768 + }; + + for(uint j(0); j < in.size(); ++j) + { + const vector_view dst + { + data(embeds) + j * 768, 768 + }; + + if(ircd::cl::enable) + ctrl.token[j] = in[j]; + else + embed(data(dst), in[j], j, opts); + + static char dbuf[512] {0}; + char report[1536] {0}; + char tmbuf[1][64] {{0}}; + const size_t report_size = snprintf + ( + report, sizeof(report), + "%-2u -- %-3u [%5u] --- --- %s 0 0 | %8s", + j, + ctrl.tokens, + ctrl.token[j], + vocab::debug(dbuf, ctrl.token[j]).c_str(), + pretty(tmbuf[0], milliseconds(ctrl.elapsed), 1).c_str() + ); + + log::info + { + log, "%s", + string_view{report, report_size} + }; + } + + uint64_t cycles(0); + milliseconds last_time {0}; + util::timer stopwatch; + { + const prof::scope_cycles task_cycles + { + cycles + }; + + generate(task); + } + last_time = stopwatch.at(); + ctrl.elapsed += 
last_time.count(); + + /* + coil(data(scratch), tokens, *opts.model); + tail(logit, data(last_embed), *opts.model); + out[i] = argmax(logit, *opts); + */ + uint accc_thresh[3] {3, 3, 3}; for(uint i(0); i < 3; ++i) for(uint j(3); j > 0; --j) - if(opts->accept_code[i][j - 1] == -1U) + if(opts.accept_code[i][j - 1] == -1U) --accc_thresh[i]; else break; @@ -93,99 +164,22 @@ ircd::gpt::generate(const vector_view &out, uint errc_thresh[3] {3, 3, 3}; for(uint i(0); i < 3; ++i) for(uint j(3); j > 0; --j) - if(opts->error_code[i][j - 1] == -1U) + if(opts.error_code[i][j - 1] == -1U) --errc_thresh[i]; else break; - uint ret(0); - bool halt(false); - auto &errc(task->error_seq); - auto &accc(task->accept_seq); - for(uint i(0); !halt && i < out.size() && ret < opts->limit; ++i) + for(auto &j(ret); j + in.size() < ctrl.tokens && j < out.size() && !halt; ++j) { - ctor_ctrl ctrl alignas(4096) {0}; - ctrl.pc = 1; - - const size_t tokens - { - in.size() + i - }; - - const vector_view scratch - { - gpt::scratch, tokens * 768 - }; - - for(uint j(0); j < in.size(); ++j) - { - const vector_view dst - { - data(scratch) + j * 768, 768 - }; - - if(ircd::cl::enable) - ctrl.body.token[ctrl.tokens++] = in[j]; - else - embed(data(dst), in[j], j, *opts); - } - - for(uint j(0); j < i; ++j) - { - const vector_view dst - { - data(scratch) + (in.size() + j) * 768, 768 - }; - - if(ircd::cl::enable) - ctrl.body.token[ctrl.tokens++] = out[j]; - else - embed(data(dst), out[j], in.size() + j, *opts); - } - - assert(!ircd::cl::enable || ctrl.tokens == tokens); - const vector_view last_embed - { - data(scratch) + (tokens - 1) * 768, 768 - }; - - const auto last_cycl(task->cycles); - milliseconds last_time {0}; - { - util::timer stopwatch; - const prof::scope_cycles task_cycles - { - task->cycles - }; - - if(ircd::cl::enable) - { - static const ctor_opts opts alignas(4096) {0}; - - transform(ctrl, opts); - out[i] = ctrl.body.token[ctrl.tokens - 1]; - assert(ctrl.tokens == tokens + 1); - } else { - 
coil(data(scratch), tokens, *opts->model); - tail(logit, data(last_embed), *opts->model); - out[i] = argmax(logit, *opts); - } - - last_time = stopwatch.at(); - task->time += last_time; - } + out[j] = ctrl.token[(in.size() + j + ctrl.head) % opts.buffer_tokens]; for(uint j(0); j < 3; ++j) - errc[j] = - opts->error_code[j][errc[j]] == out[i]? - errc[j] + 1: - 0; + errc[j] = opts.error_code[j][errc[j]] == out[j]? + errc[j] + 1: 0; for(uint j(0); j < 3; ++j) - accc[j] = - opts->accept_code[j][accc[j]] == out[i]? - accc[j] + 1: - 0; + accc[j] = opts.accept_code[j][accc[j]] == out[j]? + accc[j] + 1: 0; for(uint j(0); j < 3; ++j) halt |= accc_thresh[j] && accc[j] >= accc_thresh[j], @@ -194,21 +188,23 @@ ircd::gpt::generate(const vector_view &out, static char dbuf[512] {0}; char report[1536] {0}; char tmbuf[4][64] {0}; - size_t report_size; - report_size = snprintf + const size_t bsz(ctrl.tokens - in.size()); + const size_t report_size = snprintf ( report, sizeof(report), - "%-2u %-3u %-3u [%5u] a:%u e:%u %s %8s %8s | %8s", - i, + "%-2u %-2u %-3u %-3u %-3u [%5u] a:%u e:%u %s %8s %8s | %8s", + j, + j + in.size(), ctrl.tokens, - ret, - out[i], + ctrl.cycle, + ctrl.epoch, + out[j], accc[0] + accc[1] + accc[2], errc[0] + errc[1] + errc[2], - vocab::debug(dbuf, out[i]).c_str(), - pretty(tmbuf[0], last_time, 1).c_str(), - pretty(tmbuf[1], si(last_cycl), 1).c_str(), - pretty(tmbuf[2], task->time, 1).c_str() + vocab::debug(dbuf, out[j]).c_str(), + pretty(tmbuf[0], milliseconds(last_time / bsz), 1).c_str(), + pretty(tmbuf[1], si(cycles / bsz), 1).c_str(), + pretty(tmbuf[2], milliseconds(ctrl.elapsed), 1).c_str() ); log::info @@ -216,24 +212,22 @@ ircd::gpt::generate(const vector_view &out, log, "%s", string_view{report, report_size} }; - - ++ret; - ctx::yield(); - ctx::interruption_point(); } + ret = ctrl.tokens - in.size(); for(uint i(0); i < 3; ++i) - if(accc_thresh[i] && task->accept_seq[i] >= accc_thresh[i]) + if(accc_thresh[i] && ctrl.accept_seq[i] >= accc_thresh[i]) { ret -= 
(3 - accc_thresh[i]); break; } - else if(errc_thresh[i] && task->error_seq[i] >= errc_thresh[i]) + else if(errc_thresh[i] && ctrl.error_seq[i] >= errc_thresh[i]) { ret -= (3 - errc_thresh[i]); break; } + ctx::interruption_point(); return vector_view { out, ret diff --git a/ircd/gpt_cl.cl b/ircd/gpt_cl.cl index 27fc6eeb8..8909edd47 100644 --- a/ircd/gpt_cl.cl +++ b/ircd/gpt_cl.cl @@ -8,129 +8,27 @@ // copyright notice and this permission notice is present in all copies. The // full license for this software is available in the LICENSE file. -inline void -ctor_local_bcast_ldr(__local float4 *const out, - const uint ln, - const uint li) -{ - for(uint stride = 1; stride < ln; stride <<= 1) - { - if(li < stride) - out[li + stride] = out[li]; - - barrier(CLK_LOCAL_MEM_FENCE); - } -} inline void -ctor_local_reduce_add_ldr(__local float4 *const out, - const uint ln, - const uint li) -{ - for(uint stride = ln >> 1; stride > 0; stride >>= 1) - { - barrier(CLK_LOCAL_MEM_FENCE); - - if(li < stride) - out[li] += out[li + stride]; - } -} - -inline void -ctor_local_reduce_max_ldr(__local float *const out, - const uint ln, - const uint li) -{ - for(uint stride = ln >> 1; stride > 0; stride >>= 1) - { - barrier(CLK_LOCAL_MEM_FENCE); - - if(li < stride) - out[li] = max(out[li], out[li + stride]); - } -} - -inline void -ctor_local_reduce_tournament_ldr(__local float *const best, - __local ushort *const idx, - const uint ln, - const uint li) -{ - for(uint stride = ln >> 1; stride > 0; stride >>= 1) - { - barrier(CLK_LOCAL_MEM_FENCE); - - if(li < stride && best[li] < best[li + stride]) - { - best[li] = best[li + stride]; - idx[li] = idx[li + stride]; - } - } -} - -inline void -ctor_mean(__local float4 *const restrict out, - __local const float4 *const restrict in, - const uint num, - const uint i) -{ - out[i] = in[i]; - ctor_local_reduce_add_ldr(out, num, i); - - float numerator = 0.0f; - float4 numeratorv = out[i]; - for(uint k = 0; k < 4; ++k) - numerator += numeratorv[k]; - - out[i] 
= numerator; - ctor_local_bcast_ldr(out, num, i); - - numeratorv = out[i]; - out[i] = numeratorv / (num * 4); -} - -inline void -ctor_norm(__local float4 *const out, - __local const float4 *const in, - __local float4 *const restrict tmp, - const uint num, - const uint i) -{ - ctor_mean(tmp, in, num, i); - - const float4 - sub_mean = in[i] - tmp[i]; - - tmp[i] = pow(sub_mean, 2); - ctor_mean(out, tmp, num, i); - - const float4 - epsilon = 0.00001f, - s = sqrt(out[i] + epsilon); - - out[i] = sub_mean / s; -} - -inline void -ctor_norm_fmad(__local float4 *const out, - __local const float4 *const in, - __global const float4 *const restrict bias, - __global const float4 *const restrict weight, - const uint i) +ircd_gpt_norm_fmad(__local float4 *const out, + __local const float4 *const in, + __global const float4 *const restrict bias, + __global const float4 *const restrict weight, + const uint i) { out[i] = in[i] * weight[i] + bias[i]; } // Matrix * Vector Multiply/Accumulate inline void -ctor_sgemv(__local float4 *const restrict out, - __local const float4 *const restrict in, - __global const float4 *const restrict bias, - __global const float4 *const restrict weight, - const uint width, - const uint height, - const uint tiles, - const uint i) +ircd_gpt_sgemv(__local float4 *const restrict out, + __local const float4 *const restrict in, + __global const float4 *const restrict bias, + __global const float4 *const restrict weight, + const uint width, + const uint height, + const uint tiles, + const uint i) { const uint seg = height / tiles; @@ -151,9 +49,9 @@ ctor_sgemv(__local float4 *const restrict out, } inline void -ctor_gelu(__local float4 *const out, - __local const float4 *const in_, - const uint i) +ircd_gpt_gelu(__local float4 *const out, + __local const float4 *const in_, + const uint i) { float4 a, in = in_[i]; @@ -178,14 +76,15 @@ ctor_gelu(__local float4 *const out, // __kernel void -ctor_attn_fcon(__global const struct ctor_ctrl *const ctrl, - __constant 
const struct ctor_opts *const opts, - __global union aperaturev *const restrict out, - __global const union tokenv *const restrict in, - __global const float4 *const restrict norm_bias, - __global const float4 *const restrict norm_weight, - __global const float4 *const restrict fcon_bias, - __global const float4 *const restrict fcon_weight) +ircd_gpt_ffnn(__global const struct ircd_gpt_task *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global union ircd_gpt_tokenv *const restrict accum, + __global const float4 *const restrict norm_bias, + __global const float4 *const restrict norm_weight, + __global const float4 *const restrict fcon_bias, + __global const float4 *const restrict fcon_weight, + __global const float4 *const restrict proj_bias, + __global const float4 *const restrict proj_weight) { const uint gi = get_global_id(0), @@ -195,31 +94,39 @@ ctor_attn_fcon(__global const struct ctor_ctrl *const ctrl, wi = get_group_id(0), wn = get_num_groups(0); - __local union aperaturev token; + __local union ircd_gpt_aperaturev token; __local float4 tmp[768/4]; - token.word[li] = in[wi].word[li]; + // Fetch local copy of the global accumulator. We operate on a cached + // copy as input, and add our output to the global upon completion. + token.word[li] = accum[wi].word[li]; // Layer re-normalization - ctor_norm(token.word, token.word, tmp, ln, li); - ctor_norm_fmad(tmp, token.word, norm_bias, norm_weight, li); + ircd_simt_math_norm_f4lldr(token.word, token.word, tmp, ln, li); + ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li); // Fully connected - for(uint i = 0; i < 3; ++i) - ctor_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 2304/4, 768/4, 4, i * ln + li); + for(uint i = 0; i < 4; ++i) + ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 3072/4, 768/4, 4, i * ln + li); - // Export queries, keys, and values. 
- for(uint i = 0; i < 3; ++i) - out[wi].proj[i][li] = token.proj[i][li]; + // Gaussian Error Linear Unit + for(uint i = 0; i < 4; ++i) + ircd_gpt_gelu(token.fcon, token.fcon, i * ln + li); + + // Projection + ircd_gpt_sgemv(tmp, token.fcon, proj_bias, proj_weight, 768/4, 3072/4, 4, li); + + // Accumulation; end of layer + accum[wi].word[li] += tmp[li]; } __kernel void -ctor_attn_proj(__global const struct ctor_ctrl *const ctrl, - __constant const struct ctor_opts *const opts, - __global union tokenv *const restrict accum, - __global const union tokenv *const restrict xattn, - __global const float4 *const restrict proj_bias, - __global const float4 *const restrict proj_weight) +ircd_gpt_attn_proj(__global const struct ircd_gpt_task *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global union ircd_gpt_tokenv *const restrict accum, + __local const union ircd_gpt_tokenv *const restrict xattn, + __global const float4 *const restrict proj_bias, + __global const float4 *const restrict proj_weight) { const uint gi = get_global_id(0), @@ -234,25 +141,24 @@ ctor_attn_proj(__global const struct ctor_ctrl *const ctrl, out[768/4]; // Fetch - in[li] = xattn[wi].word[li]; + in[li] = xattn->word[li]; + + // Need this here if xattn is __local + barrier(CLK_LOCAL_MEM_FENCE); // Projection - ctor_sgemv(out, in, proj_bias, proj_weight, 768/4, 768/4, 1, li); + ircd_gpt_sgemv(out, in, proj_bias, proj_weight, 768/4, 768/4, 1, li); // Accumulation; end of layer accum[wi].word[li] += out[li]; } __kernel void -ctor_ffnn(__global const struct ctor_ctrl *const ctrl, - __constant const struct ctor_opts *const opts, - __global union tokenv *const restrict accum, - __global const float4 *const restrict norm_bias, - __global const float4 *const restrict norm_weight, - __global const float4 *const restrict fcon_bias, - __global const float4 *const restrict fcon_weight, - __global const float4 *const restrict proj_bias, - __global const float4 *const restrict proj_weight) 
+ircd_gpt_attn_self(__global const struct ircd_gpt_task *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __local union ircd_gpt_tokenv *const restrict out, + __global const struct ircd_gpt_qkvv *const restrict token, + __global const struct ircd_gpt_attn_mask *const restrict mask) // [1024][1024], { const uint gi = get_global_id(0), @@ -262,96 +168,13 @@ ctor_ffnn(__global const struct ctor_ctrl *const ctrl, wi = get_group_id(0), wn = get_num_groups(0); - __local union aperaturev token; - __local float4 tmp[768/4]; - - // Fetch local copy of the global accumulator. We operate on a cached - // copy as input, and add our output to the global upon completion. - token.word[li] = accum[wi].word[li]; - - // Layer re-normalization - ctor_norm(token.word, token.word, tmp, ln, li); - ctor_norm_fmad(tmp, token.word, norm_bias, norm_weight, li); - - // Fully connected - for(uint i = 0; i < 4; ++i) - ctor_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 3072/4, 768/4, 4, i * ln + li); - - // Gaussian Error Linear Unit - for(uint i = 0; i < 4; ++i) - ctor_gelu(token.fcon, token.fcon, i * ln + li); - - // Projection - ctor_sgemv(tmp, token.fcon, proj_bias, proj_weight, 768/4, 3072/4, 4, li); - - // Accumulation; end of layer - accum[wi].word[li] += tmp[li]; -} - -__kernel void -ctor_backend(__global const struct ctor_ctrl *const ctrl, - __constant const struct ctor_opts *const opts, - __global union tokenv *const restrict accum, - __global const union tokenv *const restrict xattn, - __global const float4 *const restrict attn_proj_bias, - __global const float4 *const restrict attn_proj_weight, - __global const float4 *const restrict ffnn_norm_bias, - __global const float4 *const restrict ffnn_norm_weight, - __global const float4 *const restrict ffnn_fcon_bias, - __global const float4 *const restrict ffnn_fcon_weight, - __global const float4 *const restrict ffnn_proj_bias, - __global const float4 *const restrict ffnn_proj_weight) -{ - ctor_attn_proj - ( - ctrl, - 
opts, - accum, - xattn, - attn_proj_bias, - attn_proj_weight - ); - - ctor_ffnn - ( - ctrl, - opts, - accum, - ffnn_norm_bias, - ffnn_norm_weight, - ffnn_fcon_bias, - ffnn_fcon_weight, - ffnn_proj_bias, - ffnn_proj_weight - ); -} - -// -// ctrl -// - -__kernel void -ctor_attn_self(__global const struct ctor_ctrl *const ctrl, - __constant const struct ctor_opts *const opts, - __global union tokenv *const restrict out, - __global const struct qkvv *const restrict token, - __global const struct attn_mask *const restrict mask) // [1024][1024], -{ - __local struct + __local union { float - attn[12][32]; + attn[12][96]; } self; - const uint - gi = get_global_id(0), - gn = get_global_size(0), - li = get_local_id(0), - ln = get_local_size(0), - wi = get_group_id(0), - wn = get_num_groups(0); - for(uint i = 0; i < wn; ++i) if(mask[wi].token[i]) self.attn[li][i] = 0.0f; @@ -389,114 +212,167 @@ ctor_attn_self(__global const struct ctor_ctrl *const ctrl, self.attn[li][i] /= acc; for(uint j = 0; j < 64/4; ++j) - out[wi].attn[li][j] = 0.0f; + out->attn[li][j] = 0.0f; for(uint i = 0; i < wn; ++i) for(uint j = 0; j < 64/4; ++j) - out[wi].attn[li][j] += token[i].val.attn[li][j] * self.attn[li][i]; + out->attn[li][j] += token[i].val.attn[li][j] * self.attn[li][i]; } -// -// leads -// - __kernel void -ctor_anode0(__global const struct ctor_ctrl *const ctrl, - __constant const struct ctor_opts *const opts, - __global union tokenv *const restrict accum, - __global const union tokenv *const restrict pos, - __global const union tokenv *const restrict vocab) +ircd_gpt_attn_fcon(__global const struct ircd_gpt_task *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global union ircd_gpt_aperaturev *const restrict out, + __global const union ircd_gpt_tokenv *const restrict in, + __global const float4 *const restrict norm_bias, + __global const float4 *const restrict norm_weight, + __global const float4 *const restrict fcon_bias, + __global const float4 *const restrict 
fcon_weight) { const uint + gi = get_global_id(0), + gn = get_global_size(0), li = get_local_id(0), - wi = get_group_id(0); + ln = get_local_size(0), + wi = get_group_id(0), + wn = get_num_groups(0); - const ushort - token = ctrl->body.token[wi]; + __local union ircd_gpt_aperaturev token; + __local float4 tmp[768/4]; - const float4 - wte = vocab[token].word[li], - wpe = pos[wi].word[li]; + token.word[li] = in[wi].word[li]; - accum[wi].word[li] = wte + wpe; + // Layer re-normalization + ircd_simt_math_norm_f4lldr(token.word, token.word, tmp, ln, li); + ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li); + + // Fully connected + for(uint i = 0; i < 3; ++i) + ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 2304/4, 768/4, 4, i * ln + li); + + // Export queries, keys, and values. + for(uint i = 0; i < 3; ++i) + out[wi].proj[i][li] = token.proj[i][li]; } __kernel void -ctor_anode1(__global const struct ctor_ctrl *const ctrl, - __constant const struct ctor_opts *const opts, - __global union tokenv *const restrict accum, - __global const union tokenv *const restrict pos, - __global const union tokenv *const restrict vocab) +ircd_gpt_coil(__global const struct ircd_gpt_task *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global union ircd_gpt_tokenv *const restrict accum, + __global const struct ircd_gpt_qkvv *const restrict state, + __global const struct ircd_gpt_attn_mask *const restrict mask, // [1024][1024], + __global const float4 *const restrict attn_proj_bias, + __global const float4 *const restrict attn_proj_weight, + __global const float4 *const restrict ffnn_norm_bias, + __global const float4 *const restrict ffnn_norm_weight, + __global const float4 *const restrict ffnn_fcon_bias, + __global const float4 *const restrict ffnn_fcon_weight, + __global const float4 *const restrict ffnn_proj_bias, + __global const float4 *const restrict ffnn_proj_weight) +{ + __local union ircd_gpt_tokenv value; + + ircd_gpt_attn_self + ( + 
ctrl, + opts, + &value, + state, + mask + ); + + ircd_gpt_attn_proj + ( + ctrl, + opts, + accum, + &value, + attn_proj_bias, + attn_proj_weight + ); + + ircd_gpt_ffnn + ( + ctrl, + opts, + accum, + ffnn_norm_bias, + ffnn_norm_weight, + ffnn_fcon_bias, + ffnn_fcon_weight, + ffnn_proj_bias, + ffnn_proj_weight + ); +} + +// +// frontend +// + +inline void +_ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global union ircd_gpt_tokenv *const restrict out, + __global const union ircd_gpt_tokenv *const restrict pos, + __global const union ircd_gpt_tokenv *const restrict vocab, + const uint out_idx, + const uint tok_idx, + const uint word_idx) +{ + const ushort + token = ctrl->token[(ctrl->head + tok_idx) % opts->buffer_tokens]; + + const float4 + wte = vocab[token].word[word_idx], + wpe = pos[tok_idx].word[word_idx]; + + out[out_idx].word[word_idx] = wte + wpe; +} + +__kernel void +ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global union ircd_gpt_tokenv *const restrict accum, + __global const union ircd_gpt_tokenv *const restrict pos, + __global const union ircd_gpt_tokenv *const restrict vocab) { const uint li = get_local_id(0); for(uint i = 0; i < ctrl->tokens; ++i) - { - const ushort - token = ctrl->body.token[i]; - - const float4 - wte = vocab[token].word[li], - wpe = pos[i].word[li]; - - accum[i].word[li] = wte + wpe; - } + _ircd_gpt_lm_embed(ctrl, opts, accum, pos, vocab, i, i, li); } __kernel void -ctor_anode2(__global const struct ctor_ctrl *const ctrl, - __constant const struct ctor_opts *const opts, - __global union tokenv *const restrict accum, - __global const union tokenv *const restrict pos, - __global const union tokenv *const restrict vocab) -{ - const uint - gi = get_global_id(0); - - const ushort - token = ctrl->body.token[gi]; - - for(uint i = 0; i < 768/4; ++i) - { - const float4 - wte = 
vocab[token].word[i], - wpe = pos[gi].word[i]; - - accum[gi].word[i] = wte + wpe; - } -} - -__kernel void -ctor_cathode(__global const struct ctor_ctrl *const ctrl, - __constant const struct ctor_opts *const opts, - __global union tokenv *const restrict accum, - __global const float4 *const restrict norm_bias, - __global const float4 *const restrict norm_weight) +ircd_gpt_lm_norm(__global const struct ircd_gpt_task *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global union ircd_gpt_tokenv *const restrict accum, + __global const float4 *const restrict norm_bias, + __global const float4 *const restrict norm_weight) { const uint li = get_local_id(0), ln = get_local_size(0), wi = get_global_offset(0) / ln + get_group_id(0); - __local union tokenv + __local union ircd_gpt_tokenv token, tmp; token.word[li] = accum[wi].word[li]; // Final re-normalization - ctor_norm(token.word, token.word, tmp.word, ln, li); - ctor_norm_fmad(token.word, token.word, norm_bias, norm_weight, li); + ircd_simt_math_norm_f4lldr(token.word, token.word, tmp.word, ln, li); + ircd_gpt_norm_fmad(token.word, token.word, norm_bias, norm_weight, li); accum[0].word[li] = token.word[li]; } __kernel void -ctor_lmhead(__global const struct ctor_ctrl *const ctrl, - __constant const struct ctor_opts *const opts, - __global float *const restrict logit, - __global const union tokenv *const restrict accum, - __global const union tokenv *const restrict token) +ircd_gpt_lm_logit(__global const struct ircd_gpt_task *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global float *const restrict logit, + __global const union ircd_gpt_tokenv *const restrict accum, + __global const union ircd_gpt_tokenv *const restrict token) { const uint gi = get_global_id(0); @@ -519,10 +395,79 @@ ctor_lmhead(__global const struct ctor_ctrl *const ctrl, logit[gi] = res; } +inline void +ircd_gpt_leave(__global struct ircd_gpt_task *const ctrl, + __constant const struct ircd_gpt_opts *const 
opts, + const uint li) +{ + // No action for other threads right now + if(li != 0) + return; + + // Run debug checks and assertions. + #ifdef RB_DEBUG + if(li == 0 && ctrl->call == IRCD_GPT_ECOMPLETE) + if(ctrl->tokens < 2) + ctrl->call = IRCD_GPT_ETOKENS; + #endif + + // If the call value has been set to something other than default we + // do nothing else here. + if(ctrl->call != IRCD_GPT_ECOMPLETE) + return; + + // On the last cycle, with no prior call or error code set, indicate + // a nominal exit condition. + if(ctrl->cycle + 1 >= opts->limit) + { + ctrl->call = IRCD_GPT_ACCEPT; + ctrl->epoch += 1; + } + + ctrl->cycle += 1; +} + +inline void +ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + const uint li, + __local const ushort *const restrict idx) +{ + // To read from cells other than idx[0] we need this barrier. + if(opts->top_k > 1) + barrier(CLK_LOCAL_MEM_FENCE); + + // No action for other threads right now + if(li != 0) + return; + + // When the hypercall code is already set, bail here. 
+ if(ctrl->call != IRCD_GPT_ECOMPLETE) + return; + + const bool + buffer_full = ctrl->tokens >= opts->buffer_tokens; + + const ulong + rnd = ircd_simt_rand_xoshiro256pg(ctrl->rand), + sel = rnd % max(opts->top_k, 1U); + + const ushort + token = idx[sel], + token_idx = (ctrl->head + ctrl->tokens) % opts->buffer_tokens; + + ctrl->token[token_idx] = token; + + if(buffer_full) + ctrl->head = (ctrl->head + 1) % opts->buffer_tokens; + else + ctrl->tokens++; +} + __kernel void -ctor_lmamax(__global struct ctor_ctrl *const ctrl, - __constant const struct ctor_opts *const opts, - __global const float *const restrict logit) +ircd_gpt_lm_select(__global struct ircd_gpt_task *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global const float *const restrict logit) { const uint gi = get_global_id(0), @@ -535,25 +480,13 @@ ctor_lmamax(__global struct ctor_ctrl *const ctrl, ti = tn * li; __local ushort idx[192]; - __local float best[192]; idx[li] = ti; for(uint j = ti + 1; j < ti + tn && j < 50257; ++j) if(logit[j] > logit[idx[li]]) idx[li] = j; - best[li] = logit[idx[li]]; - ctor_local_reduce_tournament_ldr(best, idx, ln, li); - - if(li == 0 && ctrl->call == -1) - ctrl->body.token[ctrl->tokens++] = idx[li]; - - if(li == 0 && ctrl->call == -1) - ctrl->call = 1; - - #ifdef RB_DEBUG - if(li == 0 && ctrl->call == 1) - if(ctrl->tokens < 2) - ctrl->call = -2; - #endif + ircd_simt_sort_idx16_flldr(idx, logit, ln, li); + ircd_gpt_lm_result(ctrl, opts, li, idx); + ircd_gpt_leave(ctrl, opts, li); } diff --git a/ircd/gpt_model.cc b/ircd/gpt_model.cc index e4d0b866d..49f80620f 100644 --- a/ircd/gpt_model.cc +++ b/ircd/gpt_model.cc @@ -142,7 +142,7 @@ ircd::gpt::model::init_from_cache(const string_view &cache_path) fs::map::opts map_opts; map_opts.huge2mb = true; - map_opts.locked = false; + map_opts.locked = true; default_model_shm = fs::map { fd, map_opts, sizeof(decoder) diff --git a/ircd/gpt_pipe.cc b/ircd/gpt_pipe.cc index 4ebaedc6e..bc83d83e7 100644 --- 
a/ircd/gpt_pipe.cc +++ b/ircd/gpt_pipe.cc @@ -8,21 +8,33 @@ // copyright notice and this permission notice is present in all copies. The // full license for this software is available in the LICENSE file. -#include - -namespace ircd::gpt -{ - void transform(ctor_ctrl &, const ctor_opts &); -} - namespace ircd::gpt::pipe { - static ircd::cl::exec::opts negative_opts, positive_opts, selfattn_opts, cathode_opts, anode_opts, - lmhead_opts, lmamax_opts; + static void profile_dumplog(pipe::exec &); + static ircd::cl::exec::opts + negative_opts, positive_opts, selfattn_opts, + cathode_opts, anode_opts, lmhead_opts, lmamax_opts; + + extern conf::item flush_cycles; + extern conf::item queue_cycles; extern const ircd::run::changed handle_quit; } +decltype(ircd::gpt::pipe::queue_cycles) +ircd::gpt::pipe::queue_cycles +{ + { "name", "ircd.gpt.pipe.queue" }, + { "default", 1L, }, +}; + +decltype(ircd::gpt::pipe::flush_cycles) +ircd::gpt::pipe::flush_cycles +{ + { "name", "ircd.gpt.pipe.flush" }, + { "default", 0L, }, +}; + decltype(ircd::gpt::pipe::default_model) ircd::gpt::pipe::default_model; @@ -82,144 +94,233 @@ noexcept // void -ircd::gpt::transform(ctor_ctrl &ctrl, - const ctor_opts &opts) +ircd::gpt::generate(task &task) { if(unlikely(!pipe::default_model)) pipe::init(); - ctrl.call = -1; - pipe::exec + const auto &opts { - ctrl, opts + *task.opts }; + auto &ctrl + { + *task.ctrl + }; + + ctrl.call = IRCD_GPT_ECOMPLETE; + ctrl.host_tsc = prof::cycles(); + size_t cycle(ctrl.cycle); + const size_t tokens(ctrl.tokens); + + std::deque list; + for(; cycle < opts.limit; ++cycle) + { + // When the release/acquire bits are set the control pages are sent + // and received; only set on first and last iterations of this loop. 
+ const bool + rel(cycle == 0), + acq(cycle + 1 >= opts.limit); + + // Enqueue the cycle's commands + list.emplace_back + ( + task, tokens + cycle, rel, acq + ); + + // Conditions for a cl::flush here; this is not default but + // may be configured to improve some workloads. + const bool flush + { + // Flushing here is enabled by the configuration + pipe::flush_cycles + + // Skip flushing on cycles already performing IO or waiting. + && !rel && !acq && list.size() <= pipe::queue_cycles + + // The configuration item can specify an interval greater than + // one between flushes. + && cycle % pipe::flush_cycles == 0 + }; + + if(flush) + cl::flush(); + + // Enqueue consecutive repetitions of our kernel batch before waiting + // on the first; based on the configuration. XXX get from ircd::cl + if(list.size() <= pipe::queue_cycles) + continue; + + // Profiling branch + if((false)) + { + auto &ex(list.front()); + profile_dumplog(ex); + } + + // Destructing the front of the queue waits for completion by yielding + // this ircd::ctx. 
+ list.pop_front(); + } + + // Wait for all unfinished + list.clear(); + + // Interp error codes if(unlikely(ctrl.call <= 0)) throw error { "hyper (#%d) :%s", - abs(ctrl.call), - ctrl.body.str, + abs(int(ctrl.call)), + reflect(ctrl.call), }; + + always_assert(ctrl.cycle == cycle); +} + +void +ircd::gpt::pipe::profile_dumplog(pipe::exec &exec) +{ + constexpr size_t coils + { + sizeof(exec.coil) / sizeof(cl::exec) + }; + + for(size_t i(0); i < coils; ++i) + { + exec.coil[i].wait(); + const auto &pro + { + exec.coil[i].profile() + }; + + char tmbuf[4][32] {{0}}; + log::logf + { + log, log::level::DEBUG, + "coil:%-2lu %8s %8s %8s %8s\n", + i, + util::pretty(tmbuf[0], si(pro[0]), 1), + util::pretty(tmbuf[1], si(pro[1]), 1), + util::pretty(tmbuf[2], si(pro[2]), 1), + util::pretty(tmbuf[3], si(pro[3]), 1), + }; + } } // // pipe::exec // -ircd::gpt::pipe::exec::exec(ctor_ctrl &ctrl, - const ctor_opts &opts) +ircd::gpt::pipe::exec::exec(task &task, + const size_t tokens, + const bool release, + const bool acquire) :desc { default_desc } -,out_ctrl +,send_opts { - reinterpret_cast(&ctrl), sizeof(ctor_ctrl) + reinterpret_cast(task.opts), + release? sizeof(struct ircd_gpt_opts): 0 } -,in_ctrl +,send_ctrl { - reinterpret_cast(&ctrl), sizeof(ctor_ctrl) + reinterpret_cast(task.ctrl), + release? sizeof(struct ircd_gpt_task): 0 } -,in_opts +,recv_ctrl { - reinterpret_cast(&opts), sizeof(ctor_opts) + reinterpret_cast(task.ctrl), + acquire? 
sizeof(struct ircd_gpt_task): 0 } -,range_anode +,range_lm_embed { - { ctrl.tokens, 0, }, - { 1, 0, }, -} -,range_coil -{ - { ctrl.tokens * 192UL, 0, }, - { 192UL, 0, }, + { 1 * 192UL, 0, }, + { 192UL, 0, }, } ,range_negative { - range_coil -} -,range_selfattn -{ - range_coil + { tokens * 192UL, 0, }, + { 192UL, 0, }, } ,range_positive { - range_coil + { tokens * 192UL, 0, }, + { 192UL, 0, }, } -,range_cathode +,range_lm_norm { - { 1 * 192UL, 0 }, - { 192UL, 0 }, - { (ctrl.tokens - 1) * 192UL, 0 }, + { 1 * 192UL, 0 }, + { 192UL, 0 }, + { (tokens - 1) * 192UL, 0 }, } -,range_lmhead +,range_lm_logit { { 262 * 192UL, 0 }, // align_up(50257) / 192 { 192UL, 0 }, } -,range_lmamax +,range_lm_select { { 1 * 192UL, 0 }, { 192UL, 0 }, } -,send +,release_opts { - { desc->opts, in_opts }, - { desc->ctrl, in_ctrl }, + desc->opts, send_opts } -,tail +,release_ctrl { - { desc->anode, range_anode, anode_opts }, + desc->ctrl, send_ctrl +} +,lm_embed +{ + desc->lm_embed, range_lm_embed, anode_opts } ,coil { { desc->layer[0x00]->negative, range_negative, negative_opts }, - { desc->layer[0x00]->selfattn, range_selfattn, selfattn_opts }, { desc->layer[0x00]->positive, range_positive, positive_opts }, { desc->layer[0x01]->negative, range_negative, negative_opts }, - { desc->layer[0x01]->selfattn, range_selfattn, selfattn_opts }, { desc->layer[0x01]->positive, range_positive, positive_opts }, { desc->layer[0x02]->negative, range_negative, negative_opts }, - { desc->layer[0x02]->selfattn, range_selfattn, selfattn_opts }, { desc->layer[0x02]->positive, range_positive, positive_opts }, { desc->layer[0x03]->negative, range_negative, negative_opts }, - { desc->layer[0x03]->selfattn, range_selfattn, selfattn_opts }, { desc->layer[0x03]->positive, range_positive, positive_opts }, { desc->layer[0x04]->negative, range_negative, negative_opts }, - { desc->layer[0x04]->selfattn, range_selfattn, selfattn_opts }, { desc->layer[0x04]->positive, range_positive, positive_opts }, { 
desc->layer[0x05]->negative, range_negative, negative_opts }, - { desc->layer[0x05]->selfattn, range_selfattn, selfattn_opts }, { desc->layer[0x05]->positive, range_positive, positive_opts }, { desc->layer[0x06]->negative, range_negative, negative_opts }, - { desc->layer[0x06]->selfattn, range_selfattn, selfattn_opts }, { desc->layer[0x06]->positive, range_positive, positive_opts }, { desc->layer[0x07]->negative, range_negative, negative_opts }, - { desc->layer[0x07]->selfattn, range_selfattn, selfattn_opts }, { desc->layer[0x07]->positive, range_positive, positive_opts }, { desc->layer[0x08]->negative, range_negative, negative_opts }, - { desc->layer[0x08]->selfattn, range_selfattn, selfattn_opts }, { desc->layer[0x08]->positive, range_positive, positive_opts }, { desc->layer[0x09]->negative, range_negative, negative_opts }, - { desc->layer[0x09]->selfattn, range_selfattn, selfattn_opts }, { desc->layer[0x09]->positive, range_positive, positive_opts }, { desc->layer[0x0a]->negative, range_negative, negative_opts }, - { desc->layer[0x0a]->selfattn, range_selfattn, selfattn_opts }, { desc->layer[0x0a]->positive, range_positive, positive_opts }, { desc->layer[0x0b]->negative, range_negative, negative_opts }, - { desc->layer[0x0b]->selfattn, range_selfattn, selfattn_opts }, { desc->layer[0x0b]->positive, range_positive, positive_opts }, } -,head +,lm_norm { - { desc->cathode, range_cathode, cathode_opts }, - { desc->lmhead, range_lmhead, lmhead_opts }, - { desc->lmamax, range_lmamax, lmamax_opts }, + desc->lm_norm, range_lm_norm, cathode_opts } -,recv +,lm_logit { - { desc->ctrl, out_ctrl }, + desc->lm_logit, range_lm_logit, lmhead_opts +} +,lm_select +{ + desc->lm_select, range_lm_select, lmamax_opts +} +,acquire_ctrl +{ + desc->ctrl, recv_ctrl } { } @@ -296,29 +397,14 @@ ircd::gpt::pipe::desc::desc(pipe::code &code, { &code } -,opts -{ - 4_KiB, - const_buffer{} -} -,ctrl -{ - 4_KiB, - mutable_buffer{} -} ,state { - 32 * 3 * 768 * sizeof(float), - mutable_buffer{} -} 
-,xattn -{ - 32 * 1 * 768 * sizeof(float), + 96 * 3 * 768 * sizeof(float), mutable_buffer{} } ,accum { - 32 * 768 * sizeof(float), + 96 * 768 * sizeof(float), mutable_buffer{} } ,logit @@ -326,16 +412,54 @@ ircd::gpt::pipe::desc::desc(pipe::code &code, 65536 * sizeof(float), mutable_buffer{} } -,anode +,ctrl +{ + sizeof(struct ircd_gpt_task), + mutable_buffer{} +} +,opts +{ + sizeof(struct ircd_gpt_opts), + const_buffer{} +} +,lm_embed { code, - "ctor_anode2", + "ircd_gpt_lm_embed", ctrl, opts, accum, model.embed->pos, model.embed->token, } +,lm_norm +{ + code, + "ircd_gpt_lm_norm", + ctrl, + opts, + accum, + model.decode->norm.bias, + model.decode->norm.weight, +} +,lm_logit +{ + code, + "ircd_gpt_lm_logit", + ctrl, + opts, + logit, + accum, + model.embed->token, +} +,lm_select +{ + code, + "ircd_gpt_lm_select", + ctrl, + opts, + logit, +} ,layer { std::make_unique(*this, 0x00), @@ -351,34 +475,6 @@ ircd::gpt::pipe::desc::desc(pipe::code &code, std::make_unique(*this, 0x0a), std::make_unique(*this, 0x0b), } -,cathode -{ - code, - "ctor_cathode", - ctrl, - opts, - accum, - model.decode->norm.bias, - model.decode->norm.weight, -} -,lmhead -{ - code, - "ctor_lmhead", - ctrl, - opts, - logit, - accum, - model.embed->token, -} -,lmamax -{ - code, - "ctor_lmamax", - ctrl, - opts, - logit, -} { } @@ -391,7 +487,7 @@ ircd::gpt::pipe::desc::layer::layer(pipe::desc &desc, :negative { *desc.code, - "ctor_attn_fcon", + "ircd_gpt_attn_fcon", desc.ctrl, desc.opts, desc.state, @@ -401,24 +497,15 @@ ircd::gpt::pipe::desc::layer::layer(pipe::desc &desc, desc.model->decode->block[laynum].attn.fcon.bias, desc.model->decode->block[laynum].attn.fcon.weight, } -,selfattn -{ - *desc.code, - "ctor_attn_self", - desc.ctrl, - desc.opts, - desc.xattn, - desc.state, - desc.model->decode->block[laynum].attn.mask, -} ,positive { *desc.code, - "ctor_backend", + "ircd_gpt_coil", desc.ctrl, desc.opts, desc.accum, - desc.xattn, + desc.state, + desc.model->decode->block[laynum].attn.mask, 
desc.model->decode->block[laynum].attn.proj.bias, desc.model->decode->block[laynum].attn.proj.weight, desc.model->decode->block[laynum].ffnn.norm.bias, @@ -486,23 +573,33 @@ noexcept // ircd::gpt::pipe::model::decoder::decoder(const gpt::model::decoder &decoder) -:block +:master { - { decoder.layer[0x00], 0x00, }, - { decoder.layer[0x01], 0x01, }, - { decoder.layer[0x02], 0x02, }, - { decoder.layer[0x03], 0x03, }, - { decoder.layer[0x04], 0x04, }, - { decoder.layer[0x05], 0x05, }, - { decoder.layer[0x06], 0x06, }, - { decoder.layer[0x07], 0x07, }, - { decoder.layer[0x08], 0x08, }, - { decoder.layer[0x09], 0x09, }, - { decoder.layer[0x0a], 0x0a, }, - { decoder.layer[0x0b], 0x0b, }, + sizeof(gpt::model::block) * 12 + sizeof(gpt::model::norm), const_buffer + { + reinterpret_cast(decoder.layer), + sizeof(decoder.layer) + sizeof(decoder.f) + } +} +,block +{ + { master, sizeof(gpt::model::block) * 0x00, decoder.layer[0x00], 0x00, }, + { master, sizeof(gpt::model::block) * 0x01, decoder.layer[0x01], 0x01, }, + { master, sizeof(gpt::model::block) * 0x02, decoder.layer[0x02], 0x02, }, + { master, sizeof(gpt::model::block) * 0x03, decoder.layer[0x03], 0x03, }, + { master, sizeof(gpt::model::block) * 0x04, decoder.layer[0x04], 0x04, }, + { master, sizeof(gpt::model::block) * 0x05, decoder.layer[0x05], 0x05, }, + { master, sizeof(gpt::model::block) * 0x06, decoder.layer[0x06], 0x06, }, + { master, sizeof(gpt::model::block) * 0x07, decoder.layer[0x07], 0x07, }, + { master, sizeof(gpt::model::block) * 0x08, decoder.layer[0x08], 0x08, }, + { master, sizeof(gpt::model::block) * 0x09, decoder.layer[0x09], 0x09, }, + { master, sizeof(gpt::model::block) * 0x0a, decoder.layer[0x0a], 0x0a, }, + { master, sizeof(gpt::model::block) * 0x0b, decoder.layer[0x0b], 0x0b, }, } ,norm { + master, + off_t(sizeof(gpt::model::block) * 12), const_buffer{decoder.f.bias}, const_buffer{decoder.f.weight}, } @@ -544,6 +641,27 @@ ircd::gpt::pipe::model::block::block(const gpt::model::block &block, { } 
+ircd::gpt::pipe::model::block::block(cl::data &master, + const off_t offset, + const gpt::model::block &block, + const size_t layer) +:attn +{ + master, + offset, + block.ln1, + block.attn, +} +,ffnn +{ + master, + offset + off_t(sizeof(block.ln1) + sizeof(block.attn)), + block.ln2, + block.ffnn, +} +{ +} + // // pipe::model::ffnn // @@ -678,3 +796,48 @@ ircd::gpt::pipe::model::tensor::tensor(cl::data &master, } { } + +// +// gpt::task +// + +ircd::gpt::task::task(const gpt::opts *const opts, + struct ircd_gpt_task *const ctrl) +:opts +{ + opts +} +,ctrl +{ + ctrl +} +{ + memset(this->ctrl, 0x0, sizeof(ircd_gpt_task)); + + this->ctrl->rand[0] = this->opts->seed; + this->ctrl->rand[1] = this->opts->seed; + this->ctrl->rand[2] = -1UL; + this->ctrl->rand[3] = -1UL; +} + +ircd::gpt::task::~task() +noexcept +{ +} + +// +// hypercall +// + +ircd::string_view +ircd::gpt::reflect(const enum ircd_gpt_hypercall code) +noexcept +{ + switch(code) + { + case IRCD_GPT_ACCEPT: return "ACCEPT"; + case IRCD_GPT_ECOMPLETE: return "ECOMPLETE"; + } + + return "??????"; +} diff --git a/modules/console.cc b/modules/console.cc index 4769ea9eb..a1537e244 100644 --- a/modules/console.cc +++ b/modules/console.cc @@ -17324,10 +17324,15 @@ console_cmd__gpt__raw(opt &out, const string_view &line) opts.limit = param.at("limit"); opts.top_k = 3; - gpt::task task; + struct ircd_gpt_task ctrl; + gpt::task task + { + &opts, &ctrl + }; + const auto output { - gpt::generate(buf, text, &opts, &task) + gpt::generate(buf, text, task) }; out