
ircd::gpt: Reorganize interface.

Jason Volk 2021-04-02 13:01:38 -07:00
parent f096e7fcf5
commit 1870c364f4
17 changed files with 1067 additions and 778 deletions

include/ircd/gpt/generate.h
@@ -13,15 +13,16 @@
 namespace ircd::gpt
 {
+	void
+	generate(task &);
+
 	vector_view<u16>
 	generate(const vector_view<u16> &out,
 	         const vector_view<const u16> &in,
-	         const opts * = &default_opts,
-	         task * = nullptr);
+	         task &);
 
 	string_view
 	generate(const mutable_buffer &out,
 	         const string_view &in,
-	         const opts * = &default_opts,
-	         task * = nullptr);
+	         task &);
 }

include/ircd/gpt/gpt.h
@@ -20,63 +20,14 @@ namespace ircd::gpt
 	struct opts;
 	struct task;
 
-	extern const opts default_opts;
 	extern log::log log;
 }
 
+#include "hypercall.h"
 #include "vocab.h"
 #include "model.h"
+#include "token.h"
+#include "opts.h"
 #include "task.h"
+#include "pipe/pipe.h"
 #include "generate.h"
-
-/// Primary Options
-///
-/// Use this structure to configure and control specifics of the machine.
-/// These settings are immutable for the operations. To maintain state between
-/// calls see task.h
-///
-struct ircd::gpt::opts
-{
-	/// Specifies the nominal halting condition based on the sequence of
-	/// tokens. Generation will complete when this sequence is witnessed. Set
-	/// tokens to -1 starting from the back to not match that token. Setting
-	/// all tokens to -1 will ignore this condition.
-	uint accept_code[3][3]
-	{
-		{ 13, 198, -1U, },
-		{ 198, 198, -1U, },
-		{ -1U, -1U, -1U, },
-	};
-
-	/// Specifies the exceptional halting condition based on the sequence of
-	/// tokens. By default, the three zeros represent three outputs of '!'
-	/// which is probably an error code; note that a true "!!!" is represented
-	/// by token number 10185. Set tokens to -1 starting from the back to
-	/// not match that token; generated output after errors is usually garbage.
-	uint error_code[3][3]
-	{
-		{ 0, 0, 0, },
-		{ -1U, 0, 0, },
-		{ -1U, 0, 0, },
-	};
-
-	/// Limit number of output tokens. Default of -1 is unlimited; the number
-	/// of tokens generated will be limited by other factors.
-	uint limit
-	{
-		-1U
-	};
-
-	/// Flip random coins over the top k logits each round. Setting to 1
-	/// deterministically selects the top logit.
-	uint top_k
-	{
-		2
-	};
-
-	/// Pointer to the model
-	const model::decoder *model
-	{
-		model::default_model
-	};
-};

include/ircd/gpt/hypercall.h Normal file
@@ -0,0 +1,34 @@
+// Matrix Construct
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_GPT_HYPERCALL_H
+
+/// Hypercalling code enumeration.
+///
+/// Error codes are all negative values. Zero is also an error.
+enum ircd_gpt_hypercall
+{
+	/// General nominal completion code; similar to EXIT_SUCCESS, etc.
+	IRCD_GPT_ACCEPT = 1,
+
+	/// Failed or incomplete execution occurred. After an execution attempt
+	/// it indicates no execution likely took place. Device software never
+	/// sets this value; it is the initial value set by the host before
+	/// execution.
+	IRCD_GPT_ECOMPLETE = 0,
+};
+
+#ifdef __cplusplus
+namespace ircd::gpt
+{
+	string_view reflect(const enum ircd_gpt_hypercall) noexcept;
+}
+#endif
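
Aside: the hypercall convention is small enough to model standalone. The following is an illustrative sketch, not part of the commit; it restates the enum and the reflect() switch (which this commit adds in the pipe translation unit) to show how a host interprets the code after a cycle: positive is nominal, zero or negative is an error.

#include <cassert>
#include <cstdio>

enum ircd_gpt_hypercall
{
	IRCD_GPT_ACCEPT = 1,
	IRCD_GPT_ECOMPLETE = 0,
};

static const char *
reflect(const enum ircd_gpt_hypercall code)
{
	switch(code)
	{
		case IRCD_GPT_ACCEPT:     return "ACCEPT";
		case IRCD_GPT_ECOMPLETE:  return "ECOMPLETE";
	}

	return "??????";
}

int main(void)
{
	// The host initializes the code to ECOMPLETE; only device software
	// promotes it, so an unchanged value means no execution took place.
	enum ircd_gpt_hypercall call = IRCD_GPT_ECOMPLETE;
	assert(call <= 0);

	call = IRCD_GPT_ACCEPT; // as set by a successful final cycle
	std::printf("%s\n", reflect(call));
	return 0;
}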

View file

@ -67,7 +67,8 @@ struct ircd::gpt::model::block
norm ln2; norm ln2;
model::ffnn ffnn; model::ffnn ffnn;
}; }
__attribute__((packed));
/// Vocabulary embeddings /// Vocabulary embeddings
struct ircd::gpt::model::embed struct ircd::gpt::model::embed

include/ircd/gpt/opts.h Normal file
@@ -0,0 +1,123 @@
+// Matrix Construct
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_GPT_OPTS_H
+
+/// Task Options Page
+///
+/// The option block is directly shared with task software as constant data.
+/// This structure and its mutable companion in `task.h` determine the outcome
+/// of the next execution cycle; options are immutable to device software but
+/// may be changed by the host between execution cycles if desired.
+///
+struct ircd_gpt_opts
+{
+	/// Specifies the nominal halting condition based on a sequence of tokens.
+	/// Generation will complete with success after one of these sequences is
+	/// witnessed. Set tokens to -1 starting from the back for shorter
+	/// sequences; zero-length sequences (all -1's) are never matched.
+	uint accept_code[4][4]
+	#ifdef __cplusplus
+	{
+		{ 13, 198, -1U, -1U, },
+		{ 198, 198, -1U, -1U, },
+		{ -1U, -1U, -1U, -1U, },
+		{ -1U, -1U, -1U, -1U, },
+	}
+	#endif
+	;
+
+	/// Specifies the exceptional halting condition based on the sequence of
+	/// tokens. By default, the three zeros represent three outputs of '!'
+	/// which is probably an error; note that a true "!!!" is represented by
+	/// token number 10185. Set tokens to -1 starting from the back to not
+	/// match that token; generated output after errors is usually garbage.
+	uint error_code[4][4]
+	#ifdef __cplusplus
+	{
+		{ 0, 0, 0, -1U, },
+		{ -1U, -1U, -1U, -1U, },
+		{ -1U, -1U, -1U, -1U, },
+		{ -1U, -1U, -1U, -1U, },
+	}
+	#endif
+	;
+
+	/// Limit number of output tokens. Default of -1 is unlimited; the number
+	/// of tokens generated will be limited by other factors.
+	uint limit
+	#ifdef __cplusplus
+	{
+		1
+	}
+	#endif
+	;
+
+	/// Flip random coins over the top k logits each round. Setting to 1
+	/// deterministically selects the top logit.
+	uint top_k
+	#ifdef __cplusplus
+	{
+		2
+	}
+	#endif
+	;
+
+	/// Specifies the token context size in tokens.
+	uint context_tokens
+	#ifdef __cplusplus
+	{
+		1024
+	}
+	#endif
+	;
+
+	/// Specifies the token buffer size in tokens.
+	uint buffer_tokens
+	#ifdef __cplusplus
+	{
+		1024
+	}
+	#endif
+	;
+
+	/// Seed for the task's PRNG.
+	ulong seed
+	#ifdef __cplusplus
+	{
+		1234567890UL
+	}
+	#endif
+	;
+}
+__attribute__((aligned(4096)));
+
+#ifdef __cplusplus
+/// Generator Task Options.
+///
+/// Parameters for a task. Options are constant and one instance can be shared
+/// between multiple task instances. This structure extends the task options
+/// page, starting a new page which is not visible to device software; C++ and
+/// host pointers are available.
+///
+struct ircd::gpt::opts
+:ircd_gpt_opts
+{
+	/// Pointer to the model
+	const model::decoder *model
+	{
+		model::default_model
+	};
+};
+
+static_assert(sizeof(struct ircd_gpt_opts) == 4096);
+static_assert(std::is_standard_layout<struct ircd_gpt_opts>::value);
+#endif

include/ircd/gpt/pipe/code.h Normal file
@@ -0,0 +1,22 @@
+// Matrix Construct
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_GPT_PIPE_CODE_H
+
+/// Pipe code segment.
+struct ircd::gpt::pipe::code
+:cl::code
+{
+	static const string_view compile_opts;
+
+	code();
+	~code() noexcept;
+};

include/ircd/gpt/pipe/desc.h Normal file
@@ -0,0 +1,47 @@
+// Matrix Construct
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_GPT_PIPE_DESC_H
+
+/// Pipe descriptor
+struct ircd::gpt::pipe::desc
+{
+	struct layer;
+
+	pipe::model *model;
+	pipe::code *code;
+
+	cl::data
+	state,      // qry/key/val projection (tokens * embed * 3 * float)
+	accum,      // accumulator (tokens * embed * float)
+	logit,      // result output vector (50257 * float)
+	ctrl,       // control page
+	opts;       // options page
+
+	cl::kern
+	lm_embed,
+	lm_norm,
+	lm_logit,
+	lm_select;
+
+	std::unique_ptr<struct desc::layer>
+	layer[12];
+
+	desc(pipe::code &, pipe::model &);
+};
+
+struct ircd::gpt::pipe::desc::layer
+{
+	cl::kern negative;
+	cl::kern positive;
+
+	layer(pipe::desc &, const int);
+};
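
Aside: a quick standalone check, illustrative only, of the sizing comments above, using the 96-token working set and 768-float embedding width this commit allocates in the pipe translation unit:

// Not part of the commit; restates the buffer formulas as constants.
static_assert(96 * 3 * 768 * sizeof(float) == 884736);          // state: qry/key/val
static_assert(96 * 1 * 768 * sizeof(float) == 294912);          // accum
static_assert(50257 * sizeof(float) <= 65536 * sizeof(float));  // logit fits its padded buffer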

include/ircd/gpt/pipe/exec.h Normal file
@@ -0,0 +1,57 @@
+// Matrix Construct
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_GPT_PIPE_EXEC_H
+
+/// Perform one task cycle on the device.
+///
+/// Constructions of this object enqueue device commands to complete an
+/// additional epoch of the task as provided by `ctrl` and `opts`.
+///
+/// Destructions of this object yield the ircd::ctx until those commands
+/// are complete.
+///
+/// Consecutive cycles on the device without stopping (a.k.a. pipelining) are
+/// achieved by constructing several objects before following with
+/// destructions, i.e. in a std::deque.
+///
+struct ircd::gpt::pipe::exec
+{
+	pipe::desc *desc;
+
+	const_buffer
+	send_opts,        // Set when sending the options page.
+	send_ctrl;        // Set when sending the control page.
+
+	mutable_buffer
+	recv_ctrl;        // Set when receiving the control page.
+
+	cl::kern::range
+	range_lm_embed,   // Dimension range of the lm_embed kernel.
+	range_negative,   // Dimension range of a layer kernel.
+	range_positive,   // Dimension range of a layer kernel.
+	range_lm_norm,    // Dimension range of the final norm kernel.
+	range_lm_logit,   // Dimension range of the language logit kernel.
+	range_lm_select;  // Dimension range of the language token kernel.
+
+	cl::exec
+	release_opts,     // Release the options page.
+	release_ctrl,     // Release the control page.
+	lm_embed,         // Compute token and positional embeddings.
+	coil[12 * 2],     // Pass over all layers.
+	lm_norm,          // Final normalization.
+	lm_logit,         // Compute logit result vector.
+	lm_select,        // Select next token.
+	acquire_ctrl;     // Acquire the control page.
+
+	exec(task &, const size_t tokens, const bool rel, const bool acq);
+	~exec() noexcept;
+};
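
Aside: a minimal standalone sketch, not part of the commit, of the construct/destruct pipelining pattern described above. `Cycle` is a stand-in for pipe::exec, whose constructor enqueues one cycle's kernels and whose destructor yields the ircd::ctx until they complete; gpt::generate() in this diff applies the same pattern.

#include <deque>

struct Cycle
{
	Cycle() { /* enqueue device commands */ }
	~Cycle() noexcept { /* wait (yield) for completion */ }
};

void run(const unsigned limit, const unsigned depth)
{
	std::deque<Cycle> list;
	for(unsigned cycle(0); cycle < limit; ++cycle)
	{
		// Enqueue another cycle before waiting on the oldest one, keeping
		// up to `depth` cycles in flight on the device.
		list.emplace_back();
		if(list.size() > depth)
			list.pop_front(); // waits only for the oldest cycle
	}

	// Wait for everything still in flight.
	list.clear();
}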

include/ircd/gpt/pipe/model.h
@@ -59,11 +59,13 @@ struct ircd::gpt::pipe::model::block
 	model::attn attn;
 	model::ffnn ffnn;
 
+	block(cl::data &, const off_t, const gpt::model::block &, const size_t);
 	block(const gpt::model::block &, const size_t);
 };
 
 struct ircd::gpt::pipe::model::decoder
 {
+	cl::data master;
+
 	model::block block[12];
 	tensor norm;

include/ircd/gpt/pipe/pipe.h
@@ -17,7 +17,6 @@ namespace ircd::gpt::pipe
 	struct code;
 	struct desc;
 	struct exec;
-	struct bank;
 
 	extern model *default_model;
 	extern code *default_code;
@@ -27,70 +26,6 @@ namespace ircd::gpt::pipe
 };
 
 #include "model.h"
-#include "ctrl.h"
-
-struct ircd::gpt::pipe::code
-:cl::code
-{
-	static const string_view compile_opts;
-
-	code();
-	~code() noexcept;
-};
-
-struct ircd::gpt::pipe::desc
-{
-	struct layer;
-
-	pipe::model *model;
-	pipe::code *code;
-	cl::data opts;
-	cl::data ctrl;
-	cl::data state;
-	cl::data xattn;
-	cl::data accum;
-	cl::data logit;
-	cl::kern anode;
-	std::unique_ptr<struct desc::layer> layer[12];
-	cl::kern cathode;
-	cl::kern lmhead;
-	cl::kern lmamax;
-
-	desc(pipe::code &, pipe::model &);
-};
-
-struct ircd::gpt::pipe::desc::layer
-{
-	cl::kern negative;
-	cl::kern selfattn;
-	cl::kern positive;
-
-	layer(pipe::desc &, const int);
-};
-
-struct ircd::gpt::pipe::exec
-{
-	pipe::desc *desc;
-
-	mutable_buffer out_ctrl;
-	const_buffer in_ctrl, in_opts;
-
-	cl::kern::range range_anode;
-	cl::kern::range range_coil;
-	cl::kern::range range_negative;
-	cl::kern::range range_selfattn;
-	cl::kern::range range_positive;
-	cl::kern::range range_cathode;
-	cl::kern::range range_lmhead;
-	cl::kern::range range_lmamax;
-
-	cl::exec send[2];
-	cl::exec tail[1];
-	cl::exec coil[12 * 3];
-	cl::exec head[3];
-	cl::exec recv[1];
-
-	exec(ctor_ctrl &, const ctor_opts &);
-	~exec() noexcept;
-};
+#include "code.h"
+#include "desc.h"
+#include "exec.h"

include/ircd/gpt/task.h
@@ -11,8 +11,77 @@
 #pragma once
 #define HAVE_IRCD_GPT_TASK_H
 
-/// Context to maintain state across calls.
+/// Task Control Page
 ///
+/// The control block is shared with our device software. Execution state is
+/// maintained in the task control block across cycles. The control block is
+/// the mutable state component for an execution; for the immutable component
+/// also shared with device software see opts.h.
+///
+struct ircd_gpt_task
+{
+	/// Header magic 0xC7012C70
+	uint magic;
+
+	/// Hypercall code set by our device software upon completion and
+	/// control transfer back to the host. Negative codes indicate errors,
+	/// positive codes are used for status and/or procedure calls; zero is
+	/// also an error.
+	enum ircd_gpt_hypercall call;
+
+	/// Token ring head. Tokens in the ring extend behind the head for
+	/// `tokens`. The `head` value is automatically modulated by device
+	/// software to wrap around the ring.
+	uint head;
+
+	/// Token counter. The counter indicates the number of valid tokens in
+	/// the context buffer. This value must not exceed the buffer size.
+	uint tokens;
+
+	/// Accumulates the number of task cycles. The cycle counter is
+	/// incremented by device software after each repetition of the kernel
+	/// pipeline to produce one additional token.
+	ulong cycle;
+
+	/// Accumulates the epoch count for the task. The counter is incremented
+	/// by one in device software before control returns back to the host.
+	/// Several cycles may occur during each epoch.
+	ulong epoch;
+
+	/// Accumulates the number of tokens produced by the task. Several
+	/// tokens may be produced each epoch, but currently only one token is
+	/// produced each cycle.
+	ulong produced;
+
+	/// Accumulates the number of tokens witnessed by the task. The number
+	/// of tokens in the context for each cycle is counted as witnessed.
+	ulong witnessed;
+
+	/// Accumulates time in microseconds elapsed for the task.
+	ulong elapsed;
+
+	/// PRNG xoshiro256 state. This is the de facto random seed which can be
+	/// set before cycle entry by the host. It is updated by device software
+	/// when used.
+	ulong rand[4];
+
+	/// Updated by the host with the value of the timestamp register as
+	/// sampled immediately before each transfer of control to the device.
+	ulong host_tsc;
+
+	/// State counters for the accept/error sequence codes.
+	uint accept_seq[4], error_seq[4];
+
+	/// The token buffer starts at offset 2048 and continues to the end of
+	/// the page; options specify the size of the tokens buffer in tokens.
+	/// Additional pages must be attached for larger buffer sizes.
+	ushort token[] __attribute__((aligned(2048)));
+}
+__attribute__((aligned(4096)));
+
+#ifdef __cplusplus
+/// Task Context
+///
+/// State for a task.
 struct ircd::gpt::task
 {
 	enum status :char;
@@ -20,36 +89,16 @@ struct ircd::gpt::task
 	/// Reference to the attached options.
 	const gpt::opts *opts {nullptr};
 
+	/// Reference to control pages.
+	struct ircd_gpt_task *ctrl {nullptr};
+
 	/// Current task status.
 	enum status status {'\0'};
 
-	/// State counters for the accept codes specified in the options.
-	uint8_t accept_seq[3] {0};
-
-	/// State counters for the error codes specified in the options.
-	uint8_t error_seq[3] {0};
-
-	/// Accumulates the number of executions by the user. Each call to the
-	/// interface is an execution.
-	uint64_t epoch {0};
-
-	/// Accumulates the number of tokens produced by the task. Several tokens
-	/// may be produced each epoch.
-	uint64_t produced {0};
-
-	/// Accumulates the number of tokens witnessed by the task. The number of
-	/// tokens in the context for each produced token is counted as witnessed.
-	uint64_t witnessed {0};
-
-	/// Accumulates the number of CPU reference cycles consumed by the task.
-	/// This counter does not reflect time when the task is queued or waiting
-	/// or offloaded to a co-processor/accelerator.
-	uint64_t cycles {0};
-
-	/// Accumulates the total time in milliseconds over all executions of the
-	/// task. This counter reflects total wall-clock time of all phases of
-	/// the execution.
-	milliseconds time {0ms};
+	task(const gpt::opts * = nullptr,
+	     struct ircd_gpt_task * = nullptr);
+	~task() noexcept;
 };
 
 /// The current status of a task is indicated with intelligible characters
@@ -61,3 +110,7 @@ enum ircd::gpt::task::status
 	ACCEPT = 'A', ///< Execution completed successfully.
 	ERROR = 'E', ///< Execution did not complete successfully.
 };
+
+static_assert(sizeof(struct ircd_gpt_task) == 4096);
+static_assert(std::is_standard_layout<struct ircd_gpt_task>::value);
+#endif
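
Aside: a standalone sketch, not part of the commit, of the token ring described above. `head` marks the oldest slot, `tokens` counts valid entries, and writes wrap modulo the buffer size once the ring is full, mirroring the update in ircd_gpt_lm_result later in this diff.

#include <cassert>

struct ring
{
	unsigned short token[8] {0};
	unsigned head {0}, tokens {0};

	void push(const unsigned short tok, const unsigned buffer_tokens = 8)
	{
		const bool full(tokens >= buffer_tokens);
		token[(head + tokens) % buffer_tokens] = tok;
		if(full)
			head = (head + 1) % buffer_tokens; // overwrite the oldest
		else
			tokens++;
	}
};

int main(void)
{
	ring r;
	for(unsigned short i(0); i < 10; ++i)
		r.push(i);

	assert(r.tokens == 8 && r.head == 2); // two oldest were overwritten
	return 0;
}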

include/ircd/gpt/token.h
@@ -9,78 +9,46 @@
 // full license for this software is available in the LICENSE file.
 
 #pragma once
-#define HAVE_IRCD_GPT_PIPE_CTRL_H
+#define HAVE_IRCD_GPT_TOKEN_H
 
-struct ctor_ctrl
-{
-	long call;
-	ulong pc;
-	ulong tokens;
-	ulong magic;
-	uchar pad[1024 - 32];
-	union
-	{
-		char str[3072];
-		ushort token[1536];
-	}
-	body;
-}
-__attribute__((aligned(4096)));
-
-struct ctor_opts
-{
-	uchar pad[4096];
-}
-__attribute__((aligned(4096)));
-
-#ifndef __OPENCL_C_VERSION__
-static_assert(sizeof(struct ctor_ctrl) == 4096);
-#endif
-
-#ifndef __OPENCL_C_VERSION__
-static_assert(sizeof(struct ctor_opts) == 4096);
-#endif
-
-#ifndef __cplusplus
-union token
+#ifdef __OPENCL_C_VERSION__
+union ircd_gpt_token
 {
 	float
 	word[768],
 	attn[12][64];
 };
 
-union tokenv
+union ircd_gpt_tokenv
 {
 	float4
 	word[768/4],
 	attn[12][64/4];
 };
 
-struct qkv
+struct ircd_gpt_qkv
 {
-	union token
+	union ircd_gpt_tokenv
 	qry,
 	key,
 	val;
 };
 
-struct qkvv
+struct ircd_gpt_qkvv
 {
-	union tokenv
+	union ircd_gpt_tokenv
 	qry,
 	key,
 	val;
 };
 
-struct attn_mask
+struct ircd_gpt_attn_mask
 {
 	bool
 	token[1024];
 };
 
-union aperature
+union ircd_gpt_aperature
 {
 	float
 	word[768],
@@ -90,7 +58,7 @@ union aperature
 	attn[12][64];
 };
 
-union aperaturev
+union ircd_gpt_aperaturev
 {
 	float4
 	word[768/4],
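
Aside: the word/attn overlays above alias only because 12 attention heads times 64 lanes equals the 768-float embedding width. A one-line standalone check, not part of the commit:

static_assert(12 * 64 == 768, "heads * head-dim must equal the embedding width");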

ircd/gpt.cc
@@ -34,30 +34,22 @@ namespace ircd::gpt
 	static f32
 	logit alignas(64) [65536],
+	embeds alignas(64) [1024 * 768],
 	scratch alignas(64) [1024 * 768];
 }
 
-namespace ircd::gpt
-{
-	extern void transform(ctor_ctrl &, const ctor_opts &);
-}
-
 decltype(ircd::gpt::log)
 ircd::gpt::log
 {
 	"gpt"
 };
 
-decltype(ircd::gpt::default_opts)
-ircd::gpt::default_opts;
-
 ircd::string_view
 ircd::gpt::generate(const mutable_buffer &out,
                     const string_view &in,
-                    const opts *opts,
-                    task *task)
+                    task &task)
 {
-	u16 buf[2][256];
+	u16 buf[2][1024];
 	const auto input_tokens
 	{
 		vocab::tokenize(buf[0], in)
@@ -65,7 +57,7 @@ ircd::gpt::generate(const mutable_buffer &out,
 	const auto output_tokens
 	{
-		generate(buf[1], input_tokens, opts, task)
+		generate(buf[1], input_tokens, task)
 	};
 
 	const auto output
@@ -79,13 +71,92 @@ ircd::gpt::generate(const mutable_buffer &out,
 ircd::vector_view<ircd::u16>
 ircd::gpt::generate(const vector_view<u16> &out,
                     const vector_view<const u16> &in,
-                    const opts *opts,
-                    task *task)
+                    task &task)
 {
+	assert(task.ctrl);
+	assert(task.opts);
+
+	uint ret(0);
+	bool halt(false);
+	const auto &opts(*task.opts);
+	auto &ctrl(*task.ctrl);
+	auto &errc(ctrl.error_seq);
+	auto &accc(ctrl.accept_seq);
+	ctrl.tokens = in.size();
+
+	const size_t tmax
+	{
+		in.size() + opts.limit
+	};
+
+	const vector_view<f32> accum
+	{
+		gpt::scratch, tmax * 768
+	};
+
+	const vector_view<f32> embeds
+	{
+		gpt::embeds, tmax * 768
+	};
+
+	for(uint j(0); j < in.size(); ++j)
+	{
+		const vector_view<f32> dst
+		{
+			data(embeds) + j * 768, 768
+		};
+
+		if(ircd::cl::enable)
+			ctrl.token[j] = in[j];
+		else
+			embed(data(dst), in[j], j, opts);
+
+		static char dbuf[512] {0};
+		char report[1536] {0};
+		char tmbuf[1][64] {{0}};
+		const size_t report_size = snprintf
+		(
+			report, sizeof(report),
+			"%-2u -- %-3u [%5u] --- --- %s 0 0 | %8s",
+			j,
+			ctrl.tokens,
+			ctrl.token[j],
+			vocab::debug(dbuf, ctrl.token[j]).c_str(),
+			pretty(tmbuf[0], milliseconds(ctrl.elapsed), 1).c_str()
+		);
+
+		log::info
+		{
+			log, "%s",
+			string_view{report, report_size}
+		};
+	}
+
+	uint64_t cycles(0);
+	milliseconds last_time {0};
+	util::timer stopwatch;
+	{
+		const prof::scope_cycles task_cycles
+		{
+			cycles
+		};
+
+		generate(task);
+	}
+	last_time = stopwatch.at<milliseconds>();
+	ctrl.elapsed += last_time.count();
+
+	/*
+	coil(data(scratch), tokens, *opts.model);
+	tail(logit, data(last_embed), *opts.model);
+	out[i] = argmax(logit, *opts);
+	*/
+
 	uint accc_thresh[3] {3, 3, 3};
 	for(uint i(0); i < 3; ++i)
 		for(uint j(3); j > 0; --j)
-			if(opts->accept_code[i][j - 1] == -1U)
+			if(opts.accept_code[i][j - 1] == -1U)
 				--accc_thresh[i];
 			else
 				break;
@@ -93,99 +164,22 @@ ircd::gpt::generate(const vector_view<u16> &out,
 	uint errc_thresh[3] {3, 3, 3};
 	for(uint i(0); i < 3; ++i)
 		for(uint j(3); j > 0; --j)
-			if(opts->error_code[i][j - 1] == -1U)
+			if(opts.error_code[i][j - 1] == -1U)
 				--errc_thresh[i];
 			else
 				break;
 
-	uint ret(0);
-	bool halt(false);
-	auto &errc(task->error_seq);
-	auto &accc(task->accept_seq);
-	for(uint i(0); !halt && i < out.size() && ret < opts->limit; ++i)
+	for(auto &j(ret); j + in.size() < ctrl.tokens && j < out.size() && !halt; ++j)
 	{
-		ctor_ctrl ctrl alignas(4096) {0};
-		ctrl.pc = 1;
-
-		const size_t tokens
-		{
-			in.size() + i
-		};
-
-		const vector_view<f32> scratch
-		{
-			gpt::scratch, tokens * 768
-		};
-
-		for(uint j(0); j < in.size(); ++j)
-		{
-			const vector_view<f32> dst
-			{
-				data(scratch) + j * 768, 768
-			};
-
-			if(ircd::cl::enable)
-				ctrl.body.token[ctrl.tokens++] = in[j];
-			else
-				embed(data(dst), in[j], j, *opts);
-		}
-
-		for(uint j(0); j < i; ++j)
-		{
-			const vector_view<f32> dst
-			{
-				data(scratch) + (in.size() + j) * 768, 768
-			};
-
-			if(ircd::cl::enable)
-				ctrl.body.token[ctrl.tokens++] = out[j];
-			else
-				embed(data(dst), out[j], in.size() + j, *opts);
-		}
-
-		assert(!ircd::cl::enable || ctrl.tokens == tokens);
-		const vector_view<f32> last_embed
-		{
-			data(scratch) + (tokens - 1) * 768, 768
-		};
-
-		const auto last_cycl(task->cycles);
-		milliseconds last_time {0};
-		{
-			util::timer stopwatch;
-			const prof::scope_cycles task_cycles
-			{
-				task->cycles
-			};
-
-			if(ircd::cl::enable)
-			{
-				static const ctor_opts opts alignas(4096) {0};
-
-				transform(ctrl, opts);
-				out[i] = ctrl.body.token[ctrl.tokens - 1];
-				assert(ctrl.tokens == tokens + 1);
-			} else {
-				coil(data(scratch), tokens, *opts->model);
-				tail(logit, data(last_embed), *opts->model);
-				out[i] = argmax(logit, *opts);
-			}
-
-			last_time = stopwatch.at<milliseconds>();
-			task->time += last_time;
-		}
+		out[j] = ctrl.token[(in.size() + j + ctrl.head) % opts.buffer_tokens];
 
 		for(uint j(0); j < 3; ++j)
-			errc[j] =
-			opts->error_code[j][errc[j]] == out[i]?
-				errc[j] + 1:
-				0;
+			errc[j] = opts.error_code[j][errc[j]] == out[j]?
+				errc[j] + 1: 0;
 
 		for(uint j(0); j < 3; ++j)
-			accc[j] =
-			opts->accept_code[j][accc[j]] == out[i]?
-				accc[j] + 1:
-				0;
+			accc[j] = opts.accept_code[j][accc[j]] == out[j]?
+				accc[j] + 1: 0;
 
 		for(uint j(0); j < 3; ++j)
 			halt |= accc_thresh[j] && accc[j] >= accc_thresh[j],
@@ -194,21 +188,23 @@ ircd::gpt::generate(const vector_view<u16> &out,
 		static char dbuf[512] {0};
 		char report[1536] {0};
 		char tmbuf[4][64] {0};
-		size_t report_size;
-		report_size = snprintf
+		const size_t bsz(ctrl.tokens - in.size());
+		const size_t report_size = snprintf
 		(
 			report, sizeof(report),
-			"%-2u %-3u %-3u [%5u] a:%u e:%u %s %8s %8s | %8s",
-			i,
+			"%-2u %-2u %-3u %-3u %-3u [%5u] a:%u e:%u %s %8s %8s | %8s",
+			j,
+			j + in.size(),
 			ctrl.tokens,
-			ret,
-			out[i],
+			ctrl.cycle,
+			ctrl.epoch,
+			out[j],
 			accc[0] + accc[1] + accc[2],
 			errc[0] + errc[1] + errc[2],
-			vocab::debug(dbuf, out[i]).c_str(),
-			pretty(tmbuf[0], last_time, 1).c_str(),
-			pretty(tmbuf[1], si(last_cycl), 1).c_str(),
-			pretty(tmbuf[2], task->time, 1).c_str()
+			vocab::debug(dbuf, out[j]).c_str(),
+			pretty(tmbuf[0], milliseconds(last_time / bsz), 1).c_str(),
+			pretty(tmbuf[1], si(cycles / bsz), 1).c_str(),
+			pretty(tmbuf[2], milliseconds(ctrl.elapsed), 1).c_str()
		);
 
 		log::info
@@ -216,24 +212,22 @@ ircd::gpt::generate(const vector_view<u16> &out,
 			log, "%s",
 			string_view{report, report_size}
 		};
-
-		++ret;
-		ctx::yield();
-		ctx::interruption_point();
 	}
 
+	ret = ctrl.tokens - in.size();
 	for(uint i(0); i < 3; ++i)
-		if(accc_thresh[i] && task->accept_seq[i] >= accc_thresh[i])
+		if(accc_thresh[i] && ctrl.accept_seq[i] >= accc_thresh[i])
 		{
 			ret -= (3 - accc_thresh[i]);
 			break;
 		}
-		else if(errc_thresh[i] && task->error_seq[i] >= errc_thresh[i])
+		else if(errc_thresh[i] && ctrl.error_seq[i] >= errc_thresh[i])
 		{
 			ret -= (3 - errc_thresh[i]);
 			break;
 		}
 
+	ctx::interruption_point();
 	return vector_view<u16>
 	{
 		out, ret

@@ -8,129 +8,27 @@
 // copyright notice and this permission notice is present in all copies. The
 // full license for this software is available in the LICENSE file.
 
-inline void
-ctor_local_bcast_ldr(__local float4 *const out,
-                     const uint ln,
-                     const uint li)
-{
-	for(uint stride = 1; stride < ln; stride <<= 1)
-	{
-		if(li < stride)
-			out[li + stride] = out[li];
-
-		barrier(CLK_LOCAL_MEM_FENCE);
-	}
-}
-
-inline void
-ctor_local_reduce_add_ldr(__local float4 *const out,
-                          const uint ln,
-                          const uint li)
-{
-	for(uint stride = ln >> 1; stride > 0; stride >>= 1)
-	{
-		barrier(CLK_LOCAL_MEM_FENCE);
-
-		if(li < stride)
-			out[li] += out[li + stride];
-	}
-}
-
-inline void
-ctor_local_reduce_max_ldr(__local float *const out,
-                          const uint ln,
-                          const uint li)
-{
-	for(uint stride = ln >> 1; stride > 0; stride >>= 1)
-	{
-		barrier(CLK_LOCAL_MEM_FENCE);
-
-		if(li < stride)
-			out[li] = max(out[li], out[li + stride]);
-	}
-}
-
-inline void
-ctor_local_reduce_tournament_ldr(__local float *const best,
-                                 __local ushort *const idx,
-                                 const uint ln,
-                                 const uint li)
-{
-	for(uint stride = ln >> 1; stride > 0; stride >>= 1)
-	{
-		barrier(CLK_LOCAL_MEM_FENCE);
-
-		if(li < stride && best[li] < best[li + stride])
-		{
-			best[li] = best[li + stride];
-			idx[li] = idx[li + stride];
-		}
-	}
-}
-
-inline void
-ctor_mean(__local float4 *const restrict out,
-          __local const float4 *const restrict in,
-          const uint num,
-          const uint i)
-{
-	out[i] = in[i];
-	ctor_local_reduce_add_ldr(out, num, i);
-
-	float numerator = 0.0f;
-	float4 numeratorv = out[i];
-	for(uint k = 0; k < 4; ++k)
-		numerator += numeratorv[k];
-
-	out[i] = numerator;
-	ctor_local_bcast_ldr(out, num, i);
-
-	numeratorv = out[i];
-	out[i] = numeratorv / (num * 4);
-}
-
-inline void
-ctor_norm(__local float4 *const out,
-          __local const float4 *const in,
-          __local float4 *const restrict tmp,
-          const uint num,
-          const uint i)
-{
-	ctor_mean(tmp, in, num, i);
-
-	const float4
-	sub_mean = in[i] - tmp[i];
-
-	tmp[i] = pow(sub_mean, 2);
-	ctor_mean(out, tmp, num, i);
-
-	const float4
-	epsilon = 0.00001f,
-	s = sqrt(out[i] + epsilon);
-
-	out[i] = sub_mean / s;
-}
-
 inline void
-ctor_norm_fmad(__local float4 *const out,
-               __local const float4 *const in,
-               __global const float4 *const restrict bias,
-               __global const float4 *const restrict weight,
-               const uint i)
+ircd_gpt_norm_fmad(__local float4 *const out,
+                   __local const float4 *const in,
+                   __global const float4 *const restrict bias,
+                   __global const float4 *const restrict weight,
+                   const uint i)
 {
 	out[i] = in[i] * weight[i] + bias[i];
 }
 
 // Matrix * Vector Multiply/Accumulate
 inline void
-ctor_sgemv(__local float4 *const restrict out,
-           __local const float4 *const restrict in,
-           __global const float4 *const restrict bias,
-           __global const float4 *const restrict weight,
-           const uint width,
-           const uint height,
-           const uint tiles,
-           const uint i)
+ircd_gpt_sgemv(__local float4 *const restrict out,
+               __local const float4 *const restrict in,
+               __global const float4 *const restrict bias,
+               __global const float4 *const restrict weight,
+               const uint width,
+               const uint height,
+               const uint tiles,
+               const uint i)
 {
 	const uint seg = height / tiles;
@@ -151,9 +49,9 @@ ctor_sgemv(__local float4 *const restrict out,
 }
 
 inline void
-ctor_gelu(__local float4 *const out,
-          __local const float4 *const in_,
-          const uint i)
+ircd_gpt_gelu(__local float4 *const out,
+              __local const float4 *const in_,
+              const uint i)
 {
 	float4 a,
 	in = in_[i];
@@ -178,14 +76,15 @@ ctor_gelu(__local float4 *const out,
 //
 
 __kernel void
-ctor_attn_fcon(__global const struct ctor_ctrl *const ctrl,
-               __constant const struct ctor_opts *const opts,
-               __global union aperaturev *const restrict out,
-               __global const union tokenv *const restrict in,
-               __global const float4 *const restrict norm_bias,
-               __global const float4 *const restrict norm_weight,
-               __global const float4 *const restrict fcon_bias,
-               __global const float4 *const restrict fcon_weight)
+ircd_gpt_ffnn(__global const struct ircd_gpt_task *const ctrl,
+              __constant const struct ircd_gpt_opts *const opts,
+              __global union ircd_gpt_tokenv *const restrict accum,
+              __global const float4 *const restrict norm_bias,
+              __global const float4 *const restrict norm_weight,
+              __global const float4 *const restrict fcon_bias,
+              __global const float4 *const restrict fcon_weight,
+              __global const float4 *const restrict proj_bias,
+              __global const float4 *const restrict proj_weight)
 {
 	const uint
 	gi = get_global_id(0),
@@ -195,31 +94,39 @@ ctor_attn_fcon(__global const struct ctor_ctrl *const ctrl,
 	wi = get_group_id(0),
 	wn = get_num_groups(0);
 
-	__local union aperaturev token;
+	__local union ircd_gpt_aperaturev token;
 	__local float4 tmp[768/4];
-	token.word[li] = in[wi].word[li];
+
+	// Fetch local copy of the global accumulator. We operate on a cached
+	// copy as input, and add our output to the global upon completion.
+	token.word[li] = accum[wi].word[li];
 
 	// Layer re-normalization
-	ctor_norm(token.word, token.word, tmp, ln, li);
-	ctor_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
+	ircd_simt_math_norm_f4lldr(token.word, token.word, tmp, ln, li);
+	ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
 
 	// Fully connected
-	for(uint i = 0; i < 3; ++i)
-		ctor_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 2304/4, 768/4, 4, i * ln + li);
+	for(uint i = 0; i < 4; ++i)
+		ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 3072/4, 768/4, 4, i * ln + li);
 
-	// Export queries, keys, and values.
-	for(uint i = 0; i < 3; ++i)
-		out[wi].proj[i][li] = token.proj[i][li];
+	// Gaussian Error Linear Unit
+	for(uint i = 0; i < 4; ++i)
+		ircd_gpt_gelu(token.fcon, token.fcon, i * ln + li);
+
+	// Projection
+	ircd_gpt_sgemv(tmp, token.fcon, proj_bias, proj_weight, 768/4, 3072/4, 4, li);
+
+	// Accumulation; end of layer
+	accum[wi].word[li] += tmp[li];
 }
 
 __kernel void
-ctor_attn_proj(__global const struct ctor_ctrl *const ctrl,
-               __constant const struct ctor_opts *const opts,
-               __global union tokenv *const restrict accum,
-               __global const union tokenv *const restrict xattn,
-               __global const float4 *const restrict proj_bias,
-               __global const float4 *const restrict proj_weight)
+ircd_gpt_attn_proj(__global const struct ircd_gpt_task *const ctrl,
+                   __constant const struct ircd_gpt_opts *const opts,
+                   __global union ircd_gpt_tokenv *const restrict accum,
+                   __local const union ircd_gpt_tokenv *const restrict xattn,
+                   __global const float4 *const restrict proj_bias,
+                   __global const float4 *const restrict proj_weight)
 {
 	const uint
 	gi = get_global_id(0),
@@ -234,25 +141,24 @@ ctor_attn_proj(__global const struct ctor_ctrl *const ctrl,
 	out[768/4];
 
 	// Fetch
-	in[li] = xattn[wi].word[li];
+	in[li] = xattn->word[li];
+
+	// Need this here if xattn is __local
+	barrier(CLK_LOCAL_MEM_FENCE);
 
 	// Projection
-	ctor_sgemv(out, in, proj_bias, proj_weight, 768/4, 768/4, 1, li);
+	ircd_gpt_sgemv(out, in, proj_bias, proj_weight, 768/4, 768/4, 1, li);
 
 	// Accumulation; end of layer
 	accum[wi].word[li] += out[li];
 }
 
 __kernel void
-ctor_ffnn(__global const struct ctor_ctrl *const ctrl,
-          __constant const struct ctor_opts *const opts,
-          __global union tokenv *const restrict accum,
-          __global const float4 *const restrict norm_bias,
-          __global const float4 *const restrict norm_weight,
-          __global const float4 *const restrict fcon_bias,
-          __global const float4 *const restrict fcon_weight,
-          __global const float4 *const restrict proj_bias,
-          __global const float4 *const restrict proj_weight)
+ircd_gpt_attn_self(__global const struct ircd_gpt_task *const ctrl,
+                   __constant const struct ircd_gpt_opts *const opts,
+                   __local union ircd_gpt_tokenv *const restrict out,
+                   __global const struct ircd_gpt_qkvv *const restrict token,
+                   __global const struct ircd_gpt_attn_mask *const restrict mask) // [1024][1024],
 {
 	const uint
 	gi = get_global_id(0),
@@ -262,96 +168,13 @@ ctor_ffnn(__global const struct ctor_ctrl *const ctrl,
 	wi = get_group_id(0),
 	wn = get_num_groups(0);
 
-	__local union aperaturev token;
-	__local float4 tmp[768/4];
-
-	// Fetch local copy of the global accumulator. We operate on a cached
-	// copy as input, and add our output to the global upon completion.
-	token.word[li] = accum[wi].word[li];
-
-	// Layer re-normalization
-	ctor_norm(token.word, token.word, tmp, ln, li);
-	ctor_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
-
-	// Fully connected
-	for(uint i = 0; i < 4; ++i)
-		ctor_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 3072/4, 768/4, 4, i * ln + li);
-
-	// Gaussian Error Linear Unit
-	for(uint i = 0; i < 4; ++i)
-		ctor_gelu(token.fcon, token.fcon, i * ln + li);
-
-	// Projection
-	ctor_sgemv(tmp, token.fcon, proj_bias, proj_weight, 768/4, 3072/4, 4, li);
-
-	// Accumulation; end of layer
-	accum[wi].word[li] += tmp[li];
-}
-
-__kernel void
-ctor_backend(__global const struct ctor_ctrl *const ctrl,
-             __constant const struct ctor_opts *const opts,
-             __global union tokenv *const restrict accum,
-             __global const union tokenv *const restrict xattn,
-             __global const float4 *const restrict attn_proj_bias,
-             __global const float4 *const restrict attn_proj_weight,
-             __global const float4 *const restrict ffnn_norm_bias,
-             __global const float4 *const restrict ffnn_norm_weight,
-             __global const float4 *const restrict ffnn_fcon_bias,
-             __global const float4 *const restrict ffnn_fcon_weight,
-             __global const float4 *const restrict ffnn_proj_bias,
-             __global const float4 *const restrict ffnn_proj_weight)
-{
-	ctor_attn_proj
-	(
-		ctrl,
-		opts,
-		accum,
-		xattn,
-		attn_proj_bias,
-		attn_proj_weight
-	);
-
-	ctor_ffnn
-	(
-		ctrl,
-		opts,
-		accum,
-		ffnn_norm_bias,
-		ffnn_norm_weight,
-		ffnn_fcon_bias,
-		ffnn_fcon_weight,
-		ffnn_proj_bias,
-		ffnn_proj_weight
-	);
-}
-
-//
-// ctrl
-//
-
-__kernel void
-ctor_attn_self(__global const struct ctor_ctrl *const ctrl,
-               __constant const struct ctor_opts *const opts,
-               __global union tokenv *const restrict out,
-               __global const struct qkvv *const restrict token,
-               __global const struct attn_mask *const restrict mask) // [1024][1024],
-{
-	__local struct
+	__local union
 	{
 		float
-		attn[12][32];
+		attn[12][96];
 	}
 	self;
 
-	const uint
-	gi = get_global_id(0),
-	gn = get_global_size(0),
-	li = get_local_id(0),
-	ln = get_local_size(0),
-	wi = get_group_id(0),
-	wn = get_num_groups(0);
-
 	for(uint i = 0; i < wn; ++i)
 		if(mask[wi].token[i])
 			self.attn[li][i] = 0.0f;
@@ -389,114 +212,167 @@ ctor_attn_self(__global const struct ctor_ctrl *const ctrl,
 		self.attn[li][i] /= acc;
 
 	for(uint j = 0; j < 64/4; ++j)
-		out[wi].attn[li][j] = 0.0f;
+		out->attn[li][j] = 0.0f;
 
 	for(uint i = 0; i < wn; ++i)
 		for(uint j = 0; j < 64/4; ++j)
-			out[wi].attn[li][j] += token[i].val.attn[li][j] * self.attn[li][i];
+			out->attn[li][j] += token[i].val.attn[li][j] * self.attn[li][i];
 }
 
-//
-// leads
-//
-
 __kernel void
-ctor_anode0(__global const struct ctor_ctrl *const ctrl,
-            __constant const struct ctor_opts *const opts,
-            __global union tokenv *const restrict accum,
-            __global const union tokenv *const restrict pos,
-            __global const union tokenv *const restrict vocab)
+ircd_gpt_attn_fcon(__global const struct ircd_gpt_task *const ctrl,
+                   __constant const struct ircd_gpt_opts *const opts,
+                   __global union ircd_gpt_aperaturev *const restrict out,
+                   __global const union ircd_gpt_tokenv *const restrict in,
+                   __global const float4 *const restrict norm_bias,
+                   __global const float4 *const restrict norm_weight,
+                   __global const float4 *const restrict fcon_bias,
+                   __global const float4 *const restrict fcon_weight)
 {
 	const uint
-	gi = get_global_id(0),
-	gn = get_global_size(0),
 	li = get_local_id(0),
-	wi = get_group_id(0);
+	ln = get_local_size(0),
+	wi = get_group_id(0),
+	wn = get_num_groups(0);
 
-	const ushort
-	token = ctrl->body.token[wi];
+	__local union ircd_gpt_aperaturev token;
+	__local float4 tmp[768/4];
+	token.word[li] = in[wi].word[li];
 
-	const float4
-	wte = vocab[token].word[li],
-	wpe = pos[wi].word[li];
+	// Layer re-normalization
+	ircd_simt_math_norm_f4lldr(token.word, token.word, tmp, ln, li);
+	ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
 
-	accum[wi].word[li] = wte + wpe;
+	// Fully connected
+	for(uint i = 0; i < 3; ++i)
+		ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 2304/4, 768/4, 4, i * ln + li);
+
+	// Export queries, keys, and values.
+	for(uint i = 0; i < 3; ++i)
+		out[wi].proj[i][li] = token.proj[i][li];
 }
 
 __kernel void
-ctor_anode1(__global const struct ctor_ctrl *const ctrl,
-            __constant const struct ctor_opts *const opts,
-            __global union tokenv *const restrict accum,
-            __global const union tokenv *const restrict pos,
-            __global const union tokenv *const restrict vocab)
+ircd_gpt_coil(__global const struct ircd_gpt_task *const ctrl,
+              __constant const struct ircd_gpt_opts *const opts,
+              __global union ircd_gpt_tokenv *const restrict accum,
+              __global const struct ircd_gpt_qkvv *const restrict state,
+              __global const struct ircd_gpt_attn_mask *const restrict mask, // [1024][1024],
+              __global const float4 *const restrict attn_proj_bias,
+              __global const float4 *const restrict attn_proj_weight,
+              __global const float4 *const restrict ffnn_norm_bias,
+              __global const float4 *const restrict ffnn_norm_weight,
+              __global const float4 *const restrict ffnn_fcon_bias,
+              __global const float4 *const restrict ffnn_fcon_weight,
+              __global const float4 *const restrict ffnn_proj_bias,
+              __global const float4 *const restrict ffnn_proj_weight)
+{
+	__local union ircd_gpt_tokenv value;
+
+	ircd_gpt_attn_self
+	(
+		ctrl,
+		opts,
+		&value,
+		state,
+		mask
+	);
+
+	ircd_gpt_attn_proj
+	(
+		ctrl,
+		opts,
+		accum,
+		&value,
+		attn_proj_bias,
+		attn_proj_weight
+	);
+
+	ircd_gpt_ffnn
+	(
+		ctrl,
+		opts,
+		accum,
+		ffnn_norm_bias,
+		ffnn_norm_weight,
+		ffnn_fcon_bias,
+		ffnn_fcon_weight,
+		ffnn_proj_bias,
+		ffnn_proj_weight
+	);
+}
+
+//
+// frontend
+//
+
+inline void
+_ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
+                   __constant const struct ircd_gpt_opts *const opts,
+                   __global union ircd_gpt_tokenv *const restrict out,
+                   __global const union ircd_gpt_tokenv *const restrict pos,
+                   __global const union ircd_gpt_tokenv *const restrict vocab,
+                   const uint out_idx,
+                   const uint tok_idx,
+                   const uint word_idx)
+{
+	const ushort
+	token = ctrl->token[(ctrl->head + tok_idx) % opts->buffer_tokens];
+
+	const float4
+	wte = vocab[token].word[word_idx],
+	wpe = pos[tok_idx].word[word_idx];
+
+	out[out_idx].word[word_idx] = wte + wpe;
+}
+
+__kernel void
+ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
+                  __constant const struct ircd_gpt_opts *const opts,
+                  __global union ircd_gpt_tokenv *const restrict accum,
+                  __global const union ircd_gpt_tokenv *const restrict pos,
+                  __global const union ircd_gpt_tokenv *const restrict vocab)
 {
 	const uint
 	li = get_local_id(0);
 
 	for(uint i = 0; i < ctrl->tokens; ++i)
-	{
-		const ushort
-		token = ctrl->body.token[i];
-
-		const float4
-		wte = vocab[token].word[li],
-		wpe = pos[i].word[li];
-
-		accum[i].word[li] = wte + wpe;
-	}
+		_ircd_gpt_lm_embed(ctrl, opts, accum, pos, vocab, i, i, li);
 }
 
 __kernel void
-ctor_anode2(__global const struct ctor_ctrl *const ctrl,
-            __constant const struct ctor_opts *const opts,
-            __global union tokenv *const restrict accum,
-            __global const union tokenv *const restrict pos,
-            __global const union tokenv *const restrict vocab)
-{
-	const uint
-	gi = get_global_id(0);
-
-	const ushort
-	token = ctrl->body.token[gi];
-
-	for(uint i = 0; i < 768/4; ++i)
-	{
-		const float4
-		wte = vocab[token].word[i],
-		wpe = pos[gi].word[i];
-
-		accum[gi].word[i] = wte + wpe;
-	}
-}
-
-__kernel void
-ctor_cathode(__global const struct ctor_ctrl *const ctrl,
-             __constant const struct ctor_opts *const opts,
-             __global union tokenv *const restrict accum,
-             __global const float4 *const restrict norm_bias,
-             __global const float4 *const restrict norm_weight)
+ircd_gpt_lm_norm(__global const struct ircd_gpt_task *const ctrl,
+                 __constant const struct ircd_gpt_opts *const opts,
+                 __global union ircd_gpt_tokenv *const restrict accum,
+                 __global const float4 *const restrict norm_bias,
+                 __global const float4 *const restrict norm_weight)
 {
 	const uint
 	li = get_local_id(0),
 	ln = get_local_size(0),
 	wi = get_global_offset(0) / ln + get_group_id(0);
 
-	__local union tokenv
+	__local union ircd_gpt_tokenv
 	token, tmp;
 	token.word[li] = accum[wi].word[li];
 
 	// Final re-normalization
-	ctor_norm(token.word, token.word, tmp.word, ln, li);
-	ctor_norm_fmad(token.word, token.word, norm_bias, norm_weight, li);
+	ircd_simt_math_norm_f4lldr(token.word, token.word, tmp.word, ln, li);
+	ircd_gpt_norm_fmad(token.word, token.word, norm_bias, norm_weight, li);
 
 	accum[0].word[li] = token.word[li];
 }
 
 __kernel void
-ctor_lmhead(__global const struct ctor_ctrl *const ctrl,
-            __constant const struct ctor_opts *const opts,
-            __global float *const restrict logit,
-            __global const union tokenv *const restrict accum,
-            __global const union tokenv *const restrict token)
+ircd_gpt_lm_logit(__global const struct ircd_gpt_task *const ctrl,
+                  __constant const struct ircd_gpt_opts *const opts,
+                  __global float *const restrict logit,
+                  __global const union ircd_gpt_tokenv *const restrict accum,
+                  __global const union ircd_gpt_tokenv *const restrict token)
 {
 	const uint
 	gi = get_global_id(0);
@@ -519,10 +395,79 @@ ctor_lmhead(__global const struct ctor_ctrl *const ctrl,
 	logit[gi] = res;
 }
 
+inline void
+ircd_gpt_leave(__global struct ircd_gpt_task *const ctrl,
+               __constant const struct ircd_gpt_opts *const opts,
+               const uint li)
+{
+	// No action for other threads right now
+	if(li != 0)
+		return;
+
+	// Run debug checks and assertions.
+	#ifdef RB_DEBUG
+	if(li == 0 && ctrl->call == IRCD_GPT_ECOMPLETE)
+		if(ctrl->tokens < 2)
+			ctrl->call = IRCD_GPT_ETOKENS;
+	#endif
+
+	// If the call value has been set to something other than default we
+	// do nothing else here.
+	if(ctrl->call != IRCD_GPT_ECOMPLETE)
+		return;
+
+	// On the last cycle, with no prior call or error code set, indicate
+	// a nominal exit condition.
+	if(ctrl->cycle + 1 >= opts->limit)
+	{
+		ctrl->call = IRCD_GPT_ACCEPT;
+		ctrl->epoch += 1;
+	}
+
+	ctrl->cycle += 1;
+}
+
+inline void
+ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl,
+                   __constant const struct ircd_gpt_opts *const opts,
+                   const uint li,
+                   __local const ushort *const restrict idx)
+{
+	// To read from cells other than idx[0] we need this barrier.
+	if(opts->top_k > 1)
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+	// No action for other threads right now
+	if(li != 0)
+		return;
+
+	// When the hypercall code is already set, bail here.
+	if(ctrl->call != IRCD_GPT_ECOMPLETE)
+		return;
+
+	const bool
+	buffer_full = ctrl->tokens >= opts->buffer_tokens;
+
+	const ulong
+	rnd = ircd_simt_rand_xoshiro256pg(ctrl->rand),
+	sel = rnd % max(opts->top_k, 1U);
+
+	const ushort
+	token = idx[sel],
+	token_idx = (ctrl->head + ctrl->tokens) % opts->buffer_tokens;
+
+	ctrl->token[token_idx] = token;
+
+	if(buffer_full)
+		ctrl->head = (ctrl->head + 1) % opts->buffer_tokens;
+	else
+		ctrl->tokens++;
+}
+
 __kernel void
-ctor_lmamax(__global struct ctor_ctrl *const ctrl,
-            __constant const struct ctor_opts *const opts,
-            __global const float *const restrict logit)
+ircd_gpt_lm_select(__global struct ircd_gpt_task *const ctrl,
+                   __constant const struct ircd_gpt_opts *const opts,
+                   __global const float *const restrict logit)
 {
 	const uint
 	gi = get_global_id(0),
@@ -535,25 +480,13 @@ ctor_lmamax(__global struct ctor_ctrl *const ctrl,
 	ti = tn * li;
 
 	__local ushort idx[192];
-	__local float best[192];
 
 	idx[li] = ti;
 	for(uint j = ti + 1; j < ti + tn && j < 50257; ++j)
 		if(logit[j] > logit[idx[li]])
 			idx[li] = j;
 
-	best[li] = logit[idx[li]];
-	ctor_local_reduce_tournament_ldr(best, idx, ln, li);
-
-	if(li == 0 && ctrl->call == -1)
-		ctrl->body.token[ctrl->tokens++] = idx[li];
-
-	if(li == 0 && ctrl->call == -1)
-		ctrl->call = 1;
-
-	#ifdef RB_DEBUG
-	if(li == 0 && ctrl->call == 1)
-		if(ctrl->tokens < 2)
-			ctrl->call = -2;
-	#endif
+	ircd_simt_sort_idx16_flldr(idx, logit, ln, li);
+	ircd_gpt_lm_result(ctrl, opts, li, idx);
+	ircd_gpt_leave(ctrl, opts, li);
 }
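
Aside: a host-side C++ sketch, not part of the commit, of the selection policy implemented by ircd_gpt_lm_select / ircd_gpt_lm_result above: order token indices by descending logit, then draw uniformly over the top k, where k == 1 degenerates to argmax. std::mt19937_64 stands in here for the xoshiro256 state kept in the control page.

#include <algorithm>
#include <array>
#include <cstdint>
#include <numeric>
#include <random>

uint16_t select_token(const std::array<float, 8> &logit,
                      const unsigned top_k,
                      std::mt19937_64 &rng)
{
	// Rank token indices by descending logit.
	std::array<uint16_t, 8> idx;
	std::iota(idx.begin(), idx.end(), uint16_t(0));
	std::sort(idx.begin(), idx.end(), [&logit](const auto a, const auto b)
	{
		return logit[a] > logit[b];
	});

	// Flip a random coin over the top k candidates.
	const uint64_t rnd(rng());
	return idx[rnd % std::max(top_k, 1U)];
}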

@@ -142,7 +142,7 @@ ircd::gpt::model::init_from_cache(const string_view &cache_path)
 	fs::map::opts map_opts;
 	map_opts.huge2mb = true;
-	map_opts.locked = false;
+	map_opts.locked = true;
 	default_model_shm = fs::map
 	{
 		fd, map_opts, sizeof(decoder)

@@ -8,21 +8,33 @@
 // copyright notice and this permission notice is present in all copies. The
 // full license for this software is available in the LICENSE file.
 
-#include <ircd/gpt/pipe/pipe.h>
-
-namespace ircd::gpt
-{
-	void transform(ctor_ctrl &, const ctor_opts &);
-}
-
 namespace ircd::gpt::pipe
 {
-	static ircd::cl::exec::opts negative_opts, positive_opts, selfattn_opts, cathode_opts, anode_opts,
-	lmhead_opts, lmamax_opts;
+	static void profile_dumplog(pipe::exec &);
+
+	static ircd::cl::exec::opts
+	negative_opts, positive_opts, selfattn_opts,
+	cathode_opts, anode_opts, lmhead_opts, lmamax_opts;
+
+	extern conf::item<size_t> flush_cycles;
+	extern conf::item<size_t> queue_cycles;
 	extern const ircd::run::changed handle_quit;
 }
 
+decltype(ircd::gpt::pipe::queue_cycles)
+ircd::gpt::pipe::queue_cycles
+{
+	{ "name",     "ircd.gpt.pipe.queue" },
+	{ "default",  1L,                   },
+};
+
+decltype(ircd::gpt::pipe::flush_cycles)
+ircd::gpt::pipe::flush_cycles
+{
+	{ "name",     "ircd.gpt.pipe.flush" },
+	{ "default",  0L,                   },
+};
+
 decltype(ircd::gpt::pipe::default_model)
 ircd::gpt::pipe::default_model;
@@ -82,144 +94,233 @@ noexcept
 //
 
 void
-ircd::gpt::transform(ctor_ctrl &ctrl,
-                     const ctor_opts &opts)
+ircd::gpt::generate(task &task)
 {
 	if(unlikely(!pipe::default_model))
 		pipe::init();
 
-	ctrl.call = -1;
-	pipe::exec
+	const auto &opts
 	{
-		ctrl, opts
+		*task.opts
 	};
 
+	auto &ctrl
+	{
+		*task.ctrl
+	};
+
+	ctrl.call = IRCD_GPT_ECOMPLETE;
+	ctrl.host_tsc = prof::cycles();
+
+	size_t cycle(ctrl.cycle);
+	const size_t tokens(ctrl.tokens);
+	std::deque<pipe::exec> list;
+	for(; cycle < opts.limit; ++cycle)
+	{
+		// When the release/acquire bits are set the control pages are sent
+		// and received; only set on first and last iterations of this loop.
+		const bool
+		rel(cycle == 0),
+		acq(cycle + 1 >= opts.limit);
+
+		// Enqueue the cycle's commands
+		list.emplace_back
+		(
+			task, tokens + cycle, rel, acq
+		);
+
+		// Conditions for a cl::flush here; this is not default but
+		// may be configured to improve some workloads.
+		const bool flush
+		{
+			// Flushing here is enabled by the configuration
+			pipe::flush_cycles
+
+			// Skip flushing on cycles already performing IO or waiting.
+			&& !rel && !acq && list.size() <= pipe::queue_cycles
+
+			// The configuration item can specify an interval greater than
+			// one between flushes.
+			&& cycle % pipe::flush_cycles == 0
+		};
+
+		if(flush)
+			cl::flush();
+
+		// Enqueue consecutive repetitions of our kernel batch before waiting
+		// on the first; based on the configuration. XXX get from ircd::cl
+		if(list.size() <= pipe::queue_cycles)
+			continue;
+
+		// Profiling branch
+		if((false))
+		{
+			auto &ex(list.front());
+			profile_dumplog(ex);
+		}
+
+		// Destructing the front of the queue waits for completion by
+		// yielding this ircd::ctx.
+		list.pop_front();
+	}
+
+	// Wait for all unfinished
+	list.clear();
+
+	// Interp error codes
 	if(unlikely(ctrl.call <= 0))
 		throw error
 		{
 			"hyper (#%d) :%s",
-			abs(ctrl.call),
-			ctrl.body.str,
+			abs(int(ctrl.call)),
+			reflect(ctrl.call),
 		};
+
+	always_assert(ctrl.cycle == cycle);
+}
+
+void
+ircd::gpt::pipe::profile_dumplog(pipe::exec &exec)
+{
+	constexpr size_t coils
+	{
+		sizeof(exec.coil) / sizeof(cl::exec)
+	};
+
+	for(size_t i(0); i < coils; ++i)
+	{
+		exec.coil[i].wait();
+		const auto &pro
+		{
+			exec.coil[i].profile()
+		};
+
+		char tmbuf[4][32] {{0}};
+		log::logf
+		{
+			log, log::level::DEBUG,
+			"coil:%-2lu %8s %8s %8s %8s\n",
+			i,
+			util::pretty(tmbuf[0], si(pro[0]), 1),
+			util::pretty(tmbuf[1], si(pro[1]), 1),
+			util::pretty(tmbuf[2], si(pro[2]), 1),
+			util::pretty(tmbuf[3], si(pro[3]), 1),
+		};
+	}
 }
 
 //
 // pipe::exec
 //
 
-ircd::gpt::pipe::exec::exec(ctor_ctrl &ctrl,
-                            const ctor_opts &opts)
+ircd::gpt::pipe::exec::exec(task &task,
+                            const size_t tokens,
+                            const bool release,
+                            const bool acquire)
 :desc
 {
 	default_desc
 }
-,out_ctrl
+,send_opts
 {
-	reinterpret_cast<char *>(&ctrl), sizeof(ctor_ctrl)
+	reinterpret_cast<const char *>(task.opts),
+	release? sizeof(struct ircd_gpt_opts): 0
 }
-,in_ctrl
+,send_ctrl
 {
-	reinterpret_cast<const char *>(&ctrl), sizeof(ctor_ctrl)
+	reinterpret_cast<const char *>(task.ctrl),
+	release? sizeof(struct ircd_gpt_task): 0
 }
-,in_opts
+,recv_ctrl
 {
-	reinterpret_cast<const char *>(&opts), sizeof(ctor_opts)
+	reinterpret_cast<char *>(task.ctrl),
+	acquire? sizeof(struct ircd_gpt_task): 0
 }
-,range_anode
+,range_lm_embed
 {
-	{ ctrl.tokens, 0, },
-	{ 1, 0, },
-}
-,range_coil
-{
-	{ ctrl.tokens * 192UL, 0, },
+	{ 1 * 192UL, 0, },
 	{ 192UL, 0, },
 }
 ,range_negative
 {
-	range_coil
-}
-,range_selfattn
-{
-	range_coil
+	{ tokens * 192UL, 0, },
+	{ 192UL, 0, },
 }
 ,range_positive
 {
-	range_coil
+	{ tokens * 192UL, 0, },
+	{ 192UL, 0, },
 }
-,range_cathode
+,range_lm_norm
 {
 	{ 1 * 192UL, 0 },
 	{ 192UL, 0 },
-	{ (ctrl.tokens - 1) * 192UL, 0 },
+	{ (tokens - 1) * 192UL, 0 },
 }
-,range_lmhead
+,range_lm_logit
 {
 	{ 262 * 192UL, 0 }, // align_up(50257) / 192
 	{ 192UL, 0 },
 }
-,range_lmamax
+,range_lm_select
 {
 	{ 1 * 192UL, 0 },
 	{ 192UL, 0 },
 }
-,send
+,release_opts
 {
-	{ desc->opts, in_opts },
-	{ desc->ctrl, in_ctrl },
+	desc->opts, send_opts
 }
-,tail
+,release_ctrl
 {
-	{ desc->anode, range_anode, anode_opts },
+	desc->ctrl, send_ctrl
+}
+,lm_embed
+{
+	desc->lm_embed, range_lm_embed, anode_opts
 }
 ,coil
 {
 	{ desc->layer[0x00]->negative, range_negative, negative_opts },
-	{ desc->layer[0x00]->selfattn, range_selfattn, selfattn_opts },
 	{ desc->layer[0x00]->positive, range_positive, positive_opts },
 	{ desc->layer[0x01]->negative, range_negative, negative_opts },
-	{ desc->layer[0x01]->selfattn, range_selfattn, selfattn_opts },
 	{ desc->layer[0x01]->positive, range_positive, positive_opts },
 	{ desc->layer[0x02]->negative, range_negative, negative_opts },
-	{ desc->layer[0x02]->selfattn, range_selfattn, selfattn_opts },
 	{ desc->layer[0x02]->positive, range_positive, positive_opts },
 	{ desc->layer[0x03]->negative, range_negative, negative_opts },
-	{ desc->layer[0x03]->selfattn, range_selfattn, selfattn_opts },
 	{ desc->layer[0x03]->positive, range_positive, positive_opts },
 	{ desc->layer[0x04]->negative, range_negative, negative_opts },
-	{ desc->layer[0x04]->selfattn, range_selfattn, selfattn_opts },
 	{ desc->layer[0x04]->positive, range_positive, positive_opts },
 	{ desc->layer[0x05]->negative, range_negative, negative_opts },
-	{ desc->layer[0x05]->selfattn, range_selfattn, selfattn_opts },
 	{ desc->layer[0x05]->positive, range_positive, positive_opts },
 	{ desc->layer[0x06]->negative, range_negative, negative_opts },
-	{ desc->layer[0x06]->selfattn, range_selfattn, selfattn_opts },
 	{ desc->layer[0x06]->positive, range_positive, positive_opts },
 	{ desc->layer[0x07]->negative, range_negative, negative_opts },
-	{ desc->layer[0x07]->selfattn, range_selfattn, selfattn_opts },
 	{ desc->layer[0x07]->positive, range_positive, positive_opts },
 	{ desc->layer[0x08]->negative, range_negative, negative_opts },
-	{ desc->layer[0x08]->selfattn, range_selfattn, selfattn_opts },
 	{ desc->layer[0x08]->positive, range_positive, positive_opts },
 	{ desc->layer[0x09]->negative, range_negative, negative_opts },
-	{ desc->layer[0x09]->selfattn, range_selfattn, selfattn_opts },
 	{ desc->layer[0x09]->positive, range_positive, positive_opts },
 	{ desc->layer[0x0a]->negative, range_negative, negative_opts },
-	{ desc->layer[0x0a]->selfattn, range_selfattn, selfattn_opts },
 	{ desc->layer[0x0a]->positive, range_positive, positive_opts },
 	{ desc->layer[0x0b]->negative, range_negative, negative_opts },
-	{ desc->layer[0x0b]->selfattn, range_selfattn, selfattn_opts },
 	{ desc->layer[0x0b]->positive, range_positive, positive_opts },
 }
-,head
+,lm_norm
 {
-	{ desc->cathode, range_cathode, cathode_opts },
-	{ desc->lmhead, range_lmhead, lmhead_opts },
-	{ desc->lmamax, range_lmamax, lmamax_opts },
+	desc->lm_norm, range_lm_norm, cathode_opts
 }
-,recv
+,lm_logit
 {
-	{ desc->ctrl, out_ctrl },
+	desc->lm_logit, range_lm_logit, lmhead_opts
+}
+,lm_select
+{
+	desc->lm_select, range_lm_select, lmamax_opts
+}
+,acquire_ctrl
+{
+	desc->ctrl, recv_ctrl
 }
 {
 }
@@ -296,29 +397,14 @@ ircd::gpt::pipe::desc::desc(pipe::code &code,
 {
 	&code
 }
-,opts
-{
-	4_KiB,
-	const_buffer{}
-}
-,ctrl
-{
-	4_KiB,
-	mutable_buffer{}
-}
 ,state
 {
-	32 * 3 * 768 * sizeof(float),
-	mutable_buffer{}
-}
-,xattn
-{
-	32 * 1 * 768 * sizeof(float),
+	96 * 3 * 768 * sizeof(float),
 	mutable_buffer{}
 }
 ,accum
 {
-	32 * 768 * sizeof(float),
+	96 * 768 * sizeof(float),
 	mutable_buffer{}
 }
 ,logit
@@ -326,16 +412,54 @@ ircd::gpt::pipe::desc::desc(pipe::code &code,
 	65536 * sizeof(float),
 	mutable_buffer{}
 }
-,anode
+,ctrl
+{
+	sizeof(struct ircd_gpt_task),
+	mutable_buffer{}
+}
+,opts
+{
+	sizeof(struct ircd_gpt_opts),
+	const_buffer{}
+}
+,lm_embed
 {
 	code,
-	"ctor_anode2",
+	"ircd_gpt_lm_embed",
 	ctrl,
 	opts,
 	accum,
 	model.embed->pos,
 	model.embed->token,
 }
+,lm_norm
+{
+	code,
+	"ircd_gpt_lm_norm",
+	ctrl,
+	opts,
+	accum,
+	model.decode->norm.bias,
+	model.decode->norm.weight,
+}
+,lm_logit
+{
+	code,
+	"ircd_gpt_lm_logit",
+	ctrl,
+	opts,
+	logit,
+	accum,
+	model.embed->token,
+}
+,lm_select
+{
+	code,
+	"ircd_gpt_lm_select",
+	ctrl,
+	opts,
+	logit,
+}
 ,layer
 {
 	std::make_unique<struct desc::layer>(*this, 0x00),
@@ -351,34 +475,6 @@ ircd::gpt::pipe::desc::desc(pipe::code &code,
 	std::make_unique<struct desc::layer>(*this, 0x0a),
 	std::make_unique<struct desc::layer>(*this, 0x0b),
 }
-,cathode
-{
-	code,
-	"ctor_cathode",
-	ctrl,
-	opts,
-	accum,
-	model.decode->norm.bias,
-	model.decode->norm.weight,
-}
-,lmhead
-{
-	code,
-	"ctor_lmhead",
-	ctrl,
-	opts,
-	logit,
-	accum,
-	model.embed->token,
-}
-,lmamax
-{
-	code,
-	"ctor_lmamax",
-	ctrl,
-	opts,
-	logit,
-}
 {
 }
@@ -391,7 +487,7 @@ ircd::gpt::pipe::desc::layer::layer(pipe::desc &desc,
 :negative
 {
 	*desc.code,
-	"ctor_attn_fcon",
+	"ircd_gpt_attn_fcon",
 	desc.ctrl,
 	desc.opts,
 	desc.state,
@@ -401,24 +497,15 @@ ircd::gpt::pipe::desc::layer::layer(pipe::desc &desc,
 	desc.model->decode->block[laynum].attn.fcon.bias,
 	desc.model->decode->block[laynum].attn.fcon.weight,
 }
-,selfattn
-{
-	*desc.code,
-	"ctor_attn_self",
-	desc.ctrl,
-	desc.opts,
-	desc.xattn,
-	desc.state,
-	desc.model->decode->block[laynum].attn.mask,
-}
 ,positive
 {
 	*desc.code,
-	"ctor_backend",
+	"ircd_gpt_coil",
 	desc.ctrl,
 	desc.opts,
 	desc.accum,
-	desc.xattn,
+	desc.state,
+	desc.model->decode->block[laynum].attn.mask,
 	desc.model->decode->block[laynum].attn.proj.bias,
 	desc.model->decode->block[laynum].attn.proj.weight,
 	desc.model->decode->block[laynum].ffnn.norm.bias,
@@ -486,23 +573,33 @@ noexcept
 //
 
 ircd::gpt::pipe::model::decoder::decoder(const gpt::model::decoder &decoder)
-:block
+:master
 {
-	{ decoder.layer[0x00], 0x00, },
-	{ decoder.layer[0x01], 0x01, },
-	{ decoder.layer[0x02], 0x02, },
-	{ decoder.layer[0x03], 0x03, },
-	{ decoder.layer[0x04], 0x04, },
-	{ decoder.layer[0x05], 0x05, },
-	{ decoder.layer[0x06], 0x06, },
-	{ decoder.layer[0x07], 0x07, },
-	{ decoder.layer[0x08], 0x08, },
-	{ decoder.layer[0x09], 0x09, },
-	{ decoder.layer[0x0a], 0x0a, },
-	{ decoder.layer[0x0b], 0x0b, },
+	sizeof(gpt::model::block) * 12 + sizeof(gpt::model::norm), const_buffer
+	{
+		reinterpret_cast<const char *>(decoder.layer),
+		sizeof(decoder.layer) + sizeof(decoder.f)
+	}
+}
+,block
+{
+	{ master, sizeof(gpt::model::block) * 0x00, decoder.layer[0x00], 0x00, },
+	{ master, sizeof(gpt::model::block) * 0x01, decoder.layer[0x01], 0x01, },
+	{ master, sizeof(gpt::model::block) * 0x02, decoder.layer[0x02], 0x02, },
+	{ master, sizeof(gpt::model::block) * 0x03, decoder.layer[0x03], 0x03, },
+	{ master, sizeof(gpt::model::block) * 0x04, decoder.layer[0x04], 0x04, },
+	{ master, sizeof(gpt::model::block) * 0x05, decoder.layer[0x05], 0x05, },
+	{ master, sizeof(gpt::model::block) * 0x06, decoder.layer[0x06], 0x06, },
+	{ master, sizeof(gpt::model::block) * 0x07, decoder.layer[0x07], 0x07, },
+	{ master, sizeof(gpt::model::block) * 0x08, decoder.layer[0x08], 0x08, },
+	{ master, sizeof(gpt::model::block) * 0x09, decoder.layer[0x09], 0x09, },
+	{ master, sizeof(gpt::model::block) * 0x0a, decoder.layer[0x0a], 0x0a, },
+	{ master, sizeof(gpt::model::block) * 0x0b, decoder.layer[0x0b], 0x0b, },
 }
 ,norm
 {
+	master,
+	off_t(sizeof(gpt::model::block) * 12),
 	const_buffer{decoder.f.bias},
 	const_buffer{decoder.f.weight},
 }
@@ -544,6 +641,27 @@ ircd::gpt::pipe::model::block::block(const gpt::model::block &block,
 {
 }
 
+ircd::gpt::pipe::model::block::block(cl::data &master,
+                                     const off_t offset,
+                                     const gpt::model::block &block,
+                                     const size_t layer)
+:attn
+{
+	master,
+	offset,
+	block.ln1,
+	block.attn,
+}
+,ffnn
+{
+	master,
+	offset + off_t(sizeof(block.ln1) + sizeof(block.attn)),
+	block.ln2,
+	block.ffnn,
+}
+{
+}
+
 //
 // pipe::model::ffnn
 //
@@ -678,3 +796,48 @@ ircd::gpt::pipe::model::tensor::tensor(cl::data &master,
 }
 {
 }
+
+//
+// gpt::task
+//
+
+ircd::gpt::task::task(const gpt::opts *const opts,
+                      struct ircd_gpt_task *const ctrl)
+:opts
+{
+	opts
+}
+,ctrl
+{
+	ctrl
+}
+{
+	memset(this->ctrl, 0x0, sizeof(ircd_gpt_task));
+
+	this->ctrl->rand[0] = this->opts->seed;
+	this->ctrl->rand[1] = this->opts->seed;
+	this->ctrl->rand[2] = -1UL;
+	this->ctrl->rand[3] = -1UL;
+}
+
+ircd::gpt::task::~task()
+noexcept
+{
+}
+
+//
+// hypercall
+//
+
+ircd::string_view
+ircd::gpt::reflect(const enum ircd_gpt_hypercall code)
+noexcept
+{
+	switch(code)
+	{
+		case IRCD_GPT_ACCEPT:     return "ACCEPT";
+		case IRCD_GPT_ECOMPLETE:  return "ECOMPLETE";
+	}
+
+	return "??????";
+}
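
Aside: the task constructor above seeds the control page's ulong[4] PRNG state with (seed, seed, -1UL, -1UL). For reference, a standalone sketch of one xoshiro256+ step over such a state; the device-side helper ircd_simt_rand_xoshiro256pg named in the kernels presumably advances the state along these lines, though its exact variant is not shown in this diff.

#include <cstdint>

static inline uint64_t rotl(const uint64_t x, const int k)
{
	return (x << k) | (x >> (64 - k));
}

// Reference xoshiro256+ step (Blackman & Vigna); returns the next draw
// and mutates the four-word state in place.
uint64_t xoshiro256p(uint64_t s[4])
{
	const uint64_t result(s[0] + s[3]);
	const uint64_t t(s[1] << 17);

	s[2] ^= s[0];
	s[3] ^= s[1];
	s[1] ^= s[2];
	s[0] ^= s[3];
	s[2] ^= t;
	s[3] = rotl(s[3], 45);

	return result;
}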

modules/console.cc
@@ -17324,10 +17324,15 @@ console_cmd__gpt__raw(opt &out, const string_view &line)
 	opts.limit = param.at<uint>("limit");
 	opts.top_k = 3;
 
-	gpt::task task;
+	struct ircd_gpt_task ctrl;
+	gpt::task task
+	{
+		&opts, &ctrl
+	};
 
 	const auto output
 	{
-		gpt::generate(buf, text, &opts, &task)
+		gpt::generate(buf, text, task)
 	};
 
 	out