Mirror of https://github.com/matrix-construct/construct, synced 2024-12-25 15:04:10 +01:00

ircd::gpt: Reorganize interface.

parent f096e7fcf5
commit 1870c364f4

17 changed files with 1067 additions and 778 deletions
@@ -13,15 +13,16 @@

namespace ircd::gpt
{
	void
	generate(task &);

	vector_view<u16>
	generate(const vector_view<u16> &out,
	         const vector_view<const u16> &in,
	         const opts * = &default_opts,
	         task * = nullptr);
	         task &);

	string_view
	generate(const mutable_buffer &out,
	         const string_view &in,
	         const opts * = &default_opts,
	         task * = nullptr);
	         task &);
}
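For orientation, a minimal sketch of a caller under the reorganized interface: the `default_opts`/`task *` parameter defaults are gone, so the caller owns an options block and a 4 KiB control page and binds both into a `gpt::task` passed by reference. The wrapper name `complete`, the use of `std::aligned_alloc`, and the leaked page are illustrative assumptions, not code from this commit; the ircd prelude is assumed to be in scope.

#include <cstdlib>
#include <cstring>

// Illustrative caller of the new reference-taking generate() overloads;
// assumes the task(const gpt::opts *, struct ircd_gpt_task *) constructor
// declared in task.h further down this diff.
ircd::string_view
complete(const ircd::mutable_buffer &out,
         const ircd::string_view &prompt)
{
	// Immutable options page; defaults come from opts.h in this diff.
	static const ircd::gpt::opts opts {};

	// Mutable 4 KiB control page; the token ring occupies its tail.
	// (Leaked here to keep the sketch short.)
	auto *const ctrl
	{
		static_cast<struct ircd_gpt_task *>(std::aligned_alloc(4096, 4096))
	};
	std::memset(ctrl, 0, 4096);

	// Bind both pages into a task and run a generation epoch.
	ircd::gpt::task task {&opts, ctrl};
	return ircd::gpt::generate(out, prompt, task);
}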
@@ -20,63 +20,14 @@ namespace ircd::gpt
	struct opts;
	struct task;

	extern const opts default_opts;
	extern log::log log;
}

#include "hypercall.h"
#include "vocab.h"
#include "model.h"
#include "token.h"
#include "opts.h"
#include "task.h"
#include "pipe/pipe.h"
#include "generate.h"

/// Primary Options
///
/// Use this structure to configure and control specifics of the machine.
/// These settings are immutable for the operations. To maintain state between
/// calls see task.h
///
struct ircd::gpt::opts
{
	/// Specifies the nominal halting condition based on the sequence of
	/// tokens. Generation will complete when this sequence is witnessed. Set
	/// tokens to -1 starting from the back to not match that token. Setting
	/// all tokens to -1 will ignore this condition.
	uint accept_code[3][3]
	{
		{ 13, 198, -1U, },
		{ 198, 198, -1U, },
		{ -1U, -1U, -1U, },
	};

	/// Specifies the exceptional halting condition based on the sequence of
	/// tokens. By default, the three zeros represent three outputs of '!'
	/// which is probably an error code; note that a true "!!!" is represented
	/// by token number 10185. Set tokens to -1 starting from the back to
	/// not match that token; generated output after errors is usually garbage.
	uint error_code[3][3]
	{
		{ 0, 0, 0, },
		{ -1U, 0, 0, },
		{ -1U, 0, 0, },
	};

	/// Limit number of output tokens. Default of -1 is unlimited; the number
	/// of tokens generated will be limited by other factors.
	uint limit
	{
		-1U
	};

	/// Flip random coins over the top k logits each round. Setting to 1
	/// deterministically selects the top logit.
	uint top_k
	{
		2
	};

	/// Pointer to the model
	const model::decoder *model
	{
		model::default_model
	};
};
34	include/ircd/gpt/hypercall.h	Normal file
@@ -0,0 +1,34 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

#pragma once
#define HAVE_IRCD_GPT_HYPERCALL_H

/// Hypercalling code enumeration.
///
/// Error codes are all negative values. Zero is also an error.
enum ircd_gpt_hypercall
{
	/// General nominal completion code; similar to EXIT_SUCCESS, etc.
	IRCD_GPT_ACCEPT = 1,

	/// Failed or incomplete execution occurred. After an execution attempt
	/// it indicates no execution likely took place. Device software never
	/// sets this value; it is the initial value set by the host before
	/// execution.
	IRCD_GPT_ECOMPLETE = 0,
};

#ifdef __cplusplus
namespace ircd::gpt
{
	string_view reflect(const enum ircd_gpt_hypercall) noexcept;
}
#endif
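The host seeds `ctrl->call` with IRCD_GPT_ECOMPLETE before launch and reads it back afterward; the check below is condensed from the gpt_pipe.cc changes later in this diff (the free-function wrapper and its name are assumptions):

#include <cassert>
#include <cstdlib>

// Condensed host-side epilogue: interpret the hypercall left by the device.
void
check_call(const struct ircd_gpt_task &ctrl)
{
	// Zero and negative codes are errors; positive codes are status/calls.
	if(ctrl.call <= 0)
		throw ircd::error
		{
			"hyper (#%d) :%s",
			std::abs(int(ctrl.call)),
			ircd::gpt::reflect(ctrl.call),
		};

	assert(ctrl.call == IRCD_GPT_ACCEPT);
}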
@@ -67,7 +67,8 @@ struct ircd::gpt::model::block

	norm ln2;
	model::ffnn ffnn;
};
}
__attribute__((packed));

/// Vocabulary embeddings
struct ircd::gpt::model::embed
123	include/ircd/gpt/opts.h	Normal file
@@ -0,0 +1,123 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

#pragma once
#define HAVE_IRCD_GPT_OPTS_H

/// Task Options Page
///
/// The option block is directly shared with task software as constant data.
/// This structure and its mutable companion in `task.h` determine the outcome
/// of the next execution cycle; options are immutable to device software but
/// may be changed by the host between execution cycles if desired.
///
struct ircd_gpt_opts
{
	/// Specifies the nominal halting condition based on a sequence of tokens.
	/// Generation will complete with success after one of these sequences is
	/// witnessed. Set tokens to -1 starting from the back for shorter
	/// sequences; zero-length sequences (all -1's) are never matched.
	uint accept_code[4][4]
	#ifdef __cplusplus
	{
		{ 13, 198, -1U, -1U, },
		{ 198, 198, -1U, -1U, },
		{ -1U, -1U, -1U, -1U, },
		{ -1U, -1U, -1U, -1U, },
	}
	#endif
	;

	/// Specifies the exceptional halting condition based on the sequence of
	/// tokens. By default, the three zeros represent three outputs of '!'
	/// which is probably an error; note that a true "!!!" is represented by
	/// token number 10185. Set tokens to -1 starting from the back to not
	/// match that token; generated output after errors is usually garbage.
	uint error_code[4][4]
	#ifdef __cplusplus
	{
		{ 0, 0, 0, -1U, },
		{ -1U, -1U, -1U, -1U, },
		{ -1U, -1U, -1U, -1U, },
		{ -1U, -1U, -1U, -1U, },
	}
	#endif
	;

	/// Limit number of output tokens. Default of -1 is unlimited; the number
	/// of tokens generated will be limited by other factors.
	uint limit
	#ifdef __cplusplus
	{
		1
	}
	#endif
	;

	/// Flip random coins over the top k logits each round. Setting to 1
	/// deterministically selects the top logit.
	uint top_k
	#ifdef __cplusplus
	{
		2
	}
	#endif
	;

	/// Specifies the token context size in tokens.
	uint context_tokens
	#ifdef __cplusplus
	{
		1024
	}
	#endif
	;

	/// Specifies the token buffer size in tokens.
	uint buffer_tokens
	#ifdef __cplusplus
	{
		1024
	}
	#endif
	;

	/// Seed for the task's PRNG.
	ulong seed
	#ifdef __cplusplus
	{
		1234567890UL
	}
	#endif
	;
}
__attribute__((aligned(4096)));

#ifdef __cplusplus
/// Generator Task Options.
///
/// Parameters for a task. Options are constant and one instance can be shared
/// between multiple task instances. This structure extends the task options
/// page, starting a new page which is not visible to device software; C++ and
/// host pointers are available.
///
struct ircd::gpt::opts
:ircd_gpt_opts
{
	/// Pointer to the model
	const model::decoder *model
	{
		model::default_model
	};
};

static_assert(sizeof(struct ircd_gpt_opts) == 4096);
static_assert(std::is_standard_layout<struct ircd_gpt_opts>::value);
#endif
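To make the -1 padding convention concrete: trailing -1U entries shorten a row's sequence, and a per-row counter advances when the newest output token matches the next expected code, resetting otherwise. The standalone helper below is a hypothetical restatement of that rule (mirroring the accc/errc loops in ircd/gpt.cc further down), not code from this commit.

#include <array>
#include <cstdint>
#include <vector>

// Hypothetical host-side check: returns true once any accept_code row has
// been witnessed in full at the end of a growing output stream.
static bool
accepts(const uint32_t (&accept_code)[4][4],
        const std::vector<uint16_t> &out)
{
	std::array<unsigned, 4> seq {{0}};              // like ctrl.accept_seq[]
	std::array<unsigned, 4> thresh {{4, 4, 4, 4}};  // effective row lengths

	// Trailing -1U entries shrink a row; an all -1U row never matches.
	for(unsigned i(0); i < 4; ++i)
		for(unsigned j(4); j > 0 && accept_code[i][j - 1] == -1U; --j)
			--thresh[i];

	for(const auto tok : out)
		for(unsigned i(0); i < 4; ++i)
		{
			seq[i] = accept_code[i][seq[i]] == tok? seq[i] + 1: 0;
			if(thresh[i] && seq[i] >= thresh[i])
				return true;                        // halting sequence seen
		}

	return false;
}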
22	include/ircd/gpt/pipe/code.h	Normal file
@@ -0,0 +1,22 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

#pragma once
#define HAVE_IRCD_GPT_PIPE_CODE_H

/// Pipe code segment.
struct ircd::gpt::pipe::code
:cl::code
{
	static const string_view compile_opts;

	code();
	~code() noexcept;
};
47	include/ircd/gpt/pipe/desc.h	Normal file
@@ -0,0 +1,47 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

#pragma once
#define HAVE_IRCD_GPT_PIPE_DESC_H

/// Pipe descriptor
struct ircd::gpt::pipe::desc
{
	struct layer;

	pipe::model *model;
	pipe::code *code;

	cl::data
	state,    // qry/key/val projection (tokens * embed * 3 * float)
	accum,    // accumulator (tokens * embed * float)
	logit,    // result output vector (50257 * float)
	ctrl,     // control page
	opts;     // options page

	cl::kern
	lm_embed,
	lm_norm,
	lm_logit,
	lm_select;

	std::unique_ptr<struct desc::layer>
	layer[12];

	desc(pipe::code &, pipe::model &);
};

struct ircd::gpt::pipe::desc::layer
{
	cl::kern negative;
	cl::kern positive;

	layer(pipe::desc &, const int);
};
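For a rough sense of scale, the shapes in the comments above work out as follows for GPT-2 small (embed width 768, vocab 50257) with the 1024-token defaults from opts.h; the numbers are back-of-envelope and not read from the allocation code.

#include <cstddef>
#include <cstdio>

// Back-of-envelope sizing for the pipe::desc buffers described above.
int main()
{
	constexpr size_t tokens = 1024, embed = 768, vocab = 50257;

	constexpr size_t state = tokens * embed * 3 * sizeof(float); // qry/key/val
	constexpr size_t accum = tokens * embed * sizeof(float);     // accumulator
	constexpr size_t logit = vocab * sizeof(float);              // result vector

	std::printf("state %zu MiB, accum %zu MiB, logit %zu KiB\n",
	            state >> 20, accum >> 20, logit >> 10);
	// prints: state 9 MiB, accum 3 MiB, logit 196 KiB
}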
57	include/ircd/gpt/pipe/exec.h	Normal file
@@ -0,0 +1,57 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

#pragma once
#define HAVE_IRCD_GPT_PIPE_EXEC_H

/// Perform one task cycle on the device.
///
/// Constructions of this object enqueue device commands to complete an
/// additional epoch of the task as provided by `ctrl` and `opts`.
///
/// Destructions of this object yield the ircd::ctx until those commands
/// are complete.
///
/// Consecutive cycles on the device without stopping (a.k.a. pipelining) are
/// achieved by constructing several objects before following with destructions,
/// i.e. in a std::deque.
///
struct ircd::gpt::pipe::exec
{
	pipe::desc *desc;

	const_buffer
	send_opts,       // Set when sending the options page.
	send_ctrl;       // Set when sending the control page.

	mutable_buffer
	recv_ctrl;       // Set when receiving the control page.

	cl::kern::range
	range_lm_embed,  // Dimension range of the lm_embed kernel.
	range_negative,  // Dimension range of a layer kernel.
	range_positive,  // Dimension range of a layer kernel.
	range_lm_norm,   // Dimension range of the final norm kernel.
	range_lm_logit,  // Dimension range of the language logit kernel.
	range_lm_select; // Dimension range of the language token kernel.

	cl::exec
	release_opts,    // Release the options page.
	release_ctrl,    // Release the control page.
	lm_embed,        // Compute token and positional embeddings.
	coil[12 * 2],    // Pass over all layers.
	lm_norm,         // Final normalization.
	lm_logit,        // Compute logit result vector.
	lm_select,       // Select next token.
	acquire_ctrl;    // Acquire the control page.

	exec(task &, const size_t tokens, const bool rel, const bool acq);
	~exec() noexcept;
};
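The construct/destruct contract above is what enables pipelining; the generate() loop in gpt_pipe.cc later in this diff drives it with a std::deque, roughly as condensed below. The wrapper function and the fixed depth constant are illustrative; the real depth comes from the ircd.gpt.pipe.queue conf item.

#include <deque>

// Condensed pipelining driver: keep a few cycles in flight, then wait on
// the oldest (by destructing it) before enqueueing more.
void
run(ircd::gpt::task &task, const size_t tokens, const size_t limit)
{
	constexpr size_t depth {1};                 // cf. ircd.gpt.pipe.queue
	std::deque<ircd::gpt::pipe::exec> list;

	for(size_t cycle(0); cycle < limit; ++cycle)
	{
		// Control pages are only sent on the first cycle and received on the last.
		const bool rel(cycle == 0), acq(cycle + 1 >= limit);
		list.emplace_back(task, tokens + cycle, rel, acq);

		if(list.size() > depth)
			list.pop_front();                   // dtor yields the ctx until complete
	}

	list.clear();                               // wait for all unfinished
}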
@@ -59,11 +59,13 @@ struct ircd::gpt::pipe::model::block
	model::attn attn;
	model::ffnn ffnn;

	block(cl::data &, const off_t, const gpt::model::block &, const size_t);
	block(const gpt::model::block &, const size_t);
};

struct ircd::gpt::pipe::model::decoder
{
	cl::data master;
	model::block block[12];
	tensor norm;

@@ -17,7 +17,6 @@ namespace ircd::gpt::pipe
	struct code;
	struct desc;
	struct exec;
	struct bank;

	extern model *default_model;
	extern code *default_code;

@@ -27,70 +26,6 @@ namespace ircd::gpt::pipe
};

#include "model.h"
#include "ctrl.h"

struct ircd::gpt::pipe::code
:cl::code
{
	static const string_view compile_opts;

	code();
	~code() noexcept;
};

struct ircd::gpt::pipe::desc
{
	struct layer;

	pipe::model *model;
	pipe::code *code;

	cl::data opts;
	cl::data ctrl;
	cl::data state;
	cl::data xattn;
	cl::data accum;
	cl::data logit;
	cl::kern anode;
	std::unique_ptr<struct desc::layer> layer[12];
	cl::kern cathode;
	cl::kern lmhead;
	cl::kern lmamax;

	desc(pipe::code &, pipe::model &);
};

struct ircd::gpt::pipe::desc::layer
{
	cl::kern negative;
	cl::kern selfattn;
	cl::kern positive;

	layer(pipe::desc &, const int);
};

struct ircd::gpt::pipe::exec
{
	pipe::desc *desc;

	mutable_buffer out_ctrl;
	const_buffer in_ctrl, in_opts;

	cl::kern::range range_anode;
	cl::kern::range range_coil;
	cl::kern::range range_negative;
	cl::kern::range range_selfattn;
	cl::kern::range range_positive;
	cl::kern::range range_cathode;
	cl::kern::range range_lmhead;
	cl::kern::range range_lmamax;

	cl::exec send[2];
	cl::exec tail[1];
	cl::exec coil[12 * 3];
	cl::exec head[3];
	cl::exec recv[1];

	exec(ctor_ctrl &, const ctor_opts &);
	~exec() noexcept;
};
#include "code.h"
#include "desc.h"
#include "exec.h"
@@ -11,8 +11,77 @@
#pragma once
#define HAVE_IRCD_GPT_TASK_H

/// Context to maintain state across calls.
/// Task Control Page
///
/// The control block is shared with our device software. Execution state is
/// maintained in the task control block across cycles. The control block is
/// the mutable state component for an execution; for the immutable component
/// also shared with device software see opts.h.
///
struct ircd_gpt_task
{
	/// Header magic 0xC7012C70
	uint magic;

	/// Hypercall code set by our device software upon completion and control
	/// transfer back to the host. Negative codes indicate errors, positive
	/// codes are used for status and/or procedure calls; zero is also an error.
	enum ircd_gpt_hypercall call;

	/// Token ring head. Tokens in the ring extend behind the head for
	/// `tokens`. The `head` value is automatically modulated by device
	/// software to wrap around the ring.
	uint head;

	/// Token counter. The counter indicates the number of valid tokens in
	/// the context buffer. This value must not exceed the buffer size.
	uint tokens;

	/// Accumulates the number of task cycles. The cycle counter is incremented
	/// by device software after each repetition of the kernel pipeline to
	/// produce one additional token.
	ulong cycle;

	/// Accumulates the epoch count for the task. The counter is incremented
	/// by one in device software before control returns back to the host.
	/// Several cycles may occur during each epoch.
	ulong epoch;

	/// Accumulates the number of tokens produced by the task. Several tokens
	/// may be produced each epoch, but currently only one token is produced
	/// each cycle.
	ulong produced;

	/// Accumulates the number of tokens witnessed by the task. The number of
	/// tokens in the context for each cycle is counted as witnessed.
	ulong witnessed;

	/// Accumulates time in microseconds elapsed for the task.
	ulong elapsed;

	/// PRNG xoshiro256 state. This is the de facto random seed which can be
	/// set before cycle entry by the host. It is updated by device software
	/// when used.
	ulong rand[4];

	/// Updated by the host with the value of the timestamp register as sampled
	/// immediately before each transfer of control to the device.
	ulong host_tsc;

	/// State counters for the accept/error sequence codes.
	uint accept_seq[4], error_seq[4];

	/// The token buffer starts at offset 2048 and continues to the end of
	/// the page; options specify the size of the tokens buffer in tokens.
	/// Additional pages must be attached for larger buffer sizes.
	ushort token[] __attribute__((aligned(2048)));
}
__attribute__((aligned(4096)));

#ifdef __cplusplus
/// Task Context
///
/// State for a task.
struct ircd::gpt::task
{
	enum status :char;

@@ -20,36 +89,16 @@ struct ircd::gpt::task
	/// Reference to the attached options.
	const gpt::opts *opts {nullptr};

	/// Reference to control pages.
	struct ircd_gpt_task *ctrl {nullptr};

	/// Current task status.
	enum status status {'\0'};

	/// State counters for the accept codes specified in the options.
	uint8_t accept_seq[3] {0};
	task(const gpt::opts * = nullptr,
	     struct ircd_gpt_task * = nullptr);

	/// State counters for the error codes specified in the options.
	uint8_t error_seq[3] {0};

	/// Accumulates the number of executions by the user. Each call to the
	/// interface is an execution.
	uint64_t epoch {0};

	/// Accumulates the number of tokens produced by the task. Several tokens
	/// may be produced each epoch.
	uint64_t produced {0};

	/// Accumulates the number of tokens witnessed by the task. The number of
	/// tokens in the context for each produced token is counted as witnessed.
	uint64_t witnessed {0};

	/// Accumulates the number of CPU reference cycles consumed by the task.
	/// This counter does not reflect time when the task is queued or waiting
	/// or offloaded to a co-processor/accelerator.
	uint64_t cycles {0};

	/// Accumulates the total time in milliseconds over all executions of the
	/// task. This counter reflects total wall-clock time of all phases of
	/// the execution.
	milliseconds time {0ms};
	~task() noexcept;
};

/// The current status of a task is indicated with intelligible characters

@@ -61,3 +110,7 @@ enum ircd::gpt::task::status
	ACCEPT = 'A', ///< Execution completed successfully.
	ERROR = 'E', ///< Execution did not complete successfully.
};

static_assert(sizeof(struct ircd_gpt_task) == 4096);
static_assert(std::is_standard_layout<struct ircd_gpt_task>::value);
#endif
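The head/tokens pair above addresses the token buffer as a ring in the tail of the page. The helper below is a hypothetical restatement of the indexing used by the host in ircd/gpt.cc and by ircd_gpt_lm_embed in gpt_cl.cl later in this diff; it is not part of the commit.

// Hypothetical helper: logical position i (0 = oldest token in context)
// maps to a physical slot in ctrl.token[], wrapping at opts.buffer_tokens.
static inline unsigned
ring_slot(const struct ircd_gpt_task &ctrl,
          const struct ircd_gpt_opts &opts,
          const unsigned i)
{
	return (ctrl.head + i) % opts.buffer_tokens;
}

// e.g. the most recently produced token:
//   const auto last(ctrl.token[ring_slot(ctrl, opts, ctrl.tokens - 1)]);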
|
@ -9,78 +9,46 @@
|
|||
// full license for this software is available in the LICENSE file.
|
||||
|
||||
#pragma once
|
||||
#define HAVE_IRCD_GPT_PIPE_CTRL_H
|
||||
#ifdef __OPENCL_C_VERSION__
|
||||
#define HAVE_IRCD_GPT_TOKEN_H
|
||||
|
||||
struct ctor_ctrl
|
||||
{
|
||||
long call;
|
||||
ulong pc;
|
||||
ulong tokens;
|
||||
ulong magic;
|
||||
uchar pad[1024 - 32];
|
||||
|
||||
union
|
||||
{
|
||||
char str[3072];
|
||||
ushort token[1536];
|
||||
}
|
||||
body;
|
||||
}
|
||||
__attribute__((aligned(4096)));
|
||||
|
||||
struct ctor_opts
|
||||
{
|
||||
uchar pad[4096];
|
||||
}
|
||||
__attribute__((aligned(4096)));
|
||||
|
||||
#ifndef __OPENCL_C_VERSION__
|
||||
static_assert(sizeof(struct ctor_ctrl) == 4096);
|
||||
#endif
|
||||
|
||||
#ifndef __OPENCL_C_VERSION__
|
||||
static_assert(sizeof(struct ctor_opts) == 4096);
|
||||
#endif
|
||||
|
||||
#ifndef __cplusplus
|
||||
|
||||
union token
|
||||
union ircd_gpt_token
|
||||
{
|
||||
float
|
||||
word[768],
|
||||
attn[12][64];
|
||||
};
|
||||
|
||||
union tokenv
|
||||
union ircd_gpt_tokenv
|
||||
{
|
||||
float4
|
||||
word[768/4],
|
||||
attn[12][64/4];
|
||||
};
|
||||
|
||||
struct qkv
|
||||
struct ircd_gpt_qkv
|
||||
{
|
||||
union token
|
||||
union ircd_gpt_tokenv
|
||||
qry,
|
||||
key,
|
||||
val;
|
||||
};
|
||||
|
||||
struct qkvv
|
||||
struct ircd_gpt_qkvv
|
||||
{
|
||||
union tokenv
|
||||
union ircd_gpt_tokenv
|
||||
qry,
|
||||
key,
|
||||
val;
|
||||
};
|
||||
|
||||
struct attn_mask
|
||||
struct ircd_gpt_attn_mask
|
||||
{
|
||||
bool
|
||||
token[1024];
|
||||
};
|
||||
|
||||
union aperature
|
||||
union ircd_gpt_aperature
|
||||
{
|
||||
float
|
||||
word[768],
|
||||
|
@ -90,7 +58,7 @@ union aperature
|
|||
attn[12][64];
|
||||
};
|
||||
|
||||
union aperaturev
|
||||
union ircd_gpt_aperaturev
|
||||
{
|
||||
float4
|
||||
word[768/4],
|
224	ircd/gpt.cc
|
@ -34,30 +34,22 @@ namespace ircd::gpt
|
|||
|
||||
static f32
|
||||
logit alignas(64) [65536],
|
||||
embeds alignas(64) [1024 * 768],
|
||||
scratch alignas(64) [1024 * 768];
|
||||
}
|
||||
|
||||
namespace ircd::gpt
|
||||
{
|
||||
extern void transform(ctor_ctrl &, const ctor_opts &);
|
||||
}
|
||||
|
||||
decltype(ircd::gpt::log)
|
||||
ircd::gpt::log
|
||||
{
|
||||
"gpt"
|
||||
};
|
||||
|
||||
decltype(ircd::gpt::default_opts)
|
||||
ircd::gpt::default_opts;
|
||||
|
||||
ircd::string_view
|
||||
ircd::gpt::generate(const mutable_buffer &out,
|
||||
const string_view &in,
|
||||
const opts *opts,
|
||||
task *task)
|
||||
task &task)
|
||||
{
|
||||
u16 buf[2][256];
|
||||
u16 buf[2][1024];
|
||||
const auto input_tokens
|
||||
{
|
||||
vocab::tokenize(buf[0], in)
|
||||
|
@ -65,7 +57,7 @@ ircd::gpt::generate(const mutable_buffer &out,
|
|||
|
||||
const auto output_tokens
|
||||
{
|
||||
generate(buf[1], input_tokens, opts, task)
|
||||
generate(buf[1], input_tokens, task)
|
||||
};
|
||||
|
||||
const auto output
|
||||
|
@ -79,13 +71,92 @@ ircd::gpt::generate(const mutable_buffer &out,
|
|||
ircd::vector_view<ircd::u16>
|
||||
ircd::gpt::generate(const vector_view<u16> &out,
|
||||
const vector_view<const u16> &in,
|
||||
const opts *opts,
|
||||
task *task)
|
||||
task &task)
|
||||
{
|
||||
assert(task.ctrl);
|
||||
assert(task.opts);
|
||||
|
||||
uint ret(0);
|
||||
bool halt(false);
|
||||
|
||||
const auto &opts(*task.opts);
|
||||
auto &ctrl(*task.ctrl);
|
||||
auto &errc(ctrl.error_seq);
|
||||
auto &accc(ctrl.accept_seq);
|
||||
ctrl.tokens = in.size();
|
||||
|
||||
const size_t tmax
|
||||
{
|
||||
in.size() + opts.limit
|
||||
};
|
||||
|
||||
const vector_view<f32> accum
|
||||
{
|
||||
gpt::scratch, tmax * 768
|
||||
};
|
||||
|
||||
const vector_view<f32> embeds
|
||||
{
|
||||
gpt::embeds, tmax * 768
|
||||
};
|
||||
|
||||
for(uint j(0); j < in.size(); ++j)
|
||||
{
|
||||
const vector_view<f32> dst
|
||||
{
|
||||
data(embeds) + j * 768, 768
|
||||
};
|
||||
|
||||
if(ircd::cl::enable)
|
||||
ctrl.token[j] = in[j];
|
||||
else
|
||||
embed(data(dst), in[j], j, opts);
|
||||
|
||||
static char dbuf[512] {0};
|
||||
char report[1536] {0};
|
||||
char tmbuf[1][64] {{0}};
|
||||
const size_t report_size = snprintf
|
||||
(
|
||||
report, sizeof(report),
|
||||
"%-2u -- %-3u [%5u] --- --- %s 0 0 | %8s",
|
||||
j,
|
||||
ctrl.tokens,
|
||||
ctrl.token[j],
|
||||
vocab::debug(dbuf, ctrl.token[j]).c_str(),
|
||||
pretty(tmbuf[0], milliseconds(ctrl.elapsed), 1).c_str()
|
||||
);
|
||||
|
||||
log::info
|
||||
{
|
||||
log, "%s",
|
||||
string_view{report, report_size}
|
||||
};
|
||||
}
|
||||
|
||||
uint64_t cycles(0);
|
||||
milliseconds last_time {0};
|
||||
util::timer stopwatch;
|
||||
{
|
||||
const prof::scope_cycles task_cycles
|
||||
{
|
||||
cycles
|
||||
};
|
||||
|
||||
generate(task);
|
||||
}
|
||||
last_time = stopwatch.at<milliseconds>();
|
||||
ctrl.elapsed += last_time.count();
|
||||
|
||||
/*
|
||||
coil(data(scratch), tokens, *opts.model);
|
||||
tail(logit, data(last_embed), *opts.model);
|
||||
out[i] = argmax(logit, *opts);
|
||||
*/
|
||||
|
||||
uint accc_thresh[3] {3, 3, 3};
|
||||
for(uint i(0); i < 3; ++i)
|
||||
for(uint j(3); j > 0; --j)
|
||||
if(opts->accept_code[i][j - 1] == -1U)
|
||||
if(opts.accept_code[i][j - 1] == -1U)
|
||||
--accc_thresh[i];
|
||||
else
|
||||
break;
|
||||
|
@ -93,99 +164,22 @@ ircd::gpt::generate(const vector_view<u16> &out,
|
|||
uint errc_thresh[3] {3, 3, 3};
|
||||
for(uint i(0); i < 3; ++i)
|
||||
for(uint j(3); j > 0; --j)
|
||||
if(opts->error_code[i][j - 1] == -1U)
|
||||
if(opts.error_code[i][j - 1] == -1U)
|
||||
--errc_thresh[i];
|
||||
else
|
||||
break;
|
||||
|
||||
uint ret(0);
|
||||
bool halt(false);
|
||||
auto &errc(task->error_seq);
|
||||
auto &accc(task->accept_seq);
|
||||
for(uint i(0); !halt && i < out.size() && ret < opts->limit; ++i)
|
||||
for(auto &j(ret); j + in.size() < ctrl.tokens && j < out.size() && !halt; ++j)
|
||||
{
|
||||
ctor_ctrl ctrl alignas(4096) {0};
|
||||
ctrl.pc = 1;
|
||||
|
||||
const size_t tokens
|
||||
{
|
||||
in.size() + i
|
||||
};
|
||||
|
||||
const vector_view<f32> scratch
|
||||
{
|
||||
gpt::scratch, tokens * 768
|
||||
};
|
||||
|
||||
for(uint j(0); j < in.size(); ++j)
|
||||
{
|
||||
const vector_view<f32> dst
|
||||
{
|
||||
data(scratch) + j * 768, 768
|
||||
};
|
||||
|
||||
if(ircd::cl::enable)
|
||||
ctrl.body.token[ctrl.tokens++] = in[j];
|
||||
else
|
||||
embed(data(dst), in[j], j, *opts);
|
||||
}
|
||||
|
||||
for(uint j(0); j < i; ++j)
|
||||
{
|
||||
const vector_view<f32> dst
|
||||
{
|
||||
data(scratch) + (in.size() + j) * 768, 768
|
||||
};
|
||||
|
||||
if(ircd::cl::enable)
|
||||
ctrl.body.token[ctrl.tokens++] = out[j];
|
||||
else
|
||||
embed(data(dst), out[j], in.size() + j, *opts);
|
||||
}
|
||||
|
||||
assert(!ircd::cl::enable || ctrl.tokens == tokens);
|
||||
const vector_view<f32> last_embed
|
||||
{
|
||||
data(scratch) + (tokens - 1) * 768, 768
|
||||
};
|
||||
|
||||
const auto last_cycl(task->cycles);
|
||||
milliseconds last_time {0};
|
||||
{
|
||||
util::timer stopwatch;
|
||||
const prof::scope_cycles task_cycles
|
||||
{
|
||||
task->cycles
|
||||
};
|
||||
|
||||
if(ircd::cl::enable)
|
||||
{
|
||||
static const ctor_opts opts alignas(4096) {0};
|
||||
|
||||
transform(ctrl, opts);
|
||||
out[i] = ctrl.body.token[ctrl.tokens - 1];
|
||||
assert(ctrl.tokens == tokens + 1);
|
||||
} else {
|
||||
coil(data(scratch), tokens, *opts->model);
|
||||
tail(logit, data(last_embed), *opts->model);
|
||||
out[i] = argmax(logit, *opts);
|
||||
}
|
||||
|
||||
last_time = stopwatch.at<milliseconds>();
|
||||
task->time += last_time;
|
||||
}
|
||||
out[j] = ctrl.token[(in.size() + j + ctrl.head) % opts.buffer_tokens];
|
||||
|
||||
for(uint j(0); j < 3; ++j)
|
||||
errc[j] =
|
||||
opts->error_code[j][errc[j]] == out[i]?
|
||||
errc[j] + 1:
|
||||
0;
|
||||
errc[j] = opts.error_code[j][errc[j]] == out[j]?
|
||||
errc[j] + 1: 0;
|
||||
|
||||
for(uint j(0); j < 3; ++j)
|
||||
accc[j] =
|
||||
opts->accept_code[j][accc[j]] == out[i]?
|
||||
accc[j] + 1:
|
||||
0;
|
||||
accc[j] = opts.accept_code[j][accc[j]] == out[j]?
|
||||
accc[j] + 1: 0;
|
||||
|
||||
for(uint j(0); j < 3; ++j)
|
||||
halt |= accc_thresh[j] && accc[j] >= accc_thresh[j],
|
||||
|
@ -194,21 +188,23 @@ ircd::gpt::generate(const vector_view<u16> &out,
|
|||
static char dbuf[512] {0};
|
||||
char report[1536] {0};
|
||||
char tmbuf[4][64] {0};
|
||||
size_t report_size;
|
||||
report_size = snprintf
|
||||
const size_t bsz(ctrl.tokens - in.size());
|
||||
const size_t report_size = snprintf
|
||||
(
|
||||
report, sizeof(report),
|
||||
"%-2u %-3u %-3u [%5u] a:%u e:%u %s %8s %8s | %8s",
|
||||
i,
|
||||
"%-2u %-2u %-3u %-3u %-3u [%5u] a:%u e:%u %s %8s %8s | %8s",
|
||||
j,
|
||||
j + in.size(),
|
||||
ctrl.tokens,
|
||||
ret,
|
||||
out[i],
|
||||
ctrl.cycle,
|
||||
ctrl.epoch,
|
||||
out[j],
|
||||
accc[0] + accc[1] + accc[2],
|
||||
errc[0] + errc[1] + errc[2],
|
||||
vocab::debug(dbuf, out[i]).c_str(),
|
||||
pretty(tmbuf[0], last_time, 1).c_str(),
|
||||
pretty(tmbuf[1], si(last_cycl), 1).c_str(),
|
||||
pretty(tmbuf[2], task->time, 1).c_str()
|
||||
vocab::debug(dbuf, out[j]).c_str(),
|
||||
pretty(tmbuf[0], milliseconds(last_time / bsz), 1).c_str(),
|
||||
pretty(tmbuf[1], si(cycles / bsz), 1).c_str(),
|
||||
pretty(tmbuf[2], milliseconds(ctrl.elapsed), 1).c_str()
|
||||
);
|
||||
|
||||
log::info
|
||||
|
@ -216,24 +212,22 @@ ircd::gpt::generate(const vector_view<u16> &out,
|
|||
log, "%s",
|
||||
string_view{report, report_size}
|
||||
};
|
||||
|
||||
++ret;
|
||||
ctx::yield();
|
||||
ctx::interruption_point();
|
||||
}
|
||||
|
||||
ret = ctrl.tokens - in.size();
|
||||
for(uint i(0); i < 3; ++i)
|
||||
if(accc_thresh[i] && task->accept_seq[i] >= accc_thresh[i])
|
||||
if(accc_thresh[i] && ctrl.accept_seq[i] >= accc_thresh[i])
|
||||
{
|
||||
ret -= (3 - accc_thresh[i]);
|
||||
break;
|
||||
}
|
||||
else if(errc_thresh[i] && task->error_seq[i] >= errc_thresh[i])
|
||||
else if(errc_thresh[i] && ctrl.error_seq[i] >= errc_thresh[i])
|
||||
{
|
||||
ret -= (3 - errc_thresh[i]);
|
||||
break;
|
||||
}
|
||||
|
||||
ctx::interruption_point();
|
||||
return vector_view<u16>
|
||||
{
|
||||
out, ret
|
||||
|
|
581	ircd/gpt_cl.cl
|
@ -8,129 +8,27 @@
|
|||
// copyright notice and this permission notice is present in all copies. The
|
||||
// full license for this software is available in the LICENSE file.
|
||||
|
||||
inline void
|
||||
ctor_local_bcast_ldr(__local float4 *const out,
|
||||
const uint ln,
|
||||
const uint li)
|
||||
{
|
||||
for(uint stride = 1; stride < ln; stride <<= 1)
|
||||
{
|
||||
if(li < stride)
|
||||
out[li + stride] = out[li];
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
}
|
||||
|
||||
inline void
|
||||
ctor_local_reduce_add_ldr(__local float4 *const out,
|
||||
const uint ln,
|
||||
const uint li)
|
||||
{
|
||||
for(uint stride = ln >> 1; stride > 0; stride >>= 1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if(li < stride)
|
||||
out[li] += out[li + stride];
|
||||
}
|
||||
}
|
||||
|
||||
inline void
|
||||
ctor_local_reduce_max_ldr(__local float *const out,
|
||||
const uint ln,
|
||||
const uint li)
|
||||
{
|
||||
for(uint stride = ln >> 1; stride > 0; stride >>= 1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if(li < stride)
|
||||
out[li] = max(out[li], out[li + stride]);
|
||||
}
|
||||
}
|
||||
|
||||
inline void
|
||||
ctor_local_reduce_tournament_ldr(__local float *const best,
|
||||
__local ushort *const idx,
|
||||
const uint ln,
|
||||
const uint li)
|
||||
{
|
||||
for(uint stride = ln >> 1; stride > 0; stride >>= 1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if(li < stride && best[li] < best[li + stride])
|
||||
{
|
||||
best[li] = best[li + stride];
|
||||
idx[li] = idx[li + stride];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void
|
||||
ctor_mean(__local float4 *const restrict out,
|
||||
__local const float4 *const restrict in,
|
||||
const uint num,
|
||||
const uint i)
|
||||
{
|
||||
out[i] = in[i];
|
||||
ctor_local_reduce_add_ldr(out, num, i);
|
||||
|
||||
float numerator = 0.0f;
|
||||
float4 numeratorv = out[i];
|
||||
for(uint k = 0; k < 4; ++k)
|
||||
numerator += numeratorv[k];
|
||||
|
||||
out[i] = numerator;
|
||||
ctor_local_bcast_ldr(out, num, i);
|
||||
|
||||
numeratorv = out[i];
|
||||
out[i] = numeratorv / (num * 4);
|
||||
}
|
||||
|
||||
inline void
|
||||
ctor_norm(__local float4 *const out,
|
||||
__local const float4 *const in,
|
||||
__local float4 *const restrict tmp,
|
||||
const uint num,
|
||||
const uint i)
|
||||
{
|
||||
ctor_mean(tmp, in, num, i);
|
||||
|
||||
const float4
|
||||
sub_mean = in[i] - tmp[i];
|
||||
|
||||
tmp[i] = pow(sub_mean, 2);
|
||||
ctor_mean(out, tmp, num, i);
|
||||
|
||||
const float4
|
||||
epsilon = 0.00001f,
|
||||
s = sqrt(out[i] + epsilon);
|
||||
|
||||
out[i] = sub_mean / s;
|
||||
}
|
||||
|
||||
inline void
|
||||
ctor_norm_fmad(__local float4 *const out,
|
||||
__local const float4 *const in,
|
||||
__global const float4 *const restrict bias,
|
||||
__global const float4 *const restrict weight,
|
||||
const uint i)
|
||||
ircd_gpt_norm_fmad(__local float4 *const out,
|
||||
__local const float4 *const in,
|
||||
__global const float4 *const restrict bias,
|
||||
__global const float4 *const restrict weight,
|
||||
const uint i)
|
||||
{
|
||||
out[i] = in[i] * weight[i] + bias[i];
|
||||
}
|
||||
|
||||
// Matrix * Vector Multiply/Accumulate
|
||||
inline void
|
||||
ctor_sgemv(__local float4 *const restrict out,
|
||||
__local const float4 *const restrict in,
|
||||
__global const float4 *const restrict bias,
|
||||
__global const float4 *const restrict weight,
|
||||
const uint width,
|
||||
const uint height,
|
||||
const uint tiles,
|
||||
const uint i)
|
||||
ircd_gpt_sgemv(__local float4 *const restrict out,
|
||||
__local const float4 *const restrict in,
|
||||
__global const float4 *const restrict bias,
|
||||
__global const float4 *const restrict weight,
|
||||
const uint width,
|
||||
const uint height,
|
||||
const uint tiles,
|
||||
const uint i)
|
||||
{
|
||||
const uint seg = height / tiles;
|
||||
|
||||
|
@ -151,9 +49,9 @@ ctor_sgemv(__local float4 *const restrict out,
|
|||
}
|
||||
|
||||
inline void
|
||||
ctor_gelu(__local float4 *const out,
|
||||
__local const float4 *const in_,
|
||||
const uint i)
|
||||
ircd_gpt_gelu(__local float4 *const out,
|
||||
__local const float4 *const in_,
|
||||
const uint i)
|
||||
{
|
||||
float4 a,
|
||||
in = in_[i];
|
||||
|
@ -178,14 +76,15 @@ ctor_gelu(__local float4 *const out,
|
|||
//
|
||||
|
||||
__kernel void
|
||||
ctor_attn_fcon(__global const struct ctor_ctrl *const ctrl,
|
||||
__constant const struct ctor_opts *const opts,
|
||||
__global union aperaturev *const restrict out,
|
||||
__global const union tokenv *const restrict in,
|
||||
__global const float4 *const restrict norm_bias,
|
||||
__global const float4 *const restrict norm_weight,
|
||||
__global const float4 *const restrict fcon_bias,
|
||||
__global const float4 *const restrict fcon_weight)
|
||||
ircd_gpt_ffnn(__global const struct ircd_gpt_task *const ctrl,
|
||||
__constant const struct ircd_gpt_opts *const opts,
|
||||
__global union ircd_gpt_tokenv *const restrict accum,
|
||||
__global const float4 *const restrict norm_bias,
|
||||
__global const float4 *const restrict norm_weight,
|
||||
__global const float4 *const restrict fcon_bias,
|
||||
__global const float4 *const restrict fcon_weight,
|
||||
__global const float4 *const restrict proj_bias,
|
||||
__global const float4 *const restrict proj_weight)
|
||||
{
|
||||
const uint
|
||||
gi = get_global_id(0),
|
||||
|
@ -195,31 +94,39 @@ ctor_attn_fcon(__global const struct ctor_ctrl *const ctrl,
|
|||
wi = get_group_id(0),
|
||||
wn = get_num_groups(0);
|
||||
|
||||
__local union aperaturev token;
|
||||
__local union ircd_gpt_aperaturev token;
|
||||
__local float4 tmp[768/4];
|
||||
|
||||
token.word[li] = in[wi].word[li];
|
||||
// Fetch local copy of the global accumulator. We operate on a cached
|
||||
// copy as input, and add our output to the global upon completion.
|
||||
token.word[li] = accum[wi].word[li];
|
||||
|
||||
// Layer re-normalization
|
||||
ctor_norm(token.word, token.word, tmp, ln, li);
|
||||
ctor_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
|
||||
ircd_simt_math_norm_f4lldr(token.word, token.word, tmp, ln, li);
|
||||
ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
|
||||
|
||||
// Fully connected
|
||||
for(uint i = 0; i < 3; ++i)
|
||||
ctor_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 2304/4, 768/4, 4, i * ln + li);
|
||||
for(uint i = 0; i < 4; ++i)
|
||||
ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 3072/4, 768/4, 4, i * ln + li);
|
||||
|
||||
// Export queries, keys, and values.
|
||||
for(uint i = 0; i < 3; ++i)
|
||||
out[wi].proj[i][li] = token.proj[i][li];
|
||||
// Gaussian Error Linear Unit
|
||||
for(uint i = 0; i < 4; ++i)
|
||||
ircd_gpt_gelu(token.fcon, token.fcon, i * ln + li);
|
||||
|
||||
// Projection
|
||||
ircd_gpt_sgemv(tmp, token.fcon, proj_bias, proj_weight, 768/4, 3072/4, 4, li);
|
||||
|
||||
// Accumulation; end of layer
|
||||
accum[wi].word[li] += tmp[li];
|
||||
}
|
||||
|
||||
__kernel void
|
||||
ctor_attn_proj(__global const struct ctor_ctrl *const ctrl,
|
||||
__constant const struct ctor_opts *const opts,
|
||||
__global union tokenv *const restrict accum,
|
||||
__global const union tokenv *const restrict xattn,
|
||||
__global const float4 *const restrict proj_bias,
|
||||
__global const float4 *const restrict proj_weight)
|
||||
ircd_gpt_attn_proj(__global const struct ircd_gpt_task *const ctrl,
|
||||
__constant const struct ircd_gpt_opts *const opts,
|
||||
__global union ircd_gpt_tokenv *const restrict accum,
|
||||
__local const union ircd_gpt_tokenv *const restrict xattn,
|
||||
__global const float4 *const restrict proj_bias,
|
||||
__global const float4 *const restrict proj_weight)
|
||||
{
|
||||
const uint
|
||||
gi = get_global_id(0),
|
||||
|
@ -234,25 +141,24 @@ ctor_attn_proj(__global const struct ctor_ctrl *const ctrl,
|
|||
out[768/4];
|
||||
|
||||
// Fetch
|
||||
in[li] = xattn[wi].word[li];
|
||||
in[li] = xattn->word[li];
|
||||
|
||||
// Need this here if xattn is __local
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// Projection
|
||||
ctor_sgemv(out, in, proj_bias, proj_weight, 768/4, 768/4, 1, li);
|
||||
ircd_gpt_sgemv(out, in, proj_bias, proj_weight, 768/4, 768/4, 1, li);
|
||||
|
||||
// Accumulation; end of layer
|
||||
accum[wi].word[li] += out[li];
|
||||
}
|
||||
|
||||
__kernel void
|
||||
ctor_ffnn(__global const struct ctor_ctrl *const ctrl,
|
||||
__constant const struct ctor_opts *const opts,
|
||||
__global union tokenv *const restrict accum,
|
||||
__global const float4 *const restrict norm_bias,
|
||||
__global const float4 *const restrict norm_weight,
|
||||
__global const float4 *const restrict fcon_bias,
|
||||
__global const float4 *const restrict fcon_weight,
|
||||
__global const float4 *const restrict proj_bias,
|
||||
__global const float4 *const restrict proj_weight)
|
||||
ircd_gpt_attn_self(__global const struct ircd_gpt_task *const ctrl,
|
||||
__constant const struct ircd_gpt_opts *const opts,
|
||||
__local union ircd_gpt_tokenv *const restrict out,
|
||||
__global const struct ircd_gpt_qkvv *const restrict token,
|
||||
__global const struct ircd_gpt_attn_mask *const restrict mask) // [1024][1024],
|
||||
{
|
||||
const uint
|
||||
gi = get_global_id(0),
|
||||
|
@ -262,96 +168,13 @@ ctor_ffnn(__global const struct ctor_ctrl *const ctrl,
|
|||
wi = get_group_id(0),
|
||||
wn = get_num_groups(0);
|
||||
|
||||
__local union aperaturev token;
|
||||
__local float4 tmp[768/4];
|
||||
|
||||
// Fetch local copy of the global accumulator. We operate on a cached
|
||||
// copy as input, and add our output to the global upon completion.
|
||||
token.word[li] = accum[wi].word[li];
|
||||
|
||||
// Layer re-normalization
|
||||
ctor_norm(token.word, token.word, tmp, ln, li);
|
||||
ctor_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
|
||||
|
||||
// Fully connected
|
||||
for(uint i = 0; i < 4; ++i)
|
||||
ctor_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 3072/4, 768/4, 4, i * ln + li);
|
||||
|
||||
// Gaussian Error Linear Unit
|
||||
for(uint i = 0; i < 4; ++i)
|
||||
ctor_gelu(token.fcon, token.fcon, i * ln + li);
|
||||
|
||||
// Projection
|
||||
ctor_sgemv(tmp, token.fcon, proj_bias, proj_weight, 768/4, 3072/4, 4, li);
|
||||
|
||||
// Accumulation; end of layer
|
||||
accum[wi].word[li] += tmp[li];
|
||||
}
|
||||
|
||||
__kernel void
|
||||
ctor_backend(__global const struct ctor_ctrl *const ctrl,
|
||||
__constant const struct ctor_opts *const opts,
|
||||
__global union tokenv *const restrict accum,
|
||||
__global const union tokenv *const restrict xattn,
|
||||
__global const float4 *const restrict attn_proj_bias,
|
||||
__global const float4 *const restrict attn_proj_weight,
|
||||
__global const float4 *const restrict ffnn_norm_bias,
|
||||
__global const float4 *const restrict ffnn_norm_weight,
|
||||
__global const float4 *const restrict ffnn_fcon_bias,
|
||||
__global const float4 *const restrict ffnn_fcon_weight,
|
||||
__global const float4 *const restrict ffnn_proj_bias,
|
||||
__global const float4 *const restrict ffnn_proj_weight)
|
||||
{
|
||||
ctor_attn_proj
|
||||
(
|
||||
ctrl,
|
||||
opts,
|
||||
accum,
|
||||
xattn,
|
||||
attn_proj_bias,
|
||||
attn_proj_weight
|
||||
);
|
||||
|
||||
ctor_ffnn
|
||||
(
|
||||
ctrl,
|
||||
opts,
|
||||
accum,
|
||||
ffnn_norm_bias,
|
||||
ffnn_norm_weight,
|
||||
ffnn_fcon_bias,
|
||||
ffnn_fcon_weight,
|
||||
ffnn_proj_bias,
|
||||
ffnn_proj_weight
|
||||
);
|
||||
}
|
||||
|
||||
//
|
||||
// ctrl
|
||||
//
|
||||
|
||||
__kernel void
|
||||
ctor_attn_self(__global const struct ctor_ctrl *const ctrl,
|
||||
__constant const struct ctor_opts *const opts,
|
||||
__global union tokenv *const restrict out,
|
||||
__global const struct qkvv *const restrict token,
|
||||
__global const struct attn_mask *const restrict mask) // [1024][1024],
|
||||
{
|
||||
__local struct
|
||||
__local union
|
||||
{
|
||||
float
|
||||
attn[12][32];
|
||||
attn[12][96];
|
||||
}
|
||||
self;
|
||||
|
||||
const uint
|
||||
gi = get_global_id(0),
|
||||
gn = get_global_size(0),
|
||||
li = get_local_id(0),
|
||||
ln = get_local_size(0),
|
||||
wi = get_group_id(0),
|
||||
wn = get_num_groups(0);
|
||||
|
||||
for(uint i = 0; i < wn; ++i)
|
||||
if(mask[wi].token[i])
|
||||
self.attn[li][i] = 0.0f;
|
||||
|
@ -389,114 +212,167 @@ ctor_attn_self(__global const struct ctor_ctrl *const ctrl,
|
|||
self.attn[li][i] /= acc;
|
||||
|
||||
for(uint j = 0; j < 64/4; ++j)
|
||||
out[wi].attn[li][j] = 0.0f;
|
||||
out->attn[li][j] = 0.0f;
|
||||
|
||||
for(uint i = 0; i < wn; ++i)
|
||||
for(uint j = 0; j < 64/4; ++j)
|
||||
out[wi].attn[li][j] += token[i].val.attn[li][j] * self.attn[li][i];
|
||||
out->attn[li][j] += token[i].val.attn[li][j] * self.attn[li][i];
|
||||
}
|
||||
|
||||
//
|
||||
// leads
|
||||
//
|
||||
|
||||
__kernel void
|
||||
ctor_anode0(__global const struct ctor_ctrl *const ctrl,
|
||||
__constant const struct ctor_opts *const opts,
|
||||
__global union tokenv *const restrict accum,
|
||||
__global const union tokenv *const restrict pos,
|
||||
__global const union tokenv *const restrict vocab)
|
||||
ircd_gpt_attn_fcon(__global const struct ircd_gpt_task *const ctrl,
|
||||
__constant const struct ircd_gpt_opts *const opts,
|
||||
__global union ircd_gpt_aperaturev *const restrict out,
|
||||
__global const union ircd_gpt_tokenv *const restrict in,
|
||||
__global const float4 *const restrict norm_bias,
|
||||
__global const float4 *const restrict norm_weight,
|
||||
__global const float4 *const restrict fcon_bias,
|
||||
__global const float4 *const restrict fcon_weight)
|
||||
{
|
||||
const uint
|
||||
gi = get_global_id(0),
|
||||
gn = get_global_size(0),
|
||||
li = get_local_id(0),
|
||||
wi = get_group_id(0);
|
||||
ln = get_local_size(0),
|
||||
wi = get_group_id(0),
|
||||
wn = get_num_groups(0);
|
||||
|
||||
const ushort
|
||||
token = ctrl->body.token[wi];
|
||||
__local union ircd_gpt_aperaturev token;
|
||||
__local float4 tmp[768/4];
|
||||
|
||||
const float4
|
||||
wte = vocab[token].word[li],
|
||||
wpe = pos[wi].word[li];
|
||||
token.word[li] = in[wi].word[li];
|
||||
|
||||
accum[wi].word[li] = wte + wpe;
|
||||
// Layer re-normalization
|
||||
ircd_simt_math_norm_f4lldr(token.word, token.word, tmp, ln, li);
|
||||
ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
|
||||
|
||||
// Fully connected
|
||||
for(uint i = 0; i < 3; ++i)
|
||||
ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 2304/4, 768/4, 4, i * ln + li);
|
||||
|
||||
// Export queries, keys, and values.
|
||||
for(uint i = 0; i < 3; ++i)
|
||||
out[wi].proj[i][li] = token.proj[i][li];
|
||||
}
|
||||
|
||||
__kernel void
|
||||
ctor_anode1(__global const struct ctor_ctrl *const ctrl,
|
||||
__constant const struct ctor_opts *const opts,
|
||||
__global union tokenv *const restrict accum,
|
||||
__global const union tokenv *const restrict pos,
|
||||
__global const union tokenv *const restrict vocab)
|
||||
ircd_gpt_coil(__global const struct ircd_gpt_task *const ctrl,
|
||||
__constant const struct ircd_gpt_opts *const opts,
|
||||
__global union ircd_gpt_tokenv *const restrict accum,
|
||||
__global const struct ircd_gpt_qkvv *const restrict state,
|
||||
__global const struct ircd_gpt_attn_mask *const restrict mask, // [1024][1024],
|
||||
__global const float4 *const restrict attn_proj_bias,
|
||||
__global const float4 *const restrict attn_proj_weight,
|
||||
__global const float4 *const restrict ffnn_norm_bias,
|
||||
__global const float4 *const restrict ffnn_norm_weight,
|
||||
__global const float4 *const restrict ffnn_fcon_bias,
|
||||
__global const float4 *const restrict ffnn_fcon_weight,
|
||||
__global const float4 *const restrict ffnn_proj_bias,
|
||||
__global const float4 *const restrict ffnn_proj_weight)
|
||||
{
|
||||
__local union ircd_gpt_tokenv value;
|
||||
|
||||
ircd_gpt_attn_self
|
||||
(
|
||||
ctrl,
|
||||
opts,
|
||||
&value,
|
||||
state,
|
||||
mask
|
||||
);
|
||||
|
||||
ircd_gpt_attn_proj
|
||||
(
|
||||
ctrl,
|
||||
opts,
|
||||
accum,
|
||||
&value,
|
||||
attn_proj_bias,
|
||||
attn_proj_weight
|
||||
);
|
||||
|
||||
ircd_gpt_ffnn
|
||||
(
|
||||
ctrl,
|
||||
opts,
|
||||
accum,
|
||||
ffnn_norm_bias,
|
||||
ffnn_norm_weight,
|
||||
ffnn_fcon_bias,
|
||||
ffnn_fcon_weight,
|
||||
ffnn_proj_bias,
|
||||
ffnn_proj_weight
|
||||
);
|
||||
}
|
||||
|
||||
//
|
||||
// frontend
|
||||
//
|
||||
|
||||
inline void
|
||||
_ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
|
||||
__constant const struct ircd_gpt_opts *const opts,
|
||||
__global union ircd_gpt_tokenv *const restrict out,
|
||||
__global const union ircd_gpt_tokenv *const restrict pos,
|
||||
__global const union ircd_gpt_tokenv *const restrict vocab,
|
||||
const uint out_idx,
|
||||
const uint tok_idx,
|
||||
const uint word_idx)
|
||||
{
|
||||
const ushort
|
||||
token = ctrl->token[(ctrl->head + tok_idx) % opts->buffer_tokens];
|
||||
|
||||
const float4
|
||||
wte = vocab[token].word[word_idx],
|
||||
wpe = pos[tok_idx].word[word_idx];
|
||||
|
||||
out[out_idx].word[word_idx] = wte + wpe;
|
||||
}
|
||||
|
||||
__kernel void
|
||||
ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
|
||||
__constant const struct ircd_gpt_opts *const opts,
|
||||
__global union ircd_gpt_tokenv *const restrict accum,
|
||||
__global const union ircd_gpt_tokenv *const restrict pos,
|
||||
__global const union ircd_gpt_tokenv *const restrict vocab)
|
||||
{
|
||||
const uint
|
||||
li = get_local_id(0);
|
||||
|
||||
for(uint i = 0; i < ctrl->tokens; ++i)
|
||||
{
|
||||
const ushort
|
||||
token = ctrl->body.token[i];
|
||||
|
||||
const float4
|
||||
wte = vocab[token].word[li],
|
||||
wpe = pos[i].word[li];
|
||||
|
||||
accum[i].word[li] = wte + wpe;
|
||||
}
|
||||
_ircd_gpt_lm_embed(ctrl, opts, accum, pos, vocab, i, i, li);
|
||||
}
|
||||
|
||||
__kernel void
|
||||
ctor_anode2(__global const struct ctor_ctrl *const ctrl,
|
||||
__constant const struct ctor_opts *const opts,
|
||||
__global union tokenv *const restrict accum,
|
||||
__global const union tokenv *const restrict pos,
|
||||
__global const union tokenv *const restrict vocab)
|
||||
{
|
||||
const uint
|
||||
gi = get_global_id(0);
|
||||
|
||||
const ushort
|
||||
token = ctrl->body.token[gi];
|
||||
|
||||
for(uint i = 0; i < 768/4; ++i)
|
||||
{
|
||||
const float4
|
||||
wte = vocab[token].word[i],
|
||||
wpe = pos[gi].word[i];
|
||||
|
||||
accum[gi].word[i] = wte + wpe;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void
|
||||
ctor_cathode(__global const struct ctor_ctrl *const ctrl,
|
||||
__constant const struct ctor_opts *const opts,
|
||||
__global union tokenv *const restrict accum,
|
||||
__global const float4 *const restrict norm_bias,
|
||||
__global const float4 *const restrict norm_weight)
|
||||
ircd_gpt_lm_norm(__global const struct ircd_gpt_task *const ctrl,
|
||||
__constant const struct ircd_gpt_opts *const opts,
|
||||
__global union ircd_gpt_tokenv *const restrict accum,
|
||||
__global const float4 *const restrict norm_bias,
|
||||
__global const float4 *const restrict norm_weight)
|
||||
{
|
||||
const uint
|
||||
li = get_local_id(0),
|
||||
ln = get_local_size(0),
|
||||
wi = get_global_offset(0) / ln + get_group_id(0);
|
||||
|
||||
__local union tokenv
|
||||
__local union ircd_gpt_tokenv
|
||||
token, tmp;
|
||||
|
||||
token.word[li] = accum[wi].word[li];
|
||||
|
||||
// Final re-normalization
|
||||
ctor_norm(token.word, token.word, tmp.word, ln, li);
|
||||
ctor_norm_fmad(token.word, token.word, norm_bias, norm_weight, li);
|
||||
ircd_simt_math_norm_f4lldr(token.word, token.word, tmp.word, ln, li);
|
||||
ircd_gpt_norm_fmad(token.word, token.word, norm_bias, norm_weight, li);
|
||||
|
||||
accum[0].word[li] = token.word[li];
|
||||
}
|
||||
|
||||
__kernel void
|
||||
ctor_lmhead(__global const struct ctor_ctrl *const ctrl,
|
||||
__constant const struct ctor_opts *const opts,
|
||||
__global float *const restrict logit,
|
||||
__global const union tokenv *const restrict accum,
|
||||
__global const union tokenv *const restrict token)
|
||||
ircd_gpt_lm_logit(__global const struct ircd_gpt_task *const ctrl,
|
||||
__constant const struct ircd_gpt_opts *const opts,
|
||||
__global float *const restrict logit,
|
||||
__global const union ircd_gpt_tokenv *const restrict accum,
|
||||
__global const union ircd_gpt_tokenv *const restrict token)
|
||||
{
|
||||
const uint
|
||||
gi = get_global_id(0);
|
||||
|
@ -519,10 +395,79 @@ ctor_lmhead(__global const struct ctor_ctrl *const ctrl,
|
|||
logit[gi] = res;
|
||||
}
|
||||
|
||||
inline void
|
||||
ircd_gpt_leave(__global struct ircd_gpt_task *const ctrl,
|
||||
__constant const struct ircd_gpt_opts *const opts,
|
||||
const uint li)
|
||||
{
|
||||
// No action for other threads right now
|
||||
if(li != 0)
|
||||
return;
|
||||
|
||||
// Run debug checks and assertions.
|
||||
#ifdef RB_DEBUG
|
||||
if(li == 0 && ctrl->call == IRCD_GPT_ECOMPLETE)
|
||||
if(ctrl->tokens < 2)
|
||||
ctrl->call = IRCD_GPT_ETOKENS;
|
||||
#endif
|
||||
|
||||
// If the call value has been set to something other than default we
|
||||
// do nothing else here.
|
||||
if(ctrl->call != IRCD_GPT_ECOMPLETE)
|
||||
return;
|
||||
|
||||
// On the last cycle, with no prior call or error code set, indicate
|
||||
// a nominal exit condition.
|
||||
if(ctrl->cycle + 1 >= opts->limit)
|
||||
{
|
||||
ctrl->call = IRCD_GPT_ACCEPT;
|
||||
ctrl->epoch += 1;
|
||||
}
|
||||
|
||||
ctrl->cycle += 1;
|
||||
}
|
||||
|
||||
inline void
|
||||
ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl,
|
||||
__constant const struct ircd_gpt_opts *const opts,
|
||||
const uint li,
|
||||
__local const ushort *const restrict idx)
|
||||
{
|
||||
// To read from cells other than idx[0] we need this barrier.
|
||||
if(opts->top_k > 1)
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// No action for other threads right now
|
||||
if(li != 0)
|
||||
return;
|
||||
|
||||
// When the hypercall code is already set, bail here.
|
||||
if(ctrl->call != IRCD_GPT_ECOMPLETE)
|
||||
return;
|
||||
|
||||
const bool
|
||||
buffer_full = ctrl->tokens >= opts->buffer_tokens;
|
||||
|
||||
const ulong
|
||||
rnd = ircd_simt_rand_xoshiro256pg(ctrl->rand),
|
||||
sel = rnd % max(opts->top_k, 1U);
|
||||
|
||||
const ushort
|
||||
token = idx[sel],
|
||||
token_idx = (ctrl->head + ctrl->tokens) % opts->buffer_tokens;
|
||||
|
||||
ctrl->token[token_idx] = token;
|
||||
|
||||
if(buffer_full)
|
||||
ctrl->head = (ctrl->head + 1) % opts->buffer_tokens;
|
||||
else
|
||||
ctrl->tokens++;
|
||||
}
|
||||
|
||||
__kernel void
|
||||
ctor_lmamax(__global struct ctor_ctrl *const ctrl,
|
||||
__constant const struct ctor_opts *const opts,
|
||||
__global const float *const restrict logit)
|
||||
ircd_gpt_lm_select(__global struct ircd_gpt_task *const ctrl,
|
||||
__constant const struct ircd_gpt_opts *const opts,
|
||||
__global const float *const restrict logit)
|
||||
{
|
||||
const uint
|
||||
gi = get_global_id(0),
|
||||
|
@ -535,25 +480,13 @@ ctor_lmamax(__global struct ctor_ctrl *const ctrl,
|
|||
ti = tn * li;
|
||||
|
||||
__local ushort idx[192];
|
||||
__local float best[192];
|
||||
|
||||
idx[li] = ti;
|
||||
for(uint j = ti + 1; j < ti + tn && j < 50257; ++j)
|
||||
if(logit[j] > logit[idx[li]])
|
||||
idx[li] = j;
|
||||
|
||||
best[li] = logit[idx[li]];
|
||||
ctor_local_reduce_tournament_ldr(best, idx, ln, li);
|
||||
|
||||
if(li == 0 && ctrl->call == -1)
|
||||
ctrl->body.token[ctrl->tokens++] = idx[li];
|
||||
|
||||
if(li == 0 && ctrl->call == -1)
|
||||
ctrl->call = 1;
|
||||
|
||||
#ifdef RB_DEBUG
|
||||
if(li == 0 && ctrl->call == 1)
|
||||
if(ctrl->tokens < 2)
|
||||
ctrl->call = -2;
|
||||
#endif
|
||||
ircd_simt_sort_idx16_flldr(idx, logit, ln, li);
|
||||
ircd_gpt_lm_result(ctrl, opts, li, idx);
|
||||
ircd_gpt_leave(ctrl, opts, li);
|
||||
}
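To restate the selection above on the host side: the sorted index array is arranged so the strongest logits come first, and ircd_gpt_lm_result picks uniformly among the best top_k of them. A hedged standalone sketch, with std::mt19937_64 standing in for the device's xoshiro256 state; it assumes a non-empty logit vector and is illustrative only.

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <random>
#include <vector>

// Host-side restatement of top-k token selection; illustrative only.
static uint16_t
select_token(const std::vector<float> &logit,
             const unsigned top_k,
             std::mt19937_64 &rng)
{
	std::vector<uint16_t> idx(logit.size());
	std::iota(idx.begin(), idx.end(), uint16_t(0));

	// Order the k best token ids by descending logit.
	const size_t k(std::min<size_t>(std::max(top_k, 1U), idx.size()));
	std::partial_sort(idx.begin(), idx.begin() + k, idx.end(), [&logit]
	(const auto a, const auto b)
	{
		return logit[a] > logit[b];
	});

	// cf. rnd % max(opts->top_k, 1U) above.
	return idx[rng() % k];
}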
|
||||
|
|
|
@ -142,7 +142,7 @@ ircd::gpt::model::init_from_cache(const string_view &cache_path)
|
|||
|
||||
fs::map::opts map_opts;
|
||||
map_opts.huge2mb = true;
|
||||
map_opts.locked = false;
|
||||
map_opts.locked = true;
|
||||
default_model_shm = fs::map
|
||||
{
|
||||
fd, map_opts, sizeof(decoder)
|
||||
|
|
443	ircd/gpt_pipe.cc
|
@@ -8,21 +8,33 @@
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

#include <ircd/gpt/pipe/pipe.h>

namespace ircd::gpt
{
	void transform(ctor_ctrl &, const ctor_opts &);
}

namespace ircd::gpt::pipe
{
	static ircd::cl::exec::opts negative_opts, positive_opts, selfattn_opts, cathode_opts, anode_opts,
	lmhead_opts, lmamax_opts;
	static void profile_dumplog(pipe::exec &);

	static ircd::cl::exec::opts
	negative_opts, positive_opts, selfattn_opts,
	cathode_opts, anode_opts, lmhead_opts, lmamax_opts;

	extern conf::item<size_t> flush_cycles;
	extern conf::item<size_t> queue_cycles;
	extern const ircd::run::changed handle_quit;
}

decltype(ircd::gpt::pipe::queue_cycles)
ircd::gpt::pipe::queue_cycles
{
	{ "name", "ircd.gpt.pipe.queue" },
	{ "default", 1L, },
};

decltype(ircd::gpt::pipe::flush_cycles)
ircd::gpt::pipe::flush_cycles
{
	{ "name", "ircd.gpt.pipe.flush" },
	{ "default", 0L, },
};

decltype(ircd::gpt::pipe::default_model)
ircd::gpt::pipe::default_model;
@@ -82,144 +94,233 @@ noexcept
//

void
ircd::gpt::transform(ctor_ctrl &ctrl,
                     const ctor_opts &opts)
ircd::gpt::generate(task &task)
{
	if(unlikely(!pipe::default_model))
		pipe::init();

	ctrl.call = -1;
	pipe::exec
	const auto &opts
	{
		ctrl, opts
		*task.opts
	};

	auto &ctrl
	{
		*task.ctrl
	};

	ctrl.call = IRCD_GPT_ECOMPLETE;
	ctrl.host_tsc = prof::cycles();
	size_t cycle(ctrl.cycle);
	const size_t tokens(ctrl.tokens);

	std::deque<pipe::exec> list;
	for(; cycle < opts.limit; ++cycle)
	{
		// When the release/acquire bits are set the control pages are sent
		// and received; only set on first and last iterations of this loop.
		const bool
		rel(cycle == 0),
		acq(cycle + 1 >= opts.limit);

		// Enqueue the cycle's commands
		list.emplace_back
		(
			task, tokens + cycle, rel, acq
		);

		// Conditions for a cl::flush here; this is not default but
		// may be configured to improve some workloads.
		const bool flush
		{
			// Flushing here is enabled by the configuration
			pipe::flush_cycles

			// Skip flushing on cycles already performing IO or waiting.
			&& !rel && !acq && list.size() <= pipe::queue_cycles

			// The configuration item can specify an interval greater than
			// one between flushes.
			&& cycle % pipe::flush_cycles == 0
		};

		if(flush)
			cl::flush();

		// Enqueue consecutive repetitions of our kernel batch before waiting
		// on the first; based on the configuration. XXX get from ircd::cl
		if(list.size() <= pipe::queue_cycles)
			continue;

		// Profiling branch
		if((false))
		{
			auto &ex(list.front());
			profile_dumplog(ex);
		}

		// Destructing the front of the queue waits for completion by yielding
		// this ircd::ctx.
		list.pop_front();
	}

	// Wait for all unfinished
	list.clear();

	// Interp error codes
	if(unlikely(ctrl.call <= 0))
		throw error
		{
			"hyper (#%d) :%s",
			abs(ctrl.call),
			ctrl.body.str,
			abs(int(ctrl.call)),
			reflect(ctrl.call),
		};

	always_assert(ctrl.cycle == cycle);
}

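The loop above keeps a small window of in-flight cycles: it enqueues up to queue_cycles batches ahead, optionally flushes, and then blocks on the oldest batch by destroying it. A condensed sketch of that bounded-pipeline pattern, detached from the cl specifics; Job and run_pipeline are stand-ins, not part of the ircd API.

	#include <cstddef>
	#include <deque>

	// Generic bounded pipeline: keep at most `depth + 1` jobs outstanding; popping
	// the front blocks (in ircd, by yielding the ctx) until that job completes.
	template<class Job, class Enqueue>
	void run_pipeline(size_t cycles, size_t depth, Enqueue &&enqueue)
	{
		std::deque<Job> list;
		for(size_t cycle = 0; cycle < cycles; ++cycle)
		{
			const bool rel = cycle == 0;            // send the control page on the first cycle
			const bool acq = cycle + 1 >= cycles;   // receive it back on the last
			list.emplace_back(enqueue(cycle, rel, acq));

			if(list.size() <= depth)                // fill the window before waiting
				continue;

			list.pop_front();                       // destructor waits for completion
		}

		list.clear();                               // drain whatever is still in flight
	}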
void
ircd::gpt::pipe::profile_dumplog(pipe::exec &exec)
{
	constexpr size_t coils
	{
		sizeof(exec.coil) / sizeof(cl::exec)
	};

	for(size_t i(0); i < coils; ++i)
	{
		exec.coil[i].wait();
		const auto &pro
		{
			exec.coil[i].profile()
		};

		char tmbuf[4][32] {{0}};
		log::logf
		{
			log, log::level::DEBUG,
			"coil:%-2lu %8s %8s %8s %8s\n",
			i,
			util::pretty(tmbuf[0], si(pro[0]), 1),
			util::pretty(tmbuf[1], si(pro[1]), 1),
			util::pretty(tmbuf[2], si(pro[2]), 1),
			util::pretty(tmbuf[3], si(pro[3]), 1),
		};
	}
}

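profile_dumplog prints four timing figures per kernel. With the raw OpenCL profiling API those would typically be the QUEUED/SUBMIT/START/END counters of the completed event; whether cl::exec::profile() returns exactly these four is an assumption here.

	#include <CL/cl.h>

	// Fetch the four standard profiling counters for a completed event.
	static void
	profile_event(cl_event ev, cl_ulong out[4])
	{
		static const cl_profiling_info info[4]
		{
			CL_PROFILING_COMMAND_QUEUED,
			CL_PROFILING_COMMAND_SUBMIT,
			CL_PROFILING_COMMAND_START,
			CL_PROFILING_COMMAND_END,
		};

		for(int i = 0; i < 4; ++i)
			clGetEventProfilingInfo(ev, info[i], sizeof(out[i]), &out[i], nullptr);
	}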
//
// pipe::exec
//

ircd::gpt::pipe::exec::exec(ctor_ctrl &ctrl,
                            const ctor_opts &opts)
ircd::gpt::pipe::exec::exec(task &task,
                            const size_t tokens,
                            const bool release,
                            const bool acquire)
:desc
{
	default_desc
}
,out_ctrl
,send_opts
{
	reinterpret_cast<char *>(&ctrl), sizeof(ctor_ctrl)
	reinterpret_cast<const char *>(task.opts),
	release? sizeof(struct ircd_gpt_opts): 0
}
,in_ctrl
,send_ctrl
{
	reinterpret_cast<const char *>(&ctrl), sizeof(ctor_ctrl)
	reinterpret_cast<const char *>(task.ctrl),
	release? sizeof(struct ircd_gpt_task): 0
}
,in_opts
,recv_ctrl
{
	reinterpret_cast<const char *>(&opts), sizeof(ctor_opts)
	reinterpret_cast<char *>(task.ctrl),
	acquire? sizeof(struct ircd_gpt_task): 0
}
,range_anode
,range_lm_embed
{
	{ ctrl.tokens, 0, },
	{ 1, 0, },
}
,range_coil
{
	{ ctrl.tokens * 192UL, 0, },
	{ 192UL, 0, },
	{ 1 * 192UL, 0, },
	{ 192UL, 0, },
}
,range_negative
{
	range_coil
}
,range_selfattn
{
	range_coil
	{ tokens * 192UL, 0, },
	{ 192UL, 0, },
}
,range_positive
{
	range_coil
	{ tokens * 192UL, 0, },
	{ 192UL, 0, },
}
,range_cathode
,range_lm_norm
{
	{ 1 * 192UL, 0 },
	{ 192UL, 0 },
	{ (ctrl.tokens - 1) * 192UL, 0 },
	{ 1 * 192UL, 0 },
	{ 192UL, 0 },
	{ (tokens - 1) * 192UL, 0 },
}
,range_lmhead
,range_lm_logit
{
	{ 262 * 192UL, 0 }, // align_up(50257) / 192
	{ 192UL, 0 },
}
,range_lmamax
,range_lm_select
{
	{ 1 * 192UL, 0 },
	{ 192UL, 0 },
}
,send
,release_opts
{
	{ desc->opts, in_opts },
	{ desc->ctrl, in_ctrl },
	desc->opts, send_opts
}
,tail
,release_ctrl
{
	{ desc->anode, range_anode, anode_opts },
	desc->ctrl, send_ctrl
}
,lm_embed
{
	desc->lm_embed, range_lm_embed, anode_opts
}
,coil
{
	{ desc->layer[0x00]->negative, range_negative, negative_opts },
	{ desc->layer[0x00]->selfattn, range_selfattn, selfattn_opts },
	{ desc->layer[0x00]->positive, range_positive, positive_opts },
	{ desc->layer[0x01]->negative, range_negative, negative_opts },
	{ desc->layer[0x01]->selfattn, range_selfattn, selfattn_opts },
	{ desc->layer[0x01]->positive, range_positive, positive_opts },
	{ desc->layer[0x02]->negative, range_negative, negative_opts },
	{ desc->layer[0x02]->selfattn, range_selfattn, selfattn_opts },
	{ desc->layer[0x02]->positive, range_positive, positive_opts },
	{ desc->layer[0x03]->negative, range_negative, negative_opts },
	{ desc->layer[0x03]->selfattn, range_selfattn, selfattn_opts },
	{ desc->layer[0x03]->positive, range_positive, positive_opts },
	{ desc->layer[0x04]->negative, range_negative, negative_opts },
	{ desc->layer[0x04]->selfattn, range_selfattn, selfattn_opts },
	{ desc->layer[0x04]->positive, range_positive, positive_opts },
	{ desc->layer[0x05]->negative, range_negative, negative_opts },
	{ desc->layer[0x05]->selfattn, range_selfattn, selfattn_opts },
	{ desc->layer[0x05]->positive, range_positive, positive_opts },
	{ desc->layer[0x06]->negative, range_negative, negative_opts },
	{ desc->layer[0x06]->selfattn, range_selfattn, selfattn_opts },
	{ desc->layer[0x06]->positive, range_positive, positive_opts },
	{ desc->layer[0x07]->negative, range_negative, negative_opts },
	{ desc->layer[0x07]->selfattn, range_selfattn, selfattn_opts },
	{ desc->layer[0x07]->positive, range_positive, positive_opts },
	{ desc->layer[0x08]->negative, range_negative, negative_opts },
	{ desc->layer[0x08]->selfattn, range_selfattn, selfattn_opts },
	{ desc->layer[0x08]->positive, range_positive, positive_opts },
	{ desc->layer[0x09]->negative, range_negative, negative_opts },
	{ desc->layer[0x09]->selfattn, range_selfattn, selfattn_opts },
	{ desc->layer[0x09]->positive, range_positive, positive_opts },
	{ desc->layer[0x0a]->negative, range_negative, negative_opts },
	{ desc->layer[0x0a]->selfattn, range_selfattn, selfattn_opts },
	{ desc->layer[0x0a]->positive, range_positive, positive_opts },
	{ desc->layer[0x0b]->negative, range_negative, negative_opts },
	{ desc->layer[0x0b]->selfattn, range_selfattn, selfattn_opts },
	{ desc->layer[0x0b]->positive, range_positive, positive_opts },
}
,head
,lm_norm
{
	{ desc->cathode, range_cathode, cathode_opts },
	{ desc->lmhead, range_lmhead, lmhead_opts },
	{ desc->lmamax, range_lmamax, lmamax_opts },
	desc->lm_norm, range_lm_norm, cathode_opts
}
,recv
,lm_logit
{
	{ desc->ctrl, out_ctrl },
	desc->lm_logit, range_lm_logit, lmhead_opts
}
,lm_select
{
	desc->lm_select, range_lm_select, lmamax_opts
}
,acquire_ctrl
{
	desc->ctrl, recv_ctrl
}
{
}

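The work-range pairs above follow a fixed geometry: a 192-lane work-group per token for the coil kernels (global size tokens * 192, local size 192), and a single group offset to the last token's slot for the final norm/logit/select stage. With the raw OpenCL API that launch geometry looks roughly like the following; the kernel and queue handles are assumed to exist and the interpretation of the offset fields is inferred from the ranges in this hunk.

	#include <CL/cl.h>

	// One 192-lane work-group per token for the coil kernels; a single group
	// offset to the last token for lm_norm. Assumes tokens >= 1.
	static void
	enqueue_ranges(cl_command_queue q, cl_kernel coil, cl_kernel norm, size_t tokens)
	{
		const size_t local = 192;

		// coil: global size covers every token
		size_t global = tokens * local, offset = 0;
		clEnqueueNDRangeKernel(q, coil, 1, &offset, &global, &local, 0, nullptr, nullptr);

		// lm_norm: single work-group, offset to the last token's slot
		size_t one = local, last = (tokens - 1) * local;
		clEnqueueNDRangeKernel(q, norm, 1, &last, &one, &local, 0, nullptr, nullptr);
	}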
@@ -296,29 +397,14 @@ ircd::gpt::pipe::desc::desc(pipe::code &code,
{
	&code
}
,opts
{
	4_KiB,
	const_buffer{}
}
,ctrl
{
	4_KiB,
	mutable_buffer{}
}
,state
{
	32 * 3 * 768 * sizeof(float),
	mutable_buffer{}
}
,xattn
{
	32 * 1 * 768 * sizeof(float),
	96 * 3 * 768 * sizeof(float),
	mutable_buffer{}
}
,accum
{
	32 * 768 * sizeof(float),
	96 * 768 * sizeof(float),
	mutable_buffer{}
}
,logit
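The new intermediate-buffer sizes appearing in this hunk (96 * 3 * 768 and 96 * 768 floats, with the logit buffer rounded up to 65536 entries) are consistent with a 96-token context over GPT-2 small's 768-wide embedding, where the factor of 3 covers the fused attention projection. A sketch of that arithmetic; the interpretation of the factor of 3 as the fused Q/K/V projection is an assumption based on the standard GPT-2 layout.

	#include <cstddef>

	// Buffer sizing as it appears above: GPT-2 small geometry with a 96-token window.
	constexpr size_t embed_width = 768;
	constexpr size_t max_tokens  = 96;
	constexpr size_t vocab       = 50257;

	constexpr size_t state_bytes = max_tokens * 3 * embed_width * sizeof(float); // fused attention projection per token
	constexpr size_t accum_bytes = max_tokens * embed_width * sizeof(float);     // residual accumulator
	constexpr size_t logit_bytes = 65536 * sizeof(float);                        // vocab rounded up to 2^16

	static_assert(logit_bytes >= vocab * sizeof(float));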
@@ -326,16 +412,54 @@ ircd::gpt::pipe::desc::desc(pipe::code &code,
	65536 * sizeof(float),
	mutable_buffer{}
}
,anode
,ctrl
{
	sizeof(struct ircd_gpt_task),
	mutable_buffer{}
}
,opts
{
	sizeof(struct ircd_gpt_opts),
	const_buffer{}
}
,lm_embed
{
	code,
	"ctor_anode2",
	"ircd_gpt_lm_embed",
	ctrl,
	opts,
	accum,
	model.embed->pos,
	model.embed->token,
}
,lm_norm
{
	code,
	"ircd_gpt_lm_norm",
	ctrl,
	opts,
	accum,
	model.decode->norm.bias,
	model.decode->norm.weight,
}
,lm_logit
{
	code,
	"ircd_gpt_lm_logit",
	ctrl,
	opts,
	logit,
	accum,
	model.embed->token,
}
,lm_select
{
	code,
	"ircd_gpt_lm_select",
	ctrl,
	opts,
	logit,
}
,layer
{
	std::make_unique<struct desc::layer>(*this, 0x00),
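Each desc member above names a kernel ("ircd_gpt_lm_embed", "ircd_gpt_lm_norm", ...) followed by the buffers bound as its arguments, in declaration order. With the raw OpenCL API that amounts roughly to the following; the cl::kern wrapper ircd actually uses is not shown in this diff, so this is only an approximation of what the constructor performs.

	#include <CL/cl.h>

	// Roughly what constructing e.g. desc.lm_logit amounts to with the raw API:
	// create the kernel by name and bind its buffer arguments in order.
	static cl_kernel
	make_lm_logit(cl_program program, cl_mem ctrl, cl_mem opts, cl_mem logit, cl_mem accum, cl_mem token)
	{
		cl_int err;
		cl_kernel k = clCreateKernel(program, "ircd_gpt_lm_logit", &err);
		clSetKernelArg(k, 0, sizeof(cl_mem), &ctrl);
		clSetKernelArg(k, 1, sizeof(cl_mem), &opts);
		clSetKernelArg(k, 2, sizeof(cl_mem), &logit);
		clSetKernelArg(k, 3, sizeof(cl_mem), &accum);
		clSetKernelArg(k, 4, sizeof(cl_mem), &token);
		return k;
	}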
@@ -351,34 +475,6 @@ ircd::gpt::pipe::desc::desc(pipe::code &code,
	std::make_unique<struct desc::layer>(*this, 0x0a),
	std::make_unique<struct desc::layer>(*this, 0x0b),
}
,cathode
{
	code,
	"ctor_cathode",
	ctrl,
	opts,
	accum,
	model.decode->norm.bias,
	model.decode->norm.weight,
}
,lmhead
{
	code,
	"ctor_lmhead",
	ctrl,
	opts,
	logit,
	accum,
	model.embed->token,
}
,lmamax
{
	code,
	"ctor_lmamax",
	ctrl,
	opts,
	logit,
}
{
}

@@ -391,7 +487,7 @@ ircd::gpt::pipe::desc::layer::layer(pipe::desc &desc,
:negative
{
	*desc.code,
	"ctor_attn_fcon",
	"ircd_gpt_attn_fcon",
	desc.ctrl,
	desc.opts,
	desc.state,
@@ -401,24 +497,15 @@ ircd::gpt::pipe::desc::layer::layer(pipe::desc &desc,
	desc.model->decode->block[laynum].attn.fcon.bias,
	desc.model->decode->block[laynum].attn.fcon.weight,
}
,selfattn
{
	*desc.code,
	"ctor_attn_self",
	desc.ctrl,
	desc.opts,
	desc.xattn,
	desc.state,
	desc.model->decode->block[laynum].attn.mask,
}
,positive
{
	*desc.code,
	"ctor_backend",
	"ircd_gpt_coil",
	desc.ctrl,
	desc.opts,
	desc.accum,
	desc.xattn,
	desc.state,
	desc.model->decode->block[laynum].attn.mask,
	desc.model->decode->block[laynum].attn.proj.bias,
	desc.model->decode->block[laynum].attn.proj.weight,
	desc.model->decode->block[laynum].ffnn.norm.bias,
@@ -486,23 +573,33 @@ noexcept
//

ircd::gpt::pipe::model::decoder::decoder(const gpt::model::decoder &decoder)
:block
:master
{
	{ decoder.layer[0x00], 0x00, },
	{ decoder.layer[0x01], 0x01, },
	{ decoder.layer[0x02], 0x02, },
	{ decoder.layer[0x03], 0x03, },
	{ decoder.layer[0x04], 0x04, },
	{ decoder.layer[0x05], 0x05, },
	{ decoder.layer[0x06], 0x06, },
	{ decoder.layer[0x07], 0x07, },
	{ decoder.layer[0x08], 0x08, },
	{ decoder.layer[0x09], 0x09, },
	{ decoder.layer[0x0a], 0x0a, },
	{ decoder.layer[0x0b], 0x0b, },
	sizeof(gpt::model::block) * 12 + sizeof(gpt::model::norm), const_buffer
	{
		reinterpret_cast<const char *>(decoder.layer),
		sizeof(decoder.layer) + sizeof(decoder.f)
	}
}
,block
{
	{ master, sizeof(gpt::model::block) * 0x00, decoder.layer[0x00], 0x00, },
	{ master, sizeof(gpt::model::block) * 0x01, decoder.layer[0x01], 0x01, },
	{ master, sizeof(gpt::model::block) * 0x02, decoder.layer[0x02], 0x02, },
	{ master, sizeof(gpt::model::block) * 0x03, decoder.layer[0x03], 0x03, },
	{ master, sizeof(gpt::model::block) * 0x04, decoder.layer[0x04], 0x04, },
	{ master, sizeof(gpt::model::block) * 0x05, decoder.layer[0x05], 0x05, },
	{ master, sizeof(gpt::model::block) * 0x06, decoder.layer[0x06], 0x06, },
	{ master, sizeof(gpt::model::block) * 0x07, decoder.layer[0x07], 0x07, },
	{ master, sizeof(gpt::model::block) * 0x08, decoder.layer[0x08], 0x08, },
	{ master, sizeof(gpt::model::block) * 0x09, decoder.layer[0x09], 0x09, },
	{ master, sizeof(gpt::model::block) * 0x0a, decoder.layer[0x0a], 0x0a, },
	{ master, sizeof(gpt::model::block) * 0x0b, decoder.layer[0x0b], 0x0b, },
}
,norm
{
	master,
	off_t(sizeof(gpt::model::block) * 12),
	const_buffer{decoder.f.bias},
	const_buffer{decoder.f.weight},
}
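The new layout uploads all twelve transformer blocks plus the final norm as one contiguous master buffer, then carves per-layer views at offset sizeof(block) * i, with the norm at sizeof(block) * 12. With the raw OpenCL API that corresponds to a sub-buffer per region; whether ircd's cl::data wrapper uses clCreateSubBuffer or plain offsets is an assumption here.

	#include <CL/cl.h>
	#include <cstddef>

	// Carve a read-only view of layer `i` out of the master weight buffer.
	static cl_mem
	layer_view(cl_mem master, size_t block_size, size_t i)
	{
		cl_buffer_region region { i * block_size, block_size }; // origin, size
		cl_int err;
		return clCreateSubBuffer(master, CL_MEM_READ_ONLY,
		                         CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
	}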
@@ -544,6 +641,27 @@ ircd::gpt::pipe::model::block::block(const gpt::model::block &block,
{
}

ircd::gpt::pipe::model::block::block(cl::data &master,
                                     const off_t offset,
                                     const gpt::model::block &block,
                                     const size_t layer)
:attn
{
	master,
	offset,
	block.ln1,
	block.attn,
}
,ffnn
{
	master,
	offset + off_t(sizeof(block.ln1) + sizeof(block.attn)),
	block.ln2,
	block.ffnn,
}
{
}

//
// pipe::model::ffnn
//
@@ -678,3 +796,48 @@ ircd::gpt::pipe::model::tensor::tensor(cl::data &master,
}
{
}

//
// gpt::task
//

ircd::gpt::task::task(const gpt::opts *const opts,
                      struct ircd_gpt_task *const ctrl)
:opts
{
	opts
}
,ctrl
{
	ctrl
}
{
	memset(this->ctrl, 0x0, sizeof(ircd_gpt_task));

	this->ctrl->rand[0] = this->opts->seed;
	this->ctrl->rand[1] = this->opts->seed;
	this->ctrl->rand[2] = -1UL;
	this->ctrl->rand[3] = -1UL;
}

ircd::gpt::task::~task()
noexcept
{
}

//
// hypercall
//

ircd::string_view
ircd::gpt::reflect(const enum ircd_gpt_hypercall code)
noexcept
{
	switch(code)
	{
	case IRCD_GPT_ACCEPT: return "ACCEPT";
	case IRCD_GPT_ECOMPLETE: return "ECOMPLETE";
	}

	return "??????";
}

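The task constructor above preloads a 4x64-bit generator state as { seed, seed, -1, -1 }, which the device then advances through ircd_simt_rand_xoshiro256pg each selection round. The exact variant behind that helper is not shown in this diff; for orientation, the reference xoshiro256** step over the same state shape looks like this.

	#include <cstdint>

	static inline uint64_t rotl64(uint64_t x, int k)
	{
		return (x << k) | (x >> (64 - k));
	}

	// Reference xoshiro256** step over a 4x64-bit state; the task constructor
	// above initializes that state as { seed, seed, -1, -1 }.
	static uint64_t
	xoshiro256ss(uint64_t s[4])
	{
		const uint64_t result = rotl64(s[1] * 5, 7) * 9;
		const uint64_t t = s[1] << 17;
		s[2] ^= s[0];
		s[3] ^= s[1];
		s[1] ^= s[2];
		s[0] ^= s[3];
		s[2] ^= t;
		s[3] = rotl64(s[3], 45);
		return result;
	}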
@@ -17324,10 +17324,15 @@ console_cmd__gpt__raw(opt &out, const string_view &line)
	opts.limit = param.at<uint>("limit");
	opts.top_k = 3;

	gpt::task task;
	struct ircd_gpt_task ctrl;
	gpt::task task
	{
		&opts, &ctrl
	};

	const auto output
	{
		gpt::generate(buf, text, &opts, &task)
		gpt::generate(buf, text, task)
	};

	out