mirror of
https://github.com/matrix-construct/construct
synced 2024-12-25 23:14:13 +01:00
ircd::gpt: Reorganize interface.
This commit is contained in:
parent
f096e7fcf5
commit
1870c364f4
17 changed files with 1067 additions and 778 deletions
|
@ -13,15 +13,16 @@
|
||||||
|
|
||||||
namespace ircd::gpt
|
namespace ircd::gpt
|
||||||
{
|
{
|
||||||
|
void
|
||||||
|
generate(task &);
|
||||||
|
|
||||||
vector_view<u16>
|
vector_view<u16>
|
||||||
generate(const vector_view<u16> &out,
|
generate(const vector_view<u16> &out,
|
||||||
const vector_view<const u16> &in,
|
const vector_view<const u16> &in,
|
||||||
const opts * = &default_opts,
|
task &);
|
||||||
task * = nullptr);
|
|
||||||
|
|
||||||
string_view
|
string_view
|
||||||
generate(const mutable_buffer &out,
|
generate(const mutable_buffer &out,
|
||||||
const string_view &in,
|
const string_view &in,
|
||||||
const opts * = &default_opts,
|
task &);
|
||||||
task * = nullptr);
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,63 +20,14 @@ namespace ircd::gpt
|
||||||
struct opts;
|
struct opts;
|
||||||
struct task;
|
struct task;
|
||||||
|
|
||||||
extern const opts default_opts;
|
|
||||||
extern log::log log;
|
extern log::log log;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#include "hypercall.h"
|
||||||
#include "vocab.h"
|
#include "vocab.h"
|
||||||
#include "model.h"
|
#include "model.h"
|
||||||
|
#include "token.h"
|
||||||
|
#include "opts.h"
|
||||||
#include "task.h"
|
#include "task.h"
|
||||||
|
#include "pipe/pipe.h"
|
||||||
#include "generate.h"
|
#include "generate.h"
|
||||||
|
|
||||||
/// Primary Options
|
|
||||||
///
|
|
||||||
/// Use this structure to configure and control specifics of the machine.
|
|
||||||
/// These settings are immutable for the operations. To maintain state between
|
|
||||||
/// calls see task.h
|
|
||||||
///
|
|
||||||
struct ircd::gpt::opts
|
|
||||||
{
|
|
||||||
/// Specifies the nominal halting condition based on the sequence of
|
|
||||||
/// tokens. Generation will complete when this sequence is witnessed. Set
|
|
||||||
/// tokens to -1 starting from the back to not match that token. Setting
|
|
||||||
/// all tokens to -1 will ignore this condition.
|
|
||||||
uint accept_code[3][3]
|
|
||||||
{
|
|
||||||
{ 13, 198, -1U, },
|
|
||||||
{ 198, 198, -1U, },
|
|
||||||
{ -1U, -1U, -1U, },
|
|
||||||
};
|
|
||||||
|
|
||||||
/// Specifies the exceptional halting condition based on the sequence of
|
|
||||||
/// tokens. By default, the three zeros represent three outputs of '!'
|
|
||||||
/// which is probably an error code; note that a true "!!!" is represented
|
|
||||||
/// by token number 10185. Set tokens to -1 starting from the back to
|
|
||||||
/// not match that token; generated output after errors is usually garbage.
|
|
||||||
uint error_code[3][3]
|
|
||||||
{
|
|
||||||
{ 0, 0, 0, },
|
|
||||||
{ -1U, 0, 0, },
|
|
||||||
{ -1U, 0, 0, },
|
|
||||||
};
|
|
||||||
|
|
||||||
/// Limit number of output tokens. Default of -1 is unlimited; the number
|
|
||||||
/// of tokens generated will be limited by other factors.
|
|
||||||
uint limit
|
|
||||||
{
|
|
||||||
-1U
|
|
||||||
};
|
|
||||||
|
|
||||||
/// Flip random coins over the top k logits each round. Setting to 1
|
|
||||||
/// deterministically selects the top logit.
|
|
||||||
uint top_k
|
|
||||||
{
|
|
||||||
2
|
|
||||||
};
|
|
||||||
|
|
||||||
/// Pointer to the model
|
|
||||||
const model::decoder *model
|
|
||||||
{
|
|
||||||
model::default_model
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
34
include/ircd/gpt/hypercall.h
Normal file
34
include/ircd/gpt/hypercall.h
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
// Matrix Construct
|
||||||
|
//
|
||||||
|
// Copyright (C) Matrix Construct Developers, Authors & Contributors
|
||||||
|
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
|
||||||
|
//
|
||||||
|
// Permission to use, copy, modify, and/or distribute this software for any
|
||||||
|
// purpose with or without fee is hereby granted, provided that the above
|
||||||
|
// copyright notice and this permission notice is present in all copies. The
|
||||||
|
// full license for this software is available in the LICENSE file.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
#define HAVE_IRCD_GPT_HYPERCALL_H
|
||||||
|
|
||||||
|
/// Hypercalling code enumeration.
|
||||||
|
///
|
||||||
|
/// Error codes are all negative values. Zero is also an error.
|
||||||
|
enum ircd_gpt_hypercall
|
||||||
|
{
|
||||||
|
/// General nominal completion code; similar to EXIT_SUCCESS, etc.
|
||||||
|
IRCD_GPT_ACCEPT = 1,
|
||||||
|
|
||||||
|
/// Failed or incomplete execution occurred. After an execution attempt
|
||||||
|
/// it indicates no execution likely took place. Device software never
|
||||||
|
/// sets this value; it is the initial value set by the host before
|
||||||
|
/// execution.
|
||||||
|
IRCD_GPT_ECOMPLETE = 0,
|
||||||
|
};
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
namespace ircd::gpt
|
||||||
|
{
|
||||||
|
string_view reflect(const enum ircd_gpt_hypercall) noexcept;
|
||||||
|
}
|
||||||
|
#endif
|
|
@ -67,7 +67,8 @@ struct ircd::gpt::model::block
|
||||||
|
|
||||||
norm ln2;
|
norm ln2;
|
||||||
model::ffnn ffnn;
|
model::ffnn ffnn;
|
||||||
};
|
}
|
||||||
|
__attribute__((packed));
|
||||||
|
|
||||||
/// Vocabulary embeddings
|
/// Vocabulary embeddings
|
||||||
struct ircd::gpt::model::embed
|
struct ircd::gpt::model::embed
|
||||||
|
|
123
include/ircd/gpt/opts.h
Normal file
123
include/ircd/gpt/opts.h
Normal file
|
@ -0,0 +1,123 @@
|
||||||
|
// Matrix Construct
|
||||||
|
//
|
||||||
|
// Copyright (C) Matrix Construct Developers, Authors & Contributors
|
||||||
|
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
|
||||||
|
//
|
||||||
|
// Permission to use, copy, modify, and/or distribute this software for any
|
||||||
|
// purpose with or without fee is hereby granted, provided that the above
|
||||||
|
// copyright notice and this permission notice is present in all copies. The
|
||||||
|
// full license for this software is available in the LICENSE file.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
#define HAVE_IRCD_GPT_OPTS_H
|
||||||
|
|
||||||
|
/// Task Options Page
|
||||||
|
///
|
||||||
|
/// The option block is directly shared with task software as constant data.
|
||||||
|
/// This stucture and its mutable companion in `task.h` determine the outcome
|
||||||
|
/// of the next execution cycle; options are immutable to device software but
|
||||||
|
/// may be changed by the host between executions cycles if desired.
|
||||||
|
///
|
||||||
|
struct ircd_gpt_opts
|
||||||
|
{
|
||||||
|
/// Specifies the nominal halting condition based on a sequence of tokens.
|
||||||
|
/// Generation will complete with success after one of these sequences is
|
||||||
|
/// witnessed. Set tokens to -1 starting from the back for shorter
|
||||||
|
/// sequences; zero-length sequences (all -1's) are never matched.
|
||||||
|
uint accept_code[4][4]
|
||||||
|
#ifdef __cplusplus
|
||||||
|
{
|
||||||
|
{ 13, 198, -1U, -1U, },
|
||||||
|
{ 198, 198, -1U, -1U, },
|
||||||
|
{ -1U, -1U, -1U, -1U, },
|
||||||
|
{ -1U, -1U, -1U, -1U, },
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
;
|
||||||
|
|
||||||
|
/// Specifies the exceptional halting condition based on the sequence of
|
||||||
|
/// tokens. By default, the three zeros represent three outputs of '!'
|
||||||
|
/// which is probably an error; note that a true "!!!" is represented by
|
||||||
|
/// token number 10185. Set tokens to -1 starting from the back to not
|
||||||
|
/// match that token; generated output after errors is usually garbage.
|
||||||
|
uint error_code[4][4]
|
||||||
|
#ifdef __cplusplus
|
||||||
|
{
|
||||||
|
{ 0, 0, 0, -1U, },
|
||||||
|
{ -1U, -1U, -1U, -1U, },
|
||||||
|
{ -1U, -1U, -1U, -1U, },
|
||||||
|
{ -1U, -1U, -1U, -1U, },
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
;
|
||||||
|
|
||||||
|
/// Limit number of output tokens. Default of -1 is unlimited; the number
|
||||||
|
/// of tokens generated will be limited by other factors.
|
||||||
|
uint limit
|
||||||
|
#ifdef __cplusplus
|
||||||
|
{
|
||||||
|
1
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
;
|
||||||
|
|
||||||
|
/// Flip random coins over the top k logits each round. Setting to 1
|
||||||
|
/// deterministically selects the top logit.
|
||||||
|
uint top_k
|
||||||
|
#ifdef __cplusplus
|
||||||
|
{
|
||||||
|
2
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
;
|
||||||
|
|
||||||
|
/// Specifies the token context size in tokens.
|
||||||
|
uint context_tokens
|
||||||
|
#ifdef __cplusplus
|
||||||
|
{
|
||||||
|
1024
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
;
|
||||||
|
|
||||||
|
/// Specifies the token buffer size in tokens.
|
||||||
|
uint buffer_tokens
|
||||||
|
#ifdef __cplusplus
|
||||||
|
{
|
||||||
|
1024
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
;
|
||||||
|
|
||||||
|
/// Seed for the task's PRNG.
|
||||||
|
ulong seed
|
||||||
|
#ifdef __cplusplus
|
||||||
|
{
|
||||||
|
1234567890UL
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
;
|
||||||
|
}
|
||||||
|
__attribute__((aligned(4096)));
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
/// Generator Task Options.
|
||||||
|
///
|
||||||
|
/// Parameters for a task. Options are constant and one instance can be shared
|
||||||
|
/// between multiple task instances. This structure extends the task options
|
||||||
|
/// page, starting a new page which is not visible to device software; C++ and
|
||||||
|
/// host pointers are available.
|
||||||
|
///
|
||||||
|
struct ircd::gpt::opts
|
||||||
|
:ircd_gpt_opts
|
||||||
|
{
|
||||||
|
/// Pointer to the model
|
||||||
|
const model::decoder *model
|
||||||
|
{
|
||||||
|
model::default_model
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
static_assert(sizeof(struct ircd_gpt_opts) == 4096);
|
||||||
|
static_assert(std::is_standard_layout<struct ircd_gpt_opts>::value);
|
||||||
|
#endif
|
22
include/ircd/gpt/pipe/code.h
Normal file
22
include/ircd/gpt/pipe/code.h
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
// Matrix Construct
|
||||||
|
//
|
||||||
|
// Copyright (C) Matrix Construct Developers, Authors & Contributors
|
||||||
|
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
|
||||||
|
//
|
||||||
|
// Permission to use, copy, modify, and/or distribute this software for any
|
||||||
|
// purpose with or without fee is hereby granted, provided that the above
|
||||||
|
// copyright notice and this permission notice is present in all copies. The
|
||||||
|
// full license for this software is available in the LICENSE file.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
#define HAVE_IRCD_GPT_PIPE_CODE_H
|
||||||
|
|
||||||
|
/// Pipe code segment.
|
||||||
|
struct ircd::gpt::pipe::code
|
||||||
|
:cl::code
|
||||||
|
{
|
||||||
|
static const string_view compile_opts;
|
||||||
|
|
||||||
|
code();
|
||||||
|
~code() noexcept;
|
||||||
|
};
|
47
include/ircd/gpt/pipe/desc.h
Normal file
47
include/ircd/gpt/pipe/desc.h
Normal file
|
@ -0,0 +1,47 @@
|
||||||
|
// Matrix Construct
|
||||||
|
//
|
||||||
|
// Copyright (C) Matrix Construct Developers, Authors & Contributors
|
||||||
|
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
|
||||||
|
//
|
||||||
|
// Permission to use, copy, modify, and/or distribute this software for any
|
||||||
|
// purpose with or without fee is hereby granted, provided that the above
|
||||||
|
// copyright notice and this permission notice is present in all copies. The
|
||||||
|
// full license for this software is available in the LICENSE file.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
#define HAVE_IRCD_GPT_PIPE_DESC_H
|
||||||
|
|
||||||
|
/// Pipe descriptor
|
||||||
|
struct ircd::gpt::pipe::desc
|
||||||
|
{
|
||||||
|
struct layer;
|
||||||
|
|
||||||
|
pipe::model *model;
|
||||||
|
pipe::code *code;
|
||||||
|
|
||||||
|
cl::data
|
||||||
|
state, // qry/key/val projection (tokens * embed * 3 * float)
|
||||||
|
accum, // accumulator (tokens * embed * float)
|
||||||
|
logit, // result output vector (50257 * float)
|
||||||
|
ctrl, // control page
|
||||||
|
opts; // options page
|
||||||
|
|
||||||
|
cl::kern
|
||||||
|
lm_embed,
|
||||||
|
lm_norm,
|
||||||
|
lm_logit,
|
||||||
|
lm_select;
|
||||||
|
|
||||||
|
std::unique_ptr<struct desc::layer>
|
||||||
|
layer[12];
|
||||||
|
|
||||||
|
desc(pipe::code &, pipe::model &);
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ircd::gpt::pipe::desc::layer
|
||||||
|
{
|
||||||
|
cl::kern negative;
|
||||||
|
cl::kern positive;
|
||||||
|
|
||||||
|
layer(pipe::desc &, const int);
|
||||||
|
};
|
57
include/ircd/gpt/pipe/exec.h
Normal file
57
include/ircd/gpt/pipe/exec.h
Normal file
|
@ -0,0 +1,57 @@
|
||||||
|
// Matrix Construct
|
||||||
|
//
|
||||||
|
// Copyright (C) Matrix Construct Developers, Authors & Contributors
|
||||||
|
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
|
||||||
|
//
|
||||||
|
// Permission to use, copy, modify, and/or distribute this software for any
|
||||||
|
// purpose with or without fee is hereby granted, provided that the above
|
||||||
|
// copyright notice and this permission notice is present in all copies. The
|
||||||
|
// full license for this software is available in the LICENSE file.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
#define HAVE_IRCD_GPT_PIPE_EXEC_H
|
||||||
|
|
||||||
|
/// Perform one task cycle on the device.
|
||||||
|
///
|
||||||
|
/// Constructions of this object enqueue device commands to complete an
|
||||||
|
/// additional epoch of the task as provided by `ctrl` and `opts`.
|
||||||
|
///
|
||||||
|
/// Destructions of this object yield the ircd::ctx until those commands
|
||||||
|
/// are complete.
|
||||||
|
///
|
||||||
|
/// Consecutive cycles on the device without stopping (a.k.a. pipelining) is
|
||||||
|
/// achieved by constructing several objects before following with destructions
|
||||||
|
/// i.e in a std::deque.
|
||||||
|
///
|
||||||
|
struct ircd::gpt::pipe::exec
|
||||||
|
{
|
||||||
|
pipe::desc *desc;
|
||||||
|
|
||||||
|
const_buffer
|
||||||
|
send_opts, // Set when sending the options page.
|
||||||
|
send_ctrl; // Set when sending the control page.
|
||||||
|
|
||||||
|
mutable_buffer
|
||||||
|
recv_ctrl; // Set when receiving the control page.
|
||||||
|
|
||||||
|
cl::kern::range
|
||||||
|
range_lm_embed, // Dimension range of the lm_embed kernel.
|
||||||
|
range_negative, // Dimension range of a layer kernel.
|
||||||
|
range_positive, // Dimension range of a layer kernel.
|
||||||
|
range_lm_norm, // Dimension range of the final norm kernel.
|
||||||
|
range_lm_logit, // Dimension range of the language logit kernel.
|
||||||
|
range_lm_select; // Dimension range of the language token kernel.
|
||||||
|
|
||||||
|
cl::exec
|
||||||
|
release_opts, // Release the options page.
|
||||||
|
release_ctrl, // Release the control page.
|
||||||
|
lm_embed, // Compute token and positional embeddings.
|
||||||
|
coil[12 * 2], // Pass over all layers.
|
||||||
|
lm_norm, // Final normalization.
|
||||||
|
lm_logit, // Compute logit result vector.
|
||||||
|
lm_select, // Select next token.
|
||||||
|
acquire_ctrl; // Acquire the control page.
|
||||||
|
|
||||||
|
exec(task &, const size_t tokens, const bool rel, const bool acq);
|
||||||
|
~exec() noexcept;
|
||||||
|
};
|
|
@ -59,11 +59,13 @@ struct ircd::gpt::pipe::model::block
|
||||||
model::attn attn;
|
model::attn attn;
|
||||||
model::ffnn ffnn;
|
model::ffnn ffnn;
|
||||||
|
|
||||||
|
block(cl::data &, const off_t, const gpt::model::block &, const size_t);
|
||||||
block(const gpt::model::block &, const size_t);
|
block(const gpt::model::block &, const size_t);
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ircd::gpt::pipe::model::decoder
|
struct ircd::gpt::pipe::model::decoder
|
||||||
{
|
{
|
||||||
|
cl::data master;
|
||||||
model::block block[12];
|
model::block block[12];
|
||||||
tensor norm;
|
tensor norm;
|
||||||
|
|
||||||
|
|
|
@ -17,7 +17,6 @@ namespace ircd::gpt::pipe
|
||||||
struct code;
|
struct code;
|
||||||
struct desc;
|
struct desc;
|
||||||
struct exec;
|
struct exec;
|
||||||
struct bank;
|
|
||||||
|
|
||||||
extern model *default_model;
|
extern model *default_model;
|
||||||
extern code *default_code;
|
extern code *default_code;
|
||||||
|
@ -27,70 +26,6 @@ namespace ircd::gpt::pipe
|
||||||
};
|
};
|
||||||
|
|
||||||
#include "model.h"
|
#include "model.h"
|
||||||
#include "ctrl.h"
|
#include "code.h"
|
||||||
|
#include "desc.h"
|
||||||
struct ircd::gpt::pipe::code
|
#include "exec.h"
|
||||||
:cl::code
|
|
||||||
{
|
|
||||||
static const string_view compile_opts;
|
|
||||||
|
|
||||||
code();
|
|
||||||
~code() noexcept;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ircd::gpt::pipe::desc
|
|
||||||
{
|
|
||||||
struct layer;
|
|
||||||
|
|
||||||
pipe::model *model;
|
|
||||||
pipe::code *code;
|
|
||||||
|
|
||||||
cl::data opts;
|
|
||||||
cl::data ctrl;
|
|
||||||
cl::data state;
|
|
||||||
cl::data xattn;
|
|
||||||
cl::data accum;
|
|
||||||
cl::data logit;
|
|
||||||
cl::kern anode;
|
|
||||||
std::unique_ptr<struct desc::layer> layer[12];
|
|
||||||
cl::kern cathode;
|
|
||||||
cl::kern lmhead;
|
|
||||||
cl::kern lmamax;
|
|
||||||
|
|
||||||
desc(pipe::code &, pipe::model &);
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ircd::gpt::pipe::desc::layer
|
|
||||||
{
|
|
||||||
cl::kern negative;
|
|
||||||
cl::kern selfattn;
|
|
||||||
cl::kern positive;
|
|
||||||
|
|
||||||
layer(pipe::desc &, const int);
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ircd::gpt::pipe::exec
|
|
||||||
{
|
|
||||||
pipe::desc *desc;
|
|
||||||
|
|
||||||
mutable_buffer out_ctrl;
|
|
||||||
const_buffer in_ctrl, in_opts;
|
|
||||||
|
|
||||||
cl::kern::range range_anode;
|
|
||||||
cl::kern::range range_coil;
|
|
||||||
cl::kern::range range_negative;
|
|
||||||
cl::kern::range range_selfattn;
|
|
||||||
cl::kern::range range_positive;
|
|
||||||
cl::kern::range range_cathode;
|
|
||||||
cl::kern::range range_lmhead;
|
|
||||||
cl::kern::range range_lmamax;
|
|
||||||
|
|
||||||
cl::exec send[2];
|
|
||||||
cl::exec tail[1];
|
|
||||||
cl::exec coil[12 * 3];
|
|
||||||
cl::exec head[3];
|
|
||||||
cl::exec recv[1];
|
|
||||||
|
|
||||||
exec(ctor_ctrl &, const ctor_opts &);
|
|
||||||
~exec() noexcept;
|
|
||||||
};
|
|
||||||
|
|
|
@ -11,8 +11,77 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
#define HAVE_IRCD_GPT_TASK_H
|
#define HAVE_IRCD_GPT_TASK_H
|
||||||
|
|
||||||
/// Context to maintain state across calls.
|
/// Task Control Page
|
||||||
///
|
///
|
||||||
|
/// The control block is shared with our device software. Execution state is
|
||||||
|
/// maintained in the task control block across cycles. The control block is
|
||||||
|
/// the mutable state component for an execution; for the immutable component
|
||||||
|
/// also shared with device software see opts.h.
|
||||||
|
///
|
||||||
|
struct ircd_gpt_task
|
||||||
|
{
|
||||||
|
/// Header magic 0xC7012C70
|
||||||
|
uint magic;
|
||||||
|
|
||||||
|
/// Hypercall code set by our device software upon completion and control
|
||||||
|
/// transfer back to the host. Negative codes indicate errors, positive
|
||||||
|
/// codes are used for status and/or procedure calls; zero is also an error.
|
||||||
|
enum ircd_gpt_hypercall call;
|
||||||
|
|
||||||
|
/// Token ring head. Tokens in the ring extend behind the head for
|
||||||
|
/// `tokens`. The `head` value is automatically modulated by device
|
||||||
|
/// software to wrap around the ring.
|
||||||
|
uint head;
|
||||||
|
|
||||||
|
/// Token counter. The counter indicates the number of valid tokens in
|
||||||
|
/// the context buffer. This value must not exceed the buffer size.
|
||||||
|
uint tokens;
|
||||||
|
|
||||||
|
/// Accumulates the number of task cycles. The cycle counter is incremented
|
||||||
|
/// by device software after each repetition of the kernel pipeline to
|
||||||
|
/// produce one additional token.
|
||||||
|
ulong cycle;
|
||||||
|
|
||||||
|
/// Accumulates the epoch count for the task. The counter is incremented
|
||||||
|
/// by one in device software before control returns back to the host.
|
||||||
|
/// Several cycles may occur during each epoch.
|
||||||
|
ulong epoch;
|
||||||
|
|
||||||
|
/// Accumulates the number of tokens produced by the task. Several tokens
|
||||||
|
/// may be produced each epoch, but currently only one token is produced
|
||||||
|
/// each cycle.
|
||||||
|
ulong produced;
|
||||||
|
|
||||||
|
/// Accumulates the number tokens witnessed by the task. The number of
|
||||||
|
/// tokens in the context for each cycle is counted as witnessed.
|
||||||
|
ulong witnessed;
|
||||||
|
|
||||||
|
/// Accumulates time in microseconds elapsed for the task.
|
||||||
|
ulong elapsed;
|
||||||
|
|
||||||
|
/// PRNG xoshiro256 state. This is the de facto random seed which can be
|
||||||
|
/// set before cycle entry by the host. It is updated by device software
|
||||||
|
/// when used.
|
||||||
|
ulong rand[4];
|
||||||
|
|
||||||
|
/// Updated by the host with the value of the timestamp register as sampled
|
||||||
|
/// immediately before each transfer of control to the device.
|
||||||
|
ulong host_tsc;
|
||||||
|
|
||||||
|
/// State counters for the accept/error sequence codes.
|
||||||
|
uint accept_seq[4], error_seq[4];
|
||||||
|
|
||||||
|
/// The token buffer starts at offset 2048 and continues to the end of
|
||||||
|
/// the page; options specify the size of the tokens buffer in tokens.
|
||||||
|
/// Additional pages must be attached for larger buffer sizes.
|
||||||
|
ushort token[] __attribute__((aligned(2048)));
|
||||||
|
}
|
||||||
|
__attribute__((aligned(4096)));
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
/// Task Context
|
||||||
|
///
|
||||||
|
/// State for a task.
|
||||||
struct ircd::gpt::task
|
struct ircd::gpt::task
|
||||||
{
|
{
|
||||||
enum status :char;
|
enum status :char;
|
||||||
|
@ -20,36 +89,16 @@ struct ircd::gpt::task
|
||||||
/// Reference to the attached options.
|
/// Reference to the attached options.
|
||||||
const gpt::opts *opts {nullptr};
|
const gpt::opts *opts {nullptr};
|
||||||
|
|
||||||
|
/// Reference to control pages.
|
||||||
|
struct ircd_gpt_task *ctrl {nullptr};
|
||||||
|
|
||||||
/// Current task status.
|
/// Current task status.
|
||||||
enum status status {'\0'};
|
enum status status {'\0'};
|
||||||
|
|
||||||
/// State counters for the accept codes specified in the options.
|
task(const gpt::opts * = nullptr,
|
||||||
uint8_t accept_seq[3] {0};
|
struct ircd_gpt_task * = nullptr);
|
||||||
|
|
||||||
/// State counters for the error codes specified in the options.
|
~task() noexcept;
|
||||||
uint8_t error_seq[3] {0};
|
|
||||||
|
|
||||||
/// Accumulates the number of executions by the user. Each call to the
|
|
||||||
/// interface is an execution.
|
|
||||||
uint64_t epoch {0};
|
|
||||||
|
|
||||||
/// Accumulates the number of tokens produced by the task. Several tokens
|
|
||||||
/// may be produced each epoch.
|
|
||||||
uint64_t produced {0};
|
|
||||||
|
|
||||||
/// Accumulates the number tokens witnessed by the task. The number of
|
|
||||||
/// tokens in the context for each produced token is counted as witnessed.
|
|
||||||
uint64_t witnessed {0};
|
|
||||||
|
|
||||||
/// Accumulates the number of CPU reference cycles consumed by the task.
|
|
||||||
/// This counter does not reflect time when the task is queued or waiting
|
|
||||||
/// or offloaded to a co-processor/accelerator.
|
|
||||||
uint64_t cycles {0};
|
|
||||||
|
|
||||||
/// Accumulates the total time in milliseconds over all executions of the
|
|
||||||
/// task. This counter reflects total wall-clock time of all phases of
|
|
||||||
/// the execution.
|
|
||||||
milliseconds time {0ms};
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/// The current status of a task is indicated with intelligible characters
|
/// The current status of a task is indicated with intelligible characters
|
||||||
|
@ -61,3 +110,7 @@ enum ircd::gpt::task::status
|
||||||
ACCEPT = 'A', ///< Execution completed successfully.
|
ACCEPT = 'A', ///< Execution completed successfully.
|
||||||
ERROR = 'E', ///< Execution did not complete successfully.
|
ERROR = 'E', ///< Execution did not complete successfully.
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static_assert(sizeof(struct ircd_gpt_task) == 4096);
|
||||||
|
static_assert(std::is_standard_layout<struct ircd_gpt_task>::value);
|
||||||
|
#endif
|
||||||
|
|
|
@ -9,78 +9,46 @@
|
||||||
// full license for this software is available in the LICENSE file.
|
// full license for this software is available in the LICENSE file.
|
||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
#define HAVE_IRCD_GPT_PIPE_CTRL_H
|
#ifdef __OPENCL_C_VERSION__
|
||||||
|
#define HAVE_IRCD_GPT_TOKEN_H
|
||||||
|
|
||||||
struct ctor_ctrl
|
union ircd_gpt_token
|
||||||
{
|
|
||||||
long call;
|
|
||||||
ulong pc;
|
|
||||||
ulong tokens;
|
|
||||||
ulong magic;
|
|
||||||
uchar pad[1024 - 32];
|
|
||||||
|
|
||||||
union
|
|
||||||
{
|
|
||||||
char str[3072];
|
|
||||||
ushort token[1536];
|
|
||||||
}
|
|
||||||
body;
|
|
||||||
}
|
|
||||||
__attribute__((aligned(4096)));
|
|
||||||
|
|
||||||
struct ctor_opts
|
|
||||||
{
|
|
||||||
uchar pad[4096];
|
|
||||||
}
|
|
||||||
__attribute__((aligned(4096)));
|
|
||||||
|
|
||||||
#ifndef __OPENCL_C_VERSION__
|
|
||||||
static_assert(sizeof(struct ctor_ctrl) == 4096);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef __OPENCL_C_VERSION__
|
|
||||||
static_assert(sizeof(struct ctor_opts) == 4096);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef __cplusplus
|
|
||||||
|
|
||||||
union token
|
|
||||||
{
|
{
|
||||||
float
|
float
|
||||||
word[768],
|
word[768],
|
||||||
attn[12][64];
|
attn[12][64];
|
||||||
};
|
};
|
||||||
|
|
||||||
union tokenv
|
union ircd_gpt_tokenv
|
||||||
{
|
{
|
||||||
float4
|
float4
|
||||||
word[768/4],
|
word[768/4],
|
||||||
attn[12][64/4];
|
attn[12][64/4];
|
||||||
};
|
};
|
||||||
|
|
||||||
struct qkv
|
struct ircd_gpt_qkv
|
||||||
{
|
{
|
||||||
union token
|
union ircd_gpt_tokenv
|
||||||
qry,
|
qry,
|
||||||
key,
|
key,
|
||||||
val;
|
val;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct qkvv
|
struct ircd_gpt_qkvv
|
||||||
{
|
{
|
||||||
union tokenv
|
union ircd_gpt_tokenv
|
||||||
qry,
|
qry,
|
||||||
key,
|
key,
|
||||||
val;
|
val;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct attn_mask
|
struct ircd_gpt_attn_mask
|
||||||
{
|
{
|
||||||
bool
|
bool
|
||||||
token[1024];
|
token[1024];
|
||||||
};
|
};
|
||||||
|
|
||||||
union aperature
|
union ircd_gpt_aperature
|
||||||
{
|
{
|
||||||
float
|
float
|
||||||
word[768],
|
word[768],
|
||||||
|
@ -90,7 +58,7 @@ union aperature
|
||||||
attn[12][64];
|
attn[12][64];
|
||||||
};
|
};
|
||||||
|
|
||||||
union aperaturev
|
union ircd_gpt_aperaturev
|
||||||
{
|
{
|
||||||
float4
|
float4
|
||||||
word[768/4],
|
word[768/4],
|
224
ircd/gpt.cc
224
ircd/gpt.cc
|
@ -34,30 +34,22 @@ namespace ircd::gpt
|
||||||
|
|
||||||
static f32
|
static f32
|
||||||
logit alignas(64) [65536],
|
logit alignas(64) [65536],
|
||||||
|
embeds alignas(64) [1024 * 768],
|
||||||
scratch alignas(64) [1024 * 768];
|
scratch alignas(64) [1024 * 768];
|
||||||
}
|
}
|
||||||
|
|
||||||
namespace ircd::gpt
|
|
||||||
{
|
|
||||||
extern void transform(ctor_ctrl &, const ctor_opts &);
|
|
||||||
}
|
|
||||||
|
|
||||||
decltype(ircd::gpt::log)
|
decltype(ircd::gpt::log)
|
||||||
ircd::gpt::log
|
ircd::gpt::log
|
||||||
{
|
{
|
||||||
"gpt"
|
"gpt"
|
||||||
};
|
};
|
||||||
|
|
||||||
decltype(ircd::gpt::default_opts)
|
|
||||||
ircd::gpt::default_opts;
|
|
||||||
|
|
||||||
ircd::string_view
|
ircd::string_view
|
||||||
ircd::gpt::generate(const mutable_buffer &out,
|
ircd::gpt::generate(const mutable_buffer &out,
|
||||||
const string_view &in,
|
const string_view &in,
|
||||||
const opts *opts,
|
task &task)
|
||||||
task *task)
|
|
||||||
{
|
{
|
||||||
u16 buf[2][256];
|
u16 buf[2][1024];
|
||||||
const auto input_tokens
|
const auto input_tokens
|
||||||
{
|
{
|
||||||
vocab::tokenize(buf[0], in)
|
vocab::tokenize(buf[0], in)
|
||||||
|
@ -65,7 +57,7 @@ ircd::gpt::generate(const mutable_buffer &out,
|
||||||
|
|
||||||
const auto output_tokens
|
const auto output_tokens
|
||||||
{
|
{
|
||||||
generate(buf[1], input_tokens, opts, task)
|
generate(buf[1], input_tokens, task)
|
||||||
};
|
};
|
||||||
|
|
||||||
const auto output
|
const auto output
|
||||||
|
@ -79,13 +71,92 @@ ircd::gpt::generate(const mutable_buffer &out,
|
||||||
ircd::vector_view<ircd::u16>
|
ircd::vector_view<ircd::u16>
|
||||||
ircd::gpt::generate(const vector_view<u16> &out,
|
ircd::gpt::generate(const vector_view<u16> &out,
|
||||||
const vector_view<const u16> &in,
|
const vector_view<const u16> &in,
|
||||||
const opts *opts,
|
task &task)
|
||||||
task *task)
|
|
||||||
{
|
{
|
||||||
|
assert(task.ctrl);
|
||||||
|
assert(task.opts);
|
||||||
|
|
||||||
|
uint ret(0);
|
||||||
|
bool halt(false);
|
||||||
|
|
||||||
|
const auto &opts(*task.opts);
|
||||||
|
auto &ctrl(*task.ctrl);
|
||||||
|
auto &errc(ctrl.error_seq);
|
||||||
|
auto &accc(ctrl.accept_seq);
|
||||||
|
ctrl.tokens = in.size();
|
||||||
|
|
||||||
|
const size_t tmax
|
||||||
|
{
|
||||||
|
in.size() + opts.limit
|
||||||
|
};
|
||||||
|
|
||||||
|
const vector_view<f32> accum
|
||||||
|
{
|
||||||
|
gpt::scratch, tmax * 768
|
||||||
|
};
|
||||||
|
|
||||||
|
const vector_view<f32> embeds
|
||||||
|
{
|
||||||
|
gpt::embeds, tmax * 768
|
||||||
|
};
|
||||||
|
|
||||||
|
for(uint j(0); j < in.size(); ++j)
|
||||||
|
{
|
||||||
|
const vector_view<f32> dst
|
||||||
|
{
|
||||||
|
data(embeds) + j * 768, 768
|
||||||
|
};
|
||||||
|
|
||||||
|
if(ircd::cl::enable)
|
||||||
|
ctrl.token[j] = in[j];
|
||||||
|
else
|
||||||
|
embed(data(dst), in[j], j, opts);
|
||||||
|
|
||||||
|
static char dbuf[512] {0};
|
||||||
|
char report[1536] {0};
|
||||||
|
char tmbuf[1][64] {{0}};
|
||||||
|
const size_t report_size = snprintf
|
||||||
|
(
|
||||||
|
report, sizeof(report),
|
||||||
|
"%-2u -- %-3u [%5u] --- --- %s 0 0 | %8s",
|
||||||
|
j,
|
||||||
|
ctrl.tokens,
|
||||||
|
ctrl.token[j],
|
||||||
|
vocab::debug(dbuf, ctrl.token[j]).c_str(),
|
||||||
|
pretty(tmbuf[0], milliseconds(ctrl.elapsed), 1).c_str()
|
||||||
|
);
|
||||||
|
|
||||||
|
log::info
|
||||||
|
{
|
||||||
|
log, "%s",
|
||||||
|
string_view{report, report_size}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t cycles(0);
|
||||||
|
milliseconds last_time {0};
|
||||||
|
util::timer stopwatch;
|
||||||
|
{
|
||||||
|
const prof::scope_cycles task_cycles
|
||||||
|
{
|
||||||
|
cycles
|
||||||
|
};
|
||||||
|
|
||||||
|
generate(task);
|
||||||
|
}
|
||||||
|
last_time = stopwatch.at<milliseconds>();
|
||||||
|
ctrl.elapsed += last_time.count();
|
||||||
|
|
||||||
|
/*
|
||||||
|
coil(data(scratch), tokens, *opts.model);
|
||||||
|
tail(logit, data(last_embed), *opts.model);
|
||||||
|
out[i] = argmax(logit, *opts);
|
||||||
|
*/
|
||||||
|
|
||||||
uint accc_thresh[3] {3, 3, 3};
|
uint accc_thresh[3] {3, 3, 3};
|
||||||
for(uint i(0); i < 3; ++i)
|
for(uint i(0); i < 3; ++i)
|
||||||
for(uint j(3); j > 0; --j)
|
for(uint j(3); j > 0; --j)
|
||||||
if(opts->accept_code[i][j - 1] == -1U)
|
if(opts.accept_code[i][j - 1] == -1U)
|
||||||
--accc_thresh[i];
|
--accc_thresh[i];
|
||||||
else
|
else
|
||||||
break;
|
break;
|
||||||
|
@ -93,99 +164,22 @@ ircd::gpt::generate(const vector_view<u16> &out,
|
||||||
uint errc_thresh[3] {3, 3, 3};
|
uint errc_thresh[3] {3, 3, 3};
|
||||||
for(uint i(0); i < 3; ++i)
|
for(uint i(0); i < 3; ++i)
|
||||||
for(uint j(3); j > 0; --j)
|
for(uint j(3); j > 0; --j)
|
||||||
if(opts->error_code[i][j - 1] == -1U)
|
if(opts.error_code[i][j - 1] == -1U)
|
||||||
--errc_thresh[i];
|
--errc_thresh[i];
|
||||||
else
|
else
|
||||||
break;
|
break;
|
||||||
|
|
||||||
uint ret(0);
|
for(auto &j(ret); j + in.size() < ctrl.tokens && j < out.size() && !halt; ++j)
|
||||||
bool halt(false);
|
|
||||||
auto &errc(task->error_seq);
|
|
||||||
auto &accc(task->accept_seq);
|
|
||||||
for(uint i(0); !halt && i < out.size() && ret < opts->limit; ++i)
|
|
||||||
{
|
{
|
||||||
ctor_ctrl ctrl alignas(4096) {0};
|
out[j] = ctrl.token[(in.size() + j + ctrl.head) % opts.buffer_tokens];
|
||||||
ctrl.pc = 1;
|
|
||||||
|
|
||||||
const size_t tokens
|
|
||||||
{
|
|
||||||
in.size() + i
|
|
||||||
};
|
|
||||||
|
|
||||||
const vector_view<f32> scratch
|
|
||||||
{
|
|
||||||
gpt::scratch, tokens * 768
|
|
||||||
};
|
|
||||||
|
|
||||||
for(uint j(0); j < in.size(); ++j)
|
|
||||||
{
|
|
||||||
const vector_view<f32> dst
|
|
||||||
{
|
|
||||||
data(scratch) + j * 768, 768
|
|
||||||
};
|
|
||||||
|
|
||||||
if(ircd::cl::enable)
|
|
||||||
ctrl.body.token[ctrl.tokens++] = in[j];
|
|
||||||
else
|
|
||||||
embed(data(dst), in[j], j, *opts);
|
|
||||||
}
|
|
||||||
|
|
||||||
for(uint j(0); j < i; ++j)
|
|
||||||
{
|
|
||||||
const vector_view<f32> dst
|
|
||||||
{
|
|
||||||
data(scratch) + (in.size() + j) * 768, 768
|
|
||||||
};
|
|
||||||
|
|
||||||
if(ircd::cl::enable)
|
|
||||||
ctrl.body.token[ctrl.tokens++] = out[j];
|
|
||||||
else
|
|
||||||
embed(data(dst), out[j], in.size() + j, *opts);
|
|
||||||
}
|
|
||||||
|
|
||||||
assert(!ircd::cl::enable || ctrl.tokens == tokens);
|
|
||||||
const vector_view<f32> last_embed
|
|
||||||
{
|
|
||||||
data(scratch) + (tokens - 1) * 768, 768
|
|
||||||
};
|
|
||||||
|
|
||||||
const auto last_cycl(task->cycles);
|
|
||||||
milliseconds last_time {0};
|
|
||||||
{
|
|
||||||
util::timer stopwatch;
|
|
||||||
const prof::scope_cycles task_cycles
|
|
||||||
{
|
|
||||||
task->cycles
|
|
||||||
};
|
|
||||||
|
|
||||||
if(ircd::cl::enable)
|
|
||||||
{
|
|
||||||
static const ctor_opts opts alignas(4096) {0};
|
|
||||||
|
|
||||||
transform(ctrl, opts);
|
|
||||||
out[i] = ctrl.body.token[ctrl.tokens - 1];
|
|
||||||
assert(ctrl.tokens == tokens + 1);
|
|
||||||
} else {
|
|
||||||
coil(data(scratch), tokens, *opts->model);
|
|
||||||
tail(logit, data(last_embed), *opts->model);
|
|
||||||
out[i] = argmax(logit, *opts);
|
|
||||||
}
|
|
||||||
|
|
||||||
last_time = stopwatch.at<milliseconds>();
|
|
||||||
task->time += last_time;
|
|
||||||
}
|
|
||||||
|
|
||||||
for(uint j(0); j < 3; ++j)
|
for(uint j(0); j < 3; ++j)
|
||||||
errc[j] =
|
errc[j] = opts.error_code[j][errc[j]] == out[j]?
|
||||||
opts->error_code[j][errc[j]] == out[i]?
|
errc[j] + 1: 0;
|
||||||
errc[j] + 1:
|
|
||||||
0;
|
|
||||||
|
|
||||||
for(uint j(0); j < 3; ++j)
|
for(uint j(0); j < 3; ++j)
|
||||||
accc[j] =
|
accc[j] = opts.accept_code[j][accc[j]] == out[j]?
|
||||||
opts->accept_code[j][accc[j]] == out[i]?
|
accc[j] + 1: 0;
|
||||||
accc[j] + 1:
|
|
||||||
0;
|
|
||||||
|
|
||||||
for(uint j(0); j < 3; ++j)
|
for(uint j(0); j < 3; ++j)
|
||||||
halt |= accc_thresh[j] && accc[j] >= accc_thresh[j],
|
halt |= accc_thresh[j] && accc[j] >= accc_thresh[j],
|
||||||
|
@ -194,21 +188,23 @@ ircd::gpt::generate(const vector_view<u16> &out,
|
||||||
static char dbuf[512] {0};
|
static char dbuf[512] {0};
|
||||||
char report[1536] {0};
|
char report[1536] {0};
|
||||||
char tmbuf[4][64] {0};
|
char tmbuf[4][64] {0};
|
||||||
size_t report_size;
|
const size_t bsz(ctrl.tokens - in.size());
|
||||||
report_size = snprintf
|
const size_t report_size = snprintf
|
||||||
(
|
(
|
||||||
report, sizeof(report),
|
report, sizeof(report),
|
||||||
"%-2u %-3u %-3u [%5u] a:%u e:%u %s %8s %8s | %8s",
|
"%-2u %-2u %-3u %-3u %-3u [%5u] a:%u e:%u %s %8s %8s | %8s",
|
||||||
i,
|
j,
|
||||||
|
j + in.size(),
|
||||||
ctrl.tokens,
|
ctrl.tokens,
|
||||||
ret,
|
ctrl.cycle,
|
||||||
out[i],
|
ctrl.epoch,
|
||||||
|
out[j],
|
||||||
accc[0] + accc[1] + accc[2],
|
accc[0] + accc[1] + accc[2],
|
||||||
errc[0] + errc[1] + errc[2],
|
errc[0] + errc[1] + errc[2],
|
||||||
vocab::debug(dbuf, out[i]).c_str(),
|
vocab::debug(dbuf, out[j]).c_str(),
|
||||||
pretty(tmbuf[0], last_time, 1).c_str(),
|
pretty(tmbuf[0], milliseconds(last_time / bsz), 1).c_str(),
|
||||||
pretty(tmbuf[1], si(last_cycl), 1).c_str(),
|
pretty(tmbuf[1], si(cycles / bsz), 1).c_str(),
|
||||||
pretty(tmbuf[2], task->time, 1).c_str()
|
pretty(tmbuf[2], milliseconds(ctrl.elapsed), 1).c_str()
|
||||||
);
|
);
|
||||||
|
|
||||||
log::info
|
log::info
|
||||||
|
@ -216,24 +212,22 @@ ircd::gpt::generate(const vector_view<u16> &out,
|
||||||
log, "%s",
|
log, "%s",
|
||||||
string_view{report, report_size}
|
string_view{report, report_size}
|
||||||
};
|
};
|
||||||
|
|
||||||
++ret;
|
|
||||||
ctx::yield();
|
|
||||||
ctx::interruption_point();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ret = ctrl.tokens - in.size();
|
||||||
for(uint i(0); i < 3; ++i)
|
for(uint i(0); i < 3; ++i)
|
||||||
if(accc_thresh[i] && task->accept_seq[i] >= accc_thresh[i])
|
if(accc_thresh[i] && ctrl.accept_seq[i] >= accc_thresh[i])
|
||||||
{
|
{
|
||||||
ret -= (3 - accc_thresh[i]);
|
ret -= (3 - accc_thresh[i]);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
else if(errc_thresh[i] && task->error_seq[i] >= errc_thresh[i])
|
else if(errc_thresh[i] && ctrl.error_seq[i] >= errc_thresh[i])
|
||||||
{
|
{
|
||||||
ret -= (3 - errc_thresh[i]);
|
ret -= (3 - errc_thresh[i]);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ctx::interruption_point();
|
||||||
return vector_view<u16>
|
return vector_view<u16>
|
||||||
{
|
{
|
||||||
out, ret
|
out, ret
|
||||||
|
|
581
ircd/gpt_cl.cl
581
ircd/gpt_cl.cl
|
@ -8,129 +8,27 @@
|
||||||
// copyright notice and this permission notice is present in all copies. The
|
// copyright notice and this permission notice is present in all copies. The
|
||||||
// full license for this software is available in the LICENSE file.
|
// full license for this software is available in the LICENSE file.
|
||||||
|
|
||||||
inline void
|
|
||||||
ctor_local_bcast_ldr(__local float4 *const out,
|
|
||||||
const uint ln,
|
|
||||||
const uint li)
|
|
||||||
{
|
|
||||||
for(uint stride = 1; stride < ln; stride <<= 1)
|
|
||||||
{
|
|
||||||
if(li < stride)
|
|
||||||
out[li + stride] = out[li];
|
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void
|
inline void
|
||||||
ctor_local_reduce_add_ldr(__local float4 *const out,
|
ircd_gpt_norm_fmad(__local float4 *const out,
|
||||||
const uint ln,
|
__local const float4 *const in,
|
||||||
const uint li)
|
__global const float4 *const restrict bias,
|
||||||
{
|
__global const float4 *const restrict weight,
|
||||||
for(uint stride = ln >> 1; stride > 0; stride >>= 1)
|
const uint i)
|
||||||
{
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
|
|
||||||
if(li < stride)
|
|
||||||
out[li] += out[li + stride];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void
|
|
||||||
ctor_local_reduce_max_ldr(__local float *const out,
|
|
||||||
const uint ln,
|
|
||||||
const uint li)
|
|
||||||
{
|
|
||||||
for(uint stride = ln >> 1; stride > 0; stride >>= 1)
|
|
||||||
{
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
|
|
||||||
if(li < stride)
|
|
||||||
out[li] = max(out[li], out[li + stride]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void
|
|
||||||
ctor_local_reduce_tournament_ldr(__local float *const best,
|
|
||||||
__local ushort *const idx,
|
|
||||||
const uint ln,
|
|
||||||
const uint li)
|
|
||||||
{
|
|
||||||
for(uint stride = ln >> 1; stride > 0; stride >>= 1)
|
|
||||||
{
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
|
|
||||||
if(li < stride && best[li] < best[li + stride])
|
|
||||||
{
|
|
||||||
best[li] = best[li + stride];
|
|
||||||
idx[li] = idx[li + stride];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void
|
|
||||||
ctor_mean(__local float4 *const restrict out,
|
|
||||||
__local const float4 *const restrict in,
|
|
||||||
const uint num,
|
|
||||||
const uint i)
|
|
||||||
{
|
|
||||||
out[i] = in[i];
|
|
||||||
ctor_local_reduce_add_ldr(out, num, i);
|
|
||||||
|
|
||||||
float numerator = 0.0f;
|
|
||||||
float4 numeratorv = out[i];
|
|
||||||
for(uint k = 0; k < 4; ++k)
|
|
||||||
numerator += numeratorv[k];
|
|
||||||
|
|
||||||
out[i] = numerator;
|
|
||||||
ctor_local_bcast_ldr(out, num, i);
|
|
||||||
|
|
||||||
numeratorv = out[i];
|
|
||||||
out[i] = numeratorv / (num * 4);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void
|
|
||||||
ctor_norm(__local float4 *const out,
|
|
||||||
__local const float4 *const in,
|
|
||||||
__local float4 *const restrict tmp,
|
|
||||||
const uint num,
|
|
||||||
const uint i)
|
|
||||||
{
|
|
||||||
ctor_mean(tmp, in, num, i);
|
|
||||||
|
|
||||||
const float4
|
|
||||||
sub_mean = in[i] - tmp[i];
|
|
||||||
|
|
||||||
tmp[i] = pow(sub_mean, 2);
|
|
||||||
ctor_mean(out, tmp, num, i);
|
|
||||||
|
|
||||||
const float4
|
|
||||||
epsilon = 0.00001f,
|
|
||||||
s = sqrt(out[i] + epsilon);
|
|
||||||
|
|
||||||
out[i] = sub_mean / s;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void
|
|
||||||
ctor_norm_fmad(__local float4 *const out,
|
|
||||||
__local const float4 *const in,
|
|
||||||
__global const float4 *const restrict bias,
|
|
||||||
__global const float4 *const restrict weight,
|
|
||||||
const uint i)
|
|
||||||
{
|
{
|
||||||
out[i] = in[i] * weight[i] + bias[i];
|
out[i] = in[i] * weight[i] + bias[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
// Matrix * Vector Multiply/Accumulate
|
// Matrix * Vector Multiply/Accumulate
|
||||||
inline void
|
inline void
|
||||||
ctor_sgemv(__local float4 *const restrict out,
|
ircd_gpt_sgemv(__local float4 *const restrict out,
|
||||||
__local const float4 *const restrict in,
|
__local const float4 *const restrict in,
|
||||||
__global const float4 *const restrict bias,
|
__global const float4 *const restrict bias,
|
||||||
__global const float4 *const restrict weight,
|
__global const float4 *const restrict weight,
|
||||||
const uint width,
|
const uint width,
|
||||||
const uint height,
|
const uint height,
|
||||||
const uint tiles,
|
const uint tiles,
|
||||||
const uint i)
|
const uint i)
|
||||||
{
|
{
|
||||||
const uint seg = height / tiles;
|
const uint seg = height / tiles;
|
||||||
|
|
||||||
|
@ -151,9 +49,9 @@ ctor_sgemv(__local float4 *const restrict out,
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void
|
inline void
|
||||||
ctor_gelu(__local float4 *const out,
|
ircd_gpt_gelu(__local float4 *const out,
|
||||||
__local const float4 *const in_,
|
__local const float4 *const in_,
|
||||||
const uint i)
|
const uint i)
|
||||||
{
|
{
|
||||||
float4 a,
|
float4 a,
|
||||||
in = in_[i];
|
in = in_[i];
|
||||||
|
@ -178,14 +76,15 @@ ctor_gelu(__local float4 *const out,
|
||||||
//
|
//
|
||||||
|
|
||||||
__kernel void
|
__kernel void
|
||||||
ctor_attn_fcon(__global const struct ctor_ctrl *const ctrl,
|
ircd_gpt_ffnn(__global const struct ircd_gpt_task *const ctrl,
|
||||||
__constant const struct ctor_opts *const opts,
|
__constant const struct ircd_gpt_opts *const opts,
|
||||||
__global union aperaturev *const restrict out,
|
__global union ircd_gpt_tokenv *const restrict accum,
|
||||||
__global const union tokenv *const restrict in,
|
__global const float4 *const restrict norm_bias,
|
||||||
__global const float4 *const restrict norm_bias,
|
__global const float4 *const restrict norm_weight,
|
||||||
__global const float4 *const restrict norm_weight,
|
__global const float4 *const restrict fcon_bias,
|
||||||
__global const float4 *const restrict fcon_bias,
|
__global const float4 *const restrict fcon_weight,
|
||||||
__global const float4 *const restrict fcon_weight)
|
__global const float4 *const restrict proj_bias,
|
||||||
|
__global const float4 *const restrict proj_weight)
|
||||||
{
|
{
|
||||||
const uint
|
const uint
|
||||||
gi = get_global_id(0),
|
gi = get_global_id(0),
|
||||||
|
@ -195,31 +94,39 @@ ctor_attn_fcon(__global const struct ctor_ctrl *const ctrl,
|
||||||
wi = get_group_id(0),
|
wi = get_group_id(0),
|
||||||
wn = get_num_groups(0);
|
wn = get_num_groups(0);
|
||||||
|
|
||||||
__local union aperaturev token;
|
__local union ircd_gpt_aperaturev token;
|
||||||
__local float4 tmp[768/4];
|
__local float4 tmp[768/4];
|
||||||
|
|
||||||
token.word[li] = in[wi].word[li];
|
// Fetch local copy of the global accumulator. We operate on a cached
|
||||||
|
// copy as input, and add our output to the global upon completion.
|
||||||
|
token.word[li] = accum[wi].word[li];
|
||||||
|
|
||||||
// Layer re-normalization
|
// Layer re-normalization
|
||||||
ctor_norm(token.word, token.word, tmp, ln, li);
|
ircd_simt_math_norm_f4lldr(token.word, token.word, tmp, ln, li);
|
||||||
ctor_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
|
ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
|
||||||
|
|
||||||
// Fully connected
|
// Fully connected
|
||||||
for(uint i = 0; i < 3; ++i)
|
for(uint i = 0; i < 4; ++i)
|
||||||
ctor_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 2304/4, 768/4, 4, i * ln + li);
|
ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 3072/4, 768/4, 4, i * ln + li);
|
||||||
|
|
||||||
// Export queries, keys, and values.
|
// Gaussian Error Linear Unit
|
||||||
for(uint i = 0; i < 3; ++i)
|
for(uint i = 0; i < 4; ++i)
|
||||||
out[wi].proj[i][li] = token.proj[i][li];
|
ircd_gpt_gelu(token.fcon, token.fcon, i * ln + li);
|
||||||
|
|
||||||
|
// Projection
|
||||||
|
ircd_gpt_sgemv(tmp, token.fcon, proj_bias, proj_weight, 768/4, 3072/4, 4, li);
|
||||||
|
|
||||||
|
// Accumulation; end of layer
|
||||||
|
accum[wi].word[li] += tmp[li];
|
||||||
}
|
}
|
||||||
|
|
||||||
__kernel void
|
__kernel void
|
||||||
ctor_attn_proj(__global const struct ctor_ctrl *const ctrl,
|
ircd_gpt_attn_proj(__global const struct ircd_gpt_task *const ctrl,
|
||||||
__constant const struct ctor_opts *const opts,
|
__constant const struct ircd_gpt_opts *const opts,
|
||||||
__global union tokenv *const restrict accum,
|
__global union ircd_gpt_tokenv *const restrict accum,
|
||||||
__global const union tokenv *const restrict xattn,
|
__local const union ircd_gpt_tokenv *const restrict xattn,
|
||||||
__global const float4 *const restrict proj_bias,
|
__global const float4 *const restrict proj_bias,
|
||||||
__global const float4 *const restrict proj_weight)
|
__global const float4 *const restrict proj_weight)
|
||||||
{
|
{
|
||||||
const uint
|
const uint
|
||||||
gi = get_global_id(0),
|
gi = get_global_id(0),
|
||||||
|
@ -234,25 +141,24 @@ ctor_attn_proj(__global const struct ctor_ctrl *const ctrl,
|
||||||
out[768/4];
|
out[768/4];
|
||||||
|
|
||||||
// Fetch
|
// Fetch
|
||||||
in[li] = xattn[wi].word[li];
|
in[li] = xattn->word[li];
|
||||||
|
|
||||||
|
// Need this here if xattn is __local
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
// Projection
|
// Projection
|
||||||
ctor_sgemv(out, in, proj_bias, proj_weight, 768/4, 768/4, 1, li);
|
ircd_gpt_sgemv(out, in, proj_bias, proj_weight, 768/4, 768/4, 1, li);
|
||||||
|
|
||||||
// Accumulation; end of layer
|
// Accumulation; end of layer
|
||||||
accum[wi].word[li] += out[li];
|
accum[wi].word[li] += out[li];
|
||||||
}
|
}
|
||||||
|
|
||||||
__kernel void
|
__kernel void
|
||||||
ctor_ffnn(__global const struct ctor_ctrl *const ctrl,
|
ircd_gpt_attn_self(__global const struct ircd_gpt_task *const ctrl,
|
||||||
__constant const struct ctor_opts *const opts,
|
__constant const struct ircd_gpt_opts *const opts,
|
||||||
__global union tokenv *const restrict accum,
|
__local union ircd_gpt_tokenv *const restrict out,
|
||||||
__global const float4 *const restrict norm_bias,
|
__global const struct ircd_gpt_qkvv *const restrict token,
|
||||||
__global const float4 *const restrict norm_weight,
|
__global const struct ircd_gpt_attn_mask *const restrict mask) // [1024][1024],
|
||||||
__global const float4 *const restrict fcon_bias,
|
|
||||||
__global const float4 *const restrict fcon_weight,
|
|
||||||
__global const float4 *const restrict proj_bias,
|
|
||||||
__global const float4 *const restrict proj_weight)
|
|
||||||
{
|
{
|
||||||
const uint
|
const uint
|
||||||
gi = get_global_id(0),
|
gi = get_global_id(0),
|
||||||
|
@ -262,96 +168,13 @@ ctor_ffnn(__global const struct ctor_ctrl *const ctrl,
|
||||||
wi = get_group_id(0),
|
wi = get_group_id(0),
|
||||||
wn = get_num_groups(0);
|
wn = get_num_groups(0);
|
||||||
|
|
||||||
__local union aperaturev token;
|
__local union
|
||||||
__local float4 tmp[768/4];
|
|
||||||
|
|
||||||
// Fetch local copy of the global accumulator. We operate on a cached
|
|
||||||
// copy as input, and add our output to the global upon completion.
|
|
||||||
token.word[li] = accum[wi].word[li];
|
|
||||||
|
|
||||||
// Layer re-normalization
|
|
||||||
ctor_norm(token.word, token.word, tmp, ln, li);
|
|
||||||
ctor_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
|
|
||||||
|
|
||||||
// Fully connected
|
|
||||||
for(uint i = 0; i < 4; ++i)
|
|
||||||
ctor_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 3072/4, 768/4, 4, i * ln + li);
|
|
||||||
|
|
||||||
// Gaussian Error Linear Unit
|
|
||||||
for(uint i = 0; i < 4; ++i)
|
|
||||||
ctor_gelu(token.fcon, token.fcon, i * ln + li);
|
|
||||||
|
|
||||||
// Projection
|
|
||||||
ctor_sgemv(tmp, token.fcon, proj_bias, proj_weight, 768/4, 3072/4, 4, li);
|
|
||||||
|
|
||||||
// Accumulation; end of layer
|
|
||||||
accum[wi].word[li] += tmp[li];
|
|
||||||
}
|
|
||||||
|
|
||||||
__kernel void
|
|
||||||
ctor_backend(__global const struct ctor_ctrl *const ctrl,
|
|
||||||
__constant const struct ctor_opts *const opts,
|
|
||||||
__global union tokenv *const restrict accum,
|
|
||||||
__global const union tokenv *const restrict xattn,
|
|
||||||
__global const float4 *const restrict attn_proj_bias,
|
|
||||||
__global const float4 *const restrict attn_proj_weight,
|
|
||||||
__global const float4 *const restrict ffnn_norm_bias,
|
|
||||||
__global const float4 *const restrict ffnn_norm_weight,
|
|
||||||
__global const float4 *const restrict ffnn_fcon_bias,
|
|
||||||
__global const float4 *const restrict ffnn_fcon_weight,
|
|
||||||
__global const float4 *const restrict ffnn_proj_bias,
|
|
||||||
__global const float4 *const restrict ffnn_proj_weight)
|
|
||||||
{
|
|
||||||
ctor_attn_proj
|
|
||||||
(
|
|
||||||
ctrl,
|
|
||||||
opts,
|
|
||||||
accum,
|
|
||||||
xattn,
|
|
||||||
attn_proj_bias,
|
|
||||||
attn_proj_weight
|
|
||||||
);
|
|
||||||
|
|
||||||
ctor_ffnn
|
|
||||||
(
|
|
||||||
ctrl,
|
|
||||||
opts,
|
|
||||||
accum,
|
|
||||||
ffnn_norm_bias,
|
|
||||||
ffnn_norm_weight,
|
|
||||||
ffnn_fcon_bias,
|
|
||||||
ffnn_fcon_weight,
|
|
||||||
ffnn_proj_bias,
|
|
||||||
ffnn_proj_weight
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// ctrl
|
|
||||||
//
|
|
||||||
|
|
||||||
__kernel void
|
|
||||||
ctor_attn_self(__global const struct ctor_ctrl *const ctrl,
|
|
||||||
__constant const struct ctor_opts *const opts,
|
|
||||||
__global union tokenv *const restrict out,
|
|
||||||
__global const struct qkvv *const restrict token,
|
|
||||||
__global const struct attn_mask *const restrict mask) // [1024][1024],
|
|
||||||
{
|
|
||||||
__local struct
|
|
||||||
{
|
{
|
||||||
float
|
float
|
||||||
attn[12][32];
|
attn[12][96];
|
||||||
}
|
}
|
||||||
self;
|
self;
|
||||||
|
|
||||||
const uint
|
|
||||||
gi = get_global_id(0),
|
|
||||||
gn = get_global_size(0),
|
|
||||||
li = get_local_id(0),
|
|
||||||
ln = get_local_size(0),
|
|
||||||
wi = get_group_id(0),
|
|
||||||
wn = get_num_groups(0);
|
|
||||||
|
|
||||||
for(uint i = 0; i < wn; ++i)
|
for(uint i = 0; i < wn; ++i)
|
||||||
if(mask[wi].token[i])
|
if(mask[wi].token[i])
|
||||||
self.attn[li][i] = 0.0f;
|
self.attn[li][i] = 0.0f;
|
||||||
|
@ -389,114 +212,167 @@ ctor_attn_self(__global const struct ctor_ctrl *const ctrl,
|
||||||
self.attn[li][i] /= acc;
|
self.attn[li][i] /= acc;
|
||||||
|
|
||||||
for(uint j = 0; j < 64/4; ++j)
|
for(uint j = 0; j < 64/4; ++j)
|
||||||
out[wi].attn[li][j] = 0.0f;
|
out->attn[li][j] = 0.0f;
|
||||||
|
|
||||||
for(uint i = 0; i < wn; ++i)
|
for(uint i = 0; i < wn; ++i)
|
||||||
for(uint j = 0; j < 64/4; ++j)
|
for(uint j = 0; j < 64/4; ++j)
|
||||||
out[wi].attn[li][j] += token[i].val.attn[li][j] * self.attn[li][i];
|
out->attn[li][j] += token[i].val.attn[li][j] * self.attn[li][i];
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
|
||||||
// leads
|
|
||||||
//
|
|
||||||
|
|
||||||
__kernel void
|
__kernel void
|
||||||
ctor_anode0(__global const struct ctor_ctrl *const ctrl,
|
ircd_gpt_attn_fcon(__global const struct ircd_gpt_task *const ctrl,
|
||||||
__constant const struct ctor_opts *const opts,
|
__constant const struct ircd_gpt_opts *const opts,
|
||||||
__global union tokenv *const restrict accum,
|
__global union ircd_gpt_aperaturev *const restrict out,
|
||||||
__global const union tokenv *const restrict pos,
|
__global const union ircd_gpt_tokenv *const restrict in,
|
||||||
__global const union tokenv *const restrict vocab)
|
__global const float4 *const restrict norm_bias,
|
||||||
|
__global const float4 *const restrict norm_weight,
|
||||||
|
__global const float4 *const restrict fcon_bias,
|
||||||
|
__global const float4 *const restrict fcon_weight)
|
||||||
{
|
{
|
||||||
const uint
|
const uint
|
||||||
|
gi = get_global_id(0),
|
||||||
|
gn = get_global_size(0),
|
||||||
li = get_local_id(0),
|
li = get_local_id(0),
|
||||||
wi = get_group_id(0);
|
ln = get_local_size(0),
|
||||||
|
wi = get_group_id(0),
|
||||||
|
wn = get_num_groups(0);
|
||||||
|
|
||||||
const ushort
|
__local union ircd_gpt_aperaturev token;
|
||||||
token = ctrl->body.token[wi];
|
__local float4 tmp[768/4];
|
||||||
|
|
||||||
const float4
|
token.word[li] = in[wi].word[li];
|
||||||
wte = vocab[token].word[li],
|
|
||||||
wpe = pos[wi].word[li];
|
|
||||||
|
|
||||||
accum[wi].word[li] = wte + wpe;
|
// Layer re-normalization
|
||||||
|
ircd_simt_math_norm_f4lldr(token.word, token.word, tmp, ln, li);
|
||||||
|
ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
|
||||||
|
|
||||||
|
// Fully connected
|
||||||
|
for(uint i = 0; i < 3; ++i)
|
||||||
|
ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 2304/4, 768/4, 4, i * ln + li);
|
||||||
|
|
||||||
|
// Export queries, keys, and values.
|
||||||
|
for(uint i = 0; i < 3; ++i)
|
||||||
|
out[wi].proj[i][li] = token.proj[i][li];
|
||||||
}
|
}
|
||||||
|
|
||||||
__kernel void
|
__kernel void
|
||||||
ctor_anode1(__global const struct ctor_ctrl *const ctrl,
|
ircd_gpt_coil(__global const struct ircd_gpt_task *const ctrl,
|
||||||
__constant const struct ctor_opts *const opts,
|
__constant const struct ircd_gpt_opts *const opts,
|
||||||
__global union tokenv *const restrict accum,
|
__global union ircd_gpt_tokenv *const restrict accum,
|
||||||
__global const union tokenv *const restrict pos,
|
__global const struct ircd_gpt_qkvv *const restrict state,
|
||||||
__global const union tokenv *const restrict vocab)
|
__global const struct ircd_gpt_attn_mask *const restrict mask, // [1024][1024],
|
||||||
|
__global const float4 *const restrict attn_proj_bias,
|
||||||
|
__global const float4 *const restrict attn_proj_weight,
|
||||||
|
__global const float4 *const restrict ffnn_norm_bias,
|
||||||
|
__global const float4 *const restrict ffnn_norm_weight,
|
||||||
|
__global const float4 *const restrict ffnn_fcon_bias,
|
||||||
|
__global const float4 *const restrict ffnn_fcon_weight,
|
||||||
|
__global const float4 *const restrict ffnn_proj_bias,
|
||||||
|
__global const float4 *const restrict ffnn_proj_weight)
|
||||||
|
{
|
||||||
|
__local union ircd_gpt_tokenv value;
|
||||||
|
|
||||||
|
ircd_gpt_attn_self
|
||||||
|
(
|
||||||
|
ctrl,
|
||||||
|
opts,
|
||||||
|
&value,
|
||||||
|
state,
|
||||||
|
mask
|
||||||
|
);
|
||||||
|
|
||||||
|
ircd_gpt_attn_proj
|
||||||
|
(
|
||||||
|
ctrl,
|
||||||
|
opts,
|
||||||
|
accum,
|
||||||
|
&value,
|
||||||
|
attn_proj_bias,
|
||||||
|
attn_proj_weight
|
||||||
|
);
|
||||||
|
|
||||||
|
ircd_gpt_ffnn
|
||||||
|
(
|
||||||
|
ctrl,
|
||||||
|
opts,
|
||||||
|
accum,
|
||||||
|
ffnn_norm_bias,
|
||||||
|
ffnn_norm_weight,
|
||||||
|
ffnn_fcon_bias,
|
||||||
|
ffnn_fcon_weight,
|
||||||
|
ffnn_proj_bias,
|
||||||
|
ffnn_proj_weight
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// frontend
|
||||||
|
//
|
||||||
|
|
||||||
|
inline void
|
||||||
|
_ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *const opts,
|
||||||
|
__global union ircd_gpt_tokenv *const restrict out,
|
||||||
|
__global const union ircd_gpt_tokenv *const restrict pos,
|
||||||
|
__global const union ircd_gpt_tokenv *const restrict vocab,
|
||||||
|
const uint out_idx,
|
||||||
|
const uint tok_idx,
|
||||||
|
const uint word_idx)
|
||||||
|
{
|
||||||
|
const ushort
|
||||||
|
token = ctrl->token[(ctrl->head + tok_idx) % opts->buffer_tokens];
|
||||||
|
|
||||||
|
const float4
|
||||||
|
wte = vocab[token].word[word_idx],
|
||||||
|
wpe = pos[tok_idx].word[word_idx];
|
||||||
|
|
||||||
|
out[out_idx].word[word_idx] = wte + wpe;
|
||||||
|
}
|
||||||
|
|
||||||
|
__kernel void
|
||||||
|
ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *const opts,
|
||||||
|
__global union ircd_gpt_tokenv *const restrict accum,
|
||||||
|
__global const union ircd_gpt_tokenv *const restrict pos,
|
||||||
|
__global const union ircd_gpt_tokenv *const restrict vocab)
|
||||||
{
|
{
|
||||||
const uint
|
const uint
|
||||||
li = get_local_id(0);
|
li = get_local_id(0);
|
||||||
|
|
||||||
for(uint i = 0; i < ctrl->tokens; ++i)
|
for(uint i = 0; i < ctrl->tokens; ++i)
|
||||||
{
|
_ircd_gpt_lm_embed(ctrl, opts, accum, pos, vocab, i, i, li);
|
||||||
const ushort
|
|
||||||
token = ctrl->body.token[i];
|
|
||||||
|
|
||||||
const float4
|
|
||||||
wte = vocab[token].word[li],
|
|
||||||
wpe = pos[i].word[li];
|
|
||||||
|
|
||||||
accum[i].word[li] = wte + wpe;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
__kernel void
|
__kernel void
|
||||||
ctor_anode2(__global const struct ctor_ctrl *const ctrl,
|
ircd_gpt_lm_norm(__global const struct ircd_gpt_task *const ctrl,
|
||||||
__constant const struct ctor_opts *const opts,
|
__constant const struct ircd_gpt_opts *const opts,
|
||||||
__global union tokenv *const restrict accum,
|
__global union ircd_gpt_tokenv *const restrict accum,
|
||||||
__global const union tokenv *const restrict pos,
|
__global const float4 *const restrict norm_bias,
|
||||||
__global const union tokenv *const restrict vocab)
|
__global const float4 *const restrict norm_weight)
|
||||||
{
|
|
||||||
const uint
|
|
||||||
gi = get_global_id(0);
|
|
||||||
|
|
||||||
const ushort
|
|
||||||
token = ctrl->body.token[gi];
|
|
||||||
|
|
||||||
for(uint i = 0; i < 768/4; ++i)
|
|
||||||
{
|
|
||||||
const float4
|
|
||||||
wte = vocab[token].word[i],
|
|
||||||
wpe = pos[gi].word[i];
|
|
||||||
|
|
||||||
accum[gi].word[i] = wte + wpe;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
__kernel void
|
|
||||||
ctor_cathode(__global const struct ctor_ctrl *const ctrl,
|
|
||||||
__constant const struct ctor_opts *const opts,
|
|
||||||
__global union tokenv *const restrict accum,
|
|
||||||
__global const float4 *const restrict norm_bias,
|
|
||||||
__global const float4 *const restrict norm_weight)
|
|
||||||
{
|
{
|
||||||
const uint
|
const uint
|
||||||
li = get_local_id(0),
|
li = get_local_id(0),
|
||||||
ln = get_local_size(0),
|
ln = get_local_size(0),
|
||||||
wi = get_global_offset(0) / ln + get_group_id(0);
|
wi = get_global_offset(0) / ln + get_group_id(0);
|
||||||
|
|
||||||
__local union tokenv
|
__local union ircd_gpt_tokenv
|
||||||
token, tmp;
|
token, tmp;
|
||||||
|
|
||||||
token.word[li] = accum[wi].word[li];
|
token.word[li] = accum[wi].word[li];
|
||||||
|
|
||||||
// Final re-normalization
|
// Final re-normalization
|
||||||
ctor_norm(token.word, token.word, tmp.word, ln, li);
|
ircd_simt_math_norm_f4lldr(token.word, token.word, tmp.word, ln, li);
|
||||||
ctor_norm_fmad(token.word, token.word, norm_bias, norm_weight, li);
|
ircd_gpt_norm_fmad(token.word, token.word, norm_bias, norm_weight, li);
|
||||||
|
|
||||||
accum[0].word[li] = token.word[li];
|
accum[0].word[li] = token.word[li];
|
||||||
}
|
}
|
||||||
|
|
||||||
__kernel void
|
__kernel void
|
||||||
ctor_lmhead(__global const struct ctor_ctrl *const ctrl,
|
ircd_gpt_lm_logit(__global const struct ircd_gpt_task *const ctrl,
|
||||||
__constant const struct ctor_opts *const opts,
|
__constant const struct ircd_gpt_opts *const opts,
|
||||||
__global float *const restrict logit,
|
__global float *const restrict logit,
|
||||||
__global const union tokenv *const restrict accum,
|
__global const union ircd_gpt_tokenv *const restrict accum,
|
||||||
__global const union tokenv *const restrict token)
|
__global const union ircd_gpt_tokenv *const restrict token)
|
||||||
{
|
{
|
||||||
const uint
|
const uint
|
||||||
gi = get_global_id(0);
|
gi = get_global_id(0);
|
||||||
|
@ -519,10 +395,79 @@ ctor_lmhead(__global const struct ctor_ctrl *const ctrl,
|
||||||
logit[gi] = res;
|
logit[gi] = res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline void
|
||||||
|
ircd_gpt_leave(__global struct ircd_gpt_task *const ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *const opts,
|
||||||
|
const uint li)
|
||||||
|
{
|
||||||
|
// No action for other threads right now
|
||||||
|
if(li != 0)
|
||||||
|
return;
|
||||||
|
|
||||||
|
// Run debug checks and assertions.
|
||||||
|
#ifdef RB_DEBUG
|
||||||
|
if(li == 0 && ctrl->call == IRCD_GPT_ECOMPLETE)
|
||||||
|
if(ctrl->tokens < 2)
|
||||||
|
ctrl->call = IRCD_GPT_ETOKENS;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// If the call value has been set to something other than default we
|
||||||
|
// do nothing else here.
|
||||||
|
if(ctrl->call != IRCD_GPT_ECOMPLETE)
|
||||||
|
return;
|
||||||
|
|
||||||
|
// On the last cycle, with no prior call or error code set, indicate
|
||||||
|
// a nominal exit condition.
|
||||||
|
if(ctrl->cycle + 1 >= opts->limit)
|
||||||
|
{
|
||||||
|
ctrl->call = IRCD_GPT_ACCEPT;
|
||||||
|
ctrl->epoch += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
ctrl->cycle += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void
|
||||||
|
ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *const opts,
|
||||||
|
const uint li,
|
||||||
|
__local const ushort *const restrict idx)
|
||||||
|
{
|
||||||
|
// To read from cells other than idx[0] we need this barrier.
|
||||||
|
if(opts->top_k > 1)
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
// No action for other threads right now
|
||||||
|
if(li != 0)
|
||||||
|
return;
|
||||||
|
|
||||||
|
// When the hypercall code is already set, bail here.
|
||||||
|
if(ctrl->call != IRCD_GPT_ECOMPLETE)
|
||||||
|
return;
|
||||||
|
|
||||||
|
const bool
|
||||||
|
buffer_full = ctrl->tokens >= opts->buffer_tokens;
|
||||||
|
|
||||||
|
const ulong
|
||||||
|
rnd = ircd_simt_rand_xoshiro256pg(ctrl->rand),
|
||||||
|
sel = rnd % max(opts->top_k, 1U);
|
||||||
|
|
||||||
|
const ushort
|
||||||
|
token = idx[sel],
|
||||||
|
token_idx = (ctrl->head + ctrl->tokens) % opts->buffer_tokens;
|
||||||
|
|
||||||
|
ctrl->token[token_idx] = token;
|
||||||
|
|
||||||
|
if(buffer_full)
|
||||||
|
ctrl->head = (ctrl->head + 1) % opts->buffer_tokens;
|
||||||
|
else
|
||||||
|
ctrl->tokens++;
|
||||||
|
}
|
||||||
|
|
||||||
__kernel void
|
__kernel void
|
||||||
ctor_lmamax(__global struct ctor_ctrl *const ctrl,
|
ircd_gpt_lm_select(__global struct ircd_gpt_task *const ctrl,
|
||||||
__constant const struct ctor_opts *const opts,
|
__constant const struct ircd_gpt_opts *const opts,
|
||||||
__global const float *const restrict logit)
|
__global const float *const restrict logit)
|
||||||
{
|
{
|
||||||
const uint
|
const uint
|
||||||
gi = get_global_id(0),
|
gi = get_global_id(0),
|
||||||
|
@ -535,25 +480,13 @@ ctor_lmamax(__global struct ctor_ctrl *const ctrl,
|
||||||
ti = tn * li;
|
ti = tn * li;
|
||||||
|
|
||||||
__local ushort idx[192];
|
__local ushort idx[192];
|
||||||
__local float best[192];
|
|
||||||
|
|
||||||
idx[li] = ti;
|
idx[li] = ti;
|
||||||
for(uint j = ti + 1; j < ti + tn && j < 50257; ++j)
|
for(uint j = ti + 1; j < ti + tn && j < 50257; ++j)
|
||||||
if(logit[j] > logit[idx[li]])
|
if(logit[j] > logit[idx[li]])
|
||||||
idx[li] = j;
|
idx[li] = j;
|
||||||
|
|
||||||
best[li] = logit[idx[li]];
|
ircd_simt_sort_idx16_flldr(idx, logit, ln, li);
|
||||||
ctor_local_reduce_tournament_ldr(best, idx, ln, li);
|
ircd_gpt_lm_result(ctrl, opts, li, idx);
|
||||||
|
ircd_gpt_leave(ctrl, opts, li);
|
||||||
if(li == 0 && ctrl->call == -1)
|
|
||||||
ctrl->body.token[ctrl->tokens++] = idx[li];
|
|
||||||
|
|
||||||
if(li == 0 && ctrl->call == -1)
|
|
||||||
ctrl->call = 1;
|
|
||||||
|
|
||||||
#ifdef RB_DEBUG
|
|
||||||
if(li == 0 && ctrl->call == 1)
|
|
||||||
if(ctrl->tokens < 2)
|
|
||||||
ctrl->call = -2;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -142,7 +142,7 @@ ircd::gpt::model::init_from_cache(const string_view &cache_path)
|
||||||
|
|
||||||
fs::map::opts map_opts;
|
fs::map::opts map_opts;
|
||||||
map_opts.huge2mb = true;
|
map_opts.huge2mb = true;
|
||||||
map_opts.locked = false;
|
map_opts.locked = true;
|
||||||
default_model_shm = fs::map
|
default_model_shm = fs::map
|
||||||
{
|
{
|
||||||
fd, map_opts, sizeof(decoder)
|
fd, map_opts, sizeof(decoder)
|
||||||
|
|
443
ircd/gpt_pipe.cc
443
ircd/gpt_pipe.cc
|
@ -8,21 +8,33 @@
|
||||||
// copyright notice and this permission notice is present in all copies. The
|
// copyright notice and this permission notice is present in all copies. The
|
||||||
// full license for this software is available in the LICENSE file.
|
// full license for this software is available in the LICENSE file.
|
||||||
|
|
||||||
#include <ircd/gpt/pipe/pipe.h>
|
|
||||||
|
|
||||||
namespace ircd::gpt
|
|
||||||
{
|
|
||||||
void transform(ctor_ctrl &, const ctor_opts &);
|
|
||||||
}
|
|
||||||
|
|
||||||
namespace ircd::gpt::pipe
|
namespace ircd::gpt::pipe
|
||||||
{
|
{
|
||||||
static ircd::cl::exec::opts negative_opts, positive_opts, selfattn_opts, cathode_opts, anode_opts,
|
static void profile_dumplog(pipe::exec &);
|
||||||
lmhead_opts, lmamax_opts;
|
|
||||||
|
|
||||||
|
static ircd::cl::exec::opts
|
||||||
|
negative_opts, positive_opts, selfattn_opts,
|
||||||
|
cathode_opts, anode_opts, lmhead_opts, lmamax_opts;
|
||||||
|
|
||||||
|
extern conf::item<size_t> flush_cycles;
|
||||||
|
extern conf::item<size_t> queue_cycles;
|
||||||
extern const ircd::run::changed handle_quit;
|
extern const ircd::run::changed handle_quit;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
decltype(ircd::gpt::pipe::queue_cycles)
|
||||||
|
ircd::gpt::pipe::queue_cycles
|
||||||
|
{
|
||||||
|
{ "name", "ircd.gpt.pipe.queue" },
|
||||||
|
{ "default", 1L, },
|
||||||
|
};
|
||||||
|
|
||||||
|
decltype(ircd::gpt::pipe::flush_cycles)
|
||||||
|
ircd::gpt::pipe::flush_cycles
|
||||||
|
{
|
||||||
|
{ "name", "ircd.gpt.pipe.flush" },
|
||||||
|
{ "default", 0L, },
|
||||||
|
};
|
||||||
|
|
||||||
decltype(ircd::gpt::pipe::default_model)
|
decltype(ircd::gpt::pipe::default_model)
|
||||||
ircd::gpt::pipe::default_model;
|
ircd::gpt::pipe::default_model;
|
||||||
|
|
||||||
|
@ -82,144 +94,233 @@ noexcept
|
||||||
//
|
//
|
||||||
|
|
||||||
void
|
void
|
||||||
ircd::gpt::transform(ctor_ctrl &ctrl,
|
ircd::gpt::generate(task &task)
|
||||||
const ctor_opts &opts)
|
|
||||||
{
|
{
|
||||||
if(unlikely(!pipe::default_model))
|
if(unlikely(!pipe::default_model))
|
||||||
pipe::init();
|
pipe::init();
|
||||||
|
|
||||||
ctrl.call = -1;
|
const auto &opts
|
||||||
pipe::exec
|
|
||||||
{
|
{
|
||||||
ctrl, opts
|
*task.opts
|
||||||
};
|
};
|
||||||
|
|
||||||
|
auto &ctrl
|
||||||
|
{
|
||||||
|
*task.ctrl
|
||||||
|
};
|
||||||
|
|
||||||
|
ctrl.call = IRCD_GPT_ECOMPLETE;
|
||||||
|
ctrl.host_tsc = prof::cycles();
|
||||||
|
size_t cycle(ctrl.cycle);
|
||||||
|
const size_t tokens(ctrl.tokens);
|
||||||
|
|
||||||
|
std::deque<pipe::exec> list;
|
||||||
|
for(; cycle < opts.limit; ++cycle)
|
||||||
|
{
|
||||||
|
// When the release/acquire bits are set the control pages are sent
|
||||||
|
// and received; only set on first and last iterations of this loop.
|
||||||
|
const bool
|
||||||
|
rel(cycle == 0),
|
||||||
|
acq(cycle + 1 >= opts.limit);
|
||||||
|
|
||||||
|
// Enqueue the cycle's commands
|
||||||
|
list.emplace_back
|
||||||
|
(
|
||||||
|
task, tokens + cycle, rel, acq
|
||||||
|
);
|
||||||
|
|
||||||
|
// Conditions for a cl::flush here; this is not default but
|
||||||
|
// may be configured to improve some workloads.
|
||||||
|
const bool flush
|
||||||
|
{
|
||||||
|
// Flushing here is enabled by the configuration
|
||||||
|
pipe::flush_cycles
|
||||||
|
|
||||||
|
// Skip flushing on cycles already performing IO or waiting.
|
||||||
|
&& !rel && !acq && list.size() <= pipe::queue_cycles
|
||||||
|
|
||||||
|
// The configuration item can specify an interval greater than
|
||||||
|
// one between flushes.
|
||||||
|
&& cycle % pipe::flush_cycles == 0
|
||||||
|
};
|
||||||
|
|
||||||
|
if(flush)
|
||||||
|
cl::flush();
|
||||||
|
|
||||||
|
// Enqueue consecutive repetitions of our kernel batch before waiting
|
||||||
|
// on the first; based on the configuration. XXX get from ircd::cl
|
||||||
|
if(list.size() <= pipe::queue_cycles)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// Profiling branch
|
||||||
|
if((false))
|
||||||
|
{
|
||||||
|
auto &ex(list.front());
|
||||||
|
profile_dumplog(ex);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Destructing the front of the queue waits for completion by yielding
|
||||||
|
// this ircd::ctx.
|
||||||
|
list.pop_front();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait for all unfinished
|
||||||
|
list.clear();
|
||||||
|
|
||||||
|
// Interp error codes
|
||||||
if(unlikely(ctrl.call <= 0))
|
if(unlikely(ctrl.call <= 0))
|
||||||
throw error
|
throw error
|
||||||
{
|
{
|
||||||
"hyper (#%d) :%s",
|
"hyper (#%d) :%s",
|
||||||
abs(ctrl.call),
|
abs(int(ctrl.call)),
|
||||||
ctrl.body.str,
|
reflect(ctrl.call),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
always_assert(ctrl.cycle == cycle);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
ircd::gpt::pipe::profile_dumplog(pipe::exec &exec)
|
||||||
|
{
|
||||||
|
constexpr size_t coils
|
||||||
|
{
|
||||||
|
sizeof(exec.coil) / sizeof(cl::exec)
|
||||||
|
};
|
||||||
|
|
||||||
|
for(size_t i(0); i < coils; ++i)
|
||||||
|
{
|
||||||
|
exec.coil[i].wait();
|
||||||
|
const auto &pro
|
||||||
|
{
|
||||||
|
exec.coil[i].profile()
|
||||||
|
};
|
||||||
|
|
||||||
|
char tmbuf[4][32] {{0}};
|
||||||
|
log::logf
|
||||||
|
{
|
||||||
|
log, log::level::DEBUG,
|
||||||
|
"coil:%-2lu %8s %8s %8s %8s\n",
|
||||||
|
i,
|
||||||
|
util::pretty(tmbuf[0], si(pro[0]), 1),
|
||||||
|
util::pretty(tmbuf[1], si(pro[1]), 1),
|
||||||
|
util::pretty(tmbuf[2], si(pro[2]), 1),
|
||||||
|
util::pretty(tmbuf[3], si(pro[3]), 1),
|
||||||
|
};
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// pipe::exec
|
// pipe::exec
|
||||||
//
|
//
|
||||||
|
|
||||||
ircd::gpt::pipe::exec::exec(ctor_ctrl &ctrl,
|
ircd::gpt::pipe::exec::exec(task &task,
|
||||||
const ctor_opts &opts)
|
const size_t tokens,
|
||||||
|
const bool release,
|
||||||
|
const bool acquire)
|
||||||
:desc
|
:desc
|
||||||
{
|
{
|
||||||
default_desc
|
default_desc
|
||||||
}
|
}
|
||||||
,out_ctrl
|
,send_opts
|
||||||
{
|
{
|
||||||
reinterpret_cast<char *>(&ctrl), sizeof(ctor_ctrl)
|
reinterpret_cast<const char *>(task.opts),
|
||||||
|
release? sizeof(struct ircd_gpt_opts): 0
|
||||||
}
|
}
|
||||||
,in_ctrl
|
,send_ctrl
|
||||||
{
|
{
|
||||||
reinterpret_cast<const char *>(&ctrl), sizeof(ctor_ctrl)
|
reinterpret_cast<const char *>(task.ctrl),
|
||||||
|
release? sizeof(struct ircd_gpt_task): 0
|
||||||
}
|
}
|
||||||
,in_opts
|
,recv_ctrl
|
||||||
{
|
{
|
||||||
reinterpret_cast<const char *>(&opts), sizeof(ctor_opts)
|
reinterpret_cast<char *>(task.ctrl),
|
||||||
|
acquire? sizeof(struct ircd_gpt_task): 0
|
||||||
}
|
}
|
||||||
,range_anode
|
,range_lm_embed
|
||||||
{
|
{
|
||||||
{ ctrl.tokens, 0, },
|
{ 1 * 192UL, 0, },
|
||||||
{ 1, 0, },
|
{ 192UL, 0, },
|
||||||
}
|
|
||||||
,range_coil
|
|
||||||
{
|
|
||||||
{ ctrl.tokens * 192UL, 0, },
|
|
||||||
{ 192UL, 0, },
|
|
||||||
}
|
}
|
||||||
,range_negative
|
,range_negative
|
||||||
{
|
{
|
||||||
range_coil
|
{ tokens * 192UL, 0, },
|
||||||
}
|
{ 192UL, 0, },
|
||||||
,range_selfattn
|
|
||||||
{
|
|
||||||
range_coil
|
|
||||||
}
|
}
|
||||||
,range_positive
|
,range_positive
|
||||||
{
|
{
|
||||||
range_coil
|
{ tokens * 192UL, 0, },
|
||||||
|
{ 192UL, 0, },
|
||||||
}
|
}
|
||||||
,range_cathode
|
,range_lm_norm
|
||||||
{
|
{
|
||||||
{ 1 * 192UL, 0 },
|
{ 1 * 192UL, 0 },
|
||||||
{ 192UL, 0 },
|
{ 192UL, 0 },
|
||||||
{ (ctrl.tokens - 1) * 192UL, 0 },
|
{ (tokens - 1) * 192UL, 0 },
|
||||||
}
|
}
|
||||||
,range_lmhead
|
,range_lm_logit
|
||||||
{
|
{
|
||||||
{ 262 * 192UL, 0 }, // align_up(50257) / 192
|
{ 262 * 192UL, 0 }, // align_up(50257) / 192
|
||||||
{ 192UL, 0 },
|
{ 192UL, 0 },
|
||||||
}
|
}
|
||||||
,range_lmamax
|
,range_lm_select
|
||||||
{
|
{
|
||||||
{ 1 * 192UL, 0 },
|
{ 1 * 192UL, 0 },
|
||||||
{ 192UL, 0 },
|
{ 192UL, 0 },
|
||||||
}
|
}
|
||||||
,send
|
,release_opts
|
||||||
{
|
{
|
||||||
{ desc->opts, in_opts },
|
desc->opts, send_opts
|
||||||
{ desc->ctrl, in_ctrl },
|
|
||||||
}
|
}
|
||||||
,tail
|
,release_ctrl
|
||||||
{
|
{
|
||||||
{ desc->anode, range_anode, anode_opts },
|
desc->ctrl, send_ctrl
|
||||||
|
}
|
||||||
|
,lm_embed
|
||||||
|
{
|
||||||
|
desc->lm_embed, range_lm_embed, anode_opts
|
||||||
}
|
}
|
||||||
,coil
|
,coil
|
||||||
{
|
{
|
||||||
{ desc->layer[0x00]->negative, range_negative, negative_opts },
|
{ desc->layer[0x00]->negative, range_negative, negative_opts },
|
||||||
{ desc->layer[0x00]->selfattn, range_selfattn, selfattn_opts },
|
|
||||||
{ desc->layer[0x00]->positive, range_positive, positive_opts },
|
{ desc->layer[0x00]->positive, range_positive, positive_opts },
|
||||||
{ desc->layer[0x01]->negative, range_negative, negative_opts },
|
{ desc->layer[0x01]->negative, range_negative, negative_opts },
|
||||||
{ desc->layer[0x01]->selfattn, range_selfattn, selfattn_opts },
|
|
||||||
{ desc->layer[0x01]->positive, range_positive, positive_opts },
|
{ desc->layer[0x01]->positive, range_positive, positive_opts },
|
||||||
{ desc->layer[0x02]->negative, range_negative, negative_opts },
|
{ desc->layer[0x02]->negative, range_negative, negative_opts },
|
||||||
{ desc->layer[0x02]->selfattn, range_selfattn, selfattn_opts },
|
|
||||||
{ desc->layer[0x02]->positive, range_positive, positive_opts },
|
{ desc->layer[0x02]->positive, range_positive, positive_opts },
|
||||||
{ desc->layer[0x03]->negative, range_negative, negative_opts },
|
{ desc->layer[0x03]->negative, range_negative, negative_opts },
|
||||||
{ desc->layer[0x03]->selfattn, range_selfattn, selfattn_opts },
|
|
||||||
{ desc->layer[0x03]->positive, range_positive, positive_opts },
|
{ desc->layer[0x03]->positive, range_positive, positive_opts },
|
||||||
{ desc->layer[0x04]->negative, range_negative, negative_opts },
|
{ desc->layer[0x04]->negative, range_negative, negative_opts },
|
||||||
{ desc->layer[0x04]->selfattn, range_selfattn, selfattn_opts },
|
|
||||||
{ desc->layer[0x04]->positive, range_positive, positive_opts },
|
{ desc->layer[0x04]->positive, range_positive, positive_opts },
|
||||||
{ desc->layer[0x05]->negative, range_negative, negative_opts },
|
{ desc->layer[0x05]->negative, range_negative, negative_opts },
|
||||||
{ desc->layer[0x05]->selfattn, range_selfattn, selfattn_opts },
|
|
||||||
{ desc->layer[0x05]->positive, range_positive, positive_opts },
|
{ desc->layer[0x05]->positive, range_positive, positive_opts },
|
||||||
{ desc->layer[0x06]->negative, range_negative, negative_opts },
|
{ desc->layer[0x06]->negative, range_negative, negative_opts },
|
||||||
{ desc->layer[0x06]->selfattn, range_selfattn, selfattn_opts },
|
|
||||||
{ desc->layer[0x06]->positive, range_positive, positive_opts },
|
{ desc->layer[0x06]->positive, range_positive, positive_opts },
|
||||||
{ desc->layer[0x07]->negative, range_negative, negative_opts },
|
{ desc->layer[0x07]->negative, range_negative, negative_opts },
|
||||||
{ desc->layer[0x07]->selfattn, range_selfattn, selfattn_opts },
|
|
||||||
{ desc->layer[0x07]->positive, range_positive, positive_opts },
|
{ desc->layer[0x07]->positive, range_positive, positive_opts },
|
||||||
{ desc->layer[0x08]->negative, range_negative, negative_opts },
|
{ desc->layer[0x08]->negative, range_negative, negative_opts },
|
||||||
{ desc->layer[0x08]->selfattn, range_selfattn, selfattn_opts },
|
|
||||||
{ desc->layer[0x08]->positive, range_positive, positive_opts },
|
{ desc->layer[0x08]->positive, range_positive, positive_opts },
|
||||||
{ desc->layer[0x09]->negative, range_negative, negative_opts },
|
{ desc->layer[0x09]->negative, range_negative, negative_opts },
|
||||||
{ desc->layer[0x09]->selfattn, range_selfattn, selfattn_opts },
|
|
||||||
{ desc->layer[0x09]->positive, range_positive, positive_opts },
|
{ desc->layer[0x09]->positive, range_positive, positive_opts },
|
||||||
{ desc->layer[0x0a]->negative, range_negative, negative_opts },
|
{ desc->layer[0x0a]->negative, range_negative, negative_opts },
|
||||||
{ desc->layer[0x0a]->selfattn, range_selfattn, selfattn_opts },
|
|
||||||
{ desc->layer[0x0a]->positive, range_positive, positive_opts },
|
{ desc->layer[0x0a]->positive, range_positive, positive_opts },
|
||||||
{ desc->layer[0x0b]->negative, range_negative, negative_opts },
|
{ desc->layer[0x0b]->negative, range_negative, negative_opts },
|
||||||
{ desc->layer[0x0b]->selfattn, range_selfattn, selfattn_opts },
|
|
||||||
{ desc->layer[0x0b]->positive, range_positive, positive_opts },
|
{ desc->layer[0x0b]->positive, range_positive, positive_opts },
|
||||||
}
|
}
|
||||||
,head
|
,lm_norm
|
||||||
{
|
{
|
||||||
{ desc->cathode, range_cathode, cathode_opts },
|
desc->lm_norm, range_lm_norm, cathode_opts
|
||||||
{ desc->lmhead, range_lmhead, lmhead_opts },
|
|
||||||
{ desc->lmamax, range_lmamax, lmamax_opts },
|
|
||||||
}
|
}
|
||||||
,recv
|
,lm_logit
|
||||||
{
|
{
|
||||||
{ desc->ctrl, out_ctrl },
|
desc->lm_logit, range_lm_logit, lmhead_opts
|
||||||
|
}
|
||||||
|
,lm_select
|
||||||
|
{
|
||||||
|
desc->lm_select, range_lm_select, lmamax_opts
|
||||||
|
}
|
||||||
|
,acquire_ctrl
|
||||||
|
{
|
||||||
|
desc->ctrl, recv_ctrl
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
@ -296,29 +397,14 @@ ircd::gpt::pipe::desc::desc(pipe::code &code,
|
||||||
{
|
{
|
||||||
&code
|
&code
|
||||||
}
|
}
|
||||||
,opts
|
|
||||||
{
|
|
||||||
4_KiB,
|
|
||||||
const_buffer{}
|
|
||||||
}
|
|
||||||
,ctrl
|
|
||||||
{
|
|
||||||
4_KiB,
|
|
||||||
mutable_buffer{}
|
|
||||||
}
|
|
||||||
,state
|
,state
|
||||||
{
|
{
|
||||||
32 * 3 * 768 * sizeof(float),
|
96 * 3 * 768 * sizeof(float),
|
||||||
mutable_buffer{}
|
|
||||||
}
|
|
||||||
,xattn
|
|
||||||
{
|
|
||||||
32 * 1 * 768 * sizeof(float),
|
|
||||||
mutable_buffer{}
|
mutable_buffer{}
|
||||||
}
|
}
|
||||||
,accum
|
,accum
|
||||||
{
|
{
|
||||||
32 * 768 * sizeof(float),
|
96 * 768 * sizeof(float),
|
||||||
mutable_buffer{}
|
mutable_buffer{}
|
||||||
}
|
}
|
||||||
,logit
|
,logit
|
||||||
|
@ -326,16 +412,54 @@ ircd::gpt::pipe::desc::desc(pipe::code &code,
|
||||||
65536 * sizeof(float),
|
65536 * sizeof(float),
|
||||||
mutable_buffer{}
|
mutable_buffer{}
|
||||||
}
|
}
|
||||||
,anode
|
,ctrl
|
||||||
|
{
|
||||||
|
sizeof(struct ircd_gpt_task),
|
||||||
|
mutable_buffer{}
|
||||||
|
}
|
||||||
|
,opts
|
||||||
|
{
|
||||||
|
sizeof(struct ircd_gpt_opts),
|
||||||
|
const_buffer{}
|
||||||
|
}
|
||||||
|
,lm_embed
|
||||||
{
|
{
|
||||||
code,
|
code,
|
||||||
"ctor_anode2",
|
"ircd_gpt_lm_embed",
|
||||||
ctrl,
|
ctrl,
|
||||||
opts,
|
opts,
|
||||||
accum,
|
accum,
|
||||||
model.embed->pos,
|
model.embed->pos,
|
||||||
model.embed->token,
|
model.embed->token,
|
||||||
}
|
}
|
||||||
|
,lm_norm
|
||||||
|
{
|
||||||
|
code,
|
||||||
|
"ircd_gpt_lm_norm",
|
||||||
|
ctrl,
|
||||||
|
opts,
|
||||||
|
accum,
|
||||||
|
model.decode->norm.bias,
|
||||||
|
model.decode->norm.weight,
|
||||||
|
}
|
||||||
|
,lm_logit
|
||||||
|
{
|
||||||
|
code,
|
||||||
|
"ircd_gpt_lm_logit",
|
||||||
|
ctrl,
|
||||||
|
opts,
|
||||||
|
logit,
|
||||||
|
accum,
|
||||||
|
model.embed->token,
|
||||||
|
}
|
||||||
|
,lm_select
|
||||||
|
{
|
||||||
|
code,
|
||||||
|
"ircd_gpt_lm_select",
|
||||||
|
ctrl,
|
||||||
|
opts,
|
||||||
|
logit,
|
||||||
|
}
|
||||||
,layer
|
,layer
|
||||||
{
|
{
|
||||||
std::make_unique<struct desc::layer>(*this, 0x00),
|
std::make_unique<struct desc::layer>(*this, 0x00),
|
||||||
|
@ -351,34 +475,6 @@ ircd::gpt::pipe::desc::desc(pipe::code &code,
|
||||||
std::make_unique<struct desc::layer>(*this, 0x0a),
|
std::make_unique<struct desc::layer>(*this, 0x0a),
|
||||||
std::make_unique<struct desc::layer>(*this, 0x0b),
|
std::make_unique<struct desc::layer>(*this, 0x0b),
|
||||||
}
|
}
|
||||||
,cathode
|
|
||||||
{
|
|
||||||
code,
|
|
||||||
"ctor_cathode",
|
|
||||||
ctrl,
|
|
||||||
opts,
|
|
||||||
accum,
|
|
||||||
model.decode->norm.bias,
|
|
||||||
model.decode->norm.weight,
|
|
||||||
}
|
|
||||||
,lmhead
|
|
||||||
{
|
|
||||||
code,
|
|
||||||
"ctor_lmhead",
|
|
||||||
ctrl,
|
|
||||||
opts,
|
|
||||||
logit,
|
|
||||||
accum,
|
|
||||||
model.embed->token,
|
|
||||||
}
|
|
||||||
,lmamax
|
|
||||||
{
|
|
||||||
code,
|
|
||||||
"ctor_lmamax",
|
|
||||||
ctrl,
|
|
||||||
opts,
|
|
||||||
logit,
|
|
||||||
}
|
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -391,7 +487,7 @@ ircd::gpt::pipe::desc::layer::layer(pipe::desc &desc,
|
||||||
:negative
|
:negative
|
||||||
{
|
{
|
||||||
*desc.code,
|
*desc.code,
|
||||||
"ctor_attn_fcon",
|
"ircd_gpt_attn_fcon",
|
||||||
desc.ctrl,
|
desc.ctrl,
|
||||||
desc.opts,
|
desc.opts,
|
||||||
desc.state,
|
desc.state,
|
||||||
|
@ -401,24 +497,15 @@ ircd::gpt::pipe::desc::layer::layer(pipe::desc &desc,
|
||||||
desc.model->decode->block[laynum].attn.fcon.bias,
|
desc.model->decode->block[laynum].attn.fcon.bias,
|
||||||
desc.model->decode->block[laynum].attn.fcon.weight,
|
desc.model->decode->block[laynum].attn.fcon.weight,
|
||||||
}
|
}
|
||||||
,selfattn
|
|
||||||
{
|
|
||||||
*desc.code,
|
|
||||||
"ctor_attn_self",
|
|
||||||
desc.ctrl,
|
|
||||||
desc.opts,
|
|
||||||
desc.xattn,
|
|
||||||
desc.state,
|
|
||||||
desc.model->decode->block[laynum].attn.mask,
|
|
||||||
}
|
|
||||||
,positive
|
,positive
|
||||||
{
|
{
|
||||||
*desc.code,
|
*desc.code,
|
||||||
"ctor_backend",
|
"ircd_gpt_coil",
|
||||||
desc.ctrl,
|
desc.ctrl,
|
||||||
desc.opts,
|
desc.opts,
|
||||||
desc.accum,
|
desc.accum,
|
||||||
desc.xattn,
|
desc.state,
|
||||||
|
desc.model->decode->block[laynum].attn.mask,
|
||||||
desc.model->decode->block[laynum].attn.proj.bias,
|
desc.model->decode->block[laynum].attn.proj.bias,
|
||||||
desc.model->decode->block[laynum].attn.proj.weight,
|
desc.model->decode->block[laynum].attn.proj.weight,
|
||||||
desc.model->decode->block[laynum].ffnn.norm.bias,
|
desc.model->decode->block[laynum].ffnn.norm.bias,
|
||||||
|
@ -486,23 +573,33 @@ noexcept
|
||||||
//
|
//
|
||||||
|
|
||||||
ircd::gpt::pipe::model::decoder::decoder(const gpt::model::decoder &decoder)
|
ircd::gpt::pipe::model::decoder::decoder(const gpt::model::decoder &decoder)
|
||||||
:block
|
:master
|
||||||
{
|
{
|
||||||
{ decoder.layer[0x00], 0x00, },
|
sizeof(gpt::model::block) * 12 + sizeof(gpt::model::norm), const_buffer
|
||||||
{ decoder.layer[0x01], 0x01, },
|
{
|
||||||
{ decoder.layer[0x02], 0x02, },
|
reinterpret_cast<const char *>(decoder.layer),
|
||||||
{ decoder.layer[0x03], 0x03, },
|
sizeof(decoder.layer) + sizeof(decoder.f)
|
||||||
{ decoder.layer[0x04], 0x04, },
|
}
|
||||||
{ decoder.layer[0x05], 0x05, },
|
}
|
||||||
{ decoder.layer[0x06], 0x06, },
|
,block
|
||||||
{ decoder.layer[0x07], 0x07, },
|
{
|
||||||
{ decoder.layer[0x08], 0x08, },
|
{ master, sizeof(gpt::model::block) * 0x00, decoder.layer[0x00], 0x00, },
|
||||||
{ decoder.layer[0x09], 0x09, },
|
{ master, sizeof(gpt::model::block) * 0x01, decoder.layer[0x01], 0x01, },
|
||||||
{ decoder.layer[0x0a], 0x0a, },
|
{ master, sizeof(gpt::model::block) * 0x02, decoder.layer[0x02], 0x02, },
|
||||||
{ decoder.layer[0x0b], 0x0b, },
|
{ master, sizeof(gpt::model::block) * 0x03, decoder.layer[0x03], 0x03, },
|
||||||
|
{ master, sizeof(gpt::model::block) * 0x04, decoder.layer[0x04], 0x04, },
|
||||||
|
{ master, sizeof(gpt::model::block) * 0x05, decoder.layer[0x05], 0x05, },
|
||||||
|
{ master, sizeof(gpt::model::block) * 0x06, decoder.layer[0x06], 0x06, },
|
||||||
|
{ master, sizeof(gpt::model::block) * 0x07, decoder.layer[0x07], 0x07, },
|
||||||
|
{ master, sizeof(gpt::model::block) * 0x08, decoder.layer[0x08], 0x08, },
|
||||||
|
{ master, sizeof(gpt::model::block) * 0x09, decoder.layer[0x09], 0x09, },
|
||||||
|
{ master, sizeof(gpt::model::block) * 0x0a, decoder.layer[0x0a], 0x0a, },
|
||||||
|
{ master, sizeof(gpt::model::block) * 0x0b, decoder.layer[0x0b], 0x0b, },
|
||||||
}
|
}
|
||||||
,norm
|
,norm
|
||||||
{
|
{
|
||||||
|
master,
|
||||||
|
off_t(sizeof(gpt::model::block) * 12),
|
||||||
const_buffer{decoder.f.bias},
|
const_buffer{decoder.f.bias},
|
||||||
const_buffer{decoder.f.weight},
|
const_buffer{decoder.f.weight},
|
||||||
}
|
}
|
||||||
|
@ -544,6 +641,27 @@ ircd::gpt::pipe::model::block::block(const gpt::model::block &block,
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ircd::gpt::pipe::model::block::block(cl::data &master,
|
||||||
|
const off_t offset,
|
||||||
|
const gpt::model::block &block,
|
||||||
|
const size_t layer)
|
||||||
|
:attn
|
||||||
|
{
|
||||||
|
master,
|
||||||
|
offset,
|
||||||
|
block.ln1,
|
||||||
|
block.attn,
|
||||||
|
}
|
||||||
|
,ffnn
|
||||||
|
{
|
||||||
|
master,
|
||||||
|
offset + off_t(sizeof(block.ln1) + sizeof(block.attn)),
|
||||||
|
block.ln2,
|
||||||
|
block.ffnn,
|
||||||
|
}
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// pipe::model::ffnn
|
// pipe::model::ffnn
|
||||||
//
|
//
|
||||||
|
@ -678,3 +796,48 @@ ircd::gpt::pipe::model::tensor::tensor(cl::data &master,
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// gpt::task
|
||||||
|
//
|
||||||
|
|
||||||
|
ircd::gpt::task::task(const gpt::opts *const opts,
|
||||||
|
struct ircd_gpt_task *const ctrl)
|
||||||
|
:opts
|
||||||
|
{
|
||||||
|
opts
|
||||||
|
}
|
||||||
|
,ctrl
|
||||||
|
{
|
||||||
|
ctrl
|
||||||
|
}
|
||||||
|
{
|
||||||
|
memset(this->ctrl, 0x0, sizeof(ircd_gpt_task));
|
||||||
|
|
||||||
|
this->ctrl->rand[0] = this->opts->seed;
|
||||||
|
this->ctrl->rand[1] = this->opts->seed;
|
||||||
|
this->ctrl->rand[2] = -1UL;
|
||||||
|
this->ctrl->rand[3] = -1UL;
|
||||||
|
}
|
||||||
|
|
||||||
|
ircd::gpt::task::~task()
|
||||||
|
noexcept
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// hypercall
|
||||||
|
//
|
||||||
|
|
||||||
|
ircd::string_view
|
||||||
|
ircd::gpt::reflect(const enum ircd_gpt_hypercall code)
|
||||||
|
noexcept
|
||||||
|
{
|
||||||
|
switch(code)
|
||||||
|
{
|
||||||
|
case IRCD_GPT_ACCEPT: return "ACCEPT";
|
||||||
|
case IRCD_GPT_ECOMPLETE: return "ECOMPLETE";
|
||||||
|
}
|
||||||
|
|
||||||
|
return "??????";
|
||||||
|
}
|
||||||
|
|
|
@ -17324,10 +17324,15 @@ console_cmd__gpt__raw(opt &out, const string_view &line)
|
||||||
opts.limit = param.at<uint>("limit");
|
opts.limit = param.at<uint>("limit");
|
||||||
opts.top_k = 3;
|
opts.top_k = 3;
|
||||||
|
|
||||||
gpt::task task;
|
struct ircd_gpt_task ctrl;
|
||||||
|
gpt::task task
|
||||||
|
{
|
||||||
|
&opts, &ctrl
|
||||||
|
};
|
||||||
|
|
||||||
const auto output
|
const auto output
|
||||||
{
|
{
|
||||||
gpt::generate(buf, text, &opts, &task)
|
gpt::generate(buf, text, task)
|
||||||
};
|
};
|
||||||
|
|
||||||
out
|
out
|
||||||
|
|
Loading…
Reference in a new issue