0
0
Fork 0
mirror of https://github.com/matrix-construct/construct synced 2025-01-13 16:33:53 +01:00

ircd::gpt: Reorganize task options and control blocks.

This commit is contained in:
Jason Volk 2021-05-02 20:40:00 -07:00
parent 3e9c2d1b56
commit 37b1d47c8d
12 changed files with 527 additions and 643 deletions

View file

@ -17,8 +17,8 @@ namespace ircd::gpt
{
IRCD_EXCEPTION(ircd::error, error)
struct opts;
struct task;
struct gate;
extern log::log log;
}
@ -27,7 +27,6 @@ namespace ircd::gpt
#include "vocab.h"
#include "model.h"
#include "token.h"
#include "opts.h"
#include "task.h"
#include "task/task.h"
#include "pipe/pipe.h"
#include "generate.h"

View file

@ -1,277 +0,0 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_OPTS_H
/// Task Options Page
///
/// The option block is directly shared with task software as constant data.
/// This stucture and its mutable companion in `task.h` determine the outcome
/// of the next execution cycle; options are immutable to device software but
/// may be changed by the host between executions cycles if desired.
///
struct ircd_gpt_opts
{
/// Specifies the nominal halting condition based on a sequence of tokens.
/// Generation will complete with success after one of these sequences is
/// witnessed. Set tokens to -1 starting from the back for shorter
/// sequences; zero-length sequences (all -1's) are never matched.
uint accept_code[4][4]
#ifdef __cplusplus
{
{ 13, 198, -1U, -1U, },
{ 198, 198, -1U, -1U, },
{ -1U, -1U, -1U, -1U, },
{ -1U, -1U, -1U, -1U, },
}
#endif
;
/// Specifies the exceptional halting condition based on the sequence of
/// tokens. By default, the three zeros represent three outputs of '!'
/// which is probably an error; note that a true "!!!" is represented by
/// token number 10185. Set tokens to -1 starting from the back to not
/// match that token; generated output after errors is usually garbage.
uint error_code[4][4]
#ifdef __cplusplus
{
{ 0, 0, 0, -1U, },
{ -1U, -1U, -1U, -1U, },
{ -1U, -1U, -1U, -1U, },
{ -1U, -1U, -1U, -1U, },
}
#endif
;
/// Limit number of output tokens. Default of -1 is unlimited; the number
/// of tokens generated will be limited by other factors.
uint limit
#ifdef __cplusplus
{
1
}
#endif
;
/// Flip random coins over the top k logits each round. Setting to 1
/// deterministically selects the top logit.
uint top_k
#ifdef __cplusplus
{
2
}
#endif
;
/// Specifies the token context size in tokens.
uint context_tokens
#ifdef __cplusplus
{
1024
}
#endif
;
/// Specifies the token buffer size in tokens.
uint buffer_tokens
#ifdef __cplusplus
{
1024
}
#endif
;
/// Embedding vector elements
uint embed_elems
#ifdef __cplusplus
{
768
}
#endif
;
/// Attention unit fcon width multiple
uint attn_mult
#ifdef __cplusplus
{
3U
}
#endif
;
/// MLP unit fcon width multiple
uint ffnn_mult
#ifdef __cplusplus
{
4U
}
#endif
;
/// Attention unit width multiple
uint attn_elems
#ifdef __cplusplus
{
embed_elems * attn_mult
}
#endif
;
/// FFNN unit width multiple
uint ffnn_elems
#ifdef __cplusplus
{
embed_elems * ffnn_mult
}
#endif
;
/// SIMD lane count
uint lanes
#ifdef __cplusplus
{
4U
}
#endif
;
uint embed_width
#ifdef __cplusplus
{
embed_elems / lanes
}
#endif
;
uint attn_width
#ifdef __cplusplus
{
attn_elems / lanes
}
#endif
;
uint attn_height
#ifdef __cplusplus
{
embed_elems / lanes
}
#endif
;
uint ffnn_width
#ifdef __cplusplus
{
ffnn_elems / lanes
}
#endif
;
uint ffnn_height
#ifdef __cplusplus
{
embed_elems / lanes
}
#endif
;
/// Specifies the token context size in tokens.
uint logits
#ifdef __cplusplus
{
50257
}
#endif
;
/// Seed for the task's PRNG.
ulong seed
#ifdef __cplusplus
{
1234567890UL
}
#endif
;
/// Training steps
ulong training_steps
#ifdef __cplusplus
{
250000
}
#endif
;
/// Validation steps
ulong validation_steps
#ifdef __cplusplus
{
5000
}
#endif
;
ushort label
#ifdef __cplusplus
{
198
}
#endif
;
float alpha
#ifdef __cplusplus
{
0.001
}
#endif
;
float beta[2]
#ifdef __cplusplus
{
0.9, // Beta1
0.999, // Beta2
}
#endif
;
float epsilon
#ifdef __cplusplus
{
0.000001
}
#endif
;
}
__attribute__((aligned(4096)));
#ifdef __cplusplus
/// Generator Task Options.
///
/// Parameters for a task. Options are constant and one instance can be shared
/// between multiple task instances. This structure extends the task options
/// page, starting a new page which is not visible to device software; C++ and
/// host pointers are available.
///
struct ircd::gpt::opts
:ircd_gpt_opts
{
/// Pointer to the model
const model::decoder *model
{
model::default_model
};
};
static_assert(sizeof(struct ircd_gpt_opts) == 4096);
static_assert(std::is_standard_layout<struct ircd_gpt_opts>::value);
#endif

View file

@ -1,166 +0,0 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_TASK_H
/// Task Control Page
///
/// The control block is shared with our device software. Execution state is
/// maintained in the task control block across cycles. The control block is
/// the mutable state component for an execution; for the immutable component
/// also shared with device software see opts.h.
///
struct ircd_gpt_task
{
/// Header magic 0xC7012C70
uint magic;
/// Hypercall code set by our device software upon completion and control
/// transfer back to the host. Negative codes indicate errors, positive
/// codes are used for status and/or procedure calls; zero is also an error.
enum ircd_gpt_hypercall call;
/// Token ring head. Tokens in the ring extend behind the head for
/// `tokens`. The `head` value is automatically modulated by device
/// software to wrap around the ring.
uint head;
/// Token counter. The counter indicates the number of valid tokens in
/// the context buffer. This value must not exceed the buffer size.
uint tokens;
/// Accumulates the number of task cycles. The cycle counter is incremented
/// by device software after each repetition of the kernel pipeline to
/// produce one additional token.
ulong cycle;
/// Accumulates the epoch count for the task. The counter is incremented
/// by one in device software before control returns back to the host.
/// Several cycles may occur during each epoch.
ulong epoch;
/// Accumulates the training epoch count for the task. The counter is
/// incremented by one in device software for each backward propagation.
ulong step;
/// Accumulates the number of tokens produced by the task. Several tokens
/// may be produced each epoch, but currently only one token is produced
/// each cycle.
ulong produced;
/// Accumulates the number tokens witnessed by the task. The number of
/// tokens in the context for each cycle is counted as witnessed.
ulong witnessed;
/// Accumulates time in microseconds elapsed for the task.
ulong elapsed;
/// PRNG xoshiro256 state. This is the de facto random seed which can be
/// set before cycle entry by the host. It is updated by device software
/// when used.
ulong rand[4];
/// Updated by the host with the value of the timestamp register as sampled
/// immediately before each transfer of control to the device.
ulong host_tsc;
/// State counters for the accept/error sequence codes.
uint accept_seq[4], error_seq[4];
/// Logit softmax mu
float samax_mu;
/// Logit softmax sum
float samax_sum;
/// Logit softmax lambda
float samax_lambda;
/// Loss for last token of last cycle
float loss;
/// Sum loss over all cycles
float loss_sum[4];
/// Average loss over all cycles
float loss_mean;
/// Perplexity score for last token of last cycle
float perp;
/// Sum ppl over all cycles
float perp_sum[4];
/// Perplexity mean over context
float perp_mean;
/// Certainty difference score for last token of last cycle
float cert;
/// Sum certainty over all cycles
float cert_sum[4];
/// Certainty mean over context
float cert_mean;
/// Final loss
float l2_loss;
/// Final loss mean
float l2_loss_mean;
/// Perform backprop
bool prop;
/// The token buffer starts at offset 2048 and continues to the end of
/// the page; options specify the size of the tokens buffer in tokens.
/// Additional pages must be attached for larger buffer sizes.
ushort token[] __attribute__((aligned(2048)));
}
__attribute__((aligned(4096)));
#ifdef __cplusplus
/// Task Context
///
/// State for a task.
struct ircd::gpt::task
{
enum status :char;
/// Reference to the attached options.
const gpt::opts *opts {nullptr};
/// Reference to control pages.
struct ircd_gpt_task *ctrl {nullptr};
/// Current task status.
enum status status {'\0'};
task(const gpt::opts * = nullptr,
struct ircd_gpt_task * = nullptr);
~task() noexcept;
};
/// The current status of a task is indicated with intelligible characters
enum ircd::gpt::task::status
:char
{
QUEUED = 'Q', ///< Queued for execution.
RUNNING = 'R', ///< Currently being executed.
ACCEPT = 'A', ///< Execution completed successfully.
ERROR = 'E', ///< Execution did not complete successfully.
};
static_assert(sizeof(struct ircd_gpt_task) == 4096);
static_assert(offsetof(struct ircd_gpt_task, token) == 2048);
static_assert(std::is_standard_layout<struct ircd_gpt_task>::value);
#endif

View file

@ -0,0 +1,58 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_TASK_CTRL_H
/// Task Control Page
///
/// The control block is shared with our device software. Execution state is
/// maintained in the task control block across cycles. The control block is
/// the mutable state component for an execution; for the immutable component
/// also shared with device software see opts.h.
///
struct ircd_gpt_task
{
/// Epoch counting & interrupt control block.
struct ircd_gpt_task_epic epic;
/// Token context control block. Contains state for the token context
/// buffer; the buffer with the tokens themselves is elsewhere.
struct ircd_gpt_task_tokens tokens;
/// Logit softmax state
struct ircd_math_samax samax;
/// Target label loss state
struct ircd_math_mean loss;
/// Target label perplexity score state
struct ircd_math_mean perp;
/// Target label certainty difference state
struct ircd_math_mean cert;
/// PRNG xoshiro256 state. This is the de facto random seed which can be
/// set before cycle entry by the host. It is updated by device software
/// when used.
ulong rand[4];
/// Perform backprop
bool prop;
/// Header magic 0xC7012C70
uint magic;
/// The token buffer starts at offset 2048 and continues to the end of
/// the page; options specify the size of the tokens buffer in tokens.
/// Additional pages must be attached for larger buffer sizes.
ushort token[] __attribute__((aligned(2048)));
}
__attribute__((aligned(4096)));

View file

@ -0,0 +1,38 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_TASK_EPIC_H
/// Epoch Precision Interrupt Controller
///
struct ircd_gpt_task_epic
{
/// Accumulates the number of task cycles. The cycle counter is incremented
/// by device software after each repetition of the kernel pipeline to
/// produce one additional token.
ulong cycle;
/// Accumulates the epoch count for the task. The counter is incremented
/// by one in device software before control returns back to the host.
/// Several cycles may occur during each epoch.
ulong epoch;
/// Accumulates the training epoch count for the task. The counter is
/// incremented by one in device software for each backward propagation.
ulong step;
/// Updated by the host with the value of the timestamp register as sampled
/// immediately before each transfer of control to the device.
ulong host_tsc;
/// Accumulates time in microseconds elapsed for the task.
ulong elapsed;
};

View file

@ -0,0 +1,30 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_GATE_H
/// Task Gate Descriptor
///
struct ircd_gpt_gate
{
ushort code[8];
}
__attribute__((aligned(16)));
#ifdef __cplusplus
struct ircd::gpt::gate
:ircd_gpt_gate
{
gate()
:ircd_gpt_gate{0}
{}
};
#endif

View file

@ -0,0 +1,123 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_OPTS_H
/// Task Options Page
///
/// The option block is directly shared with task software as constant data.
/// This stucture and its mutable companion in `task.h` determine the outcome
/// of the next execution cycle; options are immutable to device software but
/// may be changed by the host between executions cycles if desired.
///
struct ircd_gpt_opts
{
#ifdef __cplusplus
ircd_gpt_opts(const ircd::gpt::model::decoder * = nullptr) noexcept;
#endif
/// Reference to the model (currently not available in device software).
#ifndef __cplusplus
const intptr_t model;
#else
const ircd::gpt::model::decoder *model;
#endif
/// Limit number of output tokens. Default of -1 is unlimited; the number
/// of tokens generated will be limited by other factors.
uint limit;
/// Flip random coins over the top k logits each round. Setting to 1
/// deterministically selects the top logit.
uint top_k;
/// Specifies the token context size in tokens.
uint context_tokens;
/// Specifies the token buffer size in tokens.
uint buffer_tokens;
/// Embedding vector elements
uint embed_elems;
/// Attention unit fcon width multiple
uint attn_mult;
/// (computed) MLP unit fcon width multiple
uint ffnn_mult;
/// (computed) attention unit width multiple
uint attn_elems;
/// FFNN unit width multiple
uint ffnn_elems;
/// SIMD lane count
uint lanes;
/// (computed) `embed_elems` / `lanes`
uint embed_width;
/// (computed) Attention unit X dimension
uint attn_width;
/// (computed) Attention unit Y dimension
uint attn_height;
/// (computed) MLP backend X dimension
uint ffnn_width;
/// (computed) MLP backend Y dimension
uint ffnn_height;
/// Number of possible target n-grams.
uint logits;
/// Seed for the task's PRNG.
ulong seed;
/// Training steps
ulong training_steps;
/// Validation steps
ulong validation_steps;
/// Target label
ushort label;
/// Learning rate
float alpha;
/// Decay rate
float beta[2];
/// Denorm smoothing
float epsilon;
/// Number of gate descriptors attached to this page.
uint gates;
/// The gate descriptor table starts at offset 2048 and continues to the
/// end of the page. For more descriptors additional pages must be
/// attached.
struct ircd_gpt_gate gate[] __attribute__((aligned(2048)));
}
__attribute__((aligned(4096)));
#ifdef __cplusplus
namespace ircd::gpt
{
using opts = struct ircd_gpt_opts;
}
static_assert(sizeof(struct ircd_gpt_opts) == 4096);
static_assert(std::is_standard_layout<struct ircd_gpt_opts>::value);
#endif

View file

@ -0,0 +1,56 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_TASK_H
#include "epic.h"
#include "tokens.h"
#include "gate.h"
#include "opts.h"
#include "ctrl.h"
#ifdef __cplusplus
/// Task Context
///
/// State for a task.
struct ircd::gpt::task
{
enum status :char;
/// Reference to the attached options.
const gpt::opts *opts {nullptr};
/// Reference to control pages.
struct ircd_gpt_task *ctrl {nullptr};
/// Current task status.
enum status status {'\0'};
task(const gpt::opts * = nullptr,
struct ircd_gpt_task * = nullptr);
~task() noexcept;
};
/// The current status of a task is indicated with intelligible characters
enum ircd::gpt::task::status
:char
{
QUEUED = 'Q', ///< Queued for execution.
RUNNING = 'R', ///< Currently being executed.
ACCEPT = 'A', ///< Execution completed successfully.
ERROR = 'E', ///< Execution did not complete successfully.
};
static_assert(sizeof(struct ircd_gpt_task) == 4096);
static_assert(offsetof(struct ircd_gpt_task, token) == 2048);
static_assert(std::is_standard_layout<struct ircd_gpt_task>::value);
#endif

View file

@ -0,0 +1,35 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_TASK_TOKENS_H
/// Token Context Buffer (Control Block)
///
struct ircd_gpt_task_tokens
{
/// Token ring head. Tokens in the ring extend behind the head for
/// `tokens`. The `head` value is automatically modulated by device
/// software to wrap around the ring.
uint head;
/// Token counter. The counter indicates the number of valid tokens in
/// the context buffer. This value must not exceed the buffer size.
uint count;
/// Accumulates the number of tokens produced by the task. Several tokens
/// may be produced each epoch, but currently only one token is produced
/// each cycle.
ulong produced;
/// Accumulates the number tokens witnessed by the task. The number of
/// tokens in the context for each cycle is counted as witnessed.
ulong witnessed;
};

View file

@ -88,67 +88,29 @@ ircd::gpt::generate(const vector_view<u16> &out,
const auto &opts(*task.opts);
auto &ctrl(*task.ctrl);
auto &errc(ctrl.error_seq);
auto &accc(ctrl.accept_seq);
ctrl.tokens = in.size();
ctrl.head = 0;
const size_t tmax
{
in.size() + opts.limit
};
const vector_view<f32> accum
{
gpt::scratch, tmax * 768
};
const vector_view<f32> embeds
{
gpt::embeds, tmax * 768
};
ctrl.tokens.count = 0;
ctrl.tokens.head = 0;
for(uint j(0); j < in.size(); ++j)
ctrl.token[ctrl.tokens.count++] = in[j];
for(uint i(0); i < opts.gates; ++i)
for(uint k(0); k < 8; ++k)
{
if(ctrl.tokens.count >= opts.buffer_tokens)
break;
if(opts.gate[i].code[k] == 0)
break;
ctrl.token[ctrl.tokens.count] = opts.gate[i].code[k];
ctrl.tokens.count++;
}
const size_t in_size
{
const vector_view<f32> dst
{
data(embeds) + j * 768, 768
};
if(ircd::cl::enable)
ctrl.token[j] = in[j];
else
embed(data(dst), in[j], j, opts);
#if 0 // RB_DEBUG
static char dbuf[512] {0};
char report[1536] {0};
char tmbuf[1][64] {{0}};
const size_t report_size = snprintf
(
report, sizeof(report),
"%-4u %4u %4u:%-4u %1u%1u [ %6.2fL %6.2f%% ] %6.2fL %5.1f%% %s",
ctrl.epoch,
ctrl.cycle,
j,
ctrl.tokens,
0,
0,
0.0,
0.0,
0.0,
0.0,
vocab::debug(dbuf, in[j]).c_str()
);
log::logf
{
log, log::level::DEBUG,
"%s",
string_view{report, report_size}
};
#endif
}
ctrl.tokens.count
};
uint64_t cycles(0);
if(ctrl.prop)
@ -170,7 +132,7 @@ ircd::gpt::generate(const vector_view<u16> &out,
cycles
};
backprop(task, ctrl.loss_mean, *model::default_model, momentum);
backprop(task, ctrl.loss.mean, *model::default_model, momentum);
}
if(ctrl.prop)
@ -178,17 +140,17 @@ ircd::gpt::generate(const vector_view<u16> &out,
log::debug
{
log, "Backpropagation of %2.6f in %lu cycles.",
ctrl.loss_mean,
ctrl.loss.mean,
cycles,
};
ctrl.epoch = 0;
ctrl.loss_mean = 0;
ctrl.loss = ctrl.loss_mean;
ctrl.perp_mean = 0;
ctrl.perp = ctrl.perp_mean;
ctrl.cert_mean = 0;
ctrl.cert = ctrl.cert_mean;
ctrl.epic.epoch = 0;
ctrl.loss.mean = 0;
ctrl.loss.last = ctrl.loss.mean;
ctrl.perp.mean = 0;
ctrl.perp.last = ctrl.perp.mean;
ctrl.cert.mean = 0;
ctrl.cert.last = ctrl.cert.mean;
ctrl.prop = false;
pipe::default_model->invalid = true;
return {};
@ -206,73 +168,49 @@ ircd::gpt::generate(const vector_view<u16> &out,
generate(task);
}
last_time = stopwatch.at<milliseconds>();
ctrl.elapsed += last_time.count();
ctrl.epic.elapsed += last_time.count();
/*
coil(data(scratch), tokens, *opts.model);
tail(logit, data(last_embed), *opts.model);
out[i] = argmax(logit, *opts);
*/
uint accc_thresh[3] {3, 3, 3};
for(uint i(0); i < 3; ++i)
for(uint j(3); j > 0; --j)
if(opts.accept_code[i][j - 1] == -1U)
--accc_thresh[i];
else
break;
uint errc_thresh[3] {3, 3, 3};
for(uint i(0); i < 3; ++i)
for(uint j(3); j > 0; --j)
if(opts.error_code[i][j - 1] == -1U)
--errc_thresh[i];
else
break;
for(auto &j(ret); j + in.size() < ctrl.tokens && j < out.size() && !halt; ++j)
for(uint j(0); j < ctrl.tokens.count && ret < out.size() && !halt; ++j)
{
out[j] = ctrl.token[(in.size() + j + ctrl.head) % opts.buffer_tokens];
const auto tok
{
ctrl.token[j]
};
for(uint j(0); j < 3; ++j)
errc[j] = opts.error_code[j][errc[j]] == out[j]?
errc[j] + 1: 0;
if(j >= in_size)
out[ret++] = tok;
for(uint j(0); j < 3; ++j)
accc[j] = opts.accept_code[j][accc[j]] == out[j]?
accc[j] + 1: 0;
for(uint j(0); j < 3; ++j)
halt |= accc_thresh[j] && accc[j] >= accc_thresh[j],
halt |= errc_thresh[j] && errc[j] >= errc_thresh[j];
if(j < in_size)
continue;
static char dbuf[512] {0};
char report[1536] {0};
char tmbuf[4][64] {0};
const size_t bsz(ctrl.tokens - in.size());
const size_t bsz(ctrl.tokens.count - in_size);
const size_t report_size = snprintf
(
report, sizeof(report),
"%4lu:%-4u %4lu:%-4lu %6.1f%% %5.1fP %6.3fL [%c%c%c] %5u %6.3fL %6.2fP %5.1f%% %s %04x %8s %8s | %8s",
j + in.size(),
ctrl.tokens,
ctrl.epoch,
ctrl.cycle,
std::clamp(ctrl.cert_mean * 100.0f, 0.0f, 100.0f),
std::clamp(ctrl.perp_mean, 0.0f, 100.0f),
std::clamp(ctrl.loss_mean, 0.0f, 99.99f),
opts.label == out[j]? '+': ' ',
accc[0] + accc[1] + accc[2] >= 3? 'A': ' ',
errc[0] + errc[1] + errc[2] >= 3? 'E': ' ',
"%-3u %4u:%-4u %4lu:%-4lu %6.1f%% %5.1fP %6.3fL [%c%c%c] %5u %6.3fL %6.2fP %5.1f%% %s %04x %8s %8s | %8s",
j,
ret - 1,
ctrl.tokens.count,
ctrl.epic.epoch,
ctrl.epic.cycle,
std::clamp(ctrl.cert.mean * 100.0f, 0.0f, 100.0f),
std::clamp(ctrl.perp.mean, 0.0f, 100.0f),
std::clamp(ctrl.loss.mean, 0.0f, 99.99f),
opts.label == tok? '+': ' ',
' ', // flag place
' ', // flag place
opts.label,
std::clamp(ctrl.loss, 0.0f, 99.99f),
std::clamp(ctrl.perp, 0.0f, 100.0f),
std::clamp(ctrl.cert * 100.0f, 0.0f, 100.0f),
vocab::debug(dbuf, out[j]).c_str(),
out[j],
std::clamp(ctrl.loss.last, 0.0f, 99.99f),
std::clamp(ctrl.perp.last, 0.0f, 100.0f),
std::clamp(ctrl.cert.last * 100.0f, 0.0f, 100.0f),
vocab::debug(dbuf, tok).c_str(),
tok,
pretty(tmbuf[0], milliseconds(last_time / bsz), 1).c_str(),
pretty(tmbuf[1], si(cycles / bsz), 1).c_str(),
pretty(tmbuf[2], milliseconds(ctrl.elapsed), 1).c_str()
pretty(tmbuf[2], milliseconds(ctrl.epic.elapsed), 1).c_str()
);
log::logf
@ -283,19 +221,6 @@ ircd::gpt::generate(const vector_view<u16> &out,
};
}
ret = ctrl.tokens - in.size();
if ((false)) for(uint i(0); i < 3; ++i)
if(accc_thresh[i] && ctrl.accept_seq[i] >= accc_thresh[i])
{
ret -= (3 - accc_thresh[i]);
break;
}
else if(errc_thresh[i] && ctrl.error_seq[i] >= errc_thresh[i])
{
ret -= (3 - errc_thresh[i]);
break;
}
ctx::interruption_point();
return vector_view<u16>
{
@ -689,6 +614,7 @@ ircd::gpt::gelu(f32x4 &out,
// backside
//
[[gnu::noinline]]
size_t
ircd::gpt::backprop(task &task,
const f32 grad,
@ -792,6 +718,7 @@ ircd::gpt::backprop(task &task,
return off;
}
[[gnu::noinline]]
size_t
ircd::gpt::adamw(task &task,
const f32 grad,
@ -820,7 +747,7 @@ ircd::gpt::adamw(task &task,
};
for(uint i(0); i < num / 4; ++i)
off = adamw(p[0][i], p[1][i], p[2][i], grad, opts.alpha, opts.beta[0], opts.beta[1], ctrl.step, off);
off = adamw(p[0][i], p[1][i], p[2][i], grad, opts.alpha, opts.beta[0], opts.beta[1], ctrl.epic.step, off);
return off;
}
@ -915,19 +842,111 @@ noexcept
}
//
// hypercall
// gpt::opts
//
ircd::string_view
ircd::gpt::reflect(const enum ircd_gpt_hypercall code)
ircd_gpt_opts::ircd_gpt_opts(const ircd::gpt::model::decoder *const model)
noexcept
:model
{
model
}
,limit
{
-1U
}
,top_k
{
2U
}
,context_tokens
{
1024U
}
,buffer_tokens
{
1024U
}
,embed_elems
{
768U
}
,attn_mult
{
3U
}
,ffnn_mult
{
4U
}
,attn_elems
{
embed_elems * attn_mult
}
,ffnn_elems
{
embed_elems * ffnn_mult
}
,lanes
{
4U
}
,embed_width
{
embed_elems / lanes
}
,attn_width
{
attn_elems / lanes
}
,attn_height
{
embed_elems / lanes
}
,ffnn_width
{
ffnn_elems / lanes
}
,ffnn_height
{
embed_elems / lanes
}
,logits
{
50257
}
,seed
{
1234567890UL
}
,training_steps
{
250000
}
,validation_steps
{
5000
}
,label
{
198
}
,alpha
{
0.001f
}
,beta
{
0.9f,
0.999f,
}
,epsilon
{
0.000001
}
,gates
{
0
}
{
switch(code)
{
case IRCD_GPT_ACCEPT: return "ACCEPT";
case IRCD_GPT_ECOMPLETE: return "ECOMPLETE";
case IRCD_GPT_ETOKENS: return "ETOKENS";
}
return "??????";
}

View file

@ -432,7 +432,7 @@ _ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
const uint word_idx)
{
const ushort
ring_idx = (ctrl->head + tok_idx) % opts->buffer_tokens,
ring_idx = (ctrl->tokens.head + tok_idx) % opts->buffer_tokens,
token = ctrl->token[ring_idx];
const float4
@ -454,7 +454,7 @@ ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
wi = get_group_id(0),
wn = get_num_groups(0);
for(uint i = 0; i < ctrl->tokens; ++i)
for(uint i = 0; i < ctrl->tokens.count; ++i)
if(i % wn == wi)
_ircd_gpt_lm_embed(ctrl, opts, accum, pos, vocab, i, i, li);
}
@ -492,7 +492,7 @@ ircd_gpt_lm_logit(__global const struct ircd_gpt_task *const ctrl,
{
const uint
gi = get_global_id(0),
ti = ctrl->tokens - 1,
ti = ctrl->tokens.count - 1,
words = opts->embed_width;
float4 acc = 0.0f;
@ -596,31 +596,16 @@ ircd_gpt_leave(__global struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
const uint li)
{
// If the call value has been set to something other than default we
// do nothing else here.
if(ctrl->call != IRCD_GPT_ECOMPLETE)
return;
// No action for other threads right now
if(li != 0)
return;
// Run debug checks and assertions.
#ifdef RB_DEBUG
if(ctrl->call == IRCD_GPT_ECOMPLETE)
if(ctrl->tokens < 2)
ctrl->call = IRCD_GPT_ETOKENS;
#endif
// On the last cycle, with no prior call or error code set, indicate
// a nominal exit condition.
if(ctrl->cycle + 1 >= opts->limit)
{
ctrl->call = IRCD_GPT_ACCEPT;
ctrl->epoch += 1;
}
if(ctrl->epic.cycle + 1 >= opts->limit)
ctrl->epic.epoch += 1;
ctrl->cycle += 1;
ctrl->epic.cycle += 1;
ctrl->magic = 0xC7012C70U;
}
@ -634,10 +619,6 @@ ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl,
__global const float *const restrict logexp,
__global const float *const restrict logit)
{
// When the hypercall code is already set, bail here.
if(ctrl->call != IRCD_GPT_ECOMPLETE)
return;
// To read from cells other than idx[0] we need this barrier.
if(opts->top_k > 1)
barrier(CLK_LOCAL_MEM_FENCE);
@ -647,7 +628,7 @@ ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl,
return;
const bool
buffer_full = ctrl->tokens >= opts->buffer_tokens;
buffer_full = ctrl->tokens.count >= opts->buffer_tokens;
const ulong
rnd = opts->top_k > 1?
@ -657,20 +638,20 @@ ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl,
entro = max(opts->top_k, 1U),
select = rnd % entro,
token = idx[select],
dest = (ctrl->head + ctrl->tokens) % opts->buffer_tokens,
tokens = min(ctrl->tokens + 1, opts->buffer_tokens),
dest = (ctrl->tokens.head + ctrl->tokens.count) % opts->buffer_tokens,
tokens = min(ctrl->tokens.count + 1, opts->buffer_tokens),
head = buffer_full?
(ctrl->head + 1) % opts->buffer_tokens: ctrl->head;
(ctrl->tokens.head + 1) % opts->buffer_tokens: ctrl->tokens.head;
ctrl->head = head;
ctrl->tokens = tokens;
ctrl->tokens.head = head;
ctrl->tokens.count = tokens;
ctrl->token[dest] = token;
const ushort
ln = get_local_size(0),
next_select = (select + 1) % ln,
next_token = idx[next_select],
sum_sel = ctrl->epoch % 3;
sum_sel = ctrl->epic.epoch % 3;
const float
test_lsm = logexp[opts->label],
@ -737,7 +718,7 @@ ircd_gpt_prop_elem(__global const struct ircd_gpt_task *const ctrl,
{
const uint
li = get_local_id(0),
step = ctrl->step;
step = ctrl->epic.step;
const float4
param = param_[li],

View file

@ -129,12 +129,11 @@ ircd::gpt::generate(task &task)
*task.ctrl
};
ctrl.cycle = 0;
ctrl.call = IRCD_GPT_ECOMPLETE;
ctrl.host_tsc = prof::cycles();
volatile const size_t tokens(ctrl.tokens);
volatile const auto epoch(ctrl.epoch);
volatile size_t cycle(ctrl.cycle);
ctrl.epic.cycle = 0;
ctrl.epic.host_tsc = prof::cycles();
volatile const size_t tokens(ctrl.tokens.count);
volatile const auto epoch(ctrl.epic.epoch);
volatile size_t cycle(ctrl.epic.cycle);
std::deque<pipe::exec> list;
for(; cycle < opts.limit; ++cycle)
@ -151,8 +150,7 @@ ircd::gpt::generate(task &task)
task, tokens + cycle, rel, acq
);
// Conditions for a cl::flush here; this is not default but
// may be configured to improve some workloads.
// Conditions for a cl::flush here
const bool flush
{
// Flushing here is enabled by the configuration
@ -194,18 +192,8 @@ ircd::gpt::generate(task &task)
list.clear();
assert(ctrl.magic == 0xC7012C70);
assert(ctrl.epic.cycle == cycle || ctx::interruption_requested());
this_ctx::interruption_point();
// Interp error codes
if(unlikely(ctrl.call <= 0))
throw error
{
"hyper (#%d) :%s",
abs(int(ctrl.call)),
reflect(ctrl.call),
};
assert(ctrl.cycle == cycle || ctx::interruption_requested());
}
void