Mirror of https://github.com/matrix-construct/construct
Synced 2024-11-17 23:40:57 +01:00

ircd::gpt: Reorganize task options and control blocks.

parent 3e9c2d1b56
commit 37b1d47c8d

12 changed files with 527 additions and 643 deletions
@@ -17,8 +17,8 @@ namespace ircd::gpt
 {
 	IRCD_EXCEPTION(ircd::error, error)
 
-	struct opts;
 	struct task;
+	struct gate;
 
 	extern log::log log;
 }
@@ -27,7 +27,6 @@ namespace ircd::gpt
 #include "vocab.h"
 #include "model.h"
 #include "token.h"
-#include "opts.h"
-#include "task.h"
+#include "task/task.h"
 #include "pipe/pipe.h"
 #include "generate.h"
Deleted file:

@@ -1,277 +0,0 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

#pragma once
#define HAVE_IRCD_GPT_OPTS_H

/// Task Options Page
///
/// The option block is directly shared with task software as constant data.
/// This structure and its mutable companion in `task.h` determine the outcome
/// of the next execution cycle; options are immutable to device software but
/// may be changed by the host between execution cycles if desired.
///
struct ircd_gpt_opts
{
	/// Specifies the nominal halting condition based on a sequence of tokens.
	/// Generation will complete with success after one of these sequences is
	/// witnessed. Set tokens to -1 starting from the back for shorter
	/// sequences; zero-length sequences (all -1's) are never matched.
	uint accept_code[4][4]
	#ifdef __cplusplus
	{
		{  13, 198, -1U, -1U, },
		{ 198, 198, -1U, -1U, },
		{ -1U, -1U, -1U, -1U, },
		{ -1U, -1U, -1U, -1U, },
	}
	#endif
	;

	/// Specifies the exceptional halting condition based on the sequence of
	/// tokens. By default, the three zeros represent three outputs of '!'
	/// which is probably an error; note that a true "!!!" is represented by
	/// token number 10185. Set tokens to -1 starting from the back to not
	/// match that token; generated output after errors is usually garbage.
	uint error_code[4][4]
	#ifdef __cplusplus
	{
		{   0,   0,   0, -1U, },
		{ -1U, -1U, -1U, -1U, },
		{ -1U, -1U, -1U, -1U, },
		{ -1U, -1U, -1U, -1U, },
	}
	#endif
	;

	/// Limit number of output tokens. Default of -1 is unlimited; the number
	/// of tokens generated will be limited by other factors.
	uint limit
	#ifdef __cplusplus
	{ 1 }
	#endif
	;

	/// Flip random coins over the top k logits each round. Setting to 1
	/// deterministically selects the top logit.
	uint top_k
	#ifdef __cplusplus
	{ 2 }
	#endif
	;

	/// Specifies the token context size in tokens.
	uint context_tokens
	#ifdef __cplusplus
	{ 1024 }
	#endif
	;

	/// Specifies the token buffer size in tokens.
	uint buffer_tokens
	#ifdef __cplusplus
	{ 1024 }
	#endif
	;

	/// Embedding vector elements
	uint embed_elems
	#ifdef __cplusplus
	{ 768 }
	#endif
	;

	/// Attention unit fcon width multiple
	uint attn_mult
	#ifdef __cplusplus
	{ 3U }
	#endif
	;

	/// MLP unit fcon width multiple
	uint ffnn_mult
	#ifdef __cplusplus
	{ 4U }
	#endif
	;

	/// Attention unit width multiple
	uint attn_elems
	#ifdef __cplusplus
	{ embed_elems * attn_mult }
	#endif
	;

	/// FFNN unit width multiple
	uint ffnn_elems
	#ifdef __cplusplus
	{ embed_elems * ffnn_mult }
	#endif
	;

	/// SIMD lane count
	uint lanes
	#ifdef __cplusplus
	{ 4U }
	#endif
	;

	uint embed_width
	#ifdef __cplusplus
	{ embed_elems / lanes }
	#endif
	;

	uint attn_width
	#ifdef __cplusplus
	{ attn_elems / lanes }
	#endif
	;

	uint attn_height
	#ifdef __cplusplus
	{ embed_elems / lanes }
	#endif
	;

	uint ffnn_width
	#ifdef __cplusplus
	{ ffnn_elems / lanes }
	#endif
	;

	uint ffnn_height
	#ifdef __cplusplus
	{ embed_elems / lanes }
	#endif
	;

	/// Number of possible target n-grams.
	uint logits
	#ifdef __cplusplus
	{ 50257 }
	#endif
	;

	/// Seed for the task's PRNG.
	ulong seed
	#ifdef __cplusplus
	{ 1234567890UL }
	#endif
	;

	/// Training steps
	ulong training_steps
	#ifdef __cplusplus
	{ 250000 }
	#endif
	;

	/// Validation steps
	ulong validation_steps
	#ifdef __cplusplus
	{ 5000 }
	#endif
	;

	ushort label
	#ifdef __cplusplus
	{ 198 }
	#endif
	;

	float alpha
	#ifdef __cplusplus
	{ 0.001 }
	#endif
	;

	float beta[2]
	#ifdef __cplusplus
	{
		0.9,   // Beta1
		0.999, // Beta2
	}
	#endif
	;

	float epsilon
	#ifdef __cplusplus
	{ 0.000001 }
	#endif
	;
}
__attribute__((aligned(4096)));

#ifdef __cplusplus
/// Generator Task Options.
///
/// Parameters for a task. Options are constant and one instance can be shared
/// between multiple task instances. This structure extends the task options
/// page, starting a new page which is not visible to device software; C++ and
/// host pointers are available.
///
struct ircd::gpt::opts
:ircd_gpt_opts
{
	/// Pointer to the model
	const model::decoder *model
	{
		model::default_model
	};
};

static_assert(sizeof(struct ircd_gpt_opts) == 4096);
static_assert(std::is_standard_layout<struct ircd_gpt_opts>::value);
#endif
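The accept_code/error_code convention above is easiest to see with a small sketch. The following is illustrative only; it mirrors the per-row sequence counters kept by the old ircd/gpt.cc (removed further below in this commit) and is not an API of the source. A row matches when its non--1U prefix is produced in order by the model, and an all--1U row never matches. It uses the same `uint` spelling as the header above.

	// Returns true when this halting row has been fully matched by the
	// stream of output tokens fed to it one at a time.
	static bool
	seq_matched(const uint (&code)[4], uint &counter, const uint out_token)
	{
		// Effective length of the row; -1U entries pad unused positions at the back.
		uint len(4);
		while(len && code[len - 1] == -1U)
			--len;

		if(!len)
			return false;   // all -1U: disabled row

		// Advance the counter while the output continues the sequence, else reset.
		counter = code[counter] == out_token? counter + 1: 0;
		return counter >= len;
	}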
Deleted file:

@@ -1,166 +0,0 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

#pragma once
#define HAVE_IRCD_GPT_TASK_H

/// Task Control Page
///
/// The control block is shared with our device software. Execution state is
/// maintained in the task control block across cycles. The control block is
/// the mutable state component for an execution; for the immutable component
/// also shared with device software see opts.h.
///
struct ircd_gpt_task
{
	/// Header magic 0xC7012C70
	uint magic;

	/// Hypercall code set by our device software upon completion and control
	/// transfer back to the host. Negative codes indicate errors, positive
	/// codes are used for status and/or procedure calls; zero is also an error.
	enum ircd_gpt_hypercall call;

	/// Token ring head. Tokens in the ring extend behind the head for
	/// `tokens`. The `head` value is automatically modulated by device
	/// software to wrap around the ring.
	uint head;

	/// Token counter. The counter indicates the number of valid tokens in
	/// the context buffer. This value must not exceed the buffer size.
	uint tokens;

	/// Accumulates the number of task cycles. The cycle counter is incremented
	/// by device software after each repetition of the kernel pipeline to
	/// produce one additional token.
	ulong cycle;

	/// Accumulates the epoch count for the task. The counter is incremented
	/// by one in device software before control returns back to the host.
	/// Several cycles may occur during each epoch.
	ulong epoch;

	/// Accumulates the training epoch count for the task. The counter is
	/// incremented by one in device software for each backward propagation.
	ulong step;

	/// Accumulates the number of tokens produced by the task. Several tokens
	/// may be produced each epoch, but currently only one token is produced
	/// each cycle.
	ulong produced;

	/// Accumulates the number of tokens witnessed by the task. The number of
	/// tokens in the context for each cycle is counted as witnessed.
	ulong witnessed;

	/// Accumulates time in microseconds elapsed for the task.
	ulong elapsed;

	/// PRNG xoshiro256 state. This is the de facto random seed which can be
	/// set before cycle entry by the host. It is updated by device software
	/// when used.
	ulong rand[4];

	/// Updated by the host with the value of the timestamp register as sampled
	/// immediately before each transfer of control to the device.
	ulong host_tsc;

	/// State counters for the accept/error sequence codes.
	uint accept_seq[4], error_seq[4];

	/// Logit softmax mu
	float samax_mu;

	/// Logit softmax sum
	float samax_sum;

	/// Logit softmax lambda
	float samax_lambda;

	/// Loss for last token of last cycle
	float loss;

	/// Sum loss over all cycles
	float loss_sum[4];

	/// Average loss over all cycles
	float loss_mean;

	/// Perplexity score for last token of last cycle
	float perp;

	/// Sum ppl over all cycles
	float perp_sum[4];

	/// Perplexity mean over context
	float perp_mean;

	/// Certainty difference score for last token of last cycle
	float cert;

	/// Sum certainty over all cycles
	float cert_sum[4];

	/// Certainty mean over context
	float cert_mean;

	/// Final loss
	float l2_loss;

	/// Final loss mean
	float l2_loss_mean;

	/// Perform backprop
	bool prop;

	/// The token buffer starts at offset 2048 and continues to the end of
	/// the page; options specify the size of the tokens buffer in tokens.
	/// Additional pages must be attached for larger buffer sizes.
	ushort token[] __attribute__((aligned(2048)));
}
__attribute__((aligned(4096)));

#ifdef __cplusplus
/// Task Context
///
/// State for a task.
struct ircd::gpt::task
{
	enum status :char;

	/// Reference to the attached options.
	const gpt::opts *opts {nullptr};

	/// Reference to control pages.
	struct ircd_gpt_task *ctrl {nullptr};

	/// Current task status.
	enum status status {'\0'};

	task(const gpt::opts * = nullptr,
	     struct ircd_gpt_task * = nullptr);

	~task() noexcept;
};

/// The current status of a task is indicated with intelligible characters
enum ircd::gpt::task::status
:char
{
	QUEUED   = 'Q',  ///< Queued for execution.
	RUNNING  = 'R',  ///< Currently being executed.
	ACCEPT   = 'A',  ///< Execution completed successfully.
	ERROR    = 'E',  ///< Execution did not complete successfully.
};

static_assert(sizeof(struct ircd_gpt_task) == 4096);
static_assert(offsetof(struct ircd_gpt_task, token) == 2048);
static_assert(std::is_standard_layout<struct ircd_gpt_task>::value);
#endif
include/ircd/gpt/task/ctrl.h (new file, 58 lines)

@@ -0,0 +1,58 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

#pragma once
#define HAVE_IRCD_GPT_TASK_CTRL_H

/// Task Control Page
///
/// The control block is shared with our device software. Execution state is
/// maintained in the task control block across cycles. The control block is
/// the mutable state component for an execution; for the immutable component
/// also shared with device software see opts.h.
///
struct ircd_gpt_task
{
	/// Epoch counting & interrupt control block.
	struct ircd_gpt_task_epic epic;

	/// Token context control block. Contains state for the token context
	/// buffer; the buffer with the tokens themselves is elsewhere.
	struct ircd_gpt_task_tokens tokens;

	/// Logit softmax state
	struct ircd_math_samax samax;

	/// Target label loss state
	struct ircd_math_mean loss;

	/// Target label perplexity score state
	struct ircd_math_mean perp;

	/// Target label certainty difference state
	struct ircd_math_mean cert;

	/// PRNG xoshiro256 state. This is the de facto random seed which can be
	/// set before cycle entry by the host. It is updated by device software
	/// when used.
	ulong rand[4];

	/// Perform backprop
	bool prop;

	/// Header magic 0xC7012C70
	uint magic;

	/// The token buffer starts at offset 2048 and continues to the end of
	/// the page; options specify the size of the tokens buffer in tokens.
	/// Additional pages must be attached for larger buffer sizes.
	ushort token[] __attribute__((aligned(2048)));
}
__attribute__((aligned(4096)));
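As a usage note (not from the source): with the new layout a host-side status dump needs only the aggregate blocks above. The sketch below assumes this header and its companions are included, and that `ircd_math_mean` exposes the `mean` member the same way ircd/gpt.cc below reads it (`ctrl.loss.mean`, `ctrl.perp.mean`); the formatting is illustrative.

	#include <cassert>
	#include <cstdio>

	// Minimal host-side inspection of a control page between cycles.
	static void
	dump_ctrl(const struct ircd_gpt_task &ctrl)
	{
		assert(ctrl.magic == 0xC7012C70U);
		std::printf("epoch:%lu cycle:%lu tokens:%u loss:%f perp:%f\n",
		            ctrl.epic.epoch,
		            ctrl.epic.cycle,
		            ctrl.tokens.count,
		            double(ctrl.loss.mean),
		            double(ctrl.perp.mean));
	}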
include/ircd/gpt/task/epic.h (new file, 38 lines)

@@ -0,0 +1,38 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

#pragma once
#define HAVE_IRCD_GPT_TASK_EPIC_H

/// Epoch Precision Interrupt Controller
///
struct ircd_gpt_task_epic
{
	/// Accumulates the number of task cycles. The cycle counter is incremented
	/// by device software after each repetition of the kernel pipeline to
	/// produce one additional token.
	ulong cycle;

	/// Accumulates the epoch count for the task. The counter is incremented
	/// by one in device software before control returns back to the host.
	/// Several cycles may occur during each epoch.
	ulong epoch;

	/// Accumulates the training epoch count for the task. The counter is
	/// incremented by one in device software for each backward propagation.
	ulong step;

	/// Updated by the host with the value of the timestamp register as sampled
	/// immediately before each transfer of control to the device.
	ulong host_tsc;

	/// Accumulates time in microseconds elapsed for the task.
	ulong elapsed;
};
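For example (illustrative only, not an API of the source), the cycle and elapsed accumulators give a rough per-cycle latency; the result is in whatever unit `elapsed` accumulates.

	// Average time per cycle from the epic counters; returns 0 before any cycle.
	static double
	avg_cycle_time(const struct ircd_gpt_task_epic &epic)
	{
		return epic.cycle?
			double(epic.elapsed) / double(epic.cycle):
			0.0;
	}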
include/ircd/gpt/task/gate.h (new file, 30 lines)

@@ -0,0 +1,30 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

#pragma once
#define HAVE_IRCD_GPT_GATE_H

/// Task Gate Descriptor
///
struct ircd_gpt_gate
{
	ushort code[8];
}
__attribute__((aligned(16)));

#ifdef __cplusplus
struct ircd::gpt::gate
:ircd_gpt_gate
{
	gate()
	:ircd_gpt_gate{0}
	{}
};
#endif
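Gate descriptors are consumed by the new prompt-assembly loop in ircd/gpt.cc below, which appends each `code[k]` into the token ring until a zero code or a full buffer. A hedged host-side sketch of attaching one descriptor to an options page; the token value and the free space in the page's gate table are assumptions of this sketch, not guarantees of the source.

	static void
	attach_gate(struct ircd_gpt_opts &opts, const ushort some_token)
	{
		struct ircd_gpt_gate gate {0};
		gate.code[0] = some_token;   // placeholder token value
		gate.code[1] = 0;            // zero terminates the code sequence

		// Assumes the gate table region at the back of the page has room.
		opts.gate[opts.gates] = gate;
		opts.gates += 1;
	}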
include/ircd/gpt/task/opts.h (new file, 123 lines)

@@ -0,0 +1,123 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

#pragma once
#define HAVE_IRCD_GPT_OPTS_H

/// Task Options Page
///
/// The option block is directly shared with task software as constant data.
/// This structure and its mutable companion in `task.h` determine the outcome
/// of the next execution cycle; options are immutable to device software but
/// may be changed by the host between execution cycles if desired.
///
struct ircd_gpt_opts
{
	#ifdef __cplusplus
	ircd_gpt_opts(const ircd::gpt::model::decoder * = nullptr) noexcept;
	#endif

	/// Reference to the model (currently not available in device software).
	#ifndef __cplusplus
	const intptr_t model;
	#else
	const ircd::gpt::model::decoder *model;
	#endif

	/// Limit number of output tokens. Default of -1 is unlimited; the number
	/// of tokens generated will be limited by other factors.
	uint limit;

	/// Flip random coins over the top k logits each round. Setting to 1
	/// deterministically selects the top logit.
	uint top_k;

	/// Specifies the token context size in tokens.
	uint context_tokens;

	/// Specifies the token buffer size in tokens.
	uint buffer_tokens;

	/// Embedding vector elements
	uint embed_elems;

	/// Attention unit fcon width multiple
	uint attn_mult;

	/// MLP unit fcon width multiple
	uint ffnn_mult;

	/// (computed) Attention unit width multiple
	uint attn_elems;

	/// (computed) FFNN unit width multiple
	uint ffnn_elems;

	/// SIMD lane count
	uint lanes;

	/// (computed) `embed_elems` / `lanes`
	uint embed_width;

	/// (computed) Attention unit X dimension
	uint attn_width;

	/// (computed) Attention unit Y dimension
	uint attn_height;

	/// (computed) MLP backend X dimension
	uint ffnn_width;

	/// (computed) MLP backend Y dimension
	uint ffnn_height;

	/// Number of possible target n-grams.
	uint logits;

	/// Seed for the task's PRNG.
	ulong seed;

	/// Training steps
	ulong training_steps;

	/// Validation steps
	ulong validation_steps;

	/// Target label
	ushort label;

	/// Learning rate
	float alpha;

	/// Decay rate
	float beta[2];

	/// Denorm smoothing
	float epsilon;

	/// Number of gate descriptors attached to this page.
	uint gates;

	/// The gate descriptor table starts at offset 2048 and continues to the
	/// end of the page. For more descriptors additional pages must be
	/// attached.
	struct ircd_gpt_gate gate[] __attribute__((aligned(2048)));
}
__attribute__((aligned(4096)));

#ifdef __cplusplus
namespace ircd::gpt
{
	using opts = struct ircd_gpt_opts;
}

static_assert(sizeof(struct ircd_gpt_opts) == 4096);
static_assert(std::is_standard_layout<struct ircd_gpt_opts>::value);
#endif
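For reference, the defaults assigned by the new `ircd_gpt_opts` constructor in ircd/gpt.cc further below (embed_elems 768, attn_mult 3, ffnn_mult 4, lanes 4) yield the following derived geometry:

	attn_elems  = embed_elems * attn_mult = 768 * 3  = 2304
	ffnn_elems  = embed_elems * ffnn_mult = 768 * 4  = 3072
	embed_width = embed_elems / lanes     = 768 / 4  = 192
	attn_width  = attn_elems / lanes      = 2304 / 4 = 576
	attn_height = embed_elems / lanes     = 768 / 4  = 192
	ffnn_width  = ffnn_elems / lanes      = 3072 / 4 = 768
	ffnn_height = embed_elems / lanes     = 768 / 4  = 192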
include/ircd/gpt/task/task.h (new file, 56 lines)

@@ -0,0 +1,56 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

#pragma once
#define HAVE_IRCD_GPT_TASK_H

#include "epic.h"
#include "tokens.h"
#include "gate.h"
#include "opts.h"
#include "ctrl.h"

#ifdef __cplusplus
/// Task Context
///
/// State for a task.
struct ircd::gpt::task
{
	enum status :char;

	/// Reference to the attached options.
	const gpt::opts *opts {nullptr};

	/// Reference to control pages.
	struct ircd_gpt_task *ctrl {nullptr};

	/// Current task status.
	enum status status {'\0'};

	task(const gpt::opts * = nullptr,
	     struct ircd_gpt_task * = nullptr);

	~task() noexcept;
};

/// The current status of a task is indicated with intelligible characters
enum ircd::gpt::task::status
:char
{
	QUEUED   = 'Q',  ///< Queued for execution.
	RUNNING  = 'R',  ///< Currently being executed.
	ACCEPT   = 'A',  ///< Execution completed successfully.
	ERROR    = 'E',  ///< Execution did not complete successfully.
};

static_assert(sizeof(struct ircd_gpt_task) == 4096);
static_assert(offsetof(struct ircd_gpt_task, token) == 2048);
static_assert(std::is_standard_layout<struct ircd_gpt_task>::value);
#endif
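A minimal construction sketch, not from the source: it assumes static storage for the two page-sized blocks is acceptable, relies on the same GNU extensions the headers themselves use (flexible array members, aligned attributes), takes the defaults from the `ircd_gpt_opts` constructor, and leaves the control page zero-initialized for the first cycle.

	using namespace ircd;

	static gpt::opts opts;                   // defaults from the ctor in ircd/gpt.cc
	static struct ircd_gpt_task ctrl;        // zeroed control page; device/host fill it in

	static gpt::task task
	{
		&opts, &ctrl
	};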
include/ircd/gpt/task/tokens.h (new file, 35 lines)

@@ -0,0 +1,35 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

#pragma once
#define HAVE_IRCD_GPT_TASK_TOKENS_H

/// Token Context Buffer (Control Block)
///
struct ircd_gpt_task_tokens
{
	/// Token ring head. Tokens in the ring extend behind the head for
	/// `tokens`. The `head` value is automatically modulated by device
	/// software to wrap around the ring.
	uint head;

	/// Token counter. The counter indicates the number of valid tokens in
	/// the context buffer. This value must not exceed the buffer size.
	uint count;

	/// Accumulates the number of tokens produced by the task. Several tokens
	/// may be produced each epoch, but currently only one token is produced
	/// each cycle.
	ulong produced;

	/// Accumulates the number of tokens witnessed by the task. The number of
	/// tokens in the context for each cycle is counted as witnessed.
	ulong witnessed;
};
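The ring convention these fields describe is the one the device code below relies on: logical position i lives at `(head + i) % buffer_tokens`, and an append either grows `count` or, once the ring is full, advances `head` over the oldest token. Note that with the default `buffer_tokens` of 1024 and 16-bit tokens the ring exactly fills the back half (offset 2048) of the 4 KiB control page. A host-side mirror of that logic, as a sketch only; the source keeps this on the device in `ircd_gpt_lm_result`.

	static uint
	ring_idx(const struct ircd_gpt_task_tokens &tokens, const uint i, const uint buffer_tokens)
	{
		return (tokens.head + i) % buffer_tokens;
	}

	static void
	ring_push(struct ircd_gpt_task_tokens &tokens,
	          ushort *const token,              // the buffer at offset 2048 of the control page
	          const ushort value,
	          const uint buffer_tokens)
	{
		const uint dest((tokens.head + tokens.count) % buffer_tokens);
		token[dest] = value;

		if(tokens.count >= buffer_tokens)
			tokens.head = (tokens.head + 1) % buffer_tokens;  // full: overwrite the oldest
		else
			tokens.count += 1;
	}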
ircd/gpt.cc (309)

@@ -88,67 +88,29 @@ ircd::gpt::generate(const vector_view<u16> &out,
 
 	const auto &opts(*task.opts);
 	auto &ctrl(*task.ctrl);
-	auto &errc(ctrl.error_seq);
-	auto &accc(ctrl.accept_seq);
-	ctrl.tokens = in.size();
-	ctrl.head = 0;
-
-	const size_t tmax
-	{
-		in.size() + opts.limit
-	};
-
-	const vector_view<f32> accum
-	{
-		gpt::scratch, tmax * 768
-	};
-
-	const vector_view<f32> embeds
-	{
-		gpt::embeds, tmax * 768
-	};
+	ctrl.tokens.count = 0;
+	ctrl.tokens.head = 0;
 
 	for(uint j(0); j < in.size(); ++j)
-	{
-		const vector_view<f32> dst
-		{
-			data(embeds) + j * 768, 768
-		};
-
-		if(ircd::cl::enable)
-			ctrl.token[j] = in[j];
-		else
-			embed(data(dst), in[j], j, opts);
-
-		#if 0 // RB_DEBUG
-		static char dbuf[512] {0};
-		char report[1536] {0};
-		char tmbuf[1][64] {{0}};
-		const size_t report_size = snprintf
-		(
-			report, sizeof(report),
-			"%-4u %4u %4u:%-4u %1u%1u [ %6.2fL %6.2f%% ] %6.2fL %5.1f%% %s",
-			ctrl.epoch,
-			ctrl.cycle,
-			j,
-			ctrl.tokens,
-			0,
-			0,
-			0.0,
-			0.0,
-			0.0,
-			0.0,
-			vocab::debug(dbuf, in[j]).c_str()
-		);
-
-		log::logf
-		{
-			log, log::level::DEBUG,
-			"%s",
-			string_view{report, report_size}
-		};
-		#endif
-	}
+		ctrl.token[ctrl.tokens.count++] = in[j];
+
+	for(uint i(0); i < opts.gates; ++i)
+		for(uint k(0); k < 8; ++k)
+		{
+			if(ctrl.tokens.count >= opts.buffer_tokens)
+				break;
+
+			if(opts.gate[i].code[k] == 0)
+				break;
+
+			ctrl.token[ctrl.tokens.count] = opts.gate[i].code[k];
+			ctrl.tokens.count++;
+		}
+
+	const size_t in_size
+	{
+		ctrl.tokens.count
+	};
 
 	uint64_t cycles(0);
 	if(ctrl.prop)
@@ -170,7 +132,7 @@ ircd::gpt::generate(const vector_view<u16> &out,
 			cycles
 		};
 
-		backprop(task, ctrl.loss_mean, *model::default_model, momentum);
+		backprop(task, ctrl.loss.mean, *model::default_model, momentum);
 	}
 
 	if(ctrl.prop)
@@ -178,17 +140,17 @@ ircd::gpt::generate(const vector_view<u16> &out,
 		log::debug
 		{
 			log, "Backpropagation of %2.6f in %lu cycles.",
-			ctrl.loss_mean,
+			ctrl.loss.mean,
 			cycles,
 		};
 
-		ctrl.epoch = 0;
-		ctrl.loss_mean = 0;
-		ctrl.loss = ctrl.loss_mean;
-		ctrl.perp_mean = 0;
-		ctrl.perp = ctrl.perp_mean;
-		ctrl.cert_mean = 0;
-		ctrl.cert = ctrl.cert_mean;
+		ctrl.epic.epoch = 0;
+		ctrl.loss.mean = 0;
+		ctrl.loss.last = ctrl.loss.mean;
+		ctrl.perp.mean = 0;
+		ctrl.perp.last = ctrl.perp.mean;
+		ctrl.cert.mean = 0;
+		ctrl.cert.last = ctrl.cert.mean;
 		ctrl.prop = false;
 		pipe::default_model->invalid = true;
 		return {};
@@ -206,73 +168,49 @@ ircd::gpt::generate(const vector_view<u16> &out,
 			generate(task);
 		}
 		last_time = stopwatch.at<milliseconds>();
-		ctrl.elapsed += last_time.count();
+		ctrl.epic.elapsed += last_time.count();
 
-	/*
-		coil(data(scratch), tokens, *opts.model);
-		tail(logit, data(last_embed), *opts.model);
-		out[i] = argmax(logit, *opts);
-	*/
-
-	uint accc_thresh[3] {3, 3, 3};
-	for(uint i(0); i < 3; ++i)
-		for(uint j(3); j > 0; --j)
-			if(opts.accept_code[i][j - 1] == -1U)
-				--accc_thresh[i];
-			else
-				break;
-
-	uint errc_thresh[3] {3, 3, 3};
-	for(uint i(0); i < 3; ++i)
-		for(uint j(3); j > 0; --j)
-			if(opts.error_code[i][j - 1] == -1U)
-				--errc_thresh[i];
-			else
-				break;
-
-	for(auto &j(ret); j + in.size() < ctrl.tokens && j < out.size() && !halt; ++j)
+	for(uint j(0); j < ctrl.tokens.count && ret < out.size() && !halt; ++j)
 	{
-		out[j] = ctrl.token[(in.size() + j + ctrl.head) % opts.buffer_tokens];
-
-		for(uint j(0); j < 3; ++j)
-			errc[j] = opts.error_code[j][errc[j]] == out[j]?
-				errc[j] + 1: 0;
-
-		for(uint j(0); j < 3; ++j)
-			accc[j] = opts.accept_code[j][accc[j]] == out[j]?
-				accc[j] + 1: 0;
-
-		for(uint j(0); j < 3; ++j)
-			halt |= accc_thresh[j] && accc[j] >= accc_thresh[j],
-			halt |= errc_thresh[j] && errc[j] >= errc_thresh[j];
+		const auto tok
+		{
+			ctrl.token[j]
+		};
+
+		if(j >= in_size)
+			out[ret++] = tok;
+
+		if(j < in_size)
+			continue;
 
 		static char dbuf[512] {0};
 		char report[1536] {0};
 		char tmbuf[4][64] {0};
-		const size_t bsz(ctrl.tokens - in.size());
+		const size_t bsz(ctrl.tokens.count - in_size);
 		const size_t report_size = snprintf
 		(
 			report, sizeof(report),
-			"%4lu:%-4u %4lu:%-4lu %6.1f%% %5.1fP %6.3fL [%c%c%c] %5u %6.3fL %6.2fP %5.1f%% %s %04x %8s %8s | %8s",
-			j + in.size(),
-			ctrl.tokens,
-			ctrl.epoch,
-			ctrl.cycle,
-			std::clamp(ctrl.cert_mean * 100.0f, 0.0f, 100.0f),
-			std::clamp(ctrl.perp_mean, 0.0f, 100.0f),
-			std::clamp(ctrl.loss_mean, 0.0f, 99.99f),
-			opts.label == out[j]? '+': ' ',
-			accc[0] + accc[1] + accc[2] >= 3? 'A': ' ',
-			errc[0] + errc[1] + errc[2] >= 3? 'E': ' ',
+			"%-3u %4u:%-4u %4lu:%-4lu %6.1f%% %5.1fP %6.3fL [%c%c%c] %5u %6.3fL %6.2fP %5.1f%% %s %04x %8s %8s | %8s",
+			j,
+			ret - 1,
+			ctrl.tokens.count,
+			ctrl.epic.epoch,
+			ctrl.epic.cycle,
+			std::clamp(ctrl.cert.mean * 100.0f, 0.0f, 100.0f),
+			std::clamp(ctrl.perp.mean, 0.0f, 100.0f),
+			std::clamp(ctrl.loss.mean, 0.0f, 99.99f),
+			opts.label == tok? '+': ' ',
+			' ', // flag place
+			' ', // flag place
 			opts.label,
-			std::clamp(ctrl.loss, 0.0f, 99.99f),
-			std::clamp(ctrl.perp, 0.0f, 100.0f),
-			std::clamp(ctrl.cert * 100.0f, 0.0f, 100.0f),
-			vocab::debug(dbuf, out[j]).c_str(),
-			out[j],
+			std::clamp(ctrl.loss.last, 0.0f, 99.99f),
+			std::clamp(ctrl.perp.last, 0.0f, 100.0f),
+			std::clamp(ctrl.cert.last * 100.0f, 0.0f, 100.0f),
+			vocab::debug(dbuf, tok).c_str(),
+			tok,
 			pretty(tmbuf[0], milliseconds(last_time / bsz), 1).c_str(),
 			pretty(tmbuf[1], si(cycles / bsz), 1).c_str(),
-			pretty(tmbuf[2], milliseconds(ctrl.elapsed), 1).c_str()
+			pretty(tmbuf[2], milliseconds(ctrl.epic.elapsed), 1).c_str()
 		);
 
 		log::logf
@@ -283,19 +221,6 @@ ircd::gpt::generate(const vector_view<u16> &out,
 		};
 	}
 
-	ret = ctrl.tokens - in.size();
-	if ((false)) for(uint i(0); i < 3; ++i)
-		if(accc_thresh[i] && ctrl.accept_seq[i] >= accc_thresh[i])
-		{
-			ret -= (3 - accc_thresh[i]);
-			break;
-		}
-		else if(errc_thresh[i] && ctrl.error_seq[i] >= errc_thresh[i])
-		{
-			ret -= (3 - errc_thresh[i]);
-			break;
-		}
-
 	ctx::interruption_point();
 	return vector_view<u16>
 	{
@@ -689,6 +614,7 @@ ircd::gpt::gelu(f32x4 &out,
 // backside
 //
 
+[[gnu::noinline]]
 size_t
 ircd::gpt::backprop(task &task,
                     const f32 grad,
@@ -792,6 +718,7 @@ ircd::gpt::backprop(task &task,
 	return off;
 }
 
+[[gnu::noinline]]
 size_t
 ircd::gpt::adamw(task &task,
                  const f32 grad,
@@ -820,7 +747,7 @@ ircd::gpt::adamw(task &task,
 	};
 
 	for(uint i(0); i < num / 4; ++i)
-		off = adamw(p[0][i], p[1][i], p[2][i], grad, opts.alpha, opts.beta[0], opts.beta[1], ctrl.step, off);
+		off = adamw(p[0][i], p[1][i], p[2][i], grad, opts.alpha, opts.beta[0], opts.beta[1], ctrl.epic.step, off);
 
 	return off;
 }
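The alpha/beta/epsilon/step values threaded through the adamw() call above are the usual Adam-family hyperparameters. As a reminder of what they conventionally compute, here is a textbook sketch, not necessarily the exact arithmetic of `ircd::gpt::adamw()`:

	#include <cmath>

	// One conventional Adam-style update for a single parameter with moment
	// accumulators m and v; `step` is a 0-based counter used for bias correction.
	static void
	adam_step(float &param, float &m, float &v,
	          const float grad, const float alpha,
	          const float beta1, const float beta2,
	          const float epsilon, const unsigned long step)
	{
		m = beta1 * m + (1.0f - beta1) * grad;            // first moment
		v = beta2 * v + (1.0f - beta2) * grad * grad;     // second moment

		const float m_hat(m / (1.0f - std::pow(beta1, step + 1)));
		const float v_hat(v / (1.0f - std::pow(beta2, step + 1)));

		param -= alpha * m_hat / (std::sqrt(v_hat) + epsilon);
	}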
@@ -915,19 +842,111 @@ noexcept
 }
 
 //
-// hypercall
+// gpt::opts
 //
 
-ircd::string_view
-ircd::gpt::reflect(const enum ircd_gpt_hypercall code)
+ircd_gpt_opts::ircd_gpt_opts(const ircd::gpt::model::decoder *const model)
 noexcept
+:model{model}
+,limit{-1U}
+,top_k{2U}
+,context_tokens{1024U}
+,buffer_tokens{1024U}
+,embed_elems{768U}
+,attn_mult{3U}
+,ffnn_mult{4U}
+,attn_elems{embed_elems * attn_mult}
+,ffnn_elems{embed_elems * ffnn_mult}
+,lanes{4U}
+,embed_width{embed_elems / lanes}
+,attn_width{attn_elems / lanes}
+,attn_height{embed_elems / lanes}
+,ffnn_width{ffnn_elems / lanes}
+,ffnn_height{embed_elems / lanes}
+,logits{50257}
+,seed{1234567890UL}
+,training_steps{250000}
+,validation_steps{5000}
+,label{198}
+,alpha{0.001f}
+,beta{0.9f, 0.999f}
+,epsilon{0.000001}
+,gates{0}
 {
-	switch(code)
-	{
-		case IRCD_GPT_ACCEPT:     return "ACCEPT";
-		case IRCD_GPT_ECOMPLETE:  return "ECOMPLETE";
-		case IRCD_GPT_ETOKENS:    return "ETOKENS";
-	}
-
-	return "??????";
 }
@@ -432,7 +432,7 @@ _ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
                    const uint word_idx)
 {
 	const ushort
-		ring_idx = (ctrl->head + tok_idx) % opts->buffer_tokens,
+		ring_idx = (ctrl->tokens.head + tok_idx) % opts->buffer_tokens,
 		token = ctrl->token[ring_idx];
 
 	const float4
@@ -454,7 +454,7 @@ ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
 		wi = get_group_id(0),
 		wn = get_num_groups(0);
 
-	for(uint i = 0; i < ctrl->tokens; ++i)
+	for(uint i = 0; i < ctrl->tokens.count; ++i)
 		if(i % wn == wi)
 			_ircd_gpt_lm_embed(ctrl, opts, accum, pos, vocab, i, i, li);
 }
@@ -492,7 +492,7 @@ ircd_gpt_lm_logit(__global const struct ircd_gpt_task *const ctrl,
 {
 	const uint
 		gi = get_global_id(0),
-		ti = ctrl->tokens - 1,
+		ti = ctrl->tokens.count - 1,
 		words = opts->embed_width;
 
 	float4 acc = 0.0f;
@@ -596,31 +596,16 @@ ircd_gpt_leave(__global struct ircd_gpt_task *const ctrl,
                __constant const struct ircd_gpt_opts *const opts,
                const uint li)
 {
-	// If the call value has been set to something other than default we
-	// do nothing else here.
-	if(ctrl->call != IRCD_GPT_ECOMPLETE)
-		return;
-
 	// No action for other threads right now
 	if(li != 0)
 		return;
 
-	// Run debug checks and assertions.
-	#ifdef RB_DEBUG
-	if(ctrl->call == IRCD_GPT_ECOMPLETE)
-		if(ctrl->tokens < 2)
-			ctrl->call = IRCD_GPT_ETOKENS;
-	#endif
-
 	// On the last cycle, with no prior call or error code set, indicate
 	// a nominal exit condition.
-	if(ctrl->cycle + 1 >= opts->limit)
-	{
-		ctrl->call = IRCD_GPT_ACCEPT;
-		ctrl->epoch += 1;
-	}
+	if(ctrl->epic.cycle + 1 >= opts->limit)
+		ctrl->epic.epoch += 1;
 
-	ctrl->cycle += 1;
+	ctrl->epic.cycle += 1;
 	ctrl->magic = 0xC7012C70U;
 }
@@ -634,10 +619,6 @@ ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl,
                    __global const float *const restrict logexp,
                    __global const float *const restrict logit)
 {
-	// When the hypercall code is already set, bail here.
-	if(ctrl->call != IRCD_GPT_ECOMPLETE)
-		return;
-
 	// To read from cells other than idx[0] we need this barrier.
 	if(opts->top_k > 1)
 		barrier(CLK_LOCAL_MEM_FENCE);
@@ -647,7 +628,7 @@ ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl,
 		return;
 
 	const bool
-		buffer_full = ctrl->tokens >= opts->buffer_tokens;
+		buffer_full = ctrl->tokens.count >= opts->buffer_tokens;
 
 	const ulong
 		rnd = opts->top_k > 1?
@@ -657,20 +638,20 @@ ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl,
 		entro = max(opts->top_k, 1U),
 		select = rnd % entro,
 		token = idx[select],
-		dest = (ctrl->head + ctrl->tokens) % opts->buffer_tokens,
-		tokens = min(ctrl->tokens + 1, opts->buffer_tokens),
+		dest = (ctrl->tokens.head + ctrl->tokens.count) % opts->buffer_tokens,
+		tokens = min(ctrl->tokens.count + 1, opts->buffer_tokens),
 		head = buffer_full?
-			(ctrl->head + 1) % opts->buffer_tokens: ctrl->head;
+			(ctrl->tokens.head + 1) % opts->buffer_tokens: ctrl->tokens.head;
 
-	ctrl->head = head;
-	ctrl->tokens = tokens;
+	ctrl->tokens.head = head;
+	ctrl->tokens.count = tokens;
 	ctrl->token[dest] = token;
 
 	const ushort
 		ln = get_local_size(0),
 		next_select = (select + 1) % ln,
 		next_token = idx[next_select],
-		sum_sel = ctrl->epoch % 3;
+		sum_sel = ctrl->epic.epoch % 3;
 
 	const float
 		test_lsm = logexp[opts->label],
@@ -737,7 +718,7 @@ ircd_gpt_prop_elem(__global const struct ircd_gpt_task *const ctrl,
 {
 	const uint
 		li = get_local_id(0),
-		step = ctrl->step;
+		step = ctrl->epic.step;
 
 	const float4
 		param = param_[li],
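The selection step in `ircd_gpt_lm_result()` above reduces to: choose uniformly among the best `top_k` candidates, or take the single best when `top_k` is 1. A host-side paraphrase follows as a sketch; the best-first ordering of `idx[]` and the PRNG draw are assumptions carried over from the kernel's context, not an API of the source.

	static ushort
	select_token(const ushort *const idx,   // candidate tokens, assumed best-first
	             const uint top_k,
	             const ulong rnd)           // one PRNG output, e.g. from xoshiro256
	{
		const ulong entro(top_k > 1? top_k: 1U);
		const ulong select(rnd % entro);
		return idx[select];
	}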
@@ -129,12 +129,11 @@ ircd::gpt::generate(task &task)
 		*task.ctrl
 	};
 
-	ctrl.cycle = 0;
-	ctrl.call = IRCD_GPT_ECOMPLETE;
-	ctrl.host_tsc = prof::cycles();
-	volatile const size_t tokens(ctrl.tokens);
-	volatile const auto epoch(ctrl.epoch);
-	volatile size_t cycle(ctrl.cycle);
+	ctrl.epic.cycle = 0;
+	ctrl.epic.host_tsc = prof::cycles();
+	volatile const size_t tokens(ctrl.tokens.count);
+	volatile const auto epoch(ctrl.epic.epoch);
+	volatile size_t cycle(ctrl.epic.cycle);
 
 	std::deque<pipe::exec> list;
 	for(; cycle < opts.limit; ++cycle)
@@ -151,8 +150,7 @@ ircd::gpt::generate(task &task)
 			task, tokens + cycle, rel, acq
 		);
 
-		// Conditions for a cl::flush here; this is not default but
-		// may be configured to improve some workloads.
+		// Conditions for a cl::flush here
 		const bool flush
 		{
 			// Flushing here is enabled by the configuration
@@ -194,18 +192,8 @@ ircd::gpt::generate(task &task)
 	list.clear();
 
 	assert(ctrl.magic == 0xC7012C70);
+	assert(ctrl.epic.cycle == cycle || ctx::interruption_requested());
 	this_ctx::interruption_point();
-
-	// Interp error codes
-	if(unlikely(ctrl.call <= 0))
-		throw error
-		{
-			"hyper (#%d) :%s",
-			abs(int(ctrl.call)),
-			reflect(ctrl.call),
-		};
-
-	assert(ctrl.cycle == cycle || ctx::interruption_requested());
 }
 
 void