mirror of
https://github.com/matrix-construct/construct
synced 2024-05-19 19:33:45 +02:00
ircd::gpt: Various refactoring.
This commit is contained in:
parent
31e078506a
commit
78848925ee
|
@ -11,89 +11,71 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
#define HAVE_IRCD_GPT_CTRL_H
|
#define HAVE_IRCD_GPT_CTRL_H
|
||||||
|
|
||||||
/// Epoch Precision Interrupt Controller
|
/// Result logit control block.
|
||||||
///
|
|
||||||
struct ircd_gpt_ctrl_epic
|
|
||||||
{
|
|
||||||
/// Accumulates the number of task cycles. The cycle counter is incremented
|
|
||||||
/// by device software after each repetition of the kernel pipeline to
|
|
||||||
/// produce one additional token.
|
|
||||||
ulong cycle;
|
|
||||||
|
|
||||||
/// Accumulates the epoch count for the task. The counter is incremented
|
|
||||||
/// by one in device software before control returns back to the host.
|
|
||||||
/// Several cycles may occur during each epoch.
|
|
||||||
ulong epoch;
|
|
||||||
|
|
||||||
/// Accumulates the training epoch count for the task. The counter is
|
|
||||||
/// incremented by one in device software for each backward propagation.
|
|
||||||
ulong step;
|
|
||||||
|
|
||||||
/// Updated by the host with the value of the timestamp register as sampled
|
|
||||||
/// immediately before each transfer of control to the device.
|
|
||||||
ulong host_tsc;
|
|
||||||
|
|
||||||
/// Accumulates time in microseconds elapsed for the task.
|
|
||||||
ulong elapsed;
|
|
||||||
};
|
|
||||||
|
|
||||||
/// Token Context Buffer (Control Block)
|
|
||||||
///
|
|
||||||
struct ircd_gpt_ctrl_tokens
|
|
||||||
{
|
|
||||||
/// Token ring head. Tokens in the ring extend behind the head for
|
|
||||||
/// `tokens`. The `head` value is automatically modulated by device
|
|
||||||
/// software to wrap around the ring.
|
|
||||||
uint head;
|
|
||||||
|
|
||||||
/// Token counter. The counter indicates the number of valid tokens in
|
|
||||||
/// the context buffer. This value must not exceed the buffer size.
|
|
||||||
uint count;
|
|
||||||
|
|
||||||
/// Accumulates the number of tokens produced by the task. Several tokens
|
|
||||||
/// may be produced each epoch, but currently only one token is produced
|
|
||||||
/// each cycle.
|
|
||||||
ulong produced;
|
|
||||||
|
|
||||||
/// Accumulates the number tokens witnessed by the task. The number of
|
|
||||||
/// tokens in the context for each cycle is counted as witnessed.
|
|
||||||
ulong witnessed;
|
|
||||||
};
|
|
||||||
|
|
||||||
/// Target label register (abridged)
|
|
||||||
///
|
|
||||||
struct ircd_gpt_ctrl_logit
|
struct ircd_gpt_ctrl_logit
|
||||||
{
|
{
|
||||||
/// Vocabulary token.
|
/// Vocabulary token.
|
||||||
ushort token;
|
ushort token;
|
||||||
|
|
||||||
/// Padding #0.
|
/// Padding #0.
|
||||||
ushort _pad0;
|
ushort flag;
|
||||||
|
|
||||||
/// Result logit softmax probability.
|
/// Result logit softmax probability.
|
||||||
float samax;
|
float samax;
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Target label register (full)
|
/// Target label control block. Results for each target are registered
|
||||||
///
|
/// and state is updated each cycle.
|
||||||
struct ircd_gpt_ctrl_label
|
struct ircd_gpt_ctrl_label
|
||||||
{
|
{
|
||||||
/// Vocabulary token.
|
/// Logit descriptor
|
||||||
ushort token;
|
struct ircd_gpt_ctrl_logit logit;
|
||||||
|
|
||||||
/// Padding #0.
|
|
||||||
ushort _pad0;
|
|
||||||
|
|
||||||
/// Result logit softmax probability.
|
|
||||||
float samax;
|
|
||||||
|
|
||||||
/// Loss state
|
/// Loss state
|
||||||
struct ircd_math_mean loss;
|
struct ircd_math_mean loss;
|
||||||
|
|
||||||
/// Perplexity state
|
/// Perplexity state
|
||||||
struct ircd_math_mean perp;
|
struct ircd_math_mean ppl;
|
||||||
}
|
};
|
||||||
__attribute__((aligned(64)));
|
|
||||||
|
/// Master clock
|
||||||
|
struct ircd_gpt_ctrl_clk
|
||||||
|
{
|
||||||
|
/// Master clock. The cycle count is incremented by one in device software
|
||||||
|
/// after each repetition of the kernels producing one additional token.
|
||||||
|
/// The cycle count resets to zero before the beginning of each sample.
|
||||||
|
uint cycle;
|
||||||
|
|
||||||
|
/// Master clock. Sample consists of one or more cycles; sample count is
|
||||||
|
/// incremented by one in device software after every accept condition,
|
||||||
|
/// growing monotonically for the `step`; resets to zero each `step`.
|
||||||
|
uint samp;
|
||||||
|
|
||||||
|
/// Master clock. Step (or timestep) consists of one or more samples. Step
|
||||||
|
/// count is incremented by one in device software after each backward
|
||||||
|
/// propagation. Step grows monotonically even across epochs.
|
||||||
|
uint step;
|
||||||
|
|
||||||
|
/// Master clock. Epoch consists of one or more steps; epoch count is
|
||||||
|
/// incremented by one after every backward propagation.
|
||||||
|
uint epoch;
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Profiling block
|
||||||
|
struct ircd_gpt_ctrl_prof
|
||||||
|
{
|
||||||
|
/// Host timestamp sampled at last control page transfer to the device.
|
||||||
|
ulong released;
|
||||||
|
|
||||||
|
/// Host timestamp sampled when this control page accquired by the host.
|
||||||
|
ulong acquired;
|
||||||
|
|
||||||
|
/// Device timestamp at beginning of cycle.
|
||||||
|
ulong entered;
|
||||||
|
|
||||||
|
/// Device timestamp at end of cycle.
|
||||||
|
ulong finished;
|
||||||
|
};
|
||||||
|
|
||||||
/// Task Control Page
|
/// Task Control Page
|
||||||
///
|
///
|
||||||
|
@ -104,50 +86,94 @@ __attribute__((aligned(64)));
|
||||||
///
|
///
|
||||||
struct ircd_gpt_ctrl
|
struct ircd_gpt_ctrl
|
||||||
{
|
{
|
||||||
/// Epoch counting & interrupt control block.
|
/// Accept register. If >= 0 the cycle produced a token which satisfies the
|
||||||
struct ircd_gpt_ctrl_epic epic;
|
/// indicated accept condition.
|
||||||
|
int accept;
|
||||||
|
|
||||||
/// Token context control block. Contains state for the token context
|
/// Dispatch register. Device software wishes additional cycles to be
|
||||||
/// buffer; the buffer with the tokens themselves is elsewhere.
|
/// commanded by the host. Effectively minimum distance until next accept.
|
||||||
struct ircd_gpt_ctrl_tokens tokens;
|
uint dispatch;
|
||||||
|
|
||||||
/// Top result summary from the softed result logit softmax vector. This
|
/// Token counter. The counter indicates the number of valid tokens in
|
||||||
/// is updated each cycle by device software with extended statistics on
|
/// the context buffer. This value must not exceed the opts.buffer_size.
|
||||||
/// the top N results.
|
/// This value should not exceed the opts.context_size at least for now.
|
||||||
struct ircd_gpt_ctrl_logit top[16];
|
uint count;
|
||||||
|
|
||||||
/// Target label control block. Results for each target are registered
|
/// Token counter. The counter indicates the number of valid tokens in
|
||||||
/// and state is updated each cycle.
|
/// the context buffer. This value must not exceed the opts.buffer_size.
|
||||||
struct ircd_gpt_ctrl_label label[4];
|
/// This value should not exceed the opts.context_size at least for now.
|
||||||
|
uint tokens;
|
||||||
|
|
||||||
/// Result logit vector softmax internal state.
|
/// Master clock.
|
||||||
struct ircd_math_samax samax;
|
struct ircd_gpt_ctrl_clk clk;
|
||||||
|
|
||||||
|
/// Profiling related.
|
||||||
|
struct ircd_gpt_ctrl_prof prof;
|
||||||
|
|
||||||
/// PRNG xoshiro256 internal state (note: see opts.h to seed the prng).
|
/// PRNG xoshiro256 internal state (note: see opts.h to seed the prng).
|
||||||
ulong rand[4];
|
ulong rand[4];
|
||||||
|
|
||||||
/// Perform backprop TODO: XXX
|
/// Top result summary from the softed result logit softmax vector. This
|
||||||
bool prop;
|
/// is updated each cycle by device software with extended statistics on
|
||||||
|
/// the top N results.
|
||||||
|
struct ircd_gpt_ctrl_logit top[16] __attribute__((aligned(8)));
|
||||||
|
|
||||||
/// Header magic 0xC7012C70
|
/// User label control block. Results for each target are registered
|
||||||
uint magic;
|
/// and state is updated each cycle; averaged for each step.
|
||||||
|
struct ircd_gpt_ctrl_label label[14] __attribute__((aligned(64)));
|
||||||
|
|
||||||
/// The token buffer starts at offset 2048 and continues to the end of
|
/// Target result label; traces training token.
|
||||||
/// the page; options specify the size of the tokens buffer in tokens.
|
struct ircd_gpt_ctrl_label target __attribute__((aligned(64)));
|
||||||
/// Additional pages must be attached for larger buffer sizes.
|
|
||||||
ushort token[] __attribute__((aligned(2048)));
|
/// Selected result token label.
|
||||||
|
struct ircd_gpt_ctrl_label select __attribute__((aligned(64)));
|
||||||
|
|
||||||
|
/// Incremented when the target is the selected token.
|
||||||
|
uint hit, miss;
|
||||||
|
|
||||||
|
/// Attention summary; [layer][head] => [token]. Each value points to a
|
||||||
|
/// position in the token buffer. The top-scoring softmax result for each
|
||||||
|
/// head in each layer is attending to the token.at(value) for this cycle.
|
||||||
|
/// These values are completely updated every cycle.
|
||||||
|
ushort attn[12][12];
|
||||||
|
|
||||||
|
/// Header magic: host sets 0xDEADBEEF before release to device; device
|
||||||
|
/// sets 0xC7012C7012 before release to host.
|
||||||
|
ulong magic;
|
||||||
|
|
||||||
|
/// Token buffer
|
||||||
|
ushort token[1024] __attribute__((aligned(2048)));
|
||||||
}
|
}
|
||||||
__attribute__((aligned(4096)));
|
__attribute__((aligned(4096)));
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#if defined(__cplusplus)
|
||||||
namespace ircd::gpt
|
namespace ircd::gpt
|
||||||
{
|
{
|
||||||
using ctrl = struct ircd_gpt_ctrl;
|
using ctrl = struct ircd_gpt_ctrl;
|
||||||
|
using ctrl_clk = struct ircd_gpt_ctrl_clk;
|
||||||
|
using ctrl_prof = struct ircd_gpt_ctrl_prof;
|
||||||
|
using ctrl_logit = struct ircd_gpt_ctrl_logit;
|
||||||
|
using ctrl_label = struct ircd_gpt_ctrl_label;
|
||||||
|
|
||||||
|
string_view debug_token_at(const mutable_buffer &, const opts &, const ctrl &, const uint, const uint fmt = -1U);
|
||||||
|
string_view debug_token(const mutable_buffer &, const opts &, const ctrl &, const uint fmt = -1U);
|
||||||
|
string_view debug_head(const mutable_buffer &, const opts &, const ctrl_clk &);
|
||||||
|
string_view debug_head(const mutable_buffer &, const opts &, const ctrl &);
|
||||||
|
string_view debug(const mutable_buffer &, const opts &, const ctrl_logit &, const uint fmt = 0);
|
||||||
|
string_view debug(const mutable_buffer &, const opts &, const ctrl_label &, const uint fmt = 0);
|
||||||
|
string_view debug(const mutable_buffer &, const opts &, const ctrl &);
|
||||||
|
|
||||||
|
string_view debug_attn(const mutable_buffer &, const opts &, const ctrl &, const uint);
|
||||||
|
string_view debug_label(const mutable_buffer &, const opts &, const ctrl &, const uint, const uint fmt = 0);
|
||||||
|
string_view debug_top(const mutable_buffer &, const opts &, const ctrl &, const uint);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#if defined(__cplusplus)
|
||||||
static_assert(sizeof(struct ircd_gpt_ctrl) == 4096);
|
static_assert(sizeof(struct ircd_gpt_ctrl) % 4096 == 0);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__cplusplus) && defined(__GLIBCXX__)
|
||||||
static_assert(offsetof(struct ircd_gpt_ctrl, token) == 2048);
|
static_assert(offsetof(struct ircd_gpt_ctrl, token) == 2048);
|
||||||
static_assert(std::is_standard_layout<struct ircd_gpt_ctrl>::value);
|
static_assert(std::is_standard_layout<struct ircd_gpt_ctrl>::value);
|
||||||
#endif
|
#endif
|
||||||
|
|
38
include/ircd/gpt/epoch.h
Normal file
38
include/ircd/gpt/epoch.h
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
// Matrix Construct
|
||||||
|
//
|
||||||
|
// Copyright (C) Matrix Construct Developers, Authors & Contributors
|
||||||
|
// Copyright (C) 2016-2022 Jason Volk <jason@zemos.net>
|
||||||
|
//
|
||||||
|
// Permission to use, copy, modify, and/or distribute this software for any
|
||||||
|
// purpose with or without fee is hereby granted, provided that the above
|
||||||
|
// copyright notice and this permission notice is present in all copies. The
|
||||||
|
// full license for this software is available in the LICENSE file.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
#define HAVE_IRCD_GPT_EPOCH_H
|
||||||
|
|
||||||
|
/// Perform one task epoch on the device.
|
||||||
|
///
|
||||||
|
struct ircd::gpt::epoch
|
||||||
|
{
|
||||||
|
gpt::task &task;
|
||||||
|
pipe::desc &desc;
|
||||||
|
|
||||||
|
const gpt::opts &opts;
|
||||||
|
gpt::ctrl &ctrl;
|
||||||
|
|
||||||
|
const uint id;
|
||||||
|
const size_t start, stop;
|
||||||
|
f32 *moment[2];
|
||||||
|
|
||||||
|
pipe::prof profile;
|
||||||
|
|
||||||
|
void profile_accumulate(const pipe::prof &);
|
||||||
|
|
||||||
|
public:
|
||||||
|
bool done() const noexcept;
|
||||||
|
bool operator()();
|
||||||
|
|
||||||
|
epoch(gpt::task &);
|
||||||
|
~epoch() noexcept;
|
||||||
|
};
|
|
@ -17,16 +17,36 @@ namespace ircd::gpt
|
||||||
{
|
{
|
||||||
IRCD_EXCEPTION(ircd::error, error)
|
IRCD_EXCEPTION(ircd::error, error)
|
||||||
|
|
||||||
|
struct samp;
|
||||||
|
struct step;
|
||||||
|
struct epoch;
|
||||||
struct task;
|
struct task;
|
||||||
|
|
||||||
extern log::log log;
|
extern log::log log;
|
||||||
}
|
}
|
||||||
|
|
||||||
#include "vocab.h"
|
#include "vocab.h"
|
||||||
#include "model.h"
|
|
||||||
#include "token.h"
|
#include "token.h"
|
||||||
|
#include "vector.h"
|
||||||
|
#include "model.h"
|
||||||
#include "opts.h"
|
#include "opts.h"
|
||||||
#include "ctrl.h"
|
#include "ctrl.h"
|
||||||
#include "task.h"
|
|
||||||
#include "pipe/pipe.h"
|
#include "pipe/pipe.h"
|
||||||
|
#include "samp.h"
|
||||||
|
#include "step.h"
|
||||||
|
#include "epoch.h"
|
||||||
|
#include "task.h"
|
||||||
#include "generate.h"
|
#include "generate.h"
|
||||||
|
|
||||||
|
namespace ircd::gpt
|
||||||
|
{
|
||||||
|
void backprop(const opts &, const u32, const f32, model::decoder &, f32 *const __restrict__ [2]) noexcept;
|
||||||
|
|
||||||
|
void log_debug(const opts &, const ctrl &);
|
||||||
|
void log_debug_token(const opts &, const ctrl &, const uint);
|
||||||
|
void log_debug_attns(const opts &, const ctrl &);
|
||||||
|
void log_debug_attns_top(const opts &, const ctrl &);
|
||||||
|
void log_debug_labels(const opts &, const ctrl &);
|
||||||
|
void log_debug_topn(const opts &, const ctrl &);
|
||||||
|
void log_debug_prof(const opts &, const ctrl &, const pipe::prof &);
|
||||||
|
}
|
||||||
|
|
290
include/ircd/gpt/gpu.h
Normal file
290
include/ircd/gpt/gpu.h
Normal file
|
@ -0,0 +1,290 @@
|
||||||
|
// Matrix Construct
|
||||||
|
//
|
||||||
|
// Copyright (C) Matrix Construct Developers, Authors & Contributors
|
||||||
|
// Copyright (C) 2016-2022 Jason Volk <jason@zemos.net>
|
||||||
|
//
|
||||||
|
// Permission to use, copy, modify, and/or distribute this software for any
|
||||||
|
// purpose with or without fee is hereby granted, provided that the above
|
||||||
|
// copyright notice and this permission notice is present in all copies. The
|
||||||
|
// full license for this software is available in the LICENSE file.
|
||||||
|
|
||||||
|
typedef union ircd_gpt_vector_f32x4 ircd_gpt_vectorv __attribute__((aligned(4096)));
|
||||||
|
typedef struct ircd_gpt_attn_qkv_f32x4 ircd_gpt_attn_qkvv __attribute__((aligned(4096)));
|
||||||
|
typedef union ircd_gpt_attn_aperature_f32x4 ircd_gpt_attn_aperaturev __attribute__((aligned(4096)));
|
||||||
|
typedef union ircd_gpt_ffnn_aperature_f32x4 ircd_gpt_ffnn_aperaturev __attribute__((aligned(4096)));
|
||||||
|
|
||||||
|
//
|
||||||
|
// Frontside
|
||||||
|
//
|
||||||
|
|
||||||
|
void
|
||||||
|
__attribute__((internal_linkage))
|
||||||
|
ircd_gpt_norm_fmad(__local ircd_gpt_vectorv *out,
|
||||||
|
__local const ircd_gpt_vectorv *in,
|
||||||
|
__global const ircd_gpt_vectorv *bias,
|
||||||
|
__global const ircd_gpt_vectorv *weight,
|
||||||
|
const uint li);
|
||||||
|
|
||||||
|
void
|
||||||
|
__attribute__((internal_linkage))
|
||||||
|
ircd_gpt_norm(__local ircd_gpt_vectorv *out,
|
||||||
|
__local const ircd_gpt_vectorv *in,
|
||||||
|
__local ircd_gpt_vectorv *tmp,
|
||||||
|
__global const ircd_gpt_vectorv *bias,
|
||||||
|
__global const ircd_gpt_vectorv *weight,
|
||||||
|
const uint ln,
|
||||||
|
const uint li);
|
||||||
|
|
||||||
|
void
|
||||||
|
__attribute__((internal_linkage))
|
||||||
|
ircd_gpt_ffnn_gelu(__local ircd_gpt_ffnn_aperaturev *out,
|
||||||
|
__local const ircd_gpt_ffnn_aperaturev *in,
|
||||||
|
const uint i);
|
||||||
|
|
||||||
|
void
|
||||||
|
__attribute__((internal_linkage))
|
||||||
|
ircd_gpt_ffnn_fcon_tmul(__constant const struct ircd_gpt_opts *opts,
|
||||||
|
__local ircd_gpt_ffnn_aperaturev *out,
|
||||||
|
__local const ircd_gpt_vectorv *in,
|
||||||
|
__global const ircd_gpt_ffnn_aperaturev *bias,
|
||||||
|
__global const ircd_gpt_ffnn_aperaturev *weight,
|
||||||
|
const uint li);
|
||||||
|
|
||||||
|
void
|
||||||
|
__attribute__((internal_linkage))
|
||||||
|
ircd_gpt_ffnn_fcon(__global const struct ircd_gpt_ctrl *ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *opts,
|
||||||
|
__local ircd_gpt_ffnn_aperaturev *out,
|
||||||
|
__local const ircd_gpt_vectorv *in,
|
||||||
|
__global const ircd_gpt_ffnn_aperaturev *bias,
|
||||||
|
__global const ircd_gpt_ffnn_aperaturev *weight,
|
||||||
|
const uint ln,
|
||||||
|
const uint li);
|
||||||
|
|
||||||
|
void
|
||||||
|
__attribute__((internal_linkage))
|
||||||
|
ircd_gpt_ffnn_proj_tmul(__constant const struct ircd_gpt_opts *opts,
|
||||||
|
__local ircd_gpt_vectorv *out,
|
||||||
|
__local const ircd_gpt_ffnn_aperaturev *in,
|
||||||
|
__global const ircd_gpt_vectorv *bias,
|
||||||
|
__global const ircd_gpt_vectorv *weight,
|
||||||
|
const uint li);
|
||||||
|
|
||||||
|
void
|
||||||
|
__attribute__((internal_linkage))
|
||||||
|
ircd_gpt_ffnn(__global const struct ircd_gpt_ctrl *ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *opts,
|
||||||
|
__local ircd_gpt_vectorv *vector,
|
||||||
|
__local ircd_gpt_ffnn_aperaturev *buf,
|
||||||
|
__local ircd_gpt_vectorv *tmp,
|
||||||
|
__global const ircd_gpt_vectorv *norm_bias,
|
||||||
|
__global const ircd_gpt_vectorv *norm_weight,
|
||||||
|
__global const ircd_gpt_ffnn_aperaturev *fcon_bias,
|
||||||
|
__global const ircd_gpt_ffnn_aperaturev *fcon_weight,
|
||||||
|
__global const ircd_gpt_vectorv *proj_bias,
|
||||||
|
__global const ircd_gpt_vectorv *proj_weight,
|
||||||
|
const uint ln,
|
||||||
|
const uint li);
|
||||||
|
|
||||||
|
void
|
||||||
|
__attribute__((internal_linkage))
|
||||||
|
ircd_gpt_attn_fcon_tmul(__constant const struct ircd_gpt_opts *opts,
|
||||||
|
__local ircd_gpt_attn_aperaturev *out,
|
||||||
|
__local const ircd_gpt_vectorv *in,
|
||||||
|
__global const ircd_gpt_attn_aperaturev *bias,
|
||||||
|
__global const ircd_gpt_attn_aperaturev *weight,
|
||||||
|
const uint ln,
|
||||||
|
const uint li);
|
||||||
|
|
||||||
|
__kernel void
|
||||||
|
__attribute__((visibility("protected")))
|
||||||
|
ircd_gpt_attn_fcon(__global const struct ircd_gpt_ctrl *ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *opts,
|
||||||
|
__private const uint layer,
|
||||||
|
__global ircd_gpt_attn_aperaturev *state,
|
||||||
|
__global const ircd_gpt_vectorv *accum,
|
||||||
|
__global const ircd_gpt_vectorv *norm_bias,
|
||||||
|
__global const ircd_gpt_vectorv *norm_weight,
|
||||||
|
__global const ircd_gpt_attn_aperaturev *fcon_bias,
|
||||||
|
__global const ircd_gpt_attn_aperaturev *fcon_weight);
|
||||||
|
|
||||||
|
//
|
||||||
|
// head
|
||||||
|
//
|
||||||
|
|
||||||
|
void
|
||||||
|
__attribute__((internal_linkage))
|
||||||
|
_ircd_gpt_lm_embed(__global const struct ircd_gpt_ctrl *ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *opts,
|
||||||
|
__global ircd_gpt_vectorv *accum,
|
||||||
|
__global const ircd_gpt_vectorv *pos,
|
||||||
|
__global const ircd_gpt_vectorv *vocab,
|
||||||
|
const ushort out_idx,
|
||||||
|
const ushort tok_idx,
|
||||||
|
const ushort word_idx);
|
||||||
|
|
||||||
|
__kernel void
|
||||||
|
__attribute__((visibility("protected")))
|
||||||
|
ircd_gpt_lm_embed(__global const struct ircd_gpt_ctrl *ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *opts,
|
||||||
|
__global ircd_gpt_vectorv *accum,
|
||||||
|
__global const ircd_gpt_vectorv *pos,
|
||||||
|
__global const ircd_gpt_vectorv *vocab);
|
||||||
|
|
||||||
|
__kernel void
|
||||||
|
__attribute__((visibility("protected")))
|
||||||
|
ircd_gpt_lm_norm(__global const struct ircd_gpt_ctrl *ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *opts,
|
||||||
|
__global ircd_gpt_vectorv *accum,
|
||||||
|
__global const ircd_gpt_vectorv *norm_bias,
|
||||||
|
__global const ircd_gpt_vectorv *norm_weight);
|
||||||
|
|
||||||
|
__kernel void
|
||||||
|
__attribute__((visibility("protected")))
|
||||||
|
ircd_gpt_lm_logit(__global const struct ircd_gpt_ctrl *ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *opts,
|
||||||
|
__global float *logit,
|
||||||
|
__global const ircd_gpt_vectorv *accum,
|
||||||
|
__global const ircd_gpt_vectorv *pos,
|
||||||
|
__global const ircd_gpt_vectorv *vocab);
|
||||||
|
|
||||||
|
__kernel void
|
||||||
|
__attribute__((visibility("protected")))
|
||||||
|
ircd_gpt_lm_logsm(__global struct ircd_gpt_ctrl *ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *opts,
|
||||||
|
__global float logit[65536]);
|
||||||
|
|
||||||
|
void
|
||||||
|
__attribute__((internal_linkage))
|
||||||
|
ircd_gpt_lm_result_top(__local struct ircd_gpt_ctrl *ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *opts,
|
||||||
|
__local const ushort *idx,
|
||||||
|
__global const float *logsm,
|
||||||
|
const uint i);
|
||||||
|
|
||||||
|
void
|
||||||
|
__attribute__((internal_linkage))
|
||||||
|
ircd_gpt_lm_result_label_mean(__local struct ircd_gpt_ctrl *ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *opts,
|
||||||
|
__local struct ircd_math_mean *mean,
|
||||||
|
const float last);
|
||||||
|
|
||||||
|
void
|
||||||
|
__attribute__((internal_linkage))
|
||||||
|
ircd_gpt_lm_result_label(__local struct ircd_gpt_ctrl *ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *opts,
|
||||||
|
__local struct ircd_gpt_ctrl_label *label,
|
||||||
|
__global const float *logsm);
|
||||||
|
|
||||||
|
ushort
|
||||||
|
__attribute__((internal_linkage))
|
||||||
|
ircd_gpt_lm_result_select(__local struct ircd_gpt_ctrl *ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *opts,
|
||||||
|
__local const ushort *idx,
|
||||||
|
__global const float *logsm);
|
||||||
|
|
||||||
|
uint
|
||||||
|
__attribute__((internal_linkage))
|
||||||
|
ircd_gpt_accept_len(__local struct ircd_gpt_ctrl *ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *opts,
|
||||||
|
const uint i);
|
||||||
|
|
||||||
|
uint
|
||||||
|
__attribute__((internal_linkage))
|
||||||
|
ircd_gpt_accept_match(__local struct ircd_gpt_ctrl *ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *opts,
|
||||||
|
const uint i);
|
||||||
|
|
||||||
|
int
|
||||||
|
__attribute__((internal_linkage))
|
||||||
|
ircd_gpt_accept_check(__local struct ircd_gpt_ctrl *ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *opts);
|
||||||
|
|
||||||
|
void
|
||||||
|
__attribute__((internal_linkage))
|
||||||
|
ircd_gpt_accept(__local struct ircd_gpt_ctrl *ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *opts);
|
||||||
|
|
||||||
|
//
|
||||||
|
// backside
|
||||||
|
//
|
||||||
|
|
||||||
|
void
|
||||||
|
__attribute__((internal_linkage))
|
||||||
|
ircd_gpt_prop_elem(__global const struct ircd_gpt_ctrl *ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *opts,
|
||||||
|
__global float4 *param_,
|
||||||
|
__global float4 *exp_avg_,
|
||||||
|
__global float4 *exp_avg_sqr_);
|
||||||
|
|
||||||
|
//
|
||||||
|
// backpropagations
|
||||||
|
//
|
||||||
|
|
||||||
|
__kernel void
|
||||||
|
__attribute__((visibility("protected")))
|
||||||
|
ircd_gpt_norm_prop(__global const struct ircd_gpt_ctrl *ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *opts,
|
||||||
|
__global ircd_gpt_vectorv *bias,
|
||||||
|
__global ircd_gpt_vectorv *bias_m0,
|
||||||
|
__global ircd_gpt_vectorv *bias_m1,
|
||||||
|
__global ircd_gpt_vectorv *weight,
|
||||||
|
__global ircd_gpt_vectorv *weight_m0,
|
||||||
|
__global ircd_gpt_vectorv *weight_m1);
|
||||||
|
|
||||||
|
__kernel void
|
||||||
|
__attribute__((visibility("protected")))
|
||||||
|
ircd_gpt_coil_prop_attn(__global const struct ircd_gpt_ctrl *ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *opts,
|
||||||
|
__global ircd_gpt_vectorv *norm_bias,
|
||||||
|
__global ircd_gpt_vectorv *norm_bias_m0,
|
||||||
|
__global ircd_gpt_vectorv *norm_bias_m1,
|
||||||
|
__global ircd_gpt_vectorv *norm_weight,
|
||||||
|
__global ircd_gpt_vectorv *norm_weight_m0,
|
||||||
|
__global ircd_gpt_vectorv *norm_weight_m1,
|
||||||
|
__global ircd_gpt_attn_aperaturev *fcon_bias,
|
||||||
|
__global ircd_gpt_attn_aperaturev *fcon_bias_m0,
|
||||||
|
__global ircd_gpt_attn_aperaturev *fcon_bias_m1,
|
||||||
|
__global ircd_gpt_attn_aperaturev *fcon_weight,
|
||||||
|
__global ircd_gpt_attn_aperaturev *fcon_weight_m0,
|
||||||
|
__global ircd_gpt_attn_aperaturev *fcon_weight_m1,
|
||||||
|
__global ircd_gpt_vectorv *proj_bias,
|
||||||
|
__global ircd_gpt_vectorv *proj_bias_m0,
|
||||||
|
__global ircd_gpt_vectorv *proj_bias_m1,
|
||||||
|
__global ircd_gpt_vectorv *proj_weight,
|
||||||
|
__global ircd_gpt_vectorv *proj_weight_m0,
|
||||||
|
__global ircd_gpt_vectorv *proj_weight_m1);
|
||||||
|
|
||||||
|
__kernel void
|
||||||
|
__attribute__((visibility("protected")))
|
||||||
|
ircd_gpt_coil_prop_ffnn(__global const struct ircd_gpt_ctrl *ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *opts,
|
||||||
|
__global ircd_gpt_vectorv *norm_bias,
|
||||||
|
__global ircd_gpt_vectorv *norm_bias_m0,
|
||||||
|
__global ircd_gpt_vectorv *norm_bias_m1,
|
||||||
|
__global ircd_gpt_vectorv *norm_weight,
|
||||||
|
__global ircd_gpt_vectorv *norm_weight_m0,
|
||||||
|
__global ircd_gpt_vectorv *norm_weight_m1,
|
||||||
|
__global ircd_gpt_ffnn_aperaturev *fcon_bias,
|
||||||
|
__global ircd_gpt_ffnn_aperaturev *fcon_bias_m0,
|
||||||
|
__global ircd_gpt_ffnn_aperaturev *fcon_bias_m1,
|
||||||
|
__global ircd_gpt_ffnn_aperaturev *fcon_weight,
|
||||||
|
__global ircd_gpt_ffnn_aperaturev *fcon_weight_m0,
|
||||||
|
__global ircd_gpt_ffnn_aperaturev *fcon_weight_m1,
|
||||||
|
__global ircd_gpt_vectorv *proj_bias,
|
||||||
|
__global ircd_gpt_vectorv *proj_bias_m0,
|
||||||
|
__global ircd_gpt_vectorv *proj_bias_m1,
|
||||||
|
__global ircd_gpt_vectorv *proj_weight,
|
||||||
|
__global ircd_gpt_vectorv *proj_weight_m0,
|
||||||
|
__global ircd_gpt_vectorv *proj_weight_m1);
|
||||||
|
|
||||||
|
__kernel void
|
||||||
|
__attribute__((visibility("protected")))
|
||||||
|
ircd_gpt_lm_embed_prop(__global const struct ircd_gpt_ctrl *ctrl,
|
||||||
|
__constant const struct ircd_gpt_opts *opts,
|
||||||
|
__global ircd_gpt_vectorv *pos,
|
||||||
|
__global ircd_gpt_vectorv *pos_m0,
|
||||||
|
__global ircd_gpt_vectorv *pos_m1,
|
||||||
|
__global ircd_gpt_vectorv *token,
|
||||||
|
__global ircd_gpt_vectorv *token_m0,
|
||||||
|
__global ircd_gpt_vectorv *token_m1);
|
|
@ -20,66 +20,105 @@ namespace ircd::gpt::model
|
||||||
struct embed;
|
struct embed;
|
||||||
struct decoder;
|
struct decoder;
|
||||||
|
|
||||||
constexpr auto align {64};
|
struct prop;
|
||||||
|
struct text;
|
||||||
|
|
||||||
extern decoder *default_model;
|
extern decoder *default_model;
|
||||||
|
extern float *default_moment[2];
|
||||||
|
extern float *default_checkpoint[3];
|
||||||
extern string_view default_dataset;
|
extern string_view default_dataset;
|
||||||
extern std::vector<json::object> default_data;
|
extern std::vector<json::object> default_data;
|
||||||
|
|
||||||
|
constexpr auto alignment {4096};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Layer normalization
|
||||||
|
struct ircd::gpt::model::norm
|
||||||
|
{
|
||||||
|
union ircd_gpt_vector
|
||||||
|
bias alignas(alignment),
|
||||||
|
weight alignas(alignment);
|
||||||
|
};
|
||||||
|
|
||||||
/// Attention aperature
|
/// Attention aperature
|
||||||
struct ircd::gpt::model::attn
|
struct ircd::gpt::model::attn
|
||||||
{
|
{
|
||||||
float
|
model::norm
|
||||||
attn_bias alignas(align) [2304],
|
norm;
|
||||||
attn_weight alignas(align) [768][2304];
|
|
||||||
|
|
||||||
float
|
union ircd_gpt_attn_aperature
|
||||||
proj_bias alignas(align) [768],
|
fcon_bias alignas(alignment),
|
||||||
proj_weight alignas(align) [768][768];
|
fcon_weight alignas(alignment) [768];
|
||||||
|
|
||||||
|
union ircd_gpt_vector
|
||||||
|
proj_bias alignas(alignment),
|
||||||
|
proj_weight alignas(alignment) [768];
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Feed-forward neural network
|
/// Feed-forward neural network
|
||||||
struct ircd::gpt::model::ffnn
|
struct ircd::gpt::model::ffnn
|
||||||
{
|
{
|
||||||
float
|
model::norm
|
||||||
fc_bias alignas(align) [3072],
|
norm;
|
||||||
fc_weight alignas(align) [768][3072];
|
|
||||||
|
|
||||||
float
|
union ircd_gpt_ffnn_aperature
|
||||||
proj_bias alignas(align) [768],
|
fcon_bias alignas(alignment),
|
||||||
proj_weight alignas(align) [3072][768];
|
fcon_weight alignas(alignment) [768];
|
||||||
};
|
|
||||||
|
|
||||||
/// Layer normalization
|
union ircd_gpt_vector
|
||||||
struct ircd::gpt::model::norm
|
proj_bias alignas(alignment),
|
||||||
{
|
proj_weight alignas(alignment) [3072];
|
||||||
float
|
|
||||||
bias alignas(align) [768],
|
|
||||||
weight alignas(align) [768];
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Transformer block
|
/// Transformer block
|
||||||
struct ircd::gpt::model::block
|
struct ircd::gpt::model::block
|
||||||
{
|
{
|
||||||
norm ln1;
|
model::attn
|
||||||
model::attn attn;
|
attn;
|
||||||
|
|
||||||
norm ln2;
|
model::ffnn
|
||||||
model::ffnn ffnn;
|
ffnn;
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Vocabulary embeddings
|
/// Vocabulary embeddings
|
||||||
struct ircd::gpt::model::embed
|
struct ircd::gpt::model::embed
|
||||||
{
|
{
|
||||||
float
|
model::norm
|
||||||
pos alignas(align) [1024][768],
|
norm;
|
||||||
token alignas(align) [65536][768];
|
|
||||||
|
union ircd_gpt_vector
|
||||||
|
pos alignas(alignment) [1024],
|
||||||
|
token alignas(alignment) [65536];
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ircd::gpt::model::decoder
|
/// Transformer decoder
|
||||||
|
struct alignas(ircd::gpt::model::alignment)
|
||||||
|
ircd::gpt::model::decoder
|
||||||
{
|
{
|
||||||
block layer[12];
|
model::block
|
||||||
|
layer[12];
|
||||||
|
|
||||||
norm f;
|
model::embed
|
||||||
embed word;
|
embed;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ircd::gpt::model::prop
|
||||||
|
{
|
||||||
|
static constexpr const char
|
||||||
|
*const ended {"ended"},
|
||||||
|
*const id {"id"},
|
||||||
|
*const length {"length"},
|
||||||
|
*const text {"text"};
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ircd::gpt::model::text
|
||||||
|
:json::tuple
|
||||||
|
<
|
||||||
|
json::property<prop::ended, bool>,
|
||||||
|
json::property<prop::id, uint>,
|
||||||
|
json::property<prop::length, uint>,
|
||||||
|
json::property<prop::text, json::string>
|
||||||
|
>
|
||||||
|
{
|
||||||
|
using super_type::tuple;
|
||||||
};
|
};
|
||||||
|
|
|
@ -11,6 +11,13 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
#define HAVE_IRCD_GPT_OPTS_H
|
#define HAVE_IRCD_GPT_OPTS_H
|
||||||
|
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
namespace ircd::gpt::model
|
||||||
|
{
|
||||||
|
struct decoder;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
/// Task Options Page
|
/// Task Options Page
|
||||||
///
|
///
|
||||||
/// The option block is directly shared with task software as constant data.
|
/// The option block is directly shared with task software as constant data.
|
||||||
|
@ -20,30 +27,23 @@
|
||||||
///
|
///
|
||||||
struct ircd_gpt_opts
|
struct ircd_gpt_opts
|
||||||
{
|
{
|
||||||
#ifdef __cplusplus
|
#if defined(__cplusplus)
|
||||||
ircd_gpt_opts(const ircd::gpt::model::decoder * = nullptr) noexcept;
|
ircd_gpt_opts() noexcept;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/// Reference to the model (currently not available in device software).
|
//
|
||||||
#ifndef __cplusplus
|
// Frontside
|
||||||
const void *model;
|
//
|
||||||
#else
|
|
||||||
const ircd::gpt::model::decoder *model;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/// Seed for the task's PRNG.
|
/// Seed for the task's PRNG.
|
||||||
ulong seed;
|
ulong seed;
|
||||||
|
|
||||||
/// Limit number of output tokens. Default of -1 is unlimited; the number
|
|
||||||
/// of tokens generated will be limited by other factors.
|
|
||||||
uint limit;
|
|
||||||
|
|
||||||
/// Flip random coins over the top k logits each round. Setting to 1
|
/// Flip random coins over the top k logits each round. Setting to 1
|
||||||
/// deterministically selects the top logit.
|
/// deterministically selects the top logit.
|
||||||
uint top_k;
|
uint top_k;
|
||||||
|
|
||||||
/// Flip a random coin between 0 and top_p ( = 90 = 0.9) for logit select.
|
/// Flip a random coin between 0 and top_p ( = 90 = 0.9) for logit select.
|
||||||
uint top_p;
|
float top_p;
|
||||||
|
|
||||||
/// Registers the top n result logits in the ctrl block each cycle.
|
/// Registers the top n result logits in the ctrl block each cycle.
|
||||||
uint top_n;
|
uint top_n;
|
||||||
|
@ -51,59 +51,25 @@ struct ircd_gpt_opts
|
||||||
/// Number of target labels to register results for in the ctrl block.
|
/// Number of target labels to register results for in the ctrl block.
|
||||||
uint labels;
|
uint labels;
|
||||||
|
|
||||||
/// Bitbar toggling various debug modes
|
/// Number of pages available after the control block for the frame log.
|
||||||
|
uint frames;
|
||||||
|
|
||||||
|
/// Limit number of output tokens. Default of -1; other halting conditions
|
||||||
|
/// will be used.
|
||||||
|
uint limit;
|
||||||
|
|
||||||
|
/// Bitbar toggling various debug modes.
|
||||||
uint debug;
|
uint debug;
|
||||||
|
|
||||||
/// Specifies the token context size in tokens.
|
/// Accepting condition codes.
|
||||||
uint context_tokens;
|
ushort accept[4][8] __attribute__((aligned(4)));
|
||||||
|
|
||||||
/// Specifies the token buffer size in tokens.
|
//
|
||||||
uint buffer_tokens;
|
// Backside
|
||||||
|
//
|
||||||
|
|
||||||
/// Decoding layers.
|
/// Samples per step.
|
||||||
uint layers;
|
uint batch_size;
|
||||||
|
|
||||||
/// SIMD lane count.
|
|
||||||
uint lanes;
|
|
||||||
|
|
||||||
/// Embedding vector elements
|
|
||||||
uint embed_elems;
|
|
||||||
|
|
||||||
/// Cross-attention dimension
|
|
||||||
uint attn_rank;
|
|
||||||
|
|
||||||
/// Attention unit fcon width multiple
|
|
||||||
uint attn_mult;
|
|
||||||
|
|
||||||
/// (computed) MLP unit fcon width multiple
|
|
||||||
uint ffnn_mult;
|
|
||||||
|
|
||||||
/// (computed) attention unit width multiple
|
|
||||||
uint attn_elems;
|
|
||||||
|
|
||||||
/// FFNN unit width multiple
|
|
||||||
uint ffnn_elems;
|
|
||||||
|
|
||||||
/// SIMD lane count
|
|
||||||
uint lanes;
|
|
||||||
|
|
||||||
/// (computed) `embed_elems` / `lanes`
|
|
||||||
uint embed_width;
|
|
||||||
|
|
||||||
/// (computed) Attention unit X dimension
|
|
||||||
uint attn_width;
|
|
||||||
|
|
||||||
/// (computed) Attention unit Y dimension
|
|
||||||
uint attn_height;
|
|
||||||
|
|
||||||
/// (computed) MLP backend X dimension
|
|
||||||
uint ffnn_width;
|
|
||||||
|
|
||||||
/// (computed) MLP backend Y dimension
|
|
||||||
uint ffnn_height;
|
|
||||||
|
|
||||||
/// Number of possible target n-grams.
|
|
||||||
uint logits;
|
|
||||||
|
|
||||||
/// Training steps
|
/// Training steps
|
||||||
uint training_steps;
|
uint training_steps;
|
||||||
|
@ -122,15 +88,90 @@ struct ircd_gpt_opts
|
||||||
|
|
||||||
/// Denorm smoothing
|
/// Denorm smoothing
|
||||||
float epsilon;
|
float epsilon;
|
||||||
|
|
||||||
|
/// Tuning convergence rate
|
||||||
|
float lambda;
|
||||||
|
|
||||||
|
//
|
||||||
|
// Model dimensions
|
||||||
|
//
|
||||||
|
|
||||||
|
/// Number of possible target n-grams.
|
||||||
|
uint logits;
|
||||||
|
|
||||||
|
/// Specifies the token buffer size in tokens.
|
||||||
|
uint buffer_tokens;
|
||||||
|
|
||||||
|
/// Specifies the token context size in tokens.
|
||||||
|
uint context_tokens;
|
||||||
|
|
||||||
|
/// Decoding layers.
|
||||||
|
uint layers;
|
||||||
|
|
||||||
|
/// SIMD lane count.
|
||||||
|
uint lanes;
|
||||||
|
|
||||||
|
/// Embedding vector elements
|
||||||
|
uint embed_elems;
|
||||||
|
|
||||||
|
/// (computed) `embed_elems` / `lanes`
|
||||||
|
uint embed_width;
|
||||||
|
|
||||||
|
/// Cross-attention dimension
|
||||||
|
uint attn_rank;
|
||||||
|
|
||||||
|
/// Attention unit fcon width multiple
|
||||||
|
uint attn_mult;
|
||||||
|
|
||||||
|
/// (computed) attention unit width multiple
|
||||||
|
uint attn_elems;
|
||||||
|
|
||||||
|
/// (computed) Attention unit X dimension
|
||||||
|
uint attn_fcon_width;
|
||||||
|
|
||||||
|
/// (computed) Attention unit Y dimension
|
||||||
|
uint attn_fcon_height;
|
||||||
|
|
||||||
|
/// (computed) Attention unit X dimension
|
||||||
|
uint attn_proj_width;
|
||||||
|
|
||||||
|
/// (computed) Attention unit Y dimension
|
||||||
|
uint attn_proj_height;
|
||||||
|
|
||||||
|
/// (computed) Packed attention array total element count
|
||||||
|
uint attn_self_elems;
|
||||||
|
|
||||||
|
/// MLP unit fcon width multiple
|
||||||
|
uint ffnn_mult;
|
||||||
|
|
||||||
|
/// (computed) FFNN unit width multiple
|
||||||
|
uint ffnn_elems;
|
||||||
|
|
||||||
|
/// (computed) MLP backend X dimension
|
||||||
|
uint ffnn_fcon_width;
|
||||||
|
|
||||||
|
/// (computed) MLP backend Y dimension
|
||||||
|
uint ffnn_fcon_height;
|
||||||
|
|
||||||
|
/// (computed) MLP backend X dimension
|
||||||
|
uint ffnn_proj_width;
|
||||||
|
|
||||||
|
/// (computed) MLP backend Y dimension
|
||||||
|
uint ffnn_proj_height;
|
||||||
}
|
}
|
||||||
__attribute__((aligned(4096)));
|
__attribute__((aligned(4096)));
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#if defined(__cplusplus)
|
||||||
namespace ircd::gpt
|
namespace ircd::gpt
|
||||||
{
|
{
|
||||||
using opts = ::ircd_gpt_opts;
|
using opts = ::ircd_gpt_opts;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__cplusplus)
|
||||||
static_assert(sizeof(struct ircd_gpt_opts) == 4096);
|
static_assert(sizeof(struct ircd_gpt_opts) == 4096);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__cplusplus) && defined(__GLIBCXX__)
|
||||||
static_assert(std::is_standard_layout<struct ircd_gpt_opts>::value);
|
static_assert(std::is_standard_layout<struct ircd_gpt_opts>::value);
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -16,12 +16,15 @@ struct ircd::gpt::pipe::code
|
||||||
:cl::code
|
:cl::code
|
||||||
{
|
{
|
||||||
static conf::item<std::string> default_path;
|
static conf::item<std::string> default_path;
|
||||||
static conf::item<std::string> default_opts;
|
static conf::item<std::string> default_compile_opts;
|
||||||
|
static conf::item<std::string> default_link_opts;
|
||||||
static conf::item<std::string> cache_path;
|
static conf::item<std::string> cache_path;
|
||||||
|
|
||||||
static string_view make_cache_path(const mutable_buffer &);
|
static string_view make_cache_path(const mutable_buffer &);
|
||||||
static cl::code from_cache(const string_view &opts, const string_view &path);
|
|
||||||
static cl::code from_source(const string_view &opts);
|
static cl::code from_cache();
|
||||||
|
static cl::code from_source(const string_view &comp_opts = {}, const string_view &link_opts = {});
|
||||||
|
static cl::code from_bitcode(const string_view &link_opts = {});
|
||||||
|
|
||||||
void set_cache(const string_view &path);
|
void set_cache(const string_view &path);
|
||||||
bool put_cache();
|
bool put_cache();
|
||||||
|
|
48
include/ircd/gpt/pipe/cycle.h
Normal file
48
include/ircd/gpt/pipe/cycle.h
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
// Matrix Construct
|
||||||
|
//
|
||||||
|
// Copyright (C) Matrix Construct Developers, Authors & Contributors
|
||||||
|
// Copyright (C) 2016-2022 Jason Volk <jason@zemos.net>
|
||||||
|
//
|
||||||
|
// Permission to use, copy, modify, and/or distribute this software for any
|
||||||
|
// purpose with or without fee is hereby granted, provided that the above
|
||||||
|
// copyright notice and this permission notice is present in all copies. The
|
||||||
|
// full license for this software is available in the LICENSE file.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
#define HAVE_IRCD_GPT_PIPE_CYCLE_H
|
||||||
|
|
||||||
|
namespace ircd::gpt::pipe
|
||||||
|
{
|
||||||
|
const gpt::ctrl &acquire(cycle &);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Perform one task cycle on the device. The cycle is an atomic unit of
|
||||||
|
/// generation producing one token.
|
||||||
|
///
|
||||||
|
/// Constructions of this object enqueue device commands to complete an
|
||||||
|
/// additional cycle of the task as provided by `ctrl` and `opts`.
|
||||||
|
///
|
||||||
|
/// Destructions of this object yield the ircd::ctx until those commands
|
||||||
|
/// are complete.
|
||||||
|
///
|
||||||
|
struct ircd::gpt::pipe::cycle
|
||||||
|
{
|
||||||
|
struct profile;
|
||||||
|
|
||||||
|
static constexpr size_t stages
|
||||||
|
{
|
||||||
|
4 + 3 + (12 * 2) + 4 + 2 + (12 * 2) + 1
|
||||||
|
};
|
||||||
|
|
||||||
|
pipe::desc &desc;
|
||||||
|
uint tick;
|
||||||
|
uint count;
|
||||||
|
uint tokens;
|
||||||
|
uint cached;
|
||||||
|
uint frame;
|
||||||
|
pipe::range range;
|
||||||
|
std::array<cl::exec, stages> stage;
|
||||||
|
|
||||||
|
cycle(gpt::samp &);
|
||||||
|
~cycle() noexcept;
|
||||||
|
};
|
|
@ -16,43 +16,62 @@ struct ircd::gpt::pipe::desc
|
||||||
{
|
{
|
||||||
struct layer;
|
struct layer;
|
||||||
|
|
||||||
|
// Model descriptor
|
||||||
pipe::model *model;
|
pipe::model *model;
|
||||||
|
|
||||||
|
// Code descriptor
|
||||||
pipe::code *code;
|
pipe::code *code;
|
||||||
|
|
||||||
|
// Memories
|
||||||
cl::data
|
cl::data
|
||||||
state, // [root] projection (layers * tokens * embed * 3 * float)
|
opts, // [root] options page
|
||||||
|
ctrl, // [root] control page
|
||||||
master, // [root] single allocation for additional buffers:
|
master, // [root] single allocation for additional buffers:
|
||||||
|
state, // [-sub] projection (layers * tokens * embed * 3 * float)
|
||||||
accum, // [-sub] accumulator (tokens * embed * float)
|
accum, // [-sub] accumulator (tokens * embed * float)
|
||||||
logit, // [-sub] result logit vector (50257 * float)
|
logit, // [-sub] result logit vector (50257 * float)
|
||||||
logsm, // [-sub] outputs distribution (50257 * float)
|
attns, // [-sub] result attention softmax
|
||||||
ctrl, // [root] control page
|
frame[8]; // [root] result stream
|
||||||
opts; // [root] options page
|
|
||||||
|
|
||||||
|
// Programs
|
||||||
cl::kern
|
cl::kern
|
||||||
|
alloc,
|
||||||
|
enter,
|
||||||
lm_embed,
|
lm_embed,
|
||||||
lm_norm,
|
lm_norm,
|
||||||
lm_logit,
|
lm_logit,
|
||||||
lm_logsm,
|
lm_logsm,
|
||||||
lm_select,
|
lm_select,
|
||||||
lm_norm_backprop,
|
lm_prop_embed,
|
||||||
lm_embed_backprop;
|
lm_prop_norm,
|
||||||
|
leave[8];
|
||||||
|
|
||||||
std::unique_ptr<struct desc::layer>
|
/// Coil pack
|
||||||
layer[12];
|
std::unique_ptr<struct desc::layer> layer[12];
|
||||||
|
|
||||||
desc(pipe::code &, pipe::model &);
|
/// Attention projection for first N tokens already contained in `state`.
|
||||||
|
uint cached {0};
|
||||||
|
|
||||||
|
desc(const gpt::opts *const &,
|
||||||
|
gpt::ctrl *const &,
|
||||||
|
pipe::model &model,
|
||||||
|
pipe::code &code);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Pipe descriptor: coil layer
|
||||||
struct ircd::gpt::pipe::desc::layer
|
struct ircd::gpt::pipe::desc::layer
|
||||||
{
|
{
|
||||||
cl::data
|
cl::data
|
||||||
state; // [-sub] qry/key/val projection (tokens * embed * 3 * float)
|
state, // [-sub] qry/key/val projection (tokens * embed * 3 * float)
|
||||||
|
attns; // [-sub] attn softmax result (((tokens * tokens) / 2) * 12 * float)
|
||||||
|
|
||||||
cl::kern
|
cl::kern
|
||||||
negative,
|
attn,
|
||||||
positive,
|
ffnn,
|
||||||
backattn,
|
prop_attn,
|
||||||
backffnn;
|
prop_ffnn;
|
||||||
|
|
||||||
layer(pipe::desc &, const int);
|
layer(pipe::desc &,
|
||||||
|
const gpt::opts *const &,
|
||||||
|
const uint laynum);
|
||||||
};
|
};
|
||||||
|
|
|
@ -1,65 +0,0 @@
|
||||||
// Matrix Construct
|
|
||||||
//
|
|
||||||
// Copyright (C) Matrix Construct Developers, Authors & Contributors
|
|
||||||
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
|
|
||||||
//
|
|
||||||
// Permission to use, copy, modify, and/or distribute this software for any
|
|
||||||
// purpose with or without fee is hereby granted, provided that the above
|
|
||||||
// copyright notice and this permission notice is present in all copies. The
|
|
||||||
// full license for this software is available in the LICENSE file.
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
#define HAVE_IRCD_GPT_PIPE_EXEC_H
|
|
||||||
|
|
||||||
/// Perform one task cycle on the device.
|
|
||||||
///
|
|
||||||
/// Constructions of this object enqueue device commands to complete an
|
|
||||||
/// additional epoch of the task as provided by `ctrl` and `opts`.
|
|
||||||
///
|
|
||||||
/// Destructions of this object yield the ircd::ctx until those commands
|
|
||||||
/// are complete.
|
|
||||||
///
|
|
||||||
/// Consecutive cycles on the device without stopping (a.k.a. pipelining) is
|
|
||||||
/// achieved by constructing several objects before following with destructions
|
|
||||||
/// i.e in a std::deque.
|
|
||||||
///
|
|
||||||
struct ircd::gpt::pipe::exec
|
|
||||||
{
|
|
||||||
pipe::desc *desc;
|
|
||||||
|
|
||||||
const_buffer
|
|
||||||
send_opts, // Set when sending the options page.
|
|
||||||
send_ctrl, // Set when sending the control page.
|
|
||||||
send_coil, // Set when sending the updated model.
|
|
||||||
send_head; // Set when sending the updated model.
|
|
||||||
|
|
||||||
mutable_buffer
|
|
||||||
recv_ctrl; // Set when receiving the control page.
|
|
||||||
|
|
||||||
cl::kern::range
|
|
||||||
range_full,
|
|
||||||
range_last,
|
|
||||||
range_lm_embed, // Dimension range of the lm_embed kernel.
|
|
||||||
range_negative, // Dimension range of a layer kernel.
|
|
||||||
range_positive, // Dimension range of a layer kernel.
|
|
||||||
range_lm_norm, // Dimension range of the final norm kernel.
|
|
||||||
range_lm_logit, // Dimension range of the language logit kernel.
|
|
||||||
range_lm_logsm, // Dimension range of the language statistic kernel.
|
|
||||||
range_lm_select; // Dimension range of the language token kernel.
|
|
||||||
|
|
||||||
cl::exec
|
|
||||||
release_opts, // Release the options page.
|
|
||||||
release_ctrl, // Release the control page.
|
|
||||||
release_coil, // Release updates to the model.
|
|
||||||
release_head, // Release updates to the model.
|
|
||||||
lm_embed, // Compute token and positional embeddings.
|
|
||||||
coil[12 * 2], // Pass over all layers.
|
|
||||||
lm_norm, // Final normalization.
|
|
||||||
lm_logit, // Compute language logits.
|
|
||||||
lm_logsm, // Statistics on the logits.
|
|
||||||
lm_select, // Select next token.
|
|
||||||
acquire_ctrl; // Acquire the control page.
|
|
||||||
|
|
||||||
exec(task &, const size_t tokens, const bool rel, const bool acq);
|
|
||||||
~exec() noexcept;
|
|
||||||
};
|
|
|
@ -21,15 +21,15 @@ struct ircd::gpt::pipe::model
|
||||||
struct attn;
|
struct attn;
|
||||||
struct ffnn;
|
struct ffnn;
|
||||||
struct block;
|
struct block;
|
||||||
|
struct embed;
|
||||||
struct decoder;
|
struct decoder;
|
||||||
struct language;
|
|
||||||
|
|
||||||
|
const gpt::model::decoder *decode_const {nullptr};
|
||||||
|
gpt::model::decoder *decode_mutable {nullptr};
|
||||||
std::unique_ptr<model::decoder> decode;
|
std::unique_ptr<model::decoder> decode;
|
||||||
std::unique_ptr<model::language> embed;
|
|
||||||
bool invalid {false};
|
|
||||||
|
|
||||||
model(const gpt::model::decoder &, const gpt::model::embed &);
|
model(const gpt::model::decoder &);
|
||||||
model(gpt::model::decoder &, gpt::model::embed &);
|
model(gpt::model::decoder &);
|
||||||
~model() noexcept;
|
~model() noexcept;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -49,8 +49,8 @@ struct ircd::gpt::pipe::model::tensor
|
||||||
bias,
|
bias,
|
||||||
weight;
|
weight;
|
||||||
|
|
||||||
tensor(cl::data *, const off_t, const const_buffer &bias, const const_buffer &weight);
|
tensor(cl::data *, const off_t, const const_buffer &bias, const off_t, const const_buffer &weight);
|
||||||
tensor(cl::data *, const off_t, const mutable_buffer &bias, const mutable_buffer &weight);
|
tensor(cl::data *, const off_t, const mutable_buffer &bias, const off_t, const mutable_buffer &weight);
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ircd::gpt::pipe::model::attn
|
struct ircd::gpt::pipe::model::attn
|
||||||
|
@ -60,8 +60,8 @@ struct ircd::gpt::pipe::model::attn
|
||||||
fcon,
|
fcon,
|
||||||
proj;
|
proj;
|
||||||
|
|
||||||
attn(cl::data *, const off_t, const gpt::model::norm &, const gpt::model::attn &);
|
attn(cl::data *, const off_t, const gpt::model::attn &);
|
||||||
attn(cl::data *, const off_t, gpt::model::norm &, gpt::model::attn &);
|
attn(cl::data *, const off_t, gpt::model::attn &);
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ircd::gpt::pipe::model::ffnn
|
struct ircd::gpt::pipe::model::ffnn
|
||||||
|
@ -71,40 +71,33 @@ struct ircd::gpt::pipe::model::ffnn
|
||||||
fcon,
|
fcon,
|
||||||
proj;
|
proj;
|
||||||
|
|
||||||
ffnn(cl::data *, const off_t, const gpt::model::norm &, const gpt::model::ffnn &);
|
ffnn(cl::data *, const off_t, const gpt::model::ffnn &);
|
||||||
ffnn(cl::data *, const off_t, gpt::model::norm &, gpt::model::ffnn &);
|
ffnn(cl::data *, const off_t, gpt::model::ffnn &);
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ircd::gpt::pipe::model::block
|
struct ircd::gpt::pipe::model::block
|
||||||
{
|
{
|
||||||
// Single layer memory roots
|
model::attn
|
||||||
cl::data
|
attn;
|
||||||
master[3];
|
|
||||||
|
|
||||||
// Layer units
|
model::ffnn
|
||||||
model::attn attn;
|
ffnn;
|
||||||
model::ffnn ffnn;
|
|
||||||
|
|
||||||
block(cl::data *, const off_t, const gpt::model::block &, const size_t);
|
block(cl::data *, const off_t, const gpt::model::block &, const size_t);
|
||||||
block(cl::data *, const off_t, gpt::model::block &, const size_t);
|
block(cl::data *, const off_t, gpt::model::block &, const size_t);
|
||||||
block(const gpt::model::block &, const size_t);
|
|
||||||
block(gpt::model::block &, const size_t);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ircd::gpt::pipe::model::language
|
struct ircd::gpt::pipe::model::embed
|
||||||
{
|
{
|
||||||
cl::data
|
tensor
|
||||||
master[3];
|
norm;
|
||||||
|
|
||||||
matrix
|
matrix
|
||||||
pos,
|
pos,
|
||||||
token;
|
token;
|
||||||
|
|
||||||
language(cl::data *, const off_t, const gpt::model::embed &);
|
embed(cl::data *, const off_t, const gpt::model::embed &);
|
||||||
language(cl::data *, const off_t, gpt::model::embed &);
|
embed(cl::data *, const off_t, gpt::model::embed &);
|
||||||
language(const gpt::model::embed &);
|
|
||||||
language( gpt::model::embed &);
|
|
||||||
~language() noexcept;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ircd::gpt::pipe::model::decoder
|
struct ircd::gpt::pipe::model::decoder
|
||||||
|
@ -115,10 +108,11 @@ struct ircd::gpt::pipe::model::decoder
|
||||||
|
|
||||||
// Layer blocks
|
// Layer blocks
|
||||||
model::block
|
model::block
|
||||||
block[12];
|
layer[12];
|
||||||
|
|
||||||
// Final norm
|
// Language model head
|
||||||
tensor norm;
|
model::embed
|
||||||
|
embed;
|
||||||
|
|
||||||
decoder(const gpt::model::decoder &);
|
decoder(const gpt::model::decoder &);
|
||||||
decoder(gpt::model::decoder &);
|
decoder(gpt::model::decoder &);
|
||||||
|
|
|
@ -13,21 +13,21 @@
|
||||||
|
|
||||||
namespace ircd::gpt::pipe
|
namespace ircd::gpt::pipe
|
||||||
{
|
{
|
||||||
struct model;
|
|
||||||
struct code;
|
struct code;
|
||||||
|
struct model;
|
||||||
struct desc;
|
struct desc;
|
||||||
struct exec;
|
struct range;
|
||||||
|
struct cycle;
|
||||||
|
struct prof;
|
||||||
|
|
||||||
extern model *default_model;
|
extern conf::item<size_t> queue_cycles;
|
||||||
extern code *default_code;
|
|
||||||
extern desc *default_desc;
|
|
||||||
|
|
||||||
void generate(task &);
|
|
||||||
|
|
||||||
void init(), fini() noexcept;
|
void init(), fini() noexcept;
|
||||||
};
|
};
|
||||||
|
|
||||||
#include "model.h"
|
|
||||||
#include "code.h"
|
#include "code.h"
|
||||||
|
#include "model.h"
|
||||||
#include "desc.h"
|
#include "desc.h"
|
||||||
#include "exec.h"
|
#include "range.h"
|
||||||
|
#include "cycle.h"
|
||||||
|
#include "prof.h"
|
||||||
|
|
56
include/ircd/gpt/pipe/prof.h
Normal file
56
include/ircd/gpt/pipe/prof.h
Normal file
|
@ -0,0 +1,56 @@
|
||||||
|
// Matrix Construct
|
||||||
|
//
|
||||||
|
// Copyright (C) Matrix Construct Developers, Authors & Contributors
|
||||||
|
// Copyright (C) 2016-2022 Jason Volk <jason@zemos.net>
|
||||||
|
//
|
||||||
|
// Permission to use, copy, modify, and/or distribute this software for any
|
||||||
|
// purpose with or without fee is hereby granted, provided that the above
|
||||||
|
// copyright notice and this permission notice is present in all copies. The
|
||||||
|
// full license for this software is available in the LICENSE file.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
#define HAVE_IRCD_GPT_PIPE_PROF_H
|
||||||
|
|
||||||
|
namespace ircd::gpt::pipe
|
||||||
|
{
|
||||||
|
string_view debug(const mutable_buffer &, const prof &, const size_t &pos);
|
||||||
|
string_view debug(const mutable_buffer &, const prof &);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract profiling information for a cycle. This object contains timing
|
||||||
|
/// state integers for each corresponding stage of the cycle.
|
||||||
|
///
|
||||||
|
/// Default constructions initialize to zero and the state can also be used
|
||||||
|
/// as an accumulator.
|
||||||
|
struct ircd::gpt::pipe::prof
|
||||||
|
{
|
||||||
|
static constexpr size_t stages
|
||||||
|
{
|
||||||
|
cycle::stages
|
||||||
|
};
|
||||||
|
|
||||||
|
static constexpr size_t phases
|
||||||
|
{
|
||||||
|
num_of<cl::work::prof::phase>()
|
||||||
|
};
|
||||||
|
|
||||||
|
using phase = cl::work::prof::phase;
|
||||||
|
using phase_array = cl::work::prof;
|
||||||
|
using stage_array = std::array<phase_array, stages>;
|
||||||
|
using info_type = std::tuple<string_view, int>;
|
||||||
|
using info_array = std::array<info_type, stages>;
|
||||||
|
using info_name_array = std::array<char[64], stages>;
|
||||||
|
|
||||||
|
static info_name_array name;
|
||||||
|
static info_array info;
|
||||||
|
|
||||||
|
private:
|
||||||
|
static bool init;
|
||||||
|
static void init_info(const pipe::cycle &);
|
||||||
|
|
||||||
|
public:
|
||||||
|
stage_array ts;
|
||||||
|
|
||||||
|
prof(const pipe::cycle &);
|
||||||
|
prof() noexcept;
|
||||||
|
};
|
39
include/ircd/gpt/pipe/range.h
Normal file
39
include/ircd/gpt/pipe/range.h
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
// Matrix Construct
|
||||||
|
//
|
||||||
|
// Copyright (C) Matrix Construct Developers, Authors & Contributors
|
||||||
|
// Copyright (C) 2016-2022 Jason Volk <jason@zemos.net>
|
||||||
|
//
|
||||||
|
// Permission to use, copy, modify, and/or distribute this software for any
|
||||||
|
// purpose with or without fee is hereby granted, provided that the above
|
||||||
|
// copyright notice and this permission notice is present in all copies. The
|
||||||
|
// full license for this software is available in the LICENSE file.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
#define HAVE_IRCD_GPT_PIPE_RANGE_H
|
||||||
|
|
||||||
|
struct ircd::gpt::pipe::range
|
||||||
|
{
|
||||||
|
cl::kern::range
|
||||||
|
_full,
|
||||||
|
_last,
|
||||||
|
alloc,
|
||||||
|
embed,
|
||||||
|
attn,
|
||||||
|
ffnn,
|
||||||
|
fffnn,
|
||||||
|
fnorm,
|
||||||
|
logit,
|
||||||
|
logsm,
|
||||||
|
select,
|
||||||
|
prop_embed,
|
||||||
|
prop_norm,
|
||||||
|
prop_attn,
|
||||||
|
prop_ffnn;
|
||||||
|
|
||||||
|
range(const uint tick,
|
||||||
|
const uint count,
|
||||||
|
const uint tokens,
|
||||||
|
const uint cached,
|
||||||
|
const bool fwd,
|
||||||
|
const bool rev) noexcept;
|
||||||
|
};
|
49
include/ircd/gpt/samp.h
Normal file
49
include/ircd/gpt/samp.h
Normal file
|
@ -0,0 +1,49 @@
|
||||||
|
// Matrix Construct
|
||||||
|
//
|
||||||
|
// Copyright (C) Matrix Construct Developers, Authors & Contributors
|
||||||
|
// Copyright (C) 2016-2022 Jason Volk <jason@zemos.net>
|
||||||
|
//
|
||||||
|
// Permission to use, copy, modify, and/or distribute this software for any
|
||||||
|
// purpose with or without fee is hereby granted, provided that the above
|
||||||
|
// copyright notice and this permission notice is present in all copies. The
|
||||||
|
// full license for this software is available in the LICENSE file.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
#define HAVE_IRCD_GPT_SAMP_H
|
||||||
|
|
||||||
|
/// Perform one task step on the device. The step is a sequence of cycles
|
||||||
|
/// which generate tokens until satisfying a halting condition. The number of
|
||||||
|
/// cycles for the step is limited by the size of the context buffer.
|
||||||
|
///
|
||||||
|
struct ircd::gpt::samp
|
||||||
|
{
|
||||||
|
gpt::step &step;
|
||||||
|
pipe::desc &desc;
|
||||||
|
|
||||||
|
const gpt::opts &opts;
|
||||||
|
gpt::ctrl &ctrl;
|
||||||
|
|
||||||
|
const uint id;
|
||||||
|
|
||||||
|
int accept;
|
||||||
|
uint dispatch;
|
||||||
|
uint cycle;
|
||||||
|
uint tokens;
|
||||||
|
uint count;
|
||||||
|
|
||||||
|
pipe::prof profile;
|
||||||
|
std::deque<pipe::cycle> queue;
|
||||||
|
|
||||||
|
public:
|
||||||
|
void profile_accumulate(const pipe::prof &);
|
||||||
|
bool retire(pipe::cycle &, const gpt::ctrl &);
|
||||||
|
bool evaluate(pipe::cycle &);
|
||||||
|
uint tokenize();
|
||||||
|
|
||||||
|
public:
|
||||||
|
bool done() const noexcept;
|
||||||
|
bool operator()();
|
||||||
|
|
||||||
|
samp(gpt::step &);
|
||||||
|
~samp() noexcept;
|
||||||
|
};
|
40
include/ircd/gpt/step.h
Normal file
40
include/ircd/gpt/step.h
Normal file
|
@ -0,0 +1,40 @@
|
||||||
|
// Matrix Construct
|
||||||
|
//
|
||||||
|
// Copyright (C) Matrix Construct Developers, Authors & Contributors
|
||||||
|
// Copyright (C) 2016-2022 Jason Volk <jason@zemos.net>
|
||||||
|
//
|
||||||
|
// Permission to use, copy, modify, and/or distribute this software for any
|
||||||
|
// purpose with or without fee is hereby granted, provided that the above
|
||||||
|
// copyright notice and this permission notice is present in all copies. The
|
||||||
|
// full license for this software is available in the LICENSE file.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
#define HAVE_IRCD_GPT_STEP_H
|
||||||
|
|
||||||
|
/// Perform one task step on the device. The step is a sequence of cycles
|
||||||
|
/// which generate tokens until satisfying a halting condition. The number of
|
||||||
|
/// cycles for the step is limited by the size of the context buffer.
|
||||||
|
///
|
||||||
|
struct ircd::gpt::step
|
||||||
|
{
|
||||||
|
gpt::epoch &epoch;
|
||||||
|
pipe::desc &desc;
|
||||||
|
|
||||||
|
const gpt::opts &opts;
|
||||||
|
gpt::ctrl &ctrl;
|
||||||
|
|
||||||
|
const uint id;
|
||||||
|
const uint start;
|
||||||
|
|
||||||
|
pipe::prof profile;
|
||||||
|
|
||||||
|
void profile_accumulate(const pipe::prof &);
|
||||||
|
bool backpropagate();
|
||||||
|
|
||||||
|
public:
|
||||||
|
bool done() const noexcept;
|
||||||
|
bool operator()();
|
||||||
|
|
||||||
|
step(gpt::epoch &);
|
||||||
|
~step() noexcept;
|
||||||
|
};
|
|
@ -12,6 +12,14 @@
|
||||||
#define HAVE_IRCD_GPT_TASK_H
|
#define HAVE_IRCD_GPT_TASK_H
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
namespace ircd::gpt
|
||||||
|
{
|
||||||
|
void seed(task &, const uint64_t &) noexcept;
|
||||||
|
void seed(task &) noexcept;
|
||||||
|
void clear(task &) noexcept;
|
||||||
|
void reset(task &) noexcept;
|
||||||
|
}
|
||||||
|
|
||||||
/// Task Context
|
/// Task Context
|
||||||
///
|
///
|
||||||
/// State for a task.
|
/// State for a task.
|
||||||
|
@ -22,27 +30,25 @@ struct ircd::gpt::task
|
||||||
/// Reference to the attached options.
|
/// Reference to the attached options.
|
||||||
const gpt::opts *opts {nullptr};
|
const gpt::opts *opts {nullptr};
|
||||||
|
|
||||||
/// Reference to control pages.
|
/// Reference to user's control block.
|
||||||
gpt::ctrl *ctrl {nullptr};
|
gpt::ctrl *ctrl {nullptr};
|
||||||
|
|
||||||
/// Current task status.
|
/// Pipe code
|
||||||
enum status status {'\0'};
|
std::unique_ptr<pipe::code> code;
|
||||||
|
|
||||||
|
/// Pipe model
|
||||||
|
std::unique_ptr<pipe::model> model;
|
||||||
|
|
||||||
|
/// Pipe state
|
||||||
|
pipe::desc desc;
|
||||||
|
|
||||||
|
public:
|
||||||
|
bool done() const noexcept;
|
||||||
|
bool operator()();
|
||||||
|
|
||||||
|
task(const gpt::opts * = nullptr,
|
||||||
|
gpt::ctrl * = nullptr);
|
||||||
|
|
||||||
task(const gpt::opts * = nullptr, gpt::ctrl * = nullptr);
|
|
||||||
~task() noexcept;
|
~task() noexcept;
|
||||||
};
|
};
|
||||||
|
|
||||||
/// The current status of a task is indicated with intelligible characters
|
|
||||||
enum ircd::gpt::task::status
|
|
||||||
:char
|
|
||||||
{
|
|
||||||
QUEUED = 'Q', ///< Queued for execution.
|
|
||||||
RUNNING = 'R', ///< Currently being executed.
|
|
||||||
ACCEPT = 'A', ///< Execution completed successfully.
|
|
||||||
ERROR = 'E', ///< Execution did not complete successfully.
|
|
||||||
};
|
|
||||||
|
|
||||||
static_assert(sizeof(struct ircd_gpt_ctrl) == 4096);
|
|
||||||
static_assert(offsetof(struct ircd_gpt_ctrl, token) == 2048);
|
|
||||||
static_assert(std::is_standard_layout<struct ircd_gpt_ctrl>::value);
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -11,127 +11,67 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
#define HAVE_IRCD_GPT_TOKEN_H
|
#define HAVE_IRCD_GPT_TOKEN_H
|
||||||
|
|
||||||
union ircd_gpt_token
|
namespace ircd::gpt
|
||||||
{
|
{
|
||||||
float
|
struct token;
|
||||||
word[768],
|
}
|
||||||
attn[12][64];
|
|
||||||
|
/// Token is just a 16-bit index into the vocabulary. This lightweight wrapper
|
||||||
|
/// convenience constructs from a string lookup or from a u16 directly.
|
||||||
|
class ircd::gpt::token
|
||||||
|
{
|
||||||
|
uint16_t val;
|
||||||
|
|
||||||
|
public:
|
||||||
|
operator const uint16_t &() const;
|
||||||
|
operator uint16_t &();
|
||||||
|
|
||||||
|
operator string_view() const;
|
||||||
|
|
||||||
|
token(const_buffer &buf) noexcept;
|
||||||
|
token(const string_view &);
|
||||||
|
token(const uint16_t &) noexcept;
|
||||||
};
|
};
|
||||||
|
|
||||||
#ifdef __OPENCL_C_VERSION__
|
static_assert(sizeof(ircd::gpt::token) == sizeof(uint16_t));
|
||||||
union ircd_gpt_tokenv
|
static_assert(std::is_standard_layout<ircd::gpt::token>::value);
|
||||||
|
|
||||||
|
/// Direct construction; no lookup
|
||||||
|
inline
|
||||||
|
ircd::gpt::token::token(const uint16_t &val)
|
||||||
|
noexcept
|
||||||
|
:val{val}
|
||||||
|
{}
|
||||||
|
|
||||||
|
/// Must resolve to one token or error thrown.
|
||||||
|
inline
|
||||||
|
ircd::gpt::token::token(const string_view &str)
|
||||||
|
:val{vocab::tokenize(str)}
|
||||||
|
{}
|
||||||
|
|
||||||
|
/// Consumes input for one token off front of buf
|
||||||
|
inline
|
||||||
|
ircd::gpt::token::token(const_buffer &buf)
|
||||||
|
noexcept
|
||||||
|
:val{vocab::tokenize(buf)}
|
||||||
|
{}
|
||||||
|
|
||||||
|
inline ircd::gpt::token::operator
|
||||||
|
string_view()
|
||||||
|
const
|
||||||
{
|
{
|
||||||
float4
|
return vocab::token[val];
|
||||||
word[768/4],
|
}
|
||||||
attn[12][64/4];
|
|
||||||
|
|
||||||
union ircd_gpt_token
|
inline ircd::gpt::token::operator
|
||||||
token;
|
uint16_t &()
|
||||||
};
|
|
||||||
#endif
|
|
||||||
|
|
||||||
struct ircd_gpt_attn_qkv
|
|
||||||
{
|
{
|
||||||
union ircd_gpt_token
|
return val;
|
||||||
qry,
|
}
|
||||||
key,
|
|
||||||
val;
|
|
||||||
};
|
|
||||||
|
|
||||||
#ifdef __OPENCL_C_VERSION__
|
inline ircd::gpt::token::operator
|
||||||
struct ircd_gpt_attn_qkvv
|
const uint16_t &()
|
||||||
|
const
|
||||||
{
|
{
|
||||||
union ircd_gpt_tokenv
|
return val;
|
||||||
qry,
|
}
|
||||||
key,
|
|
||||||
val;
|
|
||||||
};
|
|
||||||
#endif
|
|
||||||
|
|
||||||
union ircd_gpt_attn_aperature
|
|
||||||
{
|
|
||||||
float
|
|
||||||
fcon[2304],
|
|
||||||
proj[3][768],
|
|
||||||
qkv[3][12][64];
|
|
||||||
|
|
||||||
union ircd_gpt_token
|
|
||||||
token[3];
|
|
||||||
};
|
|
||||||
|
|
||||||
#ifdef __OPENCL_C_VERSION__
|
|
||||||
union ircd_gpt_attn_aperaturev
|
|
||||||
{
|
|
||||||
float4
|
|
||||||
fcon[2304/4],
|
|
||||||
proj[3][768/4],
|
|
||||||
qkv[3][12][64/4];
|
|
||||||
|
|
||||||
union ircd_gpt_token_f32x4
|
|
||||||
token[3];
|
|
||||||
};
|
|
||||||
|
|
||||||
union ircd_gpt_attn_aperature_f32x8
|
|
||||||
{
|
|
||||||
float8
|
|
||||||
fcon[2304/8],
|
|
||||||
proj[3][768/8],
|
|
||||||
qkv[3][12][64/8];
|
|
||||||
|
|
||||||
union ircd_gpt_token_f32x8
|
|
||||||
token[3];
|
|
||||||
};
|
|
||||||
|
|
||||||
union ircd_gpt_attn_aperature_f32x16
|
|
||||||
{
|
|
||||||
float16
|
|
||||||
fcon[2304/16],
|
|
||||||
proj[3][768/16],
|
|
||||||
qkv[3][12][64/16];
|
|
||||||
|
|
||||||
union ircd_gpt_token_f32x16
|
|
||||||
token[3];
|
|
||||||
};
|
|
||||||
#endif
|
|
||||||
|
|
||||||
union ircd_gpt_ffnn_aperature
|
|
||||||
{
|
|
||||||
float
|
|
||||||
fcon[3072],
|
|
||||||
proj[4][768];
|
|
||||||
|
|
||||||
union ircd_gpt_token
|
|
||||||
token[4];
|
|
||||||
};
|
|
||||||
|
|
||||||
#ifdef __OPENCL_C_VERSION__
|
|
||||||
union ircd_gpt_ffnn_aperaturev
|
|
||||||
{
|
|
||||||
float4
|
|
||||||
fcon[3072/4],
|
|
||||||
proj[4][768/4];
|
|
||||||
|
|
||||||
union ircd_gpt_token_f32x4
|
|
||||||
token[4];
|
|
||||||
};
|
|
||||||
|
|
||||||
union ircd_gpt_ffnn_aperature_f32x8
|
|
||||||
{
|
|
||||||
float8
|
|
||||||
fcon[3072/8],
|
|
||||||
proj[4][768/8];
|
|
||||||
|
|
||||||
union ircd_gpt_token_f32x4
|
|
||||||
token[4];
|
|
||||||
};
|
|
||||||
|
|
||||||
union ircd_gpt_ffnn_aperature_f32x16
|
|
||||||
{
|
|
||||||
float16
|
|
||||||
fcon[3072/16],
|
|
||||||
proj[4][768/16];
|
|
||||||
|
|
||||||
union ircd_gpt_token_f32x4
|
|
||||||
token[4];
|
|
||||||
};
|
|
||||||
#endif
|
|
||||||
|
|
147
include/ircd/gpt/vector.h
Normal file
147
include/ircd/gpt/vector.h
Normal file
|
@ -0,0 +1,147 @@
|
||||||
|
// Matrix Construct
|
||||||
|
//
|
||||||
|
// Copyright (C) Matrix Construct Developers, Authors & Contributors
|
||||||
|
// Copyright (C) 2016-2022 Jason Volk <jason@zemos.net>
|
||||||
|
//
|
||||||
|
// Permission to use, copy, modify, and/or distribute this software for any
|
||||||
|
// purpose with or without fee is hereby granted, provided that the above
|
||||||
|
// copyright notice and this permission notice is present in all copies. The
|
||||||
|
// full license for this software is available in the LICENSE file.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
#define HAVE_IRCD_GPT_VECTOR_H
|
||||||
|
|
||||||
|
#if !defined(__SIZEOF_FLOAT4__) && defined(__OPENCL_VERSION__)
|
||||||
|
#define __SIZEOF_FLOAT4__ 16
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !defined(__SIZEOF_FLOAT8__) && defined(__OPENCL_VERSION__)
|
||||||
|
#define __SIZEOF_FLOAT8__ 32
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !defined(__SIZEOF_FLOAT16__) && defined(__OPENCL_VERSION__)
|
||||||
|
#define __SIZEOF_FLOAT16__ 64
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __OPENCL_VERSION__
|
||||||
|
#define __constant
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static __constant const uint
|
||||||
|
ircd_gpt_context_tokens = 512, // 1024,
|
||||||
|
ircd_gpt_vector_elems = 768,
|
||||||
|
ircd_gpt_attn_rank = 12,
|
||||||
|
ircd_gpt_attn_segs = 3,
|
||||||
|
ircd_gpt_ffnn_segs = 4;
|
||||||
|
|
||||||
|
static __constant const uint
|
||||||
|
ircd_gpt_vector_attn_elems = ircd_gpt_vector_elems / ircd_gpt_attn_rank,
|
||||||
|
ircd_gpt_attn_fcon_elems = ircd_gpt_vector_elems * ircd_gpt_attn_segs,
|
||||||
|
ircd_gpt_ffnn_fcon_elems = ircd_gpt_vector_elems * ircd_gpt_ffnn_segs;
|
||||||
|
|
||||||
|
//
|
||||||
|
// embed vector
|
||||||
|
//
|
||||||
|
|
||||||
|
#if defined(__SIZEOF_FLOAT__)
|
||||||
|
union ircd_gpt_vector
|
||||||
|
{
|
||||||
|
float
|
||||||
|
elem[ircd_gpt_vector_elems],
|
||||||
|
attn[ircd_gpt_attn_rank][ircd_gpt_vector_attn_elems];
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__SIZEOF_FLOAT4__)
|
||||||
|
union ircd_gpt_vector_f32x4
|
||||||
|
{
|
||||||
|
float4
|
||||||
|
elem[ircd_gpt_vector_elems / 4],
|
||||||
|
attn[ircd_gpt_attn_rank][ircd_gpt_vector_attn_elems / 4];
|
||||||
|
|
||||||
|
union ircd_gpt_vector
|
||||||
|
vector;
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//
|
||||||
|
// attn qkv
|
||||||
|
//
|
||||||
|
|
||||||
|
#if defined(__SIZEOF_FLOAT__)
|
||||||
|
struct ircd_gpt_attn_qkv
|
||||||
|
{
|
||||||
|
union ircd_gpt_vector
|
||||||
|
qry,
|
||||||
|
key,
|
||||||
|
val;
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__SIZEOF_FLOAT4__)
|
||||||
|
struct ircd_gpt_attn_qkv_f32x4
|
||||||
|
{
|
||||||
|
union ircd_gpt_vector_f32x4
|
||||||
|
qry,
|
||||||
|
key,
|
||||||
|
val;
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//
|
||||||
|
// attn aperature
|
||||||
|
//
|
||||||
|
|
||||||
|
#if defined(__SIZEOF_FLOAT__)
|
||||||
|
union ircd_gpt_attn_aperature
|
||||||
|
{
|
||||||
|
float
|
||||||
|
fcon[ircd_gpt_attn_fcon_elems],
|
||||||
|
proj[ircd_gpt_attn_segs][ircd_gpt_vector_elems],
|
||||||
|
qkv[ircd_gpt_attn_segs][ircd_gpt_attn_rank][ircd_gpt_vector_attn_elems];
|
||||||
|
|
||||||
|
union ircd_gpt_vector
|
||||||
|
vector[ircd_gpt_attn_segs];
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__SIZEOF_FLOAT4__)
|
||||||
|
union ircd_gpt_attn_aperature_f32x4
|
||||||
|
{
|
||||||
|
float4
|
||||||
|
fcon[ircd_gpt_attn_fcon_elems / 4],
|
||||||
|
proj[ircd_gpt_attn_segs][ircd_gpt_vector_elems / 4],
|
||||||
|
qkv[ircd_gpt_attn_segs][ircd_gpt_attn_rank][ircd_gpt_vector_attn_elems / 4];
|
||||||
|
|
||||||
|
union ircd_gpt_vector_f32x4
|
||||||
|
vector[ircd_gpt_attn_segs];
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//
|
||||||
|
// ffnn aperature
|
||||||
|
//
|
||||||
|
|
||||||
|
#if defined(__SIZEOF_FLOAT__)
|
||||||
|
union ircd_gpt_ffnn_aperature
|
||||||
|
{
|
||||||
|
float
|
||||||
|
fcon[ircd_gpt_ffnn_fcon_elems],
|
||||||
|
proj[ircd_gpt_ffnn_segs][ircd_gpt_vector_elems];
|
||||||
|
|
||||||
|
union ircd_gpt_vector
|
||||||
|
vector[ircd_gpt_ffnn_segs];
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__SIZEOF_FLOAT4__)
|
||||||
|
union ircd_gpt_ffnn_aperature_f32x4
|
||||||
|
{
|
||||||
|
float4
|
||||||
|
fcon[ircd_gpt_ffnn_fcon_elems / 4],
|
||||||
|
proj[ircd_gpt_ffnn_segs][ircd_gpt_vector_elems / 4];
|
||||||
|
|
||||||
|
union ircd_gpt_vector_f32x4
|
||||||
|
vector[ircd_gpt_ffnn_segs];
|
||||||
|
};
|
||||||
|
#endif
|
|
@ -34,11 +34,17 @@ namespace ircd::gpt::vocab
|
||||||
merges_path;
|
merges_path;
|
||||||
|
|
||||||
// Tokenize UTF-8 input string of any length into proper token values,
|
// Tokenize UTF-8 input string of any length into proper token values,
|
||||||
vector_view<u16> tokenize(const vector_view<u16> &out, const string_view &in);
|
vector_view<u16> tokenize(const vector_view<u16> &out, const string_view &in) noexcept;
|
||||||
|
|
||||||
|
// Tokenize one token. The buffer is advanced consuming one token per call.
|
||||||
|
u16 tokenize(const_buffer &) noexcept;
|
||||||
|
|
||||||
|
// Tokenize one token. Error thrown if input is not exactly one token.
|
||||||
|
u16 tokenize(const string_view &in);
|
||||||
|
|
||||||
// Decode token values to build output text string.
|
// Decode token values to build output text string.
|
||||||
string_view detokenize(const mutable_buffer &out, const vector_view<const u16> &in);
|
string_view detokenize(const mutable_buffer &out, const vector_view<const u16> &in) noexcept;
|
||||||
|
|
||||||
// Other tools
|
// Other tools
|
||||||
string_view debug(const mutable_buffer &buf, const u16 token);
|
string_view debug(const mutable_buffer &buf, const u16 token, const uint fmt_msk = -1U);
|
||||||
}
|
}
|
||||||
|
|
166
ircd/Makefile.am
166
ircd/Makefile.am
|
@ -53,6 +53,7 @@ if LTO
|
||||||
if CLANG
|
if CLANG
|
||||||
AM_CXXFLAGS += -fstrict-vtable-pointers
|
AM_CXXFLAGS += -fstrict-vtable-pointers
|
||||||
AM_CXXFLAGS += -fwhole-program-vtables
|
AM_CXXFLAGS += -fwhole-program-vtables
|
||||||
|
#AM_LDFLAGS += -Wl,-plugin-opt,-pass-remarks='.*'
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -87,22 +88,6 @@ AM_LDFLAGS += -Wl,--enable-runtime-pseudo-reloc
|
||||||
AM_LDFLAGS += -export-symbols-regex '*'
|
AM_LDFLAGS += -export-symbols-regex '*'
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ROCKSDB_SRC_CPPFLAGS =#
|
|
||||||
ROCKSDB_SRC_CPPFLAGS += -isystem $(top_srcdir)/deps/rocksdb/include
|
|
||||||
ROCKSDB_SRC_CPPFLAGS += -isystem $(top_srcdir)/deps/rocksdb
|
|
||||||
|
|
||||||
GPT_FP_CXXFLAGS =#
|
|
||||||
GPT_FP_CXXFLAGS += -fno-math-errno
|
|
||||||
GPT_FP_CXXFLAGS += -fno-trapping-math
|
|
||||||
GPT_FP_CXXFLAGS += -ffinite-math-only
|
|
||||||
GPT_FP_CXXFLAGS += -fno-signed-zeros
|
|
||||||
GPT_FP_CXXFLAGS += -fassociative-math
|
|
||||||
GPT_FP_CXXFLAGS += -ffp-contract=fast
|
|
||||||
GPT_FP_CXXFLAGS += -freciprocal-math
|
|
||||||
if CLANG
|
|
||||||
GPT_FP_CXXFLAGS += -fdenormal-fp-math=positive-zero
|
|
||||||
endif
|
|
||||||
|
|
||||||
libircddir = @libdir@
|
libircddir = @libdir@
|
||||||
libircd_LTLIBRARIES = libircd.la
|
libircd_LTLIBRARIES = libircd.la
|
||||||
|
|
||||||
|
@ -241,16 +226,12 @@ libircd_la_SOURCES += png.cc
|
||||||
if OPENCL
|
if OPENCL
|
||||||
libircd_la_SOURCES += cl.cc
|
libircd_la_SOURCES += cl.cc
|
||||||
endif
|
endif
|
||||||
libircd_la_SOURCES += gpt.cc
|
|
||||||
libircd_la_SOURCES += gpt_pipe.cc
|
|
||||||
libircd_la_SOURCES += gpt_model.cc
|
|
||||||
libircd_la_SOURCES += gpt_vocab.cc
|
libircd_la_SOURCES += gpt_vocab.cc
|
||||||
|
libircd_la_SOURCES += gpt_model.cc
|
||||||
|
libircd_la_SOURCES += gpt_pipe_code.cc
|
||||||
|
libircd_la_SOURCES += gpt_pipe.cc
|
||||||
libircd_la_SOURCES += gpt_cpu.cc
|
libircd_la_SOURCES += gpt_cpu.cc
|
||||||
if OPENCL
|
libircd_la_SOURCES += gpt.cc
|
||||||
if CLANG
|
|
||||||
BUILT_SOURCES += gpt_gpu.o
|
|
||||||
endif
|
|
||||||
endif
|
|
||||||
libircd_la_SOURCES += openssl.cc
|
libircd_la_SOURCES += openssl.cc
|
||||||
libircd_la_SOURCES += rfc1459.cc
|
libircd_la_SOURCES += rfc1459.cc
|
||||||
libircd_la_SOURCES += rfc3986.cc
|
libircd_la_SOURCES += rfc3986.cc
|
||||||
|
@ -283,6 +264,23 @@ libircd_la_SOURCES += ircd.cc
|
||||||
# Specific unit option composition
|
# Specific unit option composition
|
||||||
#
|
#
|
||||||
|
|
||||||
|
ROCKSDB_SRC_CPPFLAGS =#
|
||||||
|
ROCKSDB_SRC_CPPFLAGS += -isystem $(top_srcdir)/deps/rocksdb/include
|
||||||
|
ROCKSDB_SRC_CPPFLAGS += -isystem $(top_srcdir)/deps/rocksdb
|
||||||
|
|
||||||
|
GPT_FP_CXXFLAGS =#
|
||||||
|
GPT_FP_CXXFLAGS += -fno-math-errno
|
||||||
|
GPT_FP_CXXFLAGS += -fno-trapping-math
|
||||||
|
GPT_FP_CXXFLAGS += -ffinite-math-only
|
||||||
|
GPT_FP_CXXFLAGS += -fno-signed-zeros
|
||||||
|
GPT_FP_CXXFLAGS += -fassociative-math
|
||||||
|
GPT_FP_CXXFLAGS += -freciprocal-math
|
||||||
|
GPT_FP_CXXFLAGS += -ffp-contract=fast
|
||||||
|
if CLANG
|
||||||
|
GPT_FP_CXXFLAGS += -fdenormal-fp-math=positive-zero
|
||||||
|
GPT_FP_CXXFLAGS += -ffp-model=fast
|
||||||
|
endif
|
||||||
|
|
||||||
client.lo: AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
|
client.lo: AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
|
||||||
ctx_x86_64.lo: AM_CPPFLAGS := -I$(top_srcdir)/include
|
ctx_x86_64.lo: AM_CPPFLAGS := -I$(top_srcdir)/include
|
||||||
ctx.lo: AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
|
ctx.lo: AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
|
||||||
|
@ -307,7 +305,7 @@ endif
|
||||||
if IOU
|
if IOU
|
||||||
fs_iou.lo: AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
|
fs_iou.lo: AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
|
||||||
endif
|
endif
|
||||||
gpt.lo: AM_CXXFLAGS := ${AM_CXXFLAGS} ${GPT_FP_CXXFLAGS}
|
gpt_cpu.lo: AM_CXXFLAGS := ${AM_CXXFLAGS} ${GPT_FP_CXXFLAGS}
|
||||||
http.lo: AM_CPPFLAGS := ${SPIRIT_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
|
http.lo: AM_CPPFLAGS := ${SPIRIT_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
|
||||||
http.lo: AM_CXXFLAGS := ${SPIRIT_UNIT_CXXFLAGS} ${AM_CXXFLAGS}
|
http.lo: AM_CXXFLAGS := ${SPIRIT_UNIT_CXXFLAGS} ${AM_CXXFLAGS}
|
||||||
ios.lo: AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
|
ios.lo: AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
|
||||||
|
@ -364,12 +362,118 @@ default.profdata:
|
||||||
-$(LLVM_PROFDATA) merge -output=default.profdata default.proftext
|
-$(LLVM_PROFDATA) merge -output=default.profdata default.proftext
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
#
|
||||||
|
# Hardware Acceleration / sub-targets
|
||||||
|
#
|
||||||
|
|
||||||
if CLANG
|
if CLANG
|
||||||
if OPENCL
|
if OPENCL
|
||||||
gpt_gpu.ll: gpt_gpu.cl
|
|
||||||
$(CC) -S -emit-llvm -std=CL1.1 $(AM_CPPFLAGS) $(CPPFLAGS) $(DEFS) -Xclang -finclude-default-header -include "ircd/config.h" -o $@ -c $^
|
|
||||||
|
|
||||||
gpt_gpu.o: gpt_gpu.ll
|
GPU_CPPFLAGS =#
|
||||||
$(CC) -std=CL1.1 -o $@ -c $^
|
GPU_CPPFLAGS += -D__OPENCL_VERSION__=110
|
||||||
endif
|
GPU_CPPFLAGS += -Dcl_clang_storage_class_specifiers
|
||||||
endif
|
GPU_CPPFLAGS += -DNOFP64
|
||||||
|
GPU_CPPFLAGS += $(AM_CPPFLAGS)
|
||||||
|
GPU_CPPFLAGS += $(CPPFLAGS)
|
||||||
|
GPU_CPPFLAGS += $(DEFS)
|
||||||
|
|
||||||
|
GPU_CFLAGS =#
|
||||||
|
GPU_CFLAGS += -std=cl1.1
|
||||||
|
GPU_CFLAGS += -fident
|
||||||
|
GPU_CFLAGS += -fno-builtin
|
||||||
|
GPU_CFLAGS += -fverbose-asm
|
||||||
|
GPU_CFLAGS += -fno-discard-value-names
|
||||||
|
GPU_CFLAGS += -mllvm -propagate-attrs=true
|
||||||
|
GPU_CFLAGS += -O3
|
||||||
|
GPU_CFLAGS += -fno-trapping-math
|
||||||
|
GPU_CFLAGS += -ffinite-math-only
|
||||||
|
GPU_CFLAGS += -fno-signed-zeros
|
||||||
|
GPU_CFLAGS += -ffp-contract=fast
|
||||||
|
#GPU_CFLAGS += -include "ircd/config.h"
|
||||||
|
#GPU_CFLAGS += -include "clc/clc.h"
|
||||||
|
#GPU_CFLAGS += -Wassume
|
||||||
|
#GPU_CFLAGS += -Rpass-analysis=".*"
|
||||||
|
#GPU_CFLAGS += -Rpass-missed=".*"
|
||||||
|
|
||||||
|
GPU_LINKFLAGS =#
|
||||||
|
GPU_LINKFLAGS +=#
|
||||||
|
|
||||||
|
GPU_OPTFLAGS =#
|
||||||
|
GPU_OPTFLAGS += -propagate-attrs=true
|
||||||
|
GPU_OPTFLAGS += -O3
|
||||||
|
#GPU_OPTFLAGS += -opt-bisect-limit=-1
|
||||||
|
#GPU_OPTFLAGS += -debug-pass=Arguments
|
||||||
|
#GPU_OPTFLAGS += -pass-remarks='.*'
|
||||||
|
|
||||||
|
GPU_ASFLAGS =#
|
||||||
|
GPU_ASFLAGS += -fident
|
||||||
|
GPU_ASFLAGS += -fno-builtin
|
||||||
|
GPU_ASFLAGS += -mllvm -propagate-attrs=true
|
||||||
|
GPU_ASFLAGS += -mllvm -verify-machineinstrs
|
||||||
|
GPU_ASFLAGS += -O3
|
||||||
|
GPU_ASFLAGS += -Rpass-analysis=asm-printer
|
||||||
|
#GPU_ASFLAGS += -Rpass-analysis=".*"
|
||||||
|
#GPU_ASFLAGS += -mllvm -debug-pass=Arguments
|
||||||
|
#GPU_ASFLAGS += -mllvm -pass-remarks='.*'
|
||||||
|
|
||||||
|
#
|
||||||
|
# SPV
|
||||||
|
#
|
||||||
|
|
||||||
|
BUILT_SOURCES += gpt_gpu.spv.bc
|
||||||
|
gpt_gpu.spv.bc: gpt_gpu.cl
|
||||||
|
clang-13 -target spir-- $(GPU_CPPFLAGS) -O0 -emit-llvm -o $@ -x cl -c $^
|
||||||
|
|
||||||
|
BUILT_SOURCES += gpt_gpu.spv
|
||||||
|
gpt_gpu.spv: gpt_gpu.spv.bc
|
||||||
|
llvm-spirv -o $@ $^
|
||||||
|
|
||||||
|
CLEANFILES += gpt_gpu.spv.cc
|
||||||
|
libircd_la_SOURCES += gpt_gpu.spv.cc
|
||||||
|
gpt_gpu.spv.cc: gpt_gpu.spv
|
||||||
|
xxd -i $^ $@
|
||||||
|
|
||||||
|
#
|
||||||
|
# R600
|
||||||
|
#
|
||||||
|
|
||||||
|
R600_TARGET = r600--
|
||||||
|
|
||||||
|
#
|
||||||
|
# R600 Saint Barthélemy
|
||||||
|
#
|
||||||
|
|
||||||
|
R600_BARTS_CFLAGS = $(GPU_CFLAGS)
|
||||||
|
R600_BARTS_CFLAGS += -target $(R600_TARGET)
|
||||||
|
R600_BARTS_CFLAGS += -mcpu=barts
|
||||||
|
R600_BARTS_CFLAGS += -Xclang -mlink-bitcode-file -Xclang /usr/lib/clc/barts-r600--.bc
|
||||||
|
|
||||||
|
BUILT_SOURCES += gpt_gpu.r600_barts.bc
|
||||||
|
gpt_gpu.r600_barts.bc: gpt_gpu.cl
|
||||||
|
$(CC) $(R600_BARTS_CFLAGS) $(GPU_CPPFLAGS) -emit-llvm -o $@ -x cl -c $^
|
||||||
|
|
||||||
|
#BUILT_SOURCES += gpt_gpu.r600_barts.link.bc
|
||||||
|
gpt_gpu.r600_barts.link.bc: gpt_gpu.r600_barts.bc
|
||||||
|
llvm-link-14 $(GPU_LINKFLAGS) -o $@ $^ /usr/lib/clc/barts-r600--.bc
|
||||||
|
opt-14 $(GPU_OPTFLAGS) -o $@ $@
|
||||||
|
|
||||||
|
#BUILT_SOURCES += gpt_gpu.r600_barts.s
|
||||||
|
gpt_gpu.r600_barts.s: gpt_gpu.r600_barts.link.bc
|
||||||
|
$(CC) -cc1 $(GPU_ASFLAGS) -triple $(R600_TARGET) -emit-obj -S -o $@ -x ir $^
|
||||||
|
|
||||||
|
#BUILT_SOURCES += gpt_gpu.r600_barts.o
|
||||||
|
gpt_gpu.r600_barts.o: gpt_gpu.r600_barts.link.bc
|
||||||
|
$(CC) -cc1 $(GPU_ASFLAGS) -triple $(R600_TARGET) -emit-obj -o $@ -x ir $^
|
||||||
|
|
||||||
|
CLEANFILES += gpt_gpu.r600_barts.bc.cc
|
||||||
|
libircd_la_SOURCES += gpt_gpu.r600_barts.bc.cc
|
||||||
|
gpt_gpu.r600_barts.bc.cc: gpt_gpu.r600_barts.bc
|
||||||
|
xxd -i $^ $@
|
||||||
|
|
||||||
|
#
|
||||||
|
#
|
||||||
|
#
|
||||||
|
|
||||||
|
endif # OPENCL
|
||||||
|
endif # CLANG
|
||||||
|
|
1537
ircd/gpt.cc
1537
ircd/gpt.cc
File diff suppressed because it is too large
Load diff
491
ircd/gpt_cpu.cc
491
ircd/gpt_cpu.cc
|
@ -8,17 +8,21 @@
|
||||||
// copyright notice and this permission notice is present in all copies. The
|
// copyright notice and this permission notice is present in all copies. The
|
||||||
// full license for this software is available in the LICENSE file.
|
// full license for this software is available in the LICENSE file.
|
||||||
|
|
||||||
|
#pragma clang fp exceptions(ignore)
|
||||||
|
#pragma clang fp reassociate(on)
|
||||||
|
#pragma clang fp contract(fast)
|
||||||
|
|
||||||
namespace ircd::gpt
|
namespace ircd::gpt
|
||||||
{
|
{
|
||||||
static size_t adamw(f32x4 &, f32x4 &, f32x4 &, const f32, const f32, const f32, const f32, const u32, size_t);
|
static void adamw(const opts &, const u32, const f32, const uint, f32 *, f32 *, f32 *);
|
||||||
static size_t adamw(task &, const f32, f32 *, const size_t, f32 *const (&)[2], const size_t);
|
|
||||||
|
|
||||||
static size_t backprop(task &, const f32, model::norm &, f32 *const (&)[2], size_t);
|
static void backprop(const opts &, const u32, const f32, model::norm &, model::norm &, model::norm &);
|
||||||
static size_t backprop(task &, const f32, model::attn &, f32 *const (&)[2], size_t);
|
static void backprop(const opts &, const u32, const f32, model::attn &, model::attn &, model::attn &);
|
||||||
static size_t backprop(task &, const f32, model::ffnn &, f32 *const (&)[2], size_t);
|
static void backprop(const opts &, const u32, const f32, model::ffnn &, model::ffnn &, model::ffnn &);
|
||||||
static size_t backprop(task &, const f32, model::block &, f32 *const (&)[2], size_t);
|
static void backprop(const opts &, const u32, const f32, model::block &, model::block &, model::block &);
|
||||||
static size_t backprop(task &, const f32, model::embed &, f32 *const (&)[2], size_t);
|
static void backprop(const opts &, const u32, const f32, model::embed &, model::embed &, model::embed &);
|
||||||
extern size_t backprop(task &, const f32, model::decoder &, f32 *const (&)[2], size_t = 0);
|
static void backprop(const opts &, const u32, const f32, model::decoder &, model::decoder &, model::decoder &);
|
||||||
|
extern void backprop(const opts &, const u32, const f32, model::decoder &, f32 *const __restrict__ [2]) noexcept;
|
||||||
|
|
||||||
template<class T>
|
template<class T>
|
||||||
static void fmma(T *out, const T *in, const T *bias, const T *weight, const math::fmma_opts &);
|
static void fmma(T *out, const T *in, const T *bias, const T *weight, const math::fmma_opts &);
|
||||||
|
@ -37,7 +41,7 @@ namespace ircd::gpt
|
||||||
static void logits(float *, const float *, const model::decoder &);
|
static void logits(float *, const float *, const model::decoder &);
|
||||||
static void tail(float *, const float *, const model::decoder &);
|
static void tail(float *, const float *, const model::decoder &);
|
||||||
static u16 argmax(const float *, const opts &);
|
static u16 argmax(const float *, const opts &);
|
||||||
static void embed(float *, const u16 token, const u16 position, const opts &);
|
static void embed(float *, const u16 token, const u16 position, const model::decoder &);
|
||||||
|
|
||||||
static f32
|
static f32
|
||||||
logit alignas(64) [65536],
|
logit alignas(64) [65536],
|
||||||
|
@ -49,21 +53,20 @@ void
|
||||||
ircd::gpt::embed(float *const out,
|
ircd::gpt::embed(float *const out,
|
||||||
const u16 token,
|
const u16 token,
|
||||||
const u16 position,
|
const u16 position,
|
||||||
const opts &opts)
|
const model::decoder &model)
|
||||||
{
|
{
|
||||||
assert(opts.model);
|
|
||||||
const auto &wpe
|
const auto &wpe
|
||||||
{
|
{
|
||||||
opts.model->word.pos[position]
|
model.embed.pos[position]
|
||||||
};
|
};
|
||||||
|
|
||||||
const auto &wte
|
const auto &wte
|
||||||
{
|
{
|
||||||
opts.model->word.token[token]
|
model.embed.token[token]
|
||||||
};
|
};
|
||||||
|
|
||||||
for(uint j(0); j < 768; ++j)
|
for(uint j(0); j < 768; ++j)
|
||||||
out[j] = wte[j] + wpe[j];
|
out[j] = wte.elem[j] + wpe.elem[j];
|
||||||
}
|
}
|
||||||
|
|
||||||
uint16_t
|
uint16_t
|
||||||
|
@ -117,7 +120,7 @@ ircd::gpt::tail(float *const __restrict__ logit,
|
||||||
for(uint i(0); i < 768; ++i)
|
for(uint i(0); i < 768; ++i)
|
||||||
buf[0][i] = state[i];
|
buf[0][i] = state[i];
|
||||||
|
|
||||||
norm((f32x4 *)buf[0], (const f32x4 *)state, (const f32x4 *)d.f.bias, (const f32x4 *)d.f.weight, lnf_epsilon);
|
norm((f32x4 *)buf[0], (const f32x4 *)state, (const f32x4 *)d.embed.norm.bias.elem, (const f32x4 *)d.embed.norm.weight.elem, lnf_epsilon);
|
||||||
logits(logit, buf[0], d);
|
logits(logit, buf[0], d);
|
||||||
//logitsmax(logit, logit, vocab::tokens);
|
//logitsmax(logit, logit, vocab::tokens);
|
||||||
}
|
}
|
||||||
|
@ -132,7 +135,7 @@ ircd::gpt::logits(float *const __restrict__ out,
|
||||||
|
|
||||||
for(uint j(0); j < vocab::tokens; ++j)
|
for(uint j(0); j < vocab::tokens; ++j)
|
||||||
for(uint k(0); k < 768; ++k)
|
for(uint k(0); k < 768; ++k)
|
||||||
out[j] += in[k] * d.word.token[j][k];
|
out[j] += in[k] * d.embed.token[j].elem[k];
|
||||||
}
|
}
|
||||||
|
|
||||||
[[gnu::noinline]]
|
[[gnu::noinline]]
|
||||||
|
@ -192,7 +195,7 @@ ircd::gpt::coil(float *__restrict__ accum,
|
||||||
};
|
};
|
||||||
|
|
||||||
for(uint j(0); j < tokens; ++j)
|
for(uint j(0); j < tokens; ++j)
|
||||||
fmma((f32x4 *)(accum + j * 768), (const f32x4 *)(a[j]), (const f32x4 *)layer.attn.proj_bias, (const f32x4 *)layer.attn.proj_weight, fmma_opts);
|
fmma((f32x4 *)(accum + j * 768), (const f32x4 *)(a[j]), (const f32x4 *)layer.attn.proj_bias.elem, (const f32x4 *)layer.attn.proj_weight, fmma_opts);
|
||||||
|
|
||||||
for(uint j(0); j < tokens; ++j)
|
for(uint j(0); j < tokens; ++j)
|
||||||
ffnn(accum + j * 768, accum + j * 768, decoder, i);
|
ffnn(accum + j * 768, accum + j * 768, decoder, i);
|
||||||
|
@ -227,7 +230,7 @@ ircd::gpt::attn(float (&__restrict__ out)[3][1024][12][64],
|
||||||
buf alignas(64) [768],
|
buf alignas(64) [768],
|
||||||
proj alignas(64) [2304];
|
proj alignas(64) [2304];
|
||||||
|
|
||||||
norm((f32x4 *)buf, (const f32x4 *)(in + i * 768), (const f32x4 *)layer.ln1.bias, (const f32x4 *)layer.ln1.weight, ln1_epsilon);
|
norm((f32x4 *)buf, (const f32x4 *)(in + i * 768), (const f32x4 *)layer.attn.norm.bias.elem, (const f32x4 *)layer.attn.norm.weight.elem, ln1_epsilon);
|
||||||
|
|
||||||
static const math::fmma_opts fmma_opts
|
static const math::fmma_opts fmma_opts
|
||||||
{
|
{
|
||||||
|
@ -235,7 +238,7 @@ ircd::gpt::attn(float (&__restrict__ out)[3][1024][12][64],
|
||||||
};
|
};
|
||||||
|
|
||||||
memset(proj, 0x0, sizeof(proj));
|
memset(proj, 0x0, sizeof(proj));
|
||||||
fmma((f32x4 *)proj, (const f32x4 *)buf, (const f32x4 *)layer.attn.attn_bias, (const f32x4 *)layer.attn.attn_weight, fmma_opts);
|
fmma((f32x4 *)proj, (const f32x4 *)buf, (const f32x4 *)layer.attn.fcon_bias.fcon, (const f32x4 *)layer.attn.fcon_weight, fmma_opts);
|
||||||
|
|
||||||
#pragma clang loop unroll (disable)
|
#pragma clang loop unroll (disable)
|
||||||
for(uint j(0); j < 12; ++j)
|
for(uint j(0); j < 12; ++j)
|
||||||
|
@ -372,10 +375,10 @@ ircd::gpt::ffnn(float *const out,
|
||||||
buf2 alignas(64) [3072];
|
buf2 alignas(64) [3072];
|
||||||
|
|
||||||
memset(buf2, 0x0, sizeof(buf2));
|
memset(buf2, 0x0, sizeof(buf2));
|
||||||
norm((f32x4 *)buf, (const f32x4 *)in, (const f32x4 *)layer.ln2.bias, (const f32x4 *)layer.ln2.weight, ln2_epsilon);
|
norm((f32x4 *)buf, (const f32x4 *)in, (const f32x4 *)layer.ffnn.norm.bias.elem, (const f32x4 *)layer.ffnn.norm.weight.elem, ln2_epsilon);
|
||||||
fmma((f32x4 *)buf2, (const f32x4 *)buf, (const f32x4 *)layer.ffnn.fc_bias, (const f32x4 *)layer.ffnn.fc_weight, fmma3_opts);
|
fmma((f32x4 *)buf2, (const f32x4 *)buf, (const f32x4 *)layer.ffnn.fcon_bias.fcon, (const f32x4 *)layer.ffnn.fcon_weight, fmma3_opts);
|
||||||
gelu((f32x4 *)buf2, (const f32x4 *)buf2);
|
gelu((f32x4 *)buf2, (const f32x4 *)buf2);
|
||||||
fmma((f32x4 *)out, (const f32x4 *)buf2, (const f32x4 *)layer.ffnn.proj_bias, (const f32x4 *)layer.ffnn.proj_weight, fmma4_opts);
|
fmma((f32x4 *)out, (const f32x4 *)buf2, (const f32x4 *)layer.ffnn.proj_bias.elem, (const f32x4 *)layer.ffnn.proj_weight, fmma4_opts);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
@ -431,213 +434,219 @@ ircd::gpt::gelu(f32x4 &out,
|
||||||
//
|
//
|
||||||
|
|
||||||
[[gnu::noinline]]
|
[[gnu::noinline]]
|
||||||
size_t
|
void
|
||||||
ircd::gpt::backprop(task &task,
|
ircd::gpt::backprop(const opts &opts,
|
||||||
|
const u32 step,
|
||||||
const f32 grad,
|
const f32 grad,
|
||||||
model::decoder ¶m,
|
model::decoder &__restrict__ param,
|
||||||
f32 *const (&moment)[2],
|
f32 *const __restrict__ buf[2])
|
||||||
size_t off)
|
noexcept
|
||||||
{
|
{
|
||||||
for(uint i(0); i < 12; ++i)
|
model::decoder *const __restrict__ moment[2]
|
||||||
off = backprop(task, grad, param.layer[i], moment, off);
|
|
||||||
|
|
||||||
off = backprop(task, grad, param.f, moment, off);
|
|
||||||
off = backprop(task, grad, param.word, moment, off);
|
|
||||||
return off;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t
|
|
||||||
ircd::gpt::backprop(task &task,
|
|
||||||
const f32 grad,
|
|
||||||
model::embed ¶m,
|
|
||||||
f32 *const (&moment)[2],
|
|
||||||
size_t off)
|
|
||||||
{
|
|
||||||
assert(task.opts);
|
|
||||||
const auto &opts
|
|
||||||
{
|
{
|
||||||
*task.opts
|
reinterpret_cast<model::decoder *>(buf[0]),
|
||||||
|
reinterpret_cast<model::decoder *>(buf[1]),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
backprop(opts, step, grad, param, *moment[0], *moment[1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
ircd::gpt::backprop(const opts &opts,
|
||||||
|
const u32 step,
|
||||||
|
const f32 grad,
|
||||||
|
model::decoder &__restrict__ param,
|
||||||
|
model::decoder &__restrict__ moment0,
|
||||||
|
model::decoder &__restrict__ moment1)
|
||||||
|
{
|
||||||
|
fpe::errors_handle eh;
|
||||||
|
|
||||||
|
assume(opts.attn_rank > 0);
|
||||||
|
assume(opts.layers > 0);
|
||||||
|
const auto eln
|
||||||
|
{
|
||||||
|
opts.layers - 1 // step % opts.layers
|
||||||
|
};
|
||||||
|
|
||||||
|
for(int i(opts.layers - 1); i >= int(opts.layers - eln - 1); --i)
|
||||||
|
{
|
||||||
|
assert(i >= 0 && i < int(opts.layers));
|
||||||
|
backprop(opts, step, grad, param.layer[i], moment0.layer[i], moment1.layer[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
backprop(opts, step, grad, param.embed, moment0.embed, moment1.embed);
|
||||||
|
|
||||||
|
auto pending(eh.pending());
|
||||||
|
eh.clear_pending();
|
||||||
|
pending &= ~pending & FE_INEXACT;
|
||||||
|
if(unlikely(pending))
|
||||||
|
fpe::throw_errors(pending);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
ircd::gpt::backprop(const opts &opts,
|
||||||
|
const u32 step,
|
||||||
|
const f32 grad,
|
||||||
|
model::embed &__restrict__ param,
|
||||||
|
model::embed &__restrict__ moment0,
|
||||||
|
model::embed &__restrict__ moment1)
|
||||||
|
{
|
||||||
|
backprop(opts, step, grad, param.norm, moment0.norm, moment1.norm);
|
||||||
|
|
||||||
|
assume(opts.context_tokens > 0);
|
||||||
for(uint i(0); i < opts.context_tokens; ++i)
|
for(uint i(0); i < opts.context_tokens; ++i)
|
||||||
off = adamw(task, grad, param.pos[i], 768, moment, off);
|
adamw(opts, step, grad, 768, param.pos[i].elem, moment0.pos[i].elem, moment1.pos[i].elem);
|
||||||
|
|
||||||
|
assume(opts.logits > 0);
|
||||||
for(uint i(0); i < opts.logits; ++i)
|
for(uint i(0); i < opts.logits; ++i)
|
||||||
off = adamw(task, grad, param.token[i], 768, moment, off);
|
adamw(opts, step, grad, 768, param.token[i].elem, moment0.token[i].elem, moment1.token[i].elem);
|
||||||
|
|
||||||
return off;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t
|
void
|
||||||
ircd::gpt::backprop(task &task,
|
ircd::gpt::backprop(const opts &opts,
|
||||||
|
const u32 step,
|
||||||
const f32 grad,
|
const f32 grad,
|
||||||
model::block ¶m,
|
model::block &__restrict__ param,
|
||||||
f32 *const (&moment)[2],
|
model::block &__restrict__ moment0,
|
||||||
size_t off)
|
model::block &__restrict__ moment1)
|
||||||
{
|
{
|
||||||
off = backprop(task, grad, param.ln1, moment, off);
|
backprop(opts, step, grad, param.attn.norm, moment0.attn.norm, moment1.attn.norm);
|
||||||
off = backprop(task, grad, param.attn, moment, off);
|
backprop(opts, step, grad, param.attn, moment0.attn, moment1.attn);
|
||||||
off = backprop(task, grad, param.ln2, moment, off);
|
|
||||||
off = backprop(task, grad, param.ffnn, moment, off);
|
backprop(opts, step, grad, param.ffnn.norm, moment0.ffnn.norm, moment1.ffnn.norm);
|
||||||
return off;
|
backprop(opts, step, grad, param.ffnn, moment0.ffnn, moment1.ffnn);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t
|
void
|
||||||
ircd::gpt::backprop(task &task,
|
ircd::gpt::backprop(const opts &opts,
|
||||||
|
const u32 step,
|
||||||
const f32 grad,
|
const f32 grad,
|
||||||
model::attn ¶m,
|
model::attn &__restrict__ param,
|
||||||
f32 *const (&moment)[2],
|
model::attn &__restrict__ moment0,
|
||||||
size_t off)
|
model::attn &__restrict__ moment1)
|
||||||
{
|
{
|
||||||
off = adamw(task, grad, param.attn_bias, 2304, moment, off);
|
adamw(opts, step, grad, 2304, param.fcon_bias.fcon, moment0.fcon_bias.fcon, moment1.fcon_bias.fcon);
|
||||||
|
|
||||||
for(uint i(0); i < 768; ++i)
|
for(uint i(0); i < 768; ++i)
|
||||||
off = adamw(task, grad, param.attn_weight[i], 2304, moment, off);
|
adamw(opts, step, grad, 2304, param.fcon_weight[i].fcon, moment0.fcon_weight[i].fcon, moment1.fcon_weight[i].fcon);
|
||||||
|
|
||||||
off = adamw(task, grad, param.proj_bias, 768, moment, off);
|
adamw(opts, step, grad, 768, param.proj_bias.elem, moment0.proj_bias.elem, moment1.proj_bias.elem);
|
||||||
|
|
||||||
for(uint i(0); i < 768; ++i)
|
for(uint i(0); i < 768; ++i)
|
||||||
off = adamw(task, grad, param.proj_weight[i], 768, moment, off);
|
adamw(opts, step, grad, 768, param.proj_weight[i].elem, moment0.proj_weight[i].elem, moment1.proj_weight[i].elem);
|
||||||
|
|
||||||
return off;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t
|
void
|
||||||
ircd::gpt::backprop(task &task,
|
ircd::gpt::backprop(const opts &opts,
|
||||||
|
const u32 step,
|
||||||
const f32 grad,
|
const f32 grad,
|
||||||
model::ffnn ¶m,
|
model::ffnn &__restrict__ param,
|
||||||
f32 *const (&moment)[2],
|
model::ffnn &__restrict__ moment0,
|
||||||
size_t off)
|
model::ffnn &__restrict__ moment1)
|
||||||
{
|
{
|
||||||
off = adamw(task, grad, param.fc_bias, 3072, moment, off);
|
adamw(opts, step, grad, 3072, param.fcon_bias.fcon, moment0.fcon_bias.fcon, moment1.fcon_bias.fcon);
|
||||||
|
|
||||||
for(uint i(0); i < 768; ++i)
|
for(uint i(0); i < 768; ++i)
|
||||||
off = adamw(task, grad, param.fc_weight[i], 3072, moment, off);
|
adamw(opts, step, grad, 3072, param.fcon_weight[i].fcon, moment0.fcon_weight[i].fcon, moment1.fcon_weight[i].fcon);
|
||||||
|
|
||||||
off = adamw(task, grad, param.proj_bias, 768, moment, off);
|
adamw(opts, step, grad, 768, param.proj_bias.elem, moment0.proj_bias.elem, moment1.proj_bias.elem);
|
||||||
|
|
||||||
for(uint i(0); i < 3072; ++i)
|
for(uint i(0); i < 3072; ++i)
|
||||||
off = adamw(task, grad, param.proj_weight[i], 768, moment, off);
|
adamw(opts, step, grad, 768, param.proj_weight[i].elem, moment0.proj_weight[i].elem, moment1.proj_weight[i].elem);
|
||||||
|
|
||||||
return off;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t
|
void
|
||||||
ircd::gpt::backprop(task &task,
|
ircd::gpt::backprop(const opts &opts,
|
||||||
|
const u32 step,
|
||||||
const f32 grad,
|
const f32 grad,
|
||||||
model::norm ¶m,
|
model::norm &__restrict__ param,
|
||||||
f32 *const (&moment)[2],
|
model::norm &__restrict__ moment0,
|
||||||
size_t off)
|
model::norm &__restrict__ moment1)
|
||||||
{
|
{
|
||||||
off = adamw(task, grad, param.bias, 768, moment, off);
|
adamw(opts, step, grad, 768, param.bias.elem, moment0.bias.elem, moment1.bias.elem);
|
||||||
off = adamw(task, grad, param.weight, 768, moment, off);
|
adamw(opts, step, grad, 768, param.weight.elem, moment0.weight.elem, moment1.weight.elem);
|
||||||
return off;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
[[gnu::noinline]]
|
namespace ircd::gpt
|
||||||
size_t
|
|
||||||
ircd::gpt::adamw(task &task,
|
|
||||||
const f32 grad,
|
|
||||||
f32 *const p_,
|
|
||||||
const size_t num,
|
|
||||||
f32 *const (&__restrict__ m_)[2],
|
|
||||||
size_t off)
|
|
||||||
{
|
{
|
||||||
assert(task.opts);
|
static f32x4 adamw_moment(const f32x4, const f32, const f32);
|
||||||
const auto &opts
|
static f32x4 adamw_numer(const f32x4, const f32, const u32);
|
||||||
|
static f32x4 adamw_denom(const f32x4, const f32, const u32);
|
||||||
|
static f32x4 adamw_delta(const f32x4, const f32x4, const f32, const f32, const f32, const u32);
|
||||||
|
static void adamw(f32x4 &, f32x4 &, f32x4 &, const f32, const f32, const f32, const f32, const u32);
|
||||||
|
static void adamw(const opts &, const u32, const f32, const u32, f32 *, f32 *, f32 *);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
ircd::gpt::adamw(const opts &opts,
|
||||||
|
const u32 step,
|
||||||
|
const f32 grad,
|
||||||
|
const u32 num,
|
||||||
|
f32 *const __restrict__ param,
|
||||||
|
f32 *const __restrict__ moment0,
|
||||||
|
f32 *const __restrict__ moment1)
|
||||||
|
{
|
||||||
|
f32x4 *const __restrict__ val[3]
|
||||||
{
|
{
|
||||||
*task.opts
|
reinterpret_cast<f32x4 *>(param),
|
||||||
|
reinterpret_cast<f32x4 *>(moment0),
|
||||||
|
reinterpret_cast<f32x4 *>(moment1),
|
||||||
};
|
};
|
||||||
|
|
||||||
assert(task.ctrl);
|
const auto n
|
||||||
auto &ctrl
|
|
||||||
{
|
{
|
||||||
*task.ctrl
|
num / 4
|
||||||
};
|
};
|
||||||
|
|
||||||
f32x4 *const p[3]
|
assume(0 < n);
|
||||||
{
|
for(uint i(0); i < n; ++i)
|
||||||
reinterpret_cast<f32x4 *>(p_),
|
adamw
|
||||||
reinterpret_cast<f32x4 *>(m_[0]) + off,
|
|
||||||
reinterpret_cast<f32x4 *>(m_[1]) + off,
|
|
||||||
};
|
|
||||||
|
|
||||||
assert(num >= 4);
|
|
||||||
const uint n
|
|
||||||
{
|
|
||||||
uint(num) / 4
|
|
||||||
};
|
|
||||||
|
|
||||||
// Assume loop body always taken w/o soundness; otherwise extra branch.
|
|
||||||
assert(n > 0);
|
|
||||||
uint i(0); do
|
|
||||||
{
|
|
||||||
off = adamw
|
|
||||||
(
|
(
|
||||||
p[0][i],
|
val[0][i],
|
||||||
p[1][i],
|
val[1][i],
|
||||||
p[2][i],
|
val[2][i],
|
||||||
grad,
|
grad,
|
||||||
opts.alpha,
|
opts.alpha,
|
||||||
opts.beta[0],
|
opts.beta[0],
|
||||||
opts.beta[1],
|
opts.beta[1],
|
||||||
ctrl.epic.step,
|
step + 1
|
||||||
off
|
|
||||||
);
|
);
|
||||||
}
|
|
||||||
while(++i < n);
|
|
||||||
|
|
||||||
return off;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t
|
void
|
||||||
ircd::gpt::adamw(f32x4 &__restrict__ param,
|
ircd::gpt::adamw(f32x4 &__restrict__ param,
|
||||||
f32x4 &__restrict__ moment0,
|
f32x4 &__restrict__ moment0,
|
||||||
f32x4 &__restrict__ moment1,
|
f32x4 &__restrict__ moment1,
|
||||||
const f32 grad,
|
const f32 grad_,
|
||||||
const f32 alpha,
|
const f32 alpha_,
|
||||||
const f32 beta0,
|
const f32 beta0,
|
||||||
const f32 beta1,
|
const f32 beta1,
|
||||||
const u32 step,
|
const u32 step)
|
||||||
const size_t off)
|
|
||||||
{
|
{
|
||||||
const f32x4 one
|
const f32 alpha
|
||||||
{
|
{
|
||||||
1.0f, 1.0f, 1.0f, 1.0f,
|
grad_ < 0? -alpha_ : alpha_
|
||||||
};
|
};
|
||||||
|
|
||||||
const f32x4 a[2]
|
const f32 grad
|
||||||
{
|
{
|
||||||
{ one - beta0 },
|
grad_ < 0? -grad_ : grad_
|
||||||
{ one - beta1 },
|
|
||||||
};
|
};
|
||||||
|
|
||||||
const f32x4 avg_mul[2]
|
const f32 grad_grad
|
||||||
{
|
{
|
||||||
{ moment0 * beta0 },
|
grad * grad
|
||||||
{ moment1 * beta1 },
|
|
||||||
};
|
};
|
||||||
|
|
||||||
const f32x4 avg_dot[2]
|
const f32x4 moment[]
|
||||||
{
|
{
|
||||||
{ avg_mul[0] + a[0] * grad },
|
adamw_moment(moment0, grad, beta0),
|
||||||
{ avg_mul[1] + a[1] * grad * grad },
|
adamw_moment(moment1, grad_grad, beta1)
|
||||||
};
|
|
||||||
|
|
||||||
const f32x4 bias[2]
|
|
||||||
{
|
|
||||||
{ avg_dot[0] / (one - powf(beta0, step + 1)) },
|
|
||||||
{ avg_dot[1] / (one - powf(beta1, step + 1)) },
|
|
||||||
};
|
|
||||||
|
|
||||||
const f32x4 denom
|
|
||||||
{
|
|
||||||
sqrtf(bias[1]) + 0.000001f // epsilon
|
|
||||||
};
|
};
|
||||||
|
|
||||||
const f32x4 delta
|
const f32x4 delta
|
||||||
{
|
{
|
||||||
alpha * (bias[0] / denom)
|
adamw_delta(moment[0], moment[1], alpha, beta0, beta1, step)
|
||||||
};
|
};
|
||||||
|
|
||||||
const f32x4 update
|
const f32x4 update
|
||||||
|
@ -645,8 +654,168 @@ ircd::gpt::adamw(f32x4 &__restrict__ param,
|
||||||
param - delta
|
param - delta
|
||||||
};
|
};
|
||||||
|
|
||||||
moment0 = avg_dot[0];
|
if((false))
|
||||||
moment1 = avg_dot[1];
|
for(uint i(0); i < 4; ++i)
|
||||||
|
printf("%-15p p[%11.8lf] m[%11.8lf][%11.8lf] g[%11.8lf] d[%11.8lf] p[%11.8lf] m[%11.8lf][%11.8lf]\n",
|
||||||
|
((f32 *)¶m) + i,
|
||||||
|
param[i],
|
||||||
|
moment0[i],
|
||||||
|
moment1[i],
|
||||||
|
grad,
|
||||||
|
delta[i],
|
||||||
|
update[i],
|
||||||
|
moment[0][i],
|
||||||
|
moment[1][i]);
|
||||||
|
|
||||||
|
assert(std::isnormal(update[0]));
|
||||||
|
assert(std::isnormal(update[1]));
|
||||||
|
assert(std::isnormal(update[2]));
|
||||||
|
assert(std::isnormal(update[3]));
|
||||||
|
|
||||||
|
assert(std::isnormal(moment[0][0]));
|
||||||
|
assert(std::isnormal(moment[0][1]));
|
||||||
|
assert(std::isnormal(moment[0][2]));
|
||||||
|
assert(std::isnormal(moment[0][3]));
|
||||||
|
|
||||||
|
assert(std::isnormal(moment[1][0]));
|
||||||
|
assert(std::isnormal(moment[1][1]));
|
||||||
|
assert(std::isnormal(moment[1][2]));
|
||||||
|
assert(std::isnormal(moment[1][3]));
|
||||||
|
|
||||||
param = update;
|
param = update;
|
||||||
return off + 1;
|
//__builtin_nontemporal_store(update, ¶m);
|
||||||
|
|
||||||
|
moment0 = moment[0];
|
||||||
|
moment1 = moment[1];
|
||||||
|
//__builtin_nontemporal_store(moment[0], &moment0);
|
||||||
|
//__builtin_nontemporal_store(moment[1], &moment1);
|
||||||
|
}
|
||||||
|
|
||||||
|
ircd::f32x4
|
||||||
|
ircd::gpt::adamw_delta(const f32x4 moment0,
|
||||||
|
const f32x4 moment1,
|
||||||
|
const f32 alpha,
|
||||||
|
const f32 beta0,
|
||||||
|
const f32 beta1,
|
||||||
|
const u32 step)
|
||||||
|
{
|
||||||
|
static const f32 epsilon
|
||||||
|
{
|
||||||
|
FLT_EPSILON
|
||||||
|
};
|
||||||
|
|
||||||
|
const f32x4 denom
|
||||||
|
{
|
||||||
|
adamw_denom(moment1, beta1, step) + epsilon
|
||||||
|
};
|
||||||
|
|
||||||
|
const f32x4 decay
|
||||||
|
{
|
||||||
|
adamw_numer(moment0, beta0, step)
|
||||||
|
};
|
||||||
|
|
||||||
|
const f32x4 smooth
|
||||||
|
{
|
||||||
|
alpha * decay
|
||||||
|
};
|
||||||
|
|
||||||
|
assert(std::isnormal(denom[0]));
|
||||||
|
assert(std::isnormal(denom[1]));
|
||||||
|
assert(std::isnormal(denom[2]));
|
||||||
|
assert(std::isnormal(denom[3]));
|
||||||
|
const f32x4 delta
|
||||||
|
{
|
||||||
|
smooth / denom
|
||||||
|
};
|
||||||
|
|
||||||
|
return delta;
|
||||||
|
}
|
||||||
|
|
||||||
|
ircd::f32x4
|
||||||
|
ircd::gpt::adamw_denom(const f32x4 moment,
|
||||||
|
const f32 beta,
|
||||||
|
const u32 step)
|
||||||
|
{
|
||||||
|
static const f32x4 one
|
||||||
|
{
|
||||||
|
1.0f, 1.0f, 1.0f, 1.0f,
|
||||||
|
};
|
||||||
|
|
||||||
|
assert(step > 0);
|
||||||
|
const f32x4 decay
|
||||||
|
{
|
||||||
|
one - powf(beta, step)
|
||||||
|
};
|
||||||
|
|
||||||
|
assert(std::isnormal(decay[0]));
|
||||||
|
assert(std::isnormal(decay[1]));
|
||||||
|
assert(std::isnormal(decay[2]));
|
||||||
|
assert(std::isnormal(decay[3]));
|
||||||
|
const f32x4 bias
|
||||||
|
{
|
||||||
|
moment / decay
|
||||||
|
};
|
||||||
|
|
||||||
|
const f32x4 denom
|
||||||
|
{
|
||||||
|
sqrtf(bias)
|
||||||
|
};
|
||||||
|
|
||||||
|
return denom;
|
||||||
|
}
|
||||||
|
|
||||||
|
ircd::f32x4
|
||||||
|
ircd::gpt::adamw_numer(const f32x4 moment,
|
||||||
|
const f32 beta,
|
||||||
|
const u32 step)
|
||||||
|
{
|
||||||
|
static const f32x4 one
|
||||||
|
{
|
||||||
|
1.0f, 1.0f, 1.0f, 1.0f,
|
||||||
|
};
|
||||||
|
|
||||||
|
assert(step > 0);
|
||||||
|
const f32x4 decay
|
||||||
|
{
|
||||||
|
one - powf(beta, step)
|
||||||
|
};
|
||||||
|
|
||||||
|
assert(std::isnormal(decay[0]));
|
||||||
|
assert(std::isnormal(decay[1]));
|
||||||
|
assert(std::isnormal(decay[2]));
|
||||||
|
assert(std::isnormal(decay[3]));
|
||||||
|
const f32x4 bias
|
||||||
|
{
|
||||||
|
moment / decay
|
||||||
|
};
|
||||||
|
|
||||||
|
return bias;
|
||||||
|
}
|
||||||
|
|
||||||
|
ircd::f32x4
|
||||||
|
ircd::gpt::adamw_moment(const f32x4 moment,
|
||||||
|
const f32 grad,
|
||||||
|
const f32 beta)
|
||||||
|
{
|
||||||
|
static const f32x4 one
|
||||||
|
{
|
||||||
|
1.0f, 1.0f, 1.0f, 1.0f,
|
||||||
|
};
|
||||||
|
|
||||||
|
const f32x4 rate
|
||||||
|
{
|
||||||
|
one - beta
|
||||||
|
};
|
||||||
|
|
||||||
|
const f32x4 avg
|
||||||
|
{
|
||||||
|
moment * beta
|
||||||
|
};
|
||||||
|
|
||||||
|
const f32x4 dot
|
||||||
|
{
|
||||||
|
rate * grad + avg
|
||||||
|
};
|
||||||
|
|
||||||
|
return dot;
|
||||||
}
|
}
|
||||||
|
|
2428
ircd/gpt_gpu.cl
2428
ircd/gpt_gpu.cl
File diff suppressed because it is too large
Load diff
|
@ -54,10 +54,14 @@ namespace ircd::gpt::model
|
||||||
static fs::map
|
static fs::map
|
||||||
default_model_shm,
|
default_model_shm,
|
||||||
default_dataset_shm;
|
default_dataset_shm;
|
||||||
|
|
||||||
static std::unique_ptr<decoder> default_model_res;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
constexpr const char
|
||||||
|
*const ircd::gpt::model::prop::ended,
|
||||||
|
*const ircd::gpt::model::prop::id,
|
||||||
|
*const ircd::gpt::model::prop::length,
|
||||||
|
*const ircd::gpt::model::prop::text;
|
||||||
|
|
||||||
decltype(ircd::gpt::model::manifest_h)
|
decltype(ircd::gpt::model::manifest_h)
|
||||||
ircd::gpt::model::manifest_h
|
ircd::gpt::model::manifest_h
|
||||||
{
|
{
|
||||||
|
@ -102,7 +106,7 @@ decltype(ircd::gpt::model::cache_hugepage)
|
||||||
ircd::gpt::model::cache_hugepage
|
ircd::gpt::model::cache_hugepage
|
||||||
{
|
{
|
||||||
{ "name", "ircd.gpt.model.cache.hugepage" },
|
{ "name", "ircd.gpt.model.cache.hugepage" },
|
||||||
{ "default", true },
|
{ "default", false },
|
||||||
};
|
};
|
||||||
|
|
||||||
decltype(ircd::gpt::model::cache_path)
|
decltype(ircd::gpt::model::cache_path)
|
||||||
|
@ -132,6 +136,12 @@ ircd::gpt::model::path
|
||||||
decltype(ircd::gpt::model::default_model)
|
decltype(ircd::gpt::model::default_model)
|
||||||
ircd::gpt::model::default_model;
|
ircd::gpt::model::default_model;
|
||||||
|
|
||||||
|
decltype(ircd::gpt::model::default_moment)
|
||||||
|
ircd::gpt::model::default_moment;
|
||||||
|
|
||||||
|
decltype(ircd::gpt::model::default_checkpoint)
|
||||||
|
ircd::gpt::model::default_checkpoint;
|
||||||
|
|
||||||
decltype(ircd::gpt::model::default_dataset)
|
decltype(ircd::gpt::model::default_dataset)
|
||||||
ircd::gpt::model::default_dataset;
|
ircd::gpt::model::default_dataset;
|
||||||
|
|
||||||
|
@ -144,17 +154,31 @@ ircd::gpt::model::init()
|
||||||
if(!model::path)
|
if(!model::path)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if(!init_from_cache(model::cache_path))
|
|
||||||
init_from_json(model::cache_path, model::path);
|
|
||||||
|
|
||||||
if(model::dataset_path)
|
if(model::dataset_path)
|
||||||
init_dataset(model::dataset_path);
|
init_dataset(model::dataset_path);
|
||||||
|
|
||||||
|
if(likely(init_from_cache(model::cache_path)))
|
||||||
|
return;
|
||||||
|
|
||||||
|
init_from_json(model::cache_path, model::path);
|
||||||
|
if(unlikely(!init_from_cache(model::cache_path)))
|
||||||
|
throw error
|
||||||
|
{
|
||||||
|
"Failed to find and/or initialize model."
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
ircd::gpt::model::fini()
|
ircd::gpt::model::fini()
|
||||||
noexcept
|
noexcept
|
||||||
{
|
{
|
||||||
|
default_checkpoint[2] = nullptr;
|
||||||
|
default_checkpoint[1] = nullptr;
|
||||||
|
default_checkpoint[0] = nullptr;
|
||||||
|
|
||||||
|
default_moment[1] = nullptr;
|
||||||
|
default_moment[0] = nullptr;
|
||||||
|
|
||||||
default_model = nullptr;
|
default_model = nullptr;
|
||||||
default_model_shm = {};
|
default_model_shm = {};
|
||||||
|
|
||||||
|
@ -169,18 +193,33 @@ ircd::gpt::model::init_from_cache(const string_view &cache_path)
|
||||||
if(!fs::is_reg(cache_path))
|
if(!fs::is_reg(cache_path))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
const auto size
|
const auto file_size
|
||||||
{
|
{
|
||||||
fs::size(cache_path)
|
fs::size(cache_path)
|
||||||
};
|
};
|
||||||
|
|
||||||
if(unlikely(size != sizeof(model::decoder)))
|
const auto decoder_size
|
||||||
|
{
|
||||||
|
sizeof(model::decoder)
|
||||||
|
};
|
||||||
|
|
||||||
|
const bool has_params
|
||||||
|
{
|
||||||
|
file_size >= decoder_size
|
||||||
|
};
|
||||||
|
|
||||||
|
const bool has_moments
|
||||||
|
{
|
||||||
|
file_size >= decoder_size * 6
|
||||||
|
};
|
||||||
|
|
||||||
|
if(unlikely(!has_params))
|
||||||
throw error
|
throw error
|
||||||
{
|
{
|
||||||
"Cached model `%s' size %zu differs from %zu.",
|
"Cached model `%s' size %zu insufficient for decoder size %zu.",
|
||||||
cache_path,
|
cache_path,
|
||||||
size,
|
file_size,
|
||||||
sizeof(model::decoder),
|
decoder_size,
|
||||||
};
|
};
|
||||||
|
|
||||||
const auto mode
|
const auto mode
|
||||||
|
@ -192,20 +231,41 @@ ircd::gpt::model::init_from_cache(const string_view &cache_path)
|
||||||
|
|
||||||
const fs::fd fd
|
const fs::fd fd
|
||||||
{
|
{
|
||||||
cache_path, mode
|
cache_path, fs::fd::opts
|
||||||
|
{
|
||||||
|
.mode = mode,
|
||||||
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
fs::map::opts map_opts
|
const bool map_moments
|
||||||
{
|
{
|
||||||
mode
|
has_moments || cache_shared
|
||||||
|
};
|
||||||
|
|
||||||
|
if(!has_moments && map_moments)
|
||||||
|
{
|
||||||
|
fs::truncate(fd, decoder_size * 6);
|
||||||
|
fs::allocate(fd, decoder_size * 5, decoder_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto map_size
|
||||||
|
{
|
||||||
|
map_moments?
|
||||||
|
decoder_size * 6:
|
||||||
|
decoder_size
|
||||||
|
};
|
||||||
|
|
||||||
|
const fs::map::opts map_opts
|
||||||
|
{
|
||||||
|
.alignment = alignof(model::decoder),
|
||||||
|
.shared = bool(cache_shared),
|
||||||
|
.locked = bool(cache_locked),
|
||||||
|
.huge2mb = bool(cache_hugepage),
|
||||||
};
|
};
|
||||||
|
|
||||||
map_opts.locked = bool(cache_locked);
|
|
||||||
map_opts.shared = bool(cache_shared);
|
|
||||||
map_opts.huge2mb = bool(cache_hugepage);
|
|
||||||
default_model_shm = fs::map
|
default_model_shm = fs::map
|
||||||
{
|
{
|
||||||
fd, map_opts, sizeof(decoder)
|
fd, map_opts, map_size
|
||||||
};
|
};
|
||||||
|
|
||||||
default_model = reinterpret_cast<decoder *>
|
default_model = reinterpret_cast<decoder *>
|
||||||
|
@ -213,13 +273,28 @@ ircd::gpt::model::init_from_cache(const string_view &cache_path)
|
||||||
data(default_model_shm)
|
data(default_model_shm)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
if(map_moments)
|
||||||
|
{
|
||||||
|
default_moment[0] = reinterpret_cast<float *>(default_model + 1);
|
||||||
|
default_moment[1] = reinterpret_cast<float *>(default_model + 2);
|
||||||
|
default_checkpoint[0] = reinterpret_cast<float *>(default_model + 3);
|
||||||
|
default_checkpoint[1] = reinterpret_cast<float *>(default_model + 4);
|
||||||
|
default_checkpoint[2] = reinterpret_cast<float *>(default_model + 5);
|
||||||
|
}
|
||||||
|
|
||||||
|
allocator::lock({(const char *)default_model, sizeof(decoder)});
|
||||||
|
fs::prefetch(default_model_shm, sizeof(decoder));
|
||||||
|
|
||||||
char pbuf[48];
|
char pbuf[48];
|
||||||
log::info
|
log::info
|
||||||
{
|
{
|
||||||
log, "model(%p) mapped cached model `%s' %s",
|
log, "model(%p) mapped cached model `%s' params:%b moments:%b align:%u %s",
|
||||||
data(default_model_shm),
|
data(default_model_shm),
|
||||||
cache_path,
|
cache_path,
|
||||||
pretty(pbuf, iec(size)),
|
has_params,
|
||||||
|
has_moments,
|
||||||
|
map_opts.alignment,
|
||||||
|
pretty(pbuf, iec(map_size)),
|
||||||
};
|
};
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
@ -264,9 +339,6 @@ ircd::gpt::model::init_from_json(const string_view &cache_path,
|
||||||
cache_path,
|
cache_path,
|
||||||
stopwatch.pretty(pbuf[1]),
|
stopwatch.pretty(pbuf[1]),
|
||||||
};
|
};
|
||||||
|
|
||||||
default_model_res = std::move(decoder);
|
|
||||||
default_model = default_model_res.get();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
@ -363,6 +435,7 @@ ircd::gpt::model::init_dataset(const string_view &path)
|
||||||
|
|
||||||
size_t checkpoint(0);
|
size_t checkpoint(0);
|
||||||
default_data.resize(260000); //TODO: XXX
|
default_data.resize(260000); //TODO: XXX
|
||||||
|
fs::prefetch(default_dataset_shm, size);
|
||||||
ircd::tokens(default_dataset, '\n', [&checkpoint]
|
ircd::tokens(default_dataset, '\n', [&checkpoint]
|
||||||
(const string_view &line)
|
(const string_view &line)
|
||||||
{
|
{
|
||||||
|
@ -379,6 +452,7 @@ ircd::gpt::model::init_dataset(const string_view &path)
|
||||||
checkpoint,
|
checkpoint,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
fs::evict(default_dataset_shm, size);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -393,9 +467,9 @@ ircd::gpt::model::init_wpe_weight(decoder &d,
|
||||||
{
|
{
|
||||||
size_t j(0);
|
size_t j(0);
|
||||||
for(const auto &elem : vec)
|
for(const auto &elem : vec)
|
||||||
d.word.pos[i][j++] = lex_cast<float>(elem);
|
d.embed.pos[i].elem[j++] = lex_cast<float>(elem);
|
||||||
|
|
||||||
always_assert(j == sizeof(d.word.pos[i]) / sizeof(float));
|
always_assert(j == sizeof(d.embed.pos[i]) / sizeof(float));
|
||||||
++i;
|
++i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -411,9 +485,9 @@ ircd::gpt::model::init_wte_weight(decoder &d,
|
||||||
{
|
{
|
||||||
size_t j(0);
|
size_t j(0);
|
||||||
for(const auto &elem : vec)
|
for(const auto &elem : vec)
|
||||||
d.word.token[i][j++] = lex_cast<float>(elem);
|
d.embed.token[i].elem[j++] = lex_cast<float>(elem);
|
||||||
|
|
||||||
always_assert(j == sizeof(d.word.token[i]) / sizeof(float));
|
always_assert(j == sizeof(d.embed.token[i]) / sizeof(float));
|
||||||
++i;
|
++i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -426,9 +500,9 @@ ircd::gpt::model::init_f_weight(decoder &d,
|
||||||
{
|
{
|
||||||
size_t i(0);
|
size_t i(0);
|
||||||
for(const auto &elem : vec)
|
for(const auto &elem : vec)
|
||||||
d.f.weight[i++] = lex_cast<float>(elem);
|
d.embed.norm.weight.elem[i++] = lex_cast<float>(elem);
|
||||||
|
|
||||||
always_assert(i == sizeof(d.f.weight) / sizeof(float));
|
always_assert(i == sizeof(d.embed.norm.weight) / sizeof(float));
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
@ -439,9 +513,9 @@ ircd::gpt::model::init_f_bias(decoder &d,
|
||||||
{
|
{
|
||||||
size_t i(0);
|
size_t i(0);
|
||||||
for(const auto &elem : vec)
|
for(const auto &elem : vec)
|
||||||
d.f.bias[i++] = lex_cast<float>(elem);
|
d.embed.norm.bias.elem[i++] = lex_cast<float>(elem);
|
||||||
|
|
||||||
always_assert(i == sizeof(d.f.bias) / sizeof(float));
|
always_assert(i == sizeof(d.embed.norm.bias) / sizeof(float));
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
@ -455,16 +529,16 @@ ircd::gpt::model::init_h_ffnn_fc_weight(decoder &d,
|
||||||
{
|
{
|
||||||
size_t j(0);
|
size_t j(0);
|
||||||
for(const auto &elem : vec)
|
for(const auto &elem : vec)
|
||||||
d.layer[layer].ffnn.fc_weight[i][j++] = lex_cast<float>(elem);
|
d.layer[layer].ffnn.fcon_weight[i].fcon[j++] = lex_cast<float>(elem);
|
||||||
|
|
||||||
always_assert(j == sizeof(d.layer[layer].ffnn.fc_weight[i]) / sizeof(float));
|
always_assert(j == sizeof(d.layer[layer].ffnn.fcon_weight[i]) / sizeof(float));
|
||||||
++i;
|
++i;
|
||||||
}
|
}
|
||||||
|
|
||||||
always_assert
|
always_assert
|
||||||
(
|
(
|
||||||
i == sizeof(d.layer[layer].ffnn.fc_weight)
|
i == sizeof(d.layer[layer].ffnn.fcon_weight)
|
||||||
/ sizeof(d.layer[layer].ffnn.fc_weight[0])
|
/ sizeof(d.layer[layer].ffnn.fcon_weight[0])
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -476,9 +550,9 @@ ircd::gpt::model::init_h_ffnn_fc_bias(decoder &d,
|
||||||
{
|
{
|
||||||
size_t i(0);
|
size_t i(0);
|
||||||
for(const auto &elem : vec)
|
for(const auto &elem : vec)
|
||||||
d.layer[layer].ffnn.fc_bias[i++] = lex_cast<float>(elem);
|
d.layer[layer].ffnn.fcon_bias.fcon[i++] = lex_cast<float>(elem);
|
||||||
|
|
||||||
always_assert(i == sizeof(d.layer[layer].ffnn.fc_bias) / sizeof(float));
|
always_assert(i == sizeof(d.layer[layer].ffnn.fcon_bias) / sizeof(float));
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
@ -492,7 +566,7 @@ ircd::gpt::model::init_h_ffnn_proj_weight(decoder &d,
|
||||||
{
|
{
|
||||||
size_t j(0);
|
size_t j(0);
|
||||||
for(const auto &elem : vec)
|
for(const auto &elem : vec)
|
||||||
d.layer[layer].ffnn.proj_weight[i][j++] = lex_cast<float>(elem);
|
d.layer[layer].ffnn.proj_weight[i].elem[j++] = lex_cast<float>(elem);
|
||||||
|
|
||||||
always_assert(j == sizeof(d.layer[layer].ffnn.proj_weight[i]) / sizeof(float));
|
always_assert(j == sizeof(d.layer[layer].ffnn.proj_weight[i]) / sizeof(float));
|
||||||
++i;
|
++i;
|
||||||
|
@ -513,7 +587,7 @@ ircd::gpt::model::init_h_ffnn_proj_bias(decoder &d,
|
||||||
{
|
{
|
||||||
size_t i(0);
|
size_t i(0);
|
||||||
for(const auto &elem : vec)
|
for(const auto &elem : vec)
|
||||||
d.layer[layer].ffnn.proj_bias[i++] = lex_cast<float>(elem);
|
d.layer[layer].ffnn.proj_bias.elem[i++] = lex_cast<float>(elem);
|
||||||
|
|
||||||
always_assert(i == sizeof(d.layer[layer].ffnn.proj_bias) / sizeof(float));
|
always_assert(i == sizeof(d.layer[layer].ffnn.proj_bias) / sizeof(float));
|
||||||
}
|
}
|
||||||
|
@ -526,9 +600,9 @@ ircd::gpt::model::init_h_ln_1_weight(decoder &d,
|
||||||
{
|
{
|
||||||
size_t i(0);
|
size_t i(0);
|
||||||
for(const auto &elem : vec)
|
for(const auto &elem : vec)
|
||||||
d.layer[layer].ln1.weight[i++] = lex_cast<float>(elem);
|
d.layer[layer].attn.norm.weight.elem[i++] = lex_cast<float>(elem);
|
||||||
|
|
||||||
always_assert(i == sizeof(d.layer[layer].ln1.weight) / sizeof(float));
|
always_assert(i == sizeof(d.layer[layer].attn.norm.weight) / sizeof(float));
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
@ -539,9 +613,9 @@ ircd::gpt::model::init_h_ln_1_bias(decoder &d,
|
||||||
{
|
{
|
||||||
size_t i(0);
|
size_t i(0);
|
||||||
for(const auto &elem : vec)
|
for(const auto &elem : vec)
|
||||||
d.layer[layer].ln1.bias[i++] = lex_cast<float>(elem);
|
d.layer[layer].attn.norm.bias.elem[i++] = lex_cast<float>(elem);
|
||||||
|
|
||||||
always_assert(i == sizeof(d.layer[layer].ln1.bias) / sizeof(float));
|
always_assert(i == sizeof(d.layer[layer].attn.norm.bias) / sizeof(float));
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
@ -552,9 +626,9 @@ ircd::gpt::model::init_h_ln_2_weight(decoder &d,
|
||||||
{
|
{
|
||||||
size_t i(0);
|
size_t i(0);
|
||||||
for(const auto &elem : vec)
|
for(const auto &elem : vec)
|
||||||
d.layer[layer].ln2.weight[i++] = lex_cast<float>(elem);
|
d.layer[layer].ffnn.norm.weight.elem[i++] = lex_cast<float>(elem);
|
||||||
|
|
||||||
always_assert(i == sizeof(d.layer[layer].ln2.weight) / sizeof(float));
|
always_assert(i == sizeof(d.layer[layer].ffnn.norm.weight) / sizeof(float));
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
@ -565,9 +639,9 @@ ircd::gpt::model::init_h_ln_2_bias(decoder &d,
|
||||||
{
|
{
|
||||||
size_t i(0);
|
size_t i(0);
|
||||||
for(const auto &elem : vec)
|
for(const auto &elem : vec)
|
||||||
d.layer[layer].ln2.bias[i++] = lex_cast<float>(elem);
|
d.layer[layer].ffnn.norm.bias.elem[i++] = lex_cast<float>(elem);
|
||||||
|
|
||||||
always_assert(i == sizeof(d.layer[layer].ln2.bias) / sizeof(float));
|
always_assert(i == sizeof(d.layer[layer].ffnn.norm.bias) / sizeof(float));
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
@ -581,16 +655,16 @@ ircd::gpt::model::init_h_attn_attn_weight(decoder &d,
|
||||||
{
|
{
|
||||||
size_t j(0);
|
size_t j(0);
|
||||||
for(const auto &elem : vec)
|
for(const auto &elem : vec)
|
||||||
d.layer[layer].attn.attn_weight[i][j++] = lex_cast<float>(elem);
|
d.layer[layer].attn.fcon_weight[i].fcon[j++] = lex_cast<float>(elem);
|
||||||
|
|
||||||
always_assert(j == sizeof(d.layer[layer].attn.attn_weight[i]) / sizeof(float));
|
always_assert(j == sizeof(d.layer[layer].attn.fcon_weight[i]) / sizeof(float));
|
||||||
++i;
|
++i;
|
||||||
}
|
}
|
||||||
|
|
||||||
always_assert
|
always_assert
|
||||||
(
|
(
|
||||||
i == sizeof(d.layer[layer].attn.attn_weight)
|
i == sizeof(d.layer[layer].attn.fcon_weight)
|
||||||
/ sizeof(d.layer[layer].attn.attn_weight[0])
|
/ sizeof(d.layer[layer].attn.fcon_weight[0])
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -602,9 +676,9 @@ ircd::gpt::model::init_h_attn_attn_bias(decoder &d,
|
||||||
{
|
{
|
||||||
size_t i(0);
|
size_t i(0);
|
||||||
for(const auto &elem : vec)
|
for(const auto &elem : vec)
|
||||||
d.layer[layer].attn.attn_bias[i++] = lex_cast<float>(elem);
|
d.layer[layer].attn.fcon_bias.fcon[i++] = lex_cast<float>(elem);
|
||||||
|
|
||||||
always_assert(i == sizeof(d.layer[layer].attn.attn_bias) / sizeof(float));
|
always_assert(i == sizeof(d.layer[layer].attn.fcon_bias) / sizeof(float));
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
@ -618,7 +692,7 @@ ircd::gpt::model::init_h_attn_proj_weight(decoder &d,
|
||||||
{
|
{
|
||||||
size_t j(0);
|
size_t j(0);
|
||||||
for(const auto &elem : vec)
|
for(const auto &elem : vec)
|
||||||
d.layer[layer].attn.proj_weight[i][j++] = lex_cast<float>(elem);
|
d.layer[layer].attn.proj_weight[i].elem[j++] = lex_cast<float>(elem);
|
||||||
|
|
||||||
always_assert(j == sizeof(d.layer[layer].attn.proj_weight[i]) / sizeof(float));
|
always_assert(j == sizeof(d.layer[layer].attn.proj_weight[i]) / sizeof(float));
|
||||||
++i;
|
++i;
|
||||||
|
@ -639,7 +713,7 @@ ircd::gpt::model::init_h_attn_proj_bias(decoder &d,
|
||||||
{
|
{
|
||||||
size_t i(0);
|
size_t i(0);
|
||||||
for(const auto &elem : vec)
|
for(const auto &elem : vec)
|
||||||
d.layer[layer].attn.proj_bias[i++] = lex_cast<float>(elem);
|
d.layer[layer].attn.proj_bias.elem[i++] = lex_cast<float>(elem);
|
||||||
|
|
||||||
always_assert(i == sizeof(d.layer[layer].attn.proj_bias) / sizeof(float));
|
always_assert(i == sizeof(d.layer[layer].attn.proj_bias) / sizeof(float));
|
||||||
}
|
}
|
||||||
|
|
1466
ircd/gpt_pipe.cc
1466
ircd/gpt_pipe.cc
File diff suppressed because it is too large
Load diff
|
@ -152,10 +152,12 @@ ircd::gpt::pipe::code::set_cache(const string_view &path)
|
||||||
}
|
}
|
||||||
|
|
||||||
extern const uint8_t
|
extern const uint8_t
|
||||||
gpt_gpu_r600_barts_bc[];
|
gpt_gpu_r600_barts_bc[],
|
||||||
|
gpt_gpu_spv[];
|
||||||
|
|
||||||
extern const uint
|
extern const uint
|
||||||
gpt_gpu_r600_barts_bc_len;
|
gpt_gpu_r600_barts_bc_len,
|
||||||
|
gpt_gpu_spv_len;
|
||||||
|
|
||||||
ircd::cl::code
|
ircd::cl::code
|
||||||
ircd::gpt::pipe::code::from_bitcode(const string_view &link_opts)
|
ircd::gpt::pipe::code::from_bitcode(const string_view &link_opts)
|
||||||
|
@ -164,6 +166,9 @@ ircd::gpt::pipe::code::from_bitcode(const string_view &link_opts)
|
||||||
{
|
{
|
||||||
reinterpret_cast<const char *>(gpt_gpu_r600_barts_bc),
|
reinterpret_cast<const char *>(gpt_gpu_r600_barts_bc),
|
||||||
gpt_gpu_r600_barts_bc_len
|
gpt_gpu_r600_barts_bc_len
|
||||||
|
|
||||||
|
//reinterpret_cast<const char *>(gpt_gpu_spv),
|
||||||
|
//gpt_gpu_spv_len
|
||||||
};
|
};
|
||||||
|
|
||||||
char pbuf[1][48];
|
char pbuf[1][48];
|
||||||
|
|
|
@ -10,6 +10,7 @@
|
||||||
|
|
||||||
namespace ircd::gpt::vocab
|
namespace ircd::gpt::vocab
|
||||||
{
|
{
|
||||||
|
static u8x16 get_token(const u16);
|
||||||
static u16 find_token(const u8x16);
|
static u16 find_token(const u8x16);
|
||||||
static u16 find_merge(const u8x16, const u8x16);
|
static u16 find_merge(const u8x16, const u8x16);
|
||||||
static u16 bpe_score(u16 (&)[16], const u8x16 (&)[16][2], const uint);
|
static u16 bpe_score(u16 (&)[16], const u8x16 (&)[16][2], const uint);
|
||||||
|
@ -23,11 +24,11 @@ namespace ircd::gpt::vocab
|
||||||
static u64x2 tokenize_block(u16x16 &, const u8x16, const i8x16) noexcept;
|
static u64x2 tokenize_block(u16x16 &, const u8x16, const i8x16) noexcept;
|
||||||
static void init_tokens(), init_merges();
|
static void init_tokens(), init_merges();
|
||||||
|
|
||||||
[[gnu::visibility("internal")]]
|
|
||||||
extern const char32_t charset[256];
|
extern const char32_t charset[256];
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Remapping of single byte characters (Control (C0) and Basic Latin (ASCII)).
|
/// Remapping of single byte characters (Control (C0) and Basic Latin (ASCII)).
|
||||||
|
[[gnu::visibility("internal")]]
|
||||||
decltype(ircd::gpt::vocab::charset)
|
decltype(ircd::gpt::vocab::charset)
|
||||||
ircd::gpt::vocab::charset
|
ircd::gpt::vocab::charset
|
||||||
alignas(64)
|
alignas(64)
|
||||||
|
@ -169,7 +170,8 @@ ircd::gpt::vocab::init_merges()
|
||||||
|
|
||||||
ircd::string_view
|
ircd::string_view
|
||||||
ircd::gpt::vocab::debug(const mutable_buffer &out,
|
ircd::gpt::vocab::debug(const mutable_buffer &out,
|
||||||
const u16 idx)
|
const u16 idx,
|
||||||
|
const uint mask)
|
||||||
{
|
{
|
||||||
const auto *const token
|
const auto *const token
|
||||||
{
|
{
|
||||||
|
@ -177,13 +179,21 @@ ircd::gpt::vocab::debug(const mutable_buffer &out,
|
||||||
};
|
};
|
||||||
|
|
||||||
thread_local char strbuf[2][512];
|
thread_local char strbuf[2][512];
|
||||||
return string_view{fmt::sprintf
|
return fmt::sprintf
|
||||||
{
|
{
|
||||||
out, "%5u [%32s] %s",
|
out, "%5u %s%32s%s%s%s",
|
||||||
idx,
|
idx,
|
||||||
simd::print_chr(strbuf[0], token[idx]),
|
mask & 0x1?
|
||||||
simd::print_mem(strbuf[1], token[idx]),
|
"[ "_sv: string_view{},
|
||||||
}};
|
mask & 0x1?
|
||||||
|
simd::print_chr(strbuf[0], token[idx]): string_view{},
|
||||||
|
mask & 0x1?
|
||||||
|
" ]"_sv: string_view{},
|
||||||
|
mask & 0x2?
|
||||||
|
" "_sv: string_view{},
|
||||||
|
mask & 0x2?
|
||||||
|
simd::print_mem(strbuf[1], token[idx]): string_view{},
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
|
@ -193,6 +203,7 @@ ircd::gpt::vocab::debug(const mutable_buffer &out,
|
||||||
ircd::string_view
|
ircd::string_view
|
||||||
ircd::gpt::vocab::detokenize(const mutable_buffer &out,
|
ircd::gpt::vocab::detokenize(const mutable_buffer &out,
|
||||||
const vector_view<const u16> &in)
|
const vector_view<const u16> &in)
|
||||||
|
noexcept
|
||||||
{
|
{
|
||||||
size_t off(0);
|
size_t off(0);
|
||||||
for(const u16 &id : in)
|
for(const u16 &id : in)
|
||||||
|
@ -228,9 +239,65 @@ ircd::gpt::vocab::detokenize(const mutable_buffer &out,
|
||||||
// tokenize
|
// tokenize
|
||||||
//
|
//
|
||||||
|
|
||||||
|
uint16_t
|
||||||
|
ircd::gpt::vocab::tokenize(const string_view &in)
|
||||||
|
{
|
||||||
|
char str_buf[16];
|
||||||
|
const string_view str
|
||||||
|
{
|
||||||
|
str_buf, copy(str_buf, in)
|
||||||
|
};
|
||||||
|
|
||||||
|
u16 buf[16];
|
||||||
|
const auto out
|
||||||
|
{
|
||||||
|
tokenize(buf, str)
|
||||||
|
};
|
||||||
|
|
||||||
|
if(unlikely(out.size() != 1))
|
||||||
|
throw error
|
||||||
|
{
|
||||||
|
"Input tokenizes to %zu tokens.",
|
||||||
|
out.size()
|
||||||
|
};
|
||||||
|
|
||||||
|
return buf[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
uint16_t
|
||||||
|
ircd::gpt::vocab::tokenize(const_buffer &in)
|
||||||
|
noexcept
|
||||||
|
{
|
||||||
|
char str_buf[16];
|
||||||
|
const string_view str
|
||||||
|
{
|
||||||
|
str_buf, copy(str_buf, in)
|
||||||
|
};
|
||||||
|
|
||||||
|
u16 buf[16];
|
||||||
|
const auto out
|
||||||
|
{
|
||||||
|
tokenize(buf, str)
|
||||||
|
};
|
||||||
|
|
||||||
|
const auto &tok
|
||||||
|
{
|
||||||
|
get_token(buf[0])
|
||||||
|
};
|
||||||
|
|
||||||
|
const auto consumed
|
||||||
|
{
|
||||||
|
simd::strlen(tok)
|
||||||
|
};
|
||||||
|
|
||||||
|
consume(in, consumed);
|
||||||
|
return buf[0];
|
||||||
|
}
|
||||||
|
|
||||||
ircd::vector_view<ircd::u16>
|
ircd::vector_view<ircd::u16>
|
||||||
ircd::gpt::vocab::tokenize(const vector_view<u16> &out,
|
ircd::gpt::vocab::tokenize(const vector_view<u16> &out,
|
||||||
const string_view &in)
|
const string_view &in)
|
||||||
|
noexcept
|
||||||
{
|
{
|
||||||
using input_t = u8x16;
|
using input_t = u8x16;
|
||||||
using block_t = u16x16;
|
using block_t = u16x16;
|
||||||
|
@ -801,13 +868,8 @@ ircd::gpt::vocab::bpe_score(u16 (&score)[16],
|
||||||
ircd::u16
|
ircd::u16
|
||||||
ircd::gpt::vocab::find_token(const u8x16 string)
|
ircd::gpt::vocab::find_token(const u8x16 string)
|
||||||
{
|
{
|
||||||
const auto *const __restrict__ token
|
|
||||||
{
|
|
||||||
reinterpret_cast<const u8x16 *>(vocab::token)
|
|
||||||
};
|
|
||||||
|
|
||||||
for(uint i(0); i < tokens; ++i)
|
for(uint i(0); i < tokens; ++i)
|
||||||
if(simd::streq(string, token[i]))
|
if(simd::streq(string, get_token(i)))
|
||||||
return i;
|
return i;
|
||||||
|
|
||||||
return u16(-1U);
|
return u16(-1U);
|
||||||
|
@ -835,3 +897,14 @@ ircd::gpt::vocab::find_merge(const u8x16 a,
|
||||||
|
|
||||||
return u16(-1U);
|
return u16(-1U);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ircd::u8x16
|
||||||
|
ircd::gpt::vocab::get_token(const u16 idx)
|
||||||
|
{
|
||||||
|
const auto *const __restrict__ token
|
||||||
|
{
|
||||||
|
reinterpret_cast<const u8x16 *>(vocab::token)
|
||||||
|
};
|
||||||
|
|
||||||
|
return token[idx];
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in a new issue