0
0
Fork 0
mirror of https://github.com/matrix-construct/construct synced 2024-12-26 07:23:53 +01:00

ircd::gpt: More Matrix Than Matrix.

This commit is contained in:
Jason Volk 2021-03-04 17:03:33 -08:00
parent 53c4260a21
commit 4458235dfa
5 changed files with 1085 additions and 0 deletions

View file

@ -16,9 +16,19 @@
/// Public interface of the GPT unit; the definitions are in ircd/gpt.cc.
namespace ircd::gpt
{
// NOTE(review): presumably declares the exception type gpt::error derived
// from ircd::error -- confirm against the IRCD_EXCEPTION macro definition.
IRCD_EXCEPTION(ircd::error, error)
// Runs the decoder over a buffer of embedded tokens (768 floats per token;
// the size must be a multiple of 768) and returns the index of the largest
// logit computed from the last token's state.
u16
generate(const vector_view<const f32> &) noexcept;
// Writes one 768-float embedding per input token into the first argument:
// the token's row of the token table plus the row of the position table for
// its index in the input. Returns a view of the floats that were written.
vector_view<f32>
embed(const vector_view<f32> &,
const vector_view<const u16> &) noexcept;
// Log facility of this unit; defined in gpt.cc with the name "gpt".
extern log::log log;
}
#include "vocab.h"
#include "model.h"
namespace ircd::gpt
{

68
include/ircd/gpt/model.h Normal file
View file

@ -0,0 +1,68 @@
// Tensor Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_MODEL_H
/// Parameter structures of the model. Each type is defined below in this
/// header; `decoder` is the top-level aggregate containing all the others.
namespace ircd::gpt::model
{
struct norm;
struct attn;
struct ffnn;
struct block;
struct decoder;
}
/// Attention aperture
///
/// Parameters of the attention part of one transformer block.
///  - attn_bias / attn_weight: projection of a 768-wide state into the 2304
///    values which gpt.cc splits into query, key and value (3 x 768).
///  - proj_bias / proj_weight: 768 x 768 projection applied to the merged
///    per-head attention output.
///  - bias: 1024 x 1024 table of booleans; gpt.cc keeps a score where the
///    entry is true and replaces it with a large negative value otherwise.
///
/// The enclosing decoder is read and written as raw bytes by model::init, so
/// member order, sizes and alignment are part of the stored format.
struct ircd::gpt::model::attn
{
float
attn_bias alignas(64) [2304],
attn_weight alignas(64) [768][2304],
proj_bias alignas(64) [768],
proj_weight alignas(64) [768][768];
bool bias alignas(64) [1024][1024];
};
/// Feed-forward neural network
///
/// Parameters of the two dense layers of one transformer block.
///  - fc_bias / fc_weight: expansion from 768 to 3072 values.
///  - proj_bias / proj_weight: projection from 3072 back to 768 values.
///
/// The enclosing decoder is read and written as raw bytes by model::init, so
/// member order, sizes and alignment are part of the stored format.
struct ircd::gpt::model::ffnn
{
float
fc_bias alignas(64) [3072],
fc_weight alignas(64) [768][3072],
proj_bias alignas(64) [768],
proj_weight alignas(64) [3072][768];
};
/// Layer normalization
///
/// Per-element bias and weight applied to a normalized 768-wide state;
/// gpt.cc computes `normalized * weight + bias`.
///
/// The enclosing decoder is read and written as raw bytes by model::init, so
/// member order, sizes and alignment are part of the stored format.
struct ircd::gpt::model::norm
{
float
bias alignas(64) [768],
weight alignas(64) [768];
};
/// Transformer block
///
/// One layer of the decoder: ln1 is applied before the attention projection
/// and ln2 before the feed-forward network (see ctrl() and ffnn() in gpt.cc).
///
/// The enclosing decoder is read and written as raw bytes by model::init, so
/// member order is part of the stored format.
struct ircd::gpt::model::block
{
norm ln1;
model::attn attn;
norm ln2;
model::ffnn ffnn;
};
/// Complete set of model parameters.
///  - wpe: table with one 768-wide row per input position (1024 positions).
///  - wte: table with one 768-wide row per token id (65536 rows); gpt.cc uses
///    it both to embed tokens and to compute the output logits.
///  - layer: the twelve transformer blocks, applied in order.
///  - f: layer normalization applied to the final state before the logits.
///
/// model::init reads and writes this object as raw bytes
/// (sizeof(model::decoder)), so the layout is the stored format.
struct ircd::gpt::model::decoder
{
float
wpe alignas(64) [1024][768],
wte alignas(64) [65536][768];
block layer[12];
norm f;
};

View file

@ -217,6 +217,8 @@ libircd_la_SOURCES += png.cc
if OPENCL
libircd_la_SOURCES += cl.cc
endif
libircd_la_SOURCES += gpt.cc
libircd_la_SOURCES += gpt_model.cc
libircd_la_SOURCES += gpt_vocab.cc
libircd_la_SOURCES += openssl.cc
libircd_la_SOURCES += rfc1459.cc
@ -254,6 +256,16 @@ ROCKSDB_SRC_CPPFLAGS =#
ROCKSDB_SRC_CPPFLAGS += -isystem $(top_srcdir)/deps/rocksdb/include
ROCKSDB_SRC_CPPFLAGS += -isystem $(top_srcdir)/deps/rocksdb
# Floating-point code generation options for the GPT unit. These are appended
# to AM_CXXFLAGS for gpt.lo only (see the unit option composition below).
GPT_FP_CXXFLAGS =#
GPT_FP_CXXFLAGS += -fno-math-errno
GPT_FP_CXXFLAGS += -fno-trapping-math
GPT_FP_CXXFLAGS += -ffinite-math-only
GPT_FP_CXXFLAGS += -fno-signed-zeros
GPT_FP_CXXFLAGS += -fdenormal-fp-math=positive-zero
GPT_FP_CXXFLAGS += -fassociative-math
GPT_FP_CXXFLAGS += -ffp-contract=fast
GPT_FP_CXXFLAGS += -freciprocal-math
#
# Specific unit option composition
#
@ -282,6 +294,7 @@ endif
if IOU
fs_iou.lo: AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
endif
gpt.lo: AM_CXXFLAGS := ${AM_CXXFLAGS} ${GPT_FP_CXXFLAGS}
http.lo: AM_CPPFLAGS := ${SPIRIT_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
http.lo: AM_CXXFLAGS := ${SPIRIT_UNIT_CXXFLAGS} ${AM_CXXFLAGS}
ios.lo: AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS}

510
ircd/gpt.cc Normal file
View file

@ -0,0 +1,510 @@
// Matrix Construct Is All You Need Is All You Need Is AllĊĊĊĊĊĊĊĊ
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
/// Log facility of this unit, constructed with the name "gpt".
decltype(ircd::gpt::log)
ircd::gpt::log
{
"gpt"
};
/// Internal declarations of this unit. All array extents are fixed: a state
/// is 768 floats wide, split into 12 heads of 64; at most 1024 positions.
namespace ircd::gpt
{
// Activation function, scalar and whole-vector forms.
static void gelu(float &, const float &);
static void gelu(float (&)[3072], const float (&)[3072]);
// Layer normalization: (out, in, bias, weight, epsilon).
static void norm(float (&)[768], const float (&)[768], const float (&)[768], const float (&)[768], const float);
// Dense layers: (out, in, bias, weight), one overload per shape.
static void fmma(float (&)[768], const float (&)[3072], const float (&)[768], const float (&)[3072][768]);
static void fmma(float (&)[3072], const float (&)[768], const float (&)[3072], const float (&)[768][3072]);
static void fmma(float (&)[2304], const float (&)[768], const float (&)[2304], const float (&)[768][2304]);
// Attention output projection over `num` tokens; adds its result to `out`.
static void fmma(float *, const float (&)[12][1024][64], const float (&)[768], const float (&)[768][768], const size_t);
// Attention stages, in the order transform() calls them after ctrl():
// pare (scores), mask, smax (softmax), vals (weighted sum of values).
static void vals(float (&)[12][1024][64], const float (&)[12][1024][1024], const float (&)[3][1024][12][64], const size_t);
static void pare(float (&)[12][1024][1024], const float (&)[3][1024][12][64], const size_t);
static void mask(float (&)[12][1024][1024], const float (&)[12][1024][1024], const bool (&)[1024][1024], const size_t);
static void smax(float (&)[12][1024][1024], const float (&)[12][1024][1024], const size_t);
// Computes the query/key/value tensors for every token of the input.
static void ctrl(float (&)[3][1024][12][64], const float *const, const size_t, const model::block &);
// Feed-forward part of one block for a single token state.
static void ffnn(float (&)[768], const float (&)[768], const model::block &);
// Runs all twelve blocks over the token states in place.
static void transform(float *, const size_t, const model::decoder &);
// Output stage: final normalization and logits over the vocabulary.
static void logitsmax(float *, const float *);
static void logits(float *, const float (&)[768], const model::decoder &);
static void tail(float *, const float (&)[768], const model::decoder &);
static u16 argmax(const float *);
// The model parameters. Allocated value-initialized during static
// initialization; the contents are filled in by model::init (gpt_model.cc).
std::unique_ptr<model::decoder> device
{
new model::decoder{}
};
// Working buffers shared by generate(): `logit` receives one score per
// vocabulary entry, `scratch` holds up to 1024 token states of 768 floats.
static f32
logit alignas(64) [65536],
scratch alignas(64) [1024 * 768];
}
/// Fixed hyperparameters of the model. The *_epsilon values are the epsilons
/// handed to norm() for the first, second and final layer normalization. The
/// *_pdrop values are not referenced by the code of this unit (presumably
/// dropout probabilities kept for reference -- confirm before relying on it).
namespace ircd::gpt::model
{
	constexpr float embed_pdrop { 0.1 };
	constexpr float ln1_epsilon { 0.00001 };
	constexpr float ln2_epsilon { 0.00001 };
	constexpr float lnf_epsilon { 0.00001 };
	constexpr float attn_pdrop { 0.1 };
	constexpr float resid_pdrop { 0.1 };
}
/// Embed a sequence of token ids.
///
/// For the token at index i of `in`, writes 768 floats to `out` starting at
/// offset i * 768: the token's row of the token table (wte) plus row i of the
/// position table (wpe).
///
/// The number of tokens embedded is limited to what can be done safely: the
/// number of rows in the position table and the number of whole 768-float
/// slots available in `out`. Surplus input tokens are not embedded.
///
/// @param out destination buffer
/// @param in token ids
/// @returns a view of the leading portion of `out` that was written; its
/// size divided by 768 is the number of tokens embedded.
ircd::vector_view<ircd::f32>
ircd::gpt::embed(const vector_view<f32> &out,
                 const vector_view<const u16> &in)
noexcept
{
	assert(device);

	// Rows available in the position table.
	const size_t positions
	{
		sizeof(device->wpe) / sizeof(device->wpe[0])
	};

	// Whole embeddings which fit into the destination.
	const size_t capacity
	{
		out.size() / 768
	};

	// Never index past the position table or the end of the destination.
	size_t count(in.size());
	if(count > positions)
		count = positions;

	if(count > capacity)
		count = capacity;

	for(size_t i(0); i < count; ++i)
	{
		const auto &wpe
		{
			device->wpe[i]
		};

		// A u16 token id always selects a valid row of the 65536-row table.
		const auto &wte
		{
			device->wte[in[i]]
		};

		for(size_t j(0); j < 768; ++j)
			out[i * 768 + j] = wte[j] + wpe[j];
	}

	return vector_view<f32>
	{
		data(out), count * 768
	};
}
uint16_t
ircd::gpt::generate(const vector_view<const f32> &in)
noexcept
{
always_assert(in.size() % 768 == 0);
const auto toks
{
in.size() / 768
};
const vector_view<f32> scratch
{
gpt::scratch, in.size()
};
for(uint i(0); i < in.size(); ++i)
scratch[i] = in[i];
transform(data(scratch), toks, *device);
static float
buf alignas(64) [768];
for(uint i(0); i < 768; ++i)
buf[i] = scratch[(toks - 1) * 768 + i];
tail(logit, buf, *device);
return argmax(logit);
}
uint16_t
ircd::gpt::argmax(const float *const __restrict__ logit)
{
u16 ret(0);
for(uint j(0); j < vocab::tokens; ++j)
if(logit[j] > logit[ret])
ret = j;
return ret;
}
/// Output stage: applies the final layer normalization (decoder member `f`)
/// to `state` and computes the logits from the result into `logit`.
/// The softmax over the logits is currently disabled.
[[gnu::noinline]]
void
ircd::gpt::tail(float *const __restrict__ logit,
                const float (&__restrict__ state)[768],
                const model::decoder &d)
{
	// Normalized copy of the state; the input itself is left untouched.
	static float
	normed alignas(64) [768];

	const auto &lnf
	{
		d.f
	};

	norm(normed, state, lnf.bias, lnf.weight, model::lnf_epsilon);
	logits(logit, normed, d);
	//logitsmax(logit, logit);
}
/// Compute one score per vocabulary entry: the dot product of the 768-wide
/// input with that entry's row of the token table (wte). Writes
/// vocab::tokens values to `out`.
void
ircd::gpt::logits(float *const __restrict__ out,
                  const float (&__restrict__ in)[768],
                  const model::decoder &d)
{
	for(uint tok(0); tok < vocab::tokens; ++tok)
	{
		const auto &row
		{
			d.wte[tok]
		};

		float acc(0);
		for(uint k(0); k < 768; ++k)
			acc += in[k] * row[k];

		out[tok] = acc;
	}
}
/// Softmax over the first vocab::tokens entries of `in`, written to `out`:
/// out[j] = exp(in[j]) / sum of exp(in[k]) over all k.
///
/// `out` and `in` may refer to the same memory; every input is read into a
/// staging buffer before the first output is written.
///
/// The sum of the exponentials is the same for every output, so it is
/// accumulated once rather than once per output element.
void
ircd::gpt::logitsmax(float *const out,
                     const float *const in)
{
	static float
	exps alignas(64) [65536];

	float sum(0);
	for(uint j(0); j < vocab::tokens; ++j)
	{
		exps[j] = exp(in[j]);
		sum += exps[j];
	}

	for(uint j(0); j < vocab::tokens; ++j)
		out[j] = exps[j] / sum;
}
/// Run every transformer block of the decoder over the token states.
///
/// @param accum `tokens` states of 768 floats each, updated in place.
/// @param tokens number of states in `accum`; the working buffers below are
/// sized for at most 1024.
/// @param decoder model parameters; its blocks are applied in order.
[[gnu::noinline]]
void
ircd::gpt::transform(float *__restrict__ accum,
                     const size_t tokens,
                     const model::decoder &decoder)
{
	static float
	qkv alignas(64) [3][1024][12][64],
	state alignas(64) [12][1024][1024],
	attns alignas(64) [12][1024][64],
	buf alignas(64) [768];

	for(const auto &layer : decoder.layer)
	{
		// Attention: query/key/value, scores, masking, softmax, weighted
		// values; the projection of the result is added onto accum.
		ctrl(qkv, accum, tokens, layer);
		pare(state, qkv, tokens);
		mask(state, state, layer.attn.bias, tokens);
		smax(state, state, tokens);
		vals(attns, state, qkv, tokens);
		fmma(accum, attns, layer.attn.proj_bias, layer.attn.proj_weight, tokens);

		// Feed-forward: computed per token from a copy of its state and
		// added back onto that state.
		for(uint j(0); j < tokens; ++j)
		{
			float *const row
			{
				accum + j * 768
			};

			for(uint k(0); k < 768; ++k)
				buf[k] = row[k];

			ffnn(buf, buf, layer);

			for(uint k(0); k < 768; ++k)
				row[k] += buf[k];
		}
	}
}
/// Feed-forward part of one transformer block for a single token state:
/// layer normalization (ln2), expansion to 3072 values, activation, and
/// projection back to 768 values.
///
/// `out` and `in` may be the same array: transform() passes one buffer for
/// both. For that reason neither is qualified __restrict__, which would
/// promise the compiler that they never overlap.
///
/// @param out receives the result
/// @param in token state
/// @param layer parameters of the block
void
ircd::gpt::ffnn(float (&out)[768],
                const float (&in)[768],
                const model::block &layer)
{
	static float
	proj alignas(64) [3072];

	norm(out, in, layer.ln2.bias, layer.ln2.weight, model::ln2_epsilon);
	fmma(proj, out, layer.ffnn.fc_bias, layer.ffnn.fc_weight);
	gelu(proj, proj);
	fmma(out, proj, layer.ffnn.proj_bias, layer.ffnn.proj_weight);
}
/// Compute the query, key and value tensors for `num` tokens.
///
/// Each token state is normalized with ln1 and projected to 2304 values. The
/// three consecutive 768-wide thirds of the projection are the token's
/// query, key and value; each third is split into 12 heads of 64 values.
///
/// @param out out[0] receives the queries, out[1] the keys, out[2] the
/// values, each indexed [token][head][element].
/// @param in `num` token states of 768 floats each
/// @param num number of tokens
/// @param layer parameters of the block
void
ircd::gpt::ctrl(float (&__restrict__ out)[3][1024][12][64],
                const float *const __restrict__ in,
                const size_t num,
                const model::block &layer)
{
	float
	(&__restrict__ qry)[1024][12][64] { out[0] },
	(&__restrict__ key)[1024][12][64] { out[1] },
	(&__restrict__ val)[1024][12][64] { out[2] };

	for(uint i(0); i < num; ++i)
	{
		static float
		buf alignas(64) [768],
		proj alignas(64) [2304];

		for(uint j(0); j < 768; ++j)
			buf[j] = in[i * 768 + j];

		norm(buf, buf, layer.ln1.bias, layer.ln1.weight, model::ln1_epsilon);
		fmma(proj, buf, layer.attn.attn_bias, layer.attn.attn_weight);

		#pragma clang loop unroll (disable)
		for(uint j(0); j < 12; ++j)
			for(uint k(0); k < 64; ++k)
			{
				// Offset of this head's element within one third.
				const uint off
				{
					j * 64 + k
				};

				qry[i][j][k] = proj[768 * 0 + off];
				key[i][j][k] = proj[768 * 1 + off];
				val[i][j][k] = proj[768 * 2 + off];
			}
	}
}
/// Compute the attention scores: for every head and every pair of tokens
/// (k, l), the dot product of token k's query with token l's key over the
/// 64 elements of the head, divided by 8 (the square root of 64).
///
/// Each score is accumulated locally and stored once, instead of zeroing,
/// accumulating and scaling the output tensor in three separate passes.
///
/// @param out scores indexed [head][query token][key token]; only the
/// leading num x num part of each head is written.
/// @param qkv qkv[0] holds the queries and qkv[1] the keys, each indexed
/// [token][head][element]; the values in qkv[2] are not used here.
/// @param num number of tokens
void
ircd::gpt::pare(float (&__restrict__ out)[12][1024][1024],
                const float (&__restrict__ qkv)[3][1024][12][64],
                const size_t num)
{
	const float
	(&__restrict__ qry)[1024][12][64] { qkv[0] },
	(&__restrict__ key)[1024][12][64] { qkv[1] };

	#pragma clang loop unroll (disable)
	for(uint j(0); j < 12; ++j)
		for(uint k(0); k < num; ++k)
			for(uint l(0); l < num; ++l)
			{
				float dot(0);
				for(uint m(0); m < 64; ++m)
					dot += qry[k][j][m] * key[l][j][m];

				out[j][k][l] = dot / 8.0;
			}
}
void
ircd::gpt::vals(float (&__restrict__ out)[12][1024][64],
const float (&__restrict__ in)[12][1024][1024],
const float (&__restrict__ qkv)[3][1024][12][64],
const size_t num)
{
const float
(&__restrict__ val)[1024][12][64] { qkv[2] };
#pragma clang loop unroll (disable)
for(uint j(0); j < 12; ++j)
for(uint k(0); k < num; ++k)
for(uint l(0); l < 64; ++l)
out[j][k][l] = 0;
#pragma clang loop unroll (disable)
for(uint j(0); j < 12; ++j)
for(uint k(0); k < num; ++k)
for(uint l(0); l < num; ++l)
for(uint m(0); m < 64; ++m)
out[j][k][m] += in[j][k][l] * val[l][j][m];
}
/// Row-wise softmax of the attention scores: for every head j and token k,
/// out[j][k][l] = exp(in[j][k][l]) / sum of exp(in[j][k][m]) over m < num.
///
/// `out` and `in` may be the same array: transform() passes one tensor for
/// both. For that reason neither is qualified __restrict__, and every element
/// is read before it is overwritten.
///
/// The sum of a row is accumulated once per row rather than once per element
/// of the row, and the exponentials are staged in `out` itself so no separate
/// full-size buffer is required.
///
/// @param out receives the result; only the leading num x num part of each
/// head is written.
/// @param in scores indexed [head][token][token]
/// @param num number of tokens
void
ircd::gpt::smax(float (&out)[12][1024][1024],
                const float (&in)[12][1024][1024],
                const size_t num)
{
	#pragma clang loop unroll (disable)
	for(uint j(0); j < 12; ++j)
		for(uint k(0); k < num; ++k)
		{
			float sum(0);
			for(uint m(0); m < num; ++m)
			{
				const float e = exp(in[j][k][m]);
				out[j][k][m] = e;
				sum += e;
			}

			for(uint l(0); l < num; ++l)
				out[j][k][l] = out[j][k][l] / sum;
		}
}
/// Apply the attention mask: a score is kept where the corresponding entry
/// of `bias` is true and replaced by -10000 where it is false.
///
/// `out` and `in` may be the same array: transform() passes one tensor for
/// both. For that reason neither is qualified __restrict__, which would
/// promise the compiler that they never overlap.
///
/// @param out receives the result; only the leading num x num part of each
/// head is written.
/// @param in scores indexed [head][token k][token l]
/// @param bias mask indexed [token k][token l]
/// @param num number of tokens
void
ircd::gpt::mask(float (&out)[12][1024][1024],
                const float (&in)[12][1024][1024],
                const bool (&__restrict__ bias)[1024][1024],
                const size_t num)
{
	constexpr float masked
	{
		-10000.0
	};

	#pragma clang loop unroll (disable)
	for(uint j(0); j < 12; ++j)
		for(uint k(0); k < num; ++k)
			for(uint l(0); l < num; ++l)
				out[j][k][l] = bias[k][l]? in[j][k][l]: masked;
}
void
ircd::gpt::norm(float (&__restrict__ out)[768],
const float (&__restrict__ in)[768],
const float (&__restrict__ bias)[768],
const float (&__restrict__ weight)[768],
const float epsilon)
{
static float
tmp alignas(64) [768];
const float mean
{
math::mean<float>(in)
};
for(uint j(0); j < 768; ++j)
tmp[j] = pow(in[j] - mean, 2);
const float s
{
math::mean<float>(tmp)
};
for(uint j(0); j < 768; ++j)
out[j] = (in[j] - mean) / sqrt(s + epsilon),
out[j] = out[j] * weight[j] + bias[j];
}
void
ircd::gpt::fmma(float *const __restrict__ out,
const float (&__restrict__ attn)[12][1024][64],
const float (&__restrict__ bias)[768],
const float (&__restrict__ weight)[768][768],
const size_t num)
{
static float
a alignas(64) [1024][768],
b alignas(64) [1024][768];
for(uint k(0); k < 12; k++)
for(uint j(0); j < num; j++)
for(uint l(0); l < 64; l++)
a[j][k * 64 + l] = attn[k][j][l];
for(uint i(0); i < num; i++)
for(uint j(0); j < 768; j++)
b[i][j] = bias[j];
for(uint i(0); i < num; i++)
for(uint k(0); k < 768; k++)
for(uint j(0); j < 768; j++)
b[i][k] += a[i][j] * weight[j][k];
for(uint i(0); i < num; i++)
for(uint j(0); j < 768; j++)
out[i * 768 + j] += b[i][j];
}
void
ircd::gpt::fmma(float (&__restrict__ out)[2304],
const float (&__restrict__ in)[768],
const float (&__restrict__ bias)[2304],
const float (&__restrict__ weight)[768][2304])
{
for(uint j(0); j < 2304; ++j)
out[j] = bias[j];
for(uint k(0); k < 768; ++k)
for(uint j(0); j < 2304; ++j)
out[j] += in[k] * weight[k][j];
}
void
ircd::gpt::fmma(float (&__restrict__ out)[768],
const float (&__restrict__ in)[3072],
const float (&__restrict__ bias)[768],
const float (&__restrict__ weight)[3072][768])
{
for(uint j(0); j < 768; ++j)
out[j] = bias[j];
for(uint k(0); k < 3072; k++)
for(uint j(0); j < 768; j++)
out[j] += in[k] * weight[k][j];
}
void
ircd::gpt::fmma(float (&__restrict__ out)[3072],
const float (&__restrict__ in)[768],
const float (&__restrict__ bias)[3072],
const float (&__restrict__ weight)[768][3072])
{
for(uint j(0); j < 3072; ++j)
out[j] = bias[j];
for(uint k(0); k < 768; ++k)
for(uint j(0); j < 3072; ++j)
out[j] += in[k] * weight[k][j];
}
/// Apply the scalar gelu() to each of the 3072 elements of `in`.
///
/// `out` and `in` may be the same array: ffnn() activates its buffer in
/// place. For that reason neither is qualified __restrict__, which would
/// promise the compiler that they never overlap.
void
ircd::gpt::gelu(float (&out)[3072],
                const float (&in)[3072])
{
	for(uint j(0); j < 3072; ++j)
		gelu(out[j], in[j]);
}
/// Activation function for a single value:
/// out = 0.5 * x * (1 + tanh(0.7978845608 * x * (1 + 0.044715 * x * x))).
/// `out` and `in` may refer to the same float; the input is read first.
void
ircd::gpt::gelu(float &out,
                const float &in)
{
	// The literals are doubles, so the whole expression is evaluated in
	// double and only converted back to float on assignment.
	const double x(in);
	const double poly(1.0 + 0.044715 * x * x);
	const double arg(x * 0.7978845608 * poly);
	out = 0.5 * x * (1.0 + tanh(arg));
}

484
ircd/gpt_model.cc Normal file
View file

@ -0,0 +1,484 @@
// Tensor Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
/// Model loading. Each init_* function below fills one member of the
/// decoder from a JSON array; they share one signature so they can be stored
/// in the manifest tables: (decoder to fill, name of the file being loaded,
/// layer number, parsed JSON content).
namespace ircd::gpt::model
{
static void
init_f_weight(decoder &, const string_view &, const size_t &, const json::array &),
init_f_bias(decoder &, const string_view &, const size_t &, const json::array &),
init_wpe_weight(decoder &, const string_view &, const size_t &, const json::array &),
init_wte_weight(decoder &, const string_view &, const size_t &, const json::array &),
init_h_ffnn_fc_weight(decoder &, const string_view &, const size_t &, const json::array &),
init_h_ffnn_fc_bias(decoder &, const string_view &, const size_t &, const json::array &),
init_h_ffnn_proj_weight(decoder &, const string_view &, const size_t &, const json::array &),
init_h_ffnn_proj_bias(decoder &, const string_view &, const size_t &, const json::array &),
init_h_ln_1_weight(decoder &, const string_view &, const size_t &, const json::array &),
init_h_ln_1_bias(decoder &, const string_view &, const size_t &, const json::array &),
init_h_ln_2_weight(decoder &, const string_view &, const size_t &, const json::array &),
init_h_ln_2_bias(decoder &, const string_view &, const size_t &, const json::array &),
init_h_attn_attn_weight(decoder &, const string_view &, const size_t &, const json::array &),
init_h_attn_attn_bias(decoder &, const string_view &, const size_t &, const json::array &),
init_h_attn_proj_weight(decoder &, const string_view &, const size_t &, const json::array &),
init_h_attn_proj_bias(decoder &, const string_view &, const size_t &, const json::array &),
init_h_attn_bias(decoder &, const string_view &, const size_t &, const json::array &),
// Entry point; passed to the `path` configuration item below.
init() noexcept;
// Directory containing the model's JSON files.
extern conf::item<std::string> path;
// Tables pairing a file name (or file name format) with the function which
// loads that file: `manifest` for the files which exist once, `manifest_h`
// for the files which exist once per layer, `manifest_td` for data files.
extern const std::pair
<
string_view,
void (*)(decoder &, const string_view &, const size_t &, const json::array &)
>
manifest[],
manifest_h[],
manifest_td[];
}
/// Per-layer files. The name is a format string; model::init substitutes the
/// layer number for %u. The table has 13 entries and init iterates over
/// exactly that many, so the count there must follow any change made here.
decltype(ircd::gpt::model::manifest_h)
ircd::gpt::model::manifest_h
{
{ "h.%u.mlp.c_fc.weight.json", init_h_ffnn_fc_weight, },
{ "h.%u.mlp.c_fc.bias.json", init_h_ffnn_fc_bias, },
{ "h.%u.mlp.c_proj.weight.json", init_h_ffnn_proj_weight, },
{ "h.%u.mlp.c_proj.bias.json", init_h_ffnn_proj_bias, },
{ "h.%u.ln_1.weight.json", init_h_ln_1_weight, },
{ "h.%u.ln_1.bias.json", init_h_ln_1_bias, },
{ "h.%u.ln_2.weight.json", init_h_ln_2_weight, },
{ "h.%u.ln_2.bias.json", init_h_ln_2_bias, },
{ "h.%u.attn.c_attn.weight.json", init_h_attn_attn_weight, },
{ "h.%u.attn.c_attn.bias.json", init_h_attn_attn_bias, },
{ "h.%u.attn.c_proj.weight.json", init_h_attn_proj_weight, },
{ "h.%u.attn.c_proj.bias.json", init_h_attn_proj_bias },
{ "h.%u.attn.bias.json", init_h_attn_bias, },
};
/// Files which exist once for the whole model. model::init loads the entries
/// by their index 0 through 3, so the indices used there must follow any
/// change made here.
decltype(ircd::gpt::model::manifest)
ircd::gpt::model::manifest
{
{ "ln_f.weight.json", init_f_weight, },
{ "ln_f.bias.json", init_f_bias, },
{ "wpe.weight.json", init_wpe_weight },
{ "wte.weight.json", init_wte_weight },
};
/// Data file names without a loader function (the handlers are null).
/// This table is not referenced by the code of this unit.
// NOTE(review): presumably the test/validation/training datasets reserved
// for later use -- confirm.
decltype(ircd::gpt::model::manifest_td)
ircd::gpt::model::manifest_td
{
{ "test.jsonl", nullptr, },
{ "valid.jsonl", nullptr, },
{ "train.jsonl", nullptr, },
};
/// Configuration item "ircd.gpt.model.path": the directory which contains
/// the model's JSON files. The default is empty, in which case init()
/// returns without loading anything.
// NOTE(review): init is handed to the item's constructor; presumably it is
// invoked as a callback when the item is set -- confirm against conf::item.
decltype(ircd::gpt::model::path)
ircd::gpt::model::path
{
{
{ "name", "ircd.gpt.model.path" },
{ "default", string_view{} },
},
init
};
//TODO: XXX
// Declaration of the decoder instance which is defined in gpt.cc. The
// definition there is a non-const std::unique_ptr, and all declarations of
// one variable must agree on its type, so this one is non-const as well.
namespace ircd::gpt
{
	extern std::unique_ptr<model::decoder> device;
}
/// Load the model parameters into the decoder instance (gpt::device).
///
/// Does nothing when the `path` configuration item is empty. Otherwise, if a
/// file named "model" exists, its bytes are read directly into the decoder
/// object. If it does not exist, the decoder is zeroed, every JSON file named
/// by `manifest` and (for each of the 12 layers) by `manifest_h` is parsed
/// into it, and the decoder object is then written out as the file "model".
/// A debug message reports the time taken and the bytes read and written.
void
ircd::gpt::model::init()
noexcept
{
if(!model::path)
return;
const size_t layers
{
12
};
// Loads one file: `a` is a manifest table, `b` the index of the entry to
// load and `i` the layer number, which is also substituted into the entry's
// file name format. The file is mapped, viewed as a JSON array and passed
// to the entry's loader function.
const auto handle{[]
(const auto &a, const auto &b, const auto &i)
{
const auto &[fmt, handler]
{
a[b]
};
char namebuf[128] {0};
const string_view path_part[2]
{
model::path, fmt::sprintf
{
namebuf, fmt, i
}
};
const fs::fd fd
{
fs::path(fs::path_scratch, path_part)
};
fs::map::opts map_opts;
const fs::map map
{
fd, map_opts
};
const json::array mat
{
map
};
assert(gpt::device);
handler(*gpt::device, path_part[1], i, mat);
log::logf
{
log, log::level::DEBUG,
"Model init [%2d][%2d] :%s",
i,
b,
path_part[1],
};
}};
ircd::timer sw;
size_t read(0), wrote(0);
// NOTE(review): "model" is a relative name, presumably resolved against the
// working directory of the process -- confirm.
if(fs::exists("model"))
{
// NOTE(review): the number of bytes read is not compared with
// sizeof(model::decoder); a shorter file would leave the remainder of the
// decoder as it was.
const auto _read
{
fs::read(fs::fd{"model"}, mutable_buffer{(char *)(gpt::device.get()), sizeof(model::decoder)})
};
read = size(_read);
} else {
memset(device.get(), 0x0, sizeof(model::decoder));
// The four entries of `manifest`; the layer argument is not meaningful
// for these and is passed as zero.
handle(manifest, 0, 0);
handle(manifest, 1, 0);
handle(manifest, 2, 0);
handle(manifest, 3, 0);
// The 13 entries of `manifest_h`, once for each layer.
for(size_t i(0); i < layers; ++i)
for(size_t j(0); j < 13; ++j)
handle(manifest_h, j, i);
const auto _wrote
{
fs::write("model", const_buffer{(const char *)(gpt::device.get()), sizeof(model::decoder)})
};
wrote = size(_wrote);
}
char pbuf[3][48];
log::logf
{
log, log::level::DEBUG,
"Model init completed in %s read %s wrote %s",
sw.pretty(pbuf[0]),
pretty(pbuf[1], iec(size(read))),
pretty(pbuf[2], iec(size(wrote))),
};
}
/// Fill the position table (decoder::wpe) from a JSON array of rows.
///
/// Every row must contain exactly as many numbers as a row of the table.
/// The number of rows is not required to fill the table, but it may not
/// exceed it. Both limits are checked before each store so surplus input is
/// caught by an assertion rather than written past the end of the table.
///
/// @param d decoder to fill
/// @param name name of the file being loaded (unused)
/// @param layer unused
/// @param mat array of arrays of numbers
void
ircd::gpt::model::init_wpe_weight(decoder &d,
                                  const string_view &name,
                                  const size_t &layer,
                                  const json::array &mat)
{
	auto &dst
	{
		d.wpe
	};

	const size_t rows
	{
		sizeof(dst) / sizeof(dst[0])
	};

	const size_t cols
	{
		sizeof(dst[0]) / sizeof(float)
	};

	size_t i(0);
	for(const json::array vec : mat)
	{
		always_assert(i < rows);

		size_t j(0);
		for(const auto &elem : vec)
		{
			always_assert(j < cols);
			dst[i][j++] = lex_cast<float>(elem);
		}

		always_assert(j == cols);
		++i;
	}
}
/// Fill the token table (decoder::wte) from a JSON array of rows.
///
/// Every row must contain exactly as many numbers as a row of the table.
/// The number of rows is not required to fill the table, but it may not
/// exceed it. Both limits are checked before each store so surplus input is
/// caught by an assertion rather than written past the end of the table.
///
/// @param d decoder to fill
/// @param name name of the file being loaded (unused)
/// @param layer unused
/// @param mat array of arrays of numbers
void
ircd::gpt::model::init_wte_weight(decoder &d,
                                  const string_view &name,
                                  const size_t &layer,
                                  const json::array &mat)
{
	auto &dst
	{
		d.wte
	};

	const size_t rows
	{
		sizeof(dst) / sizeof(dst[0])
	};

	const size_t cols
	{
		sizeof(dst[0]) / sizeof(float)
	};

	size_t i(0);
	for(const json::array vec : mat)
	{
		always_assert(i < rows);

		size_t j(0);
		for(const auto &elem : vec)
		{
			always_assert(j < cols);
			dst[i][j++] = lex_cast<float>(elem);
		}

		always_assert(j == cols);
		++i;
	}
}
/// Fill the weight of the final layer normalization (decoder::f) from a
/// JSON array of numbers. The array must contain exactly as many numbers as
/// the destination holds; the limit is checked before each store so surplus
/// input is caught by an assertion rather than written past the end.
///
/// @param d decoder to fill
/// @param name name of the file being loaded (unused)
/// @param layer unused
/// @param vec array of numbers
void
ircd::gpt::model::init_f_weight(decoder &d,
                                const string_view &name,
                                const size_t &layer,
                                const json::array &vec)
{
	auto &dst
	{
		d.f.weight
	};

	const size_t len
	{
		sizeof(dst) / sizeof(float)
	};

	size_t i(0);
	for(const auto &elem : vec)
	{
		always_assert(i < len);
		dst[i++] = lex_cast<float>(elem);
	}

	always_assert(i == len);
}
/// Fill the bias of the final layer normalization (decoder::f) from a JSON
/// array of numbers. The array must contain exactly as many numbers as the
/// destination holds; the limit is checked before each store so surplus
/// input is caught by an assertion rather than written past the end.
///
/// @param d decoder to fill
/// @param name name of the file being loaded (unused)
/// @param layer unused
/// @param vec array of numbers
void
ircd::gpt::model::init_f_bias(decoder &d,
                              const string_view &name,
                              const size_t &layer,
                              const json::array &vec)
{
	auto &dst
	{
		d.f.bias
	};

	const size_t len
	{
		sizeof(dst) / sizeof(float)
	};

	size_t i(0);
	for(const auto &elem : vec)
	{
		always_assert(i < len);
		dst[i++] = lex_cast<float>(elem);
	}

	always_assert(i == len);
}
/// Fill the expansion weight matrix of one layer's feed-forward network
/// (ffnn.fc_weight) from a JSON array of rows.
///
/// The input must have exactly the shape of the destination. Row and column
/// limits are checked before each store so surplus input is caught by an
/// assertion rather than written past the end of the matrix.
///
/// @param d decoder to fill
/// @param name name of the file being loaded (unused)
/// @param layer index of the layer to fill
/// @param mat array of arrays of numbers
void
ircd::gpt::model::init_h_ffnn_fc_weight(decoder &d,
                                        const string_view &name,
                                        const size_t &layer,
                                        const json::array &mat)
{
	auto &dst
	{
		d.layer[layer].ffnn.fc_weight
	};

	const size_t rows
	{
		sizeof(dst) / sizeof(dst[0])
	};

	const size_t cols
	{
		sizeof(dst[0]) / sizeof(float)
	};

	size_t i(0);
	for(const json::array vec : mat)
	{
		always_assert(i < rows);

		size_t j(0);
		for(const auto &elem : vec)
		{
			always_assert(j < cols);
			dst[i][j++] = lex_cast<float>(elem);
		}

		always_assert(j == cols);
		++i;
	}

	always_assert(i == rows);
}
/// Fill the expansion bias of one layer's feed-forward network
/// (ffnn.fc_bias) from a JSON array of numbers. The array must contain
/// exactly as many numbers as the destination holds; the limit is checked
/// before each store so surplus input is caught by an assertion rather than
/// written past the end.
///
/// @param d decoder to fill
/// @param name name of the file being loaded (unused)
/// @param layer index of the layer to fill
/// @param vec array of numbers
void
ircd::gpt::model::init_h_ffnn_fc_bias(decoder &d,
                                      const string_view &name,
                                      const size_t &layer,
                                      const json::array &vec)
{
	auto &dst
	{
		d.layer[layer].ffnn.fc_bias
	};

	const size_t len
	{
		sizeof(dst) / sizeof(float)
	};

	size_t i(0);
	for(const auto &elem : vec)
	{
		always_assert(i < len);
		dst[i++] = lex_cast<float>(elem);
	}

	always_assert(i == len);
}
/// Fill the projection weight matrix of one layer's feed-forward network
/// (ffnn.proj_weight) from a JSON array of rows.
///
/// The input must have exactly the shape of the destination. Row and column
/// limits are checked before each store so surplus input is caught by an
/// assertion rather than written past the end of the matrix.
///
/// @param d decoder to fill
/// @param name name of the file being loaded (unused)
/// @param layer index of the layer to fill
/// @param mat array of arrays of numbers
void
ircd::gpt::model::init_h_ffnn_proj_weight(decoder &d,
                                          const string_view &name,
                                          const size_t &layer,
                                          const json::array &mat)
{
	auto &dst
	{
		d.layer[layer].ffnn.proj_weight
	};

	const size_t rows
	{
		sizeof(dst) / sizeof(dst[0])
	};

	const size_t cols
	{
		sizeof(dst[0]) / sizeof(float)
	};

	size_t i(0);
	for(const json::array vec : mat)
	{
		always_assert(i < rows);

		size_t j(0);
		for(const auto &elem : vec)
		{
			always_assert(j < cols);
			dst[i][j++] = lex_cast<float>(elem);
		}

		always_assert(j == cols);
		++i;
	}

	always_assert(i == rows);
}
/// Fill the projection bias of one layer's feed-forward network
/// (ffnn.proj_bias) from a JSON array of numbers. The array must contain
/// exactly as many numbers as the destination holds; the limit is checked
/// before each store so surplus input is caught by an assertion rather than
/// written past the end.
///
/// @param d decoder to fill
/// @param name name of the file being loaded (unused)
/// @param layer index of the layer to fill
/// @param vec array of numbers
void
ircd::gpt::model::init_h_ffnn_proj_bias(decoder &d,
                                        const string_view &name,
                                        const size_t &layer,
                                        const json::array &vec)
{
	auto &dst
	{
		d.layer[layer].ffnn.proj_bias
	};

	const size_t len
	{
		sizeof(dst) / sizeof(float)
	};

	size_t i(0);
	for(const auto &elem : vec)
	{
		always_assert(i < len);
		dst[i++] = lex_cast<float>(elem);
	}

	always_assert(i == len);
}
/// Fill the weight of one layer's first layer normalization (ln1) from a
/// JSON array of numbers. The array must contain exactly as many numbers as
/// the destination holds; the limit is checked before each store so surplus
/// input is caught by an assertion rather than written past the end.
///
/// @param d decoder to fill
/// @param name name of the file being loaded (unused)
/// @param layer index of the layer to fill
/// @param vec array of numbers
void
ircd::gpt::model::init_h_ln_1_weight(decoder &d,
                                     const string_view &name,
                                     const size_t &layer,
                                     const json::array &vec)
{
	auto &dst
	{
		d.layer[layer].ln1.weight
	};

	const size_t len
	{
		sizeof(dst) / sizeof(float)
	};

	size_t i(0);
	for(const auto &elem : vec)
	{
		always_assert(i < len);
		dst[i++] = lex_cast<float>(elem);
	}

	always_assert(i == len);
}
/// Fill the bias of one layer's first layer normalization (ln1) from a JSON
/// array of numbers. The array must contain exactly as many numbers as the
/// destination holds; the limit is checked before each store so surplus
/// input is caught by an assertion rather than written past the end.
///
/// @param d decoder to fill
/// @param name name of the file being loaded (unused)
/// @param layer index of the layer to fill
/// @param vec array of numbers
void
ircd::gpt::model::init_h_ln_1_bias(decoder &d,
                                   const string_view &name,
                                   const size_t &layer,
                                   const json::array &vec)
{
	auto &dst
	{
		d.layer[layer].ln1.bias
	};

	const size_t len
	{
		sizeof(dst) / sizeof(float)
	};

	size_t i(0);
	for(const auto &elem : vec)
	{
		always_assert(i < len);
		dst[i++] = lex_cast<float>(elem);
	}

	always_assert(i == len);
}
/// Fill the weight of one layer's second layer normalization (ln2) from a
/// JSON array of numbers. The array must contain exactly as many numbers as
/// the destination holds; the limit is checked before each store so surplus
/// input is caught by an assertion rather than written past the end.
///
/// @param d decoder to fill
/// @param name name of the file being loaded (unused)
/// @param layer index of the layer to fill
/// @param vec array of numbers
void
ircd::gpt::model::init_h_ln_2_weight(decoder &d,
                                     const string_view &name,
                                     const size_t &layer,
                                     const json::array &vec)
{
	auto &dst
	{
		d.layer[layer].ln2.weight
	};

	const size_t len
	{
		sizeof(dst) / sizeof(float)
	};

	size_t i(0);
	for(const auto &elem : vec)
	{
		always_assert(i < len);
		dst[i++] = lex_cast<float>(elem);
	}

	always_assert(i == len);
}
/// Fill the bias of one layer's second layer normalization (ln2) from a JSON
/// array of numbers. The array must contain exactly as many numbers as the
/// destination holds; the limit is checked before each store so surplus
/// input is caught by an assertion rather than written past the end.
///
/// @param d decoder to fill
/// @param name name of the file being loaded (unused)
/// @param layer index of the layer to fill
/// @param vec array of numbers
void
ircd::gpt::model::init_h_ln_2_bias(decoder &d,
                                   const string_view &name,
                                   const size_t &layer,
                                   const json::array &vec)
{
	auto &dst
	{
		d.layer[layer].ln2.bias
	};

	const size_t len
	{
		sizeof(dst) / sizeof(float)
	};

	size_t i(0);
	for(const auto &elem : vec)
	{
		always_assert(i < len);
		dst[i++] = lex_cast<float>(elem);
	}

	always_assert(i == len);
}
/// Fill the query/key/value projection weight matrix of one layer
/// (attn.attn_weight) from a JSON array of rows.
///
/// The input must have exactly the shape of the destination. Row and column
/// limits are checked before each store so surplus input is caught by an
/// assertion rather than written past the end of the matrix.
///
/// @param d decoder to fill
/// @param name name of the file being loaded (unused)
/// @param layer index of the layer to fill
/// @param mat array of arrays of numbers
void
ircd::gpt::model::init_h_attn_attn_weight(decoder &d,
                                          const string_view &name,
                                          const size_t &layer,
                                          const json::array &mat)
{
	auto &dst
	{
		d.layer[layer].attn.attn_weight
	};

	const size_t rows
	{
		sizeof(dst) / sizeof(dst[0])
	};

	const size_t cols
	{
		sizeof(dst[0]) / sizeof(float)
	};

	size_t i(0);
	for(const json::array vec : mat)
	{
		always_assert(i < rows);

		size_t j(0);
		for(const auto &elem : vec)
		{
			always_assert(j < cols);
			dst[i][j++] = lex_cast<float>(elem);
		}

		always_assert(j == cols);
		++i;
	}

	always_assert(i == rows);
}
/// Fill the query/key/value projection bias of one layer (attn.attn_bias)
/// from a JSON array of numbers. The array must contain exactly as many
/// numbers as the destination holds; the limit is checked before each store
/// so surplus input is caught by an assertion rather than written past the
/// end.
///
/// @param d decoder to fill
/// @param name name of the file being loaded (unused)
/// @param layer index of the layer to fill
/// @param vec array of numbers
void
ircd::gpt::model::init_h_attn_attn_bias(decoder &d,
                                        const string_view &name,
                                        const size_t &layer,
                                        const json::array &vec)
{
	auto &dst
	{
		d.layer[layer].attn.attn_bias
	};

	const size_t len
	{
		sizeof(dst) / sizeof(float)
	};

	size_t i(0);
	for(const auto &elem : vec)
	{
		always_assert(i < len);
		dst[i++] = lex_cast<float>(elem);
	}

	always_assert(i == len);
}
/// Fill the attention output projection weight matrix of one layer
/// (attn.proj_weight) from a JSON array of rows.
///
/// The input must have exactly the shape of the destination. Row and column
/// limits are checked before each store so surplus input is caught by an
/// assertion rather than written past the end of the matrix.
///
/// @param d decoder to fill
/// @param name name of the file being loaded (unused)
/// @param layer index of the layer to fill
/// @param mat array of arrays of numbers
void
ircd::gpt::model::init_h_attn_proj_weight(decoder &d,
                                          const string_view &name,
                                          const size_t &layer,
                                          const json::array &mat)
{
	auto &dst
	{
		d.layer[layer].attn.proj_weight
	};

	const size_t rows
	{
		sizeof(dst) / sizeof(dst[0])
	};

	const size_t cols
	{
		sizeof(dst[0]) / sizeof(float)
	};

	size_t i(0);
	for(const json::array vec : mat)
	{
		always_assert(i < rows);

		size_t j(0);
		for(const auto &elem : vec)
		{
			always_assert(j < cols);
			dst[i][j++] = lex_cast<float>(elem);
		}

		always_assert(j == cols);
		++i;
	}

	always_assert(i == rows);
}
/// Fill the attention output projection bias of one layer (attn.proj_bias)
/// from a JSON array of numbers. The array must contain exactly as many
/// numbers as the destination holds; the limit is checked before each store
/// so surplus input is caught by an assertion rather than written past the
/// end.
///
/// @param d decoder to fill
/// @param name name of the file being loaded (unused)
/// @param layer index of the layer to fill
/// @param vec array of numbers
void
ircd::gpt::model::init_h_attn_proj_bias(decoder &d,
                                        const string_view &name,
                                        const size_t &layer,
                                        const json::array &vec)
{
	auto &dst
	{
		d.layer[layer].attn.proj_bias
	};

	const size_t len
	{
		sizeof(dst) / sizeof(float)
	};

	size_t i(0);
	for(const auto &elem : vec)
	{
		always_assert(i < len);
		dst[i++] = lex_cast<float>(elem);
	}

	always_assert(i == len);
}
/// Fill the attention mask of one layer (attn.bias) from a four-dimensional
/// JSON array. The two outer dimensions are only iterated; the two inner
/// dimensions are the rows and columns of the mask.
///
/// Every element must be the text "1.0" or "0.0"; the mask entry is true
/// for "1.0". The number of rows must equal that of the destination. Row and
/// column limits are checked before each store so surplus input is caught by
/// an assertion rather than written past the end of the mask.
///
/// @param d decoder to fill
/// @param name name of the file being loaded (unused)
/// @param layer index of the layer to fill
/// @param mat array of arrays of arrays of arrays of numbers
void
ircd::gpt::model::init_h_attn_bias(decoder &d,
                                   const string_view &name,
                                   const size_t &layer,
                                   const json::array &mat)
{
	auto &dst
	{
		d.layer[layer].attn.bias
	};

	const size_t rows
	{
		sizeof(dst) / sizeof(dst[0])
	};

	const size_t cols
	{
		sizeof(dst[0]) / sizeof(bool)
	};

	for(const json::array dim0 : mat)
	{
		for(const json::array dim1 : dim0)
		{
			size_t k(0);
			for(const json::array dim2 : dim1)
			{
				always_assert(k < rows);

				size_t l(0);
				for(const auto &elem : dim2)
				{
					always_assert(elem == "1.0" || elem == "0.0");
					always_assert(l < cols);
					dst[k][l++] = startswith(elem, '1');
				}

				++k;
			}

			always_assert(k == rows);
		}
	}
}