mirror of
https://github.com/matrix-construct/construct
synced 2025-01-13 16:33:53 +01:00
ircd::gpt: More Matrix Than Matrix.
This commit is contained in:
parent
53c4260a21
commit
4458235dfa
5 changed files with 1085 additions and 0 deletions
|
@ -16,9 +16,19 @@
|
|||
namespace ircd::gpt
|
||||
{
|
||||
IRCD_EXCEPTION(ircd::error, error)
|
||||
|
||||
u16
|
||||
generate(const vector_view<const f32> &) noexcept;
|
||||
|
||||
vector_view<f32>
|
||||
embed(const vector_view<f32> &,
|
||||
const vector_view<const u16> &) noexcept;
|
||||
|
||||
extern log::log log;
|
||||
}
|
||||
|
||||
#include "vocab.h"
|
||||
#include "model.h"
|
||||
|
||||
namespace ircd::gpt
|
||||
{
|
||||
|
|
68
include/ircd/gpt/model.h
Normal file
68
include/ircd/gpt/model.h
Normal file
|
@ -0,0 +1,68 @@
|
|||
// Tensor Construct
|
||||
//
|
||||
// Copyright (C) Matrix Construct Developers, Authors & Contributors
|
||||
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
|
||||
//
|
||||
// Permission to use, copy, modify, and/or distribute this software for any
|
||||
// purpose with or without fee is hereby granted, provided that the above
|
||||
// copyright notice and this permission notice is present in all copies. The
|
||||
// full license for this software is available in the LICENSE file.
|
||||
|
||||
#pragma once
|
||||
#define HAVE_IRCD_GPT_MODEL_H
|
||||
|
||||
/// Plain-data layout of the model parameters. Each type below is defined
/// further down in this header.
namespace ircd::gpt::model
{
	struct norm;       // layer normalization parameters
	struct attn;       // attention parameters
	struct ffnn;       // feed-forward network parameters
	struct block;      // one transformer block
	struct decoder;    // embeddings, the stack of blocks and the final norm
}
|
||||
|
||||
/// Attention aperature
|
||||
struct ircd::gpt::model::attn
|
||||
{
|
||||
float
|
||||
attn_bias alignas(64) [2304],
|
||||
attn_weight alignas(64) [768][2304],
|
||||
proj_bias alignas(64) [768],
|
||||
proj_weight alignas(64) [768][768];
|
||||
bool bias alignas(64) [1024][1024];
|
||||
};
|
||||
|
||||
/// Feed-forward neural network
|
||||
struct ircd::gpt::model::ffnn
|
||||
{
|
||||
float
|
||||
fc_bias alignas(64) [3072],
|
||||
fc_weight alignas(64) [768][3072],
|
||||
proj_bias alignas(64) [768],
|
||||
proj_weight alignas(64) [3072][768];
|
||||
};
|
||||
|
||||
/// Layer normalization
|
||||
struct ircd::gpt::model::norm
|
||||
{
|
||||
float
|
||||
bias alignas(64) [768],
|
||||
weight alignas(64) [768];
|
||||
};
|
||||
|
||||
/// Transformer block
|
||||
struct ircd::gpt::model::block
|
||||
{
|
||||
norm ln1;
|
||||
model::attn attn;
|
||||
norm ln2;
|
||||
model::ffnn ffnn;
|
||||
};
|
||||
|
||||
struct ircd::gpt::model::decoder
|
||||
{
|
||||
float
|
||||
wpe alignas(64) [1024][768],
|
||||
wte alignas(64) [65536][768];
|
||||
block layer[12];
|
||||
norm f;
|
||||
};
|
|
@ -217,6 +217,8 @@ libircd_la_SOURCES += png.cc
|
|||
if OPENCL
|
||||
libircd_la_SOURCES += cl.cc
|
||||
endif
|
||||
libircd_la_SOURCES += gpt.cc
|
||||
libircd_la_SOURCES += gpt_model.cc
|
||||
libircd_la_SOURCES += gpt_vocab.cc
|
||||
libircd_la_SOURCES += openssl.cc
|
||||
libircd_la_SOURCES += rfc1459.cc
|
||||
|
@ -254,6 +256,16 @@ ROCKSDB_SRC_CPPFLAGS =#
|
|||
ROCKSDB_SRC_CPPFLAGS += -isystem $(top_srcdir)/deps/rocksdb/include
|
||||
ROCKSDB_SRC_CPPFLAGS += -isystem $(top_srcdir)/deps/rocksdb
|
||||
|
||||
GPT_FP_CXXFLAGS =#
|
||||
GPT_FP_CXXFLAGS += -fno-math-errno
|
||||
GPT_FP_CXXFLAGS += -fno-trapping-math
|
||||
GPT_FP_CXXFLAGS += -ffinite-math-only
|
||||
GPT_FP_CXXFLAGS += -fno-signed-zeros
|
||||
GPT_FP_CXXFLAGS += -fdenormal-fp-math=positive-zero
|
||||
GPT_FP_CXXFLAGS += -fassociative-math
|
||||
GPT_FP_CXXFLAGS += -ffp-contract=fast
|
||||
GPT_FP_CXXFLAGS += -freciprocal-math
|
||||
|
||||
#
|
||||
# Specific unit option composition
|
||||
#
|
||||
|
@ -282,6 +294,7 @@ endif
|
|||
if IOU
|
||||
fs_iou.lo: AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
|
||||
endif
|
||||
gpt.lo: AM_CXXFLAGS := ${AM_CXXFLAGS} ${GPT_FP_CXXFLAGS}
|
||||
http.lo: AM_CPPFLAGS := ${SPIRIT_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
|
||||
http.lo: AM_CXXFLAGS := ${SPIRIT_UNIT_CXXFLAGS} ${AM_CXXFLAGS}
|
||||
ios.lo: AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
|
||||
|
|
510
ircd/gpt.cc
Normal file
510
ircd/gpt.cc
Normal file
|
@ -0,0 +1,510 @@
|
|||
// Matrix Construct Is All You Need Is All You Need Is AllĊĊĊĊĊĊĊĊ
|
||||
//
|
||||
// Copyright (C) Matrix Construct Developers, Authors & Contributors
|
||||
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
|
||||
//
|
||||
// Permission to use, copy, modify, and/or distribute this software for any
|
||||
// purpose with or without fee is hereby granted, provided that the above
|
||||
// copyright notice and this permission notice is present in all copies. The
|
||||
// full license for this software is available in the LICENSE file.
|
||||
|
||||
decltype(ircd::gpt::log)
|
||||
ircd::gpt::log
|
||||
{
|
||||
"gpt"
|
||||
};
|
||||
|
||||
namespace ircd::gpt
|
||||
{
|
||||
static void gelu(float &, const float &);
|
||||
static void gelu(float (&)[3072], const float (&)[3072]);
|
||||
static void norm(float (&)[768], const float (&)[768], const float (&)[768], const float (&)[768], const float);
|
||||
static void fmma(float (&)[768], const float (&)[3072], const float (&)[768], const float (&)[3072][768]);
|
||||
static void fmma(float (&)[3072], const float (&)[768], const float (&)[3072], const float (&)[768][3072]);
|
||||
static void fmma(float (&)[2304], const float (&)[768], const float (&)[2304], const float (&)[768][2304]);
|
||||
static void fmma(float *, const float (&)[12][1024][64], const float (&)[768], const float (&)[768][768], const size_t);
|
||||
static void vals(float (&)[12][1024][64], const float (&)[12][1024][1024], const float (&)[3][1024][12][64], const size_t);
|
||||
static void pare(float (&)[12][1024][1024], const float (&)[3][1024][12][64], const size_t);
|
||||
static void mask(float (&)[12][1024][1024], const float (&)[12][1024][1024], const bool (&)[1024][1024], const size_t);
|
||||
static void smax(float (&)[12][1024][1024], const float (&)[12][1024][1024], const size_t);
|
||||
static void ctrl(float (&)[3][1024][12][64], const float *const, const size_t, const model::block &);
|
||||
static void ffnn(float (&)[768], const float (&)[768], const model::block &);
|
||||
static void transform(float *, const size_t, const model::decoder &);
|
||||
static void logitsmax(float *, const float *);
|
||||
static void logits(float *, const float (&)[768], const model::decoder &);
|
||||
static void tail(float *, const float (&)[768], const model::decoder &);
|
||||
static u16 argmax(const float *);
|
||||
|
||||
std::unique_ptr<model::decoder> device
|
||||
{
|
||||
new model::decoder{}
|
||||
};
|
||||
|
||||
static f32
|
||||
logit alignas(64) [65536],
|
||||
scratch alignas(64) [1024 * 768];
|
||||
}
|
||||
|
||||
/// Model hyper-parameters.
namespace ircd::gpt::model
{
	// Dropout probabilities. None of these is referenced by the code
	// visible in this file.
	constexpr float embed_pdrop {0.1};
	constexpr float attn_pdrop {0.1};
	constexpr float resid_pdrop {0.1};

	// Epsilon added to the variance by each layer normalization.
	constexpr float ln1_epsilon {0.00001};
	constexpr float ln2_epsilon {0.00001};
	constexpr float lnf_epsilon {0.00001};
}
|
||||
|
||||
/// Embeds a sequence of tokens.
///
/// For each input token the token embedding (wte) and the embedding of its
/// position (wpe) are summed into 768 consecutive floats of `out`.
///
/// @param out destination; must hold at least in.size() * 768 floats.
/// @param in token ids; at most one per row of wpe.
/// @returns a view of the floats written at the front of `out`.
ircd::vector_view<ircd::f32>
ircd::gpt::embed(const vector_view<f32> &out,
                 const vector_view<const u16> &in)
noexcept
{
	assert(device);

	// wpe has one row per position: a longer input would index past it.
	always_assert(in.size() <= sizeof(device->wpe) / sizeof(device->wpe[0]));

	// Every token produces 768 floats: the destination must have room.
	always_assert(out.size() >= in.size() * 768);

	uint i(0);
	for(; i < in.size(); ++i)
	{
		const auto &wpe
		{
			device->wpe[i]
		};

		// A u16 token id can never exceed the 65536 rows of wte.
		const auto &wte
		{
			device->wte[in[i]]
		};

		for(uint j(0); j < 768; ++j)
			out[i * 768 + j] = wte[j] + wpe[j];
	}

	return vector_view<f32>
	{
		data(out), i * 768
	};
}
|
||||
|
||||
uint16_t
|
||||
ircd::gpt::generate(const vector_view<const f32> &in)
|
||||
noexcept
|
||||
{
|
||||
always_assert(in.size() % 768 == 0);
|
||||
const auto toks
|
||||
{
|
||||
in.size() / 768
|
||||
};
|
||||
|
||||
const vector_view<f32> scratch
|
||||
{
|
||||
gpt::scratch, in.size()
|
||||
};
|
||||
|
||||
for(uint i(0); i < in.size(); ++i)
|
||||
scratch[i] = in[i];
|
||||
|
||||
transform(data(scratch), toks, *device);
|
||||
|
||||
static float
|
||||
buf alignas(64) [768];
|
||||
|
||||
for(uint i(0); i < 768; ++i)
|
||||
buf[i] = scratch[(toks - 1) * 768 + i];
|
||||
|
||||
tail(logit, buf, *device);
|
||||
return argmax(logit);
|
||||
}
|
||||
|
||||
uint16_t
|
||||
ircd::gpt::argmax(const float *const __restrict__ logit)
|
||||
{
|
||||
u16 ret(0);
|
||||
for(uint j(0); j < vocab::tokens; ++j)
|
||||
if(logit[j] > logit[ret])
|
||||
ret = j;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
[[gnu::noinline]]
|
||||
void
|
||||
ircd::gpt::tail(float *const __restrict__ logit,
|
||||
const float (&__restrict__ state)[768],
|
||||
const model::decoder &d)
|
||||
{
|
||||
static float
|
||||
buf alignas(64) [768];
|
||||
|
||||
norm(buf, state, d.f.bias, d.f.weight, model::lnf_epsilon);
|
||||
logits(logit, buf, d);
|
||||
//logitsmax(logit, logit);
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::logits(float *const __restrict__ out,
|
||||
const float (&__restrict__ in)[768],
|
||||
const model::decoder &d)
|
||||
{
|
||||
for(uint j(0); j < vocab::tokens; ++j)
|
||||
out[j] = 0;
|
||||
|
||||
for(uint j(0); j < vocab::tokens; ++j)
|
||||
for(uint k(0); k < 768; ++k)
|
||||
out[j] += in[k] * d.wte[j][k];
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::logitsmax(float *const out,
|
||||
const float *const in)
|
||||
{
|
||||
static float
|
||||
exps alignas(64) [65536];
|
||||
|
||||
for(uint j(0); j < vocab::tokens; ++j)
|
||||
exps[j] = exp(in[j]);
|
||||
|
||||
for(uint j(0); j < vocab::tokens; ++j)
|
||||
out[j] = 0;
|
||||
|
||||
for(uint j(0); j < vocab::tokens; ++j)
|
||||
for(uint k(0); k < vocab::tokens; ++k)
|
||||
out[k] += exps[j];
|
||||
|
||||
for(uint j(0); j < vocab::tokens; ++j)
|
||||
out[j] = exps[j] / out[j];
|
||||
}
|
||||
|
||||
[[gnu::noinline]]
|
||||
void
|
||||
ircd::gpt::transform(float *__restrict__ accum,
|
||||
const size_t tokens,
|
||||
const model::decoder &decoder)
|
||||
{
|
||||
static float
|
||||
qkv alignas(64) [3][1024][12][64],
|
||||
state alignas(64) [12][1024][1024],
|
||||
attns alignas(64) [12][1024][64],
|
||||
buf alignas(64) [768];
|
||||
|
||||
for(uint i(0); i < 12; ++i)
|
||||
{
|
||||
const auto &layer
|
||||
{
|
||||
decoder.layer[i]
|
||||
};
|
||||
|
||||
ctrl(qkv, accum, tokens, layer);
|
||||
pare(state, qkv, tokens);
|
||||
mask(state, state, layer.attn.bias, tokens);
|
||||
smax(state, state, tokens);
|
||||
vals(attns, state, qkv, tokens);
|
||||
fmma(accum, attns, layer.attn.proj_bias, layer.attn.proj_weight, tokens);
|
||||
|
||||
for(uint j(0); j < tokens; ++j)
|
||||
{
|
||||
for(uint k(0); k < 768; ++k)
|
||||
buf[k] = accum[j * 768 + k];
|
||||
|
||||
ffnn(buf, buf, layer);
|
||||
for(uint k(0); k < 768; ++k)
|
||||
accum[j * 768 + k] += buf[k];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::ffnn(float (&__restrict__ out)[768],
|
||||
const float (&__restrict__ in)[768],
|
||||
const model::block &layer)
|
||||
{
|
||||
static float
|
||||
proj alignas(64) [3072];
|
||||
|
||||
norm(out, in, layer.ln2.bias, layer.ln2.weight, model::ln2_epsilon);
|
||||
fmma(proj, out, layer.ffnn.fc_bias, layer.ffnn.fc_weight);
|
||||
gelu(proj, proj);
|
||||
fmma(out, proj, layer.ffnn.proj_bias, layer.ffnn.proj_weight);
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::ctrl(float (&__restrict__ out)[3][1024][12][64],
|
||||
const float *const __restrict__ in,
|
||||
const size_t num,
|
||||
const model::block &layer)
|
||||
{
|
||||
float
|
||||
(&__restrict__ qry)[1024][12][64] { out[0] },
|
||||
(&__restrict__ key)[1024][12][64] { out[1] },
|
||||
(&__restrict__ val)[1024][12][64] { out[2] };
|
||||
|
||||
for(uint i(0); i < num; ++i)
|
||||
{
|
||||
static float
|
||||
buf alignas(64) [768],
|
||||
proj alignas(64) [2304];
|
||||
|
||||
for(uint j(0); j < 768; ++j)
|
||||
buf[j] = in[i * 768 + j];
|
||||
|
||||
norm(buf, buf, layer.ln1.bias, layer.ln1.weight, model::ln1_epsilon);
|
||||
fmma(proj, buf, layer.attn.attn_bias, layer.attn.attn_weight);
|
||||
|
||||
#pragma clang loop unroll (disable)
|
||||
for(uint j(0); j < 12; ++j)
|
||||
for(uint k(0); k < 64; ++k)
|
||||
qry[i][j][k] = proj[768 * 0 + j * 64 + k];
|
||||
|
||||
#pragma clang loop unroll (disable)
|
||||
for(uint j(0); j < 12; ++j)
|
||||
for(uint k(0); k < 64; ++k)
|
||||
key[i][j][k] = proj[768 * 1 + j * 64 + k];
|
||||
|
||||
#pragma clang loop unroll (disable)
|
||||
for(uint j(0); j < 12; ++j)
|
||||
for(uint k(0); k < 64; ++k)
|
||||
val[i][j][k] = proj[768 * 2 + j * 64 + k];
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::pare(float (&__restrict__ out)[12][1024][1024],
|
||||
const float (&__restrict__ qkv)[3][1024][12][64],
|
||||
const size_t num)
|
||||
{
|
||||
const float
|
||||
(&__restrict__ qry)[1024][12][64] { qkv[0] },
|
||||
(&__restrict__ key)[1024][12][64] { qkv[1] },
|
||||
(&__restrict__ val)[1024][12][64] { qkv[2] };
|
||||
|
||||
#pragma clang loop unroll (disable)
|
||||
for(uint j(0); j < 12; ++j)
|
||||
for(uint k(0); k < num; ++k)
|
||||
for(uint l(0); l < num; ++l)
|
||||
out[j][k][l] = 0;
|
||||
|
||||
#pragma clang loop unroll (disable)
|
||||
for(uint j(0); j < 12; ++j)
|
||||
for(uint k(0); k < num; ++k)
|
||||
for(uint l(0); l < num; ++l)
|
||||
for(uint m(0); m < 64; ++m)
|
||||
out[j][k][l] += qry[k][j][m] * key[l][j][m];
|
||||
|
||||
#pragma clang loop unroll (disable)
|
||||
for(uint j(0); j < 12; ++j)
|
||||
for(uint k(0); k < num; ++k)
|
||||
for(uint l(0); l < num; ++l)
|
||||
out[j][k][l] /= 8.0;
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::vals(float (&__restrict__ out)[12][1024][64],
|
||||
const float (&__restrict__ in)[12][1024][1024],
|
||||
const float (&__restrict__ qkv)[3][1024][12][64],
|
||||
const size_t num)
|
||||
{
|
||||
const float
|
||||
(&__restrict__ val)[1024][12][64] { qkv[2] };
|
||||
|
||||
#pragma clang loop unroll (disable)
|
||||
for(uint j(0); j < 12; ++j)
|
||||
for(uint k(0); k < num; ++k)
|
||||
for(uint l(0); l < 64; ++l)
|
||||
out[j][k][l] = 0;
|
||||
|
||||
#pragma clang loop unroll (disable)
|
||||
for(uint j(0); j < 12; ++j)
|
||||
for(uint k(0); k < num; ++k)
|
||||
for(uint l(0); l < num; ++l)
|
||||
for(uint m(0); m < 64; ++m)
|
||||
out[j][k][m] += in[j][k][l] * val[l][j][m];
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::smax(float (&__restrict__ out)[12][1024][1024],
|
||||
const float (&__restrict__ in)[12][1024][1024],
|
||||
const size_t num)
|
||||
{
|
||||
static float
|
||||
exps alignas(64) [12][1024][1024];
|
||||
|
||||
#pragma clang loop unroll (disable)
|
||||
for(uint j(0); j < 12; ++j)
|
||||
for(uint k(0); k < num; ++k)
|
||||
for(uint m(0); m < num; ++m)
|
||||
exps[j][k][m] = exp(in[j][k][m]);
|
||||
|
||||
#pragma clang loop unroll (disable)
|
||||
for(uint j(0); j < 12; ++j)
|
||||
for(uint k(0); k < num; ++k)
|
||||
for(uint m(0); m < num; ++m)
|
||||
out[j][k][m] = 0;
|
||||
|
||||
#pragma clang loop unroll (disable)
|
||||
for(uint j(0); j < 12; ++j)
|
||||
for(uint k(0); k < num; ++k)
|
||||
for(uint m(0); m < num; ++m)
|
||||
for(uint l(0); l < num; ++l)
|
||||
out[j][k][l] += exps[j][k][m];
|
||||
|
||||
#pragma clang loop unroll (disable)
|
||||
for(uint j(0); j < 12; ++j)
|
||||
for(uint k(0); k < num; ++k)
|
||||
for(uint l(0); l < num; ++l)
|
||||
out[j][k][l] = exps[j][k][l] / out[j][k][l];
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::mask(float (&__restrict__ out)[12][1024][1024],
|
||||
const float (&__restrict__ in)[12][1024][1024],
|
||||
const bool (&__restrict__ bias)[1024][1024],
|
||||
const size_t num)
|
||||
{
|
||||
static const float masked
|
||||
{
|
||||
-10000.0
|
||||
};
|
||||
|
||||
#pragma clang loop unroll (disable)
|
||||
for(uint j(0); j < 12; ++j)
|
||||
for(uint k(0); k < num; ++k)
|
||||
for(uint l(0); l < num; ++l)
|
||||
out[j][k][l] = bias[k][l]? in[j][k][l]: masked;
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::norm(float (&__restrict__ out)[768],
|
||||
const float (&__restrict__ in)[768],
|
||||
const float (&__restrict__ bias)[768],
|
||||
const float (&__restrict__ weight)[768],
|
||||
const float epsilon)
|
||||
{
|
||||
static float
|
||||
tmp alignas(64) [768];
|
||||
|
||||
const float mean
|
||||
{
|
||||
math::mean<float>(in)
|
||||
};
|
||||
|
||||
for(uint j(0); j < 768; ++j)
|
||||
tmp[j] = pow(in[j] - mean, 2);
|
||||
|
||||
const float s
|
||||
{
|
||||
math::mean<float>(tmp)
|
||||
};
|
||||
|
||||
for(uint j(0); j < 768; ++j)
|
||||
out[j] = (in[j] - mean) / sqrt(s + epsilon),
|
||||
out[j] = out[j] * weight[j] + bias[j];
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::fmma(float *const __restrict__ out,
|
||||
const float (&__restrict__ attn)[12][1024][64],
|
||||
const float (&__restrict__ bias)[768],
|
||||
const float (&__restrict__ weight)[768][768],
|
||||
const size_t num)
|
||||
{
|
||||
static float
|
||||
a alignas(64) [1024][768],
|
||||
b alignas(64) [1024][768];
|
||||
|
||||
for(uint k(0); k < 12; k++)
|
||||
for(uint j(0); j < num; j++)
|
||||
for(uint l(0); l < 64; l++)
|
||||
a[j][k * 64 + l] = attn[k][j][l];
|
||||
|
||||
for(uint i(0); i < num; i++)
|
||||
for(uint j(0); j < 768; j++)
|
||||
b[i][j] = bias[j];
|
||||
|
||||
for(uint i(0); i < num; i++)
|
||||
for(uint k(0); k < 768; k++)
|
||||
for(uint j(0); j < 768; j++)
|
||||
b[i][k] += a[i][j] * weight[j][k];
|
||||
|
||||
for(uint i(0); i < num; i++)
|
||||
for(uint j(0); j < 768; j++)
|
||||
out[i * 768 + j] += b[i][j];
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::fmma(float (&__restrict__ out)[2304],
|
||||
const float (&__restrict__ in)[768],
|
||||
const float (&__restrict__ bias)[2304],
|
||||
const float (&__restrict__ weight)[768][2304])
|
||||
{
|
||||
for(uint j(0); j < 2304; ++j)
|
||||
out[j] = bias[j];
|
||||
|
||||
for(uint k(0); k < 768; ++k)
|
||||
for(uint j(0); j < 2304; ++j)
|
||||
out[j] += in[k] * weight[k][j];
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::fmma(float (&__restrict__ out)[768],
|
||||
const float (&__restrict__ in)[3072],
|
||||
const float (&__restrict__ bias)[768],
|
||||
const float (&__restrict__ weight)[3072][768])
|
||||
{
|
||||
for(uint j(0); j < 768; ++j)
|
||||
out[j] = bias[j];
|
||||
|
||||
for(uint k(0); k < 3072; k++)
|
||||
for(uint j(0); j < 768; j++)
|
||||
out[j] += in[k] * weight[k][j];
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::fmma(float (&__restrict__ out)[3072],
|
||||
const float (&__restrict__ in)[768],
|
||||
const float (&__restrict__ bias)[3072],
|
||||
const float (&__restrict__ weight)[768][3072])
|
||||
{
|
||||
for(uint j(0); j < 3072; ++j)
|
||||
out[j] = bias[j];
|
||||
|
||||
for(uint k(0); k < 768; ++k)
|
||||
for(uint j(0); j < 3072; ++j)
|
||||
out[j] += in[k] * weight[k][j];
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::gelu(float (&__restrict__ out)[3072],
|
||||
const float (&__restrict__ in)[3072])
|
||||
{
|
||||
for(uint j(0); j < 3072; ++j)
|
||||
gelu(out[j], in[j]);
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::gelu(float &out,
|
||||
const float &in)
|
||||
{
|
||||
out = 0.5 * in * (1.0 + tanh(in * 0.7978845608 * (1.0 + 0.044715 * in * in)));
|
||||
}
|
484
ircd/gpt_model.cc
Normal file
484
ircd/gpt_model.cc
Normal file
|
@ -0,0 +1,484 @@
|
|||
// Tensor Construct
|
||||
//
|
||||
// Copyright (C) Matrix Construct Developers, Authors & Contributors
|
||||
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
|
||||
//
|
||||
// Permission to use, copy, modify, and/or distribute this software for any
|
||||
// purpose with or without fee is hereby granted, provided that the above
|
||||
// copyright notice and this permission notice is present in all copies. The
|
||||
// full license for this software is available in the LICENSE file.
|
||||
|
||||
namespace ircd::gpt::model
|
||||
{
|
||||
static void
|
||||
init_f_weight(decoder &, const string_view &, const size_t &, const json::array &),
|
||||
init_f_bias(decoder &, const string_view &, const size_t &, const json::array &),
|
||||
init_wpe_weight(decoder &, const string_view &, const size_t &, const json::array &),
|
||||
init_wte_weight(decoder &, const string_view &, const size_t &, const json::array &),
|
||||
init_h_ffnn_fc_weight(decoder &, const string_view &, const size_t &, const json::array &),
|
||||
init_h_ffnn_fc_bias(decoder &, const string_view &, const size_t &, const json::array &),
|
||||
init_h_ffnn_proj_weight(decoder &, const string_view &, const size_t &, const json::array &),
|
||||
init_h_ffnn_proj_bias(decoder &, const string_view &, const size_t &, const json::array &),
|
||||
init_h_ln_1_weight(decoder &, const string_view &, const size_t &, const json::array &),
|
||||
init_h_ln_1_bias(decoder &, const string_view &, const size_t &, const json::array &),
|
||||
init_h_ln_2_weight(decoder &, const string_view &, const size_t &, const json::array &),
|
||||
init_h_ln_2_bias(decoder &, const string_view &, const size_t &, const json::array &),
|
||||
init_h_attn_attn_weight(decoder &, const string_view &, const size_t &, const json::array &),
|
||||
init_h_attn_attn_bias(decoder &, const string_view &, const size_t &, const json::array &),
|
||||
init_h_attn_proj_weight(decoder &, const string_view &, const size_t &, const json::array &),
|
||||
init_h_attn_proj_bias(decoder &, const string_view &, const size_t &, const json::array &),
|
||||
init_h_attn_bias(decoder &, const string_view &, const size_t &, const json::array &),
|
||||
init() noexcept;
|
||||
|
||||
extern conf::item<std::string> path;
|
||||
extern const std::pair
|
||||
<
|
||||
string_view,
|
||||
void (*)(decoder &, const string_view &, const size_t &, const json::array &)
|
||||
>
|
||||
manifest[],
|
||||
manifest_h[],
|
||||
manifest_td[];
|
||||
}
|
||||
|
||||
decltype(ircd::gpt::model::manifest_h)
|
||||
ircd::gpt::model::manifest_h
|
||||
{
|
||||
{ "h.%u.mlp.c_fc.weight.json", init_h_ffnn_fc_weight, },
|
||||
{ "h.%u.mlp.c_fc.bias.json", init_h_ffnn_fc_bias, },
|
||||
{ "h.%u.mlp.c_proj.weight.json", init_h_ffnn_proj_weight, },
|
||||
{ "h.%u.mlp.c_proj.bias.json", init_h_ffnn_proj_bias, },
|
||||
{ "h.%u.ln_1.weight.json", init_h_ln_1_weight, },
|
||||
{ "h.%u.ln_1.bias.json", init_h_ln_1_bias, },
|
||||
{ "h.%u.ln_2.weight.json", init_h_ln_2_weight, },
|
||||
{ "h.%u.ln_2.bias.json", init_h_ln_2_bias, },
|
||||
{ "h.%u.attn.c_attn.weight.json", init_h_attn_attn_weight, },
|
||||
{ "h.%u.attn.c_attn.bias.json", init_h_attn_attn_bias, },
|
||||
{ "h.%u.attn.c_proj.weight.json", init_h_attn_proj_weight, },
|
||||
{ "h.%u.attn.c_proj.bias.json", init_h_attn_proj_bias },
|
||||
{ "h.%u.attn.bias.json", init_h_attn_bias, },
|
||||
};
|
||||
|
||||
decltype(ircd::gpt::model::manifest)
|
||||
ircd::gpt::model::manifest
|
||||
{
|
||||
{ "ln_f.weight.json", init_f_weight, },
|
||||
{ "ln_f.bias.json", init_f_bias, },
|
||||
{ "wpe.weight.json", init_wpe_weight },
|
||||
{ "wte.weight.json", init_wte_weight },
|
||||
};
|
||||
|
||||
decltype(ircd::gpt::model::manifest_td)
|
||||
ircd::gpt::model::manifest_td
|
||||
{
|
||||
{ "test.jsonl", nullptr, },
|
||||
{ "valid.jsonl", nullptr, },
|
||||
{ "train.jsonl", nullptr, },
|
||||
};
|
||||
|
||||
decltype(ircd::gpt::model::path)
|
||||
ircd::gpt::model::path
|
||||
{
|
||||
{
|
||||
{ "name", "ircd.gpt.model.path" },
|
||||
{ "default", string_view{} },
|
||||
},
|
||||
init
|
||||
};
|
||||
|
||||
//TODO: XXX
|
||||
namespace ircd::gpt
|
||||
{
|
||||
extern const std::unique_ptr<model::decoder> device;
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::model::init()
|
||||
noexcept
|
||||
{
|
||||
if(!model::path)
|
||||
return;
|
||||
|
||||
const size_t layers
|
||||
{
|
||||
12
|
||||
};
|
||||
|
||||
const auto handle{[]
|
||||
(const auto &a, const auto &b, const auto &i)
|
||||
{
|
||||
const auto &[fmt, handler]
|
||||
{
|
||||
a[b]
|
||||
};
|
||||
|
||||
char namebuf[128] {0};
|
||||
const string_view path_part[2]
|
||||
{
|
||||
model::path, fmt::sprintf
|
||||
{
|
||||
namebuf, fmt, i
|
||||
}
|
||||
};
|
||||
|
||||
const fs::fd fd
|
||||
{
|
||||
fs::path(fs::path_scratch, path_part)
|
||||
};
|
||||
|
||||
fs::map::opts map_opts;
|
||||
const fs::map map
|
||||
{
|
||||
fd, map_opts
|
||||
};
|
||||
|
||||
const json::array mat
|
||||
{
|
||||
map
|
||||
};
|
||||
|
||||
assert(gpt::device);
|
||||
handler(*gpt::device, path_part[1], i, mat);
|
||||
log::logf
|
||||
{
|
||||
log, log::level::DEBUG,
|
||||
"Model init [%2d][%2d] :%s",
|
||||
i,
|
||||
b,
|
||||
path_part[1],
|
||||
};
|
||||
}};
|
||||
|
||||
ircd::timer sw;
|
||||
size_t read(0), wrote(0);
|
||||
if(fs::exists("model"))
|
||||
{
|
||||
const auto _read
|
||||
{
|
||||
fs::read(fs::fd{"model"}, mutable_buffer{(char *)(gpt::device.get()), sizeof(model::decoder)})
|
||||
};
|
||||
|
||||
read = size(_read);
|
||||
} else {
|
||||
memset(device.get(), 0x0, sizeof(model::decoder));
|
||||
|
||||
handle(manifest, 0, 0);
|
||||
handle(manifest, 1, 0);
|
||||
handle(manifest, 2, 0);
|
||||
handle(manifest, 3, 0);
|
||||
for(size_t i(0); i < layers; ++i)
|
||||
for(size_t j(0); j < 13; ++j)
|
||||
handle(manifest_h, j, i);
|
||||
|
||||
const auto _wrote
|
||||
{
|
||||
fs::write("model", const_buffer{(const char *)(gpt::device.get()), sizeof(model::decoder)})
|
||||
};
|
||||
|
||||
wrote = size(_wrote);
|
||||
}
|
||||
|
||||
char pbuf[3][48];
|
||||
log::logf
|
||||
{
|
||||
log, log::level::DEBUG,
|
||||
"Model init completed in %s read %s wrote %s",
|
||||
sw.pretty(pbuf[0]),
|
||||
pretty(pbuf[1], iec(size(read))),
|
||||
pretty(pbuf[2], iec(size(wrote))),
|
||||
};
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::model::init_wpe_weight(decoder &d,
|
||||
const string_view &name,
|
||||
const size_t &layer,
|
||||
const json::array &mat)
|
||||
{
|
||||
size_t i(0);
|
||||
for(const json::array vec : mat)
|
||||
{
|
||||
size_t j(0);
|
||||
for(const auto &elem : vec)
|
||||
d.wpe[i][j++] = lex_cast<float>(elem);
|
||||
|
||||
always_assert(j == sizeof(d.wpe[i]) / sizeof(float));
|
||||
++i;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::model::init_wte_weight(decoder &d,
|
||||
const string_view &name,
|
||||
const size_t &layer,
|
||||
const json::array &mat)
|
||||
{
|
||||
size_t i(0);
|
||||
for(const json::array vec : mat)
|
||||
{
|
||||
size_t j(0);
|
||||
for(const auto &elem : vec)
|
||||
d.wte[i][j++] = lex_cast<float>(elem);
|
||||
|
||||
always_assert(j == sizeof(d.wte[i]) / sizeof(float));
|
||||
++i;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::model::init_f_weight(decoder &d,
|
||||
const string_view &name,
|
||||
const size_t &layer,
|
||||
const json::array &vec)
|
||||
{
|
||||
size_t i(0);
|
||||
for(const auto &elem : vec)
|
||||
d.f.weight[i++] = lex_cast<float>(elem);
|
||||
|
||||
always_assert(i == sizeof(d.f.weight) / sizeof(float));
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::model::init_f_bias(decoder &d,
|
||||
const string_view &name,
|
||||
const size_t &layer,
|
||||
const json::array &vec)
|
||||
{
|
||||
size_t i(0);
|
||||
for(const auto &elem : vec)
|
||||
d.f.bias[i++] = lex_cast<float>(elem);
|
||||
|
||||
always_assert(i == sizeof(d.f.bias) / sizeof(float));
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::model::init_h_ffnn_fc_weight(decoder &d,
|
||||
const string_view &name,
|
||||
const size_t &layer,
|
||||
const json::array &mat)
|
||||
{
|
||||
size_t i(0);
|
||||
for(const json::array vec : mat)
|
||||
{
|
||||
size_t j(0);
|
||||
for(const auto &elem : vec)
|
||||
d.layer[layer].ffnn.fc_weight[i][j++] = lex_cast<float>(elem);
|
||||
|
||||
always_assert(j == sizeof(d.layer[layer].ffnn.fc_weight[i]) / sizeof(float));
|
||||
++i;
|
||||
}
|
||||
|
||||
always_assert
|
||||
(
|
||||
i == sizeof(d.layer[layer].ffnn.fc_weight)
|
||||
/ sizeof(d.layer[layer].ffnn.fc_weight[0])
|
||||
);
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::model::init_h_ffnn_fc_bias(decoder &d,
|
||||
const string_view &name,
|
||||
const size_t &layer,
|
||||
const json::array &vec)
|
||||
{
|
||||
size_t i(0);
|
||||
for(const auto &elem : vec)
|
||||
d.layer[layer].ffnn.fc_bias[i++] = lex_cast<float>(elem);
|
||||
|
||||
always_assert(i == sizeof(d.layer[layer].ffnn.fc_bias) / sizeof(float));
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::model::init_h_ffnn_proj_weight(decoder &d,
|
||||
const string_view &name,
|
||||
const size_t &layer,
|
||||
const json::array &mat)
|
||||
{
|
||||
size_t i(0);
|
||||
for(const json::array vec : mat)
|
||||
{
|
||||
size_t j(0);
|
||||
for(const auto &elem : vec)
|
||||
d.layer[layer].ffnn.proj_weight[i][j++] = lex_cast<float>(elem);
|
||||
|
||||
always_assert(j == sizeof(d.layer[layer].ffnn.proj_weight[i]) / sizeof(float));
|
||||
++i;
|
||||
}
|
||||
|
||||
always_assert
|
||||
(
|
||||
i == sizeof(d.layer[layer].ffnn.proj_weight)
|
||||
/ sizeof(d.layer[layer].ffnn.proj_weight[0])
|
||||
);
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::model::init_h_ffnn_proj_bias(decoder &d,
|
||||
const string_view &name,
|
||||
const size_t &layer,
|
||||
const json::array &vec)
|
||||
{
|
||||
size_t i(0);
|
||||
for(const auto &elem : vec)
|
||||
d.layer[layer].ffnn.proj_bias[i++] = lex_cast<float>(elem);
|
||||
|
||||
always_assert(i == sizeof(d.layer[layer].ffnn.proj_bias) / sizeof(float));
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::model::init_h_ln_1_weight(decoder &d,
|
||||
const string_view &name,
|
||||
const size_t &layer,
|
||||
const json::array &vec)
|
||||
{
|
||||
size_t i(0);
|
||||
for(const auto &elem : vec)
|
||||
d.layer[layer].ln1.weight[i++] = lex_cast<float>(elem);
|
||||
|
||||
always_assert(i == sizeof(d.layer[layer].ln1.weight) / sizeof(float));
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::model::init_h_ln_1_bias(decoder &d,
|
||||
const string_view &name,
|
||||
const size_t &layer,
|
||||
const json::array &vec)
|
||||
{
|
||||
size_t i(0);
|
||||
for(const auto &elem : vec)
|
||||
d.layer[layer].ln1.bias[i++] = lex_cast<float>(elem);
|
||||
|
||||
always_assert(i == sizeof(d.layer[layer].ln1.bias) / sizeof(float));
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::model::init_h_ln_2_weight(decoder &d,
|
||||
const string_view &name,
|
||||
const size_t &layer,
|
||||
const json::array &vec)
|
||||
{
|
||||
size_t i(0);
|
||||
for(const auto &elem : vec)
|
||||
d.layer[layer].ln2.weight[i++] = lex_cast<float>(elem);
|
||||
|
||||
always_assert(i == sizeof(d.layer[layer].ln2.weight) / sizeof(float));
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::model::init_h_ln_2_bias(decoder &d,
|
||||
const string_view &name,
|
||||
const size_t &layer,
|
||||
const json::array &vec)
|
||||
{
|
||||
size_t i(0);
|
||||
for(const auto &elem : vec)
|
||||
d.layer[layer].ln2.bias[i++] = lex_cast<float>(elem);
|
||||
|
||||
always_assert(i == sizeof(d.layer[layer].ln2.bias) / sizeof(float));
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::model::init_h_attn_attn_weight(decoder &d,
|
||||
const string_view &name,
|
||||
const size_t &layer,
|
||||
const json::array &mat)
|
||||
{
|
||||
size_t i(0);
|
||||
for(const json::array vec : mat)
|
||||
{
|
||||
size_t j(0);
|
||||
for(const auto &elem : vec)
|
||||
d.layer[layer].attn.attn_weight[i][j++] = lex_cast<float>(elem);
|
||||
|
||||
always_assert(j == sizeof(d.layer[layer].attn.attn_weight[i]) / sizeof(float));
|
||||
++i;
|
||||
}
|
||||
|
||||
always_assert
|
||||
(
|
||||
i == sizeof(d.layer[layer].attn.attn_weight)
|
||||
/ sizeof(d.layer[layer].attn.attn_weight[0])
|
||||
);
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::model::init_h_attn_attn_bias(decoder &d,
|
||||
const string_view &name,
|
||||
const size_t &layer,
|
||||
const json::array &vec)
|
||||
{
|
||||
size_t i(0);
|
||||
for(const auto &elem : vec)
|
||||
d.layer[layer].attn.attn_bias[i++] = lex_cast<float>(elem);
|
||||
|
||||
always_assert(i == sizeof(d.layer[layer].attn.attn_bias) / sizeof(float));
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::model::init_h_attn_proj_weight(decoder &d,
|
||||
const string_view &name,
|
||||
const size_t &layer,
|
||||
const json::array &mat)
|
||||
{
|
||||
size_t i(0);
|
||||
for(const json::array vec : mat)
|
||||
{
|
||||
size_t j(0);
|
||||
for(const auto &elem : vec)
|
||||
d.layer[layer].attn.proj_weight[i][j++] = lex_cast<float>(elem);
|
||||
|
||||
always_assert(j == sizeof(d.layer[layer].attn.proj_weight[i]) / sizeof(float));
|
||||
++i;
|
||||
}
|
||||
|
||||
always_assert
|
||||
(
|
||||
i == sizeof(d.layer[layer].attn.proj_weight)
|
||||
/ sizeof(d.layer[layer].attn.proj_weight[0])
|
||||
);
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::model::init_h_attn_proj_bias(decoder &d,
|
||||
const string_view &name,
|
||||
const size_t &layer,
|
||||
const json::array &vec)
|
||||
{
|
||||
size_t i(0);
|
||||
for(const auto &elem : vec)
|
||||
d.layer[layer].attn.proj_bias[i++] = lex_cast<float>(elem);
|
||||
|
||||
always_assert(i == sizeof(d.layer[layer].attn.proj_bias) / sizeof(float));
|
||||
}
|
||||
|
||||
void
|
||||
ircd::gpt::model::init_h_attn_bias(decoder &d,
|
||||
const string_view &name,
|
||||
const size_t &layer,
|
||||
const json::array &mat)
|
||||
{
|
||||
for(const json::array dim0 : mat)
|
||||
{
|
||||
for(const json::array dim1 : dim0)
|
||||
{
|
||||
size_t k(0);
|
||||
for(const json::array dim2 : dim1)
|
||||
{
|
||||
size_t l(0);
|
||||
for(const auto &elem : dim2)
|
||||
{
|
||||
always_assert(elem == "1.0" || elem == "0.0");
|
||||
d.layer[layer].attn.bias[k][l++] = startswith(elem, '1');
|
||||
}
|
||||
|
||||
++k;
|
||||
}
|
||||
|
||||
always_assert
|
||||
(
|
||||
k == sizeof(d.layer[layer].attn.bias)
|
||||
/ sizeof(d.layer[layer].attn.bias[0])
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue