ircd::gpt: More Matrix Than Matrix.

2024-12-26 15:33:54 +01:00 · 2021-03-04 17:03:33 -08:00 · 2021-03-04 17:03:33 -08:00 · 4458235dfa
commit 4458235dfa
parent 53c4260a21
5 changed files with 1085 additions and 0 deletions
--- a/include/ircd/gpt/gpt.h
+++ b/include/ircd/gpt/gpt.h
@ -16,9 +16,19 @@
 /// Public interface of the GPT transformer unit.
 namespace ircd::gpt
 {
 	IRCD_EXCEPTION(ircd::error, error)
 	/// Run the decoder over a sequence of embedded tokens (768 floats per
 	/// token, the layout written by embed()) and return the id of the
 	/// highest-scoring next token.
 	u16
 	generate(const vector_view<const f32> &) noexcept;
 	/// Write the embedding of each input token id (token row plus position
 	/// row, 768 floats per token) into the first argument; returns a view
 	/// over the portion that was written.
 	vector_view<f32>
 	embed(const vector_view<f32> &,
 	      const vector_view<const u16> &) noexcept;
 	/// Log facility for this unit.
 	extern log::log log;
 }
 #include "vocab.h"
 #include "model.h"
 namespace ircd::gpt
 {
--- a/include/ircd/gpt/model.h
+++ b/include/ircd/gpt/model.h
@ -0,0 +1,68 @@
 // Tensor Construct
 //
 // Copyright (C) Matrix Construct Developers, Authors & Contributors
 // Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
 //
 // Permission to use, copy, modify, and/or distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
 // copyright notice and this permission notice is present in all copies. The
 // full license for this software is available in the LICENSE file.
 #pragma once
 #define HAVE_IRCD_GPT_MODEL_H
 /// Parameter storage for the decoder; each struct is defined below.
 namespace ircd::gpt::model
 {
 	struct norm;      // layer normalization parameters
 	struct attn;      // attention parameters
 	struct ffnn;      // feed-forward parameters
 	struct block;     // one transformer layer
 	struct decoder;   // embeddings, all layers and the final norm
 }
 /// Attention aperture
 ///
 /// Parameters of one attention sub-layer. The member order and sizes are
 /// part of the raw model image read and written by model::init().
 struct ircd::gpt::model::attn
 {
 	float
 	// Projects a 768-wide vector to 2304 values, which ctrl() splits into
 	// three 768-wide parts (query, key, value).
 	attn_bias    alignas(64) [2304],
 	attn_weight  alignas(64) [768][2304],
 	// Output projection applied to the concatenated heads.
 	proj_bias    alignas(64) [768],
 	proj_weight  alignas(64) [768][768];
 	// Consulted by mask(): a true entry keeps the score at [k][l], a false
 	// entry replaces it with a large negative constant.
 	bool bias    alignas(64) [1024][1024];
 };
 /// Feed-forward neural network
 ///
 /// Two dense layers: 768 -> 3072 (fc) followed, after gelu(), by
 /// 3072 -> 768 (proj).
 struct ircd::gpt::model::ffnn
 {
 	float
 	fc_bias      alignas(64) [3072],
 	fc_weight    alignas(64) [768][3072],
 	proj_bias    alignas(64) [768],
 	proj_weight  alignas(64) [3072][768];
 };
 /// Layer normalization
 ///
 /// Per-element scale (weight) and shift (bias) applied by gpt::norm()
 /// after the input has been centered and divided by its deviation.
 struct ircd::gpt::model::norm
 {
 	float
 	bias    alignas(64) [768],
 	weight  alignas(64) [768];
 };
 /// Transformer block
 ///
 /// One layer: ln1 feeds the attention sub-layer, ln2 feeds the
 /// feed-forward sub-layer.
 struct ircd::gpt::model::block
 {
 	norm ln1;
 	model::attn attn;
 	norm ln2;
 	model::ffnn ffnn;
 };
 /// Complete decoder: embedding tables, twelve transformer blocks and the
 /// final layer normalization. model::init() reads and writes this struct
 /// as one raw image, so its layout must not change casually.
 struct ircd::gpt::model::decoder
 {
 	float
 	// Position rows; indexed by token position in embed().
 	wpe  alignas(64) [1024][768],
 	// Token rows; indexed by token id in embed() and reused by logits().
 	wte  alignas(64) [65536][768];
 	block layer[12];
 	// Final normalization applied by tail() before the logits.
 	norm f;
 };
--- a/ircd/Makefile.am
+++ b/ircd/Makefile.am
@ -217,6 +217,8 @@ libircd_la_SOURCES += png.cc
 if OPENCL
 libircd_la_SOURCES += cl.cc
 endif
 libircd_la_SOURCES += gpt.cc
 libircd_la_SOURCES += gpt_model.cc
 libircd_la_SOURCES += gpt_vocab.cc
 libircd_la_SOURCES += openssl.cc
 libircd_la_SOURCES += rfc1459.cc
@ -254,6 +256,16 @@ ROCKSDB_SRC_CPPFLAGS =#
 ROCKSDB_SRC_CPPFLAGS += -isystem $(top_srcdir)/deps/rocksdb/include
 ROCKSDB_SRC_CPPFLAGS += -isystem $(top_srcdir)/deps/rocksdb
 GPT_FP_CXXFLAGS =#
 GPT_FP_CXXFLAGS += -fno-math-errno
 GPT_FP_CXXFLAGS += -fno-trapping-math
 GPT_FP_CXXFLAGS += -ffinite-math-only
 GPT_FP_CXXFLAGS += -fno-signed-zeros
 GPT_FP_CXXFLAGS += -fdenormal-fp-math=positive-zero
 GPT_FP_CXXFLAGS += -fassociative-math
 GPT_FP_CXXFLAGS += -ffp-contract=fast
 GPT_FP_CXXFLAGS += -freciprocal-math
 #
 # Specific unit option composition
 #
@ -282,6 +294,7 @@ endif
 if IOU
 fs_iou.lo:            AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
 endif
 gpt.lo:               AM_CXXFLAGS := ${AM_CXXFLAGS} ${GPT_FP_CXXFLAGS}
 http.lo:              AM_CPPFLAGS := ${SPIRIT_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
 http.lo:              AM_CXXFLAGS := ${SPIRIT_UNIT_CXXFLAGS} ${AM_CXXFLAGS}
 ios.lo:               AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
--- a/ircd/gpt.cc
+++ b/ircd/gpt.cc
@ -0,0 +1,510 @@
 // Matrix Construct Is All You Need Is All You Need Is AllĊĊĊĊĊĊĊĊ
 //
 // Copyright (C) Matrix Construct Developers, Authors & Contributors
 // Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
 //
 // Permission to use, copy, modify, and/or distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
 // copyright notice and this permission notice is present in all copies. The
 // full license for this software is available in the LICENSE file.
 /// Log facility for the gpt unit; constructed with the name "gpt".
 decltype(ircd::gpt::log)
 ircd::gpt::log
 {
 	"gpt"
 };
 /// Internal kernels and working storage for the decoder. All dimensions are
 /// fixed in the signatures: 768-wide state, 12 heads of 64, 3072-wide
 /// feed-forward layer, at most 1024 tokens.
 namespace ircd::gpt
 {
 	// Activation (scalar and whole-vector forms).
 	static void gelu(float &, const float &);
 	static void gelu(float (&)[3072], const float (&)[3072]);
 	// Layer normalization: (out, in, bias, weight, epsilon).
 	static void norm(float (&)[768], const float (&)[768], const float (&)[768], const float (&)[768], const float);
 	// Dense layers: (out, in, bias, weight); the last overload works on all
 	// tokens at once and adds its result into the output.
 	static void fmma(float (&)[768], const float (&)[3072], const float (&)[768], const float (&)[3072][768]);
 	static void fmma(float (&)[3072], const float (&)[768], const float (&)[3072], const float (&)[768][3072]);
 	static void fmma(float (&)[2304], const float (&)[768], const float (&)[2304], const float (&)[768][2304]);
 	static void fmma(float *, const float (&)[12][1024][64], const float (&)[768], const float (&)[768][768], const size_t);
 	// Attention stages, called in this order by transform():
 	// ctrl -> pare -> mask -> smax -> vals.
 	static void vals(float (&)[12][1024][64], const float (&)[12][1024][1024], const float (&)[3][1024][12][64], const size_t);
 	static void pare(float (&)[12][1024][1024], const float (&)[3][1024][12][64], const size_t);
 	static void mask(float (&)[12][1024][1024], const float (&)[12][1024][1024], const bool (&)[1024][1024], const size_t);
 	static void smax(float (&)[12][1024][1024], const float (&)[12][1024][1024], const size_t);
 	static void ctrl(float (&)[3][1024][12][64], const float *const, const size_t, const model::block &);
 	static void ffnn(float (&)[768], const float (&)[768], const model::block &);
 	static void transform(float *, const size_t, const model::decoder &);
 	// Output head.
 	static void logitsmax(float *, const float *);
 	static void logits(float *, const float (&)[768], const model::decoder &);
 	static void tail(float *, const float (&)[768], const model::decoder &);
 	static u16 argmax(const float *);
 	// Model parameters; value-initialized here and filled by model::init().
 	std::unique_ptr<model::decoder> device
 	{
 		new model::decoder{}
 	};
 	// Shared working buffers used by generate(): one score per token id and
 	// room for 1024 tokens of 768-wide state. Being static, they are shared
 	// by every call.
 	static f32
 	logit alignas(64) [65536],
 	scratch alignas(64) [1024 * 768];
 }
 /// Model hyper-parameters. Only the three epsilon values are referenced by
 /// the code visible in this file (passed to norm()); the *_pdrop values are
 /// not used by any function shown here.
 namespace ircd::gpt::model
 {
 	// NOTE(review): presumably the embedding dropout rate; unused here.
 	constexpr float embed_pdrop
 	{
 		0.1
 	};
 	// Added to the variance in norm() for the first layer norm of a block.
 	constexpr float ln1_epsilon
 	{
 		0.00001
 	};
 	// Added to the variance in norm() for the second layer norm of a block.
 	constexpr float ln2_epsilon
 	{
 		0.00001
 	};
 	// Added to the variance in norm() for the final layer norm.
 	constexpr float lnf_epsilon
 	{
 		0.00001
 	};
 	// NOTE(review): presumably the attention dropout rate; unused here.
 	constexpr float attn_pdrop
 	{
 		0.1
 	};
 	// NOTE(review): presumably the residual dropout rate; unused here.
 	constexpr float resid_pdrop
 	{
 		0.1
 	};
 }
 /// Embed a sequence of token ids.
 ///
 /// For each position i the 768-wide token row wte[in[i]] is added to the
 /// position row wpe[i] and stored at out[i * 768 ...].
 ///
 /// @param out  destination; needs 768 floats per embedded token
 /// @param in   token ids
 /// @return     view over the floats actually written (tokens * 768)
 ///
 /// Embedding stops early when the position table has no further row or
 /// when `out` cannot hold another 768 floats, so the function never reads
 /// or writes out of range; callers can detect truncation from the size of
 /// the returned view.
 ircd::vector_view<ircd::f32>
 ircd::gpt::embed(const vector_view<f32> &out,
                  const vector_view<const u16> &in)
 noexcept
 {
 	assert(device);
 	// Number of rows in the position table.
 	const size_t max_pos
 	{
 		sizeof(device->wpe) / sizeof(device->wpe[0])
 	};
 	uint i(0);
 	for(; i < in.size() && i < max_pos && (i + 1) * 768UL <= out.size(); ++i)
 	{
 		const auto &wpe
 		{
 			device->wpe[i]
 		};
 		// A u16 id always indexes inside the 65536-row token table.
 		const auto &wte
 		{
 			device->wte[in[i]]
 		};
 		for(uint j(0); j < 768; ++j)
 			out[i * 768 + j] = wte[j] + wpe[j];
 	}
 	return vector_view<f32>
 	{
 		data(out), i * 768
 	};
 }
 /// Predict the next token for a sequence of embedded tokens.
 ///
 /// @param in  embedded tokens, 768 floats each (the layout written by
 ///            embed()); must contain between 1 and 1024 tokens
 /// @return    id of the token with the highest logit
 ///
 /// The input is copied into the shared scratch buffer, run through all
 /// decoder layers, and the state of the last token is fed to the output
 /// head. Static buffers are used throughout, so calls share state.
 uint16_t
 ircd::gpt::generate(const vector_view<const f32> &in)
 noexcept
 {
 	always_assert(in.size() % 768 == 0);
 	const auto toks
 	{
 		in.size() / 768
 	};
 	// An empty sequence would make (toks - 1) wrap around below, and a
 	// sequence longer than the scratch buffer would overrun it.
 	always_assert(toks > 0);
 	always_assert(in.size() <= sizeof(gpt::scratch) / sizeof(gpt::scratch[0]));
 	const vector_view<f32> scratch
 	{
 		gpt::scratch, in.size()
 	};
 	for(uint i(0); i < in.size(); ++i)
 		scratch[i] = in[i];
 	transform(data(scratch), toks, *device);
 	// tail() takes a fixed-size array, so copy out the last token's state.
 	static float
 	buf alignas(64) [768];
 	for(uint i(0); i < 768; ++i)
 		buf[i] = scratch[(toks - 1) * 768 + i];
 	tail(logit, buf, *device);
 	return argmax(logit);
 }
 /// Index of the largest of the first vocab::tokens logits. On ties the
 /// lowest index wins because only a strictly greater value replaces the
 /// current best.
 uint16_t
 ircd::gpt::argmax(const float *const __restrict__ logit)
 {
 	u16 best(0);
 	uint idx(0);
 	while(idx < vocab::tokens)
 	{
 		const bool greater
 		{
 			logit[idx] > logit[best]
 		};
 		if(greater)
 			best = idx;
 		++idx;
 	}
 	return best;
 }
 /// Output head: apply the final layer normalization to one token state and
 /// score it against every token row.
 ///
 /// @param logit  receives one score per token id
 /// @param state  768-wide state of the token to score
 /// @param d      decoder supplying the final norm and the token table
 [[gnu::noinline]]
 void
 ircd::gpt::tail(float *const __restrict__ logit,
                 const float (&__restrict__ state)[768],
                 const model::decoder &d)
 {
 	static float
 	normed alignas(64) [768];
 	const auto &lnf
 	{
 		d.f
 	};
 	norm(normed, state, lnf.bias, lnf.weight, model::lnf_epsilon);
 	logits(logit, normed, d);
 	// The softmax over the logits is intentionally left disabled:
 	//logitsmax(logit, logit);
 }
 /// Score a normalized state against every token row: out[t] is the dot
 /// product of `in` with d.wte[t], for t below vocab::tokens.
 void
 ircd::gpt::logits(float *const __restrict__ out,
                   const float (&__restrict__ in)[768],
                   const model::decoder &d)
 {
 	for(uint tok(0); tok < vocab::tokens; ++tok)
 	{
 		const auto &row
 		{
 			d.wte[tok]
 		};
 		float acc(0);
 		for(uint c(0); c < 768; ++c)
 			acc += in[c] * row[c];
 		out[tok] = acc;
 	}
 }
 /// Softmax over the first vocab::tokens logits.
 ///
 /// @param out  receives exp(in[j]) / sum of all exp(in[k])
 /// @param in   input logits; may be the same buffer as `out`, because all
 ///             exponentials are staged before `out` is written
 ///
 /// The normalizer is accumulated once. The previous form rebuilt the same
 /// sum separately for every output slot, which cost tokens * tokens
 /// additions for an identical result.
 void
 ircd::gpt::logitsmax(float *const out,
                      const float *const in)
 {
 	static float
 	exps alignas(64) [65536];
 	for(uint j(0); j < vocab::tokens; ++j)
 		exps[j] = exp(in[j]);
 	float sum(0);
 	for(uint j(0); j < vocab::tokens; ++j)
 		sum += exps[j];
 	for(uint j(0); j < vocab::tokens; ++j)
 		out[j] = exps[j] / sum;
 }
 /// Run all twelve decoder layers over the token states in place.
 ///
 /// @param accum    tokens * 768 floats; read as the input state and updated
 ///                 with the output of every layer
 /// @param tokens   number of tokens held in `accum`
 /// @param decoder  model parameters
 ///
 /// The working tensors are static, so they are shared by every call.
 [[gnu::noinline]]
 void
 ircd::gpt::transform(float *__restrict__ accum,
                      const size_t tokens,
                      const model::decoder &decoder)
 {
 	static float
 	qkv alignas(64) [3][1024][12][64],
 	state alignas(64) [12][1024][1024],
 	attns alignas(64) [12][1024][64],
 	buf alignas(64) [768];
 	for(uint i(0); i < 12; ++i)
 	{
 		const auto &layer
 		{
 			decoder.layer[i]
 		};
 		// Attention: project to query/key/value, score every token pair,
 		// mask, normalize each row, then mix the values.
 		ctrl(qkv, accum, tokens, layer);
 		pare(state, qkv, tokens);
 		mask(state, state, layer.attn.bias, tokens);
 		smax(state, state, tokens);
 		vals(attns, state, qkv, tokens);
 		// This overload adds the projected attention output into accum.
 		fmma(accum, attns, layer.attn.proj_bias, layer.attn.proj_weight, tokens);
 		// Feed-forward per token; its result is added onto the state.
 		for(uint j(0); j < tokens; ++j)
 		{
 			for(uint k(0); k < 768; ++k)
 				buf[k] = accum[j * 768 + k];
 			ffnn(buf, buf, layer);
 			for(uint k(0); k < 768; ++k)
 				accum[j * 768 + k] += buf[k];
 		}
 	}
 }
 /// Feed-forward sub-layer for one token: layer norm (ln2), dense
 /// 768 -> 3072, gelu, dense 3072 -> 768.
 ///
 /// @param out    receives the 768-wide result
 /// @param in     768-wide input state; may be the same array as `out`
 /// @param layer  block supplying ln2 and the feed-forward parameters
 ///
 /// `out` and `in` are deliberately not declared __restrict__: the caller in
 /// transform() passes the same buffer for both, which a restrict
 /// qualification would make undefined.
 void
 ircd::gpt::ffnn(float (&out)[768],
                 const float (&in)[768],
                 const model::block &layer)
 {
 	static float
 	proj alignas(64) [3072];
 	norm(out, in, layer.ln2.bias, layer.ln2.weight, model::ln2_epsilon);
 	fmma(proj, out, layer.ffnn.fc_bias, layer.ffnn.fc_weight);
 	gelu(proj, proj);
 	fmma(out, proj, layer.ffnn.proj_bias, layer.ffnn.proj_weight);
 }
 /// Compute query, key and value vectors for every token.
 ///
 /// Each token state is normalized with ln1 and projected to 2304 values;
 /// the three consecutive 768-wide thirds become out[0] (query), out[1]
 /// (key) and out[2] (value), each laid out as [token][head][64].
 ///
 /// @param out    destination tensor
 /// @param in     num * 768 input floats
 /// @param num    number of tokens
 /// @param layer  block supplying ln1 and the attention projection
 void
 ircd::gpt::ctrl(float (&__restrict__ out)[3][1024][12][64],
                 const float *const __restrict__ in,
                 const size_t num,
                 const model::block &layer)
 {
 	static float
 	row alignas(64) [768],
 	fused alignas(64) [2304];
 	for(uint tok(0); tok < num; ++tok)
 	{
 		const float *const src
 		{
 			in + tok * 768
 		};
 		for(uint c(0); c < 768; ++c)
 			row[c] = src[c];
 		norm(row, row, layer.ln1.bias, layer.ln1.weight, model::ln1_epsilon);
 		fmma(fused, row, layer.attn.attn_bias, layer.attn.attn_weight);
 		// part 0 = query, 1 = key, 2 = value
 		for(uint part(0); part < 3; ++part)
 		{
 			#pragma clang loop unroll (disable)
 			for(uint head(0); head < 12; ++head)
 				for(uint c(0); c < 64; ++c)
 					out[part][tok][head][c] = fused[768 * part + head * 64 + c];
 		}
 	}
 }
 /// Score every query against every key, per head.
 ///
 /// out[head][k][l] = dot(query[k][head], key[l][head]) / 8, for k and l
 /// below `num`. The divisor 8 is the square root of the 64-wide head.
 ///
 /// @param out  receives the scores; entries at or beyond `num` are untouched
 /// @param qkv  query at [0], key at [1]; the value part at [2] is not read
 /// @param num  number of tokens
 ///
 /// The dot product is accumulated in a local and stored once, replacing
 /// separate zeroing, accumulating and scaling sweeps over the tensor.
 void
 ircd::gpt::pare(float (&__restrict__ out)[12][1024][1024],
                 const float (&__restrict__ qkv)[3][1024][12][64],
                 const size_t num)
 {
 	const float
 	(&__restrict__ qry)[1024][12][64] { qkv[0] },
 	(&__restrict__ key)[1024][12][64] { qkv[1] };
 	#pragma clang loop unroll (disable)
 	for(uint j(0); j < 12; ++j)
 		for(uint k(0); k < num; ++k)
 			for(uint l(0); l < num; ++l)
 			{
 				float dot(0);
 				for(uint m(0); m < 64; ++m)
 					dot += qry[k][j][m] * key[l][j][m];
 				out[j][k][l] = dot / 8.0;
 			}
 }
 void
 ircd::gpt::vals(float (&__restrict__ out)[12][1024][64],
                const float (&__restrict__ in)[12][1024][1024],
                const float (&__restrict__ qkv)[3][1024][12][64],
                const size_t num)
 {
 	const float
 	(&__restrict__ val)[1024][12][64] { qkv[2] };
 	#pragma clang loop unroll (disable)
 	for(uint j(0); j < 12; ++j)
 		for(uint k(0); k < num; ++k)
 			for(uint l(0); l < 64; ++l)
 				out[j][k][l] = 0;
 	#pragma clang loop unroll (disable)
 	for(uint j(0); j < 12; ++j)
 		for(uint k(0); k < num; ++k)
 			for(uint l(0); l < num; ++l)
 				for(uint m(0); m < 64; ++m)
 					out[j][k][m] += in[j][k][l] * val[l][j][m];
 }
 /// Row-wise softmax of the attention scores, per head:
 /// out[j][k][l] = exp(in[j][k][l]) / sum over m of exp(in[j][k][m]).
 ///
 /// @param out  receives the normalized weights
 /// @param in   scores; may be the same tensor as `out` (transform() calls
 ///             it that way), so neither is declared __restrict__
 /// @param num  number of tokens; only the leading num x num part is used
 ///
 /// Each row's normalizer is accumulated once while the exponentials are
 /// written. The previous form recomputed the same sum for every element of
 /// the row and staged the exponentials in a separate static tensor the size
 /// of `out`.
 ///
 /// NOTE(review): no row maximum is subtracted before exp(), so large
 /// positive scores can overflow; behavior kept as in the original.
 void
 ircd::gpt::smax(float (&out)[12][1024][1024],
                 const float (&in)[12][1024][1024],
                 const size_t num)
 {
 	#pragma clang loop unroll (disable)
 	for(uint j(0); j < 12; ++j)
 		for(uint k(0); k < num; ++k)
 		{
 			float sum(0);
 			for(uint l(0); l < num; ++l)
 			{
 				out[j][k][l] = exp(in[j][k][l]);
 				sum += out[j][k][l];
 			}
 			for(uint l(0); l < num; ++l)
 				out[j][k][l] /= sum;
 		}
 }
 /// Apply the attention mask, per head: where bias[k][l] is true the score
 /// is kept, otherwise it is replaced with -10000 so it vanishes in smax().
 ///
 /// @param out   receives the masked scores
 /// @param in    scores; may be the same tensor as `out` (transform() calls
 ///              it that way), so neither is declared __restrict__
 /// @param bias  keep/replace table indexed [query][key]
 /// @param num   number of tokens; only the leading num x num part is used
 void
 ircd::gpt::mask(float (&out)[12][1024][1024],
                 const float (&in)[12][1024][1024],
                 const bool (&__restrict__ bias)[1024][1024],
                 const size_t num)
 {
 	static const float masked
 	{
 		-10000.0
 	};
 	#pragma clang loop unroll (disable)
 	for(uint j(0); j < 12; ++j)
 		for(uint k(0); k < num; ++k)
 			for(uint l(0); l < num; ++l)
 				out[j][k][l] = bias[k][l]? in[j][k][l]: masked;
 }
 void
 ircd::gpt::norm(float (&__restrict__ out)[768],
                const float (&__restrict__ in)[768],
                const float (&__restrict__ bias)[768],
                const float (&__restrict__ weight)[768],
                const float epsilon)
 {
 	static float
 	tmp alignas(64) [768];
 	const float mean
 	{
 		math::mean<float>(in)
 	};
 	for(uint j(0); j < 768; ++j)
 		tmp[j] = pow(in[j] - mean, 2);
 	const float s
 	{
 		math::mean<float>(tmp)
 	};
 	for(uint j(0); j < 768; ++j)
 		out[j] = (in[j] - mean) / sqrt(s + epsilon),
 		out[j] = out[j] * weight[j] + bias[j];
 }
 /// Attention output projection for all tokens, added into `out`.
 ///
 /// The per-head vectors attn[head][token][64] are first gathered into one
 /// 768-wide row per token, multiplied by `weight` with `bias` as the
 /// starting value, and the result is added to out[token * 768 ...].
 ///
 /// @param out     num * 768 floats, updated in place by addition
 /// @param attn    per-head attention output
 /// @param bias    projection bias
 /// @param weight  projection matrix, indexed [input][output]
 /// @param num     number of tokens
 ///
 /// The product iterates the input index in the outer loop and the output
 /// index in the inner loop, like the other fmma() overloads, so `weight`
 /// is walked along its rows. Every b[i][k] still receives its terms in
 /// ascending input order, so the sums are unchanged.
 void
 ircd::gpt::fmma(float *const __restrict__ out,
                 const float (&__restrict__ attn)[12][1024][64],
                 const float (&__restrict__ bias)[768],
                 const float (&__restrict__ weight)[768][768],
                 const size_t num)
 {
 	static float
 	a alignas(64) [1024][768],
 	b alignas(64) [1024][768];
 	// Gather the heads of each token into one contiguous row.
 	for(uint k(0); k < 12; k++)
 		for(uint j(0); j < num; j++)
 			for(uint l(0); l < 64; l++)
 				a[j][k * 64 + l] = attn[k][j][l];
 	for(uint i(0); i < num; i++)
 		for(uint j(0); j < 768; j++)
 			b[i][j] = bias[j];
 	for(uint i(0); i < num; i++)
 		for(uint j(0); j < 768; j++)
 			for(uint k(0); k < 768; k++)
 				b[i][k] += a[i][j] * weight[j][k];
 	for(uint i(0); i < num; i++)
 		for(uint j(0); j < 768; j++)
 			out[i * 768 + j] += b[i][j];
 }
 void
 ircd::gpt::fmma(float (&__restrict__ out)[2304],
                const float (&__restrict__ in)[768],
                const float (&__restrict__ bias)[2304],
                const float (&__restrict__ weight)[768][2304])
 {
 	for(uint j(0); j < 2304; ++j)
 		out[j] = bias[j];
 	for(uint k(0); k < 768; ++k)
 		for(uint j(0); j < 2304; ++j)
 			out[j] += in[k] * weight[k][j];
 }
 void
 ircd::gpt::fmma(float (&__restrict__ out)[768],
                const float (&__restrict__ in)[3072],
                const float (&__restrict__ bias)[768],
                const float (&__restrict__ weight)[3072][768])
 {
 	for(uint j(0); j < 768; ++j)
 		out[j] = bias[j];
 	for(uint k(0); k < 3072; k++)
 		for(uint j(0); j < 768; j++)
 			out[j] += in[k] * weight[k][j];
 }
 void
 ircd::gpt::fmma(float (&__restrict__ out)[3072],
                const float (&__restrict__ in)[768],
                const float (&__restrict__ bias)[3072],
                const float (&__restrict__ weight)[768][3072])
 {
 	for(uint j(0); j < 3072; ++j)
 		out[j] = bias[j];
 	for(uint k(0); k < 768; ++k)
 		for(uint j(0); j < 3072; ++j)
 			out[j] += in[k] * weight[k][j];
 }
 /// Apply the scalar gelu() to each of the 3072 elements.
 ///
 /// @param out  receives the activations
 /// @param in   inputs; may be the same array as `out` (ffnn() calls it in
 ///             place), so neither is declared __restrict__
 void
 ircd::gpt::gelu(float (&out)[3072],
                 const float (&in)[3072])
 {
 	for(uint j(0); j < 3072; ++j)
 		gelu(out[j], in[j]);
 }
 /// GELU activation in its tanh form:
 /// out = 0.5 * x * (1 + tanh(0.7978845608 * x * (1 + 0.044715 * x * x))).
 /// The arithmetic is carried out in double and stored as float.
 void
 ircd::gpt::gelu(float &out,
                 const float &in)
 {
 	const double x
 	{
 		in
 	};
 	const double inner
 	{
 		x * 0.7978845608 * (1.0 + 0.044715 * x * x)
 	};
 	out = 0.5 * x * (1.0 + tanh(inner));
 }
--- a/ircd/gpt_model.cc
+++ b/ircd/gpt_model.cc
@ -0,0 +1,484 @@
 // Tensor Construct
 //
 // Copyright (C) Matrix Construct Developers, Authors & Contributors
 // Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
 //
 // Permission to use, copy, modify, and/or distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
 // copyright notice and this permission notice is present in all copies. The
 // full license for this software is available in the LICENSE file.
 /// Model loading: one handler per tensor file plus the tables that map
 /// file names to handlers.
 namespace ircd::gpt::model
 {
 	// Every handler shares one signature: (decoder to fill, file name,
 	// layer index, parsed JSON array). init() is the conf item callback.
 	static void
 	init_f_weight(decoder &, const string_view &, const size_t &, const json::array &),
 	init_f_bias(decoder &, const string_view &, const size_t &, const json::array &),
 	init_wpe_weight(decoder &, const string_view &, const size_t &, const json::array &),
 	init_wte_weight(decoder &, const string_view &, const size_t &, const json::array &),
 	init_h_ffnn_fc_weight(decoder &, const string_view &, const size_t &, const json::array &),
 	init_h_ffnn_fc_bias(decoder &, const string_view &, const size_t &, const json::array &),
 	init_h_ffnn_proj_weight(decoder &, const string_view &, const size_t &, const json::array &),
 	init_h_ffnn_proj_bias(decoder &, const string_view &, const size_t &, const json::array &),
 	init_h_ln_1_weight(decoder &, const string_view &, const size_t &, const json::array &),
 	init_h_ln_1_bias(decoder &, const string_view &, const size_t &, const json::array &),
 	init_h_ln_2_weight(decoder &, const string_view &, const size_t &, const json::array &),
 	init_h_ln_2_bias(decoder &, const string_view &, const size_t &, const json::array &),
 	init_h_attn_attn_weight(decoder &, const string_view &, const size_t &, const json::array &),
 	init_h_attn_attn_bias(decoder &, const string_view &, const size_t &, const json::array &),
 	init_h_attn_proj_weight(decoder &, const string_view &, const size_t &, const json::array &),
 	init_h_attn_proj_bias(decoder &, const string_view &, const size_t &, const json::array &),
 	init_h_attn_bias(decoder &, const string_view &, const size_t &, const json::array &),
 	init() noexcept;
 	// Directory containing the tensor files; loading is skipped when unset.
 	extern conf::item<std::string> path;
 	// (file name format, handler) tables: model-wide tensors, per-layer
 	// tensors (the format takes the layer index), and data set files.
 	extern const std::pair
 	<
 		string_view,
 		void (*)(decoder &, const string_view &, const size_t &, const json::array &)
 	>
 	manifest[],
 	manifest_h[],
 	manifest_td[];
 }
 /// Per-layer tensor files. The %u in each name is replaced with the layer
 /// index by init(), which iterates all thirteen entries for every layer.
 decltype(ircd::gpt::model::manifest_h)
 ircd::gpt::model::manifest_h
 {
 	{ "h.%u.mlp.c_fc.weight.json",        init_h_ffnn_fc_weight,    },
 	{ "h.%u.mlp.c_fc.bias.json",          init_h_ffnn_fc_bias,      },
 	{ "h.%u.mlp.c_proj.weight.json",      init_h_ffnn_proj_weight,  },
 	{ "h.%u.mlp.c_proj.bias.json",        init_h_ffnn_proj_bias,    },
 	{ "h.%u.ln_1.weight.json",            init_h_ln_1_weight,       },
 	{ "h.%u.ln_1.bias.json",              init_h_ln_1_bias,         },
 	{ "h.%u.ln_2.weight.json",            init_h_ln_2_weight,       },
 	{ "h.%u.ln_2.bias.json",              init_h_ln_2_bias,         },
 	{ "h.%u.attn.c_attn.weight.json",     init_h_attn_attn_weight,  },
 	{ "h.%u.attn.c_attn.bias.json",       init_h_attn_attn_bias,    },
 	{ "h.%u.attn.c_proj.weight.json",     init_h_attn_proj_weight,  },
 	{ "h.%u.attn.c_proj.bias.json",       init_h_attn_proj_bias     },
 	{ "h.%u.attn.bias.json",              init_h_attn_bias,         },
 };
 /// Model-wide tensor files: the final layer norm and the two embedding
 /// tables. init() loads these four entries by index.
 decltype(ircd::gpt::model::manifest)
 ircd::gpt::model::manifest
 {
 	{ "ln_f.weight.json",   init_f_weight,  },
 	{ "ln_f.bias.json",     init_f_bias,    },
 	{ "wpe.weight.json",    init_wpe_weight },
 	{ "wte.weight.json",    init_wte_weight },
 };
 /// Data set file names. No handler is attached, and nothing in the code
 /// visible here reads this table.
 decltype(ircd::gpt::model::manifest_td)
 ircd::gpt::model::manifest_td
 {
 	{ "test.jsonl",    nullptr,  },
 	{ "valid.jsonl",   nullptr,  },
 	{ "train.jsonl",   nullptr,  },
 };
 /// Configuration item "ircd.gpt.model.path": directory holding the tensor
 /// files. Empty by default; init is registered as the item's callback.
 decltype(ircd::gpt::model::path)
 ircd::gpt::model::path
 {
 	{
 		{ "name",     "ircd.gpt.model.path" },
 		{ "default",  string_view{}         },
 	},
 	init
 };
 //TODO: XXX
 // Declaration of the decoder instance defined in gpt.cc. The type must
 // match that definition exactly; it is a non-const unique_ptr there, so it
 // is declared without const here as well.
 namespace ircd::gpt
 {
 	extern std::unique_ptr<model::decoder> device;
 }
 /// Populate the decoder, either from a cached raw image or from the JSON
 /// tensor files under model::path. Does nothing when the path is unset.
 ///
 /// If a file named "model" exists it is read directly over the decoder
 /// struct. Otherwise the decoder is zeroed, every tensor file is parsed
 /// through its handler, and the finished struct is written to "model".
 ///
 /// NOTE(review): the function is noexcept, so any exception from the file,
 /// JSON or conversion calls below terminates the process.
 void
 ircd::gpt::model::init()
 noexcept
 {
 	if(!model::path)
 		return;
 	const size_t layers
 	{
 		12
 	};
 	// Load one tensor file: a is a manifest table, b the entry index and i
 	// the layer index substituted into the file name format.
 	const auto handle{[]
 	(const auto &a, const auto &b, const auto &i)
 	{
 		const auto &[fmt, handler]
 		{
 			a[b]
 		};
 		char namebuf[128] {0};
 		const string_view path_part[2]
 		{
 			model::path, fmt::sprintf
 			{
 				namebuf, fmt, i
 			}
 		};
 		const fs::fd fd
 		{
 			fs::path(fs::path_scratch, path_part)
 		};
 		fs::map::opts map_opts;
 		const fs::map map
 		{
 			fd, map_opts
 		};
 		// The mapped file contents are handed to the handler as one JSON array.
 		const json::array mat
 		{
 			map
 		};
 		assert(gpt::device);
 		handler(*gpt::device, path_part[1], i, mat);
 		log::logf
 		{
 			log, log::level::DEBUG,
 			"Model init [%2d][%2d] :%s",
 			i,
 			b,
 			path_part[1],
 		};
 	}};
 	ircd::timer sw;
 	size_t read(0), wrote(0);
 	// NOTE(review): "model" is a bare relative name, so the cache location
 	// presumably depends on the working directory -- confirm.
 	if(fs::exists("model"))
 	{
 		// NOTE(review): the amount read is not compared with
 		// sizeof(model::decoder); a short or stale cache file would leave the
 		// decoder partially filled without any error.
 		const auto _read
 		{
 			fs::read(fs::fd{"model"}, mutable_buffer{(char *)(gpt::device.get()), sizeof(model::decoder)})
 		};
 		read = size(_read);
 	} else {
 		memset(device.get(),  0x0, sizeof(model::decoder));
 		// The four model-wide tensors; the layer argument is unused by them.
 		handle(manifest, 0, 0);
 		handle(manifest, 1, 0);
 		handle(manifest, 2, 0);
 		handle(manifest, 3, 0);
 		// 13 is the number of entries in manifest_h.
 		for(size_t i(0); i < layers; ++i)
 			for(size_t j(0); j < 13; ++j)
 				handle(manifest_h, j, i);
 		const auto _wrote
 		{
 			fs::write("model", const_buffer{(const char *)(gpt::device.get()), sizeof(model::decoder)})
 		};
 		wrote = size(_wrote);
 	}
 	char pbuf[3][48];
 	log::logf
 	{
 		log, log::level::DEBUG,
 		"Model init completed in %s read %s wrote %s",
 		sw.pretty(pbuf[0]),
 		pretty(pbuf[1], iec(size(read))),
 		pretty(pbuf[2], iec(size(wrote))),
 	};
 }
 /// Fill the position embedding table `wpe` from a JSON matrix.
 ///
 /// @param d      decoder to fill
 /// @param name   tensor file name (not used)
 /// @param layer  layer index (not used)
 /// @param mat    array of rows; every row must hold exactly 768 numbers
 ///
 /// Row and column indices are checked before each store, so an oversized
 /// file trips the assertion instead of writing outside the table.
 void
 ircd::gpt::model::init_wpe_weight(decoder &d,
                                   const string_view &name,
                                   const size_t &layer,
                                   const json::array &mat)
 {
 	const size_t rows { sizeof(d.wpe) / sizeof(d.wpe[0]) };
 	const size_t cols { sizeof(d.wpe[0]) / sizeof(float) };
 	size_t i(0);
 	for(const json::array vec : mat)
 	{
 		always_assert(i < rows);
 		size_t j(0);
 		for(const auto &elem : vec)
 		{
 			always_assert(j < cols);
 			d.wpe[i][j++] = lex_cast<float>(elem);
 		}
 		always_assert(j == cols);
 		++i;
 	}
 }
 /// Fill the token embedding table `wte` from a JSON matrix.
 ///
 /// @param d      decoder to fill
 /// @param name   tensor file name (not used)
 /// @param layer  layer index (not used)
 /// @param mat    array of rows; every row must hold exactly 768 numbers.
 ///               The file may hold fewer rows than the table; the rest keep
 ///               their prior contents.
 ///
 /// Row and column indices are checked before each store, so an oversized
 /// file trips the assertion instead of writing outside the table.
 void
 ircd::gpt::model::init_wte_weight(decoder &d,
                                   const string_view &name,
                                   const size_t &layer,
                                   const json::array &mat)
 {
 	const size_t rows { sizeof(d.wte) / sizeof(d.wte[0]) };
 	const size_t cols { sizeof(d.wte[0]) / sizeof(float) };
 	size_t i(0);
 	for(const json::array vec : mat)
 	{
 		always_assert(i < rows);
 		size_t j(0);
 		for(const auto &elem : vec)
 		{
 			always_assert(j < cols);
 			d.wte[i][j++] = lex_cast<float>(elem);
 		}
 		always_assert(j == cols);
 		++i;
 	}
 }
 /// Fill the final layer norm weights from a JSON vector of exactly 768
 /// numbers. `name` and `layer` are not used. The index is checked before
 /// each store so an oversized vector cannot write out of range.
 void
 ircd::gpt::model::init_f_weight(decoder &d,
                                 const string_view &name,
                                 const size_t &layer,
                                 const json::array &vec)
 {
 	auto &dst { d.f.weight };
 	const size_t len { sizeof(dst) / sizeof(float) };
 	size_t i(0);
 	for(const auto &elem : vec)
 	{
 		always_assert(i < len);
 		dst[i++] = lex_cast<float>(elem);
 	}
 	always_assert(i == len);
 }
 /// Fill the final layer norm bias from a JSON vector of exactly 768
 /// numbers. `name` and `layer` are not used. The index is checked before
 /// each store so an oversized vector cannot write out of range.
 void
 ircd::gpt::model::init_f_bias(decoder &d,
                               const string_view &name,
                               const size_t &layer,
                               const json::array &vec)
 {
 	auto &dst { d.f.bias };
 	const size_t len { sizeof(dst) / sizeof(float) };
 	size_t i(0);
 	for(const auto &elem : vec)
 	{
 		always_assert(i < len);
 		dst[i++] = lex_cast<float>(elem);
 	}
 	always_assert(i == len);
 }
 /// Fill a layer's feed-forward fc weights from a JSON matrix of exactly
 /// 768 rows by 3072 columns.
 ///
 /// @param d      decoder to fill
 /// @param name   tensor file name (not used)
 /// @param layer  index of the block to fill; must be a valid layer
 /// @param mat    array of row arrays
 ///
 /// Layer, row and column indices are checked before each store, so a bad
 /// index or oversized file asserts instead of writing out of range.
 void
 ircd::gpt::model::init_h_ffnn_fc_weight(decoder &d,
                                         const string_view &name,
                                         const size_t &layer,
                                         const json::array &mat)
 {
 	always_assert(layer < sizeof(d.layer) / sizeof(d.layer[0]));
 	auto &dst { d.layer[layer].ffnn.fc_weight };
 	const size_t rows { sizeof(dst) / sizeof(dst[0]) };
 	const size_t cols { sizeof(dst[0]) / sizeof(float) };
 	size_t i(0);
 	for(const json::array vec : mat)
 	{
 		always_assert(i < rows);
 		size_t j(0);
 		for(const auto &elem : vec)
 		{
 			always_assert(j < cols);
 			dst[i][j++] = lex_cast<float>(elem);
 		}
 		always_assert(j == cols);
 		++i;
 	}
 	always_assert(i == rows);
 }
 /// Fill a layer's feed-forward fc bias from a JSON vector of exactly 3072
 /// numbers. `name` is not used. The layer and element indices are checked
 /// before each store so bad input asserts instead of writing out of range.
 void
 ircd::gpt::model::init_h_ffnn_fc_bias(decoder &d,
                                       const string_view &name,
                                       const size_t &layer,
                                       const json::array &vec)
 {
 	always_assert(layer < sizeof(d.layer) / sizeof(d.layer[0]));
 	auto &dst { d.layer[layer].ffnn.fc_bias };
 	const size_t len { sizeof(dst) / sizeof(float) };
 	size_t i(0);
 	for(const auto &elem : vec)
 	{
 		always_assert(i < len);
 		dst[i++] = lex_cast<float>(elem);
 	}
 	always_assert(i == len);
 }
 /// Fill a layer's feed-forward projection weights from a JSON matrix of
 /// exactly 3072 rows by 768 columns.
 ///
 /// @param d      decoder to fill
 /// @param name   tensor file name (not used)
 /// @param layer  index of the block to fill; must be a valid layer
 /// @param mat    array of row arrays
 ///
 /// Layer, row and column indices are checked before each store, so a bad
 /// index or oversized file asserts instead of writing out of range.
 void
 ircd::gpt::model::init_h_ffnn_proj_weight(decoder &d,
                                           const string_view &name,
                                           const size_t &layer,
                                           const json::array &mat)
 {
 	always_assert(layer < sizeof(d.layer) / sizeof(d.layer[0]));
 	auto &dst { d.layer[layer].ffnn.proj_weight };
 	const size_t rows { sizeof(dst) / sizeof(dst[0]) };
 	const size_t cols { sizeof(dst[0]) / sizeof(float) };
 	size_t i(0);
 	for(const json::array vec : mat)
 	{
 		always_assert(i < rows);
 		size_t j(0);
 		for(const auto &elem : vec)
 		{
 			always_assert(j < cols);
 			dst[i][j++] = lex_cast<float>(elem);
 		}
 		always_assert(j == cols);
 		++i;
 	}
 	always_assert(i == rows);
 }
 /// Fill a layer's feed-forward projection bias from a JSON vector of
 /// exactly 768 numbers. `name` is not used. The layer and element indices
 /// are checked before each store so bad input asserts instead of writing
 /// out of range.
 void
 ircd::gpt::model::init_h_ffnn_proj_bias(decoder &d,
                                         const string_view &name,
                                         const size_t &layer,
                                         const json::array &vec)
 {
 	always_assert(layer < sizeof(d.layer) / sizeof(d.layer[0]));
 	auto &dst { d.layer[layer].ffnn.proj_bias };
 	const size_t len { sizeof(dst) / sizeof(float) };
 	size_t i(0);
 	for(const auto &elem : vec)
 	{
 		always_assert(i < len);
 		dst[i++] = lex_cast<float>(elem);
 	}
 	always_assert(i == len);
 }
 /// Fill a layer's first layer norm weights from a JSON vector of exactly
 /// 768 numbers. `name` is not used. The layer and element indices are
 /// checked before each store so bad input asserts instead of writing out
 /// of range.
 void
 ircd::gpt::model::init_h_ln_1_weight(decoder &d,
                                      const string_view &name,
                                      const size_t &layer,
                                      const json::array &vec)
 {
 	always_assert(layer < sizeof(d.layer) / sizeof(d.layer[0]));
 	auto &dst { d.layer[layer].ln1.weight };
 	const size_t len { sizeof(dst) / sizeof(float) };
 	size_t i(0);
 	for(const auto &elem : vec)
 	{
 		always_assert(i < len);
 		dst[i++] = lex_cast<float>(elem);
 	}
 	always_assert(i == len);
 }
 /// Fill a layer's first layer norm bias from a JSON vector of exactly 768
 /// numbers. `name` is not used. The layer and element indices are checked
 /// before each store so bad input asserts instead of writing out of range.
 void
 ircd::gpt::model::init_h_ln_1_bias(decoder &d,
                                    const string_view &name,
                                    const size_t &layer,
                                    const json::array &vec)
 {
 	always_assert(layer < sizeof(d.layer) / sizeof(d.layer[0]));
 	auto &dst { d.layer[layer].ln1.bias };
 	const size_t len { sizeof(dst) / sizeof(float) };
 	size_t i(0);
 	for(const auto &elem : vec)
 	{
 		always_assert(i < len);
 		dst[i++] = lex_cast<float>(elem);
 	}
 	always_assert(i == len);
 }
 /// Fill a layer's second layer norm weights from a JSON vector of exactly
 /// 768 numbers. `name` is not used. The layer and element indices are
 /// checked before each store so bad input asserts instead of writing out
 /// of range.
 void
 ircd::gpt::model::init_h_ln_2_weight(decoder &d,
                                      const string_view &name,
                                      const size_t &layer,
                                      const json::array &vec)
 {
 	always_assert(layer < sizeof(d.layer) / sizeof(d.layer[0]));
 	auto &dst { d.layer[layer].ln2.weight };
 	const size_t len { sizeof(dst) / sizeof(float) };
 	size_t i(0);
 	for(const auto &elem : vec)
 	{
 		always_assert(i < len);
 		dst[i++] = lex_cast<float>(elem);
 	}
 	always_assert(i == len);
 }
 /// Fill a layer's second layer norm bias from a JSON vector of exactly 768
 /// numbers. `name` is not used. The layer and element indices are checked
 /// before each store so bad input asserts instead of writing out of range.
 void
 ircd::gpt::model::init_h_ln_2_bias(decoder &d,
                                    const string_view &name,
                                    const size_t &layer,
                                    const json::array &vec)
 {
 	always_assert(layer < sizeof(d.layer) / sizeof(d.layer[0]));
 	auto &dst { d.layer[layer].ln2.bias };
 	const size_t len { sizeof(dst) / sizeof(float) };
 	size_t i(0);
 	for(const auto &elem : vec)
 	{
 		always_assert(i < len);
 		dst[i++] = lex_cast<float>(elem);
 	}
 	always_assert(i == len);
 }
 /// Fill a layer's attention input projection weights from a JSON matrix of
 /// exactly 768 rows by 2304 columns.
 ///
 /// @param d      decoder to fill
 /// @param name   tensor file name (not used)
 /// @param layer  index of the block to fill; must be a valid layer
 /// @param mat    array of row arrays
 ///
 /// Layer, row and column indices are checked before each store, so a bad
 /// index or oversized file asserts instead of writing out of range.
 void
 ircd::gpt::model::init_h_attn_attn_weight(decoder &d,
                                           const string_view &name,
                                           const size_t &layer,
                                           const json::array &mat)
 {
 	always_assert(layer < sizeof(d.layer) / sizeof(d.layer[0]));
 	auto &dst { d.layer[layer].attn.attn_weight };
 	const size_t rows { sizeof(dst) / sizeof(dst[0]) };
 	const size_t cols { sizeof(dst[0]) / sizeof(float) };
 	size_t i(0);
 	for(const json::array vec : mat)
 	{
 		always_assert(i < rows);
 		size_t j(0);
 		for(const auto &elem : vec)
 		{
 			always_assert(j < cols);
 			dst[i][j++] = lex_cast<float>(elem);
 		}
 		always_assert(j == cols);
 		++i;
 	}
 	always_assert(i == rows);
 }
 /// Fill a layer's attention input projection bias from a JSON vector of
 /// exactly 2304 numbers. `name` is not used. The layer and element indices
 /// are checked before each store so bad input asserts instead of writing
 /// out of range.
 void
 ircd::gpt::model::init_h_attn_attn_bias(decoder &d,
                                         const string_view &name,
                                         const size_t &layer,
                                         const json::array &vec)
 {
 	always_assert(layer < sizeof(d.layer) / sizeof(d.layer[0]));
 	auto &dst { d.layer[layer].attn.attn_bias };
 	const size_t len { sizeof(dst) / sizeof(float) };
 	size_t i(0);
 	for(const auto &elem : vec)
 	{
 		always_assert(i < len);
 		dst[i++] = lex_cast<float>(elem);
 	}
 	always_assert(i == len);
 }
 /// Fill a layer's attention output projection weights from a JSON matrix
 /// of exactly 768 rows by 768 columns.
 ///
 /// @param d      decoder to fill
 /// @param name   tensor file name (not used)
 /// @param layer  index of the block to fill; must be a valid layer
 /// @param mat    array of row arrays
 ///
 /// Layer, row and column indices are checked before each store, so a bad
 /// index or oversized file asserts instead of writing out of range.
 void
 ircd::gpt::model::init_h_attn_proj_weight(decoder &d,
                                           const string_view &name,
                                           const size_t &layer,
                                           const json::array &mat)
 {
 	always_assert(layer < sizeof(d.layer) / sizeof(d.layer[0]));
 	auto &dst { d.layer[layer].attn.proj_weight };
 	const size_t rows { sizeof(dst) / sizeof(dst[0]) };
 	const size_t cols { sizeof(dst[0]) / sizeof(float) };
 	size_t i(0);
 	for(const json::array vec : mat)
 	{
 		always_assert(i < rows);
 		size_t j(0);
 		for(const auto &elem : vec)
 		{
 			always_assert(j < cols);
 			dst[i][j++] = lex_cast<float>(elem);
 		}
 		always_assert(j == cols);
 		++i;
 	}
 	always_assert(i == rows);
 }
 /// Fill a layer's attention output projection bias from a JSON vector of
 /// exactly 768 numbers. `name` is not used. The layer and element indices
 /// are checked before each store so bad input asserts instead of writing
 /// out of range.
 void
 ircd::gpt::model::init_h_attn_proj_bias(decoder &d,
                                         const string_view &name,
                                         const size_t &layer,
                                         const json::array &vec)
 {
 	always_assert(layer < sizeof(d.layer) / sizeof(d.layer[0]));
 	auto &dst { d.layer[layer].attn.proj_bias };
 	const size_t len { sizeof(dst) / sizeof(float) };
 	size_t i(0);
 	for(const auto &elem : vec)
 	{
 		always_assert(i < len);
 		dst[i++] = lex_cast<float>(elem);
 	}
 	always_assert(i == len);
 }
 /// Fill a layer's attention mask table from a four-level JSON array.
 ///
 /// The two outer levels are iterated without contributing an index; the
 /// third level supplies the row and the innermost level the column. Every
 /// element must be the text "1.0" or "0.0" and is stored as true when it
 /// starts with '1'. Each third-level array must have exactly 1024 rows.
 ///
 /// @param d      decoder to fill
 /// @param name   tensor file name (not used)
 /// @param layer  index of the block to fill; must be a valid layer
 /// @param mat    nested JSON array
 ///
 /// Layer, row and column indices are checked before each store, so a bad
 /// index or oversized file asserts instead of writing out of range.
 void
 ircd::gpt::model::init_h_attn_bias(decoder &d,
                                    const string_view &name,
                                    const size_t &layer,
                                    const json::array &mat)
 {
 	always_assert(layer < sizeof(d.layer) / sizeof(d.layer[0]));
 	auto &dst { d.layer[layer].attn.bias };
 	const size_t rows { sizeof(dst) / sizeof(dst[0]) };
 	const size_t cols { sizeof(dst[0]) / sizeof(dst[0][0]) };
 	for(const json::array dim0 : mat)
 	{
 		for(const json::array dim1 : dim0)
 		{
 			size_t k(0);
 			for(const json::array dim2 : dim1)
 			{
 				always_assert(k < rows);
 				size_t l(0);
 				for(const auto &elem : dim2)
 				{
 					always_assert(elem == "1.0" || elem == "0.0");
 					always_assert(l < cols);
 					dst[k][l++] = startswith(elem, '1');
 				}
 				++k;
 			}
 			always_assert(k == rows);
 		}
 	}
 }