ircd::gpt: More Matrix Than Matrix.

2024-12-26 07:23:53 +01:00 · 2021-03-04 17:03:33 -08:00 · 2021-03-04 17:03:33 -08:00 · 4458235dfa
commit 4458235dfa
parent 53c4260a21
5 changed files with 1085 additions and 0 deletions
--- a/include/ircd/gpt/gpt.h
+++ b/include/ircd/gpt/gpt.h
@ -16,9 +16,19 @@
 namespace ircd::gpt
 {
 	IRCD_EXCEPTION(ircd::error, error)
+
+	/// Run the decoder over an embedded sequence (768 floats per token) and
+	/// return the vocabulary index holding the greatest logit for the last
+	/// token.
+	u16 generate(const vector_view<const f32> &) noexcept;
+
+	/// Write the sum of the token and position embedding rows for each input
+	/// token into the first argument; returns a view over what was written.
+	vector_view<f32> embed(const vector_view<f32> &, const vector_view<const u16> &) noexcept;
+
+	/// Log facility for this subsystem.
+	extern log::log log;
 }

 #include "vocab.h"
+#include "model.h"

 namespace ircd::gpt
 {
--- a/include/ircd/gpt/model.h
+++ b/include/ircd/gpt/model.h
@ -0,0 +1,68 @@
+// Tensor Construct
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_GPT_MODEL_H
+
+namespace ircd::gpt::model
+{
+	struct attn;     // attention parameters of one block
+	struct ffnn;     // feed-forward parameters of one block
+	struct norm;     // layer normalization parameters
+	struct block;    // one transformer block
+	struct decoder;  // embeddings, all blocks and the final norm
+}
+
+/// Attention aperture
+///
+/// Member order and alignment define the in-memory image of the model and
+/// must not be rearranged.
+struct ircd::gpt::model::attn
+{
+	alignas(64) float attn_bias[2304];
+	alignas(64) float attn_weight[768][2304];
+	alignas(64) float proj_bias[768];
+	alignas(64) float proj_weight[768][768];
+
+	/// Attention mask: true where a score is kept, false where it is masked.
+	alignas(64) bool bias[1024][1024];
+};
+
+/// Feed-forward neural network
+///
+/// Member order and alignment define the in-memory image of the model and
+/// must not be rearranged.
+struct ircd::gpt::model::ffnn
+{
+	alignas(64) float fc_bias[3072];
+	alignas(64) float fc_weight[768][3072];
+	alignas(64) float proj_bias[768];
+	alignas(64) float proj_weight[3072][768];
+};
+
+/// Layer normalization
+///
+/// Member order and alignment define the in-memory image of the model and
+/// must not be rearranged.
+struct ircd::gpt::model::norm
+{
+	alignas(64) float bias[768];
+	alignas(64) float weight[768];
+};
+
+/// Transformer block
+///
+/// Member order defines the in-memory image of the model and must not be
+/// rearranged.
+struct ircd::gpt::model::block
+{
+	model::norm ln1;   // normalization applied ahead of attention
+	model::attn attn;
+	model::norm ln2;   // normalization applied ahead of the feed-forward net
+	model::ffnn ffnn;
+};
+
+/// Complete set of decoder parameters.
+///
+/// Member order and alignment define the in-memory image of the model and
+/// must not be rearranged.
+struct ircd::gpt::model::decoder
+{
+	alignas(64) float wpe[1024][768];    // position embedding rows
+	alignas(64) float wte[65536][768];   // token embedding rows
+	model::block layer[12];
+	model::norm f;                       // final normalization
+};
--- a/ircd/Makefile.am
+++ b/ircd/Makefile.am
@ -217,6 +217,8 @@ libircd_la_SOURCES += png.cc
 if OPENCL
 libircd_la_SOURCES += cl.cc
 endif
+libircd_la_SOURCES += gpt.cc
+libircd_la_SOURCES += gpt_model.cc
 libircd_la_SOURCES += gpt_vocab.cc
 libircd_la_SOURCES += openssl.cc
 libircd_la_SOURCES += rfc1459.cc
@ -254,6 +256,16 @@ ROCKSDB_SRC_CPPFLAGS =#
 ROCKSDB_SRC_CPPFLAGS += -isystem $(top_srcdir)/deps/rocksdb/include
 ROCKSDB_SRC_CPPFLAGS += -isystem $(top_srcdir)/deps/rocksdb

+GPT_FP_CXXFLAGS =#
+GPT_FP_CXXFLAGS += -fno-math-errno
+GPT_FP_CXXFLAGS += -fno-trapping-math
+GPT_FP_CXXFLAGS += -ffinite-math-only
+GPT_FP_CXXFLAGS += -fno-signed-zeros
+GPT_FP_CXXFLAGS += -fdenormal-fp-math=positive-zero
+GPT_FP_CXXFLAGS += -fassociative-math
+GPT_FP_CXXFLAGS += -ffp-contract=fast
+GPT_FP_CXXFLAGS += -freciprocal-math
+
 #
 # Specific unit option composition
 #
@ -282,6 +294,7 @@ endif
 if IOU
 fs_iou.lo:            AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
 endif
+gpt.lo:               AM_CXXFLAGS := ${AM_CXXFLAGS} ${GPT_FP_CXXFLAGS}
 http.lo:              AM_CPPFLAGS := ${SPIRIT_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
 http.lo:              AM_CXXFLAGS := ${SPIRIT_UNIT_CXXFLAGS} ${AM_CXXFLAGS}
 ios.lo:               AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
--- a/ircd/gpt.cc
+++ b/ircd/gpt.cc
@ -0,0 +1,510 @@
+// Matrix Construct Is All You Need Is All You Need Is AllĊĊĊĊĊĊĊĊ
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+/// Log facility for the gpt subsystem.
+decltype(ircd::gpt::log) ircd::gpt::log{"gpt"};
+
+namespace ircd::gpt
+{
+	// Elementwise and per-vector primitives
+	static void gelu(float &, const float &);
+	static void gelu(float (&)[3072], const float (&)[3072]);
+	static void norm(float (&)[768], const float (&)[768], const float (&)[768], const float (&)[768], const float);
+
+	// Dense projections (bias + input x weight)
+	static void fmma(float (&)[768], const float (&)[3072], const float (&)[768], const float (&)[3072][768]);
+	static void fmma(float (&)[3072], const float (&)[768], const float (&)[3072], const float (&)[768][3072]);
+	static void fmma(float (&)[2304], const float (&)[768], const float (&)[2304], const float (&)[768][2304]);
+	static void fmma(float *, const float (&)[12][1024][64], const float (&)[768], const float (&)[768][768], const size_t);
+
+	// Attention stages
+	static void ctrl(float (&)[3][1024][12][64], const float *const, const size_t, const model::block &);
+	static void pare(float (&)[12][1024][1024], const float (&)[3][1024][12][64], const size_t);
+	static void mask(float (&)[12][1024][1024], const float (&)[12][1024][1024], const bool (&)[1024][1024], const size_t);
+	static void smax(float (&)[12][1024][1024], const float (&)[12][1024][1024], const size_t);
+	static void vals(float (&)[12][1024][64], const float (&)[12][1024][1024], const float (&)[3][1024][12][64], const size_t);
+
+	// Block, stack and output head
+	static void ffnn(float (&)[768], const float (&)[768], const model::block &);
+	static void transform(float *, const size_t, const model::decoder &);
+	static void logits(float *, const float (&)[768], const model::decoder &);
+	static void logitsmax(float *, const float *);
+	static void tail(float *, const float (&)[768], const model::decoder &);
+	static u16 argmax(const float *);
+
+	/// The model parameters; allocated zeroed at static initialization.
+	std::unique_ptr<model::decoder> device
+	{
+		std::make_unique<model::decoder>()
+	};
+
+	/// Output logits, one per vocabulary slot.
+	static f32 logit alignas(64) [65536];
+
+	/// Working activations: up to 1024 tokens of 768 floats.
+	static f32 scratch alignas(64) [1024 * 768];
+}
+
+namespace ircd::gpt::model
+{
+	// Dropout probabilities.
+	constexpr float embed_pdrop = 0.1;
+	constexpr float attn_pdrop = 0.1;
+	constexpr float resid_pdrop = 0.1;
+
+	// Epsilon added to the variance by each layer normalization.
+	constexpr float ln1_epsilon = 0.00001;
+	constexpr float ln2_epsilon = 0.00001;
+	constexpr float lnf_epsilon = 0.00001;
+}
+
+/// Compose the decoder input for a sequence of tokens.
+///
+/// For the token at position i, the sum of its token embedding row and the
+/// i'th position embedding row is written to out[i * 768 .. i * 768 + 767].
+/// The number of positions processed is limited to the rows available in the
+/// position table and to the whole rows which fit in `out`; excess input is
+/// ignored rather than overrunning either buffer.
+///
+/// @param out destination; requires 768 floats per token.
+/// @param in token ids for consecutive positions starting at zero.
+/// @return view over the leading part of `out` which was written.
+ircd::vector_view<ircd::f32>
+ircd::gpt::embed(const vector_view<f32> &out,
+                 const vector_view<const u16> &in)
+noexcept
+{
+	assert(device);
+
+	constexpr size_t width
+	{
+		sizeof(device->wpe[0]) / sizeof(device->wpe[0][0])
+	};
+
+	constexpr size_t positions
+	{
+		sizeof(device->wpe) / sizeof(device->wpe[0])
+	};
+
+	// Clamp to what both the position table and the output can hold.
+	size_t count(in.size());
+	if(count > positions)
+		count = positions;
+
+	if(count > out.size() / width)
+		count = out.size() / width;
+
+	for(size_t i(0); i < count; ++i)
+	{
+		const auto &wpe
+		{
+			device->wpe[i]
+		};
+
+		// A u16 token id can never exceed the 65536 rows of wte.
+		const auto &wte
+		{
+			device->wte[in[i]]
+		};
+
+		for(size_t j(0); j < width; ++j)
+			out[i * width + j] = wte[j] + wpe[j];
+	}
+
+	return vector_view<f32>
+	{
+		data(out), count * width
+	};
+}
+
+/// Produce the next token for an embedded sequence.
+///
+/// The input is copied into the working buffer, run through the transformer
+/// stack, and the state of the last token is projected onto the vocabulary;
+/// the index of the greatest logit is returned.
+///
+/// @param in embedded sequence; 768 floats per token, between 1 and 1024
+/// tokens. Violations abort via always_assert rather than reading or writing
+/// outside the fixed-size working buffers.
+/// @return index of the greatest logit.
+uint16_t
+ircd::gpt::generate(const vector_view<const f32> &in)
+noexcept
+{
+	always_assert(in.size() % 768 == 0);
+	const auto toks
+	{
+		in.size() / 768
+	};
+
+	// Zero tokens would underflow the last-token index below; more than 1024
+	// would overrun gpt::scratch and the attention buffers.
+	always_assert(toks > 0);
+	always_assert(toks <= 1024);
+
+	const vector_view<f32> scratch
+	{
+		gpt::scratch, in.size()
+	};
+
+	for(uint i(0); i < in.size(); ++i)
+		scratch[i] = in[i];
+
+	transform(data(scratch), toks, *device);
+
+	static float
+	buf alignas(64) [768];
+
+	// Only the final token's state feeds the output head.
+	for(uint i(0); i < 768; ++i)
+		buf[i] = scratch[(toks - 1) * 768 + i];
+
+	tail(logit, buf, *device);
+	return argmax(logit);
+}
+
+/// Index of the greatest value among the first vocab::tokens logits; the
+/// lowest index wins a tie.
+uint16_t
+ircd::gpt::argmax(const float *const __restrict__ logit)
+{
+	u16 best(0);
+	uint idx(0);
+	while(idx < vocab::tokens)
+	{
+		if(logit[idx] > logit[best])
+			best = idx;
+
+		++idx;
+	}
+
+	return best;
+}
+
+/// Output head: apply the final layer normalization to `state` and project
+/// the result onto the vocabulary into `logit`.
+[[gnu::noinline]]
+void
+ircd::gpt::tail(float *const __restrict__ logit,
+                const float (&__restrict__ state)[768],
+                const model::decoder &d)
+{
+	static float
+	normed alignas(64) [768];
+
+	norm(normed, state, d.f.bias, d.f.weight, model::lnf_epsilon);
+	logits(logit, normed, d);
+	//logitsmax(logit, logit);
+}
+
+/// Score every vocabulary slot: out[t] is the dot product of `in` with the
+/// token embedding row wte[t].
+void
+ircd::gpt::logits(float *const __restrict__ out,
+                  const float (&__restrict__ in)[768],
+                  const model::decoder &d)
+{
+	for(uint tok(0); tok < vocab::tokens; ++tok)
+	{
+		float dot(0);
+		for(uint e(0); e < 768; ++e)
+			dot += in[e] * d.wte[tok][e];
+
+		out[tok] = dot;
+	}
+}
+
+/// Softmax over the first vocab::tokens logits.
+///
+/// `out` and `in` may refer to the same buffer: every exponential is taken
+/// before any output is written.
+void
+ircd::gpt::logitsmax(float *const out,
+                     const float *const in)
+{
+	static float
+	exps alignas(64) [65536];
+
+	for(uint j(0); j < vocab::tokens; ++j)
+		exps[j] = exp(in[j]);
+
+	// The denominator is the same for every output; accumulate it once
+	// instead of once per output element.
+	float sum(0);
+	for(uint j(0); j < vocab::tokens; ++j)
+		sum += exps[j];
+
+	for(uint j(0); j < vocab::tokens; ++j)
+		out[j] = exps[j] / sum;
+}
+
+/// Run every decoder block over the sequence in place.
+///
+/// @param accum `tokens` rows of 768 floats; updated by each block.
+/// @param tokens number of rows in `accum`.
+/// @param decoder model parameters.
+[[gnu::noinline]]
+void
+ircd::gpt::transform(float *__restrict__ accum,
+                     const size_t tokens,
+                     const model::decoder &decoder)
+{
+	static float
+	qkv alignas(64) [3][1024][12][64],
+	state alignas(64) [12][1024][1024],
+	attns alignas(64) [12][1024][64],
+	buf alignas(64) [768];
+
+	for(const auto &layer : decoder.layer)
+	{
+		// Attention: project to query/key/value, score, mask, normalize the
+		// scores, mix the values, then project; this fmma overload adds its
+		// result onto accum.
+		ctrl(qkv, accum, tokens, layer);
+		pare(state, qkv, tokens);
+		mask(state, state, layer.attn.bias, tokens);
+		smax(state, state, tokens);
+		vals(attns, state, qkv, tokens);
+		fmma(accum, attns, layer.attn.proj_bias, layer.attn.proj_weight, tokens);
+
+		// Feed-forward: each row is processed on its own and the result is
+		// added back onto the row.
+		for(size_t tok(0); tok < tokens; ++tok)
+		{
+			float *const row
+			{
+				accum + tok * 768
+			};
+
+			for(uint e(0); e < 768; ++e)
+				buf[e] = row[e];
+
+			ffnn(buf, buf, layer);
+			for(uint e(0); e < 768; ++e)
+				row[e] += buf[e];
+		}
+	}
+}
+
+/// Feed-forward stage of one block for a single token: layer normalization,
+/// expansion to 3072, GELU, then projection back to 768.
+///
+/// `out` and `in` are deliberately not restrict-qualified: transform() calls
+/// this with the same buffer for both, which a no-alias promise would make
+/// undefined. `in` is only read by the leading normalization.
+void
+ircd::gpt::ffnn(float (&out)[768],
+                const float (&in)[768],
+                const model::block &layer)
+{
+	static float
+	proj alignas(64) [3072];
+
+	norm(out, in, layer.ln2.bias, layer.ln2.weight, model::ln2_epsilon);
+	fmma(proj, out, layer.ffnn.fc_bias, layer.ffnn.fc_weight);
+	gelu(proj, proj);
+	fmma(out, proj, layer.ffnn.proj_bias, layer.ffnn.proj_weight);
+}
+
+/// Compute query, key and value vectors for each token.
+///
+/// Each input row is normalized with ln1 and projected to 2304 values; the
+/// three consecutive 768-wide thirds become out[0] (query), out[1] (key) and
+/// out[2] (value), each split into 12 heads of 64.
+void
+ircd::gpt::ctrl(float (&__restrict__ out)[3][1024][12][64],
+                const float *const __restrict__ in,
+                const size_t num,
+                const model::block &layer)
+{
+	for(uint tok(0); tok < num; ++tok)
+	{
+		static float
+		buf alignas(64) [768],
+		proj alignas(64) [2304];
+
+		for(uint e(0); e < 768; ++e)
+			buf[e] = in[tok * 768 + e];
+
+		norm(buf, buf, layer.ln1.bias, layer.ln1.weight, model::ln1_epsilon);
+		fmma(proj, buf, layer.attn.attn_bias, layer.attn.attn_weight);
+
+		#pragma clang loop unroll (disable)
+		for(uint part(0); part < 3; ++part)
+			for(uint head(0); head < 12; ++head)
+				for(uint e(0); e < 64; ++e)
+					out[part][tok][head][e] = proj[768 * part + head * 64 + e];
+	}
+}
+
+/// Attention scores: for each head, out[head][k][l] is the dot product of
+/// the query of token k with the key of token l, divided by 8.0.
+///
+/// Only the leading num x num region of each head is written.
+void
+ircd::gpt::pare(float (&__restrict__ out)[12][1024][1024],
+                const float (&__restrict__ qkv)[3][1024][12][64],
+                const size_t num)
+{
+	const float
+	(&__restrict__ qry)[1024][12][64] { qkv[0] },
+	(&__restrict__ key)[1024][12][64] { qkv[1] };
+
+	// One pass: accumulate the dot product locally and store the scaled
+	// result, rather than zeroing, accumulating and scaling in three sweeps.
+	#pragma clang loop unroll (disable)
+	for(uint j(0); j < 12; ++j)
+		for(uint k(0); k < num; ++k)
+			for(uint l(0); l < num; ++l)
+			{
+				float dot(0);
+				for(uint m(0); m < 64; ++m)
+					dot += qry[k][j][m] * key[l][j][m];
+
+				out[j][k][l] = dot / 8.0;
+			}
+}
+
+/// Mix value vectors by attention weight: for each head and token,
+/// out[head][tok] is the sum over source tokens of
+/// in[head][tok][src] * value[src][head].
+void
+ircd::gpt::vals(float (&__restrict__ out)[12][1024][64],
+                const float (&__restrict__ in)[12][1024][1024],
+                const float (&__restrict__ qkv)[3][1024][12][64],
+                const size_t num)
+{
+	const float
+	(&__restrict__ val)[1024][12][64] { qkv[2] };
+
+	#pragma clang loop unroll (disable)
+	for(uint head(0); head < 12; ++head)
+		for(uint tok(0); tok < num; ++tok)
+		{
+			for(uint e(0); e < 64; ++e)
+				out[head][tok][e] = 0;
+
+			for(uint src(0); src < num; ++src)
+				for(uint e(0); e < 64; ++e)
+					out[head][tok][e] += in[head][tok][src] * val[src][head][e];
+		}
+}
+
+/// Row-wise softmax over the leading num x num region of each head.
+///
+/// `out` and `in` are deliberately not restrict-qualified: transform() calls
+/// this in place, which a no-alias promise would make undefined. Each element
+/// of `in` is read before the matching element of `out` is written, so both
+/// in-place and out-of-place use are correct.
+void
+ircd::gpt::smax(float (&out)[12][1024][1024],
+                const float (&in)[12][1024][1024],
+                const size_t num)
+{
+	#pragma clang loop unroll (disable)
+	for(uint j(0); j < 12; ++j)
+		for(uint k(0); k < num; ++k)
+		{
+			// The exponentials are stored straight into the output row while
+			// their sum is accumulated once; no separate temporary and no
+			// per-element re-summation of the row.
+			float sum(0);
+			for(uint l(0); l < num; ++l)
+			{
+				out[j][k][l] = exp(in[j][k][l]);
+				sum += out[j][k][l];
+			}
+
+			for(uint l(0); l < num; ++l)
+				out[j][k][l] /= sum;
+		}
+}
+
+/// Apply the attention mask: scores whose mask entry is false are replaced
+/// by -10000.0, the others are copied through.
+///
+/// `out` and `in` are deliberately not restrict-qualified: transform() calls
+/// this in place, which a no-alias promise would make undefined.
+void
+ircd::gpt::mask(float (&out)[12][1024][1024],
+                const float (&in)[12][1024][1024],
+                const bool (&__restrict__ bias)[1024][1024],
+                const size_t num)
+{
+	static const float masked
+	{
+		-10000.0
+	};
+
+	#pragma clang loop unroll (disable)
+	for(uint j(0); j < 12; ++j)
+		for(uint k(0); k < num; ++k)
+			for(uint l(0); l < num; ++l)
+				out[j][k][l] = bias[k][l]? in[j][k][l]: masked;
+}
+
+/// Layer normalization of one 768-wide vector:
+/// out = (in - mean) / sqrt(variance + epsilon) * weight + bias.
+///
+/// `out` and `in` are deliberately not restrict-qualified: ctrl() calls this
+/// with the same buffer for both, which a no-alias promise would make
+/// undefined. The statistics are taken before anything is written and each
+/// element is read before it is overwritten, so in-place use is correct.
+void
+ircd::gpt::norm(float (&out)[768],
+                const float (&in)[768],
+                const float (&__restrict__ bias)[768],
+                const float (&__restrict__ weight)[768],
+                const float epsilon)
+{
+	static float
+	tmp alignas(64) [768];
+
+	const float mean
+	{
+		math::mean<float>(in)
+	};
+
+	// Squared deviations; a plain multiply instead of a general pow().
+	for(uint j(0); j < 768; ++j)
+	{
+		const float dev(in[j] - mean);
+		tmp[j] = dev * dev;
+	}
+
+	const float s
+	{
+		math::mean<float>(tmp)
+	};
+
+	// The divisor is the same for every element; compute it once.
+	const auto divisor
+	{
+		sqrt(s + epsilon)
+	};
+
+	for(uint j(0); j < 768; ++j)
+		out[j] = (in[j] - mean) / divisor,
+		out[j] = out[j] * weight[j] + bias[j];
+}
+
+/// Attention output projection, added onto `out`.
+///
+/// For each of `num` tokens the 12 heads of 64 are laid end to end into a
+/// 768-wide row, multiplied by `weight` with `bias` added, and the result is
+/// accumulated onto the token's row of `out`.
+void
+ircd::gpt::fmma(float *const __restrict__ out,
+                const float (&__restrict__ attn)[12][1024][64],
+                const float (&__restrict__ bias)[768],
+                const float (&__restrict__ weight)[768][768],
+                const size_t num)
+{
+	static float
+	flat alignas(64) [1024][768],
+	proj alignas(64) [1024][768];
+
+	// Gather [head][token][64] into [token][head * 64 + e].
+	for(uint head(0); head < 12; head++)
+		for(uint tok(0); tok < num; tok++)
+			for(uint e(0); e < 64; e++)
+				flat[tok][head * 64 + e] = attn[head][tok][e];
+
+	for(uint tok(0); tok < num; tok++)
+	{
+		for(uint col(0); col < 768; col++)
+			proj[tok][col] = bias[col];
+
+		for(uint col(0); col < 768; col++)
+			for(uint row(0); row < 768; row++)
+				proj[tok][col] += flat[tok][row] * weight[row][col];
+
+		for(uint col(0); col < 768; col++)
+			out[tok * 768 + col] += proj[tok][col];
+	}
+}
+
+/// Dense projection 768 -> 2304: out = bias + in x weight.
+void
+ircd::gpt::fmma(float (&__restrict__ out)[2304],
+                const float (&__restrict__ in)[768],
+                const float (&__restrict__ bias)[2304],
+                const float (&__restrict__ weight)[768][2304])
+{
+	constexpr uint inputs(768), outputs(2304);
+
+	for(uint col(0); col < outputs; ++col)
+		out[col] = bias[col];
+
+	// Input-major order walks each weight row contiguously.
+	for(uint row(0); row < inputs; ++row)
+	{
+		const float x(in[row]);
+		for(uint col(0); col < outputs; ++col)
+			out[col] += x * weight[row][col];
+	}
+}
+
+/// Dense projection 3072 -> 768: out = bias + in x weight.
+void
+ircd::gpt::fmma(float (&__restrict__ out)[768],
+                const float (&__restrict__ in)[3072],
+                const float (&__restrict__ bias)[768],
+                const float (&__restrict__ weight)[3072][768])
+{
+	constexpr uint inputs(3072), outputs(768);
+
+	for(uint col(0); col < outputs; ++col)
+		out[col] = bias[col];
+
+	// Input-major order walks each weight row contiguously.
+	for(uint row(0); row < inputs; ++row)
+	{
+		const float x(in[row]);
+		for(uint col(0); col < outputs; ++col)
+			out[col] += x * weight[row][col];
+	}
+}
+
+/// Dense projection 768 -> 3072: out = bias + in x weight.
+void
+ircd::gpt::fmma(float (&__restrict__ out)[3072],
+                const float (&__restrict__ in)[768],
+                const float (&__restrict__ bias)[3072],
+                const float (&__restrict__ weight)[768][3072])
+{
+	constexpr uint inputs(768), outputs(3072);
+
+	for(uint col(0); col < outputs; ++col)
+		out[col] = bias[col];
+
+	// Input-major order walks each weight row contiguously.
+	for(uint row(0); row < inputs; ++row)
+	{
+		const float x(in[row]);
+		for(uint col(0); col < outputs; ++col)
+			out[col] += x * weight[row][col];
+	}
+}
+
+/// Apply the scalar gelu() to each of the 3072 elements.
+///
+/// `out` and `in` are deliberately not restrict-qualified: ffnn() calls this
+/// in place, which a no-alias promise would make undefined.
+void
+ircd::gpt::gelu(float (&out)[3072],
+                const float (&in)[3072])
+{
+	for(uint j(0); j < 3072; ++j)
+		gelu(out[j], in[j]);
+}
+
+/// GELU activation using the tanh approximation:
+/// 0.5 x (1 + tanh(0.7978845608 x (1 + 0.044715 x^2))).
+/// `in` is read in full before `out` is written, so both may refer to the
+/// same float.
+void
+ircd::gpt::gelu(float &out,
+                const float &in)
+{
+	const double x(in);
+	const double cubic(1.0 + 0.044715 * x * x);
+	const double arg(x * 0.7978845608 * cubic);
+	out = 0.5 * x * (1.0 + tanh(arg));
+}
--- a/ircd/gpt_model.cc
+++ b/ircd/gpt_model.cc
@ -0,0 +1,484 @@
+// Tensor Construct
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+namespace ircd::gpt::model
+{
+	// Parameter loaders; all share the handler signature stored in the
+	// manifests: (model, file name, layer index, parsed JSON array).
+	static void init_f_weight(decoder &, const string_view &, const size_t &, const json::array &);
+	static void init_f_bias(decoder &, const string_view &, const size_t &, const json::array &);
+	static void init_wpe_weight(decoder &, const string_view &, const size_t &, const json::array &);
+	static void init_wte_weight(decoder &, const string_view &, const size_t &, const json::array &);
+	static void init_h_ffnn_fc_weight(decoder &, const string_view &, const size_t &, const json::array &);
+	static void init_h_ffnn_fc_bias(decoder &, const string_view &, const size_t &, const json::array &);
+	static void init_h_ffnn_proj_weight(decoder &, const string_view &, const size_t &, const json::array &);
+	static void init_h_ffnn_proj_bias(decoder &, const string_view &, const size_t &, const json::array &);
+	static void init_h_ln_1_weight(decoder &, const string_view &, const size_t &, const json::array &);
+	static void init_h_ln_1_bias(decoder &, const string_view &, const size_t &, const json::array &);
+	static void init_h_ln_2_weight(decoder &, const string_view &, const size_t &, const json::array &);
+	static void init_h_ln_2_bias(decoder &, const string_view &, const size_t &, const json::array &);
+	static void init_h_attn_attn_weight(decoder &, const string_view &, const size_t &, const json::array &);
+	static void init_h_attn_attn_bias(decoder &, const string_view &, const size_t &, const json::array &);
+	static void init_h_attn_proj_weight(decoder &, const string_view &, const size_t &, const json::array &);
+	static void init_h_attn_proj_bias(decoder &, const string_view &, const size_t &, const json::array &);
+	static void init_h_attn_bias(decoder &, const string_view &, const size_t &, const json::array &);
+
+	// Loads the whole model; registered as the callback of `path`.
+	static void init() noexcept;
+
+	extern conf::item<std::string> path;
+
+	// Tables pairing a file name (or name format) with its loader.
+	extern const std::pair
+	<
+		string_view,
+		void (*)(decoder &, const string_view &, const size_t &, const json::array &)
+	>
+	manifest[],
+	manifest_h[],
+	manifest_td[];
+}
+
+/// Per-layer parameter files; %u in each name is replaced by the layer index.
+decltype(ircd::gpt::model::manifest_h)
+ircd::gpt::model::manifest_h
+{
+	{ "h.%u.mlp.c_fc.weight.json", init_h_ffnn_fc_weight },
+	{ "h.%u.mlp.c_fc.bias.json", init_h_ffnn_fc_bias },
+	{ "h.%u.mlp.c_proj.weight.json", init_h_ffnn_proj_weight },
+	{ "h.%u.mlp.c_proj.bias.json", init_h_ffnn_proj_bias },
+	{ "h.%u.ln_1.weight.json", init_h_ln_1_weight },
+	{ "h.%u.ln_1.bias.json", init_h_ln_1_bias },
+	{ "h.%u.ln_2.weight.json", init_h_ln_2_weight },
+	{ "h.%u.ln_2.bias.json", init_h_ln_2_bias },
+	{ "h.%u.attn.c_attn.weight.json", init_h_attn_attn_weight },
+	{ "h.%u.attn.c_attn.bias.json", init_h_attn_attn_bias },
+	{ "h.%u.attn.c_proj.weight.json", init_h_attn_proj_weight },
+	{ "h.%u.attn.c_proj.bias.json", init_h_attn_proj_bias },
+	{ "h.%u.attn.bias.json", init_h_attn_bias },
+};
+
+/// Parameter files which are not per-layer.
+decltype(ircd::gpt::model::manifest)
+ircd::gpt::model::manifest
+{
+	{ "ln_f.weight.json", init_f_weight },
+	{ "ln_f.bias.json", init_f_bias },
+	{ "wpe.weight.json", init_wpe_weight },
+	{ "wte.weight.json", init_wte_weight },
+};
+
+/// Data set file names; no loader is attached to any of them.
+decltype(ircd::gpt::model::manifest_td)
+ircd::gpt::model::manifest_td
+{
+	{ "test.jsonl", nullptr },
+	{ "valid.jsonl", nullptr },
+	{ "train.jsonl", nullptr },
+};
+
+/// Configuration item naming the directory of model parameter files; init()
+/// is passed as its callback.
+decltype(ircd::gpt::model::path)
+ircd::gpt::model::path
+{
+	{
+		{ "name", "ircd.gpt.model.path" },
+		{ "default", string_view{} },
+	},
+	init
+};
+
+//TODO: XXX
+// Redeclaration of the object defined in gpt.cc. The type here has to match
+// that definition exactly; it is a non-const std::unique_ptr there, so it
+// must not be declared const here.
+namespace ircd::gpt
+{
+	extern std::unique_ptr<model::decoder> device;
+}
+
+/// Load the model parameters into gpt::device.
+///
+/// Does nothing while the path item is unset. When a file named "model"
+/// exists it is read directly over the decoder structure. Otherwise the
+/// decoder is zeroed, every manifest entry is parsed from its JSON file under
+/// the configured path, and the resulting structure is written out to
+/// "model".
+void
+ircd::gpt::model::init()
+noexcept
+{
+	if(!model::path)
+		return;
+
+	const size_t layers
+	{
+		12
+	};
+
+	// Loads one manifest entry: formats the file name with the layer index,
+	// maps the file, and hands the parsed array to the entry's handler.
+	const auto load_part{[]
+	(const auto &table, const auto &index, const auto &layer)
+	{
+		const auto &[pattern, handler]
+		{
+			table[index]
+		};
+
+		char namebuf[128] {0};
+		const string_view path_part[2]
+		{
+			model::path, fmt::sprintf
+			{
+				namebuf, pattern, layer
+			}
+		};
+
+		const fs::fd fd
+		{
+			fs::path(fs::path_scratch, path_part)
+		};
+
+		fs::map::opts map_opts;
+		const fs::map map
+		{
+			fd, map_opts
+		};
+
+		const json::array mat
+		{
+			map
+		};
+
+		assert(gpt::device);
+		handler(*gpt::device, path_part[1], layer, mat);
+		log::logf
+		{
+			log, log::level::DEBUG,
+			"Model init [%2d][%2d] :%s",
+			layer,
+			index,
+			path_part[1],
+		};
+	}};
+
+	ircd::timer sw;
+	size_t read(0), wrote(0);
+	if(!fs::exists("model"))
+	{
+		memset(device.get(),  0x0, sizeof(model::decoder));
+
+		// The four entries of `manifest`, then the thirteen entries of
+		// `manifest_h` for every layer.
+		load_part(manifest, 0, 0);
+		load_part(manifest, 1, 0);
+		load_part(manifest, 2, 0);
+		load_part(manifest, 3, 0);
+		for(size_t i(0); i < layers; ++i)
+			for(size_t j(0); j < 13; ++j)
+				load_part(manifest_h, j, i);
+
+		const auto _wrote
+		{
+			fs::write("model", const_buffer{(const char *)(gpt::device.get()), sizeof(model::decoder)})
+		};
+
+		wrote = size(_wrote);
+	} else {
+		const auto _read
+		{
+			fs::read(fs::fd{"model"}, mutable_buffer{(char *)(gpt::device.get()), sizeof(model::decoder)})
+		};
+
+		read = size(_read);
+	}
+
+	char pbuf[3][48];
+	log::logf
+	{
+		log, log::level::DEBUG,
+		"Model init completed in %s read %s wrote %s",
+		sw.pretty(pbuf[0]),
+		pretty(pbuf[1], iec(size(read))),
+		pretty(pbuf[2], iec(size(wrote))),
+	};
+}
+
+// Shared helpers for the parameter loaders below. The JSON comes from files
+// on disk, so every index is checked BEFORE the store; an oversized array
+// aborts via always_assert instead of writing past the model structure.
+namespace ircd::gpt::model
+{
+	/// Parse a flat JSON array of numbers into dst; exactly N are required.
+	template<size_t N>
+	static void
+	fill_vector(float (&dst)[N], const json::array &vec)
+	{
+		size_t i(0);
+		for(const auto &elem : vec)
+		{
+			always_assert(i < N);
+			dst[i++] = lex_cast<float>(elem);
+		}
+
+		always_assert(i == N);
+	}
+
+	/// Parse a JSON array of rows into dst. Every row must hold exactly C
+	/// numbers. No more than R rows are accepted; when exact_rows is set
+	/// exactly R rows are required.
+	template<size_t R,
+	         size_t C>
+	static void
+	fill_matrix(float (&dst)[R][C], const json::array &mat, const bool exact_rows)
+	{
+		size_t i(0);
+		for(const json::array vec : mat)
+		{
+			always_assert(i < R);
+			fill_vector(dst[i++], vec);
+		}
+
+		always_assert(!exact_rows || i == R);
+	}
+
+	/// Bounds-checked access to one transformer block.
+	static block &
+	layer_of(decoder &d, const size_t &layer)
+	{
+		always_assert(layer < sizeof(d.layer) / sizeof(d.layer[0]));
+		return d.layer[layer];
+	}
+}
+
+/// Position embedding table; the row count is bounded but not required to
+/// fill the table.
+void
+ircd::gpt::model::init_wpe_weight(decoder &d,
+                                  const string_view &name,
+                                  const size_t &layer,
+                                  const json::array &mat)
+{
+	fill_matrix(d.wpe, mat, false);
+}
+
+/// Token embedding table; the row count is bounded but not required to fill
+/// the table.
+void
+ircd::gpt::model::init_wte_weight(decoder &d,
+                                  const string_view &name,
+                                  const size_t &layer,
+                                  const json::array &mat)
+{
+	fill_matrix(d.wte, mat, false);
+}
+
+/// Final layer normalization weight.
+void
+ircd::gpt::model::init_f_weight(decoder &d,
+                                const string_view &name,
+                                const size_t &layer,
+                                const json::array &vec)
+{
+	fill_vector(d.f.weight, vec);
+}
+
+/// Final layer normalization bias.
+void
+ircd::gpt::model::init_f_bias(decoder &d,
+                              const string_view &name,
+                              const size_t &layer,
+                              const json::array &vec)
+{
+	fill_vector(d.f.bias, vec);
+}
+
+/// Feed-forward expansion weight of one layer.
+void
+ircd::gpt::model::init_h_ffnn_fc_weight(decoder &d,
+                                        const string_view &name,
+                                        const size_t &layer,
+                                        const json::array &mat)
+{
+	fill_matrix(layer_of(d, layer).ffnn.fc_weight, mat, true);
+}
+
+/// Feed-forward expansion bias of one layer.
+void
+ircd::gpt::model::init_h_ffnn_fc_bias(decoder &d,
+                                      const string_view &name,
+                                      const size_t &layer,
+                                      const json::array &vec)
+{
+	fill_vector(layer_of(d, layer).ffnn.fc_bias, vec);
+}
+
+/// Feed-forward projection weight of one layer.
+void
+ircd::gpt::model::init_h_ffnn_proj_weight(decoder &d,
+                                          const string_view &name,
+                                          const size_t &layer,
+                                          const json::array &mat)
+{
+	fill_matrix(layer_of(d, layer).ffnn.proj_weight, mat, true);
+}
+
+/// Feed-forward projection bias of one layer.
+void
+ircd::gpt::model::init_h_ffnn_proj_bias(decoder &d,
+                                        const string_view &name,
+                                        const size_t &layer,
+                                        const json::array &vec)
+{
+	fill_vector(layer_of(d, layer).ffnn.proj_bias, vec);
+}
+
+/// First layer normalization weight of one layer.
+void
+ircd::gpt::model::init_h_ln_1_weight(decoder &d,
+                                     const string_view &name,
+                                     const size_t &layer,
+                                     const json::array &vec)
+{
+	fill_vector(layer_of(d, layer).ln1.weight, vec);
+}
+
+/// First layer normalization bias of one layer.
+void
+ircd::gpt::model::init_h_ln_1_bias(decoder &d,
+                                   const string_view &name,
+                                   const size_t &layer,
+                                   const json::array &vec)
+{
+	fill_vector(layer_of(d, layer).ln1.bias, vec);
+}
+
+/// Second layer normalization weight of one layer.
+void
+ircd::gpt::model::init_h_ln_2_weight(decoder &d,
+                                     const string_view &name,
+                                     const size_t &layer,
+                                     const json::array &vec)
+{
+	fill_vector(layer_of(d, layer).ln2.weight, vec);
+}
+
+/// Second layer normalization bias of one layer.
+void
+ircd::gpt::model::init_h_ln_2_bias(decoder &d,
+                                   const string_view &name,
+                                   const size_t &layer,
+                                   const json::array &vec)
+{
+	fill_vector(layer_of(d, layer).ln2.bias, vec);
+}
+
+/// Query/key/value projection weight of one layer.
+void
+ircd::gpt::model::init_h_attn_attn_weight(decoder &d,
+                                          const string_view &name,
+                                          const size_t &layer,
+                                          const json::array &mat)
+{
+	fill_matrix(layer_of(d, layer).attn.attn_weight, mat, true);
+}
+
+/// Query/key/value projection bias of one layer.
+void
+ircd::gpt::model::init_h_attn_attn_bias(decoder &d,
+                                        const string_view &name,
+                                        const size_t &layer,
+                                        const json::array &vec)
+{
+	fill_vector(layer_of(d, layer).attn.attn_bias, vec);
+}
+
+/// Attention output projection weight of one layer.
+void
+ircd::gpt::model::init_h_attn_proj_weight(decoder &d,
+                                          const string_view &name,
+                                          const size_t &layer,
+                                          const json::array &mat)
+{
+	fill_matrix(layer_of(d, layer).attn.proj_weight, mat, true);
+}
+
+/// Attention output projection bias of one layer.
+void
+ircd::gpt::model::init_h_attn_proj_bias(decoder &d,
+                                        const string_view &name,
+                                        const size_t &layer,
+                                        const json::array &vec)
+{
+	fill_vector(layer_of(d, layer).attn.proj_bias, vec);
+}
+
+/// Attention mask of one layer.
+///
+/// The JSON is nested four deep; the two innermost levels are the rows and
+/// columns of the mask. Each element must be the text "1.0" or "0.0" and is
+/// stored as true or false respectively. Every row must be complete and
+/// every innermost matrix must supply every row.
+void
+ircd::gpt::model::init_h_attn_bias(decoder &d,
+                                   const string_view &name,
+                                   const size_t &layer,
+                                   const json::array &mat)
+{
+	auto &bias
+	{
+		layer_of(d, layer).attn.bias
+	};
+
+	const size_t rows
+	{
+		sizeof(bias) / sizeof(bias[0])
+	};
+
+	const size_t cols
+	{
+		sizeof(bias[0]) / sizeof(bias[0][0])
+	};
+
+	for(const json::array dim0 : mat)
+	{
+		for(const json::array dim1 : dim0)
+		{
+			size_t k(0);
+			for(const json::array dim2 : dim1)
+			{
+				always_assert(k < rows);
+
+				size_t l(0);
+				for(const auto &elem : dim2)
+				{
+					always_assert(l < cols);
+					always_assert(elem == "1.0" || elem == "0.0");
+					bias[k][l++] = startswith(elem, '1');
+				}
+
+				always_assert(l == cols);
+				++k;
+			}
+
+			always_assert(k == rows);
+		}
+	}
+}