mirror of https://github.com/matrix-construct/construct synced 2024-06-11 06:28:55 +02:00

ircd::gpt::pipe: Correctness; compute loss, statistics; pipeline optimize.

This commit is contained in:
Jason Volk 2021-04-10 19:28:23 -07:00
parent 0a6be0efed
commit 9c062d9c3f
13 changed files with 867 additions and 444 deletions

View file

@ -22,6 +22,8 @@ namespace ircd::gpt::model
constexpr auto align {64};
extern const decoder *default_model;
extern string_view default_dataset;
extern std::vector<json::object> default_data;
}
/// Attention aperature

View file

@ -17,7 +17,7 @@
/// This structure and its mutable companion in `task.h` determine the outcome
/// of the next execution cycle; options are immutable to device software but
/// may be changed by the host between execution cycles if desired.
///
///
struct ircd_gpt_opts
{
/// Specifies the nominal halting condition based on a sequence of tokens.
@ -89,6 +89,82 @@ struct ircd_gpt_opts
#endif
;
/// Embedding vector elements
uint embed_elems
#ifdef __cplusplus
{
768
}
#endif
;
/// Attention unit width multiple
uint attn_elems
#ifdef __cplusplus
{
embed_elems * 3
}
#endif
;
/// FFNN unit width multiple
uint ffnn_elems
#ifdef __cplusplus
{
embed_elems * 4
}
#endif
;
uint embed_width
#ifdef __cplusplus
{
embed_elems / 4
}
#endif
;
uint attn_width
#ifdef __cplusplus
{
attn_elems / 4
}
#endif
;
uint attn_height
#ifdef __cplusplus
{
embed_elems / 4
}
#endif
;
uint ffnn_width
#ifdef __cplusplus
{
ffnn_elems / 4
}
#endif
;
uint ffnn_height
#ifdef __cplusplus
{
embed_elems / 4
}
#endif
;
/// Specifies the size of the logit vector (the token vocabulary).
uint logits
#ifdef __cplusplus
{
50257
}
#endif
;
/// Seed for the task's PRNG.
ulong seed
#ifdef __cplusplus
@ -97,6 +173,32 @@ struct ircd_gpt_opts
}
#endif
;
/// Training steps
ulong training_steps
#ifdef __cplusplus
{
250000
}
#endif
;
/// Validation steps
ulong validation_steps
#ifdef __cplusplus
{
5000
}
#endif
;
ushort label
#ifdef __cplusplus
{
198
}
#endif
;
}
__attribute__((aligned(4096)));
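
All of the new geometry fields derive from the 768-element GPT-2 small embedding, and the *_width/*_height members count float4 lanes rather than scalars. A standalone sketch of that arithmetic, using the default values initialized above (the constant names here are illustrative only):

// Derived kernel geometry for the default GPT-2 small shape, in float4
// lanes; mirrors the member initializers above.
constexpr unsigned embed_elems = 768;
constexpr unsigned attn_elems  = embed_elems * 3;   // 2304 = qry + key + val
constexpr unsigned ffnn_elems  = embed_elems * 4;   // 3072 = fc expansion

constexpr unsigned embed_width = embed_elems / 4;   // 192 float4s
constexpr unsigned attn_width  = attn_elems  / 4;   // 576 float4s
constexpr unsigned ffnn_width  = ffnn_elems  / 4;   // 768 float4s
constexpr unsigned attn_height = embed_elems / 4;   // 192 float4s
constexpr unsigned ffnn_height = embed_elems / 4;   // 192 float4s

static_assert(embed_width == 192 && attn_width == 576 && ffnn_width == 768);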

View file

@ -22,7 +22,9 @@ struct ircd::gpt::pipe::desc
cl::data
state, // qry/key/val projection (tokens * embed * 3 * float)
accum, // accumulator (tokens * embed * float)
logit, // result output vector (50257 * float)
logit, // result logit vector (50257 * float)
logexp, // outputs distribution (50257 * float)
logsm, // outputs distribution (50257 * float)
ctrl, // control page
opts; // options page
@ -30,6 +32,7 @@ struct ircd::gpt::pipe::desc
lm_embed,
lm_norm,
lm_logit,
lm_logsm,
lm_select;
std::unique_ptr<struct desc::layer>

View file

@ -40,6 +40,7 @@ struct ircd::gpt::pipe::exec
range_positive, // Dimension range of a layer kernel.
range_lm_norm, // Dimension range of the final norm kernel.
range_lm_logit, // Dimension range of the language logit kernel.
range_lm_logsm, // Dimension range of the language statistic kernel.
range_lm_select; // Dimension range of the language token kernel.
cl::exec
@ -48,7 +49,8 @@ struct ircd::gpt::pipe::exec
lm_embed, // Compute token and positional embeddings.
coil[12 * 2], // Pass over all layers.
lm_norm, // Final normalization.
lm_logit, // Compute logit result vector.
lm_logit, // Compute language logits.
lm_logsm, // Statistics on the logits.
lm_select, // Select next token.
acquire_ctrl; // Acquire the control page.

View file

@ -71,6 +71,42 @@ struct ircd_gpt_task
/// State counters for the accept/error sequence codes.
uint accept_seq[4], error_seq[4];
/// Loss for last token of last cycle
float loss;
/// Sum loss over all cycles
float loss_sum;
/// Average loss over all cycles
float loss_mean;
/// Perplexity score for last token of last cycle
float perp;
/// Perplexity sum over all cycles
float perp_sum;
/// Perplexity mean over context
float perp_mean;
/// Logit softmax mu
float samax_mu;
/// Logit softmax sum
float samax_sum;
/// Logit softmax lambda
float samax_lambda;
/// Certainty difference score for last token of last cycle
float cert;
/// Certainty sum over all cycles
float cert_sum;
/// Certainty mean over context
float cert_mean;
/// The token buffer starts at offset 2048 and continues to the end of
/// the page; options specify the size of the tokens buffer in tokens.
/// Additional pages must be attached for larger buffer sizes.
@ -112,5 +148,6 @@ enum ircd::gpt::task::status
};
static_assert(sizeof(struct ircd_gpt_task) == 4096);
static_assert(offsetof(struct ircd_gpt_task, token) == 2048);
static_assert(std::is_standard_layout<struct ircd_gpt_task>::value);
#endif
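
These counters are filled in by the selection kernel each cycle, so the host can report model quality directly from the acquired control page. A minimal host-side sketch of reading them (the printf formatting is illustrative; the field names are those declared above):

#include <cstdio>

// Illustrative host-side dump of the statistics above, given an acquired
// control page `ctrl` of type struct ircd_gpt_task.
static void dump_stats(const struct ircd_gpt_task &ctrl)
{
    std::printf("loss %6.2f (mean %6.2f) perp %5.1f%% (mean %5.1f%%) cert %5.1f%% (mean %5.1f%%)\n",
                ctrl.loss, ctrl.loss_mean,
                ctrl.perp, ctrl.perp_mean,
                ctrl.cert, ctrl.cert_mean);
}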

View file

@ -9,9 +9,14 @@
// full license for this software is available in the LICENSE file.
#pragma once
#ifdef __OPENCL_C_VERSION__
#define HAVE_IRCD_GPT_TOKEN_H
struct ircd_gpt_attn_mask
{
bool
token[1024];
};
union ircd_gpt_token
{
float
@ -19,36 +24,37 @@ union ircd_gpt_token
attn[12][64];
};
#ifdef __OPENCL_C_VERSION__
union ircd_gpt_tokenv
{
float4
word[768/4],
attn[12][64/4];
union ircd_gpt_token
token;
};
#endif
struct ircd_gpt_attn_qkv
{
union ircd_gpt_token
qry,
key,
val;
};
struct ircd_gpt_qkv
#ifdef __OPENCL_C_VERSION__
struct ircd_gpt_attn_qkvv
{
union ircd_gpt_tokenv
qry,
key,
val;
};
#endif
struct ircd_gpt_qkvv
{
union ircd_gpt_tokenv
qry,
key,
val;
};
struct ircd_gpt_attn_mask
{
bool
token[1024];
};
union ircd_gpt_aperature
union ircd_gpt_attn_aperature
{
float
word[768],
@ -56,9 +62,13 @@ union ircd_gpt_aperature
proj[3][768],
qkv[3][12][64],
attn[12][64];
union ircd_gpt_token
token[3];
};
union ircd_gpt_aperaturev
#ifdef __OPENCL_C_VERSION__
union ircd_gpt_attn_aperaturev
{
float4
word[768/4],
@ -66,6 +76,32 @@ union ircd_gpt_aperaturev
proj[3][768/4],
qkv[3][12][64/4],
attn[12][64/4];
union ircd_gpt_tokenv
token[3];
};
#endif
union ircd_gpt_ffnn_aperature
{
float
word[768],
fcon[3072],
proj[4][768];
union ircd_gpt_token
token[4];
};
#ifdef __OPENCL_C_VERSION__
union ircd_gpt_ffnn_aperaturev
{
float4
word[768/4],
fcon[3072/4],
proj[4][768/4];
union ircd_gpt_tokenv
token[4];
};
#endif
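
The aperature unions are alternative views over one contiguous block: the attention aperature's 2304 floats can be read as a flat fully-connected output, as three 768-wide qry/key/val projections, or as 3×12 heads of 64, while the *v variants re-slice the identical layout into float4s for the device. A standalone sketch of that aliasing (the union name here is illustrative):

// Illustrative only: the same 2304 floats viewed three ways, mirroring
// ircd_gpt_attn_aperature (3 * 768 == 3 * 12 * 64 == 2304 scalars).
union attn_aperature_sketch
{
    float fcon[2304];       // flat fully-connected output
    float proj[3][768];     // qry / key / val projections
    float qkv[3][12][64];   // per-head view of each projection
};

static_assert(sizeof(attn_aperature_sketch) == 2304 * sizeof(float));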

View file

@ -19,14 +19,17 @@ ircd_simt_math_mean_f4lldr(__local float4 *const restrict out,
out[i] = in[i];
ircd_simt_reduce_add_f4lldr(out, num, i);
float numerator = 0.0f;
float4 numeratorv = out[i];
for(uint k = 0; k < 4; ++k)
numerator += numeratorv[k];
if(i == 0)
{
float numerator = 0.0f;
float4 numeratorv = out[i];
for(uint k = 0; k < 4; ++k)
numerator += numeratorv[k];
out[i] = numerator;
}
out[i] = numerator;
ircd_simt_broadcast_f4lldr(out, num, i);
numeratorv = out[i];
const float4 numeratorv = out[i];
out[i] = numeratorv / (num * 4);
}

View file

@ -28,15 +28,16 @@ ircd_simt_reduce_add_f4lldr(__local float4 *const buf,
/// the greatest value is placed in index [0], the rest of the buffer is
/// trashed.
inline void
ircd_simt_reduce_max_f4lldr(__local float *const buf,
const uint ln,
const uint li)
ircd_simt_reduce_max_flldr(__local float *const buf,
const uint ln,
const uint li)
{
for(uint stride = ln >> 1; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
if(li < stride)
buf[li] = max(buf[li], buf[li + stride]);
if(buf[li] < buf[li + stride])
buf[li] = buf[li + stride];
}
}

View file

@ -8,17 +8,14 @@
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#include <ircd/gpt/pipe/pipe.h>
namespace ircd::gpt
{
template<class T>
static void fmma(T *out, const T *in, const T *bias, const T *weight, const math::fmma_opts &);
static void gelu(f32x4 &, const f32x4 &);
static void gelu(f32x4 *, const f32x4 *);
static void norm(f32x4 *, const f32x4 *, const f32x4 *, const f32x4 *, const f32);
static void fmma4(f32x4 *, const f32x4 *, const f32x4 *, const f32x4 *);
static void fmma3(f32x4 *, const f32x4 *, const f32x4 *, const f32x4 *);
static void fmma2(f32x4 *, const f32x4 *, const f32x4 *, const f32x4 *, const size_t);
static void fmma1(f32x4 *, const f32x4 *, const f32x4 *, const f32x4 *);
static void vals(float (&)[12][1024][64], const float (&)[12][1024][1024], const float (&)[3][1024][12][64], const size_t);
static void pare(float (&)[12][1024][1024], const float (&)[3][1024][12][64], const size_t);
static void mask(float (&)[12][1024][1024], const float (&)[12][1024][1024], const bool (&)[1024][1024], const size_t);
@ -84,6 +81,7 @@ ircd::gpt::generate(const vector_view<u16> &out,
auto &errc(ctrl.error_seq);
auto &accc(ctrl.accept_seq);
ctrl.tokens = in.size();
ctrl.head = 0;
const size_t tmax
{
@ -112,25 +110,34 @@ ircd::gpt::generate(const vector_view<u16> &out,
else
embed(data(dst), in[j], j, opts);
#if RB_DEBUG
static char dbuf[512] {0};
char report[1536] {0};
char tmbuf[1][64] {{0}};
const size_t report_size = snprintf
(
report, sizeof(report),
"%-2u -- %-3u [%5u] --- --- %s 0 0 | %8s",
"%-4u %4u %4u:%-4u %1u%1u [ %6.2fL %6.2f%% ] %6.2fL %5.1f%% %s",
ctrl.epoch,
ctrl.cycle,
j,
ctrl.tokens,
ctrl.token[j],
vocab::debug(dbuf, ctrl.token[j]).c_str(),
pretty(tmbuf[0], milliseconds(ctrl.elapsed), 1).c_str()
0,
0,
0.0,
0.0,
0.0,
0.0,
vocab::debug(dbuf, in[j]).c_str()
);
log::info
log::logf
{
log, "%s",
log, log::level::DEBUG,
"%s",
string_view{report, report_size}
};
#endif
}
uint64_t cycles(0);
@ -192,24 +199,30 @@ ircd::gpt::generate(const vector_view<u16> &out,
const size_t report_size = snprintf
(
report, sizeof(report),
"%-2u %-2u %-3u %-3u %-3u [%5u] a:%u e:%u %s %8s %8s | %8s",
j,
"%4u:%-4u %4u:%-4u %1u%1u [ %4.1f%% %6.2f%% %5.2fL ] %5.1f%% %5.1f%% %4.1fL %s %04x %8s %8s | %8s",
j + in.size(),
ctrl.tokens,
ctrl.cycle,
ctrl.epoch,
out[j],
ctrl.cycle,
accc[0] + accc[1] + accc[2],
errc[0] + errc[1] + errc[2],
ctrl.cert_mean < 100.0? ctrl.cert_mean: NAN,
ctrl.perp_mean < 100.0? ctrl.perp_mean: NAN,
ctrl.loss_mean < 100.0? ctrl.loss_mean: NAN,
ctrl.cert < 100.0? ctrl.cert: NAN,
ctrl.perp < 100.0? ctrl.perp: NAN,
ctrl.loss < 100.0? ctrl.loss: NAN,
vocab::debug(dbuf, out[j]).c_str(),
out[j],
pretty(tmbuf[0], milliseconds(last_time / bsz), 1).c_str(),
pretty(tmbuf[1], si(cycles / bsz), 1).c_str(),
pretty(tmbuf[2], milliseconds(ctrl.elapsed), 1).c_str()
);
log::info
log::logf
{
log, "%s",
log, log::level::DEBUG,
"%s",
string_view{report, report_size}
};
}
@ -375,41 +388,19 @@ ircd::gpt::coil(float *__restrict__ accum,
a[j][k * 64 + l] = attns[k][j][l];
}
static const math::fmma_opts fmma_opts
{
768, 768, 2U
};
for(uint j(0); j < tokens; ++j)
fmma2((f32x4 *)(accum + j * 768), (const f32x4 *)(a[j]), (const f32x4 *)layer.attn.proj_bias, (const f32x4 *)layer.attn.proj_weight, tokens);
fmma((f32x4 *)(accum + j * 768), (const f32x4 *)(a[j]), (const f32x4 *)layer.attn.proj_bias, (const f32x4 *)layer.attn.proj_weight, fmma_opts);
for(uint j(0); j < tokens; ++j)
ffnn(accum + j * 768, accum + j * 768, decoder, i);
}
}
void
ircd::gpt::ffnn(float *const out,
const float *const in,
const model::decoder &decoder,
const uint laynum)
{
constexpr float ln2_epsilon
{
0.00001
};
const auto &layer
{
decoder.layer[laynum]
};
static float
buf alignas(64) [768],
buf2 alignas(64) [3072];
memset(buf2, 0x0, sizeof(buf2));
norm((f32x4 *)buf, (const f32x4 *)in, (const f32x4 *)layer.ln2.bias, (const f32x4 *)layer.ln2.weight, ln2_epsilon);
fmma3((f32x4 *)buf2, (const f32x4 *)buf, (const f32x4 *)layer.ffnn.fc_bias, (const f32x4 *)layer.ffnn.fc_weight);
gelu((f32x4 *)buf2, (const f32x4 *)buf2);
fmma4((f32x4 *)out, (const f32x4 *)buf2, (const f32x4 *)layer.ffnn.proj_bias, (const f32x4 *)layer.ffnn.proj_weight);
}
void
ircd::gpt::ctrl(float (&__restrict__ out)[3][1024][12][64],
const float *const __restrict__ in,
@ -440,8 +431,13 @@ ircd::gpt::ctrl(float (&__restrict__ out)[3][1024][12][64],
norm((f32x4 *)buf, (const f32x4 *)(in + i * 768), (const f32x4 *)layer.ln1.bias, (const f32x4 *)layer.ln1.weight, ln1_epsilon);
static const math::fmma_opts fmma_opts
{
768, 2304, 2U,
};
memset(proj, 0x0, sizeof(proj));
fmma1((f32x4 *)proj, (const f32x4 *)buf, (const f32x4 *)layer.attn.attn_bias, (const f32x4 *)layer.attn.attn_weight);
fmma((f32x4 *)proj, (const f32x4 *)buf, (const f32x4 *)layer.attn.attn_bias, (const f32x4 *)layer.attn.attn_weight, fmma_opts);
#pragma clang loop unroll (disable)
for(uint j(0); j < 12; ++j)
@ -548,6 +544,43 @@ ircd::gpt::vals(float (&__restrict__ out)[12][1024][64],
out[j][k][m] += in[j][k][l] * val[l][j][m];
}
void
ircd::gpt::ffnn(float *const out,
const float *const in,
const model::decoder &decoder,
const uint laynum)
{
static const math::fmma_opts fmma3_opts
{
768, 3072, 2U,
};
static const math::fmma_opts fmma4_opts
{
3072, 768, 2U,
};
constexpr float ln2_epsilon
{
0.00001
};
const auto &layer
{
decoder.layer[laynum]
};
static float
buf alignas(64) [768],
buf2 alignas(64) [3072];
memset(buf2, 0x0, sizeof(buf2));
norm((f32x4 *)buf, (const f32x4 *)in, (const f32x4 *)layer.ln2.bias, (const f32x4 *)layer.ln2.weight, ln2_epsilon);
fmma((f32x4 *)buf2, (const f32x4 *)buf, (const f32x4 *)layer.ffnn.fc_bias, (const f32x4 *)layer.ffnn.fc_weight, fmma3_opts);
gelu((f32x4 *)buf2, (const f32x4 *)buf2);
fmma((f32x4 *)out, (const f32x4 *)buf2, (const f32x4 *)layer.ffnn.proj_bias, (const f32x4 *)layer.ffnn.proj_weight, fmma4_opts);
}
void
ircd::gpt::norm(f32x4 *const __restrict__ out,
const f32x4 *const __restrict__ in,
@ -567,143 +600,18 @@ ircd::gpt::norm(f32x4 *const __restrict__ out,
out[j] = out[j] * weight[j] + bias[j];
}
template<class T>
void
ircd::gpt::fmma1(f32x4 *const __restrict__ out,
const f32x4 *const __restrict__ in,
const f32x4 *const __restrict__ bias,
const f32x4 *const __restrict__ weight)
ircd::gpt::fmma(T *const __restrict__ out,
const T *const __restrict__ in,
const T *const __restrict__ bias,
const T *const __restrict__ weight,
const math::fmma_opts &opts)
{
constexpr uint width
{
2304
};
for(uint i(0); i < opts.rows / simd::lanes<T>(); ++i)
out[i] += bias[i];
constexpr uint height
{
768
};
constexpr uint lanes
{
simd::lanes<f32x4>()
};
for(uint j(0); j < width / lanes; ++j)
out[j] += bias[j];
static const math::fmma_opts opts
{
width,
height,
2U,
'y',
};
math::fmma<opts>(out, in, weight);
}
void
ircd::gpt::fmma2(f32x4 *const __restrict__ out,
const f32x4 *const __restrict__ in,
const f32x4 *const __restrict__ bias,
const f32x4 *const __restrict__ weight,
const size_t num)
{
constexpr uint width
{
768
};
constexpr uint height
{
768
};
constexpr uint lanes
{
simd::lanes<f32x4>()
};
for(uint j(0); j < width / lanes; ++j)
out[j] += bias[j];
static const math::fmma_opts opts
{
width,
height,
2U,
};
math::fmma<opts>(out, in, weight);
}
void
ircd::gpt::fmma3(f32x4 *const __restrict__ out,
const f32x4 *const __restrict__ in,
const f32x4 *const __restrict__ bias,
const f32x4 *const __restrict__ weight)
{
constexpr uint width
{
3072
};
constexpr uint height
{
768
};
constexpr uint lanes
{
simd::lanes<f32x4>()
};
for(uint j(0); j < width / lanes; ++j)
out[j] += bias[j];
static const math::fmma_opts opts
{
width,
height,
2U,
'y',
};
math::fmma<opts>(out, in, weight);
}
void
ircd::gpt::fmma4(f32x4 *const __restrict__ out,
const f32x4 *const __restrict__ in,
const f32x4 *const __restrict__ bias,
const f32x4 *const __restrict__ weight)
{
constexpr uint width
{
3072
};
constexpr uint height
{
768
};
constexpr uint lanes
{
simd::lanes<f32x4>()
};
for(uint j(0); j < height / lanes; ++j)
out[j] += bias[j];
static const math::fmma_opts opts
{
width,
height,
2U,
};
math::fmma<opts>(out, in, weight);
math::fmma(out, in, weight, opts);
}
void

View file

@ -10,8 +10,8 @@
inline void
ircd_gpt_norm_fmad(__local float4 *const out,
__local const float4 *const in,
ircd_gpt_norm_fmad(__local float4 *const restrict out,
__local const float4 *const restrict in,
__global const float4 *const restrict bias,
__global const float4 *const restrict weight,
const uint i)
@ -27,35 +27,26 @@ ircd_gpt_sgemv(__local float4 *const restrict out,
__global const float4 *const restrict weight,
const uint width,
const uint height,
const uint tiles,
const uint i)
{
const uint seg = height / tiles;
float4 acc = bias[i];
for(uint j = 0; j < seg; ++j)
for(uint t = 0; t < tiles; ++t)
for(uint k = 0; k < 4; ++k)
{
const uint
jidx = t * seg + j,
kidx = jidx * 4 + k,
widx = kidx * width + i;
acc += weight[widx] * in[jidx][k];
}
for(uint j = 0; j < height; ++j)
for(uint k = 0; k < 4; ++k)
acc += in[j][k] * weight[width * (j * 4 + k) + i];
out[i] = acc;
}
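
The simplification above removes the tiling parameter: every work-item i now walks the full input and accumulates one float4 column of the output, with the weight stored row-major at `width` float4 columns per scalar input element. A scalar host-side reference of the same product, with width/height counting scalars rather than float4s (a sketch, not the device code):

// Scalar reference of the matrix-vector product in ircd_gpt_sgemv above;
// here width/height count scalars (4x the kernel's float4 arguments).
static void sgemv_ref(float *out, const float *in,
                      const float *bias, const float *weight,
                      unsigned width, unsigned height)
{
    for(unsigned i = 0; i < width; ++i)
    {
        float acc = bias[i];
        for(unsigned j = 0; j < height; ++j)
            acc += in[j] * weight[j * width + i];  // row-major, width columns
        out[i] = acc;
    }
}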
/// Gaussian Error Linear Unit
inline void
ircd_gpt_gelu(__local float4 *const out,
__local const float4 *const in_,
const uint i)
ircd_gpt_ffnn_gelu(__local float4 *const out,
__local const float4 *const in_,
const uint i)
{
float4 a,
const float4
in = in_[i];
float4 a;
a = 0.044715f;
a *= in;
a *= in;
@ -71,14 +62,35 @@ ircd_gpt_gelu(__local float4 *const out,
out[i] = a;
}
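
The renamed ircd_gpt_ffnn_gelu appears to evaluate the usual tanh approximation of the Gaussian Error Linear Unit, GELU(x) ≈ 0.5·x·(1 + tanh(√(2/π)·(x + 0.044715·x³))); the 0.044715 constant above is the cubic coefficient of that approximation. A scalar reference for comparison (a sketch, not the kernel):

#include <cmath>

// Scalar reference of the tanh-approximated GELU computed above.
static float gelu_ref(const float x)
{
    const float k = 0.7978845608f;                        // sqrt(2 / pi)
    const float inner = k * (x + 0.044715f * x * x * x);
    return 0.5f * x * (1.0f + std::tanh(inner));
}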
//
// core
//
inline void
__attribute__((always_inline))
ircd_gpt_ffnn_fcon(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__local union ircd_gpt_ffnn_aperaturev *const restrict out,
__local const union ircd_gpt_tokenv *const in,
__global const float4 *const restrict bias,
__global const float4 *const restrict weight)
{
const uint
li = get_local_id(0),
ln = get_local_size(0),
width = opts->ffnn_width,
height = opts->ffnn_height;
__kernel void
for(uint i = 0; i < 4; ++i)
ircd_gpt_sgemv(out->fcon, in->word, bias, weight, width, height, i * ln + li);
for(uint i = 0; i < 4; ++i)
ircd_gpt_ffnn_gelu(out->fcon, out->fcon, i * ln + li);
}
inline void
__attribute__((always_inline))
ircd_gpt_ffnn(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__global union ircd_gpt_tokenv *const restrict accum,
__local union ircd_gpt_tokenv *const restrict token,
__local union ircd_gpt_tokenv *const restrict tmp,
__local union ircd_gpt_ffnn_aperaturev *const restrict buf,
__global const float4 *const restrict norm_bias,
__global const float4 *const restrict norm_weight,
__global const float4 *const restrict fcon_bias,
@ -92,72 +104,34 @@ ircd_gpt_ffnn(__global const struct ircd_gpt_task *const ctrl,
li = get_local_id(0),
ln = get_local_size(0),
wi = get_group_id(0),
wn = get_num_groups(0);
__local union ircd_gpt_aperaturev token;
__local float4 tmp[768/4];
// Fetch local copy of the global accumulator. We operate on a cached
// copy as input, and add our output to the global upon completion.
token.word[li] = accum[wi].word[li];
wn = get_num_groups(0),
width = opts->ffnn_width,
height = opts->ffnn_height;
// Layer re-normalization
ircd_simt_math_norm_f4lldr(token.word, token.word, tmp, ln, li);
ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
ircd_simt_math_norm_f4lldr(token->word, token->word, buf->word, ln, li);
ircd_gpt_norm_fmad(tmp->word, token->word, norm_bias, norm_weight, li);
// ln's writes are still pending but fcon reads results across threads.
barrier(CLK_LOCAL_MEM_FENCE);
// Fully connected
for(uint i = 0; i < 4; ++i)
ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 3072/4, 768/4, 4, i * ln + li);
ircd_gpt_ffnn_fcon(ctrl, opts, buf, tmp, fcon_bias, fcon_weight);
// Gaussian Error Linear Unit
for(uint i = 0; i < 4; ++i)
ircd_gpt_gelu(token.fcon, token.fcon, i * ln + li);
// Projection
ircd_gpt_sgemv(tmp, token.fcon, proj_bias, proj_weight, 768/4, 3072/4, 4, li);
// Accumulation; end of layer
accum[wi].word[li] += tmp[li];
}
__kernel void
ircd_gpt_attn_proj(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__global union ircd_gpt_tokenv *const restrict accum,
__local const union ircd_gpt_tokenv *const restrict xattn,
__global const float4 *const restrict proj_bias,
__global const float4 *const restrict proj_weight)
{
const uint
gi = get_global_id(0),
gn = get_global_size(0),
li = get_local_id(0),
ln = get_local_size(0),
wi = get_group_id(0),
wn = get_num_groups(0);
__local float4
in[768/4],
out[768/4];
// Fetch
in[li] = xattn->word[li];
// Need this here if xattn is __local
// fcon's writes are still pending but proj reads results across threads.
barrier(CLK_LOCAL_MEM_FENCE);
// Projection
ircd_gpt_sgemv(out, in, proj_bias, proj_weight, 768/4, 768/4, 1, li);
// Accumulation; end of layer
accum[wi].word[li] += out[li];
ircd_gpt_sgemv(token->word, buf->fcon, proj_bias, proj_weight, height, width, li);
}
__kernel void
inline void
__attribute__((always_inline))
ircd_gpt_attn_self(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__local union ircd_gpt_tokenv *const restrict out,
__global const struct ircd_gpt_qkvv *const restrict token,
__local union ircd_gpt_tokenv *const restrict tmp,
__global const struct ircd_gpt_attn_qkvv *const restrict token,
__global const struct ircd_gpt_attn_mask *const restrict mask) // [1024][1024],
{
const uint
@ -168,18 +142,13 @@ ircd_gpt_attn_self(__global const struct ircd_gpt_task *const ctrl,
wi = get_group_id(0),
wn = get_num_groups(0);
__local union
{
float
attn[12][96];
}
self;
__local union ircd_gpt_token *const restrict self = &tmp->token;
for(uint i = 0; i < wn; ++i)
if(mask[wi].token[i])
self.attn[li][i] = 0.0f;
self->attn[li][i] = 0.0f;
else
self.attn[li][i] = -10000.0f;
self->attn[li][i] = -10000.0f;
for(uint i = 0; i < wn; ++i)
if(mask[wi].token[i])
@ -190,40 +159,153 @@ ircd_gpt_attn_self(__global const struct ircd_gpt_task *const ctrl,
key = token[i].key.attn[li][j],
res = qry * key;
for(uint k = 0; k < 4; ++k)
self.attn[li][i] += res[k];
self->attn[li][i] += res[k];
}
for(uint i = 0; i < wn; ++i)
if(mask[wi].token[i])
self.attn[li][i] /= 8.0f;
self->attn[li][i] /= 8.0f;
float mu = -10000.0f;
for(uint i = 0; i < wn; ++i)
mu = max(mu, self->attn[li][i]);
for(uint i = 0; i < wn; ++i)
self.attn[li][i] = exp(self.attn[li][i]);
self->attn[li][i] = exp(self->attn[li][i] - mu);
float4 vacc = 0.0f;
float sum = 0.0f;
for(uint i = 0; i < wn; ++i)
vacc[i % 4] += self.attn[li][i];
float acc = 0.0f;
for(uint i = 0; i < 4; ++i)
acc += vacc[i];
sum += self->attn[li][i];
const float lambda = 1.0f / sum;
for(uint i = 0; i < wn; ++i)
self.attn[li][i] /= acc;
self->attn[li][i] *= lambda;
for(uint j = 0; j < 64/4; ++j)
out->attn[li][j] = 0.0f;
for(uint i = 0; i < wn; ++i)
for(uint j = 0; j < 64/4; ++j)
out->attn[li][j] += token[i].val.attn[li][j] * self.attn[li][i];
out->attn[li][j] += token[i].val.attn[li][j] * self->attn[li][i];
}
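
Per head, the self-attention backend now applies the max-shifted softmax: scores are masked, scaled by 1/8 (1/√64 for the 64-wide head), shifted by their maximum mu before exponentiation, and scaled by lambda = 1/sum when weighting the values. A scalar single-row reference of the same computation (a sketch with illustrative names; n is the number of visible tokens, at most 1024):

#include <cmath>

// Scalar reference of one attention row for one 64-wide head, mirroring the
// masked, max-shifted softmax in ircd_gpt_attn_self above.
static void attn_row_ref(float (&out)[64],
                         const float (&qry)[64],
                         const float (*key)[64],   // [n][64]
                         const float (*val)[64],   // [n][64]
                         const bool *mask,         // causal mask row
                         const unsigned n)         // visible tokens (<= 1024)
{
    float attn[1024], mu = -10000.0f;
    for(unsigned i = 0; i < n; ++i)
    {
        float score = -10000.0f;
        if(mask[i])
        {
            score = 0.0f;
            for(unsigned k = 0; k < 64; ++k)
                score += qry[k] * key[i][k];
            score /= 8.0f;                          // 1 / sqrt(64)
        }
        attn[i] = score;
        mu = std::fmax(mu, score);
    }

    float sum = 0.0f;
    for(unsigned i = 0; i < n; ++i)
        sum += attn[i] = std::exp(attn[i] - mu);    // shift by max before exp

    const float lambda = 1.0f / sum;
    for(unsigned k = 0; k < 64; ++k)
        out[k] = 0.0f;

    for(unsigned i = 0; i < n; ++i)
        for(unsigned k = 0; k < 64; ++k)
            out[k] += val[i][k] * attn[i] * lambda;
}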
inline void
__attribute__((always_inline))
ircd_gpt_attn_proj(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__local union ircd_gpt_tokenv *const out,
__local const union ircd_gpt_tokenv *const xattn,
__global const float4 *const restrict bias,
__global const float4 *const restrict weight)
{
const uint
gi = get_global_id(0),
gn = get_global_size(0),
li = get_local_id(0),
ln = get_local_size(0),
wi = get_group_id(0),
wn = get_num_groups(0),
height = opts->attn_height,
width = opts->attn_height; // same
// Projection
ircd_gpt_sgemv(out->word, xattn->word, bias, weight, height, width, li);
}
__kernel void
ircd_gpt_coil(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__global union ircd_gpt_tokenv *const restrict accum,
__global const struct ircd_gpt_attn_qkvv *const restrict state,
__global const struct ircd_gpt_attn_mask *const restrict mask, // [1024][1024],
__global const float4 *const restrict attn_proj_bias,
__global const float4 *const restrict attn_proj_weight,
__global const float4 *const restrict ffnn_norm_bias,
__global const float4 *const restrict ffnn_norm_weight,
__global const float4 *const restrict ffnn_fcon_bias,
__global const float4 *const restrict ffnn_fcon_weight,
__global const float4 *const restrict ffnn_proj_bias,
__global const float4 *const restrict ffnn_proj_weight)
{
const uint
li = get_local_id(0),
wi = get_group_id(0);
__local union ircd_gpt_ffnn_aperaturev
ffnn_fcon;
__local union ircd_gpt_tokenv
buf0, buf1;
// Self-attention backend; this computes the self-attention result now
// that keys and values are globally visible across tokens.
ircd_gpt_attn_self
(
ctrl,
opts,
&buf1,
&buf0,
state,
mask
);
// Self-attention's writes are pending on each thread but each proj
// call requires results from all threads for input to the matmul.
barrier(CLK_LOCAL_MEM_FENCE);
// Project result of self-attention.
ircd_gpt_attn_proj
(
ctrl,
opts,
&buf0,
&buf1,
attn_proj_bias,
attn_proj_weight
);
// Frontend accumulation
{
const float4
attn = buf0.word[li],
resid = accum[wi].word[li];
buf0.word[li] += resid;
accum[wi].word[li] += attn;
}
// Backend mlp; layer-norm acquires any pending writes, no fence required.
ircd_gpt_ffnn
(
ctrl,
opts,
&buf0,
&buf1,
&ffnn_fcon,
ffnn_norm_bias,
ffnn_norm_weight,
ffnn_fcon_bias,
ffnn_fcon_weight,
ffnn_proj_bias,
ffnn_proj_weight
);
// Backend accumulation
{
const float4
ffnn = buf0.word[li],
resid = accum[wi].word[li],
result = ffnn + resid;
accum[wi].word[li] = result;
}
}
__kernel void
ircd_gpt_attn_fcon(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__global union ircd_gpt_aperaturev *const restrict out,
__global const union ircd_gpt_tokenv *const restrict in,
__global union ircd_gpt_attn_aperaturev *const restrict state,
__global const union ircd_gpt_tokenv *const restrict accum,
__global const float4 *const restrict norm_bias,
__global const float4 *const restrict norm_weight,
__global const float4 *const restrict fcon_bias,
@ -235,74 +317,32 @@ ircd_gpt_attn_fcon(__global const struct ircd_gpt_task *const ctrl,
li = get_local_id(0),
ln = get_local_size(0),
wi = get_group_id(0),
wn = get_num_groups(0);
wn = get_num_groups(0),
width = opts->attn_width,
height = opts->attn_height;
__local union ircd_gpt_aperaturev token;
__local float4 tmp[768/4];
__local union ircd_gpt_attn_aperaturev
token;
token.word[li] = in[wi].word[li];
__local float4
tmp[768/4];
token.word[li] = accum[wi].word[li];
// Layer re-normalization
ircd_simt_math_norm_f4lldr(token.word, token.word, tmp, ln, li);
ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
// Ln's writes are still pending; fcon requires results across threads.
barrier(CLK_LOCAL_MEM_FENCE);
// Fully connected
for(uint i = 0; i < 3; ++i)
ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 2304/4, 768/4, 4, i * ln + li);
ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, width, height, i * ln + li);
// Export queries, keys, and values.
for(uint i = 0; i < 3; ++i)
out[wi].proj[i][li] = token.proj[i][li];
}
__kernel void
ircd_gpt_coil(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__global union ircd_gpt_tokenv *const restrict accum,
__global const struct ircd_gpt_qkvv *const restrict state,
__global const struct ircd_gpt_attn_mask *const restrict mask, // [1024][1024],
__global const float4 *const restrict attn_proj_bias,
__global const float4 *const restrict attn_proj_weight,
__global const float4 *const restrict ffnn_norm_bias,
__global const float4 *const restrict ffnn_norm_weight,
__global const float4 *const restrict ffnn_fcon_bias,
__global const float4 *const restrict ffnn_fcon_weight,
__global const float4 *const restrict ffnn_proj_bias,
__global const float4 *const restrict ffnn_proj_weight)
{
__local union ircd_gpt_tokenv value;
ircd_gpt_attn_self
(
ctrl,
opts,
&value,
state,
mask
);
ircd_gpt_attn_proj
(
ctrl,
opts,
accum,
&value,
attn_proj_bias,
attn_proj_weight
);
ircd_gpt_ffnn
(
ctrl,
opts,
accum,
ffnn_norm_bias,
ffnn_norm_weight,
ffnn_fcon_bias,
ffnn_fcon_weight,
ffnn_proj_bias,
ffnn_proj_weight
);
state[wi].proj[i][li] = token.proj[i][li];
}
//
@ -320,7 +360,8 @@ _ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
const uint word_idx)
{
const ushort
token = ctrl->token[(ctrl->head + tok_idx) % opts->buffer_tokens];
ring_idx = (ctrl->head + tok_idx) % opts->buffer_tokens,
token = ctrl->token[ring_idx];
const float4
wte = vocab[token].word[word_idx],
@ -362,9 +403,9 @@ ircd_gpt_lm_norm(__global const struct ircd_gpt_task *const ctrl,
// Final re-normalization
ircd_simt_math_norm_f4lldr(token.word, token.word, tmp.word, ln, li);
ircd_gpt_norm_fmad(token.word, token.word, norm_bias, norm_weight, li);
ircd_gpt_norm_fmad(tmp.word, token.word, norm_bias, norm_weight, li);
accum[0].word[li] = token.word[li];
accum[wi].word[li] = tmp.word[li];
}
__kernel void
@ -375,31 +416,122 @@ ircd_gpt_lm_logit(__global const struct ircd_gpt_task *const ctrl,
__global const union ircd_gpt_tokenv *const restrict token)
{
const uint
gi = get_global_id(0);
gi = get_global_id(0),
ti = ctrl->tokens - 1,
words = opts->embed_width;
float4 acc = 0.0f;
for(uint j = 0; j < 768/4; ++j)
__attribute__((opencl_unroll_hint))
for(uint j = 0; j < words; ++j)
{
const float4
in = accum[0].word[j],
vocab = token[gi].word[j],
res = vocab * in;
in = accum[ti].word[j],
vocab = token[gi].word[j];
acc += res;
acc += vocab * in;
}
float res = 0.0f;
for(uint k = 0; k < 4; ++k)
res += acc[k];
logit[gi] = res;
if(gi < opts->logits)
logit[gi] = res;
else
logit[gi] = -10000.0f;
}
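
lm_logit now reads the final hidden state of the last token (accum[tokens - 1]) rather than slot 0, dots it with every vocabulary embedding (the tied wte matrix), and pins any index at or beyond opts->logits to -10000 so the padded tail of the aligned-up global range can never be selected. A scalar reference of a single logit (a sketch; names illustrative):

// Reference of one language-model logit: the dot product of the last token's
// normalized hidden state with the embedding of vocabulary entry `v` (tied wte).
static float lm_logit_ref(const float (&hidden)[768],
                          const float (&wte_v)[768])
{
    float acc = 0.0f;
    for(unsigned j = 0; j < 768; ++j)
        acc += hidden[j] * wte_v[j];
    return acc;
}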
__kernel void
ircd_gpt_lm_logsm(__global struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__global float4 *const restrict logsm,
__global float4 *const restrict logexp,
__global const float4 *const restrict logit)
{
const uint
gi = get_global_id(0),
li = get_local_id(0),
ln = get_local_size(0),
logits = opts->logits,
logits_alignup = logits + (ln - (logits % ln)),
tn = logits_alignup / ln / 4,
ti = tn * li;
__local float share[256];
__local float4 share4[256];
share4[li] = -10000.0f;
for(uint i = ti; i < ti + tn; ++i)
share4[li] = max(share4[li], logit[i]);
share[li] = -10000.0f;
for(uint k = 0; k < 4; ++k)
share[li] = max(share[li], share4[li][k]);
ircd_simt_reduce_max_flldr(share, ln, li);
if(li == 0)
share4[li] = ctrl->samax_mu = share[li];
ircd_simt_broadcast_f4lldr(share4, ln, li);
const float4
mu = share4[li];
share4[li] = 0.0f;
for(uint i = ti; i < ti + tn; ++i)
{
const float4
reg = logit[i] - mu,
res = exp(reg);
for(uint k = 0; k < 4; ++k)
if(i * 4 + k < logits)
share4[li][k] += res[k];
for(uint k = 0; k < 4; ++k)
if(i * 4 + k < logits)
logexp[i][k] = res[k];
else
logexp[i][k] = 0.0f;
}
ircd_simt_reduce_add_f4lldr(share4, ln, li);
if(li == 0)
{
float sum = 0.0f;
for(uint k = 0; k < 4; ++k)
sum += share4[li][k];
share4[li][0] = ctrl->samax_sum = sum;
share4[li][1] = ctrl->samax_lambda = 1.0f / sum;
}
ircd_simt_broadcast_f4lldr(share4, ln, li);
const float4
sum = share4[li][0],
lambda = share4[li][1];
for(uint i = ti; i < ti + tn; ++i)
for(uint k = 0; k < 4; ++k)
if(i * 4 + k < logits)
logsm[i] = logexp[i] * lambda;
else
logsm[i] = 0.0f;
}
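
The new lm_logsm kernel turns the raw logits into a distribution with the max-shifted (numerically safe) softmax, and stashes the intermediates mu (maximum logit), sum, and lambda = 1/sum into the control page for the loss computation downstream. A single-threaded reference of the same result (a sketch, not the work-group version):

#include <cmath>

// Single-threaded reference of the safe softmax computed by ircd_gpt_lm_logsm;
// logit, logexp and logsm each hold `logits` floats (50257 by default).
static void logsm_ref(float *logsm, float *logexp,
                      const float *logit, const unsigned logits)
{
    float mu = -10000.0f;
    for(unsigned i = 0; i < logits; ++i)
        mu = std::fmax(mu, logit[i]);                 // ctrl->samax_mu

    float sum = 0.0f;
    for(unsigned i = 0; i < logits; ++i)
        sum += logexp[i] = std::exp(logit[i] - mu);   // ctrl->samax_sum

    const float lambda = 1.0f / sum;                  // ctrl->samax_lambda
    for(unsigned i = 0; i < logits; ++i)
        logsm[i] = logexp[i] * lambda;
}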
inline void
__attribute__((always_inline))
ircd_gpt_leave(__global struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
const uint li)
{
// If the call value has been set to something other than default we
// do nothing else here.
if(ctrl->call != IRCD_GPT_ECOMPLETE)
return;
// No action for other threads right now
if(li != 0)
return;
@ -411,11 +543,6 @@ ircd_gpt_leave(__global struct ircd_gpt_task *const ctrl,
ctrl->call = IRCD_GPT_ETOKENS;
#endif
// If the call value has been set to something other than default we
// do nothing else here.
if(ctrl->call != IRCD_GPT_ECOMPLETE)
return;
// On the last cycle, with no prior call or error code set, indicate
// a nominal exit condition.
if(ctrl->cycle + 1 >= opts->limit)
@ -425,48 +552,83 @@ ircd_gpt_leave(__global struct ircd_gpt_task *const ctrl,
}
ctrl->cycle += 1;
ctrl->magic = 0xC7012C70U;
}
inline void
__attribute__((always_inline))
ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
const uint li,
__local const ushort *const restrict idx)
__local const ushort *const restrict idx,
__global const float *const restrict logsm,
__global const float *const restrict logexp,
__global const float *const restrict logit)
{
// When the hypercall code is already set, bail here.
if(ctrl->call != IRCD_GPT_ECOMPLETE)
return;
// To read from cells other than idx[0] we need this barrier.
if(opts->top_k > 1)
barrier(CLK_LOCAL_MEM_FENCE);
// No action for other threads right now
// Mask for write-leader
if(li != 0)
return;
// When the hypercall code is already set, bail here.
if(ctrl->call != IRCD_GPT_ECOMPLETE)
return;
const bool
buffer_full = ctrl->tokens >= opts->buffer_tokens;
const ulong
rnd = ircd_simt_rand_xoshiro256pg(ctrl->rand),
sel = rnd % max(opts->top_k, 1U);
rnd = opts->top_k > 1?
ircd_simt_rand_xoshiro256pg(ctrl->rand): 1UL;
const ushort
token = idx[sel],
token_idx = (ctrl->head + ctrl->tokens) % opts->buffer_tokens;
entro = max(opts->top_k, 1U),
select = rnd % entro,
token = idx[select],
dest = (ctrl->head + ctrl->tokens) % opts->buffer_tokens,
tokens = min(ctrl->tokens + 1, opts->buffer_tokens),
head = buffer_full?
(ctrl->head + 1) % opts->buffer_tokens: ctrl->head;
ctrl->token[token_idx] = token;
const ushort
next_select = select + 1,
next_token = idx[next_select];
if(buffer_full)
ctrl->head = (ctrl->head + 1) % opts->buffer_tokens;
else
ctrl->tokens++;
const float
test_lsm = logexp[opts->label] * ctrl->samax_lambda,
loss = 0.0f - log(test_lsm * ctrl->samax_lambda),
perp = logsm[token] * 100.0f,
cert = ((logsm[token] - logsm[next_token]) / logsm[token]) * 100.0f,
loss_sum = ctrl->loss_sum + loss,
perp_sum = ctrl->perp_sum + perp,
cert_sum = ctrl->cert_sum + cert,
mean_div = ctrl->epoch + 1.0f,
loss_mean = loss_sum / mean_div,
perp_mean = perp_sum / mean_div,
cert_mean = cert_sum / mean_div;
ctrl->loss = loss;
ctrl->loss_sum = loss_sum;
ctrl->loss_mean = loss_mean;
ctrl->perp = perp;
ctrl->perp_sum = perp_sum;
ctrl->perp_mean = perp_mean;
ctrl->cert = cert;
ctrl->cert_sum = cert_sum;
ctrl->cert_mean = cert_mean;
ctrl->head = head;
ctrl->tokens = tokens;
ctrl->token[dest] = token;
}
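
Before appending the selected token to the ring buffer, lm_result now derives the per-cycle quality figures: loss as the negative log of the probability assigned to the training label, the perplexity figure as the selected token's softmax probability in percent, and certainty as the relative margin between the top choice and the runner-up, each with a running sum and mean over epochs. A hedged scalar sketch of that bookkeeping, written in the conventional cross-entropy form (field names follow ircd_gpt_task; p[] is the softmax distribution):

#include <cmath>

// Sketch of the statistics update in ircd_gpt_lm_result; p[] is the softmax
// (logsm), label the training target, token the selection, next the runner-up.
static void result_stats_ref(struct ircd_gpt_task &ctrl, const float *p,
                             const unsigned label, const unsigned token,
                             const unsigned next)
{
    const float loss = 0.0f - std::log(p[label]);                  // cross-entropy vs label
    const float perp = p[token] * 100.0f;                          // confidence in the pick
    const float cert = ((p[token] - p[next]) / p[token]) * 100.0f; // margin over runner-up
    const float div = ctrl.epoch + 1.0f;                           // cycles contributing

    ctrl.loss = loss;  ctrl.loss_sum += loss;  ctrl.loss_mean = ctrl.loss_sum / div;
    ctrl.perp = perp;  ctrl.perp_sum += perp;  ctrl.perp_mean = ctrl.perp_sum / div;
    ctrl.cert = cert;  ctrl.cert_sum += cert;  ctrl.cert_mean = ctrl.cert_sum / div;
}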
__kernel void
ircd_gpt_lm_select(__global struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__global const float *const restrict logsm,
__global const float *const restrict logexp,
__global const float *const restrict logit)
{
const uint
@ -476,17 +638,17 @@ ircd_gpt_lm_select(__global struct ircd_gpt_task *const ctrl,
ln = get_local_size(0),
wi = get_group_id(0),
wn = get_num_groups(0),
tn = 262,
tn = opts->logits / ln,
ti = tn * li;
__local ushort idx[192];
__local ushort idx[256];
idx[li] = ti;
for(uint j = ti + 1; j < ti + tn && j < 50257; ++j)
if(logit[j] > logit[idx[li]])
for(uint j = ti + 1; j < ti + tn; ++j)
if(logsm[j] > logsm[idx[li]])
idx[li] = j;
ircd_simt_sort_idx16_flldr(idx, logit, ln, li);
ircd_gpt_lm_result(ctrl, opts, li, idx);
ircd_simt_sort_idx16_flldr(idx, logsm, ln, li);
ircd_gpt_lm_result(ctrl, opts, li, idx, logsm, logexp, logit);
ircd_gpt_leave(ctrl, opts, li);
}

View file

@ -32,20 +32,25 @@ namespace ircd::gpt::model
init_h_attn_proj_bias(decoder &, const string_view &, const size_t &, const json::array &),
init_h_attn_bias(decoder &, const string_view &, const size_t &, const json::array &);
static bool init_dataset(const string_view &);
static bool init_from_cache(const string_view &);
static void init_from_json_handle(decoder &, const init_handler &, const size_t &);
static void init_from_json(const string_view &, const string_view &);
static void init() noexcept;
static void init(), fini() noexcept;
extern const init_handler
manifest[],
manifest_h[],
manifest_td[];
manifest_h[];
extern conf::item<std::string> path;
extern conf::item<std::string> cache_path;
extern conf::item<std::string>
path,
cache_path,
dataset_path;
static fs::map
default_model_shm,
default_dataset_shm;
static fs::map default_model_shm;
static std::unique_ptr<decoder> default_model_res;
}
@ -76,14 +81,6 @@ ircd::gpt::model::manifest
{ "wte.weight.json", init_wte_weight },
};
decltype(ircd::gpt::model::manifest_td)
ircd::gpt::model::manifest_td
{
{ "test.jsonl", nullptr, },
{ "valid.jsonl", nullptr, },
{ "train.jsonl", nullptr, },
};
decltype(ircd::gpt::model::cache_path)
ircd::gpt::model::cache_path
{
@ -91,6 +88,13 @@ ircd::gpt::model::cache_path
{ "default", "model.cache.localhost" },
};
decltype(ircd::gpt::model::dataset_path)
ircd::gpt::model::dataset_path
{
{ "name", "ircd.gpt.model.dataset.path" },
{ "default", string_view{} },
};
decltype(ircd::gpt::model::path)
ircd::gpt::model::path
{
@ -104,15 +108,35 @@ ircd::gpt::model::path
decltype(ircd::gpt::model::default_model)
ircd::gpt::model::default_model;
decltype(ircd::gpt::model::default_dataset)
ircd::gpt::model::default_dataset;
decltype(ircd::gpt::model::default_data)
ircd::gpt::model::default_data;
void
ircd::gpt::model::init()
noexcept
{
if(!model::path)
return;
if(!init_from_cache(model::cache_path))
init_from_json(model::cache_path, model::path);
if(model::dataset_path)
init_dataset(model::dataset_path);
}
void
ircd::gpt::model::fini()
noexcept
{
default_model = nullptr;
default_model_shm = {};
default_dataset = nullptr;
default_data.clear();
default_dataset_shm = {};
}
bool
@ -170,10 +194,9 @@ ircd::gpt::model::init_from_json(const string_view &cache_path,
const string_view &model_path)
{
util::timer stopwatch;
auto decoder
{
std::make_unique<model::decoder>()
};
auto decoder(std::make_unique<model::decoder>());
memset(decoder.get(), 0x0, sizeof(model::decoder));
// Load the top level files, vocab etc
for(size_t i(0); i < 4; ++i)
@ -274,6 +297,55 @@ ircd::gpt::model::init_from_json_handle(decoder &d,
};
}
bool
ircd::gpt::model::init_dataset(const string_view &path)
{
if(!fs::is_reg(path))
return false;
const auto size
{
fs::size(path)
};
const fs::fd fd
{
path
};
fs::map::opts map_opts;
map_opts.huge2mb = true;
default_dataset_shm = fs::map
{
fd, map_opts, size
};
default_dataset = string_view
(
default_dataset_shm
);
size_t checkpoint(0);
default_data.resize(260000); //TODO: XXX
ircd::tokens(default_dataset, '\n', [&checkpoint]
(const string_view &line)
{
default_data.at(checkpoint++) = line;
});
char pbuf[48];
log::info
{
log, "dataset(%p) mapped `%s' %s @%lu",
data(default_dataset_shm),
path,
pretty(pbuf, iec(size)),
checkpoint,
};
return true;
}
void
ircd::gpt::model::init_wpe_weight(decoder &d,
const string_view &name,

View file

@ -32,7 +32,7 @@ decltype(ircd::gpt::pipe::flush_cycles)
ircd::gpt::pipe::flush_cycles
{
{ "name", "ircd.gpt.pipe.flush" },
{ "default", 0L, },
{ "default", 1L, },
};
decltype(ircd::gpt::pipe::default_model)
@ -73,20 +73,40 @@ ircd::gpt::pipe::init()
{
*pipe::default_code, *pipe::default_model
};
log::debug
{
log, "Pipe initialized from model:%p data:%p code:%p desc:%p",
&default_model,
pipe::default_model,
pipe::default_code,
pipe::default_desc,
};
}
void
ircd::gpt::pipe::fini()
noexcept
{
delete default_desc;
default_desc = nullptr;
const auto pending
{
cl::work::list.size()
};
delete default_code;
default_code = nullptr;
if(pending)
{
log::warning
{
log, "Waiting for %zu pending tasks to leave the pipe...",
pending,
};
delete default_model;
default_model = nullptr;
cl::sync();
}
delete default_desc; default_desc = nullptr;
delete default_code; default_code = nullptr;
delete default_model; default_model = nullptr;
}
//
@ -96,26 +116,29 @@ noexcept
void
ircd::gpt::generate(task &task)
{
if(unlikely(!pipe::default_model))
pipe::init();
assert(pipe::default_model);
assert(task.opts);
const auto &opts
{
*task.opts
};
assert(task.ctrl);
auto &ctrl
{
*task.ctrl
};
ctrl.cycle = 0;
ctrl.call = IRCD_GPT_ECOMPLETE;
ctrl.host_tsc = prof::cycles();
size_t cycle(ctrl.cycle);
const size_t tokens(ctrl.tokens);
volatile const size_t tokens(ctrl.tokens);
volatile const auto epoch(ctrl.epoch);
volatile size_t cycle(ctrl.cycle);
std::deque<pipe::exec> list;
for(; cycle < opts.limit; ++cycle)
for(; cycle < opts.limit && run::level == run::level::RUN; ++cycle)
{
// When the release/acquire bits are set the control pages are sent
// and received; only set on first and last iterations of this loop.
@ -137,7 +160,7 @@ ircd::gpt::generate(task &task)
pipe::flush_cycles
// Skip flushing on cycles already performing IO or waiting.
&& !rel && !acq && list.size() <= pipe::queue_cycles
&& !acq && list.size() <= pipe::queue_cycles
// The configuration item can specify an interval greater than
// one between flushes.
@ -167,6 +190,8 @@ ircd::gpt::generate(task &task)
// Wait for all unfinished
list.clear();
assert(ctrl.magic == 0xC7012C70);
// Interp error codes
if(unlikely(ctrl.call <= 0))
throw error
@ -176,7 +201,7 @@ ircd::gpt::generate(task &task)
reflect(ctrl.call),
};
always_assert(ctrl.cycle == cycle);
assert(ctrl.cycle == cycle);
}
void
@ -199,7 +224,7 @@ ircd::gpt::pipe::profile_dumplog(pipe::exec &exec)
log::logf
{
log, log::level::DEBUG,
"coil:%-2lu %8s %8s %8s %8s\n",
"coil:%-2lu %8s %8s %8s %8s",
i,
util::pretty(tmbuf[0], si(pro[0]), 1),
util::pretty(tmbuf[1], si(pro[1]), 1),
@ -259,13 +284,18 @@ ircd::gpt::pipe::exec::exec(task &task,
}
,range_lm_logit
{
{ 262 * 192UL, 0 }, // align_up(50257) / 192
{ 192UL, 0 },
{ 786 * 64UL, 0 }, // align_up(50257) / 64
{ 64UL, 0 },
}
,range_lm_logsm
{
{ 1 * 256UL, 0 },
{ 256UL, 0 },
}
,range_lm_select
{
{ 1 * 192UL, 0 },
{ 192UL, 0 },
{ 1 * 256UL, 0 },
{ 256UL, 0 },
}
,release_opts
{
@ -314,6 +344,10 @@ ircd::gpt::pipe::exec::exec(task &task,
{
desc->lm_logit, range_lm_logit, lmhead_opts
}
,lm_logsm
{
desc->lm_logsm, range_lm_logsm, lmhead_opts
}
,lm_select
{
desc->lm_select, range_lm_select, lmamax_opts
@ -342,8 +376,8 @@ ircd::gpt::pipe::code::compile_opts
" -cl-finite-math-only"
" -cl-unsafe-math-optimizations"
" -cl-fast-relaxed-math"
//" -cl-mad-enable"
//" -cl-single-precision-constant"
" -cl-mad-enable"
" -cl-single-precision-constant"
//" -cl-fp32-correctly-rounded-divide-sqrt"
};
@ -412,6 +446,16 @@ ircd::gpt::pipe::desc::desc(pipe::code &code,
65536 * sizeof(float),
mutable_buffer{}
}
,logexp
{
65536 * sizeof(float),
mutable_buffer{}
}
,logsm
{
65536 * sizeof(float),
mutable_buffer{}
}
,ctrl
{
sizeof(struct ircd_gpt_task),
@ -452,12 +496,24 @@ ircd::gpt::pipe::desc::desc(pipe::code &code,
accum,
model.embed->token,
}
,lm_logsm
{
code,
"ircd_gpt_lm_logsm",
ctrl,
opts,
logsm,
logexp,
logit,
}
,lm_select
{
code,
"ircd_gpt_lm_select",
ctrl,
opts,
logsm,
logexp,
logit,
}
,layer

View file

@ -17336,6 +17336,7 @@ console_cmd__gpt__raw(opt &out, const string_view &line)
};
out
<< text
<< output
<< std::endl;
return true;
@ -17347,6 +17348,14 @@ console_cmd__gpt(opt &out, const string_view &line)
return console_cmd__gpt__raw(out, line);
}
bool
console_cmd__gpt__pipe__reset(opt &out, const string_view &line)
{
gpt::pipe::fini();
gpt::pipe::init();
return true;
}
bool
console_cmd__gpt__token(opt &out, const string_view &line)
{
@ -17368,3 +17377,33 @@ console_cmd__gpt__token(opt &out, const string_view &line)
return true;
}
bool
console_cmd__gpt__data(opt &out, const string_view &line)
{
const params param{line, " ",
{
"id"
}};
const auto idx
{
param.at<uint>("id")
};
const json::object step
{
gpt::model::default_data.at(idx)
};
out
<< "#" << step["id"]
<< " | " << step["length"]
<< " " << (step["ended"]? "ended"_sv: string_view{})
<< std::endl
<< std::endl
<< step["text"]
<< std::endl;
return true;
}