diff --git a/include/ircd/gpt/ctrl.h b/include/ircd/gpt/ctrl.h index dad122ae2..54f7eafa9 100644 --- a/include/ircd/gpt/ctrl.h +++ b/include/ircd/gpt/ctrl.h @@ -11,89 +11,71 @@ #pragma once #define HAVE_IRCD_GPT_CTRL_H -/// Epoch Precision Interrupt Controller -/// -struct ircd_gpt_ctrl_epic -{ - /// Accumulates the number of task cycles. The cycle counter is incremented - /// by device software after each repetition of the kernel pipeline to - /// produce one additional token. - ulong cycle; - - /// Accumulates the epoch count for the task. The counter is incremented - /// by one in device software before control returns back to the host. - /// Several cycles may occur during each epoch. - ulong epoch; - - /// Accumulates the training epoch count for the task. The counter is - /// incremented by one in device software for each backward propagation. - ulong step; - - /// Updated by the host with the value of the timestamp register as sampled - /// immediately before each transfer of control to the device. - ulong host_tsc; - - /// Accumulates time in microseconds elapsed for the task. - ulong elapsed; -}; - -/// Token Context Buffer (Control Block) -/// -struct ircd_gpt_ctrl_tokens -{ - /// Token ring head. Tokens in the ring extend behind the head for - /// `tokens`. The `head` value is automatically modulated by device - /// software to wrap around the ring. - uint head; - - /// Token counter. The counter indicates the number of valid tokens in - /// the context buffer. This value must not exceed the buffer size. - uint count; - - /// Accumulates the number of tokens produced by the task. Several tokens - /// may be produced each epoch, but currently only one token is produced - /// each cycle. - ulong produced; - - /// Accumulates the number tokens witnessed by the task. The number of - /// tokens in the context for each cycle is counted as witnessed. - ulong witnessed; -}; - -/// Target label register (abridged) -/// +/// Result logit control block. 
struct ircd_gpt_ctrl_logit { /// Vocabulary token. ushort token; /// Padding #0. - ushort _pad0; + ushort flag; /// Result logit softmax probability. float samax; }; -/// Target label register (full) -/// +/// Target label control block. Results for each target are registered +/// and state is updated each cycle. struct ircd_gpt_ctrl_label { - /// Vocabulary token. - ushort token; - - /// Padding #0. - ushort _pad0; - - /// Result logit softmax probability. - float samax; + /// Logit descriptor + struct ircd_gpt_ctrl_logit logit; /// Loss state struct ircd_math_mean loss; /// Perplexity state - struct ircd_math_mean perp; -} -__attribute__((aligned(64))); + struct ircd_math_mean ppl; +}; + +/// Master clock +struct ircd_gpt_ctrl_clk +{ + /// Master clock. The cycle count is incremented by one in device software + /// after each repetition of the kernels producing one additional token. + /// The cycle count resets to zero before the beginning of each sample. + uint cycle; + + /// Master clock. Sample consists of one or more cycles; sample count is + /// incremented by one in device software after every accept condition, + /// growing monotonically for the `step`; resets to zero each `step`. + uint samp; + + /// Master clock. Step (or timestep) consists of one or more samples. Step + /// count is incremented by one in device software after each backward + /// propagation. Step grows monotonically even across epochs. + uint step; + + /// Master clock. Epoch consists of one or more steps; epoch count is + /// incremented by one after every backward propagation. + uint epoch; +}; + +/// Profiling block +struct ircd_gpt_ctrl_prof +{ + /// Host timestamp sampled at last control page transfer to the device. + ulong released; + + /// Host timestamp sampled when this control page is acquired by the host. + ulong acquired; + + /// Device timestamp at beginning of cycle. + ulong entered; + + /// Device timestamp at end of cycle. 
+ ulong finished; +}; /// Task Control Page /// @@ -104,50 +86,94 @@ __attribute__((aligned(64))); /// struct ircd_gpt_ctrl { - /// Epoch counting & interrupt control block. - struct ircd_gpt_ctrl_epic epic; + /// Accept register. If >= 0 the cycle produced a token which satisfies the + /// indicated accept condition. + int accept; - /// Token context control block. Contains state for the token context - /// buffer; the buffer with the tokens themselves is elsewhere. - struct ircd_gpt_ctrl_tokens tokens; + /// Dispatch register. Device software wishes additional cycles to be + /// commanded by the host. Effectively minimum distance until next accept. + uint dispatch; - /// Top result summary from the softed result logit softmax vector. This - /// is updated each cycle by device software with extended statistics on - /// the top N results. - struct ircd_gpt_ctrl_logit top[16]; + /// Token counter. The counter indicates the number of valid tokens in + /// the context buffer. This value must not exceed the opts.buffer_size. + /// This value should not exceed the opts.context_size at least for now. + uint count; - /// Target label control block. Results for each target are registered - /// and state is updated each cycle. - struct ircd_gpt_ctrl_label label[4]; + /// Token counter. The counter indicates the number of valid tokens in + /// the context buffer. This value must not exceed the opts.buffer_size. + /// This value should not exceed the opts.context_size at least for now. + uint tokens; - /// Result logit vector softmax internal state. - struct ircd_math_samax samax; + /// Master clock. + struct ircd_gpt_ctrl_clk clk; + + /// Profiling related. + struct ircd_gpt_ctrl_prof prof; /// PRNG xoshiro256 internal state (note: see opts.h to seed the prng). ulong rand[4]; - /// Perform backprop TODO: XXX - bool prop; + /// Top result summary from the softed result logit softmax vector. 
This + /// is updated each cycle by device software with extended statistics on + /// the top N results. + struct ircd_gpt_ctrl_logit top[16] __attribute__((aligned(8))); - /// Header magic 0xC7012C70 - uint magic; + /// User label control block. Results for each target are registered + /// and state is updated each cycle; averaged for each step. + struct ircd_gpt_ctrl_label label[14] __attribute__((aligned(64))); - /// The token buffer starts at offset 2048 and continues to the end of - /// the page; options specify the size of the tokens buffer in tokens. - /// Additional pages must be attached for larger buffer sizes. - ushort token[] __attribute__((aligned(2048))); + /// Target result label; traces training token. + struct ircd_gpt_ctrl_label target __attribute__((aligned(64))); + + /// Selected result token label. + struct ircd_gpt_ctrl_label select __attribute__((aligned(64))); + + /// Incremented when the target is the selected token. + uint hit, miss; + + /// Attention summary; [layer][head] => [token]. Each value points to a + /// position in the token buffer. The top-scoring softmax result for each + /// head in each layer is attending to the token.at(value) for this cycle. + /// These values are completely updated every cycle. + ushort attn[12][12]; + + /// Header magic: host sets 0xDEADBEEF before release to device; device + /// sets 0xC7012C7012 before release to host. 
+ ulong magic; + + /// Token buffer + ushort token[1024] __attribute__((aligned(2048))); } __attribute__((aligned(4096))); -#ifdef __cplusplus +#if defined(__cplusplus) namespace ircd::gpt { using ctrl = struct ircd_gpt_ctrl; + using ctrl_clk = struct ircd_gpt_ctrl_clk; + using ctrl_prof = struct ircd_gpt_ctrl_prof; + using ctrl_logit = struct ircd_gpt_ctrl_logit; + using ctrl_label = struct ircd_gpt_ctrl_label; + + string_view debug_token_at(const mutable_buffer &, const opts &, const ctrl &, const uint, const uint fmt = -1U); + string_view debug_token(const mutable_buffer &, const opts &, const ctrl &, const uint fmt = -1U); + string_view debug_head(const mutable_buffer &, const opts &, const ctrl_clk &); + string_view debug_head(const mutable_buffer &, const opts &, const ctrl &); + string_view debug(const mutable_buffer &, const opts &, const ctrl_logit &, const uint fmt = 0); + string_view debug(const mutable_buffer &, const opts &, const ctrl_label &, const uint fmt = 0); + string_view debug(const mutable_buffer &, const opts &, const ctrl &); + + string_view debug_attn(const mutable_buffer &, const opts &, const ctrl &, const uint); + string_view debug_label(const mutable_buffer &, const opts &, const ctrl &, const uint, const uint fmt = 0); + string_view debug_top(const mutable_buffer &, const opts &, const ctrl &, const uint); } #endif -#ifdef __cplusplus -static_assert(sizeof(struct ircd_gpt_ctrl) == 4096); +#if defined(__cplusplus) +static_assert(sizeof(struct ircd_gpt_ctrl) % 4096 == 0); +#endif + +#if defined(__cplusplus) && defined(__GLIBCXX__) static_assert(offsetof(struct ircd_gpt_ctrl, token) == 2048); static_assert(std::is_standard_layout::value); #endif diff --git a/include/ircd/gpt/epoch.h b/include/ircd/gpt/epoch.h new file mode 100644 index 000000000..a4b4b9433 --- /dev/null +++ b/include/ircd/gpt/epoch.h @@ -0,0 +1,38 @@ +// Matrix Construct +// +// Copyright (C) Matrix Construct Developers, Authors & Contributors +// Copyright (C) 2016-2022 
Jason Volk +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice is present in all copies. The +// full license for this software is available in the LICENSE file. + +#pragma once +#define HAVE_IRCD_GPT_EPOCH_H + +/// Perform one task epoch on the device. +/// +struct ircd::gpt::epoch +{ + gpt::task &task; + pipe::desc &desc; + + const gpt::opts &opts; + gpt::ctrl &ctrl; + + const uint id; + const size_t start, stop; + f32 *moment[2]; + + pipe::prof profile; + + void profile_accumulate(const pipe::prof &); + + public: + bool done() const noexcept; + bool operator()(); + + epoch(gpt::task &); + ~epoch() noexcept; +}; diff --git a/include/ircd/gpt/gpt.h b/include/ircd/gpt/gpt.h index f3b91a5d4..cef5fe9ae 100644 --- a/include/ircd/gpt/gpt.h +++ b/include/ircd/gpt/gpt.h @@ -17,16 +17,36 @@ namespace ircd::gpt { IRCD_EXCEPTION(ircd::error, error) + struct samp; + struct step; + struct epoch; struct task; extern log::log log; } #include "vocab.h" -#include "model.h" #include "token.h" +#include "vector.h" +#include "model.h" #include "opts.h" #include "ctrl.h" -#include "task.h" #include "pipe/pipe.h" +#include "samp.h" +#include "step.h" +#include "epoch.h" +#include "task.h" #include "generate.h" + +namespace ircd::gpt +{ + void backprop(const opts &, const u32, const f32, model::decoder &, f32 *const __restrict__ [2]) noexcept; + + void log_debug(const opts &, const ctrl &); + void log_debug_token(const opts &, const ctrl &, const uint); + void log_debug_attns(const opts &, const ctrl &); + void log_debug_attns_top(const opts &, const ctrl &); + void log_debug_labels(const opts &, const ctrl &); + void log_debug_topn(const opts &, const ctrl &); + void log_debug_prof(const opts &, const ctrl &, const pipe::prof &); +} diff --git a/include/ircd/gpt/gpu.h b/include/ircd/gpt/gpu.h new file mode 100644 index 
000000000..6568d8418 --- /dev/null +++ b/include/ircd/gpt/gpu.h @@ -0,0 +1,290 @@ +// Matrix Construct +// +// Copyright (C) Matrix Construct Developers, Authors & Contributors +// Copyright (C) 2016-2022 Jason Volk +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice is present in all copies. The +// full license for this software is available in the LICENSE file. + +typedef union ircd_gpt_vector_f32x4 ircd_gpt_vectorv __attribute__((aligned(4096))); +typedef struct ircd_gpt_attn_qkv_f32x4 ircd_gpt_attn_qkvv __attribute__((aligned(4096))); +typedef union ircd_gpt_attn_aperature_f32x4 ircd_gpt_attn_aperaturev __attribute__((aligned(4096))); +typedef union ircd_gpt_ffnn_aperature_f32x4 ircd_gpt_ffnn_aperaturev __attribute__((aligned(4096))); + +// +// Frontside +// + +void +__attribute__((internal_linkage)) +ircd_gpt_norm_fmad(__local ircd_gpt_vectorv *out, + __local const ircd_gpt_vectorv *in, + __global const ircd_gpt_vectorv *bias, + __global const ircd_gpt_vectorv *weight, + const uint li); + +void +__attribute__((internal_linkage)) +ircd_gpt_norm(__local ircd_gpt_vectorv *out, + __local const ircd_gpt_vectorv *in, + __local ircd_gpt_vectorv *tmp, + __global const ircd_gpt_vectorv *bias, + __global const ircd_gpt_vectorv *weight, + const uint ln, + const uint li); + +void +__attribute__((internal_linkage)) +ircd_gpt_ffnn_gelu(__local ircd_gpt_ffnn_aperaturev *out, + __local const ircd_gpt_ffnn_aperaturev *in, + const uint i); + +void +__attribute__((internal_linkage)) +ircd_gpt_ffnn_fcon_tmul(__constant const struct ircd_gpt_opts *opts, + __local ircd_gpt_ffnn_aperaturev *out, + __local const ircd_gpt_vectorv *in, + __global const ircd_gpt_ffnn_aperaturev *bias, + __global const ircd_gpt_ffnn_aperaturev *weight, + const uint li); + +void +__attribute__((internal_linkage)) +ircd_gpt_ffnn_fcon(__global const struct 
ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __local ircd_gpt_ffnn_aperaturev *out, + __local const ircd_gpt_vectorv *in, + __global const ircd_gpt_ffnn_aperaturev *bias, + __global const ircd_gpt_ffnn_aperaturev *weight, + const uint ln, + const uint li); + +void +__attribute__((internal_linkage)) +ircd_gpt_ffnn_proj_tmul(__constant const struct ircd_gpt_opts *opts, + __local ircd_gpt_vectorv *out, + __local const ircd_gpt_ffnn_aperaturev *in, + __global const ircd_gpt_vectorv *bias, + __global const ircd_gpt_vectorv *weight, + const uint li); + +void +__attribute__((internal_linkage)) +ircd_gpt_ffnn(__global const struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __local ircd_gpt_vectorv *vector, + __local ircd_gpt_ffnn_aperaturev *buf, + __local ircd_gpt_vectorv *tmp, + __global const ircd_gpt_vectorv *norm_bias, + __global const ircd_gpt_vectorv *norm_weight, + __global const ircd_gpt_ffnn_aperaturev *fcon_bias, + __global const ircd_gpt_ffnn_aperaturev *fcon_weight, + __global const ircd_gpt_vectorv *proj_bias, + __global const ircd_gpt_vectorv *proj_weight, + const uint ln, + const uint li); + +void +__attribute__((internal_linkage)) +ircd_gpt_attn_fcon_tmul(__constant const struct ircd_gpt_opts *opts, + __local ircd_gpt_attn_aperaturev *out, + __local const ircd_gpt_vectorv *in, + __global const ircd_gpt_attn_aperaturev *bias, + __global const ircd_gpt_attn_aperaturev *weight, + const uint ln, + const uint li); + +__kernel void +__attribute__((visibility("protected"))) +ircd_gpt_attn_fcon(__global const struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __private const uint layer, + __global ircd_gpt_attn_aperaturev *state, + __global const ircd_gpt_vectorv *accum, + __global const ircd_gpt_vectorv *norm_bias, + __global const ircd_gpt_vectorv *norm_weight, + __global const ircd_gpt_attn_aperaturev *fcon_bias, + __global const ircd_gpt_attn_aperaturev *fcon_weight); + +// +// head 
+// + +void +__attribute__((internal_linkage)) +_ircd_gpt_lm_embed(__global const struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __global ircd_gpt_vectorv *accum, + __global const ircd_gpt_vectorv *pos, + __global const ircd_gpt_vectorv *vocab, + const ushort out_idx, + const ushort tok_idx, + const ushort word_idx); + +__kernel void +__attribute__((visibility("protected"))) +ircd_gpt_lm_embed(__global const struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __global ircd_gpt_vectorv *accum, + __global const ircd_gpt_vectorv *pos, + __global const ircd_gpt_vectorv *vocab); + +__kernel void +__attribute__((visibility("protected"))) +ircd_gpt_lm_norm(__global const struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __global ircd_gpt_vectorv *accum, + __global const ircd_gpt_vectorv *norm_bias, + __global const ircd_gpt_vectorv *norm_weight); + +__kernel void +__attribute__((visibility("protected"))) +ircd_gpt_lm_logit(__global const struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __global float *logit, + __global const ircd_gpt_vectorv *accum, + __global const ircd_gpt_vectorv *pos, + __global const ircd_gpt_vectorv *vocab); + +__kernel void +__attribute__((visibility("protected"))) +ircd_gpt_lm_logsm(__global struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __global float logit[65536]); + +void +__attribute__((internal_linkage)) +ircd_gpt_lm_result_top(__local struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __local const ushort *idx, + __global const float *logsm, + const uint i); + +void +__attribute__((internal_linkage)) +ircd_gpt_lm_result_label_mean(__local struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __local struct ircd_math_mean *mean, + const float last); + +void +__attribute__((internal_linkage)) +ircd_gpt_lm_result_label(__local struct ircd_gpt_ctrl *ctrl, + 
__constant const struct ircd_gpt_opts *opts, + __local struct ircd_gpt_ctrl_label *label, + __global const float *logsm); + +ushort +__attribute__((internal_linkage)) +ircd_gpt_lm_result_select(__local struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __local const ushort *idx, + __global const float *logsm); + +uint +__attribute__((internal_linkage)) +ircd_gpt_accept_len(__local struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + const uint i); + +uint +__attribute__((internal_linkage)) +ircd_gpt_accept_match(__local struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + const uint i); + +int +__attribute__((internal_linkage)) +ircd_gpt_accept_check(__local struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts); + +void +__attribute__((internal_linkage)) +ircd_gpt_accept(__local struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts); + +// +// backside +// + +void +__attribute__((internal_linkage)) +ircd_gpt_prop_elem(__global const struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __global float4 *param_, + __global float4 *exp_avg_, + __global float4 *exp_avg_sqr_); + +// +// backpropagations +// + +__kernel void +__attribute__((visibility("protected"))) +ircd_gpt_norm_prop(__global const struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __global ircd_gpt_vectorv *bias, + __global ircd_gpt_vectorv *bias_m0, + __global ircd_gpt_vectorv *bias_m1, + __global ircd_gpt_vectorv *weight, + __global ircd_gpt_vectorv *weight_m0, + __global ircd_gpt_vectorv *weight_m1); + +__kernel void +__attribute__((visibility("protected"))) +ircd_gpt_coil_prop_attn(__global const struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __global ircd_gpt_vectorv *norm_bias, + __global ircd_gpt_vectorv *norm_bias_m0, + __global ircd_gpt_vectorv *norm_bias_m1, + __global ircd_gpt_vectorv *norm_weight, + 
__global ircd_gpt_vectorv *norm_weight_m0, + __global ircd_gpt_vectorv *norm_weight_m1, + __global ircd_gpt_attn_aperaturev *fcon_bias, + __global ircd_gpt_attn_aperaturev *fcon_bias_m0, + __global ircd_gpt_attn_aperaturev *fcon_bias_m1, + __global ircd_gpt_attn_aperaturev *fcon_weight, + __global ircd_gpt_attn_aperaturev *fcon_weight_m0, + __global ircd_gpt_attn_aperaturev *fcon_weight_m1, + __global ircd_gpt_vectorv *proj_bias, + __global ircd_gpt_vectorv *proj_bias_m0, + __global ircd_gpt_vectorv *proj_bias_m1, + __global ircd_gpt_vectorv *proj_weight, + __global ircd_gpt_vectorv *proj_weight_m0, + __global ircd_gpt_vectorv *proj_weight_m1); + +__kernel void +__attribute__((visibility("protected"))) +ircd_gpt_coil_prop_ffnn(__global const struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __global ircd_gpt_vectorv *norm_bias, + __global ircd_gpt_vectorv *norm_bias_m0, + __global ircd_gpt_vectorv *norm_bias_m1, + __global ircd_gpt_vectorv *norm_weight, + __global ircd_gpt_vectorv *norm_weight_m0, + __global ircd_gpt_vectorv *norm_weight_m1, + __global ircd_gpt_ffnn_aperaturev *fcon_bias, + __global ircd_gpt_ffnn_aperaturev *fcon_bias_m0, + __global ircd_gpt_ffnn_aperaturev *fcon_bias_m1, + __global ircd_gpt_ffnn_aperaturev *fcon_weight, + __global ircd_gpt_ffnn_aperaturev *fcon_weight_m0, + __global ircd_gpt_ffnn_aperaturev *fcon_weight_m1, + __global ircd_gpt_vectorv *proj_bias, + __global ircd_gpt_vectorv *proj_bias_m0, + __global ircd_gpt_vectorv *proj_bias_m1, + __global ircd_gpt_vectorv *proj_weight, + __global ircd_gpt_vectorv *proj_weight_m0, + __global ircd_gpt_vectorv *proj_weight_m1); + +__kernel void +__attribute__((visibility("protected"))) +ircd_gpt_lm_embed_prop(__global const struct ircd_gpt_ctrl *ctrl, + __constant const struct ircd_gpt_opts *opts, + __global ircd_gpt_vectorv *pos, + __global ircd_gpt_vectorv *pos_m0, + __global ircd_gpt_vectorv *pos_m1, + __global ircd_gpt_vectorv *token, + __global ircd_gpt_vectorv 
*token_m0, + __global ircd_gpt_vectorv *token_m1); diff --git a/include/ircd/gpt/model.h b/include/ircd/gpt/model.h index b196e2d25..cbbd72cec 100644 --- a/include/ircd/gpt/model.h +++ b/include/ircd/gpt/model.h @@ -20,66 +20,105 @@ namespace ircd::gpt::model struct embed; struct decoder; - constexpr auto align {64}; + struct prop; + struct text; + extern decoder *default_model; + extern float *default_moment[2]; + extern float *default_checkpoint[3]; extern string_view default_dataset; extern std::vector default_data; + + constexpr auto alignment {4096}; } +/// Layer normalization +struct ircd::gpt::model::norm +{ + union ircd_gpt_vector + bias alignas(alignment), + weight alignas(alignment); +}; + /// Attention aperature struct ircd::gpt::model::attn { - float - attn_bias alignas(align) [2304], - attn_weight alignas(align) [768][2304]; + model::norm + norm; - float - proj_bias alignas(align) [768], - proj_weight alignas(align) [768][768]; + union ircd_gpt_attn_aperature + fcon_bias alignas(alignment), + fcon_weight alignas(alignment) [768]; + + union ircd_gpt_vector + proj_bias alignas(alignment), + proj_weight alignas(alignment) [768]; }; /// Feed-forward neural network struct ircd::gpt::model::ffnn { - float - fc_bias alignas(align) [3072], - fc_weight alignas(align) [768][3072]; + model::norm + norm; - float - proj_bias alignas(align) [768], - proj_weight alignas(align) [3072][768]; -}; + union ircd_gpt_ffnn_aperature + fcon_bias alignas(alignment), + fcon_weight alignas(alignment) [768]; -/// Layer normalization -struct ircd::gpt::model::norm -{ - float - bias alignas(align) [768], - weight alignas(align) [768]; + union ircd_gpt_vector + proj_bias alignas(alignment), + proj_weight alignas(alignment) [3072]; }; /// Transformer block struct ircd::gpt::model::block { - norm ln1; - model::attn attn; + model::attn + attn; - norm ln2; - model::ffnn ffnn; + model::ffnn + ffnn; }; /// Vocabulary embeddings struct ircd::gpt::model::embed { - float - pos alignas(align) 
[1024][768], - token alignas(align) [65536][768]; + model::norm + norm; + + union ircd_gpt_vector + pos alignas(alignment) [1024], + token alignas(alignment) [65536]; }; -struct ircd::gpt::model::decoder +/// Transformer decoder +struct alignas(ircd::gpt::model::alignment) +ircd::gpt::model::decoder { - block layer[12]; + model::block + layer[12]; - norm f; - embed word; + model::embed + embed; +}; + +struct ircd::gpt::model::prop +{ + static constexpr const char + *const ended {"ended"}, + *const id {"id"}, + *const length {"length"}, + *const text {"text"}; +}; + +struct ircd::gpt::model::text +:json::tuple +< + json::property, + json::property, + json::property, + json::property +> +{ + using super_type::tuple; }; diff --git a/include/ircd/gpt/opts.h b/include/ircd/gpt/opts.h index 3c4e64255..9f0b8ae85 100644 --- a/include/ircd/gpt/opts.h +++ b/include/ircd/gpt/opts.h @@ -11,6 +11,13 @@ #pragma once #define HAVE_IRCD_GPT_OPTS_H +#if defined(__cplusplus) +namespace ircd::gpt::model +{ + struct decoder; +} +#endif + /// Task Options Page /// /// The option block is directly shared with task software as constant data. @@ -20,30 +27,23 @@ /// struct ircd_gpt_opts { - #ifdef __cplusplus - ircd_gpt_opts(const ircd::gpt::model::decoder * = nullptr) noexcept; + #if defined(__cplusplus) + ircd_gpt_opts() noexcept; #endif - /// Reference to the model (currently not available in device software). - #ifndef __cplusplus - const void *model; - #else - const ircd::gpt::model::decoder *model; - #endif + // + // Frontside + // /// Seed for the task's PRNG. ulong seed; - /// Limit number of output tokens. Default of -1 is unlimited; the number - /// of tokens generated will be limited by other factors. - uint limit; - /// Flip random coins over the top k logits each round. Setting to 1 /// deterministically selects the top logit. uint top_k; /// Flip a random coin between 0 and top_p ( = 90 = 0.9) for logit select. 
- uint top_p; + float top_p; /// Registers the top n result logits in the ctrl block each cycle. uint top_n; @@ -51,59 +51,25 @@ struct ircd_gpt_opts /// Number of target labels to register results for in the ctrl block. uint labels; - /// Bitbar toggling various debug modes + /// Number of pages available after the control block for the frame log. + uint frames; + + /// Limit number of output tokens. Default of -1; other halting conditions + /// will be used. + uint limit; + + /// Bitbar toggling various debug modes. uint debug; - /// Specifies the token context size in tokens. - uint context_tokens; + /// Accepting condition codes. + ushort accept[4][8] __attribute__((aligned(4))); - /// Specifies the token buffer size in tokens. - uint buffer_tokens; + // + // Backside + // - /// Decoding layers. - uint layers; - - /// SIMD lane count. - uint lanes; - - /// Embedding vector elements - uint embed_elems; - - /// Cross-attention dimension - uint attn_rank; - - /// Attention unit fcon width multiple - uint attn_mult; - - /// (computed) MLP unit fcon width multiple - uint ffnn_mult; - - /// (computed) attention unit width multiple - uint attn_elems; - - /// FFNN unit width multiple - uint ffnn_elems; - - /// SIMD lane count - uint lanes; - - /// (computed) `embed_elems` / `lanes` - uint embed_width; - - /// (computed) Attention unit X dimension - uint attn_width; - - /// (computed) Attention unit Y dimension - uint attn_height; - - /// (computed) MLP backend X dimension - uint ffnn_width; - - /// (computed) MLP backend Y dimension - uint ffnn_height; - - /// Number of possible target n-grams. - uint logits; + /// Samples per step. + uint batch_size; /// Training steps uint training_steps; @@ -122,15 +88,90 @@ struct ircd_gpt_opts /// Denorm smoothing float epsilon; + + /// Tuning convergence rate + float lambda; + + // + // Model dimensions + // + + /// Number of possible target n-grams. + uint logits; + + /// Specifies the token buffer size in tokens. 
+ uint buffer_tokens; + + /// Specifies the token context size in tokens. + uint context_tokens; + + /// Decoding layers. + uint layers; + + /// SIMD lane count. + uint lanes; + + /// Embedding vector elements + uint embed_elems; + + /// (computed) `embed_elems` / `lanes` + uint embed_width; + + /// Cross-attention dimension + uint attn_rank; + + /// Attention unit fcon width multiple + uint attn_mult; + + /// (computed) attention unit width multiple + uint attn_elems; + + /// (computed) Attention unit X dimension + uint attn_fcon_width; + + /// (computed) Attention unit Y dimension + uint attn_fcon_height; + + /// (computed) Attention unit X dimension + uint attn_proj_width; + + /// (computed) Attention unit Y dimension + uint attn_proj_height; + + /// (computed) Packed attention array total element count + uint attn_self_elems; + + /// MLP unit fcon width multiple + uint ffnn_mult; + + /// (computed) FFNN unit width multiple + uint ffnn_elems; + + /// (computed) MLP backend X dimension + uint ffnn_fcon_width; + + /// (computed) MLP backend Y dimension + uint ffnn_fcon_height; + + /// (computed) MLP backend X dimension + uint ffnn_proj_width; + + /// (computed) MLP backend Y dimension + uint ffnn_proj_height; } __attribute__((aligned(4096))); -#ifdef __cplusplus +#if defined(__cplusplus) namespace ircd::gpt { using opts = ::ircd_gpt_opts; } +#endif +#if defined(__cplusplus) static_assert(sizeof(struct ircd_gpt_opts) == 4096); +#endif + +#if defined(__cplusplus) && defined(__GLIBCXX__) static_assert(std::is_standard_layout::value); #endif diff --git a/include/ircd/gpt/pipe/code.h b/include/ircd/gpt/pipe/code.h index c61020622..f8d7ce6f7 100644 --- a/include/ircd/gpt/pipe/code.h +++ b/include/ircd/gpt/pipe/code.h @@ -16,12 +16,15 @@ struct ircd::gpt::pipe::code :cl::code { static conf::item default_path; - static conf::item default_opts; + static conf::item default_compile_opts; + static conf::item default_link_opts; static conf::item cache_path; static string_view 
make_cache_path(const mutable_buffer &); - static cl::code from_cache(const string_view &opts, const string_view &path); - static cl::code from_source(const string_view &opts); + + static cl::code from_cache(); + static cl::code from_source(const string_view &comp_opts = {}, const string_view &link_opts = {}); + static cl::code from_bitcode(const string_view &link_opts = {}); void set_cache(const string_view &path); bool put_cache(); diff --git a/include/ircd/gpt/pipe/cycle.h b/include/ircd/gpt/pipe/cycle.h new file mode 100644 index 000000000..73da93d14 --- /dev/null +++ b/include/ircd/gpt/pipe/cycle.h @@ -0,0 +1,48 @@ +// Matrix Construct +// +// Copyright (C) Matrix Construct Developers, Authors & Contributors +// Copyright (C) 2016-2022 Jason Volk +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice is present in all copies. The +// full license for this software is available in the LICENSE file. + +#pragma once +#define HAVE_IRCD_GPT_PIPE_CYCLE_H + +namespace ircd::gpt::pipe +{ + const gpt::ctrl &acquire(cycle &); +} + +/// Perform one task cycle on the device. The cycle is an atomic unit of +/// generation producing one token. +/// +/// Constructions of this object enqueue device commands to complete an +/// additional cycle of the task as provided by `ctrl` and `opts`. +/// +/// Destructions of this object yield the ircd::ctx until those commands +/// are complete. 
+/// +struct ircd::gpt::pipe::cycle +{ + struct profile; + + static constexpr size_t stages + { + 4 + 3 + (12 * 2) + 4 + 2 + (12 * 2) + 1 + }; + + pipe::desc &desc; + uint tick; + uint count; + uint tokens; + uint cached; + uint frame; + pipe::range range; + std::array stage; + + cycle(gpt::samp &); + ~cycle() noexcept; +}; diff --git a/include/ircd/gpt/pipe/desc.h b/include/ircd/gpt/pipe/desc.h index b68147db6..0300d39e2 100644 --- a/include/ircd/gpt/pipe/desc.h +++ b/include/ircd/gpt/pipe/desc.h @@ -16,43 +16,62 @@ struct ircd::gpt::pipe::desc { struct layer; + // Model descriptor pipe::model *model; + + // Code descriptor pipe::code *code; + // Memories cl::data - state, // [root] projection (layers * tokens * embed * 3 * float) + opts, // [root] options page + ctrl, // [root] control page master, // [root] single allocation for additional buffers: + state, // [-sub] projection (layers * tokens * embed * 3 * float) accum, // [-sub] accumulator (tokens * embed * float) logit, // [-sub] result logit vector (50257 * float) - logsm, // [-sub] outputs distribution (50257 * float) - ctrl, // [root] control page - opts; // [root] options page + attns, // [-sub] result attention softmax + frame[8]; // [root] result stream + // Programs cl::kern + alloc, + enter, lm_embed, lm_norm, lm_logit, lm_logsm, lm_select, - lm_norm_backprop, - lm_embed_backprop; + lm_prop_embed, + lm_prop_norm, + leave[8]; - std::unique_ptr - layer[12]; + /// Coil pack + std::unique_ptr layer[12]; - desc(pipe::code &, pipe::model &); + /// Attention projection for first N tokens already contained in `state`. 
+ uint cached {0}; + + desc(const gpt::opts *const &, + gpt::ctrl *const &, + pipe::model &model, + pipe::code &code); }; +/// Pipe descriptor: coil layer struct ircd::gpt::pipe::desc::layer { cl::data - state; // [-sub] qry/key/val projection (tokens * embed * 3 * float) + state, // [-sub] qry/key/val projection (tokens * embed * 3 * float) + attns; // [-sub] attn softmax result (((tokens * tokens) / 2) * 12 * float) cl::kern - negative, - positive, - backattn, - backffnn; + attn, + ffnn, + prop_attn, + prop_ffnn; - layer(pipe::desc &, const int); + layer(pipe::desc &, + const gpt::opts *const &, + const uint laynum); }; diff --git a/include/ircd/gpt/pipe/exec.h b/include/ircd/gpt/pipe/exec.h deleted file mode 100644 index 3cec74cb8..000000000 --- a/include/ircd/gpt/pipe/exec.h +++ /dev/null @@ -1,65 +0,0 @@ -// Matrix Construct -// -// Copyright (C) Matrix Construct Developers, Authors & Contributors -// Copyright (C) 2016-2021 Jason Volk -// -// Permission to use, copy, modify, and/or distribute this software for any -// purpose with or without fee is hereby granted, provided that the above -// copyright notice and this permission notice is present in all copies. The -// full license for this software is available in the LICENSE file. - -#pragma once -#define HAVE_IRCD_GPT_PIPE_EXEC_H - -/// Perform one task cycle on the device. -/// -/// Constructions of this object enqueue device commands to complete an -/// additional epoch of the task as provided by `ctrl` and `opts`. -/// -/// Destructions of this object yield the ircd::ctx until those commands -/// are complete. -/// -/// Consecutive cycles on the device without stopping (a.k.a. pipelining) is -/// achieved by constructing several objects before following with destructions -/// i.e in a std::deque. -/// -struct ircd::gpt::pipe::exec -{ - pipe::desc *desc; - - const_buffer - send_opts, // Set when sending the options page. - send_ctrl, // Set when sending the control page. 
- send_coil, // Set when sending the updated model. - send_head; // Set when sending the updated model. - - mutable_buffer - recv_ctrl; // Set when receiving the control page. - - cl::kern::range - range_full, - range_last, - range_lm_embed, // Dimension range of the lm_embed kernel. - range_negative, // Dimension range of a layer kernel. - range_positive, // Dimension range of a layer kernel. - range_lm_norm, // Dimension range of the final norm kernel. - range_lm_logit, // Dimension range of the language logit kernel. - range_lm_logsm, // Dimension range of the language statistic kernel. - range_lm_select; // Dimension range of the language token kernel. - - cl::exec - release_opts, // Release the options page. - release_ctrl, // Release the control page. - release_coil, // Release updates to the model. - release_head, // Release updates to the model. - lm_embed, // Compute token and positional embeddings. - coil[12 * 2], // Pass over all layers. - lm_norm, // Final normalization. - lm_logit, // Compute language logits. - lm_logsm, // Statistics on the logits. - lm_select, // Select next token. - acquire_ctrl; // Acquire the control page. 
- - exec(task &, const size_t tokens, const bool rel, const bool acq); - ~exec() noexcept; -}; diff --git a/include/ircd/gpt/pipe/model.h b/include/ircd/gpt/pipe/model.h index ce62bd671..24dde174a 100644 --- a/include/ircd/gpt/pipe/model.h +++ b/include/ircd/gpt/pipe/model.h @@ -21,15 +21,15 @@ struct ircd::gpt::pipe::model struct attn; struct ffnn; struct block; + struct embed; struct decoder; - struct language; + const gpt::model::decoder *decode_const {nullptr}; + gpt::model::decoder *decode_mutable {nullptr}; std::unique_ptr decode; - std::unique_ptr embed; - bool invalid {false}; - model(const gpt::model::decoder &, const gpt::model::embed &); - model(gpt::model::decoder &, gpt::model::embed &); + model(const gpt::model::decoder &); + model(gpt::model::decoder &); ~model() noexcept; }; @@ -49,8 +49,8 @@ struct ircd::gpt::pipe::model::tensor bias, weight; - tensor(cl::data *, const off_t, const const_buffer &bias, const const_buffer &weight); - tensor(cl::data *, const off_t, const mutable_buffer &bias, const mutable_buffer &weight); + tensor(cl::data *, const off_t, const const_buffer &bias, const off_t, const const_buffer &weight); + tensor(cl::data *, const off_t, const mutable_buffer &bias, const off_t, const mutable_buffer &weight); }; struct ircd::gpt::pipe::model::attn @@ -60,8 +60,8 @@ struct ircd::gpt::pipe::model::attn fcon, proj; - attn(cl::data *, const off_t, const gpt::model::norm &, const gpt::model::attn &); - attn(cl::data *, const off_t, gpt::model::norm &, gpt::model::attn &); + attn(cl::data *, const off_t, const gpt::model::attn &); + attn(cl::data *, const off_t, gpt::model::attn &); }; struct ircd::gpt::pipe::model::ffnn @@ -71,40 +71,33 @@ struct ircd::gpt::pipe::model::ffnn fcon, proj; - ffnn(cl::data *, const off_t, const gpt::model::norm &, const gpt::model::ffnn &); - ffnn(cl::data *, const off_t, gpt::model::norm &, gpt::model::ffnn &); + ffnn(cl::data *, const off_t, const gpt::model::ffnn &); + ffnn(cl::data *, const off_t, 
gpt::model::ffnn &); }; struct ircd::gpt::pipe::model::block { - // Single layer memory roots - cl::data - master[3]; + model::attn + attn; - // Layer units - model::attn attn; - model::ffnn ffnn; + model::ffnn + ffnn; block(cl::data *, const off_t, const gpt::model::block &, const size_t); block(cl::data *, const off_t, gpt::model::block &, const size_t); - block(const gpt::model::block &, const size_t); - block(gpt::model::block &, const size_t); }; -struct ircd::gpt::pipe::model::language +struct ircd::gpt::pipe::model::embed { - cl::data - master[3]; + tensor + norm; matrix pos, token; - language(cl::data *, const off_t, const gpt::model::embed &); - language(cl::data *, const off_t, gpt::model::embed &); - language(const gpt::model::embed &); - language( gpt::model::embed &); - ~language() noexcept; + embed(cl::data *, const off_t, const gpt::model::embed &); + embed(cl::data *, const off_t, gpt::model::embed &); }; struct ircd::gpt::pipe::model::decoder @@ -115,10 +108,11 @@ struct ircd::gpt::pipe::model::decoder // Layer blocks model::block - block[12]; + layer[12]; - // Final norm - tensor norm; + // Language model head + model::embed + embed; decoder(const gpt::model::decoder &); decoder(gpt::model::decoder &); diff --git a/include/ircd/gpt/pipe/pipe.h b/include/ircd/gpt/pipe/pipe.h index 8b29965f1..c309721cd 100644 --- a/include/ircd/gpt/pipe/pipe.h +++ b/include/ircd/gpt/pipe/pipe.h @@ -13,21 +13,21 @@ namespace ircd::gpt::pipe { - struct model; struct code; + struct model; struct desc; - struct exec; + struct range; + struct cycle; + struct prof; - extern model *default_model; - extern code *default_code; - extern desc *default_desc; - - void generate(task &); + extern conf::item queue_cycles; void init(), fini() noexcept; }; -#include "model.h" #include "code.h" +#include "model.h" #include "desc.h" -#include "exec.h" +#include "range.h" +#include "cycle.h" +#include "prof.h" diff --git a/include/ircd/gpt/pipe/prof.h b/include/ircd/gpt/pipe/prof.h new 
file mode 100644 index 000000000..e3bb0a418 --- /dev/null +++ b/include/ircd/gpt/pipe/prof.h @@ -0,0 +1,56 @@ +// Matrix Construct +// +// Copyright (C) Matrix Construct Developers, Authors & Contributors +// Copyright (C) 2016-2022 Jason Volk +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice is present in all copies. The +// full license for this software is available in the LICENSE file. + +#pragma once +#define HAVE_IRCD_GPT_PIPE_PROF_H + +namespace ircd::gpt::pipe +{ + string_view debug(const mutable_buffer &, const prof &, const size_t &pos); + string_view debug(const mutable_buffer &, const prof &); +} + +/// Extract profiling information for a cycle. This object contains timing +/// state integers for each corresponding stage of the cycle. +/// +/// Default constructions initialize to zero and the state can also be used +/// as an accumulator. 
+struct ircd::gpt::pipe::prof +{ + static constexpr size_t stages + { + cycle::stages + }; + + static constexpr size_t phases + { + num_of() + }; + + using phase = cl::work::prof::phase; + using phase_array = cl::work::prof; + using stage_array = std::array; + using info_type = std::tuple; + using info_array = std::array; + using info_name_array = std::array; + + static info_name_array name; + static info_array info; + + private: + static bool init; + static void init_info(const pipe::cycle &); + + public: + stage_array ts; + + prof(const pipe::cycle &); + prof() noexcept; +}; diff --git a/include/ircd/gpt/pipe/range.h b/include/ircd/gpt/pipe/range.h new file mode 100644 index 000000000..3538c3101 --- /dev/null +++ b/include/ircd/gpt/pipe/range.h @@ -0,0 +1,39 @@ +// Matrix Construct +// +// Copyright (C) Matrix Construct Developers, Authors & Contributors +// Copyright (C) 2016-2022 Jason Volk +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice is present in all copies. The +// full license for this software is available in the LICENSE file. 
+ +#pragma once +#define HAVE_IRCD_GPT_PIPE_RANGE_H + +struct ircd::gpt::pipe::range +{ + cl::kern::range + _full, + _last, + alloc, + embed, + attn, + ffnn, + fffnn, + fnorm, + logit, + logsm, + select, + prop_embed, + prop_norm, + prop_attn, + prop_ffnn; + + range(const uint tick, + const uint count, + const uint tokens, + const uint cached, + const bool fwd, + const bool rev) noexcept; +}; diff --git a/include/ircd/gpt/samp.h b/include/ircd/gpt/samp.h new file mode 100644 index 000000000..a0c3a6e68 --- /dev/null +++ b/include/ircd/gpt/samp.h @@ -0,0 +1,49 @@ +// Matrix Construct +// +// Copyright (C) Matrix Construct Developers, Authors & Contributors +// Copyright (C) 2016-2022 Jason Volk +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice is present in all copies. The +// full license for this software is available in the LICENSE file. + +#pragma once +#define HAVE_IRCD_GPT_SAMP_H + +/// Perform one task sample on the device. The sample is a sequence of cycles +/// which generate tokens until satisfying a halting condition. The number of +/// cycles for the sample is limited by the size of the context buffer. 
+/// +struct ircd::gpt::samp +{ + gpt::step &step; + pipe::desc &desc; + + const gpt::opts &opts; + gpt::ctrl &ctrl; + + const uint id; + + int accept; + uint dispatch; + uint cycle; + uint tokens; + uint count; + + pipe::prof profile; + std::deque queue; + + public: + void profile_accumulate(const pipe::prof &); + bool retire(pipe::cycle &, const gpt::ctrl &); + bool evaluate(pipe::cycle &); + uint tokenize(); + + public: + bool done() const noexcept; + bool operator()(); + + samp(gpt::step &); + ~samp() noexcept; +}; diff --git a/include/ircd/gpt/step.h b/include/ircd/gpt/step.h new file mode 100644 index 000000000..3cadbf7a3 --- /dev/null +++ b/include/ircd/gpt/step.h @@ -0,0 +1,40 @@ +// Matrix Construct +// +// Copyright (C) Matrix Construct Developers, Authors & Contributors +// Copyright (C) 2016-2022 Jason Volk +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice is present in all copies. The +// full license for this software is available in the LICENSE file. + +#pragma once +#define HAVE_IRCD_GPT_STEP_H + +/// Perform one task step on the device. The step is a sequence of cycles +/// which generate tokens until satisfying a halting condition. The number of +/// cycles for the step is limited by the size of the context buffer. 
+/// +struct ircd::gpt::step +{ + gpt::epoch &epoch; + pipe::desc &desc; + + const gpt::opts &opts; + gpt::ctrl &ctrl; + + const uint id; + const uint start; + + pipe::prof profile; + + void profile_accumulate(const pipe::prof &); + bool backpropagate(); + + public: + bool done() const noexcept; + bool operator()(); + + step(gpt::epoch &); + ~step() noexcept; +}; diff --git a/include/ircd/gpt/task.h b/include/ircd/gpt/task.h index 0521100a3..243f77d65 100644 --- a/include/ircd/gpt/task.h +++ b/include/ircd/gpt/task.h @@ -12,6 +12,14 @@ #define HAVE_IRCD_GPT_TASK_H #ifdef __cplusplus +namespace ircd::gpt +{ + void seed(task &, const uint64_t &) noexcept; + void seed(task &) noexcept; + void clear(task &) noexcept; + void reset(task &) noexcept; +} + /// Task Context /// /// State for a task. @@ -22,27 +30,25 @@ struct ircd::gpt::task /// Reference to the attached options. const gpt::opts *opts {nullptr}; - /// Reference to control pages. + /// Reference to user's control block. gpt::ctrl *ctrl {nullptr}; - /// Current task status. - enum status status {'\0'}; + /// Pipe code + std::unique_ptr code; + + /// Pipe model + std::unique_ptr model; + + /// Pipe state + pipe::desc desc; + + public: + bool done() const noexcept; + bool operator()(); + + task(const gpt::opts * = nullptr, + gpt::ctrl * = nullptr); - task(const gpt::opts * = nullptr, gpt::ctrl * = nullptr); ~task() noexcept; }; - -/// The current status of a task is indicated with intelligible characters -enum ircd::gpt::task::status -:char -{ - QUEUED = 'Q', ///< Queued for execution. - RUNNING = 'R', ///< Currently being executed. - ACCEPT = 'A', ///< Execution completed successfully. - ERROR = 'E', ///< Execution did not complete successfully. 
-}; - -static_assert(sizeof(struct ircd_gpt_ctrl) == 4096); -static_assert(offsetof(struct ircd_gpt_ctrl, token) == 2048); -static_assert(std::is_standard_layout::value); #endif diff --git a/include/ircd/gpt/token.h b/include/ircd/gpt/token.h index 7cd963999..b928c8758 100644 --- a/include/ircd/gpt/token.h +++ b/include/ircd/gpt/token.h @@ -11,127 +11,67 @@ #pragma once #define HAVE_IRCD_GPT_TOKEN_H -union ircd_gpt_token +namespace ircd::gpt { - float - word[768], - attn[12][64]; + struct token; +} + +/// Token is just a 16-bit index into the vocabulary. This lightweight wrapper +/// convenience constructs from a string lookup or from a u16 directly. +class ircd::gpt::token +{ + uint16_t val; + + public: + operator const uint16_t &() const; + operator uint16_t &(); + + operator string_view() const; + + token(const_buffer &buf) noexcept; + token(const string_view &); + token(const uint16_t &) noexcept; }; -#ifdef __OPENCL_C_VERSION__ -union ircd_gpt_tokenv +static_assert(sizeof(ircd::gpt::token) == sizeof(uint16_t)); +static_assert(std::is_standard_layout::value); + +/// Direct construction; no lookup +inline +ircd::gpt::token::token(const uint16_t &val) +noexcept +:val{val} +{} + +/// Must resolve to one token or error thrown. 
+inline +ircd::gpt::token::token(const string_view &str) +:val{vocab::tokenize(str)} +{} + +/// Consumes input for one token off front of buf +inline +ircd::gpt::token::token(const_buffer &buf) +noexcept +:val{vocab::tokenize(buf)} +{} + +inline ircd::gpt::token::operator +string_view() +const { - float4 - word[768/4], - attn[12][64/4]; + return vocab::token[val]; +} - union ircd_gpt_token - token; -}; -#endif - -struct ircd_gpt_attn_qkv +inline ircd::gpt::token::operator +uint16_t &() { - union ircd_gpt_token - qry, - key, - val; -}; + return val; +} -#ifdef __OPENCL_C_VERSION__ -struct ircd_gpt_attn_qkvv +inline ircd::gpt::token::operator +const uint16_t &() +const { - union ircd_gpt_tokenv - qry, - key, - val; -}; -#endif - -union ircd_gpt_attn_aperature -{ - float - fcon[2304], - proj[3][768], - qkv[3][12][64]; - - union ircd_gpt_token - token[3]; -}; - -#ifdef __OPENCL_C_VERSION__ -union ircd_gpt_attn_aperaturev -{ - float4 - fcon[2304/4], - proj[3][768/4], - qkv[3][12][64/4]; - - union ircd_gpt_token_f32x4 - token[3]; -}; - -union ircd_gpt_attn_aperature_f32x8 -{ - float8 - fcon[2304/8], - proj[3][768/8], - qkv[3][12][64/8]; - - union ircd_gpt_token_f32x8 - token[3]; -}; - -union ircd_gpt_attn_aperature_f32x16 -{ - float16 - fcon[2304/16], - proj[3][768/16], - qkv[3][12][64/16]; - - union ircd_gpt_token_f32x16 - token[3]; -}; -#endif - -union ircd_gpt_ffnn_aperature -{ - float - fcon[3072], - proj[4][768]; - - union ircd_gpt_token - token[4]; -}; - -#ifdef __OPENCL_C_VERSION__ -union ircd_gpt_ffnn_aperaturev -{ - float4 - fcon[3072/4], - proj[4][768/4]; - - union ircd_gpt_token_f32x4 - token[4]; -}; - -union ircd_gpt_ffnn_aperature_f32x8 -{ - float8 - fcon[3072/8], - proj[4][768/8]; - - union ircd_gpt_token_f32x4 - token[4]; -}; - -union ircd_gpt_ffnn_aperature_f32x16 -{ - float16 - fcon[3072/16], - proj[4][768/16]; - - union ircd_gpt_token_f32x4 - token[4]; -}; -#endif + return val; +} diff --git a/include/ircd/gpt/vector.h b/include/ircd/gpt/vector.h new 
file mode 100644 index 000000000..fd97bf619 --- /dev/null +++ b/include/ircd/gpt/vector.h @@ -0,0 +1,147 @@ +// Matrix Construct +// +// Copyright (C) Matrix Construct Developers, Authors & Contributors +// Copyright (C) 2016-2022 Jason Volk +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice is present in all copies. The +// full license for this software is available in the LICENSE file. + +#pragma once +#define HAVE_IRCD_GPT_VECTOR_H + +#if !defined(__SIZEOF_FLOAT4__) && defined(__OPENCL_VERSION__) + #define __SIZEOF_FLOAT4__ 16 +#endif + +#if !defined(__SIZEOF_FLOAT8__) && defined(__OPENCL_VERSION__) + #define __SIZEOF_FLOAT8__ 32 +#endif + +#if !defined(__SIZEOF_FLOAT16__) && defined(__OPENCL_VERSION__) + #define __SIZEOF_FLOAT16__ 64 +#endif + +#ifndef __OPENCL_VERSION__ + #define __constant +#endif + +static __constant const uint +ircd_gpt_context_tokens = 512, // 1024, +ircd_gpt_vector_elems = 768, +ircd_gpt_attn_rank = 12, +ircd_gpt_attn_segs = 3, +ircd_gpt_ffnn_segs = 4; + +static __constant const uint +ircd_gpt_vector_attn_elems = ircd_gpt_vector_elems / ircd_gpt_attn_rank, +ircd_gpt_attn_fcon_elems = ircd_gpt_vector_elems * ircd_gpt_attn_segs, +ircd_gpt_ffnn_fcon_elems = ircd_gpt_vector_elems * ircd_gpt_ffnn_segs; + +// +// embed vector +// + +#if defined(__SIZEOF_FLOAT__) +union ircd_gpt_vector +{ + float + elem[ircd_gpt_vector_elems], + attn[ircd_gpt_attn_rank][ircd_gpt_vector_attn_elems]; +}; +#endif + +#if defined(__SIZEOF_FLOAT4__) +union ircd_gpt_vector_f32x4 +{ + float4 + elem[ircd_gpt_vector_elems / 4], + attn[ircd_gpt_attn_rank][ircd_gpt_vector_attn_elems / 4]; + + union ircd_gpt_vector + vector; +}; +#endif + +// +// attn qkv +// + +#if defined(__SIZEOF_FLOAT__) +struct ircd_gpt_attn_qkv +{ + union ircd_gpt_vector + qry, + key, + val; +}; +#endif + +#if defined(__SIZEOF_FLOAT4__) +struct 
ircd_gpt_attn_qkv_f32x4 +{ + union ircd_gpt_vector_f32x4 + qry, + key, + val; +}; +#endif + +// +// attn aperature +// + +#if defined(__SIZEOF_FLOAT__) +union ircd_gpt_attn_aperature +{ + float + fcon[ircd_gpt_attn_fcon_elems], + proj[ircd_gpt_attn_segs][ircd_gpt_vector_elems], + qkv[ircd_gpt_attn_segs][ircd_gpt_attn_rank][ircd_gpt_vector_attn_elems]; + + union ircd_gpt_vector + vector[ircd_gpt_attn_segs]; +}; +#endif + +#if defined(__SIZEOF_FLOAT4__) +union ircd_gpt_attn_aperature_f32x4 +{ + float4 + fcon[ircd_gpt_attn_fcon_elems / 4], + proj[ircd_gpt_attn_segs][ircd_gpt_vector_elems / 4], + qkv[ircd_gpt_attn_segs][ircd_gpt_attn_rank][ircd_gpt_vector_attn_elems / 4]; + + union ircd_gpt_vector_f32x4 + vector[ircd_gpt_attn_segs]; +}; +#endif + +// +// ffnn aperature +// + +#if defined(__SIZEOF_FLOAT__) +union ircd_gpt_ffnn_aperature +{ + float + fcon[ircd_gpt_ffnn_fcon_elems], + proj[ircd_gpt_ffnn_segs][ircd_gpt_vector_elems]; + + union ircd_gpt_vector + vector[ircd_gpt_ffnn_segs]; +}; +#endif + +#if defined(__SIZEOF_FLOAT4__) +union ircd_gpt_ffnn_aperature_f32x4 +{ + float4 + fcon[ircd_gpt_ffnn_fcon_elems / 4], + proj[ircd_gpt_ffnn_segs][ircd_gpt_vector_elems / 4]; + + union ircd_gpt_vector_f32x4 + vector[ircd_gpt_ffnn_segs]; +}; +#endif diff --git a/include/ircd/gpt/vocab.h b/include/ircd/gpt/vocab.h index 04b6fb8a4..ac314d797 100644 --- a/include/ircd/gpt/vocab.h +++ b/include/ircd/gpt/vocab.h @@ -34,11 +34,17 @@ namespace ircd::gpt::vocab merges_path; // Tokenize UTF-8 input string of any length into proper token values, - vector_view tokenize(const vector_view &out, const string_view &in); + vector_view tokenize(const vector_view &out, const string_view &in) noexcept; + + // Tokenize one token. The buffer is advanced consuming one token per call. + u16 tokenize(const_buffer &) noexcept; + + // Tokenize one token. Error thrown if input is not exactly one token. + u16 tokenize(const string_view &in); // Decode token values to build output text string. 
- string_view detokenize(const mutable_buffer &out, const vector_view &in); + string_view detokenize(const mutable_buffer &out, const vector_view &in) noexcept; // Other tools - string_view debug(const mutable_buffer &buf, const u16 token); + string_view debug(const mutable_buffer &buf, const u16 token, const uint fmt_msk = -1U); } diff --git a/ircd/Makefile.am b/ircd/Makefile.am index da5d53ea4..69c3057f6 100644 --- a/ircd/Makefile.am +++ b/ircd/Makefile.am @@ -53,6 +53,7 @@ if LTO if CLANG AM_CXXFLAGS += -fstrict-vtable-pointers AM_CXXFLAGS += -fwhole-program-vtables +#AM_LDFLAGS += -Wl,-plugin-opt,-pass-remarks='.*' endif endif @@ -87,22 +88,6 @@ AM_LDFLAGS += -Wl,--enable-runtime-pseudo-reloc AM_LDFLAGS += -export-symbols-regex '*' endif -ROCKSDB_SRC_CPPFLAGS =# -ROCKSDB_SRC_CPPFLAGS += -isystem $(top_srcdir)/deps/rocksdb/include -ROCKSDB_SRC_CPPFLAGS += -isystem $(top_srcdir)/deps/rocksdb - -GPT_FP_CXXFLAGS =# -GPT_FP_CXXFLAGS += -fno-math-errno -GPT_FP_CXXFLAGS += -fno-trapping-math -GPT_FP_CXXFLAGS += -ffinite-math-only -GPT_FP_CXXFLAGS += -fno-signed-zeros -GPT_FP_CXXFLAGS += -fassociative-math -GPT_FP_CXXFLAGS += -ffp-contract=fast -GPT_FP_CXXFLAGS += -freciprocal-math -if CLANG -GPT_FP_CXXFLAGS += -fdenormal-fp-math=positive-zero -endif - libircddir = @libdir@ libircd_LTLIBRARIES = libircd.la @@ -241,16 +226,12 @@ libircd_la_SOURCES += png.cc if OPENCL libircd_la_SOURCES += cl.cc endif -libircd_la_SOURCES += gpt.cc -libircd_la_SOURCES += gpt_pipe.cc -libircd_la_SOURCES += gpt_model.cc libircd_la_SOURCES += gpt_vocab.cc +libircd_la_SOURCES += gpt_model.cc +libircd_la_SOURCES += gpt_pipe_code.cc +libircd_la_SOURCES += gpt_pipe.cc libircd_la_SOURCES += gpt_cpu.cc -if OPENCL -if CLANG -BUILT_SOURCES += gpt_gpu.o -endif -endif +libircd_la_SOURCES += gpt.cc libircd_la_SOURCES += openssl.cc libircd_la_SOURCES += rfc1459.cc libircd_la_SOURCES += rfc3986.cc @@ -283,6 +264,23 @@ libircd_la_SOURCES += ircd.cc # Specific unit option composition # 
+ROCKSDB_SRC_CPPFLAGS =# +ROCKSDB_SRC_CPPFLAGS += -isystem $(top_srcdir)/deps/rocksdb/include +ROCKSDB_SRC_CPPFLAGS += -isystem $(top_srcdir)/deps/rocksdb + +GPT_FP_CXXFLAGS =# +GPT_FP_CXXFLAGS += -fno-math-errno +GPT_FP_CXXFLAGS += -fno-trapping-math +GPT_FP_CXXFLAGS += -ffinite-math-only +GPT_FP_CXXFLAGS += -fno-signed-zeros +GPT_FP_CXXFLAGS += -fassociative-math +GPT_FP_CXXFLAGS += -freciprocal-math +GPT_FP_CXXFLAGS += -ffp-contract=fast +if CLANG +GPT_FP_CXXFLAGS += -fdenormal-fp-math=positive-zero +GPT_FP_CXXFLAGS += -ffp-model=fast +endif + client.lo: AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS} ctx_x86_64.lo: AM_CPPFLAGS := -I$(top_srcdir)/include ctx.lo: AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS} @@ -307,7 +305,7 @@ endif if IOU fs_iou.lo: AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS} endif -gpt.lo: AM_CXXFLAGS := ${AM_CXXFLAGS} ${GPT_FP_CXXFLAGS} +gpt_cpu.lo: AM_CXXFLAGS := ${AM_CXXFLAGS} ${GPT_FP_CXXFLAGS} http.lo: AM_CPPFLAGS := ${SPIRIT_UNIT_CPPFLAGS} ${AM_CPPFLAGS} http.lo: AM_CXXFLAGS := ${SPIRIT_UNIT_CXXFLAGS} ${AM_CXXFLAGS} ios.lo: AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS} @@ -364,12 +362,118 @@ default.profdata: -$(LLVM_PROFDATA) merge -output=default.profdata default.proftext endif +############################################################################### +# +# Hardware Acceleration / sub-targets +# + if CLANG if OPENCL -gpt_gpu.ll: gpt_gpu.cl - $(CC) -S -emit-llvm -std=CL1.1 $(AM_CPPFLAGS) $(CPPFLAGS) $(DEFS) -Xclang -finclude-default-header -include "ircd/config.h" -o $@ -c $^ -gpt_gpu.o: gpt_gpu.ll - $(CC) -std=CL1.1 -o $@ -c $^ -endif -endif +GPU_CPPFLAGS =# +GPU_CPPFLAGS += -D__OPENCL_VERSION__=110 +GPU_CPPFLAGS += -Dcl_clang_storage_class_specifiers +GPU_CPPFLAGS += -DNOFP64 +GPU_CPPFLAGS += $(AM_CPPFLAGS) +GPU_CPPFLAGS += $(CPPFLAGS) +GPU_CPPFLAGS += $(DEFS) + +GPU_CFLAGS =# +GPU_CFLAGS += -std=cl1.1 +GPU_CFLAGS += -fident +GPU_CFLAGS += -fno-builtin +GPU_CFLAGS += -fverbose-asm +GPU_CFLAGS += 
-fno-discard-value-names +GPU_CFLAGS += -mllvm -propagate-attrs=true +GPU_CFLAGS += -O3 +GPU_CFLAGS += -fno-trapping-math +GPU_CFLAGS += -ffinite-math-only +GPU_CFLAGS += -fno-signed-zeros +GPU_CFLAGS += -ffp-contract=fast +#GPU_CFLAGS += -include "ircd/config.h" +#GPU_CFLAGS += -include "clc/clc.h" +#GPU_CFLAGS += -Wassume +#GPU_CFLAGS += -Rpass-analysis=".*" +#GPU_CFLAGS += -Rpass-missed=".*" + +GPU_LINKFLAGS =# +GPU_LINKFLAGS +=# + +GPU_OPTFLAGS =# +GPU_OPTFLAGS += -propagate-attrs=true +GPU_OPTFLAGS += -O3 +#GPU_OPTFLAGS += -opt-bisect-limit=-1 +#GPU_OPTFLAGS += -debug-pass=Arguments +#GPU_OPTFLAGS += -pass-remarks='.*' + +GPU_ASFLAGS =# +GPU_ASFLAGS += -fident +GPU_ASFLAGS += -fno-builtin +GPU_ASFLAGS += -mllvm -propagate-attrs=true +GPU_ASFLAGS += -mllvm -verify-machineinstrs +GPU_ASFLAGS += -O3 +GPU_ASFLAGS += -Rpass-analysis=asm-printer +#GPU_ASFLAGS += -Rpass-analysis=".*" +#GPU_ASFLAGS += -mllvm -debug-pass=Arguments +#GPU_ASFLAGS += -mllvm -pass-remarks='.*' + +# +# SPV +# + +BUILT_SOURCES += gpt_gpu.spv.bc +gpt_gpu.spv.bc: gpt_gpu.cl + clang-13 -target spir-- $(GPU_CPPFLAGS) -O0 -emit-llvm -o $@ -x cl -c $^ + +BUILT_SOURCES += gpt_gpu.spv +gpt_gpu.spv: gpt_gpu.spv.bc + llvm-spirv -o $@ $^ + +CLEANFILES += gpt_gpu.spv.cc +libircd_la_SOURCES += gpt_gpu.spv.cc +gpt_gpu.spv.cc: gpt_gpu.spv + xxd -i $^ $@ + +# +# R600 +# + +R600_TARGET = r600-- + +# +# R600 Saint Barthélemy +# + +R600_BARTS_CFLAGS = $(GPU_CFLAGS) +R600_BARTS_CFLAGS += -target $(R600_TARGET) +R600_BARTS_CFLAGS += -mcpu=barts +R600_BARTS_CFLAGS += -Xclang -mlink-bitcode-file -Xclang /usr/lib/clc/barts-r600--.bc + +BUILT_SOURCES += gpt_gpu.r600_barts.bc +gpt_gpu.r600_barts.bc: gpt_gpu.cl + $(CC) $(R600_BARTS_CFLAGS) $(GPU_CPPFLAGS) -emit-llvm -o $@ -x cl -c $^ + +#BUILT_SOURCES += gpt_gpu.r600_barts.link.bc +gpt_gpu.r600_barts.link.bc: gpt_gpu.r600_barts.bc + llvm-link-14 $(GPU_LINKFLAGS) -o $@ $^ /usr/lib/clc/barts-r600--.bc + opt-14 $(GPU_OPTFLAGS) -o $@ $@ + +#BUILT_SOURCES += 
gpt_gpu.r600_barts.s +gpt_gpu.r600_barts.s: gpt_gpu.r600_barts.link.bc + $(CC) -cc1 $(GPU_ASFLAGS) -triple $(R600_TARGET) -emit-obj -S -o $@ -x ir $^ + +#BUILT_SOURCES += gpt_gpu.r600_barts.o +gpt_gpu.r600_barts.o: gpt_gpu.r600_barts.link.bc + $(CC) -cc1 $(GPU_ASFLAGS) -triple $(R600_TARGET) -emit-obj -o $@ -x ir $^ + +CLEANFILES += gpt_gpu.r600_barts.bc.cc +libircd_la_SOURCES += gpt_gpu.r600_barts.bc.cc +gpt_gpu.r600_barts.bc.cc: gpt_gpu.r600_barts.bc + xxd -i $^ $@ + +# +# +# + +endif # OPENCL +endif # CLANG diff --git a/ircd/gpt.cc b/ircd/gpt.cc index 6cf022c7a..96be65de9 100644 --- a/ircd/gpt.cc +++ b/ircd/gpt.cc @@ -8,13 +8,6 @@ // copyright notice and this permission notice is present in all copies. The // full license for this software is available in the LICENSE file. -namespace ircd::gpt -{ - size_t backprop(task &, const f32, model::decoder &, f32 *const (&)[2], size_t = 0); - - void generate_debug(task &, const uint &, const uint &); -} - decltype(ircd::gpt::log) ircd::gpt::log { @@ -26,20 +19,21 @@ ircd::gpt::generate(const mutable_buffer &out, const string_view &in, task &task) { - u16 buf[2][1024]; + u16 input_buf[1024]; const auto input_tokens { - vocab::tokenize(buf[0], in) + gpt::vocab::tokenize(input_buf, in) }; + u16 output_buf[1024]; const auto output_tokens { - generate(buf[1], input_tokens, task) + generate(output_buf, input_tokens, task) }; const auto output { - vocab::detokenize(out, output_tokens) + gpt::vocab::detokenize(out, output_tokens) }; return output; @@ -50,193 +44,309 @@ ircd::gpt::generate(const vector_view &out, const vector_view &in, task &task) { - assert(task.ctrl); assert(task.opts); - - uint ret(0); - bool halt(false); - - const auto &opts(*task.opts); - auto &ctrl(*task.ctrl); - ctrl.tokens.count = 0; - ctrl.tokens.head = 0; - - uint j(0); - while(j < in.size() && ctrl.tokens.count < opts.buffer_tokens) - ctrl.token[ctrl.tokens.count++] = in[j++]; - - const size_t in_size + const auto &opts { - ctrl.tokens.count + 
*task.opts }; + assert(task.ctrl); + auto &ctrl + { + *task.ctrl + }; + + size_t in_i(0); + while(in_i < in.size() && ctrl.count < opts.buffer_tokens) + if(in[in_i] == 628) + { + ctrl.token[ctrl.count++] = 198; + ctrl.token[ctrl.count++] = 198; + in_i++; + } + else ctrl.token[ctrl.count++] = in[in_i++]; + generate(task); - for(uint i(0); i < ctrl.tokens.count && ret < out.size() && !halt; ++i) - { - const auto j - { - (i + ctrl.tokens.head) % opts.buffer_tokens - }; + size_t out_i(0); + for(; out_i < out.size() && in_i + out_i < ctrl.count; out_i++) + out[out_i] = ctrl.token[in_i + out_i]; - const auto tok - { - ctrl.token[j] - }; - - if(j >= in_size) - out[ret++] = tok; - - if(likely(~opts.debug & 0x01)) - continue; - - if(likely(~opts.debug & 0x02)) - if(j < in_size) - continue; - - generate_debug(task, j, in_size); - } - - ctx::interruption_point(); return vector_view { - out, ret + out, out_i }; } void ircd::gpt::generate(task &task) { - const auto &opts(*task.opts); - auto &ctrl(*task.ctrl); - - const size_t in_size + assert(task.opts); + const auto &opts { - ctrl.tokens.count + *task.opts }; - uint64_t cycles(0); - if(ctrl.prop) + assert(task.ctrl); + auto &ctrl { - static f32 *_momentum[2]; - if(!_momentum[0]) - { - _momentum[0] = new f32[sizeof(model::decoder) / 4] {0.0f}; - _momentum[1] = new f32[sizeof(model::decoder) / 4] {0.0f}; - } - - f32 *const momentum[2] - { - _momentum[0], _momentum[1], - }; - - const prof::scope_cycles task_cycles - { - cycles - }; - - backprop(task, ctrl.label[0].loss.mean, *model::default_model, momentum); - } - - if(ctrl.prop) - { - log::debug - { - log, "Backpropagation of %2.6f in %lu cycles.", - ctrl.label[0].loss.mean, - cycles, - }; - - ctrl.epic.epoch = 0; - ctrl.label[0].loss.mean = 0; - ctrl.label[0].loss.last = ctrl.label[0].loss.mean; - ctrl.label[0].perp.mean = 0; - ctrl.label[0].perp.last = ctrl.label[0].perp.mean; - ctrl.prop = false; - pipe::default_model->invalid = true; - return; - } - - cycles = 0; - 
util::timer stopwatch; - { - const prof::scope_cycles task_cycles - { - cycles - }; - - pipe::generate(task); - } - - const milliseconds last_time - { - stopwatch.at() + *task.ctrl }; - ctrl.epic.elapsed += last_time.count(); + gpt::epoch epoch + { + task + }; + + gpt::step step + { + epoch + }; + + gpt::samp samp + { + step + }; + + bool halt {false}; do + { + gpt::pipe::cycle cycle + { + samp + }; + + halt = !samp.evaluate(cycle); + } + while(!halt); +} + +// +// debug +// + +void +ircd::gpt::log_debug_prof(const opts &opts, + const ctrl &ctrl, + const pipe::prof &prof) +{ + static char + buf[2][512]; + + const auto head + { + debug_head(buf[0], opts, ctrl) + }; + + for(uint i(0); i < prof.stages; ++i) + { + if(!std::get<1>(prof.info[i])) + continue; + + log::logf + { + log, log::level::DEBUG, + "%s %2u: %s", + head, + i, + pipe::debug(buf[1], prof, i), + }; + } } void -ircd::gpt::generate_debug(task &task, - const uint &i, - const uint &in_size) +ircd::gpt::log_debug_topn(const opts &opts, + const ctrl &ctrl) { - const auto &opts(*task.opts); - auto &ctrl(*task.ctrl); + static char + buf[2][512]; - const auto j + const auto head { - (i + ctrl.tokens.head) % opts.buffer_tokens + debug_head(buf[0], opts, ctrl) }; - const auto tok + for(uint i(0); i < opts.top_n; ++i) + log::logf + { + log, log::level::DEBUG, + "%s %s", + head, + debug_top(buf[1], opts, ctrl, i), + }; +} + +void +ircd::gpt::log_debug_labels(const opts &opts, + const ctrl &ctrl) +{ + static char + buf[2][512]; + + const auto head { - ctrl.token[j] + debug_head(buf[0], opts, ctrl) }; - static char dbuf[512]; - static char report[1536]; - static char tmbuf[4][64]; - const size_t bsz(ctrl.tokens.count - in_size); - const size_t report_size = snprintf - ( - report, sizeof(report), - "%-3u %-4u %4lu:%-4lu %6.1f%% %5.1fP %6.3fL [%c%c%c] %5u %6.3fL %6.2fP %5.1f%% %s %04x %8s %8s | %8s", - j, - ctrl.tokens.count, - ctrl.epic.epoch, - ctrl.epic.cycle, - 0.0f, // cert - std::clamp(ctrl.label[0].perp.mean, 
0.0f, 100.0f), - std::clamp(ctrl.label[0].loss.mean, 0.0f, 99.99f), - ctrl.label[0].token == tok? '+': ' ', - ' ', // flag place - ' ', // flag place - ctrl.label[0].token, - std::clamp(ctrl.label[0].loss.last, 0.0f, 99.99f), - std::clamp(ctrl.label[0].perp.last, 0.0f, 100.0f), - 0.0f, // cert - vocab::debug(dbuf, tok).c_str(), - tok, - pretty(tmbuf[0], milliseconds(0ms / bsz), 1).c_str(), - pretty(tmbuf[1], si(0UL / bsz), 1).c_str(), - pretty(tmbuf[2], milliseconds(ctrl.epic.elapsed), 1).c_str() - ); + for(uint i(0); i < opts.labels; ++i) + log::logf + { + log, log::level::DEBUG, + "%s %s", + head, + debug_label(buf[1], opts, ctrl, i, 1), + }; +} + +void +ircd::gpt::log_debug_attns_top(const opts &opts, + const ctrl &ctrl) +{ + static char + buf[8][512]; + + const auto head + { + debug_head(buf[0], opts, ctrl) + }; + + std::map tokm; + for(uint i(0); i < opts.layers; ++i) + for(uint j(0); j < opts.attn_rank; ++j) + tokm[ctrl.attn[i][j]]++; + + std::vector> tok(begin(tokm), end(tokm)); + std::sort(begin(tok), end(tok), [&tokm] + (const auto &a, const auto &b) + { + return b.second < a.second; + }); + + for(const auto &[idx, score] : tok) + { + const auto barsz + { + std::min(score, std::min(80U, uint(sizeof(buf[2]) - 1))) + }; + + memset(buf[2], '|', barsz); + buf[2][barsz] = '\0'; + + log::logf + { + log, log::level::DEBUG, + "%s %s [%3u] %s %-3u", + head, + vocab::debug(buf[1], ctrl.token[idx], 1), + idx, + buf[2], + score, + }; + } +} + +void +ircd::gpt::log_debug_attns(const opts &opts, + const ctrl &ctrl) +{ + static char + buf[2][512]; + + const auto head + { + debug_head(buf[0], opts, ctrl) + }; + + for(uint i(0); i < ctrl.count; ++i) + log::logf + { + log, log::level::DEBUG, + "%s %s", + head, + debug_attn(buf[1], opts, ctrl, i), + }; +} + +void +ircd::gpt::log_debug_token(const opts &opts, + const ctrl &ctrl, + const uint i) +{ + static char + buf[2][512]; log::logf { log, log::level::DEBUG, - "%s", - string_view{report, report_size} + "%s %s", + 
debug_head(buf[0], opts, ctrl), + debug_token_at(buf[1], opts, ctrl, i), }; } +void +ircd::gpt::log_debug(const opts &opts, + const ctrl &ctrl) +{ + static char + buf[2][512]; + + log::logf + { + log, log::level::DEBUG, + "%s %s", + debug_head(buf[0], opts, ctrl), + debug(buf[1], opts, ctrl), + }; +} + +/////////////////////////////////////////////////////////////////////////////// // // gpt::task // +void +ircd::gpt::reset(task &task) +noexcept +{ + clear(task); + seed(task); +} + +void +ircd::gpt::clear(task &task) +noexcept +{ + assert(task.ctrl); + memset(task.ctrl, 0x0, sizeof(gpt::ctrl)); +} + +void +ircd::gpt::seed(task &task) +noexcept +{ + assert(task.opts); + seed(task, task.opts->seed); +} + +void +ircd::gpt::seed(task &task, + const uint64_t &val) +noexcept +{ + assert(task.ctrl); + task.ctrl->rand[0] = val; + task.ctrl->rand[1] = val; + task.ctrl->rand[2] = 65537; + task.ctrl->rand[3] = -1UL; +} + +// +// gpt::task::task +// + ircd::gpt::task::task(const gpt::opts *const opts, gpt::ctrl *const ctrl) +try :opts { opts @@ -245,121 +355,996 @@ ircd::gpt::task::task(const gpt::opts *const opts, { ctrl } -,frame +,code { - new gpt::ctrl[opts->frames] + std::make_unique() +} +,model +{ + std::make_unique + ( + *const_cast(gpt::model::default_model) + ) +} +,desc +{ + this->opts, + this->ctrl, + *this->model, + *this->code, } { - memset(ctrl, 0x0, sizeof(gpt::ctrl)); + assert(aligned(opts, size_t(cl::data::gart_page_size))); + assert(aligned(ctrl, size_t(cl::data::gart_page_size))); + seed(*this, this->opts->seed); } +catch(const std::exception &e) +{ + log::error + { + log, "Task ctor :%s", e.what() + }; + + throw; +} ircd::gpt::task::~task() noexcept { } +bool +ircd::gpt::task::operator()() +{ + gpt::epoch epoch + { + *this + }; + + while(!epoch()) + ctx::interruption_point(); + + return done(); +} + +bool +ircd::gpt::task::done() +const noexcept +{ + return false; +} + +/////////////////////////////////////////////////////////////////////////////// // -// 
gpt::opts +// epoch // -ircd_gpt_opts::ircd_gpt_opts(const ircd::gpt::model::decoder *const model) -noexcept -:model +namespace ircd::gpt { - model?: ircd::gpt::model::default_model + static thread_local u16 marker alignas(64) [1024]; } -,seed + +// +// epoch::epoch +// + +ircd::gpt::epoch::epoch(gpt::task &task) +:task +{ + task +} +,desc +{ + task.desc +} +,opts +{ + *task.opts +} +,ctrl +{ + *task.ctrl +} +,id +{ + ctrl.clk.epoch +} +,start +{ + 0 +} +,stop +{ + std::min(start + uint(opts.batch_size), gpt::model::default_data.size()) +} +,moment +{ + gpt::model::default_moment[0], + gpt::model::default_moment[1], +} +{ + assert(task.opts); + assert(task.ctrl); + + ctrl.clk.step = 0; +} + +ircd::gpt::epoch::~epoch() +noexcept +{ + if(opts.debug & 0x80000000U) + log_debug_prof(opts, ctrl, this->profile); +} + +bool +ircd::gpt::epoch::operator()() +{ + gpt::step step + { + *this + }; + + while(!step()) + ctx::interruption_point(); + + if(!step.backpropagate()) + throw error + { + "Failed to backprop." 
+ }; + + return done(); +} + +bool +ircd::gpt::epoch::done() +const noexcept +{ + return ctrl.clk.epoch != id; +} + +void +ircd::gpt::epoch::profile_accumulate(const pipe::prof &profile) +{ + for(size_t i(0); i < profile.ts.size(); ++i) + for(size_t j(0); j < profile.phases; ++j) + this->profile.ts[i][j] += profile.ts[i][j]; +} + +/////////////////////////////////////////////////////////////////////////////// +// +// step::step +// + +ircd::gpt::step::step(gpt::epoch &epoch) +:epoch +{ + epoch +} +,desc +{ + epoch.desc +} +,opts +{ + epoch.opts +} +,ctrl +{ + epoch.ctrl +} +,id +{ + ctrl.clk.step +} +,start +{ + ctrl.clk.step * opts.batch_size +} +{ + assert(opts.batch_size > 0); + + ctrl.clk.samp = 0; + ctrl.hit = 0; + ctrl.miss = 0; + ctrl.target.ppl = {{0}}; + ctrl.target.loss = {{0}}; + ctrl.select.ppl = {{0}}; + ctrl.select.loss = {{0}}; + + for(uint i(0); i < opts.labels; ++i) + { + ctrl.label[i].ppl = {{0}}; + ctrl.label[i].loss = {{0}}; + } +} + +ircd::gpt::step::~step() +noexcept +{ + if(opts.debug & 0x40000000U) + log_debug_prof(opts, ctrl, this->profile); +} + +bool +ircd::gpt::step::backpropagate() +{ + const auto hit + { + ctrl.target.logit.token == ctrl.select.logit.token + }; + + const auto select_loss_mean + { + ctrl.select.loss.mean + }; + + const auto target_loss_mean + { + ctrl.target.loss.mean + }; + + const auto loss_mean + { + (target_loss_mean + select_loss_mean) / 2.0f + }; + + static float mean_best { 10000.0f }, target_mean_best { 10000.0f }; + static ulong hit_best; + static bool tack, last_tack; + last_tack = tack; + + const auto loss + { + loss_mean + }; + + const bool improve_global + { + target_loss_mean < target_mean_best + }; + + const bool improve + { + improve_global + }; + + if(improve) + mean_best = loss, + target_mean_best = target_loss_mean, + hit_best = ctrl.hit; + else + tack = !tack; + + const auto grad + { + !tack? 
loss : -loss + }; + + const auto steps + { + (opts.training_steps + opts.validation_steps + opts.testing_steps) / opts.batch_size + }; + + const auto step + { + this->epoch.id * steps + this->id + }; + + log::logf + { + log, improve? log::level::INFO: log::level::ERROR, + "epoch:%u step:%u completed range[%u -> %zu] dsid:%u target:%-10.7f select:%-10.7f loss:%-10.7f [ %10.7f ] hit:%u miss:%u", + this->epoch.id, + step, + this->start, + this->start + opts.batch_size, + this->id * opts.batch_size + ctrl.clk.samp, + target_loss_mean, + select_loss_mean, + loss, + grad * opts.alpha, + ctrl.hit, + ctrl.miss, + }; + + if(!opts.alpha) + return true; + + if(!improve) + return false; + + cl::exec + { + desc.model->decode->master[0], std::memory_order_acq_rel + }; + + auto &model + { + *mutable_cast(desc.model->decode_const) + }; + + const mutable_buffer model_buffer + { + reinterpret_cast(&model), + sizeof(gpt::model::decoder) * 3 + }; + + const mutable_buffer checkpoint_buffer + { + reinterpret_cast(&model) + sizeof(gpt::model::decoder) * 3, + sizeof(gpt::model::decoder) * 3 + }; + + if(improve) + copy(checkpoint_buffer, model_buffer); + else + copy(model_buffer, checkpoint_buffer); + + ircd::timer stopwatch; + backprop(opts, step, grad, model, epoch.moment); + allocator::sync(model_buffer); + + char pbuf[1][32]; + log::logf + { + log, improve? 
log::level::DEBUG: log::level::ERROR, + "backpropagation step:%u lr:%-8.6f mean:%-10.7f$L hits:%-5u Tbest:%-10.7f$L Mbest:%-10.7f$L Hbest:%-5lu grad:{ %10.7f$L } %s", + step, + opts.alpha, + loss_mean, + ctrl.hit, + target_mean_best, + mean_best, + hit_best, + grad, + pretty(pbuf[0], stopwatch.at(), 1), + }; + + return true; +} + +bool +ircd::gpt::step::operator()() +{ + gpt::samp samp + { + *this + }; + + while(!samp()) + ctx::interruption_point(); + + return done(); +} + +bool +ircd::gpt::step::done() +const noexcept +{ + return ctrl.clk.step != id; +} + +void +ircd::gpt::step::profile_accumulate(const pipe::prof &profile) +{ + for(size_t i(0); i < profile.ts.size(); ++i) + for(size_t j(0); j < profile.phases; ++j) + this->profile.ts[i][j] += profile.ts[i][j]; + + epoch.profile_accumulate(profile); +} + +/////////////////////////////////////////////////////////////////////////////// +// +// samp::samp +// + +ircd::gpt::samp::samp(gpt::step &step) +:step +{ + step +} +,desc +{ + step.desc +} +,opts +{ + step.opts +} +,ctrl +{ + step.ctrl +} +,id +{ + ctrl.clk.samp +} +,accept +{ + -1 +} +,dispatch +{ + 1 +} +,cycle +{ + 0 +} +,tokens +{ + tokenize() +} +,count +{ + int(opts.limit) > 0? + tokens - opts.limit: + int(opts.limit) < 0? 
+ std::abs(int(opts.limit)): + tokens +} +{ + desc.cached = 0; + + ctrl.clk.cycle = cycle; + ctrl.dispatch = dispatch; + ctrl.accept = accept; + ctrl.count = count; + ctrl.tokens = tokens; + ctrl.magic = 0xDEADBEEF; + + for(uint i(0); i < opts.labels; ++i) + { + ctrl.label[i].ppl = {{0}}; + ctrl.label[i].loss = {{0}}; + } + + assert(ctrl.count > 0); + assert(ctrl.count < opts.context_tokens); + assert(ctrl.count <= ctrl.tokens); + + if(opts.debug & 0x01) + for(uint j(0); j < ctrl.count; ++j) + log_debug_token(opts, ctrl, j); +} + +ircd::gpt::samp::~samp() +noexcept +{ + if(run::level != run::level::RUN) + return; + + cl::exec + { + desc.ctrl, std::memory_order_acq_rel + }; + + if(opts.debug & 0x04) + log_debug(opts, ctrl); + + if(opts.debug & 0x40) + log_debug_labels(opts, ctrl); + + if(opts.debug & 0x20000000U) + log_debug_prof(opts, ctrl, this->profile); +} + +bool +ircd::gpt::samp::operator()() +{ + if(dispatch > 0) + { + ctx::interruption_point(); + queue.emplace_back(*this); + desc.cached = tokens; + tokens += count < tokens? 
0: 1; + ++cycle; + ++count; + --dispatch; + return false; + } + + while(!queue.empty()) + { + const unwind pop{[this] + { + queue.pop_front(); + }}; + + if(evaluate(queue.front())) + break; + } + + return done(); +} + +bool +ircd::gpt::samp::done() +const noexcept +{ + return accept >= 0; +} + +uint +ircd::gpt::samp::tokenize() +{ + const auto idx + { + step.start + ctrl.clk.samp + }; + + const gpt::model::text text + { + gpt::model::default_data.at(idx) + }; + + const json::string input + { + json::get<"text"_>(text) + }; + + thread_local char str_buf[16_KiB]; + const string_view str + { + json::unescape(str_buf, input) + }; + + assert(!empty(str)); + static const auto delim + { + "\n\n"_sv + }; + + const int phrases + ( + ircd::token_count(str, delim) + ); + + uint count(0); + int p(phrases); + assert(p >= 0); + + if(startswith(str, delim)) + { + ctrl.token[count++] = 198; + ctrl.token[count++] = 198; + } + + ircd::tokens(str, delim, [this, &count, &p, &phrases] + (const string_view &phrase) -> bool + { + assert(!empty(phrase)); + const vector_view buf + { + ctrl.token + count, opts.buffer_tokens - count + }; + + const auto in + { + gpt::vocab::tokenize(buf, phrase) + }; + + if(count + size(in) + 2 > opts.context_tokens) + return false; + + count += size(in); + ctrl.token[count++] = 198; + ctrl.token[count++] = 198; + + assert(p > 0); + marker[--p] = count; + return true; + }); + + for(assert(p >= 0); p < phrases; ++p) + if(marker[p] <= opts.context_tokens) + break; + + assert(p <= phrases); + count = marker[p]; + + for(uint i(count); i < opts.buffer_tokens; ++i) + ctrl.token[i] = 198; + + if(!endswith(str, delim)) + count -= 2; + + assert(count > 0); + assert(count <= opts.context_tokens); + return count; +} + +bool +ircd::gpt::samp::evaluate(pipe::cycle &cycle) +{ + cl::exec + { + desc.frame[cycle.frame], std::memory_order_consume + }; + + const auto &frame + { + acquire(cycle) + }; + + if(!retire(cycle, frame)) + return false; + + memcpy(&ctrl, &frame, 
sizeof(gpt::ctrl)); + + const uint + batch_size = opts.batch_size, + samps = opts.training_steps + opts.validation_steps + opts.testing_steps, + steps = samps / batch_size; + + const bool + accepting = accept >= 0, + cycling = !accepting, + sampling = accepting, + stepping = sampling && (frame.clk.samp + 1) >= batch_size, + epoching = stepping && (frame.clk.step + 1) >= steps; + + //ctrl[ctrl.count] = ctrl.select.logit.token; + //ctrl.count++; + + if(accepting) + { + ctrl.clk.cycle += cycling; + ctrl.clk.samp += sampling; + ctrl.clk.step += stepping; + ctrl.clk.epoch += epoching; + } + + return true; +} + +bool +ircd::gpt::samp::retire(pipe::cycle &cycle, + const gpt::ctrl &frame) +{ + assert(accept < 0); + accept = frame.accept; + dispatch = frame.dispatch; + + if(cl::profile_queue) + { + const pipe::prof profile + { + cycle + }; + + if(opts.debug & 0x10000000U) + log_debug_prof(opts, frame, profile); + + profile_accumulate(profile); + } + + if(opts.debug & 0x02) + log_debug(opts, frame); + + if(opts.debug & 0x20) + log_debug_labels(opts, frame); + + if(opts.debug & 0x10) + log_debug_topn(opts, frame); + + if(opts.debug & 0x200) + log_debug_attns_top(opts, frame); + + dispatch &= boolmask(ircd::run::level == run::level::RUN); + dispatch &= boolmask(!ctx::interruption_requested()); + dispatch &= boolmask(accept < 0); + const bool finished + { + dispatch == 0 + }; + + return finished; +} + +void +ircd::gpt::samp::profile_accumulate(const pipe::prof &profile) +{ + for(size_t i(0); i < profile.ts.size(); ++i) + for(size_t j(0); j < profile.phases; ++j) + this->profile.ts[i][j] += profile.ts[i][j]; + + step.profile_accumulate(profile); +} + +/////////////////////////////////////////////////////////////////////////////// +// +// ctrl +// + +ircd::string_view +ircd::gpt::debug_top(const mutable_buffer &out, + const opts &opts, + const ctrl &ctrl, + const uint i) +{ + thread_local char buf[2][256]; + + assert(opts.top_n > i); + const auto &top + { + ctrl.top[i] + }; + + 
return fmt::sprintf + { + out, "%s T%02d %s", + vocab::debug(buf[0], top.token, 1), + i, + debug(buf[1], opts, top), + }; +} + +ircd::string_view +ircd::gpt::debug_label(const mutable_buffer &out, + const opts &opts, + const ctrl &ctrl, + const uint i, + const uint fmt) +{ + thread_local char buf[2][256]; + + assert(opts.labels > i); + const auto &label + { + ctrl.label[i] + }; + + return fmt::sprintf + { + out, "%s L%02d %s", + vocab::debug(buf[0], label.logit.token, 1), + i, + debug(buf[1], opts, label, fmt), + }; +} + +ircd::string_view +ircd::gpt::debug_attn(const mutable_buffer &out, + const opts &opts, + const ctrl &ctrl, + const uint ti) +{ + thread_local char buf[4][256]; + assert(ti < ctrl.count); + + memset(buf[1], 0x0, sizeof(buf[1])); + for(uint i(0); i < opts.layers; ++i) + { + const auto f{[&](const auto &a) { return a == ti; }}; + if(std::none_of(ctrl.attn[i], ctrl.attn[i] + opts.attn_rank, f)) + continue; + + strlcat{buf[1], fmt::sprintf + { + buf[2], " %1x[", uint(i) + }}; + + for(uint j(0); j < opts.attn_rank; ++j) + if(ctrl.attn[i][j] == ti) + strlcat{buf[1], fmt::sprintf + { + buf[2], "%1x", uint(j) + }}; + + strlcat{buf[1], "]"_sv}; + } + + return fmt::sprintf + { + out, "%s [%3u] <-%s", + vocab::debug(buf[0], ctrl.token[ti], 1), + ti, + string_view{buf[1]}, + }; +} + +ircd::string_view +ircd::gpt::debug(const mutable_buffer &out, + const opts &opts, + const ctrl &ctrl) +{ + thread_local char + buf[8][128], + tmbuf[4][32]; + + int top_idx {-1}; + for(uint i(0); i < opts.top_n; ++i) + if(ctrl.top[i].token == ctrl.select.logit.token) + { + top_idx = i; + break; + } + + return fmt::sprintf + { + out, "%s %s %c T%02d %4u %6.2f%% %10.7f$L %c %s %s", + vocab::debug(buf[0], ctrl.select.logit.token, 1), + debug(buf[1], opts, ctrl.select), + ctrl.target.logit.token == ctrl.top[0].token? 
'=' : ' ', + top_idx, + ctrl.hit, + (ctrl.hit / float(ctrl.hit + ctrl.miss)) * 100.0f, + ctrl.target.loss.mean - ctrl.select.loss.mean, + ctrl.target.logit.token == ctrl.select.logit.token? '=' : ' ', + debug(buf[2], opts, ctrl.target), + vocab::debug(buf[3], ctrl.target.logit.token, 1), + }; +} + +ircd::string_view +ircd::gpt::debug(const mutable_buffer &out, + const opts &opts, + const ctrl_label &label, + const uint fmt) +{ + thread_local char buf[64], bar[128]; + + const auto diff + { + log2f(65536) - label.loss.mean + }; + + const auto pct + { + (diff / log2f(opts.logits)) * 100.0f + }; + + const auto barsz + { + std::min(uint(pct), std::min(66U, uint(sizeof(bar) - 1))) + }; + + memset(bar, '|', barsz); + bar[barsz] = '\0'; + + return fmt::sprintf + { + out, + fmt == 1? + "%s %10.7f$La %6.2f%% %s": + "%s %10.7f$La", + debug(buf, opts, label.logit, fmt), + label.loss.mean, + pct, + string_view{bar}, + }; +} + +ircd::string_view +ircd::gpt::debug(const mutable_buffer &out, + const opts &opts, + const ctrl_logit &logit, + const uint fmt) +{ + return fmt::sprintf + { + out, "%6.2f%% %10.7f$L %5.1f$P", + logit.samax * 100.0f, + +0.0f - logf(logit.samax), + (1.0f - logit.samax) * log2f(opts.logits), + }; +} + +ircd::string_view +ircd::gpt::debug_head(const mutable_buffer &out, + const opts &opts, + const ctrl &ctrl) +{ + thread_local char head[64]; + + return fmt::sprintf + { + out, "%s[%4u]-%1u", + debug_head(head, opts, ctrl.clk), + ctrl.count, + ctrl.dispatch, + }; +} + +ircd::string_view +ircd::gpt::debug_head(const mutable_buffer &out, + const opts &opts, + const ctrl_clk &clk) +{ + return fmt::sprintf + { + out, "%02u:%06u|%04u|%04u|%04u", + clk.epoch, + clk.step * opts.batch_size + clk.samp, + clk.step, + clk.samp, + clk.cycle, + }; +} + +ircd::string_view +ircd::gpt::debug_token(const mutable_buffer &out, + const opts &opts, + const ctrl &ctrl, + const uint fmt) +{ + assert(ctrl.count > 0); + const auto pos + { + ctrl.count - 1 + }; + + return 
debug_token_at(out, opts, ctrl, pos, fmt); +} + +ircd::string_view +ircd::gpt::debug_token_at(const mutable_buffer &out, + const opts &opts, + const ctrl &ctrl, + const uint i, + const uint fmt) +{ + const auto &token + { + ctrl.token[i] + }; + + return vocab::debug(out, token, fmt); +} + +/////////////////////////////////////////////////////////////////////////////// +// +// opts +// + +ircd_gpt_opts::ircd_gpt_opts() +noexcept +:seed { 1234567890UL } -,limit -{ - -1U -} ,top_k { - 2U + 16 } ,top_p { - 90U + 0.90f } ,top_n { - 16 + 0 } ,labels { 0 } +,frames +{ + 8 +} +,limit +{ + -1U +} ,debug { - 0x01 + 0x00 } -,context_tokens +,accept { - 1024U + { 198, 198, ushort(-1), }, + { 0, 0, 0, ushort(-1), }, + { ushort(-1), }, + { ushort(-1), }, } -,buffer_tokens +,batch_size { - 1024U -} -,embed_elems -{ - 768U -} -,attn_rank -{ - 12U -} -,attn_mult -{ - 3U -} -,ffnn_mult -{ - 4U -} -,attn_elems -{ - embed_elems * attn_mult -} -,ffnn_elems -{ - embed_elems * ffnn_mult -} -,lanes -{ - 4U -} -,layers -{ - 12 -} -,embed_width -{ - embed_elems / lanes -} -,attn_width -{ - attn_elems / lanes -} -,attn_height -{ - embed_elems / lanes -} -,ffnn_width -{ - ffnn_elems / lanes -} -,ffnn_height -{ - embed_elems / lanes -} -,logits -{ - 50257 + 32 } ,training_steps { @@ -375,7 +1360,7 @@ noexcept } ,alpha { - 0.001f + 0.00002 } ,beta { @@ -384,7 +1369,95 @@ noexcept } ,epsilon { - 0.000001 + 0.00001 +} +,lambda +{ + 0.5 +} +,logits +{ + 50256 +} +,buffer_tokens +{ + 1024 - 16 // XXX +} +,context_tokens +{ + 512 // 1024 +} +,layers +{ + 12 +} +,lanes +{ + 4 +} +,embed_elems +{ + 768 +} +,embed_width +{ + embed_elems / lanes +} +,attn_rank +{ + 12 +} +,attn_mult +{ + 3 +} +,attn_elems +{ + embed_elems * attn_mult +} +,attn_fcon_width +{ + attn_elems / lanes +} +,attn_fcon_height +{ + embed_elems / lanes +} +,attn_proj_width +{ + embed_elems / lanes +} +,attn_proj_height +{ + embed_elems / lanes +} +,attn_self_elems +{ + (uint(powl(context_tokens, 2)) / 2) * attn_rank +} +,ffnn_mult 
+{ + 4 +} +,ffnn_elems +{ + embed_elems * ffnn_mult +} +,ffnn_fcon_width +{ + ffnn_elems / lanes +} +,ffnn_fcon_height +{ + embed_elems / lanes +} +,ffnn_proj_width +{ + embed_elems / lanes +} +,ffnn_proj_height +{ + ffnn_elems / lanes } { } diff --git a/ircd/gpt_cpu.cc b/ircd/gpt_cpu.cc index a338b9369..d2ef62a1e 100644 --- a/ircd/gpt_cpu.cc +++ b/ircd/gpt_cpu.cc @@ -8,17 +8,21 @@ // copyright notice and this permission notice is present in all copies. The // full license for this software is available in the LICENSE file. +#pragma clang fp exceptions(ignore) +#pragma clang fp reassociate(on) +#pragma clang fp contract(fast) + namespace ircd::gpt { - static size_t adamw(f32x4 &, f32x4 &, f32x4 &, const f32, const f32, const f32, const f32, const u32, size_t); - static size_t adamw(task &, const f32, f32 *, const size_t, f32 *const (&)[2], const size_t); + static void adamw(const opts &, const u32, const f32, const uint, f32 *, f32 *, f32 *); - static size_t backprop(task &, const f32, model::norm &, f32 *const (&)[2], size_t); - static size_t backprop(task &, const f32, model::attn &, f32 *const (&)[2], size_t); - static size_t backprop(task &, const f32, model::ffnn &, f32 *const (&)[2], size_t); - static size_t backprop(task &, const f32, model::block &, f32 *const (&)[2], size_t); - static size_t backprop(task &, const f32, model::embed &, f32 *const (&)[2], size_t); - extern size_t backprop(task &, const f32, model::decoder &, f32 *const (&)[2], size_t = 0); + static void backprop(const opts &, const u32, const f32, model::norm &, model::norm &, model::norm &); + static void backprop(const opts &, const u32, const f32, model::attn &, model::attn &, model::attn &); + static void backprop(const opts &, const u32, const f32, model::ffnn &, model::ffnn &, model::ffnn &); + static void backprop(const opts &, const u32, const f32, model::block &, model::block &, model::block &); + static void backprop(const opts &, const u32, const f32, model::embed &, model::embed 
&, model::embed &); + static void backprop(const opts &, const u32, const f32, model::decoder &, model::decoder &, model::decoder &); + extern void backprop(const opts &, const u32, const f32, model::decoder &, f32 *const __restrict__ [2]) noexcept; template static void fmma(T *out, const T *in, const T *bias, const T *weight, const math::fmma_opts &); @@ -37,7 +41,7 @@ namespace ircd::gpt static void logits(float *, const float *, const model::decoder &); static void tail(float *, const float *, const model::decoder &); static u16 argmax(const float *, const opts &); - static void embed(float *, const u16 token, const u16 position, const opts &); + static void embed(float *, const u16 token, const u16 position, const model::decoder &); static f32 logit alignas(64) [65536], @@ -49,21 +53,20 @@ void ircd::gpt::embed(float *const out, const u16 token, const u16 position, - const opts &opts) + const model::decoder &model) { - assert(opts.model); const auto &wpe { - opts.model->word.pos[position] + model.embed.pos[position] }; const auto &wte { - opts.model->word.token[token] + model.embed.token[token] }; for(uint j(0); j < 768; ++j) - out[j] = wte[j] + wpe[j]; + out[j] = wte.elem[j] + wpe.elem[j]; } uint16_t @@ -117,7 +120,7 @@ ircd::gpt::tail(float *const __restrict__ logit, for(uint i(0); i < 768; ++i) buf[0][i] = state[i]; - norm((f32x4 *)buf[0], (const f32x4 *)state, (const f32x4 *)d.f.bias, (const f32x4 *)d.f.weight, lnf_epsilon); + norm((f32x4 *)buf[0], (const f32x4 *)state, (const f32x4 *)d.embed.norm.bias.elem, (const f32x4 *)d.embed.norm.weight.elem, lnf_epsilon); logits(logit, buf[0], d); //logitsmax(logit, logit, vocab::tokens); } @@ -132,7 +135,7 @@ ircd::gpt::logits(float *const __restrict__ out, for(uint j(0); j < vocab::tokens; ++j) for(uint k(0); k < 768; ++k) - out[j] += in[k] * d.word.token[j][k]; + out[j] += in[k] * d.embed.token[j].elem[k]; } [[gnu::noinline]] @@ -192,7 +195,7 @@ ircd::gpt::coil(float *__restrict__ accum, }; for(uint j(0); j < 
tokens; ++j) - fmma((f32x4 *)(accum + j * 768), (const f32x4 *)(a[j]), (const f32x4 *)layer.attn.proj_bias, (const f32x4 *)layer.attn.proj_weight, fmma_opts); + fmma((f32x4 *)(accum + j * 768), (const f32x4 *)(a[j]), (const f32x4 *)layer.attn.proj_bias.elem, (const f32x4 *)layer.attn.proj_weight, fmma_opts); for(uint j(0); j < tokens; ++j) ffnn(accum + j * 768, accum + j * 768, decoder, i); @@ -227,7 +230,7 @@ ircd::gpt::attn(float (&__restrict__ out)[3][1024][12][64], buf alignas(64) [768], proj alignas(64) [2304]; - norm((f32x4 *)buf, (const f32x4 *)(in + i * 768), (const f32x4 *)layer.ln1.bias, (const f32x4 *)layer.ln1.weight, ln1_epsilon); + norm((f32x4 *)buf, (const f32x4 *)(in + i * 768), (const f32x4 *)layer.attn.norm.bias.elem, (const f32x4 *)layer.attn.norm.weight.elem, ln1_epsilon); static const math::fmma_opts fmma_opts { @@ -235,7 +238,7 @@ ircd::gpt::attn(float (&__restrict__ out)[3][1024][12][64], }; memset(proj, 0x0, sizeof(proj)); - fmma((f32x4 *)proj, (const f32x4 *)buf, (const f32x4 *)layer.attn.attn_bias, (const f32x4 *)layer.attn.attn_weight, fmma_opts); + fmma((f32x4 *)proj, (const f32x4 *)buf, (const f32x4 *)layer.attn.fcon_bias.fcon, (const f32x4 *)layer.attn.fcon_weight, fmma_opts); #pragma clang loop unroll (disable) for(uint j(0); j < 12; ++j) @@ -372,10 +375,10 @@ ircd::gpt::ffnn(float *const out, buf2 alignas(64) [3072]; memset(buf2, 0x0, sizeof(buf2)); - norm((f32x4 *)buf, (const f32x4 *)in, (const f32x4 *)layer.ln2.bias, (const f32x4 *)layer.ln2.weight, ln2_epsilon); - fmma((f32x4 *)buf2, (const f32x4 *)buf, (const f32x4 *)layer.ffnn.fc_bias, (const f32x4 *)layer.ffnn.fc_weight, fmma3_opts); + norm((f32x4 *)buf, (const f32x4 *)in, (const f32x4 *)layer.ffnn.norm.bias.elem, (const f32x4 *)layer.ffnn.norm.weight.elem, ln2_epsilon); + fmma((f32x4 *)buf2, (const f32x4 *)buf, (const f32x4 *)layer.ffnn.fcon_bias.fcon, (const f32x4 *)layer.ffnn.fcon_weight, fmma3_opts); gelu((f32x4 *)buf2, (const f32x4 *)buf2); - fmma((f32x4 *)out, (const 
f32x4 *)buf2, (const f32x4 *)layer.ffnn.proj_bias, (const f32x4 *)layer.ffnn.proj_weight, fmma4_opts); + fmma((f32x4 *)out, (const f32x4 *)buf2, (const f32x4 *)layer.ffnn.proj_bias.elem, (const f32x4 *)layer.ffnn.proj_weight, fmma4_opts); } void @@ -431,213 +434,219 @@ ircd::gpt::gelu(f32x4 &out, // [[gnu::noinline]] -size_t -ircd::gpt::backprop(task &task, +void +ircd::gpt::backprop(const opts &opts, + const u32 step, const f32 grad, - model::decoder ¶m, - f32 *const (&moment)[2], - size_t off) + model::decoder &__restrict__ param, + f32 *const __restrict__ buf[2]) +noexcept { - for(uint i(0); i < 12; ++i) - off = backprop(task, grad, param.layer[i], moment, off); - - off = backprop(task, grad, param.f, moment, off); - off = backprop(task, grad, param.word, moment, off); - return off; -} - -size_t -ircd::gpt::backprop(task &task, - const f32 grad, - model::embed ¶m, - f32 *const (&moment)[2], - size_t off) -{ - assert(task.opts); - const auto &opts + model::decoder *const __restrict__ moment[2] { - *task.opts + reinterpret_cast(buf[0]), + reinterpret_cast(buf[1]), }; + backprop(opts, step, grad, param, *moment[0], *moment[1]); +} + +void +ircd::gpt::backprop(const opts &opts, + const u32 step, + const f32 grad, + model::decoder &__restrict__ param, + model::decoder &__restrict__ moment0, + model::decoder &__restrict__ moment1) +{ + fpe::errors_handle eh; + + assume(opts.attn_rank > 0); + assume(opts.layers > 0); + const auto eln + { + opts.layers - 1 // step % opts.layers + }; + + for(int i(opts.layers - 1); i >= int(opts.layers - eln - 1); --i) + { + assert(i >= 0 && i < int(opts.layers)); + backprop(opts, step, grad, param.layer[i], moment0.layer[i], moment1.layer[i]); + } + + backprop(opts, step, grad, param.embed, moment0.embed, moment1.embed); + + auto pending(eh.pending()); + eh.clear_pending(); + pending &= ~pending & FE_INEXACT; + if(unlikely(pending)) + fpe::throw_errors(pending); +} + +void +ircd::gpt::backprop(const opts &opts, + const u32 step, + const 
f32 grad, + model::embed &__restrict__ param, + model::embed &__restrict__ moment0, + model::embed &__restrict__ moment1) +{ + backprop(opts, step, grad, param.norm, moment0.norm, moment1.norm); + + assume(opts.context_tokens > 0); for(uint i(0); i < opts.context_tokens; ++i) - off = adamw(task, grad, param.pos[i], 768, moment, off); + adamw(opts, step, grad, 768, param.pos[i].elem, moment0.pos[i].elem, moment1.pos[i].elem); + assume(opts.logits > 0); for(uint i(0); i < opts.logits; ++i) - off = adamw(task, grad, param.token[i], 768, moment, off); - - return off; + adamw(opts, step, grad, 768, param.token[i].elem, moment0.token[i].elem, moment1.token[i].elem); } -size_t -ircd::gpt::backprop(task &task, +void +ircd::gpt::backprop(const opts &opts, + const u32 step, const f32 grad, - model::block ¶m, - f32 *const (&moment)[2], - size_t off) + model::block &__restrict__ param, + model::block &__restrict__ moment0, + model::block &__restrict__ moment1) { - off = backprop(task, grad, param.ln1, moment, off); - off = backprop(task, grad, param.attn, moment, off); - off = backprop(task, grad, param.ln2, moment, off); - off = backprop(task, grad, param.ffnn, moment, off); - return off; + backprop(opts, step, grad, param.attn.norm, moment0.attn.norm, moment1.attn.norm); + backprop(opts, step, grad, param.attn, moment0.attn, moment1.attn); + + backprop(opts, step, grad, param.ffnn.norm, moment0.ffnn.norm, moment1.ffnn.norm); + backprop(opts, step, grad, param.ffnn, moment0.ffnn, moment1.ffnn); } -size_t -ircd::gpt::backprop(task &task, +void +ircd::gpt::backprop(const opts &opts, + const u32 step, const f32 grad, - model::attn ¶m, - f32 *const (&moment)[2], - size_t off) + model::attn &__restrict__ param, + model::attn &__restrict__ moment0, + model::attn &__restrict__ moment1) { - off = adamw(task, grad, param.attn_bias, 2304, moment, off); + adamw(opts, step, grad, 2304, param.fcon_bias.fcon, moment0.fcon_bias.fcon, moment1.fcon_bias.fcon); for(uint i(0); i < 768; ++i) - 
off = adamw(task, grad, param.attn_weight[i], 2304, moment, off); + adamw(opts, step, grad, 2304, param.fcon_weight[i].fcon, moment0.fcon_weight[i].fcon, moment1.fcon_weight[i].fcon); - off = adamw(task, grad, param.proj_bias, 768, moment, off); + adamw(opts, step, grad, 768, param.proj_bias.elem, moment0.proj_bias.elem, moment1.proj_bias.elem); for(uint i(0); i < 768; ++i) - off = adamw(task, grad, param.proj_weight[i], 768, moment, off); - - return off; + adamw(opts, step, grad, 768, param.proj_weight[i].elem, moment0.proj_weight[i].elem, moment1.proj_weight[i].elem); } -size_t -ircd::gpt::backprop(task &task, +void +ircd::gpt::backprop(const opts &opts, + const u32 step, const f32 grad, - model::ffnn ¶m, - f32 *const (&moment)[2], - size_t off) + model::ffnn &__restrict__ param, + model::ffnn &__restrict__ moment0, + model::ffnn &__restrict__ moment1) { - off = adamw(task, grad, param.fc_bias, 3072, moment, off); + adamw(opts, step, grad, 3072, param.fcon_bias.fcon, moment0.fcon_bias.fcon, moment1.fcon_bias.fcon); for(uint i(0); i < 768; ++i) - off = adamw(task, grad, param.fc_weight[i], 3072, moment, off); + adamw(opts, step, grad, 3072, param.fcon_weight[i].fcon, moment0.fcon_weight[i].fcon, moment1.fcon_weight[i].fcon); - off = adamw(task, grad, param.proj_bias, 768, moment, off); + adamw(opts, step, grad, 768, param.proj_bias.elem, moment0.proj_bias.elem, moment1.proj_bias.elem); for(uint i(0); i < 3072; ++i) - off = adamw(task, grad, param.proj_weight[i], 768, moment, off); - - return off; + adamw(opts, step, grad, 768, param.proj_weight[i].elem, moment0.proj_weight[i].elem, moment1.proj_weight[i].elem); } -size_t -ircd::gpt::backprop(task &task, +void +ircd::gpt::backprop(const opts &opts, + const u32 step, const f32 grad, - model::norm ¶m, - f32 *const (&moment)[2], - size_t off) + model::norm &__restrict__ param, + model::norm &__restrict__ moment0, + model::norm &__restrict__ moment1) { - off = adamw(task, grad, param.bias, 768, moment, off); - off = 
adamw(task, grad, param.weight, 768, moment, off); - return off; + adamw(opts, step, grad, 768, param.bias.elem, moment0.bias.elem, moment1.bias.elem); + adamw(opts, step, grad, 768, param.weight.elem, moment0.weight.elem, moment1.weight.elem); } -[[gnu::noinline]] -size_t -ircd::gpt::adamw(task &task, - const f32 grad, - f32 *const p_, - const size_t num, - f32 *const (&__restrict__ m_)[2], - size_t off) +namespace ircd::gpt { - assert(task.opts); - const auto &opts + static f32x4 adamw_moment(const f32x4, const f32, const f32); + static f32x4 adamw_numer(const f32x4, const f32, const u32); + static f32x4 adamw_denom(const f32x4, const f32, const u32); + static f32x4 adamw_delta(const f32x4, const f32x4, const f32, const f32, const f32, const u32); + static void adamw(f32x4 &, f32x4 &, f32x4 &, const f32, const f32, const f32, const f32, const u32); + static void adamw(const opts &, const u32, const f32, const u32, f32 *, f32 *, f32 *); +} + +void +ircd::gpt::adamw(const opts &opts, + const u32 step, + const f32 grad, + const u32 num, + f32 *const __restrict__ param, + f32 *const __restrict__ moment0, + f32 *const __restrict__ moment1) +{ + f32x4 *const __restrict__ val[3] { - *task.opts + reinterpret_cast(param), + reinterpret_cast(moment0), + reinterpret_cast(moment1), }; - assert(task.ctrl); - auto &ctrl + const auto n { - *task.ctrl + num / 4 }; - f32x4 *const p[3] - { - reinterpret_cast(p_), - reinterpret_cast(m_[0]) + off, - reinterpret_cast(m_[1]) + off, - }; - - assert(num >= 4); - const uint n - { - uint(num) / 4 - }; - - // Assume loop body always taken w/o soundness; otherwise extra branch. 
- assert(n > 0); - uint i(0); do - { - off = adamw + assume(0 < n); + for(uint i(0); i < n; ++i) + adamw ( - p[0][i], - p[1][i], - p[2][i], + val[0][i], + val[1][i], + val[2][i], grad, opts.alpha, opts.beta[0], opts.beta[1], - ctrl.epic.step, - off + step + 1 ); - } - while(++i < n); - - return off; } -size_t +void ircd::gpt::adamw(f32x4 &__restrict__ param, f32x4 &__restrict__ moment0, f32x4 &__restrict__ moment1, - const f32 grad, - const f32 alpha, + const f32 grad_, + const f32 alpha_, const f32 beta0, const f32 beta1, - const u32 step, - const size_t off) + const u32 step) { - const f32x4 one + const f32 alpha { - 1.0f, 1.0f, 1.0f, 1.0f, + grad_ < 0? -alpha_ : alpha_ }; - const f32x4 a[2] + const f32 grad { - { one - beta0 }, - { one - beta1 }, + grad_ < 0? -grad_ : grad_ }; - const f32x4 avg_mul[2] + const f32 grad_grad { - { moment0 * beta0 }, - { moment1 * beta1 }, + grad * grad }; - const f32x4 avg_dot[2] + const f32x4 moment[] { - { avg_mul[0] + a[0] * grad }, - { avg_mul[1] + a[1] * grad * grad }, - }; - - const f32x4 bias[2] - { - { avg_dot[0] / (one - powf(beta0, step + 1)) }, - { avg_dot[1] / (one - powf(beta1, step + 1)) }, - }; - - const f32x4 denom - { - sqrtf(bias[1]) + 0.000001f // epsilon + adamw_moment(moment0, grad, beta0), + adamw_moment(moment1, grad_grad, beta1) }; const f32x4 delta { - alpha * (bias[0] / denom) + adamw_delta(moment[0], moment[1], alpha, beta0, beta1, step) }; const f32x4 update @@ -645,8 +654,168 @@ ircd::gpt::adamw(f32x4 &__restrict__ param, param - delta }; - moment0 = avg_dot[0]; - moment1 = avg_dot[1]; + if((false)) + for(uint i(0); i < 4; ++i) + printf("%-15p p[%11.8lf] m[%11.8lf][%11.8lf] g[%11.8lf] d[%11.8lf] p[%11.8lf] m[%11.8lf][%11.8lf]\n", + ((f32 *)¶m) + i, + param[i], + moment0[i], + moment1[i], + grad, + delta[i], + update[i], + moment[0][i], + moment[1][i]); + + assert(std::isnormal(update[0])); + assert(std::isnormal(update[1])); + assert(std::isnormal(update[2])); + assert(std::isnormal(update[3])); + + 
assert(std::isnormal(moment[0][0])); + assert(std::isnormal(moment[0][1])); + assert(std::isnormal(moment[0][2])); + assert(std::isnormal(moment[0][3])); + + assert(std::isnormal(moment[1][0])); + assert(std::isnormal(moment[1][1])); + assert(std::isnormal(moment[1][2])); + assert(std::isnormal(moment[1][3])); + param = update; - return off + 1; + //__builtin_nontemporal_store(update, ¶m); + + moment0 = moment[0]; + moment1 = moment[1]; + //__builtin_nontemporal_store(moment[0], &moment0); + //__builtin_nontemporal_store(moment[1], &moment1); +} + +ircd::f32x4 +ircd::gpt::adamw_delta(const f32x4 moment0, + const f32x4 moment1, + const f32 alpha, + const f32 beta0, + const f32 beta1, + const u32 step) +{ + static const f32 epsilon + { + FLT_EPSILON + }; + + const f32x4 denom + { + adamw_denom(moment1, beta1, step) + epsilon + }; + + const f32x4 decay + { + adamw_numer(moment0, beta0, step) + }; + + const f32x4 smooth + { + alpha * decay + }; + + assert(std::isnormal(denom[0])); + assert(std::isnormal(denom[1])); + assert(std::isnormal(denom[2])); + assert(std::isnormal(denom[3])); + const f32x4 delta + { + smooth / denom + }; + + return delta; +} + +ircd::f32x4 +ircd::gpt::adamw_denom(const f32x4 moment, + const f32 beta, + const u32 step) +{ + static const f32x4 one + { + 1.0f, 1.0f, 1.0f, 1.0f, + }; + + assert(step > 0); + const f32x4 decay + { + one - powf(beta, step) + }; + + assert(std::isnormal(decay[0])); + assert(std::isnormal(decay[1])); + assert(std::isnormal(decay[2])); + assert(std::isnormal(decay[3])); + const f32x4 bias + { + moment / decay + }; + + const f32x4 denom + { + sqrtf(bias) + }; + + return denom; +} + +ircd::f32x4 +ircd::gpt::adamw_numer(const f32x4 moment, + const f32 beta, + const u32 step) +{ + static const f32x4 one + { + 1.0f, 1.0f, 1.0f, 1.0f, + }; + + assert(step > 0); + const f32x4 decay + { + one - powf(beta, step) + }; + + assert(std::isnormal(decay[0])); + assert(std::isnormal(decay[1])); + assert(std::isnormal(decay[2])); + 
assert(std::isnormal(decay[3])); + const f32x4 bias + { + moment / decay + }; + + return bias; +} + +ircd::f32x4 +ircd::gpt::adamw_moment(const f32x4 moment, + const f32 grad, + const f32 beta) +{ + static const f32x4 one + { + 1.0f, 1.0f, 1.0f, 1.0f, + }; + + const f32x4 rate + { + one - beta + }; + + const f32x4 avg + { + moment * beta + }; + + const f32x4 dot + { + rate * grad + avg + }; + + return dot; } diff --git a/ircd/gpt_gpu.cl b/ircd/gpt_gpu.cl index 661f6b465..9104e82f9 100644 --- a/ircd/gpt_gpu.cl +++ b/ircd/gpt_gpu.cl @@ -8,31 +8,1458 @@ // copyright notice and this permission notice is present in all copies. The // full license for this software is available in the LICENSE file. +//#pragma OPENCL EXTENSION cl_amd_device_attribute_query : enable +//#pragma OPENCL EXTENSION cl_khr_byte_addressable_store :enable +//#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +//#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +//#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable +//#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable + +#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable + +#pragma clang fp exceptions(ignore) +#pragma clang fp reassociate(on) +#pragma clang fp contract(fast) + +#include +#include + +#define __region __attribute__((address_space(0x02))) + +#if !defined(assume) + #define assume(x) __builtin_assume(x) +#endif + +#if defined(__SPIR) + #define restrict +#elif defined(__cplusplus) + #define restrict __restrict +#endif + +#if __OPENCL_VERSION__ < 120 + #define static __attribute__((internal_linkage)) +#else + #define static __constant static +#endif + +#pragma clang attribute 
push(__attribute__((always_inline)), apply_to = function) +#pragma clang attribute push(__attribute__((internal_linkage)), apply_to = function) #include -#include +#include #include #include +#pragma clang attribute pop +#pragma clang attribute pop -inline void -__attribute__((always_inline)) -ircd_gpt_norm_fmad(__local float4 *const out, - __local const float4 *const in, - __global const float4 *const restrict bias, - __global const float4 *const restrict weight, - const uint i) +#if __OPENCL_VERSION__ >= 120 + #undef static +#endif + +#include + +// +// head +// + +__kernel void +__attribute__((visibility("protected"))) +__attribute__((reqd_work_group_size(192, 1, 1))) +__attribute__((amdgpu_flat_work_group_size(192, 192))) +ircd_gpt_alloc(__global const void *const restrict model, + __global void *const restrict master, + __constant const void *const opts, + __global void *const restrict ctrl, + __global void *const restrict frame0, + __global void *const restrict frame1, + __global void *const restrict frame2, + __global void *const restrict frame3, + __global void *const restrict frame4, + __global void *const restrict frame5, + __global void *const restrict frame6, + __global void *const restrict frame7) { - out[i] = in[i] * weight[i] + bias[i]; +} + +__kernel void +__attribute__((visibility("protected"))) +__attribute__((reqd_work_group_size(192, 1, 1))) +__attribute__((amdgpu_flat_work_group_size(192, 192))) +ircd_gpt_enter(__global const void *const restrict model, + __global void *const restrict state, + __global void *const restrict master, + __constant const struct ircd_gpt_opts *const opts, + __global struct ircd_gpt_ctrl *const restrict ctrl) +{ + const ushort + gi = get_global_id(0), + li = get_local_id(0), + ln = get_local_size(0), + cycle = ctrl->clk.cycle; + + if(li == 0) + ;//ctrl->prof.entered = __builtin_readcyclecounter(); +} + +__kernel void +__attribute__((vec_type_hint(float4))) +__attribute__((reqd_work_group_size(192, 1, 1))) 
+__attribute__((amdgpu_flat_work_group_size(192, 192))) +ircd_gpt_lm_embed(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global ircd_gpt_vectorv *const restrict accum, + __global const ircd_gpt_vectorv *const restrict pos, + __global const ircd_gpt_vectorv *const restrict vocab) +{ + const ushort + li = get_local_id(0), + ln = get_local_size(0); + + const uint + wo = get_global_offset(0); + + assume(ln == 192); + assume(wo % ln == 0); + + const ushort + wi = wo / ln + get_group_id(0); + + _ircd_gpt_lm_embed(ctrl, opts, accum, pos, vocab, wi, wi, li); +} + +static void +_ircd_gpt_lm_embed(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global ircd_gpt_vectorv *const restrict accum, + __global const ircd_gpt_vectorv *const restrict pos, + __global const ircd_gpt_vectorv *const restrict vocab, + const ushort out_idx, + const ushort tok_idx, + const ushort elem_idx) +{ + const ushort + token = ctrl->token[tok_idx]; + + const float4 + wpe = pos[tok_idx].elem[elem_idx], + wte = vocab[token].elem[elem_idx], + res = wte + wpe; + + accum[out_idx].elem[elem_idx] = res; +} + +// +// Frontside +// + +void +ircd_gpt_ffnn_fcon_tmul(__constant const struct ircd_gpt_opts *const opts, + __local ircd_gpt_ffnn_aperaturev *const restrict out, + __local const ircd_gpt_vectorv *const restrict in, + __global const ircd_gpt_ffnn_aperaturev *const restrict bias, + __global const ircd_gpt_ffnn_aperaturev *const restrict weight, + const uint li) +{ + const uint + lanes = 4, + segs = ircd_gpt_ffnn_segs, + height = ircd_gpt_vector_elems / lanes; + + assume(height > 0); + assume(height % lanes == 0); + + for(uint x = 0; x < segs; ++x) + out->proj[x][li] = bias->proj[x][li]; + + for(uint y = 0; y < height; ++y) + for(uint k = 0; k < lanes; ++k) + for(uint x = 0; x < segs; ++x) + { + const uint + row = y * lanes + k; + + out->proj[x][li] += in->elem[y][k] * 
weight[row].proj[x][li]; + } +} + +void +ircd_gpt_ffnn_fcon(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __local ircd_gpt_ffnn_aperaturev *const restrict out, + __local const ircd_gpt_vectorv *const restrict in, + __global const ircd_gpt_ffnn_aperaturev *const restrict bias, + __global const ircd_gpt_ffnn_aperaturev *const restrict weight, + const uint ln, + const uint li) +{ + const uint + segs = ircd_gpt_ffnn_segs; + + // Fully connected + ircd_gpt_ffnn_fcon_tmul + ( + opts, + out, + in, + bias, + weight, + li + ); + + for(uint i = 0; i < segs; ++i) + ircd_gpt_ffnn_gelu(out, out, i * ln + li); +} + +void +ircd_gpt_ffnn_proj_tmul(__constant const struct ircd_gpt_opts *const opts, + __local ircd_gpt_vectorv *const restrict out, + __local const ircd_gpt_ffnn_aperaturev *const restrict in, + __global const ircd_gpt_vectorv *const restrict bias, + __global const ircd_gpt_vectorv *const restrict weight, + const uint li) +{ + const uint + lanes = 4, + height = ircd_gpt_ffnn_fcon_elems / lanes; + + assume(height > 0); + assume(height % lanes == 0); + + out->elem[li] = bias->elem[li]; + + for(uint y = 0; y < height; ++y) + for(uint k = 0; k < lanes; ++k) + { + const uint + row = y * lanes + k; + + out->elem[li] += in->fcon[y][k] * weight[row].elem[li]; + } +} + +void +ircd_gpt_ffnn(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __local ircd_gpt_vectorv *const restrict token, + __local ircd_gpt_ffnn_aperaturev *const restrict buf, + __local ircd_gpt_vectorv *const restrict tmp, + __global const ircd_gpt_vectorv *const restrict norm_bias, + __global const ircd_gpt_vectorv *const restrict norm_weight, + __global const ircd_gpt_ffnn_aperaturev *const restrict fcon_bias, + __global const ircd_gpt_ffnn_aperaturev *const restrict fcon_weight, + __global const ircd_gpt_vectorv *const restrict proj_bias, + __global const ircd_gpt_vectorv *const restrict proj_weight, 
+ const uint ln, + const uint li) +{ + // Layer re-normalization + ircd_gpt_norm(token, token, tmp, norm_bias, norm_weight, ln, li); + + // ln's writes are still pending but fcon reads results across threads. + barrier(CLK_LOCAL_MEM_FENCE); + + // Fully connected + ircd_gpt_ffnn_fcon + ( + ctrl, + opts, + buf, + token, + fcon_bias, + fcon_weight, + ln, + li + ); + + // fcon's writes are still pending but proj reads results across threads. + barrier(CLK_LOCAL_MEM_FENCE); + + // Projection + ircd_gpt_ffnn_proj_tmul + ( + opts, + token, + buf, + proj_bias, + proj_weight, + li + ); +} + +static void +ircd_gpt_attn_self_samax(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __local float self[restrict][12], + const uint ln, + const uint li, + const uint wn, + const uint wi) +{ + struct ircd_math_samax samax = + { + .mu = -10000.0f, + .sum = 0.0f, + }; + + __attribute__((opencl_unroll_hint)) + for(uint i = 0; i < wn; ++i) + samax.mu = max(samax.mu, self[i][li]); + + __attribute__((opencl_unroll_hint)) + for(uint i = 0; i < wn; ++i) + self[i][li] -= samax.mu; + + for(uint i = 0; i < wn; ++i) + self[i][li] = native_exp(self[i][li]); + + __attribute__((opencl_unroll_hint)) + for(uint i = 0; i < wn; ++i) + samax.sum += self[i][li]; + + samax.sum += FLT_EPSILON; + samax.lambda = 1.0f / samax.sum; + + __attribute__((opencl_unroll_hint)) + for(uint i = 0; i < wn; ++i) + self[i][li] *= samax.lambda; +} + +static void +ircd_gpt_attn_self_keys(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __local float self[restrict][ircd_gpt_attn_rank], + __global const ircd_gpt_attn_qkvv *const restrict token, + const uint ln, + const uint li, + const uint wi, + const uint kn, + const uint i) +{ + assume(i < wi); + + self[i][li] = 0.0f; + + __attribute__((opencl_unroll_hint)) + for(uint k = 0; k < kn; ++k) + { + float4 + qry = token[wi].qry.attn[li][k], + key = 
token[i].key.attn[li][k], + res = qry * key; + + self[i][li] += ircd_simt_reduce_add_f4(res); + } + + self[i][li] /= 8.0f; +} + +static void +ircd_gpt_attn_self_vals(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __local ircd_gpt_vectorv *const restrict out, + __local const float self[restrict][ircd_gpt_attn_rank], + __global const ircd_gpt_attn_qkvv *const restrict token, + const uint li, + const uint wi, + const uint ki, + const uint ti) +{ + out->attn[ti][ki] = 0.0f; + + __attribute__((opencl_unroll_hint)) + for(uint i = 0; i < wi; ++i) + { + const float4 + val = token[i].val.attn[ti][ki], + attn = self[i][ti], + res = attn * val; + + out->attn[ti][ki] += res; + } +} + +static void +ircd_gpt_attn_self(__global struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __local ircd_gpt_vectorv *const restrict out, + __local float self[restrict][ircd_gpt_attn_rank], + __global float attns[restrict][ircd_gpt_attn_rank], + __global const ircd_gpt_attn_qkvv *const restrict token, + const uint ln, + const uint li, + const uint wi) +{ + //assume(opts->attn_rank == sizeof(self[0]) / sizeof(float)); + assume(opts->attn_rank == ircd_gpt_attn_rank); + assume(ctrl->count < ircd_gpt_context_tokens); + assume(ctrl->tokens <= ircd_gpt_context_tokens); + assume(ctrl->tokens > wi); + assume(ctrl->tokens > 0); + + const uint + wn = ctrl->tokens, + kn = ln / opts->attn_rank, + ki = li / opts->attn_rank, + ti = li % opts->attn_rank; + + // Low-rank mask + if(li < opts->attn_rank) + { + // Left attention + uint i; + for(i = 0; i < wi; ++i) + ircd_gpt_attn_self_keys(ctrl, opts, self, token, ln, li, wi, kn, i); + + // Future mask + __attribute__((opencl_unroll_hint)) + while(i < wn) + self[i++][li] = -10000.0f; + + // Three-piece softmax + ircd_gpt_attn_self_samax(ctrl, opts, self, ln, li, wn, wi); + } + + // Propagate to full width for value dot prod. 
+ barrier(CLK_LOCAL_MEM_FENCE); + ircd_gpt_attn_self_vals(ctrl, opts, out, self, token, li, wi, ki, ti); + + // Save softmax results for later analysis/observation. + if(li < opts->attn_rank) + { + __attribute__((opencl_unroll_hint)) + for(uint i = 0; i < wn; ++i) + attns[i][li] = self[i][li]; + } +} + +static void +ircd_gpt_attn_proj_tmul(__constant const struct ircd_gpt_opts *const opts, + __local ircd_gpt_vectorv *const restrict out, + __local const ircd_gpt_vectorv *const restrict in, + __global const ircd_gpt_vectorv *const restrict bias, + __global const ircd_gpt_vectorv *const restrict weight, + const uint li) +{ + const uint + lanes = 4, + height = ircd_gpt_vector_elems / 4; + + assume(height > 0); + assume(height % lanes == 0); + + out->elem[li] = bias->elem[li]; + + for(uint y = 0; y < height; ++y) + for(uint k = 0; k < lanes; ++k) + { + const uint + row = y * lanes + k; + + const float4 + a = in->elem[y][k], + b = weight[row].elem[li]; + + out->elem[li] += a * b; + } +} + +__kernel void +__attribute__((vec_type_hint(float4))) +__attribute__((reqd_work_group_size(192, 1, 1))) +__attribute__((amdgpu_flat_work_group_size(192, 192))) +__attribute__((visibility("protected"))) +ircd_gpt_coil(__global struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __private const uint layer, + __global ircd_gpt_vectorv *const restrict accum, + __global float attns[restrict][ircd_gpt_attn_rank], + __global const ircd_gpt_attn_qkvv *const restrict state, + __global const ircd_gpt_vectorv *const restrict attn_proj_bias, + __global const ircd_gpt_vectorv *const restrict attn_proj_weight, + __global const ircd_gpt_vectorv *const restrict ffnn_norm_bias, + __global const ircd_gpt_vectorv *const restrict ffnn_norm_weight, + __global const ircd_gpt_ffnn_aperaturev *const restrict ffnn_fcon_bias, + __global const ircd_gpt_ffnn_aperaturev *const restrict ffnn_fcon_weight, + __global const ircd_gpt_vectorv *const restrict ffnn_proj_bias, + 
__global const ircd_gpt_vectorv *const restrict ffnn_proj_weight) +{ + const uint + li = get_local_id(0), + ln = get_local_size(0), + wo = get_global_offset(0), + wi = wo / ln + get_group_id(0); + + assume(ln == 192); + assume(wo % ln == 0); + + __local union + { + float + attn_self[ircd_gpt_context_tokens][ircd_gpt_attn_rank]; + + ircd_gpt_ffnn_aperaturev + ffnn_fcon[2]; + + ircd_gpt_vectorv + vector[8]; + } + buf; + + __local ircd_gpt_vectorv + buf0, buf1, + *const restrict attn_self = &buf1, + *const restrict token = &buf0, + *const restrict tmp = &buf1; + + // Self-attention backend; this computes the self-attention result now + // that keys and values are globally visible across tokens. + ircd_gpt_attn_self + ( + ctrl, + opts, + attn_self, + buf.attn_self, + attns, + state, + ln, + li, + wi + ); + + barrier(CLK_LOCAL_MEM_FENCE); + + // Project result of self-attention. + ircd_gpt_attn_proj_tmul + ( + opts, + token, + attn_self, + attn_proj_bias, + attn_proj_weight, + li + ); + + // Frontend accumulation + { + const float4 + attn = token->elem[li], + resid = accum[wi].elem[li], + result = resid + attn; + + token->elem[li] = result; + accum[wi].elem[li] = result; + } + + // Backend mlp; layer-norm acquires any pending writes, no fence required. 
+ ircd_gpt_ffnn + ( + ctrl, + opts, + token, + buf.ffnn_fcon, + tmp, + ffnn_norm_bias, + ffnn_norm_weight, + ffnn_fcon_bias, + ffnn_fcon_weight, + ffnn_proj_bias, + ffnn_proj_weight, + ln, + li + ); + + // Backend accumulation + { + const float4 + ffnn = token->elem[li], + resid = accum[wi].elem[li], + result = resid + ffnn; + + accum[wi].elem[li] = result; + } +} + +static void +ircd_gpt_attn_fcon_tmul(__constant const struct ircd_gpt_opts *const opts, + __local ircd_gpt_attn_aperaturev *const restrict out, + __local const ircd_gpt_vectorv *const restrict in, + __global const ircd_gpt_attn_aperaturev *const restrict bias, + __global const ircd_gpt_attn_aperaturev *const restrict weight, + const uint ln, + const uint li) +{ + const uint + lanes = 4, + segs = ircd_gpt_attn_segs, + height = ircd_gpt_vector_elems / lanes; + + assume(height > 0); + assume(height % segs == 0); + assume(height % lanes == 0); + + for(uint x = 0; x < segs; ++x) + out->proj[x][li] = bias->proj[x][li]; + + for(uint y = 0; y < height; ++y) + for(uint k = 0; k < lanes; ++k) + for(uint x = 0; x < segs; ++x) + { + const uint + row = y * lanes + k; + + const float4 + a = in->elem[y][k], + b = weight[row].proj[x][li]; + + out->proj[x][li] += a * b; + } +} + +__kernel void +__attribute__((vec_type_hint(float4))) +__attribute__((reqd_work_group_size(192, 1, 1))) +__attribute__((amdgpu_flat_work_group_size(192, 192))) +ircd_gpt_attn_fcon(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __private const uint layer, + __global ircd_gpt_attn_aperaturev *const restrict state, + __global const ircd_gpt_vectorv *const restrict accum, + __global const ircd_gpt_vectorv *const restrict norm_bias, + __global const ircd_gpt_vectorv *const restrict norm_weight, + __global const ircd_gpt_attn_aperaturev *const restrict fcon_bias, + __global const ircd_gpt_attn_aperaturev *const restrict fcon_weight) +{ + const uint + li = get_local_id(0), + ln = 
get_local_size(0), + wo = get_global_offset(0), + wi = wo / ln + get_group_id(0), + segs = ircd_gpt_attn_segs; + + assume(ln == 192); + assume(wo % ln == 0); + + __local ircd_gpt_attn_aperaturev + attn; + + __local ircd_gpt_vectorv + token, *const restrict tmp = attn.vector; + + token.elem[li] = accum[wi].elem[li]; + + // Layer re-normalization + ircd_gpt_norm(&token, &token, tmp, norm_bias, norm_weight, ln, li); + + // Ln's writes are still pending; fcon requires results across threads. + barrier(CLK_LOCAL_MEM_FENCE); + + // Fully connected + ircd_gpt_attn_fcon_tmul + ( + opts, + &attn, + &token, + fcon_bias, + fcon_weight, + ln, + li + ); + + // Export queries, keys, and values. + for(uint x = 0; x < segs; ++x) + state[wi].proj[x][li] = attn.proj[x][li]; +} + +__kernel void +__attribute__((vec_type_hint(float4))) +__attribute__((reqd_work_group_size(192, 1, 1))) +__attribute__((amdgpu_flat_work_group_size(192, 192))) +ircd_gpt_lm_norm(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global ircd_gpt_vectorv *const restrict accum, + __global const ircd_gpt_vectorv *const restrict norm_bias, + __global const ircd_gpt_vectorv *const restrict norm_weight) +{ + const uint + li = get_local_id(0), + ln = get_local_size(0), + wo = get_global_offset(0), + wi = wo / ln + get_group_id(0); + + assume(ln == 192); + assume(wo % ln == 0); + + __local ircd_gpt_vectorv + tmp, token; + + token.elem[li] = accum[wi].elem[li]; + + // Final re-normalization + ircd_gpt_norm(&token, &token, &tmp, norm_bias, norm_weight, ln, li); + + accum[wi].elem[li] = token.elem[li]; +} + +__kernel void +ircd_gpt_lm_logit(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global float *const restrict logit, + __global const ircd_gpt_vectorv *const restrict accum, + __global const ircd_gpt_vectorv *const restrict pos, + __global const ircd_gpt_vectorv *const restrict vocab) +{ + const uint + 
gi = get_global_id(0), + wi = ctrl->count - 1; + + assume(opts->embed_width == 192); + assume(opts->logits <= 65536); + + if(gi >= opts->logits) + { + logit[gi] = -10000.0f; + return; + } + + float acc = 0.0f; + for(uint j = 0; j < opts->embed_width; ++j) + { + const float4 + token = vocab[gi].elem[j], + in = accum[wi].elem[j], + wpe = pos[wi].elem[j], + res = in * token + wpe; + + acc += ircd_simt_reduce_add_f4(res); + } + + logit[gi] = acc; +} + +__kernel void +__attribute__((reqd_work_group_size(256, 1, 1))) +__attribute__((amdgpu_flat_work_group_size(256, 256))) +ircd_gpt_lm_logsm(__global struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global float logit[restrict 65536]) +{ + const uint + li = get_local_id(0), + ln = get_local_size(0), + //wo = get_global_offset(0), + //wi = wo / ln + get_group_id(0), + wn = 50432, + tn = wn / ln, + start = tn * li, + stop = min(start + tn, opts->logits); + + __local float + mu[256], sum[256], lambda[256]; + + __local struct ircd_math_samax + samax; + + assume(ln == 256); + + mu[li] = -10000.0f; + __attribute__((opencl_unroll_hint)) + for(uint ti = start; ti < stop; ++ti) + mu[li] = max(mu[li], logit[ti]); + + ircd_simt_reduce_max_flldr(mu, ln, li); + + if(li == 0) + samax.mu = mu[li]; + + sum[li] = 0.0f; + for(uint ti = start; ti < stop; ++ti) + { + const float + sub = logit[ti] - samax.mu, + res = native_exp(sub); + + sum[li] += res; + } + + ircd_simt_reduce_add_flldr(sum, ln, li); + + if(li == 0) + sum[li] += FLT_EPSILON, + samax.sum = sum[li], + samax.lambda = lambda[li] = 1.0f / sum[li]; + + ircd_simt_broadcast_flldr(lambda, ln, li); + + for(uint ti = start; ti < stop; ++ti) + { + const float + sub = logit[ti] - samax.mu, + res = lambda[li] * native_exp(sub); + + logit[ti] = res; + } +} + +void +ircd_gpt_lm_result_top(__local struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __local const ushort *const restrict idx, + __global const float 
*const restrict logsm, + const uint i) +{ + const ushort + token = idx[i]; + + const float + samax = logsm[token] + FLT_EPSILON; + + ctrl->top[i].token = token; + ctrl->top[i].samax = samax; +} + +void +ircd_gpt_lm_result_label_mean(__local struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __local struct ircd_math_mean *const mean, + const float last) +{ + const uint + div = mean->div + 1, + sum_sel = mean->div % 4; + + const float + sum = mean->sum[0] + mean->sum[1] + mean->sum[2] + mean->sum[3] + last, + res = sum / div; + + mean->sum[sum_sel] += last; + mean->div = div; + mean->last = last; + mean->mean = res; +} + +void +ircd_gpt_lm_result_label(__local struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __local struct ircd_gpt_ctrl_label *const label, + __global const float *const restrict logsm) +{ + const ushort + token = label->logit.token; + + const float + samax = logsm[token] + FLT_EPSILON, + loss = 0.0f - native_log(samax), + ppl = (1.0f - samax) * native_log2(opts->logits); + + label->logit.samax = samax; + ircd_gpt_lm_result_label_mean(ctrl, opts, &label->loss, loss); + ircd_gpt_lm_result_label_mean(ctrl, opts, &label->ppl, ppl); +} + +ushort +ircd_gpt_lm_result_select(__local struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __local const ushort *const restrict idx, + __global const float *const restrict logsm) +{ + const ulong + ent_k = max(opts->top_k, 1U) - 1, + rnd = ircd_simt_rand_xoshiro256pl(ctrl->rand); + + const float + ent_p = min(max(opts->top_p, 0.0f), 1.0f), + thresh = ent_p; + + float acc = 1.0f; + ushort select = 0; + for(; select < ent_k; ++select) + if((acc -= logsm[idx[select]]) < thresh) + break; + + const ushort + token = idx[select]; + + return token; +} + +static ushort +ircd_gpt_lm_result(__local struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __local const ushort *const 
restrict idx, + __global const float *const restrict logsm) +{ + const ushort + token = ircd_gpt_lm_result_select(ctrl, opts, idx, logsm); + + // Update the dynamic result label. + ctrl->select.logit.token = token; + ircd_gpt_lm_result_label(ctrl, opts, &ctrl->select, logsm); + + // Update the dynamic target label. + ctrl->target.logit.token = ctrl->count < ctrl->tokens? + ctrl->token[ctrl->count]: + ctrl->select.logit.token; + + ircd_gpt_lm_result_label(ctrl, opts, &ctrl->target, logsm); + + const bool + hit = ctrl->select.logit.token == ctrl->target.logit.token; + + // Update the token context. + if(ctrl->count == ctrl->tokens) + { + ctrl->token[ctrl->count] = ctrl->select.logit.token; + ctrl->tokens++; + } + + ctrl->miss += !hit; + ctrl->hit += hit; + ctrl->count++; + return token; +} + +static void +ircd_gpt_lm_result_attns(__local struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global const float *const restrict attns, + const uint ln, + const uint li) +{ + const uint + layer = li / opts->layers, + head = li % opts->attn_rank, + base = layer * opts->attn_self_elems; + + uint best = 0; + float bestv = 10000.0f; + for(uint i = 0; i < ctrl->count; ++i) + { + const uint + bx = (((i + 1) * i) / 2) * opts->attn_rank, + idx = base + bx + i * 12 + head; + + if(attns[idx] < bestv) + bestv = attns[idx], + best = i; + } + + ctrl->attn[layer][head] = best; +} + +__kernel void +__attribute__((reqd_work_group_size(256, 1, 1))) +__attribute__((amdgpu_flat_work_group_size(256, 256))) +__attribute__((visibility("protected"))) +ircd_gpt_lm_select(__global struct ircd_gpt_ctrl *const restrict ctrl_, + __constant const struct ircd_gpt_opts *const opts, + __global const float logsm[restrict 65536], + __global const float *const restrict attns) +{ + const uint + li = get_local_id(0), + ln = get_local_size(0), + logits_pad = ln - (opts->logits % ln), + tn = (opts->logits + logits_pad) / ln, + start = tn * li, + stop = min(start + tn, 
opts->logits); + + __local ushort idx[256]; + __local struct ircd_gpt_ctrl ctrl; + __private event_t event[1]; + + assume(ln == 256); + assume(start < stop); + + event[0] = async_work_group_copy + ( + (__local char16 *)&ctrl, + (__global const char16 *)ctrl_, + sizeof(struct ircd_gpt_ctrl) / sizeof(char16), + 0 + ); + + idx[li] = start; + for(uint j = start + 1; j < stop; ++j) + if(logsm[j] > logsm[idx[li]]) + idx[li] = j; + + ircd_simt_sort_idx16_flldr(idx, logsm, ln, li); + wait_group_events(1, event); + + if(ctrl.count >= opts->buffer_tokens) + return; + + if(li < opts->top_n) + ircd_gpt_lm_result_top(&ctrl, opts, idx, logsm, li); + + if(li < opts->labels) + ircd_gpt_lm_result_label(&ctrl, opts, ctrl.label + li, logsm); + + if(li < opts->layers * opts->attn_rank) + ircd_gpt_lm_result_attns(&ctrl, opts, attns, ln, li); + + barrier(CLK_LOCAL_MEM_FENCE); + + if(li == 0) + ircd_gpt_lm_result(&ctrl, opts, idx, logsm); + + barrier(CLK_LOCAL_MEM_FENCE); + + event[0] = async_work_group_copy + ( + (__global char16 *)ctrl_, + (__local const char16 *)&ctrl, + sizeof(struct ircd_gpt_ctrl) / sizeof(char16), + 0 + ); + + wait_group_events(1, event); +} + +__kernel void +__attribute__((visibility("protected"))) +__attribute__((reqd_work_group_size(256, 1, 1))) +__attribute__((amdgpu_flat_work_group_size(256, 256))) +ircd_gpt_leave(__global const void *const restrict model, + __global void *const restrict state, + __global void *const restrict master, + __constant const struct ircd_gpt_opts *const opts, + __global struct ircd_gpt_ctrl *const ctrl_, + __global struct ircd_gpt_ctrl *const frame) +{ + const ushort + li = get_local_id(0), + ln = get_local_size(0); + + assume(ln == 256); + + __local struct ircd_gpt_ctrl _ctrl; + __local struct ircd_gpt_ctrl *const ctrl = &_ctrl; + + if(li == 0) + *ctrl = *ctrl_; + + barrier(CLK_LOCAL_MEM_FENCE); + + if(li == 0 && ctrl->accept < 0) + ircd_gpt_accept(ctrl, opts); + + barrier(CLK_LOCAL_MEM_FENCE); + + const uint + batch_size = 
opts->batch_size, + samps = opts->training_steps + opts->validation_steps + opts->testing_steps, + steps = samps / batch_size; + + const bool + accepting = ctrl->accept >= 0, + cycling = !accepting, + sampling = accepting, + stepping = sampling && (ctrl->clk.samp + 1) >= batch_size, + epoching = stepping && (ctrl->clk.step + 1) >= steps; + + if(li == 0) + ;//ctrl->prof.finished = __builtin_readcyclecounter(); + + if(li == 0) + *frame = *ctrl; + + if(!accepting && li == 0) + { + ctrl->clk.cycle += cycling; + ctrl->clk.samp += sampling; + ctrl->clk.step += stepping; + ctrl->clk.epoch += epoching; + } +} + +void +ircd_gpt_accept(__local struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts) +{ + const bool + unlimited = opts->limit == -1U; + + const uint + batch_size = opts->batch_size, + samps = opts->training_steps + opts->validation_steps + opts->testing_steps, + steps = samps / batch_size; + + const int + limit = min(opts->limit, opts->context_tokens), + cycle_remain = limit - (ctrl->clk.cycle + 1), // cycle not yet incr + token_remain = opts->context_tokens - ctrl->count, // but count already incr + remain_ = min(cycle_remain, token_remain), + accept_ = ircd_gpt_accept_check(ctrl, opts), + accept_abs = abs(accept_), + remain = accept_ < 0 && accept_abs < remain_? accept_abs: remain_, + _accept = accept_ >= 0? accept_: -remain; + + const bool + accepting = _accept >= 0, + dispatching = _accept < 0, + limiting = remain <= 0; + + const int + accept_num = 4, + accept = limiting? accept_num: _accept, + dispatch = accept >= 0? 
0: remain; + + ctrl->accept = accept; + ctrl->dispatch = dispatch; + + ctrl->magic = 0xC7012C70UL; +} + +int +ircd_gpt_accept_check(__local struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts) +{ + int best = 8; + for(uint i = 0; i < 4; ++i) + { + const int + remain = ircd_gpt_accept_match(ctrl, opts, i); + + if(remain == 0) + return i; + + if(remain < best) + best = remain; + } + + return -best; +} + +uint +ircd_gpt_accept_match(__local struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + const uint i) +{ + const uint + len = ircd_gpt_accept_len(ctrl, opts, i), + n = min(ctrl->count, len), + maxlen = 8; + + uint ret = len?: maxlen; + for(uint j = 1; j <= n; ++j) + { + uint match = 0; + for(; match < j; ++match) + { + const uint + accept = opts->accept[i][match], + token = ctrl->token[ctrl->count - j + match]; + + if(token != accept) + break; + } + + if(match >= j) + if(!(ret = len - match)) + break; + } + + ret = max(ret, ctrl->tokens - ctrl->count); + ret = min(ret, maxlen); + return ret; +} + +uint +ircd_gpt_accept_len(__local struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + const uint i) +{ + uint len = 0; + for(; len < 8; ++len) + if(opts->accept[i][len] == (ushort)-1U) + break; + + return len; +} + +// +// backside +// + +void +__attribute__((always_inline)) +ircd_gpt_prop_elem(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global float4 *const restrict param_, + __global float4 *const restrict exp_avg_, + __global float4 *const restrict exp_avg_sqr_) +{ + const uint + li = get_local_id(0), + ts = ctrl->clk.step; + + const float4 + param = param_[li], + grad = ctrl->label[0].loss.mean, + alpha[2] = { 1.0f - opts->beta[0], 1.0f - opts->beta[1], }, + exp_avg = ts? exp_avg_[li]: 0.0f, + exp_avg_sqr = ts? 
exp_avg_sqr_[li]: 0.0f, + exp_avg_mul = exp_avg * opts->beta[0], + exp_avg_dot = exp_avg_mul + alpha[0] * grad, + exp_avg_sqr_mul = exp_avg_sqr * opts->beta[1], + exp_avg_sqr_dot = exp_avg_sqr_mul + alpha[1] * grad * grad, + denom = native_sqrt(exp_avg_sqr_dot) + FLT_EPSILON, + delta = opts->alpha * (exp_avg_dot / denom), + update = param - delta; + + param_[li] = param + FLT_EPSILON; + exp_avg_[li] = exp_avg + FLT_EPSILON; + exp_avg_sqr_[li] = exp_avg_sqr + FLT_EPSILON; + + //param_[li] = update; + //exp_avg_[li] = exp_avg_dot; + //exp_avg_sqr_[li] = exp_avg_sqr_dot; +} + +// +// backpropagations +// + +__kernel void +__attribute__((always_inline)) +//__attribute__((vec_type_hint(float4))) +//__attribute__((reqd_work_group_size(192, 1, 1))) +//__attribute__((amdgpu_flat_work_group_size(192, 192))) +ircd_gpt_norm_prop(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global ircd_gpt_vectorv *const restrict bias, + __global ircd_gpt_vectorv *const restrict bias_m0, + __global ircd_gpt_vectorv *const restrict bias_m1, + __global ircd_gpt_vectorv *const restrict weight, + __global ircd_gpt_vectorv *const restrict weight_m0, + __global ircd_gpt_vectorv *const restrict weight_m1) +{ + ircd_gpt_prop_elem + ( + ctrl, opts, + bias->elem, + bias_m0->elem, + bias_m1->elem + ); + + ircd_gpt_prop_elem + ( + ctrl, opts, + weight->elem, + weight_m0->elem, + weight_m1->elem + ); +} + +__kernel void +//__attribute__((vec_type_hint(float4))) +//__attribute__((reqd_work_group_size(192, 1, 1))) +//__attribute__((amdgpu_flat_work_group_size(192, 192))) +ircd_gpt_coil_prop_attn(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global ircd_gpt_vectorv *const restrict norm_bias, + __global ircd_gpt_vectorv *const restrict norm_bias_m0, + __global ircd_gpt_vectorv *const restrict norm_bias_m1, + __global ircd_gpt_vectorv *const restrict norm_weight, + __global ircd_gpt_vectorv 
*const restrict norm_weight_m0, + __global ircd_gpt_vectorv *const restrict norm_weight_m1, + __global ircd_gpt_attn_aperaturev *const restrict fcon_bias, + __global ircd_gpt_attn_aperaturev *const restrict fcon_bias_m0, + __global ircd_gpt_attn_aperaturev *const restrict fcon_bias_m1, + __global ircd_gpt_attn_aperaturev *const restrict fcon_weight, + __global ircd_gpt_attn_aperaturev *const restrict fcon_weight_m0, + __global ircd_gpt_attn_aperaturev *const restrict fcon_weight_m1, + __global ircd_gpt_vectorv *const restrict proj_bias, + __global ircd_gpt_vectorv *const restrict proj_bias_m0, + __global ircd_gpt_vectorv *const restrict proj_bias_m1, + __global ircd_gpt_vectorv *const restrict proj_weight, + __global ircd_gpt_vectorv *const restrict proj_weight_m0, + __global ircd_gpt_vectorv *const restrict proj_weight_m1) +{ + const uint + fcon_height = opts->embed_elems, + proj_height = opts->embed_elems, + segs = 3; + + ircd_gpt_norm_prop + ( + ctrl, opts, + norm_bias, + norm_bias_m0, + norm_bias_m1, + norm_weight, + norm_weight_m0, + norm_weight_m1 + ); + + for(uint j = 0; j < segs; ++j) + ircd_gpt_prop_elem + ( + ctrl, opts, + fcon_bias->proj[j], + fcon_bias_m0->proj[j], + fcon_bias_m1->proj[j] + ); + + for(uint i = 0; i < fcon_height; ++i) + for(uint j = 0; j < segs; ++j) + ircd_gpt_prop_elem + ( + ctrl, opts, + fcon_weight[i].proj[j], + fcon_weight_m0[i].proj[j], + fcon_weight_m1[i].proj[j] + ); + + ircd_gpt_prop_elem + ( + ctrl, opts, + proj_bias->elem, + proj_bias_m0->elem, + proj_bias_m1->elem + ); + + for(uint i = 0; i < proj_height; ++i) + ircd_gpt_prop_elem + ( + ctrl, opts, + proj_weight[i].elem, + proj_weight_m0[i].elem, + proj_weight_m1[i].elem + ); +} + +__kernel void +//__attribute__((vec_type_hint(float4))) +//__attribute__((reqd_work_group_size(192, 1, 1))) +//__attribute__((amdgpu_flat_work_group_size(192, 192))) +ircd_gpt_coil_prop_ffnn(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + 
__global ircd_gpt_vectorv *const restrict norm_bias, + __global ircd_gpt_vectorv *const restrict norm_bias_m0, + __global ircd_gpt_vectorv *const restrict norm_bias_m1, + __global ircd_gpt_vectorv *const restrict norm_weight, + __global ircd_gpt_vectorv *const restrict norm_weight_m0, + __global ircd_gpt_vectorv *const restrict norm_weight_m1, + __global ircd_gpt_ffnn_aperaturev *const restrict fcon_bias, + __global ircd_gpt_ffnn_aperaturev *const restrict fcon_bias_m0, + __global ircd_gpt_ffnn_aperaturev *const restrict fcon_bias_m1, + __global ircd_gpt_ffnn_aperaturev *const restrict fcon_weight, + __global ircd_gpt_ffnn_aperaturev *const restrict fcon_weight_m0, + __global ircd_gpt_ffnn_aperaturev *const restrict fcon_weight_m1, + __global ircd_gpt_vectorv *const restrict proj_bias, + __global ircd_gpt_vectorv *const restrict proj_bias_m0, + __global ircd_gpt_vectorv *const restrict proj_bias_m1, + __global ircd_gpt_vectorv *const restrict proj_weight, + __global ircd_gpt_vectorv *const restrict proj_weight_m0, + __global ircd_gpt_vectorv *const restrict proj_weight_m1) +{ + const uint + fcon_height = opts->embed_elems, + proj_height = opts->ffnn_elems, + segs = 4; + + ircd_gpt_norm_prop + ( + ctrl, opts, + norm_bias, + norm_bias_m0, + norm_bias_m1, + norm_weight, + norm_weight_m0, + norm_weight_m1 + ); + + for(uint j = 0; j < segs; ++j) + ircd_gpt_prop_elem + ( + ctrl, opts, + fcon_bias->proj[j], + fcon_bias_m0->proj[j], + fcon_bias_m1->proj[j] + ); + + for(uint i = 0; i < fcon_height; ++i) + for(uint j = 0; j < segs; ++j) + ircd_gpt_prop_elem + ( + ctrl, opts, + fcon_weight[i].proj[j], + fcon_weight_m0[i].proj[j], + fcon_weight_m1[i].proj[j] + ); + + ircd_gpt_prop_elem + ( + ctrl, opts, + proj_bias->elem, + proj_bias_m0->elem, + proj_bias_m1->elem + ); + + for(uint i = 0; i < proj_height; ++i) + ircd_gpt_prop_elem + ( + ctrl, opts, + proj_weight[i].elem, + proj_weight_m0[i].elem, + proj_weight_m1[i].elem + ); +} + +__kernel void 
+//__attribute__((vec_type_hint(float4))) +//__attribute__((reqd_work_group_size(192, 1, 1))) +//__attribute__((amdgpu_flat_work_group_size(192, 192))) +ircd_gpt_lm_embed_prop(__global const struct ircd_gpt_ctrl *const ctrl, + __constant const struct ircd_gpt_opts *const opts, + __global ircd_gpt_vectorv *const restrict pos, + __global ircd_gpt_vectorv *const restrict pos_m0, + __global ircd_gpt_vectorv *const restrict pos_m1, + __global ircd_gpt_vectorv *const restrict token, + __global ircd_gpt_vectorv *const restrict token_m0, + __global ircd_gpt_vectorv *const restrict token_m1) +{ + const uint + ln = get_local_size(0), + wi = get_global_offset(0) / ln + get_group_id(0), + wn = ctrl->count, + cn = opts->context_tokens / wn, + ci = cn * wi, + tn = opts->logits / wn, + ti = tn * wi; + + for(uint i = ci; i < ci + cn; ++i) + ircd_gpt_prop_elem + ( + ctrl, opts, + pos[i].elem, + pos_m0[i].elem, + pos_m1[i].elem + ); + + for(uint i = ti; i < ti + tn; ++i) + ircd_gpt_prop_elem + ( + ctrl, opts, + token[i].elem, + token_m0[i].elem, + token_m1[i].elem + ); } /// Gaussian Error Linear Unit -inline void -__attribute__((always_inline)) -ircd_gpt_ffnn_gelu(__local float4 *const out, - __local const float4 *const in_, +void +ircd_gpt_ffnn_gelu(__local ircd_gpt_ffnn_aperaturev *const out, + __local const ircd_gpt_ffnn_aperaturev *const in_, const uint i) { const float4 - in = in_[i]; + in = in_->fcon[i]; float4 a; a = 0.044715f; @@ -47,966 +1474,29 @@ ircd_gpt_ffnn_gelu(__local float4 *const out, a *= in; a *= 0.5f; - out[i] = a; + out->fcon[i] = a; } -// Matrix * Vector Multiply/Accumulate -inline float4 -__attribute__((flatten, always_inline)) -ircd_gpt_tmul_dot(__local const float4 *const restrict in, - __global const float4 *const restrict bias, - __global const float4 *const restrict weight, - const uint width, - const uint height, - const uint col, - const uint i, - const uint j) +void +ircd_gpt_norm(__local ircd_gpt_vectorv *const out, + __local const ircd_gpt_vectorv 
*const in, + __local ircd_gpt_vectorv *const restrict tmp, + __global const ircd_gpt_vectorv *const restrict bias, + __global const ircd_gpt_vectorv *const restrict weight, + const uint ln, + const uint li) { - const uint - li = get_local_id(0), - ln = get_local_size(0), - lanes = 4; - - float4 - acc = 0.0f; - - for(uint k = 0; k < lanes; ++k) - { - const uint - row = j * lanes + k, - cell = row * width + col; - - acc += in[j][k] * weight[cell]; - } - - return acc; -} - -// Matrix * Vector Multiply/Accumulate -inline void -__attribute__((flatten, always_inline)) -ircd_gpt_tmul(__local float4 *const restrict out, - __local const float4 *const restrict in, - __global const float4 *const restrict bias, - __global const float4 *const restrict weight, - const uint width, - const uint height, - const uint segs) -{ - const uint - li = get_local_id(0), - ln = get_local_size(0); - - __attribute__((opencl_unroll_hint)) - for(uint i = 0; i < segs; ++i) - { - const uint - col = i * ln + li; - - out[col] = bias[col]; - } - - for(uint i = 0; i < segs; ++i) - { - const uint - col = i * ln + li; - - for(uint j = 0; j < height; ++j) - out[col] += ircd_gpt_tmul_dot(in, bias, weight, width, height, col, i, j); - } -} - -inline void -__attribute__((flatten, always_inline)) -ircd_gpt_ffnn_fcon(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __local union ircd_gpt_ffnn_aperaturev *const restrict out, - __local const union ircd_gpt_tokenv *const in, - __global const float4 *const restrict bias, - __global const float4 *const restrict weight) -{ - const uint - li = get_local_id(0), - ln = get_local_size(0), - width = opts->ffnn_width, - height = opts->ffnn_height, - tiles = opts->ffnn_mult; - - ircd_gpt_tmul(out->fcon, in->word, bias, weight, width, height, tiles); - - for(uint i = 0; i < tiles; ++i) - ircd_gpt_ffnn_gelu(out->fcon, out->fcon, i * ln + li); -} - -inline void -__attribute__((flatten, always_inline)) 
-ircd_gpt_ffnn(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __local union ircd_gpt_tokenv *const restrict token, - __local union ircd_gpt_ffnn_aperaturev *const restrict buf, - __local union ircd_gpt_ffnn_aperaturev *const restrict tmp0, - __local union ircd_gpt_tokenv *const restrict tmp1, - __global const float4 *const restrict norm_bias, - __global const float4 *const restrict norm_weight, - __global const float4 *const restrict fcon_bias, - __global const float4 *const restrict fcon_weight, - __global const float4 *const restrict proj_bias, - __global const float4 *const restrict proj_weight) -{ - const uint - li = get_local_id(0), - ln = get_local_size(0), - wi = get_global_offset(0) / ln + get_group_id(0), - width = opts->ffnn_width, - height = opts->ffnn_height; - // Layer re-normalization - ircd_simt_math_norm_f4lldr(token->word, token->word, buf->word); - ircd_gpt_norm_fmad(token->word, token->word, norm_bias, norm_weight, li); - - // ln's writes are still pending but fcon reads results across threads. - barrier(CLK_LOCAL_MEM_FENCE); - - // Fully connected - ircd_gpt_ffnn_fcon(ctrl, opts, buf, token, fcon_bias, fcon_weight); - - // fcon's writes are still pending but proj reads results across threads. 
- barrier(CLK_LOCAL_MEM_FENCE); - - // Projection - ircd_gpt_tmul(token->word, buf->fcon, proj_bias, proj_weight, height, width, 1); + ircd_simt_math_norm_f4lldr(out->elem, in->elem, tmp->elem, ln, li); + ircd_gpt_norm_fmad(out, out, bias, weight, li); } -inline void -__attribute__((flatten, always_inline)) -ircd_gpt_attn_self_samax(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __local float self[][12], - const uint wn) +void +ircd_gpt_norm_fmad(__local ircd_gpt_vectorv *const out, + __local const ircd_gpt_vectorv *const in, + __global const ircd_gpt_vectorv *const restrict bias, + __global const ircd_gpt_vectorv *const restrict weight, + const uint i) { - const uint - gn = get_global_size(0), - li = get_local_id(0), - ln = get_local_size(0); - - struct ircd_math_samax samax = - { - .mu = -10000.0f, - .sum = 0.0f, - }; - - for(uint i = 0; i < wn; ++i) - samax.mu = max(samax.mu, self[i][li]); - - for(uint i = 0; i < wn; ++i) - self[i][li] = exp(self[i][li] - samax.mu); - - __attribute__((opencl_unroll_hint)) - for(uint i = 0; i < wn; ++i) - samax.sum += self[i][li]; - - samax.lambda = 1.0f / samax.sum; - - __attribute__((opencl_unroll_hint)) - for(uint i = 0; i < wn; ++i) - self[i][li] *= samax.lambda; -} - -inline void -__attribute__((flatten, always_inline)) -ircd_gpt_attn_self(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __local union ircd_gpt_tokenv *const restrict out, - __local float self[][12], - __global const struct ircd_gpt_attn_qkvv *const restrict token) -{ - const uint - gi = get_global_id(0), - gn = get_global_size(0), - li = get_local_id(0), - ln = get_local_size(0), - wi = get_global_offset(0) / ln + get_group_id(0), - wn = ctrl->tokens.count, - ti = li % opts->attn_rank, - ki = li / opts->attn_rank, - kn = ln / opts->attn_rank; - - // Low-rank mask - if(li < opts->attn_rank) - { - // For each token - for(uint i = 0; i < wn; ++i) - { - 
// Left-attention mask - if(wi < i) - { - self[i][li] = -10000.0f; - continue; - } - - float4 acc = 0.0f; - __attribute__((opencl_unroll_hint)) - for(uint k = 0; k < kn; ++k) - { - float4 - qry = token[wi].qry.attn[li][k], - key = token[i].key.attn[li][k]; - - acc += qry * key; - } - - const float - sum = ircd_simt_reduce_add_f4(acc), - res = sum / 8.0f; - - self[i][li] = res; - } - - // Three-piece softmax - ircd_gpt_attn_self_samax(ctrl, opts, self, wn); - } - - // Propagate to full width for value dot prod. - barrier(CLK_LOCAL_MEM_FENCE); - - float4 acc = 0.0f; - __attribute__((opencl_unroll_hint)) - for(uint i = 0; i < wi; ++i) - { - const float4 - attn = self[i][ti], - val = token[i].val.attn[ti][ki]; - - acc += attn * val; - } - - out->attn[ti][ki] = acc; -} - -inline void -__attribute__((flatten, always_inline)) -ircd_gpt_attn_proj(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __local union ircd_gpt_tokenv *const out, - __local const union ircd_gpt_tokenv *const xattn, - __global const float4 *const restrict bias, - __global const float4 *const restrict weight) -{ - const uint - ln = get_local_size(0), - wi = get_global_offset(0) / ln + get_group_id(0), - width = opts->attn_height, // same - height = opts->attn_height; - - // Projection - ircd_gpt_tmul(out->word, xattn->word, bias, weight, width, height, 1); -} - -__kernel void -__attribute__((flatten)) -ircd_gpt_coil(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global union ircd_gpt_tokenv *const restrict accum, - __global const struct ircd_gpt_attn_qkvv *const restrict state, - __global const float4 *const restrict attn_proj_bias, - __global const float4 *const restrict attn_proj_weight, - __global const float4 *const restrict ffnn_norm_bias, - __global const float4 *const restrict ffnn_norm_weight, - __global const float4 *const restrict ffnn_fcon_bias, - __global const float4 *const 
restrict ffnn_fcon_weight, - __global const float4 *const restrict ffnn_proj_bias, - __global const float4 *const restrict ffnn_proj_weight) -{ - const uint - li = get_local_id(0), - ln = get_local_size(0), - wi = get_global_offset(0) / ln + get_group_id(0); - - __local union ircd_gpt_tokenv - buf1, buf0; - - __local union - { - union ircd_gpt_ffnn_aperaturev - ffnn_fcon[2]; - - float - attn_self[512][12]; - } - buf; - - // Self-attention backend; this computes the self-attention result now - // that keys and values are globally visible across tokens. - ircd_gpt_attn_self - ( - ctrl, - opts, - &buf1, - buf.attn_self, - state - ); - - barrier(CLK_LOCAL_MEM_FENCE); - - // Project result of self-attention. - ircd_gpt_attn_proj - ( - ctrl, - opts, - &buf0, - &buf1, - attn_proj_bias, - attn_proj_weight - ); - - // Frontend accumulation - { - const float4 - attn = buf0.word[li], - resid = accum[wi].word[li]; - - buf0.word[li] += resid; - accum[wi].word[li] += attn; - } - - // Backend mlp; layer-norm acquires any pending writes, no fence required. 
- ircd_gpt_ffnn - ( - ctrl, - opts, - &buf0, - buf.ffnn_fcon + 0, - buf.ffnn_fcon + 1, - &buf1, - ffnn_norm_bias, - ffnn_norm_weight, - ffnn_fcon_bias, - ffnn_fcon_weight, - ffnn_proj_bias, - ffnn_proj_weight - ); - - // Backend accumulation - { - const float4 - ffnn = buf0.word[li], - resid = accum[wi].word[li], - result = ffnn + resid; - - accum[wi].word[li] = result; - } -} - -__kernel void -__attribute__((flatten)) -ircd_gpt_attn_fcon(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global union ircd_gpt_attn_aperaturev *const restrict state, - __global const union ircd_gpt_tokenv *const restrict accum, - __global const float4 *const restrict norm_bias, - __global const float4 *const restrict norm_weight, - __global const float4 *const restrict fcon_bias, - __global const float4 *const restrict fcon_weight) -{ - const uint - li = get_local_id(0), - ln = get_local_size(0), - wi = get_global_offset(0) / ln + get_group_id(0), - width = opts->attn_width, - height = opts->attn_height, - tiles = opts->attn_mult; - - __local union ircd_gpt_attn_aperaturev - token; - - __local float4 - tmp[768/4]; - - token.word[li] = accum[wi].word[li]; - - // Layer re-normalization - ircd_simt_math_norm_f4lldr(token.word, token.word, tmp); - ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li); - - // Ln's writes are still pending; fcon requires results across threads. - barrier(CLK_LOCAL_MEM_FENCE); - - // Fully connected - ircd_gpt_tmul(token.fcon, tmp, fcon_bias, fcon_weight, width, height, tiles); - - // Export queries, keys, and values. 
- for(uint i = 0; i < tiles; ++i) - state[wi].proj[i][li] = token.proj[i][li]; -} - -// -// frontend -// - -inline void -__attribute__((always_inline)) -_ircd_gpt_lm_embed(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global union ircd_gpt_tokenv *const restrict accum, - __global const union ircd_gpt_tokenv *const restrict pos, - __global const union ircd_gpt_tokenv *const restrict vocab, - const uint out_idx, - const uint tok_idx, - const uint word_idx) -{ - const ushort - ring_idx = (ctrl->tokens.head + tok_idx) % opts->buffer_tokens, - token = ctrl->token[ring_idx]; - - const float4 - wte = vocab[token].word[word_idx], - wpe = pos[tok_idx].word[word_idx]; - - accum[out_idx].word[word_idx] = wte + wpe; -} - -__kernel void -__attribute__((flatten)) -ircd_gpt_lm_embed(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global union ircd_gpt_tokenv *const restrict accum, - __global const union ircd_gpt_tokenv *const restrict pos, - __global const union ircd_gpt_tokenv *const restrict vocab) -{ - const uint - li = get_local_id(0), - ln = get_local_size(0), - wi = get_global_offset(0) / ln + get_group_id(0); - - _ircd_gpt_lm_embed(ctrl, opts, accum, pos, vocab, wi, wi, li); -} - -__kernel void -__attribute__((flatten)) -ircd_gpt_lm_norm(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global union ircd_gpt_tokenv *const restrict accum, - __global const float4 *const restrict norm_bias, - __global const float4 *const restrict norm_weight) -{ - const uint - li = get_local_id(0), - ln = get_local_size(0), - wi = get_global_offset(0) / ln + get_group_id(0); - - __local union ircd_gpt_tokenv - token, tmp; - - token.word[li] = accum[wi].word[li]; - - // Final re-normalization - ircd_simt_math_norm_f4lldr(token.word, token.word, tmp.word); - ircd_gpt_norm_fmad(token.word, token.word, norm_bias, norm_weight, 
li); - - accum[wi].word[li] = token.word[li]; -} - -__kernel void -__attribute__((flatten)) -ircd_gpt_lm_logit(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global float *const restrict logit, - __global const union ircd_gpt_tokenv *const restrict accum, - __global const union ircd_gpt_tokenv *const restrict token) -{ - const uint - gi = get_global_id(0), - ti = ctrl->tokens.count - 1, - words = opts->embed_width; - - float4 acc = 0.0f; - __attribute__((opencl_unroll_hint)) - for(uint j = 0; j < words; ++j) - { - const float4 - in = accum[ti].word[j], - vocab = token[gi].word[j]; - - acc += vocab * in; - } - - const float - ret = ircd_simt_reduce_add_f4(acc); - - if(gi < opts->logits) - logit[gi] = ret; - else - logit[gi] = -10000.0f; -} - -__kernel void -__attribute__((flatten)) -ircd_gpt_lm_logsm(__global struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global float4 *const restrict logsm, - __global float4 *const restrict logexp, - __global const float4 *const restrict logit) -{ - const uint - li = get_local_id(0), - ln = get_local_size(0), - logits = opts->logits, - logits_alignup = logits + (ln - (logits % ln)), - tn = logits_alignup / ln / 4, - ti = tn * li; - - __local float share[256]; - __local float4 share4[256]; - - share4[li] = -10000.0f; - for(uint i = ti; i < ti + tn; ++i) - share4[li] = max(share4[li], logit[i]); - - share[li] = -10000.0f; - for(uint k = 0; k < 4; ++k) - share[li] = max(share[li], share4[li][k]); - - ircd_simt_reduce_max_flldr(share); - - if(li == 0) - share4[li] = ctrl->samax.mu = share[li]; - - ircd_simt_broadcast_f4lldr(share4); - - const float4 - mu = share4[li]; - - share4[li] = 0.0f; - for(uint i = ti; i < ti + tn; ++i) - { - const float4 - reg = logit[i] - mu; - - float4 res; - for(uint k = 0; k < 4; ++k) - if(i * 4 + k < logits) - res[k] = exp(reg[k]); - else - res[k] = 0.0f; - - share4[li] += res; - logexp[i] = res; - } - - 
ircd_simt_reduce_add_f4lldr(share4); - - if(li == 0) - { - const float - sum = ircd_simt_reduce_add_f4(share4[li]); - - share4[li][0] = ctrl->samax.sum = sum; - share4[li][1] = ctrl->samax.lambda = 1.0f / sum; - } - - ircd_simt_broadcast_f4lldr(share4); - - const float4 - sum = share4[li][0], - lambda = share4[li][1]; - - for(uint i = ti; i < ti + tn; ++i) - logsm[i] = logexp[i] * lambda; -} - -inline void -__attribute__((always_inline)) -ircd_gpt_lm_result_top(__global struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __local const ushort *const restrict idx, - __global const float *const restrict logsm, - __global const float *const restrict logit, - const uint i) -{ - const ushort - token = idx[i]; - - const float - samax = logsm[token]; - - ctrl->top[i].token = token; - ctrl->top[i].samax = samax; -} - -inline void -__attribute__((always_inline)) -ircd_gpt_lm_result_label(__global struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __local const ushort *const restrict idx, - __global const float *const restrict logsm, - __global const float *const restrict logit, - const uint i) -{ - __global struct ircd_gpt_ctrl_label - *const label = ctrl->label + i; - - const ushort - token = label->token, - sum_sel = ctrl->epic.cycle % 3; - - const float - samax = logsm[token], - mean_div = ctrl->epic.cycle + 1.0f; - - const float - loss = 0.0f - log(samax), - loss_sum = label->loss.sum[0] + label->loss.sum[1] + label->loss.sum[2] + loss, - loss_mean = loss_sum / mean_div; - - const float - perp = (1.0f - samax) * native_log2(opts->logits), - perp_sum = label->perp.sum[0] + label->perp.sum[1] + label->perp.sum[2] + perp, - perp_mean = perp_sum / mean_div; - - label->samax = samax; - - label->loss.last = loss; - label->loss.sum[sum_sel] += loss; - label->loss.mean = loss_mean; - - label->perp.last = perp; - label->perp.sum[sum_sel] += perp; - label->perp.mean = perp_mean; -} - -inline void 
-__attribute__((always_inline)) -ircd_gpt_lm_result_select(__global struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __local const ushort *const restrict idx, - __global const float *const restrict logsm, - __global const float *const restrict logexp, - __global const float *const restrict logit) -{ - const bool - buffer_full = ctrl->tokens.count >= opts->buffer_tokens; - - const ulong - rnd = ircd_simt_rand_xoshiro256pg(ctrl->rand), - ent_k = max(opts->top_k, 1U), - ent_p = max(1U, min(opts->top_p, 100U)); - - const float - thresh = (rnd % ent_p) / 100.0f; - - ushort select = 0; - float smacc = 0.0f; - for(; select < opts->top_k; ++select) - if((smacc += logsm[idx[select]]) > thresh) - break; - - const ushort - token = idx[select], - dest = (ctrl->tokens.head + ctrl->tokens.count) % opts->buffer_tokens, - tokens = min(ctrl->tokens.count + 1, opts->buffer_tokens), - head = buffer_full? - (ctrl->tokens.head + 1) % opts->buffer_tokens: ctrl->tokens.head; - - ctrl->tokens.head = head; - ctrl->tokens.count = tokens; - ctrl->token[dest] = token; -} - -inline void -__attribute__((always_inline)) -ircd_gpt_leave(__global struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - const uint li) -{ - if(ctrl->epic.cycle + 1 >= opts->limit) - ctrl->epic.epoch += 1; - - ctrl->epic.cycle += 1; - ctrl->magic = 0xC7012C70U; -} - -__kernel void -__attribute__((flatten)) -ircd_gpt_lm_select(__global struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global const float *const restrict logsm, - __global const float *const restrict logit) -{ - const uint - li = get_local_id(0), - ln = get_local_size(0), - tn = opts->logits / ln, - ti = tn * li; - - __local ushort idx[256]; - - idx[li] = ti; - for(uint j = ti + 1; j < ti + tn; ++j) - if(logsm[j] > logsm[idx[li]]) - idx[li] = j; - - ircd_simt_sort_idx16_flldr(idx, logsm); - - if(li < opts->top_n) - ircd_gpt_lm_result_top(ctrl, 
opts, idx, logsm, logexp, logit, li); - - if(li < opts->labels) - ircd_gpt_lm_result_label(ctrl, opts, idx, logsm, logexp, logit, li); - - // Writes to `idx` from the sort are still pending across threads. - barrier(CLK_LOCAL_MEM_FENCE); - - // Mask for write-leader - if(li == 0) - ircd_gpt_lm_result_select(ctrl, opts, idx, logsm, logexp, logit); - - if(li != 0) - return; - - ircd_gpt_leave(ctrl, opts, li); -} - -// -// backpropagations -// - -inline void -__attribute__((always_inline)) -ircd_gpt_prop_elem(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global float4 *const restrict param_, - __global float4 *const restrict exp_avg_, - __global float4 *const restrict exp_avg_sqr_) -{ - const uint - li = get_local_id(0), - step = ctrl->epic.step; - - const float4 - param = param_[li], - grad = ctrl->label[0].loss.mean, - alpha[2] = { 1.0f - opts->beta[0], 1.0f - opts->beta[1], }, - exp_avg = step? exp_avg_[li]: 0.0f, - exp_avg_sqr = step? 
exp_avg_sqr_[li]: 0.0f, - exp_avg_mul = exp_avg * opts->beta[0], - exp_avg_dot = exp_avg_mul + alpha[0] * grad, - exp_avg_sqr_mul = exp_avg_sqr * opts->beta[1], - exp_avg_sqr_dot = exp_avg_sqr_mul + alpha[1] * grad * grad, - denom = sqrt(exp_avg_sqr_dot) + opts->epsilon, - delta = opts->alpha * (exp_avg_dot / denom), - update = param - delta; - - param_[li] = update; - exp_avg_[li] = exp_avg_dot; - exp_avg_sqr_[li] = exp_avg_sqr_dot; -} - -__kernel void -__attribute__((flatten)) -ircd_gpt_norm_prop(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global union ircd_gpt_tokenv *const restrict bias, - __global union ircd_gpt_tokenv *const restrict bias_m0, - __global union ircd_gpt_tokenv *const restrict bias_m1, - __global union ircd_gpt_tokenv *const restrict weight, - __global union ircd_gpt_tokenv *const restrict weight_m0, - __global union ircd_gpt_tokenv *const restrict weight_m1) -{ - ircd_gpt_prop_elem - ( - ctrl, opts, - bias->word, - bias_m0->word, - bias_m1->word - ); - - ircd_gpt_prop_elem - ( - ctrl, opts, - weight->word, - weight_m0->word, - weight_m1->word - ); -} - -__kernel void -__attribute__((flatten)) -ircd_gpt_coil_prop_attn(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global union ircd_gpt_tokenv *const restrict norm_bias, - __global union ircd_gpt_tokenv *const restrict norm_bias_m0, - __global union ircd_gpt_tokenv *const restrict norm_bias_m1, - __global union ircd_gpt_tokenv *const restrict norm_weight, - __global union ircd_gpt_tokenv *const restrict norm_weight_m0, - __global union ircd_gpt_tokenv *const restrict norm_weight_m1, - __global union ircd_gpt_attn_aperaturev *const restrict fcon_bias, - __global union ircd_gpt_attn_aperaturev *const restrict fcon_bias_m0, - __global union ircd_gpt_attn_aperaturev *const restrict fcon_bias_m1, - __global union ircd_gpt_attn_aperaturev *const restrict fcon_weight, - __global union 
ircd_gpt_attn_aperaturev *const restrict fcon_weight_m0, - __global union ircd_gpt_attn_aperaturev *const restrict fcon_weight_m1, - __global union ircd_gpt_tokenv *const restrict proj_bias, - __global union ircd_gpt_tokenv *const restrict proj_bias_m0, - __global union ircd_gpt_tokenv *const restrict proj_bias_m1, - __global union ircd_gpt_tokenv *const restrict proj_weight, - __global union ircd_gpt_tokenv *const restrict proj_weight_m0, - __global union ircd_gpt_tokenv *const restrict proj_weight_m1) -{ - ircd_gpt_norm_prop - ( - ctrl, opts, - norm_bias, - norm_bias_m0, - norm_bias_m1, - norm_weight, - norm_weight_m0, - norm_weight_m1 - ); - - for(uint j = 0; j < 3; ++j) - ircd_gpt_prop_elem - ( - ctrl, opts, - fcon_bias->proj[j], - fcon_bias_m0->proj[j], - fcon_bias_m1->proj[j] - ); - - for(uint i = 0; i < 768; ++i) - for(uint j = 0; j < 3; ++j) - ircd_gpt_prop_elem - ( - ctrl, opts, - fcon_weight[i].proj[j], - fcon_weight_m0[i].proj[j], - fcon_weight_m1[i].proj[j] - ); - - ircd_gpt_prop_elem - ( - ctrl, opts, - proj_bias->word, - proj_bias_m0->word, - proj_bias_m1->word - ); - - for(uint i = 0; i < 768; ++i) - ircd_gpt_prop_elem - ( - ctrl, opts, - proj_weight[i].word, - proj_weight_m0[i].word, - proj_weight_m1[i].word - ); -} - -__kernel void -__attribute__((flatten)) -ircd_gpt_coil_prop_ffnn(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global union ircd_gpt_tokenv *const restrict norm_bias, - __global union ircd_gpt_tokenv *const restrict norm_bias_m0, - __global union ircd_gpt_tokenv *const restrict norm_bias_m1, - __global union ircd_gpt_tokenv *const restrict norm_weight, - __global union ircd_gpt_tokenv *const restrict norm_weight_m0, - __global union ircd_gpt_tokenv *const restrict norm_weight_m1, - __global union ircd_gpt_ffnn_aperaturev *const restrict fcon_bias, - __global union ircd_gpt_ffnn_aperaturev *const restrict fcon_bias_m0, - __global union ircd_gpt_ffnn_aperaturev *const restrict 
fcon_bias_m1, - __global union ircd_gpt_ffnn_aperaturev *const restrict fcon_weight, - __global union ircd_gpt_ffnn_aperaturev *const restrict fcon_weight_m0, - __global union ircd_gpt_ffnn_aperaturev *const restrict fcon_weight_m1, - __global union ircd_gpt_tokenv *const restrict proj_bias, - __global union ircd_gpt_tokenv *const restrict proj_bias_m0, - __global union ircd_gpt_tokenv *const restrict proj_bias_m1, - __global union ircd_gpt_tokenv *const restrict proj_weight, - __global union ircd_gpt_tokenv *const restrict proj_weight_m0, - __global union ircd_gpt_tokenv *const restrict proj_weight_m1) -{ - ircd_gpt_norm_prop - ( - ctrl, opts, - norm_bias, - norm_bias_m0, - norm_bias_m1, - norm_weight, - norm_weight_m0, - norm_weight_m1 - ); - - for(uint j = 0; j < 4; ++j) - ircd_gpt_prop_elem - ( - ctrl, opts, - fcon_bias->proj[j], - fcon_bias_m0->proj[j], - fcon_bias_m1->proj[j] - ); - - for(uint i = 0; i < 768; ++i) - for(uint j = 0; j < 4; ++j) - ircd_gpt_prop_elem - ( - ctrl, opts, - fcon_weight[i].proj[j], - fcon_weight_m0[i].proj[j], - fcon_weight_m1[i].proj[j] - ); - - ircd_gpt_prop_elem - ( - ctrl, opts, - proj_bias->word, - proj_bias_m0->word, - proj_bias_m1->word - ); - - for(uint i = 0; i < 3072; ++i) - ircd_gpt_prop_elem - ( - ctrl, opts, - proj_weight[i].word, - proj_weight_m0[i].word, - proj_weight_m1[i].word - ); -} - -__kernel void -__attribute__((flatten)) -ircd_gpt_lm_embed_prop(__global const struct ircd_gpt_ctrl *const ctrl, - __constant const struct ircd_gpt_opts *const opts, - __global union ircd_gpt_tokenv *const restrict pos, - __global union ircd_gpt_tokenv *const restrict pos_m0, - __global union ircd_gpt_tokenv *const restrict pos_m1, - __global union ircd_gpt_tokenv *const restrict token, - __global union ircd_gpt_tokenv *const restrict token_m0, - __global union ircd_gpt_tokenv *const restrict token_m1) -{ - const uint - gn = get_global_size(0), - ln = get_local_size(0), - wi = get_global_offset(0) / ln + get_group_id(0), - wn = 
ctrl->tokens.count, - cn = opts->context_tokens / wn, - ci = cn * wi, - tn = opts->logits / wn, - ti = tn * wi; - - for(uint i = ci; i < ci + cn; ++i) - ircd_gpt_prop_elem - ( - ctrl, opts, - pos[i].word, - pos_m0[i].word, - pos_m1[i].word - ); - - for(uint i = ti; i < ti + tn; ++i) - ircd_gpt_prop_elem - ( - ctrl, opts, - token[i].word, - token_m0[i].word, - token_m1[i].word - ); + out->elem[i] = in->elem[i] * weight->elem[i] + bias->elem[i]; } diff --git a/ircd/gpt_model.cc b/ircd/gpt_model.cc index d8d7f9038..65e27a0d0 100644 --- a/ircd/gpt_model.cc +++ b/ircd/gpt_model.cc @@ -54,10 +54,14 @@ namespace ircd::gpt::model static fs::map default_model_shm, default_dataset_shm; - - static std::unique_ptr default_model_res; } +constexpr const char +*const ircd::gpt::model::prop::ended, +*const ircd::gpt::model::prop::id, +*const ircd::gpt::model::prop::length, +*const ircd::gpt::model::prop::text; + decltype(ircd::gpt::model::manifest_h) ircd::gpt::model::manifest_h { @@ -102,7 +106,7 @@ decltype(ircd::gpt::model::cache_hugepage) ircd::gpt::model::cache_hugepage { { "name", "ircd.gpt.model.cache.hugepage" }, - { "default", true }, + { "default", false }, }; decltype(ircd::gpt::model::cache_path) @@ -132,6 +136,12 @@ ircd::gpt::model::path decltype(ircd::gpt::model::default_model) ircd::gpt::model::default_model; +decltype(ircd::gpt::model::default_moment) +ircd::gpt::model::default_moment; + +decltype(ircd::gpt::model::default_checkpoint) +ircd::gpt::model::default_checkpoint; + decltype(ircd::gpt::model::default_dataset) ircd::gpt::model::default_dataset; @@ -144,17 +154,31 @@ ircd::gpt::model::init() if(!model::path) return; - if(!init_from_cache(model::cache_path)) - init_from_json(model::cache_path, model::path); - if(model::dataset_path) init_dataset(model::dataset_path); + + if(likely(init_from_cache(model::cache_path))) + return; + + init_from_json(model::cache_path, model::path); + if(unlikely(!init_from_cache(model::cache_path))) + throw error + { + "Failed 
to find and/or initialize model." + }; } void ircd::gpt::model::fini() noexcept { + default_checkpoint[2] = nullptr; + default_checkpoint[1] = nullptr; + default_checkpoint[0] = nullptr; + + default_moment[1] = nullptr; + default_moment[0] = nullptr; + default_model = nullptr; default_model_shm = {}; @@ -169,18 +193,33 @@ ircd::gpt::model::init_from_cache(const string_view &cache_path) if(!fs::is_reg(cache_path)) return false; - const auto size + const auto file_size { fs::size(cache_path) }; - if(unlikely(size != sizeof(model::decoder))) + const auto decoder_size + { + sizeof(model::decoder) + }; + + const bool has_params + { + file_size >= decoder_size + }; + + const bool has_moments + { + file_size >= decoder_size * 6 + }; + + if(unlikely(!has_params)) throw error { - "Cached model `%s' size %zu differs from %zu.", + "Cached model `%s' size %zu insufficient for decoder size %zu.", cache_path, - size, - sizeof(model::decoder), + file_size, + decoder_size, }; const auto mode @@ -192,20 +231,41 @@ ircd::gpt::model::init_from_cache(const string_view &cache_path) const fs::fd fd { - cache_path, mode + cache_path, fs::fd::opts + { + .mode = mode, + }, }; - fs::map::opts map_opts + const bool map_moments { - mode + has_moments || cache_shared + }; + + if(!has_moments && map_moments) + { + fs::truncate(fd, decoder_size * 6); + fs::allocate(fd, decoder_size * 5, decoder_size); + } + + const auto map_size + { + map_moments? 
+ decoder_size * 6: + decoder_size + }; + + const fs::map::opts map_opts + { + .alignment = alignof(model::decoder), + .shared = bool(cache_shared), + .locked = bool(cache_locked), + .huge2mb = bool(cache_hugepage), }; - map_opts.locked = bool(cache_locked); - map_opts.shared = bool(cache_shared); - map_opts.huge2mb = bool(cache_hugepage); default_model_shm = fs::map { - fd, map_opts, sizeof(decoder) + fd, map_opts, map_size }; default_model = reinterpret_cast @@ -213,13 +273,28 @@ ircd::gpt::model::init_from_cache(const string_view &cache_path) data(default_model_shm) ); + if(map_moments) + { + default_moment[0] = reinterpret_cast(default_model + 1); + default_moment[1] = reinterpret_cast(default_model + 2); + default_checkpoint[0] = reinterpret_cast(default_model + 3); + default_checkpoint[1] = reinterpret_cast(default_model + 4); + default_checkpoint[2] = reinterpret_cast(default_model + 5); + } + + allocator::lock({(const char *)default_model, sizeof(decoder)}); + fs::prefetch(default_model_shm, sizeof(decoder)); + char pbuf[48]; log::info { - log, "model(%p) mapped cached model `%s' %s", + log, "model(%p) mapped cached model `%s' params:%b moments:%b align:%u %s", data(default_model_shm), cache_path, - pretty(pbuf, iec(size)), + has_params, + has_moments, + map_opts.alignment, + pretty(pbuf, iec(map_size)), }; return true; @@ -264,9 +339,6 @@ ircd::gpt::model::init_from_json(const string_view &cache_path, cache_path, stopwatch.pretty(pbuf[1]), }; - - default_model_res = std::move(decoder); - default_model = default_model_res.get(); } void @@ -363,6 +435,7 @@ ircd::gpt::model::init_dataset(const string_view &path) size_t checkpoint(0); default_data.resize(260000); //TODO: XXX + fs::prefetch(default_dataset_shm, size); ircd::tokens(default_dataset, '\n', [&checkpoint] (const string_view &line) { @@ -379,6 +452,7 @@ ircd::gpt::model::init_dataset(const string_view &path) checkpoint, }; + fs::evict(default_dataset_shm, size); return true; } @@ -393,9 +467,9 @@ 
ircd::gpt::model::init_wpe_weight(decoder &d, { size_t j(0); for(const auto &elem : vec) - d.word.pos[i][j++] = lex_cast(elem); + d.embed.pos[i].elem[j++] = lex_cast(elem); - always_assert(j == sizeof(d.word.pos[i]) / sizeof(float)); + always_assert(j == sizeof(d.embed.pos[i]) / sizeof(float)); ++i; } } @@ -411,9 +485,9 @@ ircd::gpt::model::init_wte_weight(decoder &d, { size_t j(0); for(const auto &elem : vec) - d.word.token[i][j++] = lex_cast(elem); + d.embed.token[i].elem[j++] = lex_cast(elem); - always_assert(j == sizeof(d.word.token[i]) / sizeof(float)); + always_assert(j == sizeof(d.embed.token[i]) / sizeof(float)); ++i; } } @@ -426,9 +500,9 @@ ircd::gpt::model::init_f_weight(decoder &d, { size_t i(0); for(const auto &elem : vec) - d.f.weight[i++] = lex_cast(elem); + d.embed.norm.weight.elem[i++] = lex_cast(elem); - always_assert(i == sizeof(d.f.weight) / sizeof(float)); + always_assert(i == sizeof(d.embed.norm.weight) / sizeof(float)); } void @@ -439,9 +513,9 @@ ircd::gpt::model::init_f_bias(decoder &d, { size_t i(0); for(const auto &elem : vec) - d.f.bias[i++] = lex_cast(elem); + d.embed.norm.bias.elem[i++] = lex_cast(elem); - always_assert(i == sizeof(d.f.bias) / sizeof(float)); + always_assert(i == sizeof(d.embed.norm.bias) / sizeof(float)); } void @@ -455,16 +529,16 @@ ircd::gpt::model::init_h_ffnn_fc_weight(decoder &d, { size_t j(0); for(const auto &elem : vec) - d.layer[layer].ffnn.fc_weight[i][j++] = lex_cast(elem); + d.layer[layer].ffnn.fcon_weight[i].fcon[j++] = lex_cast(elem); - always_assert(j == sizeof(d.layer[layer].ffnn.fc_weight[i]) / sizeof(float)); + always_assert(j == sizeof(d.layer[layer].ffnn.fcon_weight[i]) / sizeof(float)); ++i; } always_assert ( - i == sizeof(d.layer[layer].ffnn.fc_weight) - / sizeof(d.layer[layer].ffnn.fc_weight[0]) + i == sizeof(d.layer[layer].ffnn.fcon_weight) + / sizeof(d.layer[layer].ffnn.fcon_weight[0]) ); } @@ -476,9 +550,9 @@ ircd::gpt::model::init_h_ffnn_fc_bias(decoder &d, { size_t i(0); for(const auto &elem : 
vec) - d.layer[layer].ffnn.fc_bias[i++] = lex_cast(elem); + d.layer[layer].ffnn.fcon_bias.fcon[i++] = lex_cast(elem); - always_assert(i == sizeof(d.layer[layer].ffnn.fc_bias) / sizeof(float)); + always_assert(i == sizeof(d.layer[layer].ffnn.fcon_bias) / sizeof(float)); } void @@ -492,7 +566,7 @@ ircd::gpt::model::init_h_ffnn_proj_weight(decoder &d, { size_t j(0); for(const auto &elem : vec) - d.layer[layer].ffnn.proj_weight[i][j++] = lex_cast(elem); + d.layer[layer].ffnn.proj_weight[i].elem[j++] = lex_cast(elem); always_assert(j == sizeof(d.layer[layer].ffnn.proj_weight[i]) / sizeof(float)); ++i; @@ -513,7 +587,7 @@ ircd::gpt::model::init_h_ffnn_proj_bias(decoder &d, { size_t i(0); for(const auto &elem : vec) - d.layer[layer].ffnn.proj_bias[i++] = lex_cast(elem); + d.layer[layer].ffnn.proj_bias.elem[i++] = lex_cast(elem); always_assert(i == sizeof(d.layer[layer].ffnn.proj_bias) / sizeof(float)); } @@ -526,9 +600,9 @@ ircd::gpt::model::init_h_ln_1_weight(decoder &d, { size_t i(0); for(const auto &elem : vec) - d.layer[layer].ln1.weight[i++] = lex_cast(elem); + d.layer[layer].attn.norm.weight.elem[i++] = lex_cast(elem); - always_assert(i == sizeof(d.layer[layer].ln1.weight) / sizeof(float)); + always_assert(i == sizeof(d.layer[layer].attn.norm.weight) / sizeof(float)); } void @@ -539,9 +613,9 @@ ircd::gpt::model::init_h_ln_1_bias(decoder &d, { size_t i(0); for(const auto &elem : vec) - d.layer[layer].ln1.bias[i++] = lex_cast(elem); + d.layer[layer].attn.norm.bias.elem[i++] = lex_cast(elem); - always_assert(i == sizeof(d.layer[layer].ln1.bias) / sizeof(float)); + always_assert(i == sizeof(d.layer[layer].attn.norm.bias) / sizeof(float)); } void @@ -552,9 +626,9 @@ ircd::gpt::model::init_h_ln_2_weight(decoder &d, { size_t i(0); for(const auto &elem : vec) - d.layer[layer].ln2.weight[i++] = lex_cast(elem); + d.layer[layer].ffnn.norm.weight.elem[i++] = lex_cast(elem); - always_assert(i == sizeof(d.layer[layer].ln2.weight) / sizeof(float)); + always_assert(i == 
sizeof(d.layer[layer].ffnn.norm.weight) / sizeof(float)); } void @@ -565,9 +639,9 @@ ircd::gpt::model::init_h_ln_2_bias(decoder &d, { size_t i(0); for(const auto &elem : vec) - d.layer[layer].ln2.bias[i++] = lex_cast(elem); + d.layer[layer].ffnn.norm.bias.elem[i++] = lex_cast(elem); - always_assert(i == sizeof(d.layer[layer].ln2.bias) / sizeof(float)); + always_assert(i == sizeof(d.layer[layer].ffnn.norm.bias) / sizeof(float)); } void @@ -581,16 +655,16 @@ ircd::gpt::model::init_h_attn_attn_weight(decoder &d, { size_t j(0); for(const auto &elem : vec) - d.layer[layer].attn.attn_weight[i][j++] = lex_cast(elem); + d.layer[layer].attn.fcon_weight[i].fcon[j++] = lex_cast(elem); - always_assert(j == sizeof(d.layer[layer].attn.attn_weight[i]) / sizeof(float)); + always_assert(j == sizeof(d.layer[layer].attn.fcon_weight[i]) / sizeof(float)); ++i; } always_assert ( - i == sizeof(d.layer[layer].attn.attn_weight) - / sizeof(d.layer[layer].attn.attn_weight[0]) + i == sizeof(d.layer[layer].attn.fcon_weight) + / sizeof(d.layer[layer].attn.fcon_weight[0]) ); } @@ -602,9 +676,9 @@ ircd::gpt::model::init_h_attn_attn_bias(decoder &d, { size_t i(0); for(const auto &elem : vec) - d.layer[layer].attn.attn_bias[i++] = lex_cast(elem); + d.layer[layer].attn.fcon_bias.fcon[i++] = lex_cast(elem); - always_assert(i == sizeof(d.layer[layer].attn.attn_bias) / sizeof(float)); + always_assert(i == sizeof(d.layer[layer].attn.fcon_bias) / sizeof(float)); } void @@ -618,7 +692,7 @@ ircd::gpt::model::init_h_attn_proj_weight(decoder &d, { size_t j(0); for(const auto &elem : vec) - d.layer[layer].attn.proj_weight[i][j++] = lex_cast(elem); + d.layer[layer].attn.proj_weight[i].elem[j++] = lex_cast(elem); always_assert(j == sizeof(d.layer[layer].attn.proj_weight[i]) / sizeof(float)); ++i; @@ -639,7 +713,7 @@ ircd::gpt::model::init_h_attn_proj_bias(decoder &d, { size_t i(0); for(const auto &elem : vec) - d.layer[layer].attn.proj_bias[i++] = lex_cast(elem); + d.layer[layer].attn.proj_bias.elem[i++] = 
lex_cast(elem); always_assert(i == sizeof(d.layer[layer].attn.proj_bias) / sizeof(float)); } diff --git a/ircd/gpt_pipe.cc b/ircd/gpt_pipe.cc index ebd084f7f..fc1447097 100644 --- a/ircd/gpt_pipe.cc +++ b/ircd/gpt_pipe.cc @@ -10,15 +10,9 @@ namespace ircd::gpt::pipe { - static void profile_dumplog(pipe::exec &); + static void profile_dumplog(pipe::cycle &); - extern conf::item queue_cycles; extern const ircd::run::changed handle_quit; - - static ircd::cl::exec::opts - send_opts_opts, send_ctrl_opts, send_coil_opts, send_head_opts, - anode_opts, negative_opts, positive_opts, cathode_opts, - lmhead_opts, lmamax_opts, backprop_opts, recv_ctrl_opts; } decltype(ircd::gpt::pipe::queue_cycles) @@ -28,62 +22,19 @@ ircd::gpt::pipe::queue_cycles { "default", 1L, }, }; -decltype(ircd::gpt::pipe::default_model) -ircd::gpt::pipe::default_model; - -decltype(ircd::gpt::pipe::default_code) -ircd::gpt::pipe::default_code; - -decltype(ircd::gpt::pipe::default_desc) -ircd::gpt::pipe::default_desc; - decltype(ircd::gpt::pipe::handle_quit) ircd::gpt::pipe::handle_quit { run::level::QUIT, pipe::fini }; +[[gnu::visibility("hidden")]] void ircd::gpt::pipe::init() { - const gpt::model::decoder &default_model - { - *gpt::model::default_model - }; - - assert(!pipe::default_model); - pipe::default_model = new pipe::model - { - default_model, default_model.word - }; - - pipe::default_code = new pipe::code - { - - }; - - pipe::default_desc = new pipe::desc - { - *pipe::default_code, *pipe::default_model - }; - - //XXX - send_ctrl_opts.flush = true; - send_ctrl_opts.nice = 1; - lmamax_opts.flush = true; - lmamax_opts.nice = 2; - recv_ctrl_opts.flush = true; - - log::debug - { - log, "Pipe initialized from model:%p data:%p code:%p desc:%p", - &default_model, - pipe::default_model, - pipe::default_code, - pipe::default_desc, - }; } +[[using gnu: cold, visibility("hidden")]] void ircd::gpt::pipe::fini() noexcept @@ -101,372 +52,409 @@ noexcept }; cl::sync(); - - delete default_desc; default_desc = 
nullptr; - delete default_code; default_code = nullptr; - delete default_model; default_model = nullptr; + ctx::yield(); } // -// pipe +// pipe::prof // -void -ircd::gpt::pipe::generate(task &task) +ircd::string_view +ircd::gpt::pipe::debug(const mutable_buffer &buf, + const prof &p) { - assert(pipe::default_model); - - assert(task.opts); - const auto &opts - { - *task.opts - }; - - assert(task.ctrl); - auto &ctrl - { - *task.ctrl - }; - - ctrl.epic.cycle = 0; - ctrl.epic.host_tsc = prof::cycles(); - - const auto tokens(ctrl.tokens.count); - const auto epoch(ctrl.epic.epoch); - volatile auto cycle(ctrl.epic.cycle); - - std::deque list; - for(; cycle < opts.limit; ++cycle) - { - // When the release/acquire bits are set the control pages are sent - // and received; only set on first and last iterations of this loop. - const bool - rel(cycle == 0), - acq(cycle + 1 >= opts.limit || ctx::interruption_requested()); - - // Enqueue the cycle's commands - list.emplace_back - ( - task, tokens + cycle, rel, acq - ); - - if(ctx::interruption_requested()) - if(acq || termination(ctx::cur())) - break; - - // Enqueue consecutive repetitions of our kernel batch before waiting - // on the first; based on the configuration. XXX get from ircd::cl - if(list.size() <= pipe::queue_cycles) - continue; - - // Profiling branch - if((false)) + window_buffer window(buf); + for(uint i(0); i < p.stages; ++i) + window([&p, &i](auto buf) { - auto &ex(list.front()); - profile_dumplog(ex); - } + size_t ret(0); + ret += consume(buf, size(debug(buf, p, i))); + ret += consume(buf, copy(buf, '\n')); + return ret; + }); - // Destructing the front of the queue waits for completion by yielding - // this ircd::ctx. 
- list.pop_front(); - } - - // Wait for all unfinished - list.clear(); - - assert(ctrl.magic == 0xC7012C70); - assert(ctrl.epic.cycle == cycle || ctx::interruption_requested()); - this_ctx::interruption_point(); + return window.completed(); } -void -ircd::gpt::pipe::profile_dumplog(pipe::exec &exec) +ircd::string_view +ircd::gpt::pipe::debug(const mutable_buffer &buf, + const prof &p, + const size_t &i) { - constexpr size_t coils + using phase = prof::phase; + + assert(i < p.info.size()); + assert(i < p.ts.size()); + + char tbuf[4][32]; + return fmt::sprintf { - sizeof(exec.coil) / sizeof(cl::exec) + buf, "%-20s %04x [ %10s %10s %10s %10s ]", + std::get<0>(p.info[i]), + std::get<1>(p.info[i]), + pretty(tbuf[0], p.ts[i][phase::QUEUE], 1), + pretty(tbuf[1], p.ts[i][phase::SUBMIT], 1), + pretty(tbuf[2], p.ts[i][phase::START], 1), + pretty(tbuf[3], p.ts[i][phase::END], 1), }; +} - for(size_t i(0); i < coils; ++i) +// +// prof::prof +// + +decltype(ircd::gpt::pipe::prof::info) +ircd::gpt::pipe::prof::info; + +decltype(ircd::gpt::pipe::prof::name) +ircd::gpt::pipe::prof::name; + +[[gnu::visibility("hidden")]] +decltype(ircd::gpt::pipe::prof::init) +ircd::gpt::pipe::prof::init; + +ircd::gpt::pipe::prof::prof() +noexcept +{ + for(uint i(0); i < stages; ++i) + for(uint j(0); j < phases; ++j) + ts[i][j] = 0ns; +} + +ircd::gpt::pipe::prof::prof(const cycle &c) +{ + if(!std::exchange(init, true)) + init_info(c); + + if(!cl::profile_queue) + return; + + for(uint i(0); i < stages; ++i) { - exec.coil[i].wait(); - const auto &pro + const cl::work::prof p { - exec.coil[i].profile() + c.stage[i] }; - char tmbuf[4][32] {{0}}; - log::logf - { - log, log::level::DEBUG, - "coil:%-2lu %8s %8s %8s %8s", - i, - util::pretty(tmbuf[0], si(pro[0]), 1), - util::pretty(tmbuf[1], si(pro[1]), 1), - util::pretty(tmbuf[2], si(pro[2]), 1), - util::pretty(tmbuf[3], si(pro[3]), 1), - }; + ts[i][phase::QUEUE] = p[phase::SUBMIT] - p[phase::QUEUE]; + ts[i][phase::SUBMIT] = p[phase::START] - 
p[phase::SUBMIT]; + ts[i][phase::START] = p[phase::END] - p[phase::START]; + ts[i][phase::END] = p[phase::END] - p[phase::QUEUE]; } } +[[gnu::visibility("hidden")]] +void +ircd::gpt::pipe::prof::init_info(const cycle &c) +{ + static_assert + ( + name.size() >= stages + ); + + for(uint i(0); i < stages; ++i) + info[i] = info_type + { + c.stage[i].name(name[i]), + c.stage[i].type(), + }; +} + +/////////////////////////////////////////////////////////////////////////////// // -// pipe::exec +// pipe::cycle // -ircd::gpt::pipe::exec::exec(task &task, - const size_t tokens, - const bool release, - const bool acquire) +const ircd::gpt::ctrl & +ircd::gpt::pipe::acquire(cycle &cycle) +{ + // Some tail stages may not be active each cycle + const auto last_exec + { + std::find_if(std::rbegin(cycle.stage), std::rend(cycle.stage), [] + (const auto &work) + { + return work.handle; + }) + }; + + assert(last_exec != std::rend(cycle.stage)); + last_exec->wait(); + + const auto ctrl + { + reinterpret_cast(cycle.desc.frame[cycle.frame].ptr()) + }; + + assert(ctrl); + assert(ctrl->magic != 0xDEADBEEF); + assert(ctrl->magic == 0xC7012C70UL); + return *ctrl; +} + +// +// pipe::cycle::cycle +// + +ircd::gpt::pipe::cycle::cycle(gpt::samp &samp) :desc { - default_desc + samp.desc } -,send_opts +,tick { - reinterpret_cast(task.opts), - release? - sizeof(gpt::opts): - 0 + samp.cycle } -,send_ctrl +,count { - reinterpret_cast(task.ctrl), - release? - sizeof(gpt::ctrl): - 0 + samp.count } -,send_coil +,tokens { - reinterpret_cast(gpt::model::default_model), - release && desc->model->invalid? - (sizeof(gpt::model::block) * 12 + sizeof(gpt::model::norm)): - 0 + samp.tokens } -,send_head +,cached { - reinterpret_cast(&gpt::model::default_model->word), - release && desc->model->invalid? - sizeof(gpt::model::embed): - 0 + desc.cached } -,recv_ctrl +,frame { - reinterpret_cast(task.ctrl), - acquire? 
- sizeof(gpt::ctrl): - 0 + tick % samp.opts.frames } -,range_full +,range { - { tokens * 192UL, 0, }, - { 192UL, 0, }, + tick, + count, + tokens, + cached, + true, + false, } -,range_last +,stage +{ + cl::exec // data + { + desc.opts, std::memory_order_release + }, + cl::exec // data + { + desc.ctrl, std::memory_order_release + }, + cl::exec // data + { + desc.frame[frame], std::memory_order_release + }, + cl::exec // data + { + desc.model->decode->master[0], std::memory_order_release + }, + cl::exec // Initial kernel + { + desc.alloc, range.alloc, + }, + cl::exec // Initial cycle kernel + { + desc.enter, range.embed, + }, + cl::exec // Compute token and positional embeddings. + { + desc.lm_embed, range.embed, + }, + // Forward Pass + cl::exec { desc.layer[0x00]->attn, range.attn }, + cl::exec { desc.layer[0x00]->ffnn, range.ffnn }, + cl::exec { desc.layer[0x01]->attn, range.attn }, + cl::exec { desc.layer[0x01]->ffnn, range.ffnn }, + cl::exec { desc.layer[0x02]->attn, range.attn }, + cl::exec { desc.layer[0x02]->ffnn, range.ffnn }, + cl::exec { desc.layer[0x03]->attn, range.attn }, + cl::exec { desc.layer[0x03]->ffnn, range.ffnn }, + cl::exec { desc.layer[0x04]->attn, range.attn }, + cl::exec { desc.layer[0x04]->ffnn, range.ffnn }, + cl::exec { desc.layer[0x05]->attn, range.attn }, + cl::exec { desc.layer[0x05]->ffnn, range.ffnn }, + cl::exec { desc.layer[0x06]->attn, range.attn }, + cl::exec { desc.layer[0x06]->ffnn, range.ffnn }, + cl::exec { desc.layer[0x07]->attn, range.attn }, + cl::exec { desc.layer[0x07]->ffnn, range.ffnn }, + cl::exec { desc.layer[0x08]->attn, range.attn }, + cl::exec { desc.layer[0x08]->ffnn, range.ffnn }, + cl::exec { desc.layer[0x09]->attn, range.attn }, + cl::exec { desc.layer[0x09]->ffnn, range.ffnn }, + cl::exec { desc.layer[0x0a]->attn, range.attn }, + cl::exec { desc.layer[0x0a]->ffnn, range.ffnn }, + cl::exec { desc.layer[0x0b]->attn, range.attn }, + cl::exec { desc.layer[0x0b]->ffnn, range.fffnn }, + cl::exec // Final 
normalization. + { + desc.lm_norm, range.fnorm + }, + cl::exec // Compute language logits. + { + desc.lm_logit, range.logit + }, + cl::exec // Statistics on the logits. + { + desc.lm_logsm, range.logsm + }, + cl::exec // Select next token. + { + desc.lm_select, range.select + }, + cl::exec // Backpropagate + { + desc.lm_prop_embed, range.prop_embed + }, + cl::exec // Backpropagate + { + desc.lm_prop_norm, range.prop_norm + }, + // Backward Pass + cl::exec { desc.layer[0x0b]->prop_ffnn, range.prop_ffnn }, + cl::exec { desc.layer[0x0b]->prop_attn, range.prop_attn }, + cl::exec { desc.layer[0x0a]->prop_ffnn, range.prop_ffnn }, + cl::exec { desc.layer[0x0a]->prop_attn, range.prop_attn }, + cl::exec { desc.layer[0x09]->prop_ffnn, range.prop_ffnn }, + cl::exec { desc.layer[0x09]->prop_attn, range.prop_attn }, + cl::exec { desc.layer[0x08]->prop_ffnn, range.prop_ffnn }, + cl::exec { desc.layer[0x08]->prop_attn, range.prop_attn }, + cl::exec { desc.layer[0x07]->prop_ffnn, range.prop_ffnn }, + cl::exec { desc.layer[0x07]->prop_attn, range.prop_attn }, + cl::exec { desc.layer[0x06]->prop_ffnn, range.prop_ffnn }, + cl::exec { desc.layer[0x06]->prop_attn, range.prop_attn }, + cl::exec { desc.layer[0x05]->prop_ffnn, range.prop_ffnn }, + cl::exec { desc.layer[0x05]->prop_attn, range.prop_attn }, + cl::exec { desc.layer[0x04]->prop_ffnn, range.prop_ffnn }, + cl::exec { desc.layer[0x04]->prop_attn, range.prop_attn }, + cl::exec { desc.layer[0x03]->prop_ffnn, range.prop_ffnn }, + cl::exec { desc.layer[0x03]->prop_attn, range.prop_attn }, + cl::exec { desc.layer[0x02]->prop_ffnn, range.prop_ffnn }, + cl::exec { desc.layer[0x02]->prop_attn, range.prop_attn }, + cl::exec { desc.layer[0x01]->prop_ffnn, range.prop_ffnn }, + cl::exec { desc.layer[0x01]->prop_attn, range.prop_attn }, + cl::exec { desc.layer[0x00]->prop_ffnn, range.prop_ffnn }, + cl::exec { desc.layer[0x00]->prop_attn, range.prop_attn }, + cl::exec // Final kernel + { + desc.leave[frame], range.select + }, +} +{ +} + 
+ircd::gpt::pipe::cycle::~cycle() +noexcept +{ +} + +////////////////////////////////////////////////////////////////////////////// +// +// pipe::range +// + +ircd::gpt::pipe::range::range(const uint tick, + const uint count, + const uint tokens, + const uint cached, + const bool fwd, + const bool rev) +noexcept +:_full +{ + { (tokens - cached) * 192UL, 0 }, + { 192UL, 0 }, + { cached * 192UL, 0 }, +} +,_last { { 1 * 192UL, 0 }, { 192UL, 0 }, - { (tokens - 1) * 192UL, 0 }, + { (count - 1) * 192UL, 0 }, } -,range_lm_embed +,alloc { - release? - range_full: - range_last + { (tick == 0) * 192UL, 0 }, + { 192UL, 0 }, } -,range_negative +,embed { - release? - range_full: - range_last + fwd? + _full: + cl::kern::range{}, } -,range_positive +,attn { - release? - range_full: - range_last + fwd? + _full: + cl::kern::range{}, } -,range_lm_norm +,ffnn { - range_last + fwd? + _full: + cl::kern::range{}, } -,range_lm_logit +,fffnn { - { 786 * 64UL, 0 }, // align_up(50257) / 64 - { 64UL, 0 }, + fwd && tokens > count? + _full: + fwd? + _last: + cl::kern::range{}, } -,range_lm_logsm +,fnorm { - { 1 * 256UL, 0 }, - { 256UL, 0 }, + fwd? 
+ _last: + cl::kern::range{}, } -,range_lm_select +,logit // TODO: align_up(50257) / 64|256 { - { 1 * 256UL, 0 }, - { 256UL, 0 }, + { int(fwd) * 50432UL, 0 }, + { 64L, 0 }, } -,release_opts +,logsm { - desc->opts, send_opts, send_opts_opts, + { int(fwd) * 1 * 256UL, 0 }, + { 256UL, 0 }, } -,release_ctrl +,select { - desc->ctrl, send_ctrl, send_ctrl_opts + { int(fwd) * 1 * 256UL, 0 }, + { 256UL, 0 }, } -,release_coil +,prop_embed { - desc->model->decode->master[0], send_coil, send_coil_opts + { int(rev) * 0 * 192UL, 0 }, + { 192UL, 0 }, } -,release_head +,prop_norm { - desc->model->embed->master[0], send_head, send_head_opts + { int(rev) * 0 * 192UL, 0 }, + { 192UL, 0 }, } -,lm_embed +,prop_attn { - desc->lm_embed, range_lm_embed, anode_opts + { int(rev) * 0 * 192UL, 0 }, + { 192UL, 0 }, } -,coil +,prop_ffnn { - { desc->layer[0x00]->negative, range_negative, negative_opts }, - { desc->layer[0x00]->positive, range_positive, positive_opts }, - { desc->layer[0x01]->negative, range_negative, negative_opts }, - { desc->layer[0x01]->positive, range_positive, positive_opts }, - { desc->layer[0x02]->negative, range_negative, negative_opts }, - { desc->layer[0x02]->positive, range_positive, positive_opts }, - { desc->layer[0x03]->negative, range_negative, negative_opts }, - { desc->layer[0x03]->positive, range_positive, positive_opts }, - { desc->layer[0x04]->negative, range_negative, negative_opts }, - { desc->layer[0x04]->positive, range_positive, positive_opts }, - { desc->layer[0x05]->negative, range_negative, negative_opts }, - { desc->layer[0x05]->positive, range_positive, positive_opts }, - { desc->layer[0x06]->negative, range_negative, negative_opts }, - { desc->layer[0x06]->positive, range_positive, positive_opts }, - { desc->layer[0x07]->negative, range_negative, negative_opts }, - { desc->layer[0x07]->positive, range_positive, positive_opts }, - { desc->layer[0x08]->negative, range_negative, negative_opts }, - { desc->layer[0x08]->positive, range_positive, 
positive_opts }, - { desc->layer[0x09]->negative, range_negative, negative_opts }, - { desc->layer[0x09]->positive, range_positive, positive_opts }, - { desc->layer[0x0a]->negative, range_negative, negative_opts }, - { desc->layer[0x0a]->positive, range_positive, positive_opts }, - { desc->layer[0x0b]->negative, range_negative, negative_opts }, - { desc->layer[0x0b]->positive, range_positive, positive_opts }, + { int(rev) * 0 * 192UL, 0 }, + { 192UL, 0 }, } -,lm_norm -{ - desc->lm_norm, range_lm_norm, cathode_opts -} -,lm_logit -{ - desc->lm_logit, range_lm_logit, lmhead_opts -} -,lm_logsm -{ - desc->lm_logsm, range_lm_logsm, lmhead_opts -} -,lm_select -{ - desc->lm_select, range_lm_select, lmamax_opts -} -,acquire_ctrl -{ - desc->ctrl, recv_ctrl, recv_ctrl_opts -} -{ - if(release && desc->model->invalid) - desc->model->invalid = false; -} - -ircd::gpt::pipe::exec::~exec() -noexcept -{ -} - -// -// code -// - -decltype(ircd::gpt::pipe::code::default_path) -ircd::gpt::pipe::code::default_path -{ - { "name", "ircd.gpt.pipe.code.path" }, -}; - -decltype(ircd::gpt::pipe::code::default_opts) -ircd::gpt::pipe::code::default_opts -{ - { "name", "ircd.gpt.pipe.code.opts" }, - { "default", string_view - { - " -cl-strict-aliasing" - " -cl-no-signed-zeros" - " -cl-finite-math-only" - " -cl-unsafe-math-optimizations" - " -cl-fast-relaxed-math" - " -cl-mad-enable" - " -cl-single-precision-constant" - //" -cl-fp32-correctly-rounded-divide-sqrt" - - " -cl-kernel-arg-info" - }} -}; - -ircd::gpt::pipe::code::code() -:cl::code{[] -{ - const string_view code_path - { - default_path - }; - - const fs::fd fd - { - code_path - }; - - const std::string read - { - fs::read(fd) - }; - - const string_view bin - { - read - }; - - const vector_view bins - ( - &bin, 1 - ); - - const auto opts - { - fmt::snstringf - { - 4096, "%s -I%s", - string_view{default_opts}, - string_view{fs::base::include}, - } - }; - - return cl::code - { - bins, opts - }; -}()} -{ -} - -ircd::gpt::pipe::code::~code() 
-noexcept { } +/////////////////////////////////////////////////////////////////////////////// // // pipe::desc // -ircd::gpt::pipe::desc::desc(pipe::code &code, - pipe::model &model) +ircd::gpt::pipe::desc::desc(const gpt::opts *const &opt, + gpt::ctrl *const &ctrl_, + pipe::model &model, + pipe::code &code) :model { &model @@ -475,27 +463,44 @@ ircd::gpt::pipe::desc::desc(pipe::code &code, { &code } -,state +,opts { - 0 - + 12 * 512 * 3 * 768 * sizeof(float), - mutable_buffer{}, + const_buffer + { + reinterpret_cast(opt), + sizeof(gpt::opts) + }, +} +,ctrl +{ + const_buffer + { + reinterpret_cast(ctrl_), + sizeof(gpt::ctrl) + }, } ,master { 0 - + 512 * 768 * sizeof(float) + + opt->layers * opt->context_tokens * opt->attn_elems * sizeof(float) + + opt->context_tokens * opt->embed_elems * sizeof(float) + 65536 * sizeof(float) - + 65536 * sizeof(float) - + 65536 * sizeof(float) - ,mutable_buffer{} + + opt->layers * opt->attn_self_elems * sizeof(float) +} +,state +{ + master, + { + opt->layers * opt->context_tokens * opt->attn_elems * sizeof(float), + off_t(0), + } } ,accum { master, { - 512 * 768 * sizeof(float), - off_t(0), + opt->context_tokens * opt->embed_elems * sizeof(float), + state.offset() + off_t(state.size()), }, } ,logit @@ -506,23 +511,52 @@ ircd::gpt::pipe::desc::desc(pipe::code &code, accum.offset() + off_t(accum.size()), }, } -,logsm +,attns { master, { - 65536 * sizeof(float), - logit.offset() + off_t(logit.size()), - }, + opt->layers * opt->attn_self_elems * sizeof(float), + logit.offset() + off_t(logit.size()) + } } -,ctrl +,frame { - sizeof(gpt::ctrl), - mutable_buffer{} + // size, read, write, }, // idx + { sizeof(gpt::ctrl), true, false, }, // 0 + { sizeof(gpt::ctrl), true, false, }, // 1 + { sizeof(gpt::ctrl), true, false, }, // 2 + { sizeof(gpt::ctrl), true, false, }, // 3 + { sizeof(gpt::ctrl), true, false, }, // 4 + { sizeof(gpt::ctrl), true, false, }, // 5 + { sizeof(gpt::ctrl), true, false, }, // 6 + { sizeof(gpt::ctrl), true, false, }, 
// 7 } -,opts +,alloc { - sizeof(gpt::opts), - const_buffer{} + code, + "ircd_gpt_alloc", + model.decode->master[0], + master, + opts, + ctrl, + frame[0], + frame[1], + frame[2], + frame[3], + frame[4], + frame[5], + frame[6], + frame[7], +} +,enter +{ + code, + "ircd_gpt_enter", + model.decode->master[0], + state, + master, + opts, + ctrl, } ,lm_embed { @@ -531,8 +565,8 @@ ircd::gpt::pipe::desc::desc(pipe::code &code, ctrl, opts, accum, - model.embed->pos.param, - model.embed->token.param, + model.decode->embed.pos.param, + model.decode->embed.token.param, } ,lm_norm { @@ -541,8 +575,8 @@ ircd::gpt::pipe::desc::desc(pipe::code &code, ctrl, opts, accum, - model.decode->norm.bias.param, - model.decode->norm.weight.param, + model.decode->embed.norm.bias.param, + model.decode->embed.norm.weight.param, } ,lm_logit { @@ -552,7 +586,8 @@ ircd::gpt::pipe::desc::desc(pipe::code &code, opts, logit, accum, - model.embed->token.param, + model.decode->embed.pos.param, + model.decode->embed.token.param, } ,lm_logsm { @@ -560,7 +595,6 @@ ircd::gpt::pipe::desc::desc(pipe::code &code, "ircd_gpt_lm_logsm", ctrl, opts, - logsm, logit, } ,lm_select @@ -569,49 +603,132 @@ ircd::gpt::pipe::desc::desc(pipe::code &code, "ircd_gpt_lm_select", ctrl, opts, - logsm, logit, + attns, } -,lm_norm_backprop -{ - code, - "ircd_gpt_norm_prop", - ctrl, - opts, - model.decode->norm.bias.param, - model.decode->norm.bias.moment[0], - model.decode->norm.bias.moment[1], - model.decode->norm.weight.param, - model.decode->norm.weight.moment[0], - model.decode->norm.weight.moment[1], -} -,lm_embed_backprop +,lm_prop_embed { code, "ircd_gpt_lm_embed_prop", ctrl, opts, - model.embed->pos.param, - model.embed->pos.moment[0], - model.embed->pos.moment[1], - model.embed->token.param, - model.embed->token.moment[0], - model.embed->token.moment[1], + model.decode->embed.pos.param, + model.decode->embed.pos.moment[0], + model.decode->embed.pos.moment[1], + model.decode->embed.token.param, + 
model.decode->embed.token.moment[0], + model.decode->embed.token.moment[1], +} +,lm_prop_norm +{ + code, + "ircd_gpt_norm_prop", + ctrl, + opts, + model.decode->embed.norm.bias.param, + model.decode->embed.norm.bias.moment[0], + model.decode->embed.norm.bias.moment[1], + model.decode->embed.norm.weight.param, + model.decode->embed.norm.weight.moment[0], + model.decode->embed.norm.weight.moment[1], +} +,leave +{ + { + code, + "ircd_gpt_leave", + model.decode->master[0], + state, + master, + opts, + ctrl, + frame[0], + }, + { + code, + "ircd_gpt_leave", + model.decode->master[0], + state, + master, + opts, + ctrl, + frame[1], + }, + { + code, + "ircd_gpt_leave", + model.decode->master[0], + state, + master, + opts, + ctrl, + frame[2], + }, + { + code, + "ircd_gpt_leave", + model.decode->master[0], + state, + master, + opts, + ctrl, + frame[3], + }, + { + code, + "ircd_gpt_leave", + model.decode->master[0], + state, + master, + opts, + ctrl, + frame[4], + }, + { + code, + "ircd_gpt_leave", + model.decode->master[0], + state, + master, + opts, + ctrl, + frame[5], + }, + { + code, + "ircd_gpt_leave", + model.decode->master[0], + state, + master, + opts, + ctrl, + frame[6], + }, + { + code, + "ircd_gpt_leave", + model.decode->master[0], + state, + master, + opts, + ctrl, + frame[7], + }, } ,layer { - std::make_unique(*this, 0x00), - std::make_unique(*this, 0x01), - std::make_unique(*this, 0x02), - std::make_unique(*this, 0x03), - std::make_unique(*this, 0x04), - std::make_unique(*this, 0x05), - std::make_unique(*this, 0x06), - std::make_unique(*this, 0x07), - std::make_unique(*this, 0x08), - std::make_unique(*this, 0x09), - std::make_unique(*this, 0x0a), - std::make_unique(*this, 0x0b), + std::make_unique(*this, opt, 0x00), + std::make_unique(*this, opt, 0x01), + std::make_unique(*this, opt, 0x02), + std::make_unique(*this, opt, 0x03), + std::make_unique(*this, opt, 0x04), + std::make_unique(*this, opt, 0x05), + std::make_unique(*this, opt, 0x06), + 
std::make_unique(*this, opt, 0x07), + std::make_unique(*this, opt, 0x08), + std::make_unique(*this, opt, 0x09), + std::make_unique(*this, opt, 0x0a), + std::make_unique(*this, opt, 0x0b), } { } @@ -621,94 +738,106 @@ ircd::gpt::pipe::desc::desc(pipe::code &code, // ircd::gpt::pipe::desc::layer::layer(pipe::desc &desc, - const int laynum) + const gpt::opts *const &opts, + const uint laynum) :state { desc.state, { - 512 * 3 * 768 * sizeof(float), - laynum * 512 * 3 * 768 * sizeof(float), + opts->context_tokens * opts->attn_elems * sizeof(float), + laynum * opts->context_tokens * opts->attn_elems * sizeof(float), } } -,negative +,attns +{ + desc.attns, + { + opts->attn_self_elems * sizeof(float), + laynum * opts->attn_self_elems * sizeof(float), + } +} +,attn { *desc.code, "ircd_gpt_attn_fcon", desc.ctrl, desc.opts, + laynum, state, desc.accum, - desc.model->decode->block[laynum].attn.norm.bias.param, - desc.model->decode->block[laynum].attn.norm.weight.param, - desc.model->decode->block[laynum].attn.fcon.bias.param, - desc.model->decode->block[laynum].attn.fcon.weight.param, + desc.model->decode->layer[laynum].attn.norm.bias.param, + desc.model->decode->layer[laynum].attn.norm.weight.param, + desc.model->decode->layer[laynum].attn.fcon.bias.param, + desc.model->decode->layer[laynum].attn.fcon.weight.param, } -,positive +,ffnn { *desc.code, "ircd_gpt_coil", desc.ctrl, desc.opts, + laynum, desc.accum, + attns, state, - desc.model->decode->block[laynum].attn.proj.bias.param, - desc.model->decode->block[laynum].attn.proj.weight.param, - desc.model->decode->block[laynum].ffnn.norm.bias.param, - desc.model->decode->block[laynum].ffnn.norm.weight.param, - desc.model->decode->block[laynum].ffnn.fcon.bias.param, - desc.model->decode->block[laynum].ffnn.fcon.weight.param, - desc.model->decode->block[laynum].ffnn.proj.bias.param, - desc.model->decode->block[laynum].ffnn.proj.weight.param, + desc.model->decode->layer[laynum].attn.proj.bias.param, + 
desc.model->decode->layer[laynum].attn.proj.weight.param, + desc.model->decode->layer[laynum].ffnn.norm.bias.param, + desc.model->decode->layer[laynum].ffnn.norm.weight.param, + desc.model->decode->layer[laynum].ffnn.fcon.bias.param, + desc.model->decode->layer[laynum].ffnn.fcon.weight.param, + desc.model->decode->layer[laynum].ffnn.proj.bias.param, + desc.model->decode->layer[laynum].ffnn.proj.weight.param, } -,backattn +,prop_attn { *desc.code, "ircd_gpt_coil_prop_attn", desc.ctrl, desc.opts, - desc.model->decode->block[laynum].attn.norm.bias.param, - desc.model->decode->block[laynum].attn.norm.bias.moment[0], - desc.model->decode->block[laynum].attn.norm.bias.moment[1], - desc.model->decode->block[laynum].attn.norm.weight.param, - desc.model->decode->block[laynum].attn.norm.weight.moment[0], - desc.model->decode->block[laynum].attn.norm.weight.moment[1], - desc.model->decode->block[laynum].attn.fcon.bias.param, - desc.model->decode->block[laynum].attn.fcon.bias.moment[0], - desc.model->decode->block[laynum].attn.fcon.bias.moment[1], - desc.model->decode->block[laynum].attn.fcon.weight.param, - desc.model->decode->block[laynum].attn.fcon.weight.moment[0], - desc.model->decode->block[laynum].attn.fcon.weight.moment[1], - desc.model->decode->block[laynum].attn.proj.bias.param, - desc.model->decode->block[laynum].attn.proj.bias.moment[0], - desc.model->decode->block[laynum].attn.proj.bias.moment[1], - desc.model->decode->block[laynum].attn.proj.weight.param, - desc.model->decode->block[laynum].attn.proj.weight.moment[0], - desc.model->decode->block[laynum].attn.proj.weight.moment[1], + desc.model->decode->layer[laynum].attn.norm.bias.param, + desc.model->decode->layer[laynum].attn.norm.bias.moment[0], + desc.model->decode->layer[laynum].attn.norm.bias.moment[1], + desc.model->decode->layer[laynum].attn.norm.weight.param, + desc.model->decode->layer[laynum].attn.norm.weight.moment[0], + desc.model->decode->layer[laynum].attn.norm.weight.moment[1], + 
desc.model->decode->layer[laynum].attn.fcon.bias.param, + desc.model->decode->layer[laynum].attn.fcon.bias.moment[0], + desc.model->decode->layer[laynum].attn.fcon.bias.moment[1], + desc.model->decode->layer[laynum].attn.fcon.weight.param, + desc.model->decode->layer[laynum].attn.fcon.weight.moment[0], + desc.model->decode->layer[laynum].attn.fcon.weight.moment[1], + desc.model->decode->layer[laynum].attn.proj.bias.param, + desc.model->decode->layer[laynum].attn.proj.bias.moment[0], + desc.model->decode->layer[laynum].attn.proj.bias.moment[1], + desc.model->decode->layer[laynum].attn.proj.weight.param, + desc.model->decode->layer[laynum].attn.proj.weight.moment[0], + desc.model->decode->layer[laynum].attn.proj.weight.moment[1], } -,backffnn +,prop_ffnn { *desc.code, "ircd_gpt_coil_prop_ffnn", desc.ctrl, desc.opts, - desc.model->decode->block[laynum].ffnn.norm.bias.param, - desc.model->decode->block[laynum].ffnn.norm.bias.moment[0], - desc.model->decode->block[laynum].ffnn.norm.bias.moment[1], - desc.model->decode->block[laynum].ffnn.norm.weight.param, - desc.model->decode->block[laynum].ffnn.norm.weight.moment[0], - desc.model->decode->block[laynum].ffnn.norm.weight.moment[1], - desc.model->decode->block[laynum].ffnn.fcon.bias.param, - desc.model->decode->block[laynum].ffnn.fcon.bias.moment[0], - desc.model->decode->block[laynum].ffnn.fcon.bias.moment[1], - desc.model->decode->block[laynum].ffnn.fcon.weight.param, - desc.model->decode->block[laynum].ffnn.fcon.weight.moment[0], - desc.model->decode->block[laynum].ffnn.fcon.weight.moment[1], - desc.model->decode->block[laynum].ffnn.proj.bias.param, - desc.model->decode->block[laynum].ffnn.proj.bias.moment[0], - desc.model->decode->block[laynum].ffnn.proj.bias.moment[1], - desc.model->decode->block[laynum].ffnn.proj.weight.param, - desc.model->decode->block[laynum].ffnn.proj.weight.moment[0], - desc.model->decode->block[laynum].ffnn.proj.weight.moment[1], + desc.model->decode->layer[laynum].ffnn.norm.bias.param, + 
desc.model->decode->layer[laynum].ffnn.norm.bias.moment[0], + desc.model->decode->layer[laynum].ffnn.norm.bias.moment[1], + desc.model->decode->layer[laynum].ffnn.norm.weight.param, + desc.model->decode->layer[laynum].ffnn.norm.weight.moment[0], + desc.model->decode->layer[laynum].ffnn.norm.weight.moment[1], + desc.model->decode->layer[laynum].ffnn.fcon.bias.param, + desc.model->decode->layer[laynum].ffnn.fcon.bias.moment[0], + desc.model->decode->layer[laynum].ffnn.fcon.bias.moment[1], + desc.model->decode->layer[laynum].ffnn.fcon.weight.param, + desc.model->decode->layer[laynum].ffnn.fcon.weight.moment[0], + desc.model->decode->layer[laynum].ffnn.fcon.weight.moment[1], + desc.model->decode->layer[laynum].ffnn.proj.bias.param, + desc.model->decode->layer[laynum].ffnn.proj.bias.moment[0], + desc.model->decode->layer[laynum].ffnn.proj.bias.moment[1], + desc.model->decode->layer[laynum].ffnn.proj.weight.param, + desc.model->decode->layer[laynum].ffnn.proj.weight.moment[0], + desc.model->decode->layer[laynum].ffnn.proj.weight.moment[1], } { } @@ -722,29 +851,31 @@ ircd::gpt::pipe::desc::layer::layer(pipe::desc &desc, // pipe::model::model // -ircd::gpt::pipe::model::model(gpt::model::decoder &decoder, - gpt::model::embed &embed) -:decode +ircd::gpt::pipe::model::model(gpt::model::decoder &decoder) +:decode_const +{ + std::addressof(decoder) +} +,decode_mutable +{ + std::addressof(decoder) +} +,decode { std::make_unique(decoder) } -,embed -{ - std::make_unique(embed) -} { } -ircd::gpt::pipe::model::model(const gpt::model::decoder &decoder, - const gpt::model::embed &embed) -:decode +ircd::gpt::pipe::model::model(const gpt::model::decoder &decoder) +:decode_const +{ + std::addressof(decoder) +} +,decode { std::make_unique(decoder) } -,embed -{ - std::make_unique(embed) -} { } @@ -762,26 +893,30 @@ ircd::gpt::pipe::model::decoder::decoder(gpt::model::decoder &decoder) { // params { - sizeof(gpt::model::block) * 12 + sizeof(gpt::model::norm), mutable_buffer + 
mutable_buffer { - reinterpret_cast(decoder.layer), - sizeof(decoder.layer) + sizeof(decoder.f) + reinterpret_cast(&decoder) + sizeof(gpt::model::decoder) * 0, + sizeof(gpt::model::decoder) } }, - // first moment { - sizeof(gpt::model::block) * 12 + sizeof(gpt::model::norm), - mutable_buffer{} + mutable_buffer + { + reinterpret_cast(&decoder) + sizeof(gpt::model::decoder) * 1, + sizeof(gpt::model::decoder) + } }, - // second moment { - sizeof(gpt::model::block) * 12 + sizeof(gpt::model::norm), - mutable_buffer{} + mutable_buffer + { + reinterpret_cast(&decoder) + sizeof(gpt::model::decoder) * 2, + sizeof(gpt::model::decoder) + } }, } -,block +,layer { { master, sizeof(gpt::model::block) * 0x00, decoder.layer[0x00], 0x00, }, { master, sizeof(gpt::model::block) * 0x01, decoder.layer[0x01], 0x01, }, @@ -796,12 +931,11 @@ ircd::gpt::pipe::model::decoder::decoder(gpt::model::decoder &decoder) { master, sizeof(gpt::model::block) * 0x0a, decoder.layer[0x0a], 0x0a, }, { master, sizeof(gpt::model::block) * 0x0b, decoder.layer[0x0b], 0x0b, }, } -,norm +,embed { master, - off_t(sizeof(gpt::model::block) * 12), - mutable_buffer{decoder.f.bias}, - mutable_buffer{decoder.f.weight}, + off_t(offsetof(gpt::model::decoder, embed)), + decoder.embed, } { } @@ -811,34 +945,33 @@ ircd::gpt::pipe::model::decoder::decoder(const gpt::model::decoder &decoder) { // params { - sizeof(gpt::model::block) * 12 + sizeof(gpt::model::norm), const_buffer + const_buffer { - reinterpret_cast(decoder.layer), - sizeof(decoder.layer) + sizeof(decoder.f) + reinterpret_cast(&decoder), + sizeof(gpt::model::decoder) } }, } -,block +,layer { - { master, sizeof(gpt::model::block) * 0x00, decoder.layer[0x00], 0x00, }, - { master, sizeof(gpt::model::block) * 0x01, decoder.layer[0x01], 0x01, }, - { master, sizeof(gpt::model::block) * 0x02, decoder.layer[0x02], 0x02, }, - { master, sizeof(gpt::model::block) * 0x03, decoder.layer[0x03], 0x03, }, - { master, sizeof(gpt::model::block) * 0x04, decoder.layer[0x04], 
0x04, }, - { master, sizeof(gpt::model::block) * 0x05, decoder.layer[0x05], 0x05, }, - { master, sizeof(gpt::model::block) * 0x06, decoder.layer[0x06], 0x06, }, - { master, sizeof(gpt::model::block) * 0x07, decoder.layer[0x07], 0x07, }, - { master, sizeof(gpt::model::block) * 0x08, decoder.layer[0x08], 0x08, }, - { master, sizeof(gpt::model::block) * 0x09, decoder.layer[0x09], 0x09, }, - { master, sizeof(gpt::model::block) * 0x0a, decoder.layer[0x0a], 0x0a, }, - { master, sizeof(gpt::model::block) * 0x0b, decoder.layer[0x0b], 0x0b, }, + { master, off_t(offsetof(gpt::model::decoder, layer[0x00])), decoder.layer[0x00], 0x00, }, + { master, off_t(offsetof(gpt::model::decoder, layer[0x01])), decoder.layer[0x01], 0x01, }, + { master, off_t(offsetof(gpt::model::decoder, layer[0x02])), decoder.layer[0x02], 0x02, }, + { master, off_t(offsetof(gpt::model::decoder, layer[0x03])), decoder.layer[0x03], 0x03, }, + { master, off_t(offsetof(gpt::model::decoder, layer[0x04])), decoder.layer[0x04], 0x04, }, + { master, off_t(offsetof(gpt::model::decoder, layer[0x05])), decoder.layer[0x05], 0x05, }, + { master, off_t(offsetof(gpt::model::decoder, layer[0x06])), decoder.layer[0x06], 0x06, }, + { master, off_t(offsetof(gpt::model::decoder, layer[0x07])), decoder.layer[0x07], 0x07, }, + { master, off_t(offsetof(gpt::model::decoder, layer[0x08])), decoder.layer[0x08], 0x08, }, + { master, off_t(offsetof(gpt::model::decoder, layer[0x09])), decoder.layer[0x09], 0x09, }, + { master, off_t(offsetof(gpt::model::decoder, layer[0x0a])), decoder.layer[0x0a], 0x0a, }, + { master, off_t(offsetof(gpt::model::decoder, layer[0x0b])), decoder.layer[0x0b], 0x0b, }, } -,norm +,embed { master, - off_t(sizeof(gpt::model::block) * 12), - const_buffer{decoder.f.bias}, - const_buffer{decoder.f.weight}, + off_t(offsetof(gpt::model::decoder, embed)), + decoder.embed, } { } @@ -849,171 +982,65 @@ noexcept } // -// pipe::model::language +// pipe::model::embed // 
-ircd::gpt::pipe::model::language::language(gpt::model::embed &embed) -:master +ircd::gpt::pipe::model::embed::embed(cl::data *const master, + const off_t offset, + gpt::model::embed &embed) +:norm { - // params - { - sizeof(embed), mutable_buffer - { - reinterpret_cast(&embed), - sizeof(embed), - } - }, - - // first moment - { - sizeof(embed), mutable_buffer{}, - }, - - // second moment - { - sizeof(embed), mutable_buffer{}, - }, + master, + offset + off_t(offsetof(gpt::model::embed, norm)) + off_t(offsetof(gpt::model::norm, bias)), + mutable_buffer{embed.norm.bias.elem}, + offset + off_t(offsetof(gpt::model::embed, norm)) + off_t(offsetof(gpt::model::norm, weight)), + mutable_buffer{embed.norm.weight.elem}, } ,pos { - master, 0, mutable_buffer{embed.pos} + master, + offset + off_t(offsetof(gpt::model::embed, pos)), + mutable_buffer{embed.pos} } ,token { - master, sizeof(embed.pos), mutable_buffer{embed.token} + master, + offset + off_t(offsetof(gpt::model::embed, token)), + mutable_buffer{embed.token} } { } -ircd::gpt::pipe::model::language::language(const gpt::model::embed &embed) -:master +ircd::gpt::pipe::model::embed::embed(cl::data *const master, + const off_t offset, + const gpt::model::embed &embed) +:norm { - { - sizeof(embed), const_buffer - { - reinterpret_cast(&embed), - sizeof(embed), - } - }, + master, + offset + off_t(offsetof(gpt::model::embed, norm)) + off_t(offsetof(gpt::model::norm, bias)), + const_buffer{embed.norm.bias.elem}, + offset + off_t(offsetof(gpt::model::embed, norm)) + off_t(offsetof(gpt::model::norm, weight)), + const_buffer{embed.norm.weight.elem}, } ,pos { - master, 0, const_buffer{embed.pos} + master, + offset + off_t(offsetof(gpt::model::embed, pos)), + const_buffer{embed.pos} } ,token { - master, sizeof(embed.pos), const_buffer{embed.token} + master, + offset + off_t(offsetof(gpt::model::embed, token)), + const_buffer{embed.token} } { } -ircd::gpt::pipe::model::language::language(cl::data *const master, - const off_t offset, - 
gpt::model::embed &embed) -:pos -{ - master, offset, mutable_buffer{embed.pos} -} -,token -{ - master, offset + off_t(sizeof(embed.pos)), mutable_buffer{embed.token} -} -{ -} - -ircd::gpt::pipe::model::language::language(cl::data *const master, - const off_t offset, - const gpt::model::embed &embed) -:pos -{ - master, offset, const_buffer{embed.pos} -} -,token -{ - master, offset + off_t(sizeof(embed.pos)), const_buffer{embed.token} -} -{ -} - -ircd::gpt::pipe::model::language::~language() -noexcept -{ -} - // // pipe::model::block // -ircd::gpt::pipe::model::block::block(gpt::model::block &block, - const size_t layer) -:master -{ - // params - { - sizeof(block), mutable_buffer - { - reinterpret_cast(&block), sizeof(block) - } - }, - - // first moment - { - sizeof(block), - mutable_buffer{} - }, - - // second moment - { - sizeof(block), - mutable_buffer{} - }, -} -,attn -{ - master, - 0, - block.ln1, - block.attn, -} -,ffnn -{ - master, - off_t(sizeof(block.ln1) + sizeof(block.attn)), - block.ln2, - block.ffnn, -} -{ -} - -ircd::gpt::pipe::model::block::block(const gpt::model::block &block, - const size_t layer) -:master -{ - // params - { - sizeof(block), const_buffer - { - reinterpret_cast(&block), sizeof(block) - } - } -} -,attn -{ - master, - 0, - block.ln1, - block.attn, -} -,ffnn -{ - master, - off_t(sizeof(block.ln1) + sizeof(block.attn)), - block.ln2, - block.ffnn, -} -{ -} - ircd::gpt::pipe::model::block::block(cl::data *const master, const off_t offset, gpt::model::block &block, @@ -1021,15 +1048,13 @@ ircd::gpt::pipe::model::block::block(cl::data *const master, :attn { master, - offset, - block.ln1, + offset + off_t(offsetof(gpt::model::block, attn)), block.attn, } ,ffnn { master, - offset + off_t(sizeof(block.ln1) + sizeof(block.attn)), - block.ln2, + offset + off_t(offsetof(gpt::model::block, ffnn)), block.ffnn, } { @@ -1042,15 +1067,13 @@ ircd::gpt::pipe::model::block::block(cl::data *const master, :attn { master, - offset, - block.ln1, + offset + 
off_t(offsetof(gpt::model::block, attn)), block.attn, } ,ffnn { master, - offset + off_t(sizeof(block.ln1) + sizeof(block.attn)), - block.ln2, + offset + off_t(offsetof(gpt::model::block, ffnn)), block.ffnn, } { @@ -1062,78 +1085,62 @@ ircd::gpt::pipe::model::block::block(cl::data *const master, ircd::gpt::pipe::model::ffnn::ffnn(cl::data *const master, const off_t offset, - gpt::model::norm &norm, gpt::model::ffnn &ffnn) :norm { master, - offset, - mutable_buffer{norm.bias}, - mutable_buffer{norm.weight}, + offset + off_t(offsetof(gpt::model::ffnn, norm)) + off_t(offsetof(gpt::model::norm, bias)), + mutable_buffer{ffnn.norm.bias.elem}, + offset + off_t(offsetof(gpt::model::ffnn, norm)) + off_t(offsetof(gpt::model::norm, weight)), + mutable_buffer{ffnn.norm.weight.elem}, } ,fcon { master, - offset + off_t(sizeof(norm)), - mutable_buffer{ffnn.fc_bias}, - mutable_buffer{ffnn.fc_weight}, + offset + off_t(offsetof(gpt::model::ffnn, fcon_bias)), + mutable_buffer{ffnn.fcon_bias.fcon}, + offset + off_t(offsetof(gpt::model::ffnn, fcon_weight)), + mutable_buffer{ffnn.fcon_weight}, } ,proj { master, - offset + off_t(sizeof(norm) + sizeof(ffnn.fc_bias) + sizeof(ffnn.fc_weight)), - mutable_buffer{ffnn.proj_bias}, + offset + off_t(offsetof(gpt::model::ffnn, proj_bias)), + mutable_buffer{ffnn.proj_bias.elem}, + offset + off_t(offsetof(gpt::model::ffnn, proj_weight)), mutable_buffer{ffnn.proj_weight}, } { - always_assert - ( - ircd::data(const_buffer{ffnn.proj_weight}) - == - ircd::data(const_buffer{norm.bias}) + - sizeof(norm) + - sizeof(ffnn.fc_bias) + - sizeof(ffnn.fc_weight) + - ircd::size(const_buffer{ffnn.proj_bias}) - ); } ircd::gpt::pipe::model::ffnn::ffnn(cl::data *const master, const off_t offset, - const gpt::model::norm &norm, const gpt::model::ffnn &ffnn) :norm { master, - offset, - const_buffer{norm.bias}, - const_buffer{norm.weight}, + offset + off_t(offsetof(gpt::model::ffnn, norm)) + off_t(offsetof(gpt::model::norm, bias)), + const_buffer{ffnn.norm.bias.elem}, + 
offset + off_t(offsetof(gpt::model::ffnn, norm)) + off_t(offsetof(gpt::model::norm, weight)), + const_buffer{ffnn.norm.weight.elem}, } ,fcon { master, - offset + off_t(sizeof(norm)), - const_buffer{ffnn.fc_bias}, - const_buffer{ffnn.fc_weight}, + offset + off_t(offsetof(gpt::model::ffnn, fcon_bias)), + const_buffer{ffnn.fcon_bias.fcon}, + offset + off_t(offsetof(gpt::model::ffnn, fcon_weight)), + const_buffer{ffnn.fcon_weight}, } ,proj { master, - offset + off_t(sizeof(norm) + sizeof(ffnn.fc_bias) + sizeof(ffnn.fc_weight)), - const_buffer{ffnn.proj_bias}, + offset + off_t(offsetof(gpt::model::ffnn, proj_bias)), + const_buffer{ffnn.proj_bias.elem}, + offset + off_t(offsetof(gpt::model::ffnn, proj_weight)), const_buffer{ffnn.proj_weight}, } { - always_assert - ( - ircd::data(const_buffer{ffnn.proj_weight}) - == - ircd::data(const_buffer{norm.bias}) + - sizeof(norm) + - sizeof(ffnn.fc_bias) + - sizeof(ffnn.fc_weight) + - ircd::size(const_buffer{ffnn.proj_bias}) - ); } // @@ -1142,78 +1149,62 @@ ircd::gpt::pipe::model::ffnn::ffnn(cl::data *const master, ircd::gpt::pipe::model::attn::attn(cl::data *const master, const off_t offset, - gpt::model::norm &norm, gpt::model::attn &attn) :norm { master, - offset, - mutable_buffer{norm.bias}, - mutable_buffer{norm.weight}, + offset + off_t(offsetof(gpt::model::attn, norm)) + off_t(offsetof(gpt::model::norm, bias)), + mutable_buffer{attn.norm.bias.elem}, + offset + off_t(offsetof(gpt::model::attn, norm)) + off_t(offsetof(gpt::model::norm, weight)), + mutable_buffer{attn.norm.weight.elem}, } ,fcon { master, - offset + off_t(sizeof(norm)), - mutable_buffer{attn.attn_bias}, - mutable_buffer{attn.attn_weight}, + offset + off_t(offsetof(gpt::model::attn, fcon_bias)), + mutable_buffer{attn.fcon_bias.fcon}, + offset + off_t(offsetof(gpt::model::attn, fcon_weight)), + mutable_buffer{attn.fcon_weight}, } ,proj { master, - offset + off_t(sizeof(norm) + sizeof(attn.attn_bias) + sizeof(attn.attn_weight)), - mutable_buffer{attn.proj_bias}, + 
offset + off_t(offsetof(gpt::model::attn, proj_bias)), + mutable_buffer{attn.proj_bias.elem}, + offset + off_t(offsetof(gpt::model::attn, proj_weight)), mutable_buffer{attn.proj_weight}, } { - always_assert - ( - ircd::data(const_buffer{attn.proj_weight}) - == - ircd::data(const_buffer{norm.bias}) + - sizeof(norm) + - sizeof(attn.attn_bias) + - sizeof(attn.attn_weight) + - ircd::size(const_buffer{attn.proj_bias}) - ); } ircd::gpt::pipe::model::attn::attn(cl::data *const master, const off_t offset, - const gpt::model::norm &norm, const gpt::model::attn &attn) :norm { master, - offset, - const_buffer{norm.bias}, - const_buffer{norm.weight}, + offset + off_t(offsetof(gpt::model::attn, norm)) + off_t(offsetof(gpt::model::norm, bias)), + const_buffer{attn.norm.bias.elem}, + offset + off_t(offsetof(gpt::model::attn, norm)) + off_t(offsetof(gpt::model::norm, weight)), + const_buffer{attn.norm.weight.elem}, } ,fcon { master, - offset + off_t(sizeof(norm)), - const_buffer{attn.attn_bias}, - const_buffer{attn.attn_weight}, + offset + off_t(offsetof(gpt::model::attn, fcon_bias)), + const_buffer{attn.fcon_bias.fcon}, + offset + off_t(offsetof(gpt::model::attn, fcon_weight)), + const_buffer{attn.fcon_weight}, } ,proj { master, - offset + off_t(sizeof(norm) + sizeof(attn.attn_bias) + sizeof(attn.attn_weight)), - const_buffer{attn.proj_bias}, + offset + off_t(offsetof(gpt::model::attn, proj_bias)), + const_buffer{attn.proj_bias.elem}, + offset + off_t(offsetof(gpt::model::attn, proj_weight)), const_buffer{attn.proj_weight}, } { - always_assert - ( - ircd::data(const_buffer{attn.proj_weight}) - == - ircd::data(const_buffer{norm.bias}) + - sizeof(norm) + - sizeof(attn.attn_bias) + - sizeof(attn.attn_weight) + - ircd::size(const_buffer{attn.proj_bias}) - ); } // @@ -1221,38 +1212,40 @@ ircd::gpt::pipe::model::attn::attn(cl::data *const master, // ircd::gpt::pipe::model::tensor::tensor(cl::data *const master, - const off_t offset, + const off_t bias_offset, const mutable_buffer 
&bias, + const off_t weight_offset, const mutable_buffer &weight) :bias { master, - offset, + bias_offset, bias, } ,weight { master, - off_t(offset + ircd::size(bias)), + weight_offset, weight, } { } ircd::gpt::pipe::model::tensor::tensor(cl::data *const master, - const off_t offset, + const off_t bias_offset, const const_buffer &bias, + const off_t weight_offset, const const_buffer &weight) :bias { master, - offset, + bias_offset, bias, } ,weight { master, - off_t(offset + ircd::size(bias)), + weight_offset, weight, } { @@ -1269,7 +1262,7 @@ ircd::gpt::pipe::model::matrix::matrix(cl::data *const master, { master[0], { - ircd::size(param), + pad_to(ircd::size(param), 4096), offset, }, } @@ -1279,7 +1272,7 @@ ircd::gpt::pipe::model::matrix::matrix(cl::data *const master, { master[1], { - ircd::size(param), + pad_to(ircd::size(param), 4096), offset, }, }, @@ -1288,12 +1281,13 @@ ircd::gpt::pipe::model::matrix::matrix(cl::data *const master, { master[2], { - ircd::size(param), + pad_to(ircd::size(param), 4096), offset, }, }, } { + assert(aligned(offset, 4096)); } ircd::gpt::pipe::model::matrix::matrix(cl::data *const master, @@ -1303,8 +1297,8 @@ ircd::gpt::pipe::model::matrix::matrix(cl::data *const master, { master[0], { - ircd::size(param), // size - offset, // offset + pad_to(ircd::size(param), 4096), + offset, }, } { diff --git a/ircd/gpt_pipe_code.cc b/ircd/gpt_pipe_code.cc index e19ce75e5..723cabb58 100644 --- a/ircd/gpt_pipe_code.cc +++ b/ircd/gpt_pipe_code.cc @@ -152,10 +152,12 @@ ircd::gpt::pipe::code::set_cache(const string_view &path) } extern const uint8_t -gpt_gpu_r600_barts_bc[]; +gpt_gpu_r600_barts_bc[], +gpt_gpu_spv[]; extern const uint -gpt_gpu_r600_barts_bc_len; +gpt_gpu_r600_barts_bc_len, +gpt_gpu_spv_len; ircd::cl::code ircd::gpt::pipe::code::from_bitcode(const string_view &link_opts) @@ -164,6 +166,9 @@ ircd::gpt::pipe::code::from_bitcode(const string_view &link_opts) { reinterpret_cast(gpt_gpu_r600_barts_bc), gpt_gpu_r600_barts_bc_len + + 
//reinterpret_cast(gpt_gpu_spv), + //gpt_gpu_spv_len }; char pbuf[1][48]; diff --git a/ircd/gpt_vocab.cc b/ircd/gpt_vocab.cc index 425a22970..c5274843b 100644 --- a/ircd/gpt_vocab.cc +++ b/ircd/gpt_vocab.cc @@ -10,6 +10,7 @@ namespace ircd::gpt::vocab { + static u8x16 get_token(const u16); static u16 find_token(const u8x16); static u16 find_merge(const u8x16, const u8x16); static u16 bpe_score(u16 (&)[16], const u8x16 (&)[16][2], const uint); @@ -23,11 +24,11 @@ namespace ircd::gpt::vocab static u64x2 tokenize_block(u16x16 &, const u8x16, const i8x16) noexcept; static void init_tokens(), init_merges(); - [[gnu::visibility("internal")]] extern const char32_t charset[256]; } /// Remapping of single byte characters (Control (C0) and Basic Latin (ASCII)). +[[gnu::visibility("internal")]] decltype(ircd::gpt::vocab::charset) ircd::gpt::vocab::charset alignas(64) @@ -169,7 +170,8 @@ ircd::gpt::vocab::init_merges() ircd::string_view ircd::gpt::vocab::debug(const mutable_buffer &out, - const u16 idx) + const u16 idx, + const uint mask) { const auto *const token { @@ -177,13 +179,21 @@ ircd::gpt::vocab::debug(const mutable_buffer &out, }; thread_local char strbuf[2][512]; - return string_view{fmt::sprintf + return fmt::sprintf { - out, "%5u [%32s] %s", + out, "%5u %s%32s%s%s%s", idx, - simd::print_chr(strbuf[0], token[idx]), - simd::print_mem(strbuf[1], token[idx]), - }}; + mask & 0x1? + "[ "_sv: string_view{}, + mask & 0x1? + simd::print_chr(strbuf[0], token[idx]): string_view{}, + mask & 0x1? + " ]"_sv: string_view{}, + mask & 0x2? + " "_sv: string_view{}, + mask & 0x2? 
+ simd::print_mem(strbuf[1], token[idx]): string_view{}, + }; } // @@ -193,6 +203,7 @@ ircd::gpt::vocab::debug(const mutable_buffer &out, ircd::string_view ircd::gpt::vocab::detokenize(const mutable_buffer &out, const vector_view &in) +noexcept { size_t off(0); for(const u16 &id : in) @@ -228,9 +239,65 @@ ircd::gpt::vocab::detokenize(const mutable_buffer &out, // tokenize // +uint16_t +ircd::gpt::vocab::tokenize(const string_view &in) +{ + char str_buf[16]; + const string_view str + { + str_buf, copy(str_buf, in) + }; + + u16 buf[16]; + const auto out + { + tokenize(buf, str) + }; + + if(unlikely(out.size() != 1)) + throw error + { + "Input tokenizes to %zu tokens.", + out.size() + }; + + return buf[0]; +} + +uint16_t +ircd::gpt::vocab::tokenize(const_buffer &in) +noexcept +{ + char str_buf[16]; + const string_view str + { + str_buf, copy(str_buf, in) + }; + + u16 buf[16]; + const auto out + { + tokenize(buf, str) + }; + + const auto &tok + { + get_token(buf[0]) + }; + + const auto consumed + { + simd::strlen(tok) + }; + + consume(in, consumed); + return buf[0]; +} + ircd::vector_view ircd::gpt::vocab::tokenize(const vector_view &out, const string_view &in) +noexcept { using input_t = u8x16; using block_t = u16x16; @@ -801,13 +868,8 @@ ircd::gpt::vocab::bpe_score(u16 (&score)[16], ircd::u16 ircd::gpt::vocab::find_token(const u8x16 string) { - const auto *const __restrict__ token - { - reinterpret_cast(vocab::token) - }; - for(uint i(0); i < tokens; ++i) - if(simd::streq(string, token[i])) + if(simd::streq(string, get_token(i))) return i; return u16(-1U); @@ -835,3 +897,14 @@ ircd::gpt::vocab::find_merge(const u8x16 a, return u16(-1U); } + +ircd::u8x16 +ircd::gpt::vocab::get_token(const u16 idx) +{ + const auto *const __restrict__ token + { + reinterpret_cast(vocab::token) + }; + + return token[idx]; +}