ircd::gpt: Various refactoring.

2024-12-25 15:04:10 +01:00 · 2022-06-19 18:59:29 -07:00 · 2022-06-19 18:59:29 -07:00 · 78848925ee
commit 78848925ee
parent 31e078506a
28 changed files with 5362 additions and 2644 deletions
--- a/include/ircd/gpt/ctrl.h
+++ b/include/ircd/gpt/ctrl.h
@ -11,89 +11,71 @@
 #pragma once
 #define HAVE_IRCD_GPT_CTRL_H

-/// Epoch Precision Interrupt Controller
-///
-struct ircd_gpt_ctrl_epic
-{
-	/// Accumulates the number of task cycles. The cycle counter is incremented
-	/// by device software after each repetition of the kernel pipeline to
-	/// produce one additional token.
-	ulong cycle;
-
-	/// Accumulates the epoch count for the task. The counter is incremented
-	/// by one in device software before control returns back to the host.
-	/// Several cycles may occur during each epoch.
-	ulong epoch;
-
-	/// Accumulates the training epoch count for the task. The counter is
-	/// incremented by one in device software for each backward propagation.
-	ulong step;
-
-	/// Updated by the host with the value of the timestamp register as sampled
-	/// immediately before each transfer of control to the device.
-	ulong host_tsc;
-
-	/// Accumulates time in microseconds elapsed for the task.
-	ulong elapsed;
-};
-
-/// Token Context Buffer (Control Block)
-///
-struct ircd_gpt_ctrl_tokens
-{
-	/// Token ring head. Tokens in the ring extend behind the head for
-	/// `tokens`. The `head` value is automatically modulated by device
-	/// software to wrap around the ring.
-	uint head;
-
-	/// Token counter. The counter indicates the number of valid tokens in
-	/// the context buffer. This value must not exceed the buffer size.
-	uint count;
-
-	/// Accumulates the number of tokens produced by the task. Several tokens
-	/// may be produced each epoch, but currently only one token is produced
-	/// each cycle.
-	ulong produced;
-
-	/// Accumulates the number tokens witnessed by the task. The number of
-	/// tokens in the context for each cycle is counted as witnessed.
-	ulong witnessed;
-};
-
-/// Target label register (abridged)
-///
+/// Result logit control block.
 struct ircd_gpt_ctrl_logit
 {
 	/// Vocabulary token.
 	ushort token;

 	/// Padding #0.
-	ushort _pad0;
+	ushort flag;

 	/// Result logit softmax probability.
 	float samax;
 };

-/// Target label register (full)
-///
+/// Target label control block. Results for each target are registered
+/// and state is updated each cycle.
 struct ircd_gpt_ctrl_label
 {
-	/// Vocabulary token.
-	ushort token;
-
-	/// Padding #0.
-	ushort _pad0;
-
-	/// Result logit softmax probability.
-	float samax;
+	/// Logit descriptor
+	struct ircd_gpt_ctrl_logit logit;

 	/// Loss state
 	struct ircd_math_mean loss;

 	/// Perplexity state
-	struct ircd_math_mean perp;
-}
-__attribute__((aligned(64)));
+	struct ircd_math_mean ppl;
+};
+
+/// Master clock
+struct ircd_gpt_ctrl_clk
+{
+	/// Master clock. The cycle count is incremented by one in device software
+	/// after each repetition of the kernels producing one additional token.
+	/// The cycle count resets to zero before the beginning of each sample.
+	uint cycle;
+
+	/// Master clock. Sample consists of one or more cycles; sample count is
+	/// incremented by one in device software after every accept condition,
+	/// growing monotonically for the `step`; resets to zero each `step`.
+	uint samp;
+
+	/// Master clock. Step (or timestep) consists of one or more samples. Step
+	/// count is incremented by one in device software after each backward
+	/// propagation. Step grows monotonically even across epochs.
+	uint step;
+
+	/// Master clock. Epoch consists of one or more steps; epoch count is
+	/// incremented by one after every backward propagation.
+	uint epoch;
+};
+
+/// Profiling block
+struct ircd_gpt_ctrl_prof
+{
+	/// Host timestamp sampled at last control page transfer to the device.
+	ulong released;
+
+	/// Host timestamp sampled when this control page accquired by the host.
+	ulong acquired;
+
+	/// Device timestamp at beginning of cycle.
+	ulong entered;
+
+	/// Device timestamp at end of cycle.
+	ulong finished;
+};

 /// Task Control Page
 ///
@ -104,50 +86,94 @@ __attribute__((aligned(64)));
 ///
 struct ircd_gpt_ctrl
 {
-	/// Epoch counting & interrupt control block.
-	struct ircd_gpt_ctrl_epic epic;
+	/// Accept register. If >= 0 the cycle produced a token which satisfies the
+	/// indicated accept condition.
+	int accept;

-	/// Token context control block. Contains state for the token context
-	/// buffer; the buffer with the tokens themselves is elsewhere.
-	struct ircd_gpt_ctrl_tokens tokens;
+	/// Dispatch register. Device software wishes additional cycles to be
+	/// commanded by the host. Effectively minimum distance until next accept.
+	uint dispatch;

-	/// Top result summary from the softed result logit softmax vector. This
-	/// is updated each cycle by device software with extended statistics on
-	/// the top N results.
-	struct ircd_gpt_ctrl_logit top[16];
+	/// Token counter. The counter indicates the number of valid tokens in
+	/// the context buffer. This value must not exceed the opts.buffer_size.
+	/// This value should not exceed the opts.context_size at least for now.
+	uint count;

-	/// Target label control block. Results for each target are registered
-	/// and state is updated each cycle.
-	struct ircd_gpt_ctrl_label label[4];
+	/// Token counter. The counter indicates the number of valid tokens in
+	/// the context buffer. This value must not exceed the opts.buffer_size.
+	/// This value should not exceed the opts.context_size at least for now.
+	uint tokens;

-	/// Result logit vector softmax internal state.
-	struct ircd_math_samax samax;
+	/// Master clock.
+	struct ircd_gpt_ctrl_clk clk;
+
+	/// Profiling related.
+	struct ircd_gpt_ctrl_prof prof;

 	/// PRNG xoshiro256 internal state (note: see opts.h to seed the prng).
 	ulong rand[4];

-	/// Perform backprop TODO: XXX
-	bool prop;
+	/// Top result summary from the softed result logit softmax vector. This
+	/// is updated each cycle by device software with extended statistics on
+	/// the top N results.
+	struct ircd_gpt_ctrl_logit top[16] __attribute__((aligned(8)));

-	/// Header magic 0xC7012C70
-	uint magic;
+	/// User label control block. Results for each target are registered
+	/// and state is updated each cycle; averaged for each step.
+	struct ircd_gpt_ctrl_label label[14] __attribute__((aligned(64)));

-	/// The token buffer starts at offset 2048 and continues to the end of
-	/// the page; options specify the size of the tokens buffer in tokens.
-	/// Additional pages must be attached for larger buffer sizes.
-	ushort token[] __attribute__((aligned(2048)));
+	/// Target result label; traces training token.
+	struct ircd_gpt_ctrl_label target __attribute__((aligned(64)));
+
+	/// Selected result token label.
+	struct ircd_gpt_ctrl_label select __attribute__((aligned(64)));
+
+	/// Incremented when the target is the selected token.
+	uint hit, miss;
+
+	/// Attention summary; [layer][head] => [token]. Each value points to a
+	/// position in the token buffer. The top-scoring softmax result for each
+	/// head in each layer is attending to the token.at(value) for this cycle.
+	/// These values are completely updated every cycle.
+	ushort attn[12][12];
+
+	/// Header magic: host sets 0xDEADBEEF before release to device; device
+	/// sets 0xC7012C7012 before release to host.
+	ulong magic;
+
+	/// Token buffer
+	ushort token[1024] __attribute__((aligned(2048)));
 }
 __attribute__((aligned(4096)));

-#ifdef __cplusplus
+#if defined(__cplusplus)
 namespace ircd::gpt
 {
 	using ctrl = struct ircd_gpt_ctrl;
+	using ctrl_clk = struct ircd_gpt_ctrl_clk;
+	using ctrl_prof = struct ircd_gpt_ctrl_prof;
+	using ctrl_logit = struct ircd_gpt_ctrl_logit;
+	using ctrl_label = struct ircd_gpt_ctrl_label;
+
+	string_view debug_token_at(const mutable_buffer &, const opts &, const ctrl &, const uint, const uint fmt = -1U);
+	string_view debug_token(const mutable_buffer &, const opts &, const ctrl &, const uint fmt = -1U);
+	string_view debug_head(const mutable_buffer &, const opts &, const ctrl_clk &);
+	string_view debug_head(const mutable_buffer &, const opts &, const ctrl &);
+	string_view debug(const mutable_buffer &, const opts &, const ctrl_logit &, const uint fmt = 0);
+	string_view debug(const mutable_buffer &, const opts &, const ctrl_label &, const uint fmt = 0);
+	string_view debug(const mutable_buffer &, const opts &, const ctrl &);
+
+	string_view debug_attn(const mutable_buffer &, const opts &, const ctrl &, const uint);
+	string_view debug_label(const mutable_buffer &, const opts &, const ctrl &, const uint, const uint fmt = 0);
+	string_view debug_top(const mutable_buffer &, const opts &, const ctrl &, const uint);
 }
 #endif

-#ifdef __cplusplus
-static_assert(sizeof(struct ircd_gpt_ctrl) == 4096);
+#if defined(__cplusplus)
+static_assert(sizeof(struct ircd_gpt_ctrl) % 4096 == 0);
+#endif
+
+#if defined(__cplusplus) && defined(__GLIBCXX__)
 static_assert(offsetof(struct ircd_gpt_ctrl, token) == 2048);
 static_assert(std::is_standard_layout<struct ircd_gpt_ctrl>::value);
 #endif
--- a/include/ircd/gpt/epoch.h
+++ b/include/ircd/gpt/epoch.h
@ -0,0 +1,38 @@
+// Matrix Construct
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2022 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_GPT_EPOCH_H
+
+/// Perform one task epoch on the device.
+///
+struct ircd::gpt::epoch
+{
+	gpt::task &task;
+	pipe::desc &desc;
+
+	const gpt::opts &opts;
+	gpt::ctrl &ctrl;
+
+	const uint id;
+	const size_t start, stop;
+	f32 *moment[2];
+
+	pipe::prof profile;
+
+	void profile_accumulate(const pipe::prof &);
+
+  public:
+	bool done() const noexcept;
+	bool operator()();
+
+	epoch(gpt::task &);
+	~epoch() noexcept;
+};
--- a/include/ircd/gpt/gpt.h
+++ b/include/ircd/gpt/gpt.h
@ -17,16 +17,36 @@ namespace ircd::gpt
 {
 	IRCD_EXCEPTION(ircd::error, error)

+	struct samp;
+	struct step;
+	struct epoch;
 	struct task;

 	extern log::log log;
 }

 #include "vocab.h"
-#include "model.h"
 #include "token.h"
+#include "vector.h"
+#include "model.h"
 #include "opts.h"
 #include "ctrl.h"
-#include "task.h"
 #include "pipe/pipe.h"
+#include "samp.h"
+#include "step.h"
+#include "epoch.h"
+#include "task.h"
 #include "generate.h"
+
+namespace ircd::gpt
+{
+	void backprop(const opts &, const u32, const f32, model::decoder &, f32 *const __restrict__ [2]) noexcept;
+
+	void log_debug(const opts &, const ctrl &);
+	void log_debug_token(const opts &, const ctrl &, const uint);
+	void log_debug_attns(const opts &, const ctrl &);
+	void log_debug_attns_top(const opts &, const ctrl &);
+	void log_debug_labels(const opts &, const ctrl &);
+	void log_debug_topn(const opts &, const ctrl &);
+	void log_debug_prof(const opts &, const ctrl &, const pipe::prof &);
+}
--- a/include/ircd/gpt/gpu.h
+++ b/include/ircd/gpt/gpu.h
@ -0,0 +1,290 @@
+// Matrix Construct
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2022 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+typedef union ircd_gpt_vector_f32x4 ircd_gpt_vectorv __attribute__((aligned(4096)));
+typedef struct ircd_gpt_attn_qkv_f32x4 ircd_gpt_attn_qkvv __attribute__((aligned(4096)));
+typedef union ircd_gpt_attn_aperature_f32x4 ircd_gpt_attn_aperaturev __attribute__((aligned(4096)));
+typedef union ircd_gpt_ffnn_aperature_f32x4 ircd_gpt_ffnn_aperaturev __attribute__((aligned(4096)));
+
+//
+// Frontside
+//
+
+void
+__attribute__((internal_linkage))
+ircd_gpt_norm_fmad(__local ircd_gpt_vectorv *out,
+                   __local const ircd_gpt_vectorv *in,
+                   __global const ircd_gpt_vectorv *bias,
+                   __global const ircd_gpt_vectorv *weight,
+                   const uint li);
+
+void
+__attribute__((internal_linkage))
+ircd_gpt_norm(__local ircd_gpt_vectorv *out,
+              __local const ircd_gpt_vectorv *in,
+              __local ircd_gpt_vectorv *tmp,
+              __global const ircd_gpt_vectorv *bias,
+              __global const ircd_gpt_vectorv *weight,
+              const uint ln,
+              const uint li);
+
+void
+__attribute__((internal_linkage))
+ircd_gpt_ffnn_gelu(__local ircd_gpt_ffnn_aperaturev *out,
+                   __local const ircd_gpt_ffnn_aperaturev *in,
+                   const uint i);
+
+void
+__attribute__((internal_linkage))
+ircd_gpt_ffnn_fcon_tmul(__constant const struct ircd_gpt_opts *opts,
+                        __local ircd_gpt_ffnn_aperaturev *out,
+                        __local const ircd_gpt_vectorv *in,
+                        __global const ircd_gpt_ffnn_aperaturev *bias,
+                        __global const ircd_gpt_ffnn_aperaturev *weight,
+                        const uint li);
+
+void
+__attribute__((internal_linkage))
+ircd_gpt_ffnn_fcon(__global const struct ircd_gpt_ctrl *ctrl,
+                   __constant const struct ircd_gpt_opts *opts,
+                   __local ircd_gpt_ffnn_aperaturev *out,
+                   __local const ircd_gpt_vectorv *in,
+                   __global const ircd_gpt_ffnn_aperaturev *bias,
+                   __global const ircd_gpt_ffnn_aperaturev *weight,
+                   const uint ln,
+                   const uint li);
+
+void
+__attribute__((internal_linkage))
+ircd_gpt_ffnn_proj_tmul(__constant const struct ircd_gpt_opts *opts,
+                        __local ircd_gpt_vectorv *out,
+                        __local const ircd_gpt_ffnn_aperaturev *in,
+                        __global const ircd_gpt_vectorv *bias,
+                        __global const ircd_gpt_vectorv *weight,
+                        const uint li);
+
+void
+__attribute__((internal_linkage))
+ircd_gpt_ffnn(__global const struct ircd_gpt_ctrl *ctrl,
+              __constant const struct ircd_gpt_opts *opts,
+              __local ircd_gpt_vectorv *vector,
+              __local ircd_gpt_ffnn_aperaturev *buf,
+              __local ircd_gpt_vectorv *tmp,
+              __global const ircd_gpt_vectorv *norm_bias,
+              __global const ircd_gpt_vectorv *norm_weight,
+              __global const ircd_gpt_ffnn_aperaturev *fcon_bias,
+              __global const ircd_gpt_ffnn_aperaturev *fcon_weight,
+              __global const ircd_gpt_vectorv *proj_bias,
+              __global const ircd_gpt_vectorv *proj_weight,
+              const uint ln,
+              const uint li);
+
+void
+__attribute__((internal_linkage))
+ircd_gpt_attn_fcon_tmul(__constant const struct ircd_gpt_opts *opts,
+                        __local ircd_gpt_attn_aperaturev *out,
+                        __local const ircd_gpt_vectorv *in,
+                        __global const ircd_gpt_attn_aperaturev *bias,
+                        __global const ircd_gpt_attn_aperaturev *weight,
+                        const uint ln,
+                        const uint li);
+
+__kernel void
+__attribute__((visibility("protected")))
+ircd_gpt_attn_fcon(__global const struct ircd_gpt_ctrl *ctrl,
+                   __constant const struct ircd_gpt_opts *opts,
+                   __private const uint layer,
+                   __global ircd_gpt_attn_aperaturev *state,
+                   __global const ircd_gpt_vectorv *accum,
+                   __global const ircd_gpt_vectorv *norm_bias,
+                   __global const ircd_gpt_vectorv *norm_weight,
+                   __global const ircd_gpt_attn_aperaturev *fcon_bias,
+                   __global const ircd_gpt_attn_aperaturev *fcon_weight);
+
+//
+// head
+//
+
+void
+__attribute__((internal_linkage))
+_ircd_gpt_lm_embed(__global const struct ircd_gpt_ctrl *ctrl,
+                   __constant const struct ircd_gpt_opts *opts,
+                   __global ircd_gpt_vectorv *accum,
+                   __global const ircd_gpt_vectorv *pos,
+                   __global const ircd_gpt_vectorv *vocab,
+                   const ushort out_idx,
+                   const ushort tok_idx,
+                   const ushort word_idx);
+
+__kernel void
+__attribute__((visibility("protected")))
+ircd_gpt_lm_embed(__global const struct ircd_gpt_ctrl *ctrl,
+                  __constant const struct ircd_gpt_opts *opts,
+                  __global ircd_gpt_vectorv *accum,
+                  __global const ircd_gpt_vectorv *pos,
+                  __global const ircd_gpt_vectorv *vocab);
+
+__kernel void
+__attribute__((visibility("protected")))
+ircd_gpt_lm_norm(__global const struct ircd_gpt_ctrl *ctrl,
+                 __constant const struct ircd_gpt_opts *opts,
+                 __global ircd_gpt_vectorv *accum,
+                 __global const ircd_gpt_vectorv *norm_bias,
+                 __global const ircd_gpt_vectorv *norm_weight);
+
+__kernel void
+__attribute__((visibility("protected")))
+ircd_gpt_lm_logit(__global const struct ircd_gpt_ctrl *ctrl,
+                  __constant const struct ircd_gpt_opts *opts,
+                  __global float *logit,
+                  __global const ircd_gpt_vectorv *accum,
+                  __global const ircd_gpt_vectorv *pos,
+                  __global const ircd_gpt_vectorv *vocab);
+
+__kernel void
+__attribute__((visibility("protected")))
+ircd_gpt_lm_logsm(__global struct ircd_gpt_ctrl *ctrl,
+                  __constant const struct ircd_gpt_opts *opts,
+                  __global float logit[65536]);
+
+void
+__attribute__((internal_linkage))
+ircd_gpt_lm_result_top(__local struct ircd_gpt_ctrl *ctrl,
+                       __constant const struct ircd_gpt_opts *opts,
+                       __local const ushort *idx,
+                       __global const float *logsm,
+                       const uint i);
+
+void
+__attribute__((internal_linkage))
+ircd_gpt_lm_result_label_mean(__local struct ircd_gpt_ctrl *ctrl,
+                              __constant const struct ircd_gpt_opts *opts,
+                              __local struct ircd_math_mean *mean,
+                              const float last);
+
+void
+__attribute__((internal_linkage))
+ircd_gpt_lm_result_label(__local struct ircd_gpt_ctrl *ctrl,
+                         __constant const struct ircd_gpt_opts *opts,
+                         __local struct ircd_gpt_ctrl_label *label,
+                         __global const float *logsm);
+
+ushort
+__attribute__((internal_linkage))
+ircd_gpt_lm_result_select(__local struct ircd_gpt_ctrl *ctrl,
+                          __constant const struct ircd_gpt_opts *opts,
+                          __local const ushort *idx,
+                          __global const float *logsm);
+
+uint
+__attribute__((internal_linkage))
+ircd_gpt_accept_len(__local struct ircd_gpt_ctrl *ctrl,
+                    __constant const struct ircd_gpt_opts *opts,
+                    const uint i);
+
+uint
+__attribute__((internal_linkage))
+ircd_gpt_accept_match(__local struct ircd_gpt_ctrl *ctrl,
+                      __constant const struct ircd_gpt_opts *opts,
+                      const uint i);
+
+int
+__attribute__((internal_linkage))
+ircd_gpt_accept_check(__local struct ircd_gpt_ctrl *ctrl,
+                      __constant const struct ircd_gpt_opts *opts);
+
+void
+__attribute__((internal_linkage))
+ircd_gpt_accept(__local struct ircd_gpt_ctrl *ctrl,
+                __constant const struct ircd_gpt_opts *opts);
+
+//
+// backside
+//
+
+void
+__attribute__((internal_linkage))
+ircd_gpt_prop_elem(__global const struct ircd_gpt_ctrl *ctrl,
+                   __constant const struct ircd_gpt_opts *opts,
+                   __global float4 *param_,
+                   __global float4 *exp_avg_,
+                   __global float4 *exp_avg_sqr_);
+
+//
+// backpropagations
+//
+
+__kernel void
+__attribute__((visibility("protected")))
+ircd_gpt_norm_prop(__global const struct ircd_gpt_ctrl *ctrl,
+                   __constant const struct ircd_gpt_opts *opts,
+                   __global ircd_gpt_vectorv *bias,
+                   __global ircd_gpt_vectorv *bias_m0,
+                   __global ircd_gpt_vectorv *bias_m1,
+                   __global ircd_gpt_vectorv *weight,
+                   __global ircd_gpt_vectorv *weight_m0,
+                   __global ircd_gpt_vectorv *weight_m1);
+
+__kernel void
+__attribute__((visibility("protected")))
+ircd_gpt_coil_prop_attn(__global const struct ircd_gpt_ctrl *ctrl,
+                        __constant const struct ircd_gpt_opts *opts,
+                        __global ircd_gpt_vectorv *norm_bias,
+                        __global ircd_gpt_vectorv *norm_bias_m0,
+                        __global ircd_gpt_vectorv *norm_bias_m1,
+                        __global ircd_gpt_vectorv *norm_weight,
+                        __global ircd_gpt_vectorv *norm_weight_m0,
+                        __global ircd_gpt_vectorv *norm_weight_m1,
+                        __global ircd_gpt_attn_aperaturev *fcon_bias,
+                        __global ircd_gpt_attn_aperaturev *fcon_bias_m0,
+                        __global ircd_gpt_attn_aperaturev *fcon_bias_m1,
+                        __global ircd_gpt_attn_aperaturev *fcon_weight,
+                        __global ircd_gpt_attn_aperaturev *fcon_weight_m0,
+                        __global ircd_gpt_attn_aperaturev *fcon_weight_m1,
+                        __global ircd_gpt_vectorv *proj_bias,
+                        __global ircd_gpt_vectorv *proj_bias_m0,
+                        __global ircd_gpt_vectorv *proj_bias_m1,
+                        __global ircd_gpt_vectorv *proj_weight,
+                        __global ircd_gpt_vectorv *proj_weight_m0,
+                        __global ircd_gpt_vectorv *proj_weight_m1);
+
+__kernel void
+__attribute__((visibility("protected")))
+ircd_gpt_coil_prop_ffnn(__global const struct ircd_gpt_ctrl *ctrl,
+                        __constant const struct ircd_gpt_opts *opts,
+                        __global ircd_gpt_vectorv *norm_bias,
+                        __global ircd_gpt_vectorv *norm_bias_m0,
+                        __global ircd_gpt_vectorv *norm_bias_m1,
+                        __global ircd_gpt_vectorv *norm_weight,
+                        __global ircd_gpt_vectorv *norm_weight_m0,
+                        __global ircd_gpt_vectorv *norm_weight_m1,
+                        __global ircd_gpt_ffnn_aperaturev *fcon_bias,
+                        __global ircd_gpt_ffnn_aperaturev *fcon_bias_m0,
+                        __global ircd_gpt_ffnn_aperaturev *fcon_bias_m1,
+                        __global ircd_gpt_ffnn_aperaturev *fcon_weight,
+                        __global ircd_gpt_ffnn_aperaturev *fcon_weight_m0,
+                        __global ircd_gpt_ffnn_aperaturev *fcon_weight_m1,
+                        __global ircd_gpt_vectorv *proj_bias,
+                        __global ircd_gpt_vectorv *proj_bias_m0,
+                        __global ircd_gpt_vectorv *proj_bias_m1,
+                        __global ircd_gpt_vectorv *proj_weight,
+                        __global ircd_gpt_vectorv *proj_weight_m0,
+                        __global ircd_gpt_vectorv *proj_weight_m1);
+
+__kernel void
+__attribute__((visibility("protected")))
+ircd_gpt_lm_embed_prop(__global const struct ircd_gpt_ctrl *ctrl,
+                       __constant const struct ircd_gpt_opts *opts,
+                       __global ircd_gpt_vectorv *pos,
+                       __global ircd_gpt_vectorv *pos_m0,
+                       __global ircd_gpt_vectorv *pos_m1,
+                       __global ircd_gpt_vectorv *token,
+                       __global ircd_gpt_vectorv *token_m0,
+                       __global ircd_gpt_vectorv *token_m1);
--- a/include/ircd/gpt/model.h
+++ b/include/ircd/gpt/model.h
@ -20,66 +20,105 @@ namespace ircd::gpt::model
 	struct embed;
 	struct decoder;

-	constexpr auto align {64};
+	struct prop;
+	struct text;
+
 	extern decoder *default_model;
+	extern float *default_moment[2];
+	extern float *default_checkpoint[3];
 	extern string_view default_dataset;
 	extern std::vector<json::object> default_data;
+
+	constexpr auto alignment {4096};
 }

+/// Layer normalization
+struct ircd::gpt::model::norm
+{
+	union ircd_gpt_vector
+	bias alignas(alignment),
+	weight alignas(alignment);
+};
+
 /// Attention aperature
 struct ircd::gpt::model::attn
 {
-	float
-	attn_bias    alignas(align) [2304],
-	attn_weight  alignas(align) [768][2304];
+	model::norm
+	norm;

-	float
-	proj_bias    alignas(align) [768],
-	proj_weight  alignas(align) [768][768];
+	union ircd_gpt_attn_aperature
+	fcon_bias alignas(alignment),
+	fcon_weight alignas(alignment) [768];
+
+	union ircd_gpt_vector
+	proj_bias alignas(alignment),
+	proj_weight alignas(alignment) [768];
 };

 /// Feed-forward neural network
 struct ircd::gpt::model::ffnn
 {
-	float
-	fc_bias      alignas(align) [3072],
-	fc_weight    alignas(align) [768][3072];
+	model::norm
+	norm;

-	float
-	proj_bias    alignas(align) [768],
-	proj_weight  alignas(align) [3072][768];
-};
+	union ircd_gpt_ffnn_aperature
+	fcon_bias alignas(alignment),
+	fcon_weight alignas(alignment) [768];

-/// Layer normalization
-struct ircd::gpt::model::norm
-{
-	float
-	bias    alignas(align) [768],
-	weight  alignas(align) [768];
+	union ircd_gpt_vector
+	proj_bias alignas(alignment),
+	proj_weight alignas(alignment) [3072];
 };

 /// Transformer block
 struct ircd::gpt::model::block
 {
-	norm ln1;
-	model::attn attn;
+	model::attn
+	attn;

-	norm ln2;
-	model::ffnn ffnn;
+	model::ffnn
+	ffnn;
 };

 /// Vocabulary embeddings
 struct ircd::gpt::model::embed
 {
-	float
-	pos     alignas(align) [1024][768],
-	token   alignas(align) [65536][768];
+	model::norm
+	norm;
+
+	union ircd_gpt_vector
+	pos alignas(alignment) [1024],
+	token alignas(alignment) [65536];
 };

-struct ircd::gpt::model::decoder
+/// Transformer decoder
+struct alignas(ircd::gpt::model::alignment)
+ircd::gpt::model::decoder
 {
-	block layer[12];
+	model::block
+	layer[12];

-	norm f;
-	embed word;
+	model::embed
+	embed;
+};
+
+struct ircd::gpt::model::prop
+{
+	static constexpr const char
+	*const ended {"ended"},
+	*const id {"id"},
+	*const length {"length"},
+	*const text {"text"};
+};
+
+struct ircd::gpt::model::text
+:json::tuple
+<
+	json::property<prop::ended, bool>,
+	json::property<prop::id, uint>,
+	json::property<prop::length, uint>,
+	json::property<prop::text, json::string>
+>
+{
+	using super_type::tuple;
 };
--- a/include/ircd/gpt/opts.h
+++ b/include/ircd/gpt/opts.h
@ -11,6 +11,13 @@
 #pragma once
 #define HAVE_IRCD_GPT_OPTS_H

+#if defined(__cplusplus)
+namespace ircd::gpt::model
+{
+	struct decoder;
+}
+#endif
+
 /// Task Options Page
 ///
 /// The option block is directly shared with task software as constant data.
@ -20,30 +27,23 @@
 ///
 struct ircd_gpt_opts
 {
-	#ifdef __cplusplus
-	ircd_gpt_opts(const ircd::gpt::model::decoder * = nullptr) noexcept;
+	#if defined(__cplusplus)
+	ircd_gpt_opts() noexcept;
 	#endif

-	/// Reference to the model (currently not available in device software).
-	#ifndef __cplusplus
-	const void *model;
-	#else
-	const ircd::gpt::model::decoder *model;
-	#endif
+	//
+	// Frontside
+	//

 	/// Seed for the task's PRNG.
 	ulong seed;

-	/// Limit number of output tokens. Default of -1 is unlimited; the number
-	/// of tokens generated will be limited by other factors.
-	uint limit;
-
 	/// Flip random coins over the top k logits each round. Setting to 1
 	/// deterministically selects the top logit.
 	uint top_k;

 	/// Flip a random coin between 0 and top_p ( = 90 = 0.9) for logit select.
-	uint top_p;
+	float top_p;

 	/// Registers the top n result logits in the ctrl block each cycle.
 	uint top_n;
@ -51,59 +51,25 @@ struct ircd_gpt_opts
 	/// Number of target labels to register results for in the ctrl block.
 	uint labels;

-	/// Bitbar toggling various debug modes
+	/// Number of pages available after the control block for the frame log.
+	uint frames;
+
+	/// Limit number of output tokens. Default of -1; other halting conditions
+	/// will be used.
+	uint limit;
+
+	/// Bitbar toggling various debug modes.
 	uint debug;

-	/// Specifies the token context size in tokens.
-	uint context_tokens;
+	/// Accepting condition codes.
+	ushort accept[4][8] __attribute__((aligned(4)));

-	/// Specifies the token buffer size in tokens.
-	uint buffer_tokens;
+	//
+	// Backside
+	//

-	/// Decoding layers.
-	uint layers;
-
-	/// SIMD lane count.
-	uint lanes;
-
-	/// Embedding vector elements
-	uint embed_elems;
-
-	/// Cross-attention dimension
-	uint attn_rank;
-
-	/// Attention unit fcon width multiple
-	uint attn_mult;
-
-	/// (computed) MLP unit fcon width multiple
-	uint ffnn_mult;
-
-	/// (computed) attention unit width multiple
-	uint attn_elems;
-
-	/// FFNN unit width multiple
-	uint ffnn_elems;
-
-	/// SIMD lane count
-	uint lanes;
-
-	/// (computed) `embed_elems` / `lanes`
-	uint embed_width;
-
-	/// (computed) Attention unit X dimension
-	uint attn_width;
-
-	/// (computed) Attention unit Y dimension
-	uint attn_height;
-
-	/// (computed) MLP backend X dimension
-	uint ffnn_width;
-
-	/// (computed) MLP backend Y dimension
-	uint ffnn_height;
-
-	/// Number of possible target n-grams.
-	uint logits;
+	/// Samples per step.
+	uint batch_size;

 	/// Training steps
 	uint training_steps;
@ -122,15 +88,90 @@ struct ircd_gpt_opts

 	/// Denorm smoothing
 	float epsilon;
+
+	/// Tuning convergence rate
+	float lambda;
+
+	//
+	// Model dimensions
+	//
+
+	/// Number of possible target n-grams.
+	uint logits;
+
+	/// Specifies the token buffer size in tokens.
+	uint buffer_tokens;
+
+	/// Specifies the token context size in tokens.
+	uint context_tokens;
+
+	/// Decoding layers.
+	uint layers;
+
+	/// SIMD lane count.
+	uint lanes;
+
+	/// Embedding vector elements
+	uint embed_elems;
+
+	/// (computed) `embed_elems` / `lanes`
+	uint embed_width;
+
+	/// Cross-attention dimension
+	uint attn_rank;
+
+	/// Attention unit fcon width multiple
+	uint attn_mult;
+
+	/// (computed) attention unit width multiple
+	uint attn_elems;
+
+	/// (computed) Attention unit X dimension
+	uint attn_fcon_width;
+
+	/// (computed) Attention unit Y dimension
+	uint attn_fcon_height;
+
+	/// (computed) Attention unit X dimension
+	uint attn_proj_width;
+
+	/// (computed) Attention unit Y dimension
+	uint attn_proj_height;
+
+	/// (computed) Packed attention array total element count
+	uint attn_self_elems;
+
+	/// MLP unit fcon width multiple
+	uint ffnn_mult;
+
+	/// (computed) FFNN unit width multiple
+	uint ffnn_elems;
+
+	/// (computed) MLP backend X dimension
+	uint ffnn_fcon_width;
+
+	/// (computed) MLP backend Y dimension
+	uint ffnn_fcon_height;
+
+	/// (computed) MLP backend X dimension
+	uint ffnn_proj_width;
+
+	/// (computed) MLP backend Y dimension
+	uint ffnn_proj_height;
 }
 __attribute__((aligned(4096)));

-#ifdef __cplusplus
+#if defined(__cplusplus)
 namespace ircd::gpt
 {
 	using opts = ::ircd_gpt_opts;
 }
+#endif

+#if defined(__cplusplus)
 static_assert(sizeof(struct ircd_gpt_opts) == 4096);
+#endif
+
+#if defined(__cplusplus) && defined(__GLIBCXX__)
 static_assert(std::is_standard_layout<struct ircd_gpt_opts>::value);
 #endif
--- a/include/ircd/gpt/pipe/code.h
+++ b/include/ircd/gpt/pipe/code.h
@ -16,12 +16,15 @@ struct ircd::gpt::pipe::code
 :cl::code
 {
 	static conf::item<std::string> default_path;
-	static conf::item<std::string> default_opts;
+	static conf::item<std::string> default_compile_opts;
+	static conf::item<std::string> default_link_opts;
 	static conf::item<std::string> cache_path;

 	static string_view make_cache_path(const mutable_buffer &);
-	static cl::code from_cache(const string_view &opts, const string_view &path);
-	static cl::code from_source(const string_view &opts);
+
+	static cl::code from_cache();
+	static cl::code from_source(const string_view &comp_opts = {}, const string_view &link_opts = {});
+	static cl::code from_bitcode(const string_view &link_opts = {});

 	void set_cache(const string_view &path);
 	bool put_cache();
--- a/include/ircd/gpt/pipe/cycle.h
+++ b/include/ircd/gpt/pipe/cycle.h
@ -0,0 +1,48 @@
+// Matrix Construct
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2022 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_GPT_PIPE_CYCLE_H
+
+namespace ircd::gpt::pipe
+{
+	const gpt::ctrl &acquire(cycle &);
+}
+
+/// Perform one task cycle on the device. The cycle is an atomic unit of
+/// generation producing one token.
+///
+/// Constructions of this object enqueue device commands to complete an
+/// additional cycle of the task as provided by `ctrl` and `opts`.
+///
+/// Destructions of this object yield the ircd::ctx until those commands
+/// are complete.
+///
+struct ircd::gpt::pipe::cycle
+{
+	struct profile;
+
+	static constexpr size_t stages
+	{
+		4 + 3 + (12 * 2) + 4 + 2 + (12 * 2) + 1
+	};
+
+	pipe::desc &desc;
+	uint tick;
+	uint count;
+	uint tokens;
+	uint cached;
+	uint frame;
+	pipe::range range;
+	std::array<cl::exec, stages> stage;
+
+	cycle(gpt::samp &);
+	~cycle() noexcept;
+};
--- a/include/ircd/gpt/pipe/desc.h
+++ b/include/ircd/gpt/pipe/desc.h
@ -16,43 +16,62 @@ struct ircd::gpt::pipe::desc
 {
 	struct layer;

+	// Model descriptor
 	pipe::model *model;
+
+	// Code descriptor
 	pipe::code *code;

+	// Memories
 	cl::data
-	state,         // [root] projection (layers * tokens * embed * 3 * float)
+	opts,          // [root] options page
+	ctrl,          // [root] control page
 	master,        // [root] single allocation for additional buffers:
+	state,         // [-sub] projection (layers * tokens * embed * 3 * float)
 	accum,         // [-sub] accumulator (tokens * embed * float)
 	logit,         // [-sub] result logit vector (50257 * float)
-	logsm,         // [-sub] outputs distribution (50257 * float)
-	ctrl,          // [root] control page
-	opts;          // [root] options page
+	attns,         // [-sub] result attention softmax
+	frame[8];      // [root] result stream

+	// Programs
 	cl::kern
+	alloc,
+	enter,
 	lm_embed,
 	lm_norm,
 	lm_logit,
 	lm_logsm,
 	lm_select,
-	lm_norm_backprop,
-	lm_embed_backprop;
+	lm_prop_embed,
+	lm_prop_norm,
+	leave[8];

-	std::unique_ptr<struct desc::layer>
-	layer[12];
+	/// Coil pack
+	std::unique_ptr<struct desc::layer> layer[12];

-	desc(pipe::code &, pipe::model &);
+	/// Attention projection for first N tokens already contained in `state`.
+	uint cached {0};
+
+	desc(const gpt::opts *const &,
+	     gpt::ctrl *const &,
+	     pipe::model &model,
+	     pipe::code &code);
 };

+/// Pipe descriptor: coil layer
 struct ircd::gpt::pipe::desc::layer
 {
 	cl::data
-	state;         // [-sub] qry/key/val projection (tokens * embed * 3 * float)
+	state,         // [-sub] qry/key/val projection (tokens * embed * 3 * float)
+	attns;         // [-sub] attn softmax result (((tokens * tokens) / 2) * 12 * float)

 	cl::kern
-	negative,
-	positive,
-	backattn,
-	backffnn;
+	attn,
+	ffnn,
+	prop_attn,
+	prop_ffnn;

-	layer(pipe::desc &, const int);
+	layer(pipe::desc &,
+	      const gpt::opts *const &,
+	      const uint laynum);
 };
--- a/include/ircd/gpt/pipe/exec.h
+++ b/include/ircd/gpt/pipe/exec.h
@ -1,65 +0,0 @@
-// Matrix Construct
-//
-// Copyright (C) Matrix Construct Developers, Authors & Contributors
-// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
-//
-// Permission to use, copy, modify, and/or distribute this software for any
-// purpose with or without fee is hereby granted, provided that the above
-// copyright notice and this permission notice is present in all copies. The
-// full license for this software is available in the LICENSE file.
-
-#pragma once
-#define HAVE_IRCD_GPT_PIPE_EXEC_H
-
-/// Perform one task cycle on the device.
-///
-/// Constructions of this object enqueue device commands to complete an
-/// additional epoch of the task as provided by `ctrl` and `opts`.
-///
-/// Destructions of this object yield the ircd::ctx until those commands
-/// are complete.
-///
-/// Consecutive cycles on the device without stopping (a.k.a. pipelining) is
-/// achieved by constructing several objects before following with destructions
-/// i.e in a std::deque.
-///
-struct ircd::gpt::pipe::exec
-{
-	pipe::desc *desc;
-
-	const_buffer
-	send_opts,         // Set when sending the options page.
-	send_ctrl,         // Set when sending the control page.
-	send_coil,         // Set when sending the updated model.
-	send_head;         // Set when sending the updated model.
-
-	mutable_buffer
-	recv_ctrl;         // Set when receiving the control page.
-
-	cl::kern::range
-	range_full,
-	range_last,
-	range_lm_embed,    // Dimension range of the lm_embed kernel.
-	range_negative,    // Dimension range of a layer kernel.
-	range_positive,    // Dimension range of a layer kernel.
-	range_lm_norm,     // Dimension range of the final norm kernel.
-	range_lm_logit,    // Dimension range of the language logit kernel.
-	range_lm_logsm,    // Dimension range of the language statistic kernel.
-	range_lm_select;   // Dimension range of the language token kernel.
-
-	cl::exec
-	release_opts,      // Release the options page.
-	release_ctrl,      // Release the control page.
-	release_coil,      // Release updates to the model.
-	release_head,      // Release updates to the model.
-	lm_embed,          // Compute token and positional embeddings.
-	coil[12 * 2],      // Pass over all layers.
-	lm_norm,           // Final normalization.
-	lm_logit,          // Compute language logits.
-	lm_logsm,          // Statistics on the logits.
-	lm_select,         // Select next token.
-	acquire_ctrl;      // Acquire the control page.
-
-	exec(task &, const size_t tokens, const bool rel, const bool acq);
-	~exec() noexcept;
-};
--- a/include/ircd/gpt/pipe/model.h
+++ b/include/ircd/gpt/pipe/model.h
@ -21,15 +21,15 @@ struct ircd::gpt::pipe::model
 	struct attn;
 	struct ffnn;
 	struct block;
+	struct embed;
 	struct decoder;
-	struct language;

+	const gpt::model::decoder *decode_const {nullptr};
+	gpt::model::decoder *decode_mutable {nullptr};
 	std::unique_ptr<model::decoder> decode;
-	std::unique_ptr<model::language> embed;
-	bool invalid {false};

-	model(const gpt::model::decoder &, const gpt::model::embed &);
-	model(gpt::model::decoder &, gpt::model::embed &);
+	model(const gpt::model::decoder &);
+	model(gpt::model::decoder &);
 	~model() noexcept;
 };

@ -49,8 +49,8 @@ struct ircd::gpt::pipe::model::tensor
 	bias,
 	weight;

-	tensor(cl::data *, const off_t, const const_buffer &bias, const const_buffer &weight);
-	tensor(cl::data *, const off_t, const mutable_buffer &bias, const mutable_buffer &weight);
+	tensor(cl::data *, const off_t, const const_buffer &bias, const off_t, const const_buffer &weight);
+	tensor(cl::data *, const off_t, const mutable_buffer &bias, const off_t, const mutable_buffer &weight);
 };

 struct ircd::gpt::pipe::model::attn
@ -60,8 +60,8 @@ struct ircd::gpt::pipe::model::attn
 	fcon,
 	proj;

-	attn(cl::data *, const off_t, const gpt::model::norm &, const gpt::model::attn &);
-	attn(cl::data *, const off_t, gpt::model::norm &, gpt::model::attn &);
+	attn(cl::data *, const off_t, const gpt::model::attn &);
+	attn(cl::data *, const off_t, gpt::model::attn &);
 };

 struct ircd::gpt::pipe::model::ffnn
@ -71,40 +71,33 @@ struct ircd::gpt::pipe::model::ffnn
 	fcon,
 	proj;

-	ffnn(cl::data *, const off_t, const gpt::model::norm &, const gpt::model::ffnn &);
-	ffnn(cl::data *, const off_t, gpt::model::norm &, gpt::model::ffnn &);
+	ffnn(cl::data *, const off_t, const gpt::model::ffnn &);
+	ffnn(cl::data *, const off_t, gpt::model::ffnn &);
 };

 struct ircd::gpt::pipe::model::block
 {
-	// Single layer memory roots
-	cl::data
-	master[3];
+	model::attn
+	attn;

-	// Layer units
-	model::attn attn;
-	model::ffnn ffnn;
+	model::ffnn
+	ffnn;

 	block(cl::data *, const off_t, const gpt::model::block &, const size_t);
 	block(cl::data *, const off_t, gpt::model::block &, const size_t);
-	block(const gpt::model::block &, const size_t);
-	block(gpt::model::block &, const size_t);
 };

-struct ircd::gpt::pipe::model::language
+struct ircd::gpt::pipe::model::embed
 {
-	cl::data
-	master[3];
+	tensor
+	norm;

 	matrix
 	pos,
 	token;

-	language(cl::data *, const off_t, const gpt::model::embed &);
-	language(cl::data *, const off_t, gpt::model::embed &);
-	language(const gpt::model::embed &);
-	language( gpt::model::embed &);
-	~language() noexcept;
+	embed(cl::data *, const off_t, const gpt::model::embed &);
+	embed(cl::data *, const off_t, gpt::model::embed &);
 };

 struct ircd::gpt::pipe::model::decoder
@ -115,10 +108,11 @@ struct ircd::gpt::pipe::model::decoder

 	// Layer blocks
 	model::block
-	block[12];
+	layer[12];

-	// Final norm
-	tensor norm;
+	// Language model head
+	model::embed
+	embed;

 	decoder(const gpt::model::decoder &);
 	decoder(gpt::model::decoder &);
--- a/include/ircd/gpt/pipe/pipe.h
+++ b/include/ircd/gpt/pipe/pipe.h
@ -13,21 +13,21 @@

 namespace ircd::gpt::pipe
 {
-	struct model;
 	struct code;
+	struct model;
 	struct desc;
-	struct exec;
+	struct range;
+	struct cycle;
+	struct prof;

-	extern model *default_model;
-	extern code *default_code;
-	extern desc *default_desc;
-
-	void generate(task &);
+	extern conf::item<size_t> queue_cycles;

 	void init(), fini() noexcept;
 };

-#include "model.h"
 #include "code.h"
+#include "model.h"
 #include "desc.h"
-#include "exec.h"
+#include "range.h"
+#include "cycle.h"
+#include "prof.h"
--- a/include/ircd/gpt/pipe/prof.h
+++ b/include/ircd/gpt/pipe/prof.h
@ -0,0 +1,56 @@
+// Matrix Construct
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2022 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_GPT_PIPE_PROF_H
+
+namespace ircd::gpt::pipe
+{
+	string_view debug(const mutable_buffer &, const prof &, const size_t &pos);
+	string_view debug(const mutable_buffer &, const prof &);
+}
+
+/// Extract profiling information for a cycle. This object contains timing
+/// state integers for each corresponding stage of the cycle.
+///
+/// Default constructions initialize to zero and the state can also be used
+/// as an accumulator.
+struct ircd::gpt::pipe::prof
+{
+	static constexpr size_t stages
+	{
+		cycle::stages
+	};
+
+	static constexpr size_t phases
+	{
+		num_of<cl::work::prof::phase>()
+	};
+
+	using phase = cl::work::prof::phase;
+	using phase_array = cl::work::prof;
+	using stage_array = std::array<phase_array, stages>;
+	using info_type = std::tuple<string_view, int>;
+	using info_array = std::array<info_type, stages>;
+	using info_name_array = std::array<char[64], stages>;
+
+	static info_name_array name;
+	static info_array info;
+
+  private:
+	static bool init;
+	static void init_info(const pipe::cycle &);
+
+  public:
+	stage_array ts;
+
+	prof(const pipe::cycle &);
+	prof() noexcept;
+};
--- a/include/ircd/gpt/pipe/range.h
+++ b/include/ircd/gpt/pipe/range.h
@ -0,0 +1,39 @@
+// Matrix Construct
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2022 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_GPT_PIPE_RANGE_H
+
+struct ircd::gpt::pipe::range
+{
+	cl::kern::range
+	_full,
+	_last,
+	alloc,
+	embed,
+	attn,
+	ffnn,
+	fffnn,
+	fnorm,
+	logit,
+	logsm,
+	select,
+	prop_embed,
+	prop_norm,
+	prop_attn,
+	prop_ffnn;
+
+	range(const uint tick,
+	      const uint count,
+	      const uint tokens,
+	      const uint cached,
+	      const bool fwd,
+	      const bool rev) noexcept;
+};
--- a/include/ircd/gpt/samp.h
+++ b/include/ircd/gpt/samp.h
@ -0,0 +1,49 @@
+// Matrix Construct
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2022 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_GPT_SAMP_H
+
+/// Perform one task step on the device. The step is a sequence of cycles
+/// which generate tokens until satisfying a halting condition. The number of
+/// cycles for the step is limited by the size of the context buffer.
+///
+struct ircd::gpt::samp
+{
+	gpt::step &step;
+	pipe::desc &desc;
+
+	const gpt::opts &opts;
+	gpt::ctrl &ctrl;
+
+	const uint id;
+
+	int accept;
+	uint dispatch;
+	uint cycle;
+	uint tokens;
+	uint count;
+
+	pipe::prof profile;
+	std::deque<pipe::cycle> queue;
+
+  public:
+	void profile_accumulate(const pipe::prof &);
+	bool retire(pipe::cycle &, const gpt::ctrl &);
+	bool evaluate(pipe::cycle &);
+	uint tokenize();
+
+  public:
+	bool done() const noexcept;
+	bool operator()();
+
+	samp(gpt::step &);
+	~samp() noexcept;
+};
--- a/include/ircd/gpt/step.h
+++ b/include/ircd/gpt/step.h
@ -0,0 +1,40 @@
+// Matrix Construct
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2022 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_GPT_STEP_H
+
+/// Perform one task step on the device. The step is a sequence of cycles
+/// which generate tokens until satisfying a halting condition. The number of
+/// cycles for the step is limited by the size of the context buffer.
+///
+struct ircd::gpt::step
+{
+	gpt::epoch &epoch;
+	pipe::desc &desc;
+
+	const gpt::opts &opts;
+	gpt::ctrl &ctrl;
+
+	const uint id;
+	const uint start;
+
+	pipe::prof profile;
+
+	void profile_accumulate(const pipe::prof &);
+	bool backpropagate();
+
+  public:
+	bool done() const noexcept;
+	bool operator()();
+
+	step(gpt::epoch &);
+	~step() noexcept;
+};
--- a/include/ircd/gpt/task.h
+++ b/include/ircd/gpt/task.h
@ -12,6 +12,14 @@
 #define HAVE_IRCD_GPT_TASK_H

 #ifdef __cplusplus
+namespace ircd::gpt
+{
+	void seed(task &, const uint64_t &) noexcept;
+	void seed(task &) noexcept;
+	void clear(task &) noexcept;
+	void reset(task &) noexcept;
+}
+
 /// Task Context
 ///
 /// State for a task.
@ -22,27 +30,25 @@ struct ircd::gpt::task
 	/// Reference to the attached options.
 	const gpt::opts *opts {nullptr};

-	/// Reference to control pages.
+	/// Reference to user's control block.
 	gpt::ctrl *ctrl {nullptr};

-	/// Current task status.
-	enum status status {'\0'};
+	/// Pipe code
+	std::unique_ptr<pipe::code> code;
+
+	/// Pipe model
+	std::unique_ptr<pipe::model> model;
+
+	/// Pipe state
+	pipe::desc desc;
+
+  public:
+	bool done() const noexcept;
+	bool operator()();
+
+	task(const gpt::opts *     = nullptr,
+	     gpt::ctrl *           = nullptr);

-	task(const gpt::opts * = nullptr, gpt::ctrl * = nullptr);
 	~task() noexcept;
 };
-
-/// The current status of a task is indicated with intelligible characters
-enum ircd::gpt::task::status
-:char
-{
-	QUEUED    = 'Q',  ///< Queued for execution.
-	RUNNING   = 'R',  ///< Currently being executed.
-	ACCEPT    = 'A',  ///< Execution completed successfully.
-	ERROR     = 'E',  ///< Execution did not complete successfully.
-};
-
-static_assert(sizeof(struct ircd_gpt_ctrl) == 4096);
-static_assert(offsetof(struct ircd_gpt_ctrl, token) == 2048);
-static_assert(std::is_standard_layout<struct ircd_gpt_ctrl>::value);
 #endif
--- a/include/ircd/gpt/token.h
+++ b/include/ircd/gpt/token.h
@ -11,127 +11,67 @@
 #pragma once
 #define HAVE_IRCD_GPT_TOKEN_H

-union ircd_gpt_token
+namespace ircd::gpt
 {
-	float
-	word[768],
-	attn[12][64];
+	struct token;
+}
+
+/// Token is just a 16-bit index into the vocabulary. This lightweight wrapper
+/// convenience constructs from a string lookup or from a u16 directly.
+class ircd::gpt::token
+{
+	uint16_t val;
+
+  public:
+	operator const uint16_t &() const;
+	operator uint16_t &();
+
+	operator string_view() const;
+
+	token(const_buffer &buf) noexcept;
+	token(const string_view &);
+	token(const uint16_t &) noexcept;
 };

-#ifdef __OPENCL_C_VERSION__
-union ircd_gpt_tokenv
+static_assert(sizeof(ircd::gpt::token) == sizeof(uint16_t));
+static_assert(std::is_standard_layout<ircd::gpt::token>::value);
+
+/// Direct construction; no lookup
+inline
+ircd::gpt::token::token(const uint16_t &val)
+noexcept
+:val{val}
+{}
+
+/// Must resolve to one token or error thrown.
+inline
+ircd::gpt::token::token(const string_view &str)
+:val{vocab::tokenize(str)}
+{}
+
+/// Consumes input for one token off front of buf
+inline
+ircd::gpt::token::token(const_buffer &buf)
+noexcept
+:val{vocab::tokenize(buf)}
+{}
+
+inline ircd::gpt::token::operator
+string_view()
+const
 {
-	float4
-	word[768/4],
-	attn[12][64/4];
+	return vocab::token[val];
+}

-	union ircd_gpt_token
-	token;
-};
-#endif
-
-struct ircd_gpt_attn_qkv
+inline ircd::gpt::token::operator
+uint16_t &()
 {
-	union ircd_gpt_token
-	qry,
-	key,
-	val;
-};
+	return val;
+}

-#ifdef __OPENCL_C_VERSION__
-struct ircd_gpt_attn_qkvv
+inline ircd::gpt::token::operator
+const uint16_t &()
+const
 {
-	union ircd_gpt_tokenv
-	qry,
-	key,
-	val;
-};
-#endif
-
-union ircd_gpt_attn_aperature
-{
-	float
-	fcon[2304],
-	proj[3][768],
-	qkv[3][12][64];
-
-	union ircd_gpt_token
-	token[3];
-};
-
-#ifdef __OPENCL_C_VERSION__
-union ircd_gpt_attn_aperaturev
-{
-	float4
-	fcon[2304/4],
-	proj[3][768/4],
-	qkv[3][12][64/4];
-
-	union ircd_gpt_token_f32x4
-	token[3];
-};
-
-union ircd_gpt_attn_aperature_f32x8
-{
-	float8
-	fcon[2304/8],
-	proj[3][768/8],
-	qkv[3][12][64/8];
-
-	union ircd_gpt_token_f32x8
-	token[3];
-};
-
-union ircd_gpt_attn_aperature_f32x16
-{
-	float16
-	fcon[2304/16],
-	proj[3][768/16],
-	qkv[3][12][64/16];
-
-	union ircd_gpt_token_f32x16
-	token[3];
-};
-#endif
-
-union ircd_gpt_ffnn_aperature
-{
-	float
-	fcon[3072],
-	proj[4][768];
-
-	union ircd_gpt_token
-	token[4];
-};
-
-#ifdef __OPENCL_C_VERSION__
-union ircd_gpt_ffnn_aperaturev
-{
-	float4
-	fcon[3072/4],
-	proj[4][768/4];
-
-	union ircd_gpt_token_f32x4
-	token[4];
-};
-
-union ircd_gpt_ffnn_aperature_f32x8
-{
-	float8
-	fcon[3072/8],
-	proj[4][768/8];
-
-	union ircd_gpt_token_f32x4
-	token[4];
-};
-
-union ircd_gpt_ffnn_aperature_f32x16
-{
-	float16
-	fcon[3072/16],
-	proj[4][768/16];
-
-	union ircd_gpt_token_f32x4
-	token[4];
-};
-#endif
+	return val;
+}
--- a/include/ircd/gpt/vector.h
+++ b/include/ircd/gpt/vector.h
@ -0,0 +1,147 @@
+// Matrix Construct
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2022 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_GPT_VECTOR_H
+
+#if !defined(__SIZEOF_FLOAT4__) && defined(__OPENCL_VERSION__)
+	#define __SIZEOF_FLOAT4__ 16
+#endif
+
+#if !defined(__SIZEOF_FLOAT8__) && defined(__OPENCL_VERSION__)
+	#define __SIZEOF_FLOAT8__ 32
+#endif
+
+#if !defined(__SIZEOF_FLOAT16__) && defined(__OPENCL_VERSION__)
+	#define __SIZEOF_FLOAT16__ 64
+#endif
+
+#ifndef __OPENCL_VERSION__
+	#define __constant
+#endif
+
+static __constant const uint
+ircd_gpt_context_tokens = 512, // 1024,
+ircd_gpt_vector_elems = 768,
+ircd_gpt_attn_rank = 12,
+ircd_gpt_attn_segs = 3,
+ircd_gpt_ffnn_segs = 4;
+
+static __constant const uint
+ircd_gpt_vector_attn_elems = ircd_gpt_vector_elems / ircd_gpt_attn_rank,
+ircd_gpt_attn_fcon_elems = ircd_gpt_vector_elems * ircd_gpt_attn_segs,
+ircd_gpt_ffnn_fcon_elems = ircd_gpt_vector_elems * ircd_gpt_ffnn_segs;
+
+//
+// embed vector
+//
+
+#if defined(__SIZEOF_FLOAT__)
+union ircd_gpt_vector
+{
+	float
+	elem[ircd_gpt_vector_elems],
+	attn[ircd_gpt_attn_rank][ircd_gpt_vector_attn_elems];
+};
+#endif
+
+#if defined(__SIZEOF_FLOAT4__)
+union ircd_gpt_vector_f32x4
+{
+	float4
+	elem[ircd_gpt_vector_elems / 4],
+	attn[ircd_gpt_attn_rank][ircd_gpt_vector_attn_elems / 4];
+
+	union ircd_gpt_vector
+	vector;
+};
+#endif
+
+//
+// attn qkv
+//
+
+#if defined(__SIZEOF_FLOAT__)
+struct ircd_gpt_attn_qkv
+{
+	union ircd_gpt_vector
+	qry,
+	key,
+	val;
+};
+#endif
+
+#if defined(__SIZEOF_FLOAT4__)
+struct ircd_gpt_attn_qkv_f32x4
+{
+	union ircd_gpt_vector_f32x4
+	qry,
+	key,
+	val;
+};
+#endif
+
+//
+// attn aperature
+//
+
+#if defined(__SIZEOF_FLOAT__)
+union ircd_gpt_attn_aperature
+{
+	float
+	fcon[ircd_gpt_attn_fcon_elems],
+	proj[ircd_gpt_attn_segs][ircd_gpt_vector_elems],
+	qkv[ircd_gpt_attn_segs][ircd_gpt_attn_rank][ircd_gpt_vector_attn_elems];
+
+	union ircd_gpt_vector
+	vector[ircd_gpt_attn_segs];
+};
+#endif
+
+#if defined(__SIZEOF_FLOAT4__)
+union ircd_gpt_attn_aperature_f32x4
+{
+	float4
+	fcon[ircd_gpt_attn_fcon_elems / 4],
+	proj[ircd_gpt_attn_segs][ircd_gpt_vector_elems / 4],
+	qkv[ircd_gpt_attn_segs][ircd_gpt_attn_rank][ircd_gpt_vector_attn_elems / 4];
+
+	union ircd_gpt_vector_f32x4
+	vector[ircd_gpt_attn_segs];
+};
+#endif
+
+//
+// ffnn aperature
+//
+
+#if defined(__SIZEOF_FLOAT__)
+union ircd_gpt_ffnn_aperature
+{
+	float
+	fcon[ircd_gpt_ffnn_fcon_elems],
+	proj[ircd_gpt_ffnn_segs][ircd_gpt_vector_elems];
+
+	union ircd_gpt_vector
+	vector[ircd_gpt_ffnn_segs];
+};
+#endif
+
+#if defined(__SIZEOF_FLOAT4__)
+union ircd_gpt_ffnn_aperature_f32x4
+{
+	float4
+	fcon[ircd_gpt_ffnn_fcon_elems / 4],
+	proj[ircd_gpt_ffnn_segs][ircd_gpt_vector_elems / 4];
+
+	union ircd_gpt_vector_f32x4
+	vector[ircd_gpt_ffnn_segs];
+};
+#endif
--- a/include/ircd/gpt/vocab.h
+++ b/include/ircd/gpt/vocab.h
@ -34,11 +34,17 @@ namespace ircd::gpt::vocab
 	merges_path;

 	// Tokenize UTF-8 input string of any length into proper token values,
-	vector_view<u16> tokenize(const vector_view<u16> &out, const string_view &in);
+	vector_view<u16> tokenize(const vector_view<u16> &out, const string_view &in) noexcept;
+
+	// Tokenize one token. The buffer is advanced consuming one token per call.
+	u16 tokenize(const_buffer &) noexcept;
+
+	// Tokenize one token. Error thrown if input is not exactly one token.
+	u16 tokenize(const string_view &in);

 	// Decode token values to build output text string.
-	string_view detokenize(const mutable_buffer &out, const vector_view<const u16> &in);
+	string_view detokenize(const mutable_buffer &out, const vector_view<const u16> &in) noexcept;

 	// Other tools
-	string_view debug(const mutable_buffer &buf, const u16 token);
+	string_view debug(const mutable_buffer &buf, const u16 token, const uint fmt_msk = -1U);
 }
--- a/ircd/Makefile.am
+++ b/ircd/Makefile.am
@ -53,6 +53,7 @@ if LTO
 if CLANG
 AM_CXXFLAGS += -fstrict-vtable-pointers
 AM_CXXFLAGS += -fwhole-program-vtables
+#AM_LDFLAGS += -Wl,-plugin-opt,-pass-remarks='.*'
 endif
 endif

@ -87,22 +88,6 @@ AM_LDFLAGS += -Wl,--enable-runtime-pseudo-reloc
 AM_LDFLAGS += -export-symbols-regex '*'
 endif

-ROCKSDB_SRC_CPPFLAGS =#
-ROCKSDB_SRC_CPPFLAGS += -isystem $(top_srcdir)/deps/rocksdb/include
-ROCKSDB_SRC_CPPFLAGS += -isystem $(top_srcdir)/deps/rocksdb
-
-GPT_FP_CXXFLAGS =#
-GPT_FP_CXXFLAGS += -fno-math-errno
-GPT_FP_CXXFLAGS += -fno-trapping-math
-GPT_FP_CXXFLAGS += -ffinite-math-only
-GPT_FP_CXXFLAGS += -fno-signed-zeros
-GPT_FP_CXXFLAGS += -fassociative-math
-GPT_FP_CXXFLAGS += -ffp-contract=fast
-GPT_FP_CXXFLAGS += -freciprocal-math
-if CLANG
-GPT_FP_CXXFLAGS += -fdenormal-fp-math=positive-zero
-endif
-
 libircddir = @libdir@
 libircd_LTLIBRARIES = libircd.la

@ -241,16 +226,12 @@ libircd_la_SOURCES += png.cc
 if OPENCL
 libircd_la_SOURCES += cl.cc
 endif
-libircd_la_SOURCES += gpt.cc
-libircd_la_SOURCES += gpt_pipe.cc
-libircd_la_SOURCES += gpt_model.cc
 libircd_la_SOURCES += gpt_vocab.cc
+libircd_la_SOURCES += gpt_model.cc
+libircd_la_SOURCES += gpt_pipe_code.cc
+libircd_la_SOURCES += gpt_pipe.cc
 libircd_la_SOURCES += gpt_cpu.cc
-if OPENCL
-if CLANG
-BUILT_SOURCES += gpt_gpu.o
-endif
-endif
+libircd_la_SOURCES += gpt.cc
 libircd_la_SOURCES += openssl.cc
 libircd_la_SOURCES += rfc1459.cc
 libircd_la_SOURCES += rfc3986.cc
@ -283,6 +264,23 @@ libircd_la_SOURCES += ircd.cc
 # Specific unit option composition
 #

+ROCKSDB_SRC_CPPFLAGS =#
+ROCKSDB_SRC_CPPFLAGS += -isystem $(top_srcdir)/deps/rocksdb/include
+ROCKSDB_SRC_CPPFLAGS += -isystem $(top_srcdir)/deps/rocksdb
+
+GPT_FP_CXXFLAGS =#
+GPT_FP_CXXFLAGS += -fno-math-errno
+GPT_FP_CXXFLAGS += -fno-trapping-math
+GPT_FP_CXXFLAGS += -ffinite-math-only
+GPT_FP_CXXFLAGS += -fno-signed-zeros
+GPT_FP_CXXFLAGS += -fassociative-math
+GPT_FP_CXXFLAGS += -freciprocal-math
+GPT_FP_CXXFLAGS += -ffp-contract=fast
+if CLANG
+GPT_FP_CXXFLAGS += -fdenormal-fp-math=positive-zero
+GPT_FP_CXXFLAGS += -ffp-model=fast
+endif
+
 client.lo:            AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
 ctx_x86_64.lo:        AM_CPPFLAGS := -I$(top_srcdir)/include
 ctx.lo:               AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
@ -307,7 +305,7 @@ endif
 if IOU
 fs_iou.lo:            AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
 endif
-gpt.lo:               AM_CXXFLAGS := ${AM_CXXFLAGS} ${GPT_FP_CXXFLAGS}
+gpt_cpu.lo:           AM_CXXFLAGS := ${AM_CXXFLAGS} ${GPT_FP_CXXFLAGS}
 http.lo:              AM_CPPFLAGS := ${SPIRIT_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
 http.lo:              AM_CXXFLAGS := ${SPIRIT_UNIT_CXXFLAGS} ${AM_CXXFLAGS}
 ios.lo:               AM_CPPFLAGS := ${ASIO_UNIT_CPPFLAGS} ${AM_CPPFLAGS}
@ -364,12 +362,118 @@ default.profdata:
 	-$(LLVM_PROFDATA) merge -output=default.profdata default.proftext
 endif

+###############################################################################
+#
+# Hardware Acceleration / sub-targets
+#
+
 if CLANG
 if OPENCL
-gpt_gpu.ll: gpt_gpu.cl
-	$(CC) -S -emit-llvm -std=CL1.1 $(AM_CPPFLAGS) $(CPPFLAGS) $(DEFS) -Xclang -finclude-default-header -include "ircd/config.h" -o $@ -c $^

-gpt_gpu.o: gpt_gpu.ll
-	$(CC) -std=CL1.1 -o $@ -c $^
-endif
-endif
+GPU_CPPFLAGS =#
+GPU_CPPFLAGS += -D__OPENCL_VERSION__=110
+GPU_CPPFLAGS += -Dcl_clang_storage_class_specifiers
+GPU_CPPFLAGS += -DNOFP64
+GPU_CPPFLAGS += $(AM_CPPFLAGS)
+GPU_CPPFLAGS += $(CPPFLAGS)
+GPU_CPPFLAGS += $(DEFS)
+
+GPU_CFLAGS =#
+GPU_CFLAGS += -std=cl1.1
+GPU_CFLAGS += -fident
+GPU_CFLAGS += -fno-builtin
+GPU_CFLAGS += -fverbose-asm
+GPU_CFLAGS += -fno-discard-value-names
+GPU_CFLAGS += -mllvm -propagate-attrs=true
+GPU_CFLAGS += -O3
+GPU_CFLAGS += -fno-trapping-math
+GPU_CFLAGS += -ffinite-math-only
+GPU_CFLAGS += -fno-signed-zeros
+GPU_CFLAGS += -ffp-contract=fast
+#GPU_CFLAGS += -include "ircd/config.h"
+#GPU_CFLAGS += -include "clc/clc.h"
+#GPU_CFLAGS += -Wassume
+#GPU_CFLAGS += -Rpass-analysis=".*"
+#GPU_CFLAGS += -Rpass-missed=".*"
+
+GPU_LINKFLAGS =#
+GPU_LINKFLAGS +=#
+
+GPU_OPTFLAGS =#
+GPU_OPTFLAGS += -propagate-attrs=true
+GPU_OPTFLAGS += -O3
+#GPU_OPTFLAGS += -opt-bisect-limit=-1
+#GPU_OPTFLAGS += -debug-pass=Arguments
+#GPU_OPTFLAGS += -pass-remarks='.*'
+
+GPU_ASFLAGS =#
+GPU_ASFLAGS += -fident
+GPU_ASFLAGS += -fno-builtin
+GPU_ASFLAGS += -mllvm -propagate-attrs=true
+GPU_ASFLAGS += -mllvm -verify-machineinstrs
+GPU_ASFLAGS += -O3
+GPU_ASFLAGS += -Rpass-analysis=asm-printer
+#GPU_ASFLAGS += -Rpass-analysis=".*"
+#GPU_ASFLAGS += -mllvm -debug-pass=Arguments
+#GPU_ASFLAGS += -mllvm -pass-remarks='.*'
+
+#
+# SPV
+#
+
+BUILT_SOURCES += gpt_gpu.spv.bc
+gpt_gpu.spv.bc: gpt_gpu.cl
+	clang-13 -target spir-- $(GPU_CPPFLAGS) -O0 -emit-llvm -o $@ -x cl -c $^
+
+BUILT_SOURCES += gpt_gpu.spv
+gpt_gpu.spv: gpt_gpu.spv.bc
+	llvm-spirv -o $@ $^
+
+CLEANFILES += gpt_gpu.spv.cc
+libircd_la_SOURCES += gpt_gpu.spv.cc
+gpt_gpu.spv.cc: gpt_gpu.spv
+	xxd -i $^ $@
+
+#
+# R600
+#
+
+R600_TARGET = r600--
+
+#
+# R600 Saint Barthélemy
+#
+
+R600_BARTS_CFLAGS = $(GPU_CFLAGS)
+R600_BARTS_CFLAGS += -target $(R600_TARGET)
+R600_BARTS_CFLAGS += -mcpu=barts
+R600_BARTS_CFLAGS += -Xclang -mlink-bitcode-file -Xclang /usr/lib/clc/barts-r600--.bc
+
+BUILT_SOURCES += gpt_gpu.r600_barts.bc
+gpt_gpu.r600_barts.bc: gpt_gpu.cl
+	$(CC) $(R600_BARTS_CFLAGS) $(GPU_CPPFLAGS) -emit-llvm -o $@ -x cl -c $^
+
+#BUILT_SOURCES += gpt_gpu.r600_barts.link.bc
+gpt_gpu.r600_barts.link.bc: gpt_gpu.r600_barts.bc
+	llvm-link-14 $(GPU_LINKFLAGS) -o $@ $^ /usr/lib/clc/barts-r600--.bc
+	opt-14 $(GPU_OPTFLAGS) -o $@ $@
+
+#BUILT_SOURCES += gpt_gpu.r600_barts.s
+gpt_gpu.r600_barts.s: gpt_gpu.r600_barts.link.bc
+	$(CC) -cc1 $(GPU_ASFLAGS) -triple $(R600_TARGET) -emit-obj -S -o $@ -x ir $^
+
+#BUILT_SOURCES += gpt_gpu.r600_barts.o
+gpt_gpu.r600_barts.o: gpt_gpu.r600_barts.link.bc
+	$(CC) -cc1 $(GPU_ASFLAGS) -triple $(R600_TARGET) -emit-obj -o $@ -x ir $^
+
+CLEANFILES += gpt_gpu.r600_barts.bc.cc
+libircd_la_SOURCES += gpt_gpu.r600_barts.bc.cc
+gpt_gpu.r600_barts.bc.cc: gpt_gpu.r600_barts.bc
+	xxd -i $^ $@
+
+#
+#
+#
+
+endif # OPENCL
+endif # CLANG
--- a/ircd/gpt.cc
+++ b/ircd/gpt.cc
--- a/ircd/gpt_cpu.cc
+++ b/ircd/gpt_cpu.cc
@ -8,17 +8,21 @@
 // copyright notice and this permission notice is present in all copies. The
 // full license for this software is available in the LICENSE file.

+#pragma clang fp exceptions(ignore)
+#pragma clang fp reassociate(on)
+#pragma clang fp contract(fast)
+
 namespace ircd::gpt
 {
-	static size_t adamw(f32x4 &, f32x4 &, f32x4 &, const f32, const f32, const f32, const f32, const u32, size_t);
-	static size_t adamw(task &, const f32, f32 *, const size_t, f32 *const (&)[2], const size_t);
+	static void adamw(const opts &, const u32, const f32, const uint, f32 *, f32 *, f32 *);

-	static size_t backprop(task &, const f32, model::norm &, f32 *const (&)[2], size_t);
-	static size_t backprop(task &, const f32, model::attn &, f32 *const (&)[2], size_t);
-	static size_t backprop(task &, const f32, model::ffnn &, f32 *const (&)[2], size_t);
-	static size_t backprop(task &, const f32, model::block &, f32 *const (&)[2], size_t);
-	static size_t backprop(task &, const f32, model::embed &, f32 *const (&)[2], size_t);
-	extern size_t backprop(task &, const f32, model::decoder &, f32 *const (&)[2], size_t = 0);
+	static void backprop(const opts &, const u32, const f32, model::norm &, model::norm &, model::norm &);
+	static void backprop(const opts &, const u32, const f32, model::attn &, model::attn &, model::attn &);
+	static void backprop(const opts &, const u32, const f32, model::ffnn &, model::ffnn &, model::ffnn &);
+	static void backprop(const opts &, const u32, const f32, model::block &, model::block &, model::block &);
+	static void backprop(const opts &, const u32, const f32, model::embed &, model::embed &, model::embed &);
+	static void backprop(const opts &, const u32, const f32, model::decoder &, model::decoder &, model::decoder &);
+	extern void backprop(const opts &, const u32, const f32, model::decoder &, f32 *const __restrict__ [2]) noexcept;

 	template<class T>
 	static void fmma(T *out, const T *in, const T *bias, const T *weight, const math::fmma_opts &);
@ -37,7 +41,7 @@ namespace ircd::gpt
 	static void logits(float *, const float *, const model::decoder &);
 	static void tail(float *, const float *, const model::decoder &);
 	static u16 argmax(const float *, const opts &);
-	static void embed(float *, const u16 token, const u16 position, const opts &);
+	static void embed(float *, const u16 token, const u16 position, const model::decoder &);

 	static f32
 	logit alignas(64) [65536],
@ -49,21 +53,20 @@ void
 ircd::gpt::embed(float *const out,
                 const u16 token,
                 const u16 position,
-                 const opts &opts)
+                 const model::decoder &model)
 {
-	assert(opts.model);
 	const auto &wpe
 	{
-		opts.model->word.pos[position]
+		model.embed.pos[position]
 	};

 	const auto &wte
 	{
-		opts.model->word.token[token]
+		model.embed.token[token]
 	};

 	for(uint j(0); j < 768; ++j)
-		out[j] = wte[j] + wpe[j];
+		out[j] = wte.elem[j] + wpe.elem[j];
 }

 uint16_t
@ -117,7 +120,7 @@ ircd::gpt::tail(float *const __restrict__ logit,
 	for(uint i(0); i < 768; ++i)
 		buf[0][i] = state[i];

-	norm((f32x4 *)buf[0], (const f32x4 *)state, (const f32x4 *)d.f.bias, (const f32x4 *)d.f.weight, lnf_epsilon);
+	norm((f32x4 *)buf[0], (const f32x4 *)state, (const f32x4 *)d.embed.norm.bias.elem, (const f32x4 *)d.embed.norm.weight.elem, lnf_epsilon);
 	logits(logit, buf[0], d);
 	//logitsmax(logit, logit, vocab::tokens);
 }
@ -132,7 +135,7 @@ ircd::gpt::logits(float *const __restrict__ out,

 	for(uint j(0); j < vocab::tokens; ++j)
 		for(uint k(0); k < 768; ++k)
-			out[j] += in[k] * d.word.token[j][k];
+			out[j] += in[k] * d.embed.token[j].elem[k];
 }

 [[gnu::noinline]]
@ -192,7 +195,7 @@ ircd::gpt::coil(float *__restrict__ accum,
 		};

 		for(uint j(0); j < tokens; ++j)
-			fmma((f32x4 *)(accum + j * 768), (const f32x4 *)(a[j]), (const f32x4 *)layer.attn.proj_bias, (const f32x4 *)layer.attn.proj_weight, fmma_opts);
+			fmma((f32x4 *)(accum + j * 768), (const f32x4 *)(a[j]), (const f32x4 *)layer.attn.proj_bias.elem, (const f32x4 *)layer.attn.proj_weight, fmma_opts);

 		for(uint j(0); j < tokens; ++j)
 			ffnn(accum + j * 768, accum + j * 768, decoder, i);
@ -227,7 +230,7 @@ ircd::gpt::attn(float (&__restrict__ out)[3][1024][12][64],
 		buf alignas(64) [768],
 		proj alignas(64) [2304];

-		norm((f32x4 *)buf, (const f32x4 *)(in + i * 768), (const f32x4 *)layer.ln1.bias, (const f32x4 *)layer.ln1.weight, ln1_epsilon);
+		norm((f32x4 *)buf, (const f32x4 *)(in + i * 768), (const f32x4 *)layer.attn.norm.bias.elem, (const f32x4 *)layer.attn.norm.weight.elem, ln1_epsilon);

 		static const math::fmma_opts fmma_opts
 		{
@ -235,7 +238,7 @@ ircd::gpt::attn(float (&__restrict__ out)[3][1024][12][64],
 		};

 		memset(proj, 0x0, sizeof(proj));
-		fmma((f32x4 *)proj, (const f32x4 *)buf, (const f32x4 *)layer.attn.attn_bias, (const f32x4 *)layer.attn.attn_weight, fmma_opts);
+		fmma((f32x4 *)proj, (const f32x4 *)buf, (const f32x4 *)layer.attn.fcon_bias.fcon, (const f32x4 *)layer.attn.fcon_weight, fmma_opts);

 		#pragma clang loop unroll (disable)
 		for(uint j(0); j < 12; ++j)
@ -372,10 +375,10 @@ ircd::gpt::ffnn(float *const out,
 	buf2 alignas(64) [3072];

 	memset(buf2, 0x0, sizeof(buf2));
-	norm((f32x4 *)buf, (const f32x4 *)in, (const f32x4 *)layer.ln2.bias, (const f32x4 *)layer.ln2.weight, ln2_epsilon);
-	fmma((f32x4 *)buf2, (const f32x4 *)buf, (const f32x4 *)layer.ffnn.fc_bias, (const f32x4 *)layer.ffnn.fc_weight, fmma3_opts);
+	norm((f32x4 *)buf, (const f32x4 *)in, (const f32x4 *)layer.ffnn.norm.bias.elem, (const f32x4 *)layer.ffnn.norm.weight.elem, ln2_epsilon);
+	fmma((f32x4 *)buf2, (const f32x4 *)buf, (const f32x4 *)layer.ffnn.fcon_bias.fcon, (const f32x4 *)layer.ffnn.fcon_weight, fmma3_opts);
 	gelu((f32x4 *)buf2, (const f32x4 *)buf2);
-	fmma((f32x4 *)out, (const f32x4 *)buf2, (const f32x4 *)layer.ffnn.proj_bias, (const f32x4 *)layer.ffnn.proj_weight, fmma4_opts);
+	fmma((f32x4 *)out, (const f32x4 *)buf2, (const f32x4 *)layer.ffnn.proj_bias.elem, (const f32x4 *)layer.ffnn.proj_weight, fmma4_opts);
 }

 void
@ -431,213 +434,219 @@ ircd::gpt::gelu(f32x4 &out,
 //

 [[gnu::noinline]]
-size_t
-ircd::gpt::backprop(task &task,
+void
+ircd::gpt::backprop(const opts &opts,
+                    const u32 step,
                    const f32 grad,
-                    model::decoder &param,
-                    f32 *const (&moment)[2],
-                    size_t off)
+                    model::decoder &__restrict__ param,
+                    f32 *const __restrict__ buf[2])
+noexcept
 {
-	for(uint i(0); i < 12; ++i)
-		off = backprop(task, grad, param.layer[i], moment, off);
-
-	off = backprop(task, grad, param.f, moment, off);
-	off = backprop(task, grad, param.word, moment, off);
-	return off;
-}
-
-size_t
-ircd::gpt::backprop(task &task,
-                    const f32 grad,
-                    model::embed &param,
-                    f32 *const (&moment)[2],
-                    size_t off)
-{
-	assert(task.opts);
-	const auto &opts
+	model::decoder *const __restrict__ moment[2]
 	{
-		*task.opts
+		reinterpret_cast<model::decoder *>(buf[0]),
+		reinterpret_cast<model::decoder *>(buf[1]),
 	};

+	backprop(opts, step, grad, param, *moment[0], *moment[1]);
+}
+
+void
+ircd::gpt::backprop(const opts &opts,
+                    const u32 step,
+                    const f32 grad,
+                    model::decoder &__restrict__ param,
+                    model::decoder &__restrict__ moment0,
+                    model::decoder &__restrict__ moment1)
+{
+	fpe::errors_handle eh;
+
+	assume(opts.attn_rank > 0);
+	assume(opts.layers > 0);
+	const auto eln
+	{
+		opts.layers - 1 // step % opts.layers
+	};
+
+	for(int i(opts.layers - 1); i >= int(opts.layers - eln - 1); --i)
+	{
+		assert(i >= 0 && i < int(opts.layers));
+		backprop(opts, step, grad, param.layer[i], moment0.layer[i], moment1.layer[i]);
+	}
+
+	backprop(opts, step, grad, param.embed, moment0.embed, moment1.embed);
+
+	auto pending(eh.pending());
+	eh.clear_pending();
+	pending &= ~pending & FE_INEXACT;
+	if(unlikely(pending))
+		fpe::throw_errors(pending);
+}
+
+void
+ircd::gpt::backprop(const opts &opts,
+                    const u32 step,
+                    const f32 grad,
+                    model::embed &__restrict__ param,
+                    model::embed &__restrict__ moment0,
+                    model::embed &__restrict__ moment1)
+{
+	backprop(opts, step, grad, param.norm, moment0.norm, moment1.norm);
+
+	assume(opts.context_tokens > 0);
 	for(uint i(0); i < opts.context_tokens; ++i)
-		off = adamw(task, grad, param.pos[i], 768, moment, off);
+		adamw(opts, step, grad, 768, param.pos[i].elem, moment0.pos[i].elem, moment1.pos[i].elem);

+	assume(opts.logits > 0);
 	for(uint i(0); i < opts.logits; ++i)
-		off = adamw(task, grad, param.token[i], 768, moment, off);
-
-	return off;
+		adamw(opts, step, grad, 768, param.token[i].elem, moment0.token[i].elem, moment1.token[i].elem);
 }

-size_t
-ircd::gpt::backprop(task &task,
+void
+ircd::gpt::backprop(const opts &opts,
+                    const u32 step,
                    const f32 grad,
-                    model::block &param,
-                    f32 *const (&moment)[2],
-                    size_t off)
+                    model::block &__restrict__ param,
+                    model::block &__restrict__ moment0,
+                    model::block &__restrict__ moment1)
 {
-	off = backprop(task, grad, param.ln1, moment, off);
-	off = backprop(task, grad, param.attn, moment, off);
-	off = backprop(task, grad, param.ln2, moment, off);
-	off = backprop(task, grad, param.ffnn, moment, off);
-	return off;
+	backprop(opts, step, grad, param.attn.norm, moment0.attn.norm, moment1.attn.norm);
+	backprop(opts, step, grad, param.attn, moment0.attn, moment1.attn);
+
+	backprop(opts, step, grad, param.ffnn.norm, moment0.ffnn.norm, moment1.ffnn.norm);
+	backprop(opts, step, grad, param.ffnn, moment0.ffnn, moment1.ffnn);
 }

-size_t
-ircd::gpt::backprop(task &task,
+void
+ircd::gpt::backprop(const opts &opts,
+                    const u32 step,
                    const f32 grad,
-                    model::attn &param,
-                    f32 *const (&moment)[2],
-                    size_t off)
+                    model::attn &__restrict__ param,
+                    model::attn &__restrict__ moment0,
+                    model::attn &__restrict__ moment1)
 {
-	off = adamw(task, grad, param.attn_bias, 2304, moment, off);
+	adamw(opts, step, grad, 2304, param.fcon_bias.fcon, moment0.fcon_bias.fcon, moment1.fcon_bias.fcon);

 	for(uint i(0); i < 768; ++i)
-		off = adamw(task, grad, param.attn_weight[i], 2304, moment, off);
+		adamw(opts, step, grad, 2304, param.fcon_weight[i].fcon, moment0.fcon_weight[i].fcon, moment1.fcon_weight[i].fcon);

-	off = adamw(task, grad, param.proj_bias, 768, moment, off);
+	adamw(opts, step, grad, 768, param.proj_bias.elem, moment0.proj_bias.elem, moment1.proj_bias.elem);

 	for(uint i(0); i < 768; ++i)
-		off = adamw(task, grad, param.proj_weight[i], 768, moment, off);
-
-	return off;
+		adamw(opts, step, grad, 768, param.proj_weight[i].elem, moment0.proj_weight[i].elem, moment1.proj_weight[i].elem);
 }

-size_t
-ircd::gpt::backprop(task &task,
+void
+ircd::gpt::backprop(const opts &opts,
+                    const u32 step,
                    const f32 grad,
-                    model::ffnn &param,
-                    f32 *const (&moment)[2],
-                    size_t off)
+                    model::ffnn &__restrict__ param,
+                    model::ffnn &__restrict__ moment0,
+                    model::ffnn &__restrict__ moment1)
 {
-	off = adamw(task, grad, param.fc_bias, 3072, moment, off);
+	adamw(opts, step, grad, 3072, param.fcon_bias.fcon, moment0.fcon_bias.fcon, moment1.fcon_bias.fcon);

 	for(uint i(0); i < 768; ++i)
-		off = adamw(task, grad, param.fc_weight[i], 3072, moment, off);
+		adamw(opts, step, grad, 3072, param.fcon_weight[i].fcon, moment0.fcon_weight[i].fcon, moment1.fcon_weight[i].fcon);

-	off = adamw(task, grad, param.proj_bias, 768, moment, off);
+	adamw(opts, step, grad, 768, param.proj_bias.elem, moment0.proj_bias.elem, moment1.proj_bias.elem);

 	for(uint i(0); i < 3072; ++i)
-		off = adamw(task, grad, param.proj_weight[i], 768, moment, off);
-
-	return off;
+		adamw(opts, step, grad, 768, param.proj_weight[i].elem, moment0.proj_weight[i].elem, moment1.proj_weight[i].elem);
 }

-size_t
-ircd::gpt::backprop(task &task,
+void
+ircd::gpt::backprop(const opts &opts,
+                    const u32 step,
                    const f32 grad,
-                    model::norm &param,
-                    f32 *const (&moment)[2],
-                    size_t off)
+                    model::norm &__restrict__ param,
+                    model::norm &__restrict__ moment0,
+                    model::norm &__restrict__ moment1)
 {
-	off = adamw(task, grad, param.bias, 768, moment, off);
-	off = adamw(task, grad, param.weight, 768, moment, off);
-	return off;
+	adamw(opts, step, grad, 768, param.bias.elem, moment0.bias.elem, moment1.bias.elem);
+	adamw(opts, step, grad, 768, param.weight.elem, moment0.weight.elem, moment1.weight.elem);
 }

-[[gnu::noinline]]
-size_t
-ircd::gpt::adamw(task &task,
-                 const f32 grad,
-                 f32 *const p_,
-                 const size_t num,
-                 f32 *const (&__restrict__ m_)[2],
-                 size_t off)
+namespace ircd::gpt
 {
-	assert(task.opts);
-	const auto &opts
+	static f32x4 adamw_moment(const f32x4, const f32, const f32);
+	static f32x4 adamw_numer(const f32x4, const f32, const u32);
+	static f32x4 adamw_denom(const f32x4, const f32, const u32);
+	static f32x4 adamw_delta(const f32x4, const f32x4, const f32, const f32, const f32, const u32);
+	static void adamw(f32x4 &, f32x4 &, f32x4 &, const f32, const f32, const f32, const f32, const u32);
+	static void adamw(const opts &, const u32, const f32, const u32, f32 *, f32 *, f32 *);
+}
+
+void
+ircd::gpt::adamw(const opts &opts,
+                 const u32 step,
+                 const f32 grad,
+                 const u32 num,
+                 f32 *const __restrict__ param,
+                 f32 *const __restrict__ moment0,
+                 f32 *const __restrict__ moment1)
+{
+	f32x4 *const __restrict__ val[3]
 	{
-		*task.opts
+		reinterpret_cast<f32x4 *>(param),
+		reinterpret_cast<f32x4 *>(moment0),
+		reinterpret_cast<f32x4 *>(moment1),
 	};

-	assert(task.ctrl);
-	auto &ctrl
+	const auto n
 	{
-		*task.ctrl
+		num / 4
 	};

-	f32x4 *const p[3]
-	{
-		reinterpret_cast<f32x4 *>(p_),
-		reinterpret_cast<f32x4 *>(m_[0]) + off,
-		reinterpret_cast<f32x4 *>(m_[1]) + off,
-	};
-
-	assert(num >= 4);
-	const uint n
-	{
-		uint(num) / 4
-	};
-
-	// Assume loop body always taken w/o soundness; otherwise extra branch.
-	assert(n > 0);
-	uint i(0); do
-	{
-		off = adamw
+	assume(0 < n);
+	for(uint i(0); i < n; ++i)
+		adamw
 		(
-			p[0][i],
-			p[1][i],
-			p[2][i],
+			val[0][i],
+			val[1][i],
+			val[2][i],
 			grad,
 			opts.alpha,
 			opts.beta[0],
 			opts.beta[1],
-			ctrl.epic.step,
-			off
+			step + 1
 		);
-	}
-	while(++i < n);
-
-	return off;
 }

-size_t
+void
 ircd::gpt::adamw(f32x4 &__restrict__ param,
                 f32x4 &__restrict__ moment0,
                 f32x4 &__restrict__ moment1,
-                 const f32 grad,
-                 const f32 alpha,
+                 const f32 grad_,
+                 const f32 alpha_,
                 const f32 beta0,
                 const f32 beta1,
-                 const u32 step,
-                 const size_t off)
+                 const u32 step)
 {
-	const f32x4 one
+	const f32 alpha
 	{
-		1.0f, 1.0f, 1.0f, 1.0f,
+		grad_ < 0? -alpha_ : alpha_
 	};

-	const f32x4 a[2]
+	const f32 grad
 	{
-		{ one - beta0 },
-		{ one - beta1 },
+		grad_ < 0? -grad_ : grad_
 	};

-	const f32x4 avg_mul[2]
+	const f32 grad_grad
 	{
-		{ moment0 * beta0 },
-		{ moment1 * beta1 },
+		grad * grad
 	};

-	const f32x4 avg_dot[2]
+	const f32x4 moment[]
 	{
-		{ avg_mul[0] + a[0] * grad         },
-		{ avg_mul[1] + a[1] * grad * grad  },
-	};
-
-	const f32x4 bias[2]
-	{
-		{ avg_dot[0] / (one - powf(beta0, step + 1)) },
-		{ avg_dot[1] / (one - powf(beta1, step + 1)) },
-	};
-
-	const f32x4 denom
-	{
-		sqrtf(bias[1]) + 0.000001f // epsilon
+		adamw_moment(moment0, grad, beta0),
+		adamw_moment(moment1, grad_grad, beta1)
 	};

 	const f32x4 delta
 	{
-		alpha * (bias[0] / denom)
+		adamw_delta(moment[0], moment[1], alpha, beta0, beta1, step)
 	};

 	const f32x4 update
@ -645,8 +654,168 @@ ircd::gpt::adamw(f32x4 &__restrict__ param,
 		param - delta
 	};

-	moment0 = avg_dot[0];
-	moment1 = avg_dot[1];
+	if((false))
+	for(uint i(0); i < 4; ++i)
+		printf("%-15p p[%11.8lf] m[%11.8lf][%11.8lf] g[%11.8lf] d[%11.8lf] p[%11.8lf] m[%11.8lf][%11.8lf]\n",
+		((f32 *)&param) + i,
+		param[i],
+		moment0[i],
+		moment1[i],
+		grad,
+		delta[i],
+		update[i],
+		moment[0][i],
+		moment[1][i]);
+
+	assert(std::isnormal(update[0]));
+	assert(std::isnormal(update[1]));
+	assert(std::isnormal(update[2]));
+	assert(std::isnormal(update[3]));
+
+	assert(std::isnormal(moment[0][0]));
+	assert(std::isnormal(moment[0][1]));
+	assert(std::isnormal(moment[0][2]));
+	assert(std::isnormal(moment[0][3]));
+
+	assert(std::isnormal(moment[1][0]));
+	assert(std::isnormal(moment[1][1]));
+	assert(std::isnormal(moment[1][2]));
+	assert(std::isnormal(moment[1][3]));
+
 	param = update;
-	return off + 1;
+	//__builtin_nontemporal_store(update, &param);
+
+	moment0 = moment[0];
+	moment1 = moment[1];
+	//__builtin_nontemporal_store(moment[0], &moment0);
+	//__builtin_nontemporal_store(moment[1], &moment1);
+}
+
+ircd::f32x4
+ircd::gpt::adamw_delta(const f32x4 moment0,
+                       const f32x4 moment1,
+                       const f32 alpha,
+                       const f32 beta0,
+                       const f32 beta1,
+                       const u32 step)
+{
+	static const f32 epsilon
+	{
+		FLT_EPSILON
+	};
+
+	const f32x4 denom
+	{
+		adamw_denom(moment1, beta1, step) + epsilon
+	};
+
+	const f32x4 decay
+	{
+		adamw_numer(moment0, beta0, step)
+	};
+
+	const f32x4 smooth
+	{
+		alpha * decay
+	};
+
+	assert(std::isnormal(denom[0]));
+	assert(std::isnormal(denom[1]));
+	assert(std::isnormal(denom[2]));
+	assert(std::isnormal(denom[3]));
+	const f32x4 delta
+	{
+		smooth / denom
+	};
+
+	return delta;
+}
+
+ircd::f32x4
+ircd::gpt::adamw_denom(const f32x4 moment,
+                       const f32 beta,
+                       const u32 step)
+{
+	static const f32x4 one
+	{
+		1.0f, 1.0f, 1.0f, 1.0f,
+	};
+
+	assert(step > 0);
+	const f32x4 decay
+	{
+		one - powf(beta, step)
+	};
+
+	assert(std::isnormal(decay[0]));
+	assert(std::isnormal(decay[1]));
+	assert(std::isnormal(decay[2]));
+	assert(std::isnormal(decay[3]));
+	const f32x4 bias
+	{
+		moment / decay
+	};
+
+	const f32x4 denom
+	{
+		sqrtf(bias)
+	};
+
+	return denom;
+}
+
+ircd::f32x4
+ircd::gpt::adamw_numer(const f32x4 moment,
+                       const f32 beta,
+                       const u32 step)
+{
+	static const f32x4 one
+	{
+		1.0f, 1.0f, 1.0f, 1.0f,
+	};
+
+	assert(step > 0);
+	const f32x4 decay
+	{
+		one - powf(beta, step)
+	};
+
+	assert(std::isnormal(decay[0]));
+	assert(std::isnormal(decay[1]));
+	assert(std::isnormal(decay[2]));
+	assert(std::isnormal(decay[3]));
+	const f32x4 bias
+	{
+		moment / decay
+	};
+
+	return bias;
+}
+
+ircd::f32x4
+ircd::gpt::adamw_moment(const f32x4 moment,
+                        const f32 grad,
+                        const f32 beta)
+{
+	static const f32x4 one
+	{
+		1.0f, 1.0f, 1.0f, 1.0f,
+	};
+
+	const f32x4 rate
+	{
+		one - beta
+	};
+
+	const f32x4 avg
+	{
+		moment * beta
+	};
+
+	const f32x4 dot
+	{
+		rate * grad + avg
+	};
+
+	return dot;
 }
--- a/ircd/gpt_gpu.cl
+++ b/ircd/gpt_gpu.cl
--- a/ircd/gpt_model.cc
+++ b/ircd/gpt_model.cc
@ -54,10 +54,14 @@ namespace ircd::gpt::model
 	static fs::map
 	default_model_shm,
 	default_dataset_shm;
-
-	static std::unique_ptr<decoder> default_model_res;
 }

+constexpr const char
+*const ircd::gpt::model::prop::ended,
+*const ircd::gpt::model::prop::id,
+*const ircd::gpt::model::prop::length,
+*const ircd::gpt::model::prop::text;
+
 decltype(ircd::gpt::model::manifest_h)
 ircd::gpt::model::manifest_h
 {
@ -102,7 +106,7 @@ decltype(ircd::gpt::model::cache_hugepage)
 ircd::gpt::model::cache_hugepage
 {
 	{ "name",     "ircd.gpt.model.cache.hugepage" },
-	{ "default",  true                            },
+	{ "default",  false                           },
 };

 decltype(ircd::gpt::model::cache_path)
@ -132,6 +136,12 @@ ircd::gpt::model::path
 decltype(ircd::gpt::model::default_model)
 ircd::gpt::model::default_model;

+decltype(ircd::gpt::model::default_moment)
+ircd::gpt::model::default_moment;
+
+decltype(ircd::gpt::model::default_checkpoint)
+ircd::gpt::model::default_checkpoint;
+
 decltype(ircd::gpt::model::default_dataset)
 ircd::gpt::model::default_dataset;

@ -144,17 +154,31 @@ ircd::gpt::model::init()
 	if(!model::path)
 		return;

-	if(!init_from_cache(model::cache_path))
-		init_from_json(model::cache_path, model::path);
-
 	if(model::dataset_path)
 		init_dataset(model::dataset_path);
+
+	if(likely(init_from_cache(model::cache_path)))
+		return;
+
+	init_from_json(model::cache_path, model::path);
+	if(unlikely(!init_from_cache(model::cache_path)))
+		throw error
+		{
+			"Failed to find and/or initialize model."
+		};
 }

 void
 ircd::gpt::model::fini()
 noexcept
 {
+	default_checkpoint[2] = nullptr;
+	default_checkpoint[1] = nullptr;
+	default_checkpoint[0] = nullptr;
+
+	default_moment[1] = nullptr;
+	default_moment[0] = nullptr;
+
 	default_model = nullptr;
 	default_model_shm = {};

@ -169,18 +193,33 @@ ircd::gpt::model::init_from_cache(const string_view &cache_path)
 	if(!fs::is_reg(cache_path))
 		return false;

-	const auto size
+	const auto file_size
 	{
 		fs::size(cache_path)
 	};

-	if(unlikely(size != sizeof(model::decoder)))
+	const auto decoder_size
+	{
+		sizeof(model::decoder)
+	};
+
+	const bool has_params
+	{
+		file_size >= decoder_size
+	};
+
+	const bool has_moments
+	{
+		file_size >= decoder_size * 6
+	};
+
+	if(unlikely(!has_params))
 		throw error
 		{
-			"Cached model `%s' size %zu differs from %zu.",
+			"Cached model `%s' size %zu insufficient for decoder size %zu.",
 			cache_path,
-			size,
-			sizeof(model::decoder),
+			file_size,
+			decoder_size,
 		};

 	const auto mode
@ -192,20 +231,41 @@ ircd::gpt::model::init_from_cache(const string_view &cache_path)

 	const fs::fd fd
 	{
-		cache_path, mode
+		cache_path, fs::fd::opts
+		{
+			.mode = mode,
+		},
 	};

-	fs::map::opts map_opts
+	const bool map_moments
 	{
-		mode
+		has_moments || cache_shared
+	};
+
+	if(!has_moments && map_moments)
+	{
+		fs::truncate(fd, decoder_size * 6);
+		fs::allocate(fd, decoder_size * 5, decoder_size);
+	}
+
+	const auto map_size
+	{
+		map_moments?
+			decoder_size * 6:
+			decoder_size
+	};
+
+	const fs::map::opts map_opts
+	{
+		.alignment = alignof(model::decoder),
+		.shared = bool(cache_shared),
+		.locked = bool(cache_locked),
+		.huge2mb = bool(cache_hugepage),
 	};

-	map_opts.locked = bool(cache_locked);
-	map_opts.shared = bool(cache_shared);
-	map_opts.huge2mb = bool(cache_hugepage);
 	default_model_shm = fs::map
 	{
-		fd, map_opts, sizeof(decoder)
+		fd, map_opts, map_size
 	};

 	default_model = reinterpret_cast<decoder *>
@ -213,13 +273,28 @@ ircd::gpt::model::init_from_cache(const string_view &cache_path)
 		data(default_model_shm)
 	);

+	if(map_moments)
+	{
+		default_moment[0] = reinterpret_cast<float *>(default_model + 1);
+		default_moment[1] = reinterpret_cast<float *>(default_model + 2);
+		default_checkpoint[0] = reinterpret_cast<float *>(default_model + 3);
+		default_checkpoint[1] = reinterpret_cast<float *>(default_model + 4);
+		default_checkpoint[2] = reinterpret_cast<float *>(default_model + 5);
+	}
+
+	allocator::lock({(const char *)default_model, sizeof(decoder)});
+	fs::prefetch(default_model_shm, sizeof(decoder));
+
 	char pbuf[48];
 	log::info
 	{
-		log, "model(%p) mapped cached model `%s' %s",
+		log, "model(%p) mapped cached model `%s' params:%b moments:%b align:%u %s",
 		data(default_model_shm),
 		cache_path,
-		pretty(pbuf, iec(size)),
+		has_params,
+		has_moments,
+		map_opts.alignment,
+		pretty(pbuf, iec(map_size)),
 	};

 	return true;
@ -264,9 +339,6 @@ ircd::gpt::model::init_from_json(const string_view &cache_path,
 		cache_path,
 		stopwatch.pretty(pbuf[1]),
 	};
-
-	default_model_res = std::move(decoder);
-	default_model = default_model_res.get();
 }

 void
@ -363,6 +435,7 @@ ircd::gpt::model::init_dataset(const string_view &path)

 	size_t checkpoint(0);
 	default_data.resize(260000); //TODO: XXX
+	fs::prefetch(default_dataset_shm, size);
 	ircd::tokens(default_dataset, '\n', [&checkpoint]
 	(const string_view &line)
 	{
@ -379,6 +452,7 @@ ircd::gpt::model::init_dataset(const string_view &path)
 		checkpoint,
 	};

+	fs::evict(default_dataset_shm, size);
 	return true;
 }

@ -393,9 +467,9 @@ ircd::gpt::model::init_wpe_weight(decoder &d,
 	{
 		size_t j(0);
 		for(const auto &elem : vec)
-			d.word.pos[i][j++] = lex_cast<float>(elem);
+			d.embed.pos[i].elem[j++] = lex_cast<float>(elem);

-		always_assert(j == sizeof(d.word.pos[i]) / sizeof(float));
+		always_assert(j == sizeof(d.embed.pos[i]) / sizeof(float));
 		++i;
 	}
 }
@ -411,9 +485,9 @@ ircd::gpt::model::init_wte_weight(decoder &d,
 	{
 		size_t j(0);
 		for(const auto &elem : vec)
-			d.word.token[i][j++] = lex_cast<float>(elem);
+			d.embed.token[i].elem[j++] = lex_cast<float>(elem);

-		always_assert(j == sizeof(d.word.token[i]) / sizeof(float));
+		always_assert(j == sizeof(d.embed.token[i]) / sizeof(float));
 		++i;
 	}
 }
@ -426,9 +500,9 @@ ircd::gpt::model::init_f_weight(decoder &d,
 {
 	size_t i(0);
 	for(const auto &elem : vec)
-		d.f.weight[i++] = lex_cast<float>(elem);
+		d.embed.norm.weight.elem[i++] = lex_cast<float>(elem);

-	always_assert(i == sizeof(d.f.weight) / sizeof(float));
+	always_assert(i == sizeof(d.embed.norm.weight) / sizeof(float));
 }

 void
@ -439,9 +513,9 @@ ircd::gpt::model::init_f_bias(decoder &d,
 {
 	size_t i(0);
 	for(const auto &elem : vec)
-		d.f.bias[i++] = lex_cast<float>(elem);
+		d.embed.norm.bias.elem[i++] = lex_cast<float>(elem);

-	always_assert(i == sizeof(d.f.bias) / sizeof(float));
+	always_assert(i == sizeof(d.embed.norm.bias) / sizeof(float));
 }

 void
@ -455,16 +529,16 @@ ircd::gpt::model::init_h_ffnn_fc_weight(decoder &d,
 	{
 		size_t j(0);
 		for(const auto &elem : vec)
-			d.layer[layer].ffnn.fc_weight[i][j++] = lex_cast<float>(elem);
+			d.layer[layer].ffnn.fcon_weight[i].fcon[j++] = lex_cast<float>(elem);

-		always_assert(j == sizeof(d.layer[layer].ffnn.fc_weight[i]) / sizeof(float));
+		always_assert(j == sizeof(d.layer[layer].ffnn.fcon_weight[i]) / sizeof(float));
 		++i;
 	}

 	always_assert
 	(
-		i == sizeof(d.layer[layer].ffnn.fc_weight)
-		/ sizeof(d.layer[layer].ffnn.fc_weight[0])
+		i == sizeof(d.layer[layer].ffnn.fcon_weight)
+		/ sizeof(d.layer[layer].ffnn.fcon_weight[0])
 	);
 }

@ -476,9 +550,9 @@ ircd::gpt::model::init_h_ffnn_fc_bias(decoder &d,
 {
 	size_t i(0);
 	for(const auto &elem : vec)
-		d.layer[layer].ffnn.fc_bias[i++] = lex_cast<float>(elem);
+		d.layer[layer].ffnn.fcon_bias.fcon[i++] = lex_cast<float>(elem);

-	always_assert(i == sizeof(d.layer[layer].ffnn.fc_bias) / sizeof(float));
+	always_assert(i == sizeof(d.layer[layer].ffnn.fcon_bias) / sizeof(float));
 }

 void
@ -492,7 +566,7 @@ ircd::gpt::model::init_h_ffnn_proj_weight(decoder &d,
 	{
 		size_t j(0);
 		for(const auto &elem : vec)
-			d.layer[layer].ffnn.proj_weight[i][j++] = lex_cast<float>(elem);
+			d.layer[layer].ffnn.proj_weight[i].elem[j++] = lex_cast<float>(elem);

 		always_assert(j == sizeof(d.layer[layer].ffnn.proj_weight[i]) / sizeof(float));
 		++i;
@ -513,7 +587,7 @@ ircd::gpt::model::init_h_ffnn_proj_bias(decoder &d,
 {
 	size_t i(0);
 	for(const auto &elem : vec)
-		d.layer[layer].ffnn.proj_bias[i++] = lex_cast<float>(elem);
+		d.layer[layer].ffnn.proj_bias.elem[i++] = lex_cast<float>(elem);

 	always_assert(i == sizeof(d.layer[layer].ffnn.proj_bias) / sizeof(float));
 }
@ -526,9 +600,9 @@ ircd::gpt::model::init_h_ln_1_weight(decoder &d,
 {
 	size_t i(0);
 	for(const auto &elem : vec)
-		d.layer[layer].ln1.weight[i++] = lex_cast<float>(elem);
+		d.layer[layer].attn.norm.weight.elem[i++] = lex_cast<float>(elem);

-	always_assert(i == sizeof(d.layer[layer].ln1.weight) / sizeof(float));
+	always_assert(i == sizeof(d.layer[layer].attn.norm.weight) / sizeof(float));
 }

 void
@ -539,9 +613,9 @@ ircd::gpt::model::init_h_ln_1_bias(decoder &d,
 {
 	size_t i(0);
 	for(const auto &elem : vec)
-		d.layer[layer].ln1.bias[i++] = lex_cast<float>(elem);
+		d.layer[layer].attn.norm.bias.elem[i++] = lex_cast<float>(elem);

-	always_assert(i == sizeof(d.layer[layer].ln1.bias) / sizeof(float));
+	always_assert(i == sizeof(d.layer[layer].attn.norm.bias) / sizeof(float));
 }

 void
@ -552,9 +626,9 @@ ircd::gpt::model::init_h_ln_2_weight(decoder &d,
 {
 	size_t i(0);
 	for(const auto &elem : vec)
-		d.layer[layer].ln2.weight[i++] = lex_cast<float>(elem);
+		d.layer[layer].ffnn.norm.weight.elem[i++] = lex_cast<float>(elem);

-	always_assert(i == sizeof(d.layer[layer].ln2.weight) / sizeof(float));
+	always_assert(i == sizeof(d.layer[layer].ffnn.norm.weight) / sizeof(float));
 }

 void
@ -565,9 +639,9 @@ ircd::gpt::model::init_h_ln_2_bias(decoder &d,
 {
 	size_t i(0);
 	for(const auto &elem : vec)
-		d.layer[layer].ln2.bias[i++] = lex_cast<float>(elem);
+		d.layer[layer].ffnn.norm.bias.elem[i++] = lex_cast<float>(elem);

-	always_assert(i == sizeof(d.layer[layer].ln2.bias) / sizeof(float));
+	always_assert(i == sizeof(d.layer[layer].ffnn.norm.bias) / sizeof(float));
 }

 void
@ -581,16 +655,16 @@ ircd::gpt::model::init_h_attn_attn_weight(decoder &d,
 	{
 		size_t j(0);
 		for(const auto &elem : vec)
-			d.layer[layer].attn.attn_weight[i][j++] = lex_cast<float>(elem);
+			d.layer[layer].attn.fcon_weight[i].fcon[j++] = lex_cast<float>(elem);

-		always_assert(j == sizeof(d.layer[layer].attn.attn_weight[i]) / sizeof(float));
+		always_assert(j == sizeof(d.layer[layer].attn.fcon_weight[i]) / sizeof(float));
 		++i;
 	}

 	always_assert
 	(
-		i == sizeof(d.layer[layer].attn.attn_weight)
-		/ sizeof(d.layer[layer].attn.attn_weight[0])
+		i == sizeof(d.layer[layer].attn.fcon_weight)
+		/ sizeof(d.layer[layer].attn.fcon_weight[0])
 	);
 }

@ -602,9 +676,9 @@ ircd::gpt::model::init_h_attn_attn_bias(decoder &d,
 {
 	size_t i(0);
 	for(const auto &elem : vec)
-		d.layer[layer].attn.attn_bias[i++] = lex_cast<float>(elem);
+		d.layer[layer].attn.fcon_bias.fcon[i++] = lex_cast<float>(elem);

-	always_assert(i == sizeof(d.layer[layer].attn.attn_bias) / sizeof(float));
+	always_assert(i == sizeof(d.layer[layer].attn.fcon_bias) / sizeof(float));
 }

 void
@ -618,7 +692,7 @@ ircd::gpt::model::init_h_attn_proj_weight(decoder &d,
 	{
 		size_t j(0);
 		for(const auto &elem : vec)
-			d.layer[layer].attn.proj_weight[i][j++] = lex_cast<float>(elem);
+			d.layer[layer].attn.proj_weight[i].elem[j++] = lex_cast<float>(elem);

 		always_assert(j == sizeof(d.layer[layer].attn.proj_weight[i]) / sizeof(float));
 		++i;
@ -639,7 +713,7 @@ ircd::gpt::model::init_h_attn_proj_bias(decoder &d,
 {
 	size_t i(0);
 	for(const auto &elem : vec)
-		d.layer[layer].attn.proj_bias[i++] = lex_cast<float>(elem);
+		d.layer[layer].attn.proj_bias.elem[i++] = lex_cast<float>(elem);

 	always_assert(i == sizeof(d.layer[layer].attn.proj_bias) / sizeof(float));
 }
--- a/ircd/gpt_pipe.cc
+++ b/ircd/gpt_pipe.cc
--- a/ircd/gpt_pipe_code.cc
+++ b/ircd/gpt_pipe_code.cc
@ -152,10 +152,12 @@ ircd::gpt::pipe::code::set_cache(const string_view &path)
 }

 extern const uint8_t
-gpt_gpu_r600_barts_bc[];
+gpt_gpu_r600_barts_bc[],
+gpt_gpu_spv[];

 extern const uint
-gpt_gpu_r600_barts_bc_len;
+gpt_gpu_r600_barts_bc_len,
+gpt_gpu_spv_len;

 ircd::cl::code
 ircd::gpt::pipe::code::from_bitcode(const string_view &link_opts)
@ -164,6 +166,9 @@ ircd::gpt::pipe::code::from_bitcode(const string_view &link_opts)
 	{
 		reinterpret_cast<const char *>(gpt_gpu_r600_barts_bc),
 		gpt_gpu_r600_barts_bc_len
+
+		//reinterpret_cast<const char *>(gpt_gpu_spv),
+		//gpt_gpu_spv_len
 	};

 	char pbuf[1][48];
--- a/ircd/gpt_vocab.cc
+++ b/ircd/gpt_vocab.cc
@ -10,6 +10,7 @@

 namespace ircd::gpt::vocab
 {
+	static u8x16 get_token(const u16);
 	static u16 find_token(const u8x16);
 	static u16 find_merge(const u8x16, const u8x16);
 	static u16 bpe_score(u16 (&)[16], const u8x16 (&)[16][2], const uint);
@ -23,11 +24,11 @@ namespace ircd::gpt::vocab
 	static u64x2 tokenize_block(u16x16 &, const u8x16, const i8x16) noexcept;
 	static void init_tokens(), init_merges();

-	[[gnu::visibility("internal")]]
 	extern const char32_t charset[256];
 }

 /// Remapping of single byte characters (Control (C0) and Basic Latin (ASCII)).
+[[gnu::visibility("internal")]]
 decltype(ircd::gpt::vocab::charset)
 ircd::gpt::vocab::charset
 alignas(64)
@ -169,7 +170,8 @@ ircd::gpt::vocab::init_merges()

 ircd::string_view
 ircd::gpt::vocab::debug(const mutable_buffer &out,
-                        const u16 idx)
+                        const u16 idx,
+                        const uint mask)
 {
 	const auto *const token
 	{
@ -177,13 +179,21 @@ ircd::gpt::vocab::debug(const mutable_buffer &out,
 	};

 	thread_local char strbuf[2][512];
-	return string_view{fmt::sprintf
+	return fmt::sprintf
 	{
-		out, "%5u  [%32s]  %s",
+		out, "%5u %s%32s%s%s%s",
 		idx,
-		simd::print_chr(strbuf[0], token[idx]),
-		simd::print_mem(strbuf[1], token[idx]),
-	}};
+		mask & 0x1?
+			"[ "_sv: string_view{},
+		mask & 0x1?
+			simd::print_chr(strbuf[0], token[idx]): string_view{},
+		mask & 0x1?
+			" ]"_sv: string_view{},
+		mask & 0x2?
+			" "_sv: string_view{},
+		mask & 0x2?
+			simd::print_mem(strbuf[1], token[idx]): string_view{},
+	};
 }

 //
@ -193,6 +203,7 @@ ircd::gpt::vocab::debug(const mutable_buffer &out,
 ircd::string_view
 ircd::gpt::vocab::detokenize(const mutable_buffer &out,
                             const vector_view<const u16> &in)
+noexcept
 {
 	size_t off(0);
 	for(const u16 &id : in)
@ -228,9 +239,65 @@ ircd::gpt::vocab::detokenize(const mutable_buffer &out,
 // tokenize
 //

+uint16_t
+ircd::gpt::vocab::tokenize(const string_view &in)
+{
+	char str_buf[16];
+	const string_view str
+	{
+		str_buf, copy(str_buf, in)
+	};
+
+	u16 buf[16];
+	const auto out
+	{
+		tokenize(buf, str)
+	};
+
+	if(unlikely(out.size() != 1))
+		throw error
+		{
+			"Input tokenizes to %zu tokens.",
+			out.size()
+		};
+
+	return buf[0];
+}
+
+uint16_t
+ircd::gpt::vocab::tokenize(const_buffer &in)
+noexcept
+{
+	char str_buf[16];
+	const string_view str
+	{
+		str_buf, copy(str_buf, in)
+	};
+
+	u16 buf[16];
+	const auto out
+	{
+		tokenize(buf, str)
+	};
+
+	const auto &tok
+	{
+		get_token(buf[0])
+	};
+
+	const auto consumed
+	{
+		simd::strlen(tok)
+	};
+
+	consume(in, consumed);
+	return buf[0];
+}
+
 ircd::vector_view<ircd::u16>
 ircd::gpt::vocab::tokenize(const vector_view<u16> &out,
                           const string_view &in)
+noexcept
 {
 	using input_t = u8x16;
 	using block_t = u16x16;
@ -801,13 +868,8 @@ ircd::gpt::vocab::bpe_score(u16 (&score)[16],
 ircd::u16
 ircd::gpt::vocab::find_token(const u8x16 string)
 {
-	const auto *const __restrict__ token
-	{
-		reinterpret_cast<const u8x16 *>(vocab::token)
-	};
-
 	for(uint i(0); i < tokens; ++i)
-		if(simd::streq(string, token[i]))
+		if(simd::streq(string, get_token(i)))
 			return i;

 	return u16(-1U);
@ -835,3 +897,14 @@ ircd::gpt::vocab::find_merge(const u8x16 a,

 	return u16(-1U);
 }
+
+ircd::u8x16
+ircd::gpt::vocab::get_token(const u16 idx)
+{
+	const auto *const __restrict__ token
+	{
+		reinterpret_cast<const u8x16 *>(vocab::token)
+	};
+
+	return token[idx];
+}