ircd::gpt: Reorganize task options and control blocks.

2024-06-29 07:18:20 +02:00 · 2021-05-02 20:40:00 -07:00 · 2021-05-02 20:40:00 -07:00 · 37b1d47c8d
parent 3e9c2d1b56
commit 37b1d47c8d
12 changed files with 527 additions and 643 deletions
--- a/include/ircd/gpt/gpt.h
+++ b/include/ircd/gpt/gpt.h
@ -17,8 +17,8 @@ namespace ircd::gpt
 {
 	IRCD_EXCEPTION(ircd::error, error)

-	struct opts;
 	struct task;
+	struct gate;

 	extern log::log log;
 }
@ -27,7 +27,6 @@ namespace ircd::gpt
 #include "vocab.h"
 #include "model.h"
 #include "token.h"
-#include "opts.h"
-#include "task.h"
+#include "task/task.h"
 #include "pipe/pipe.h"
 #include "generate.h"
--- a/include/ircd/gpt/opts.h
+++ b/include/ircd/gpt/opts.h
@ -1,277 +0,0 @@
-// Matrix Construct
-//
-// Copyright (C) Matrix Construct Developers, Authors & Contributors
-// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
-//
-// Permission to use, copy, modify, and/or distribute this software for any
-// purpose with or without fee is hereby granted, provided that the above
-// copyright notice and this permission notice is present in all copies. The
-// full license for this software is available in the LICENSE file.
-
-#pragma once
-#define HAVE_IRCD_GPT_OPTS_H
-
-/// Task Options Page
-///
-/// The option block is directly shared with task software as constant data.
-/// This stucture and its mutable companion in `task.h` determine the outcome
-/// of the next execution cycle; options are immutable to device software but
-/// may be changed by the host between executions cycles if desired.
-///
-struct ircd_gpt_opts
-{
-	/// Specifies the nominal halting condition based on a sequence of tokens.
-	/// Generation will complete with success after one of these sequences is
-	/// witnessed. Set tokens to -1 starting from the back for shorter
-	/// sequences; zero-length sequences (all -1's) are never matched.
-	uint accept_code[4][4]
-	#ifdef __cplusplus
-	{
-		{    13,  198,  -1U,  -1U,  },
-		{   198,  198,  -1U,  -1U,  },
-		{   -1U,  -1U,  -1U,  -1U,  },
-		{   -1U,  -1U,  -1U,  -1U,  },
-	}
-	#endif
-	;
-
-	/// Specifies the exceptional halting condition based on the sequence of
-	/// tokens. By default, the three zeros represent three outputs of '!'
-	/// which is probably an error; note that a true "!!!" is represented by
-	/// token number 10185. Set tokens to -1 starting from the back to not
-	/// match that token; generated output after errors is usually garbage.
-	uint error_code[4][4]
-	#ifdef __cplusplus
-	{
-		{     0,    0,    0,  -1U,  },
-		{   -1U,  -1U,  -1U,  -1U,  },
-		{   -1U,  -1U,  -1U,  -1U,  },
-		{   -1U,  -1U,  -1U,  -1U,  },
-	}
-	#endif
-	;
-
-	/// Limit number of output tokens. Default of -1 is unlimited; the number
-	/// of tokens generated will be limited by other factors.
-	uint limit
-	#ifdef __cplusplus
-	{
-		1
-	}
-	#endif
-	;
-
-	/// Flip random coins over the top k logits each round. Setting to 1
-	/// deterministically selects the top logit.
-	uint top_k
-	#ifdef __cplusplus
-	{
-		2
-	}
-	#endif
-	;
-
-	/// Specifies the token context size in tokens.
-	uint context_tokens
-	#ifdef __cplusplus
-	{
-		1024
-	}
-	#endif
-	;
-
-	/// Specifies the token buffer size in tokens.
-	uint buffer_tokens
-	#ifdef __cplusplus
-	{
-		1024
-	}
-	#endif
-	;
-
-	/// Embedding vector elements
-	uint embed_elems
-	#ifdef __cplusplus
-	{
-		768
-	}
-	#endif
-	;
-
-	/// Attention unit fcon width multiple
-	uint attn_mult
-	#ifdef __cplusplus
-	{
-		3U
-	}
-	#endif
-	;
-
-	/// MLP unit fcon width multiple
-	uint ffnn_mult
-	#ifdef __cplusplus
-	{
-		4U
-	}
-	#endif
-	;
-
-	/// Attention unit width multiple
-	uint attn_elems
-	#ifdef __cplusplus
-	{
-		embed_elems * attn_mult
-	}
-	#endif
-	;
-
-	/// FFNN unit width multiple
-	uint ffnn_elems
-	#ifdef __cplusplus
-	{
-		embed_elems * ffnn_mult
-	}
-	#endif
-	;
-
-	/// SIMD lane count
-	uint lanes
-	#ifdef __cplusplus
-	{
-		4U
-	}
-	#endif
-	;
-
-	uint embed_width
-	#ifdef __cplusplus
-	{
-		embed_elems / lanes
-	}
-	#endif
-	;
-
-	uint attn_width
-	#ifdef __cplusplus
-	{
-		attn_elems / lanes
-	}
-	#endif
-	;
-
-	uint attn_height
-	#ifdef __cplusplus
-	{
-		embed_elems / lanes
-	}
-	#endif
-	;
-
-	uint ffnn_width
-	#ifdef __cplusplus
-	{
-		ffnn_elems / lanes
-	}
-	#endif
-	;
-
-	uint ffnn_height
-	#ifdef __cplusplus
-	{
-		embed_elems / lanes
-	}
-	#endif
-	;
-
-	/// Specifies the token context size in tokens.
-	uint logits
-	#ifdef __cplusplus
-	{
-		50257
-	}
-	#endif
-	;
-
-	/// Seed for the task's PRNG.
-	ulong seed
-	#ifdef __cplusplus
-	{
-		1234567890UL
-	}
-	#endif
-	;
-
-	/// Training steps
-	ulong training_steps
-	#ifdef __cplusplus
-	{
-		250000
-	}
-	#endif
-	;
-
-	/// Validation steps
-	ulong validation_steps
-	#ifdef __cplusplus
-	{
-		5000
-	}
-	#endif
-	;
-
-	ushort label
-	#ifdef __cplusplus
-	{
-		198
-	}
-	#endif
-	;
-
-	float alpha
-	#ifdef __cplusplus
-	{
-		0.001
-	}
-	#endif
-	;
-
-	float beta[2]
-	#ifdef __cplusplus
-	{
-		0.9,    // Beta1
-		0.999,  // Beta2
-	}
-	#endif
-	;
-
-	float epsilon
-	#ifdef __cplusplus
-	{
-		0.000001
-	}
-	#endif
-	;
-}
-__attribute__((aligned(4096)));
-
-#ifdef __cplusplus
-/// Generator Task Options.
-///
-/// Parameters for a task. Options are constant and one instance can be shared
-/// between multiple task instances. This structure extends the task options
-/// page, starting a new page which is not visible to device software; C++ and
-/// host pointers are available.
-///
-struct ircd::gpt::opts
-:ircd_gpt_opts
-{
-	/// Pointer to the model
-	const model::decoder *model
-	{
-		model::default_model
-	};
-};
-
-static_assert(sizeof(struct ircd_gpt_opts) == 4096);
-static_assert(std::is_standard_layout<struct ircd_gpt_opts>::value);
-#endif
--- a/include/ircd/gpt/task.h
+++ b/include/ircd/gpt/task.h
@ -1,166 +0,0 @@
-// Matrix Construct
-//
-// Copyright (C) Matrix Construct Developers, Authors & Contributors
-// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
-//
-// Permission to use, copy, modify, and/or distribute this software for any
-// purpose with or without fee is hereby granted, provided that the above
-// copyright notice and this permission notice is present in all copies. The
-// full license for this software is available in the LICENSE file.
-
-#pragma once
-#define HAVE_IRCD_GPT_TASK_H
-
-/// Task Control Page
-///
-/// The control block is shared with our device software. Execution state is
-/// maintained in the task control block across cycles. The control block is
-/// the mutable state component for an execution; for the immutable component
-/// also shared with device software see opts.h.
-///
-struct ircd_gpt_task
-{
-	/// Header magic 0xC7012C70
-	uint magic;
-
-	/// Hypercall code set by our device software upon completion and control
-	/// transfer back to the host. Negative codes indicate errors, positive
-	/// codes are used for status and/or procedure calls; zero is also an error.
-	enum ircd_gpt_hypercall call;
-
-	/// Token ring head. Tokens in the ring extend behind the head for
-	/// `tokens`. The `head` value is automatically modulated by device
-	/// software to wrap around the ring.
-	uint head;
-
-	/// Token counter. The counter indicates the number of valid tokens in
-	/// the context buffer. This value must not exceed the buffer size.
-	uint tokens;
-
-	/// Accumulates the number of task cycles. The cycle counter is incremented
-	/// by device software after each repetition of the kernel pipeline to
-	/// produce one additional token.
-	ulong cycle;
-
-	/// Accumulates the epoch count for the task. The counter is incremented
-	/// by one in device software before control returns back to the host.
-	/// Several cycles may occur during each epoch.
-	ulong epoch;
-
-	/// Accumulates the training epoch count for the task. The counter is
-	/// incremented by one in device software for each backward propagation.
-	ulong step;
-
-	/// Accumulates the number of tokens produced by the task. Several tokens
-	/// may be produced each epoch, but currently only one token is produced
-	/// each cycle.
-	ulong produced;
-
-	/// Accumulates the number tokens witnessed by the task. The number of
-	/// tokens in the context for each cycle is counted as witnessed.
-	ulong witnessed;
-
-	/// Accumulates time in microseconds elapsed for the task.
-	ulong elapsed;
-
-	/// PRNG xoshiro256 state. This is the de facto random seed which can be
-	/// set before cycle entry by the host. It is updated by device software
-	/// when used.
-	ulong rand[4];
-
-	/// Updated by the host with the value of the timestamp register as sampled
-	/// immediately before each transfer of control to the device.
-	ulong host_tsc;
-
-	/// State counters for the accept/error sequence codes.
-	uint accept_seq[4], error_seq[4];
-
-	/// Logit softmax mu
-	float samax_mu;
-
-	/// Logit softmax sum
-	float samax_sum;
-
-	/// Logit softmax lambda
-	float samax_lambda;
-
-	/// Loss for last token of last cycle
-	float loss;
-
-	/// Sum loss over all cycles
-	float loss_sum[4];
-
-	/// Average loss over all cycles
-	float loss_mean;
-
-	/// Perplexity score for last token of last cycle
-	float perp;
-
-	/// Sum ppl over all cycles
-	float perp_sum[4];
-
-	/// Perplexity mean over context
-	float perp_mean;
-
-	/// Certainty difference score for last token of last cycle
-	float cert;
-
-	/// Sum certainty over all cycles
-	float cert_sum[4];
-
-	/// Certainty mean over context
-	float cert_mean;
-
-	/// Final loss
-	float l2_loss;
-
-	/// Final loss mean
-	float l2_loss_mean;
-
-	/// Perform backprop
-	bool prop;
-
-	/// The token buffer starts at offset 2048 and continues to the end of
-	/// the page; options specify the size of the tokens buffer in tokens.
-	/// Additional pages must be attached for larger buffer sizes.
-	ushort token[] __attribute__((aligned(2048)));
-}
-__attribute__((aligned(4096)));
-
-#ifdef __cplusplus
-/// Task Context
-///
-/// State for a task.
-struct ircd::gpt::task
-{
-	enum status :char;
-
-	/// Reference to the attached options.
-	const gpt::opts *opts {nullptr};
-
-	/// Reference to control pages.
-	struct ircd_gpt_task *ctrl {nullptr};
-
-	/// Current task status.
-	enum status status {'\0'};
-
-	task(const gpt::opts *       = nullptr,
-	     struct ircd_gpt_task *  = nullptr);
-
-	~task() noexcept;
-};
-
-/// The current status of a task is indicated with intelligible characters
-enum ircd::gpt::task::status
-:char
-{
-	QUEUED    = 'Q',  ///< Queued for execution.
-	RUNNING   = 'R',  ///< Currently being executed.
-	ACCEPT    = 'A',  ///< Execution completed successfully.
-	ERROR     = 'E',  ///< Execution did not complete successfully.
-};
-
-static_assert(sizeof(struct ircd_gpt_task) == 4096);
-static_assert(offsetof(struct ircd_gpt_task, token) == 2048);
-static_assert(std::is_standard_layout<struct ircd_gpt_task>::value);
-#endif
--- a/include/ircd/gpt/task/ctrl.h
+++ b/include/ircd/gpt/task/ctrl.h
@ -0,0 +1,58 @@
+// Matrix Construct
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_GPT_TASK_CTRL_H
+
+/// Task Control Page
+///
+/// The control block is shared with our device software. Execution state is
+/// maintained in the task control block across cycles. The control block is
+/// the mutable state component for an execution; for the immutable component
+/// also shared with device software see opts.h.
+///
+struct ircd_gpt_task
+{
+	/// Epoch counting & interrupt control block.
+	struct ircd_gpt_task_epic epic;

+	/// Token context control block. Contains state for the token context
+	/// buffer; the tokens themselves are stored in the `token` array below.
+	struct ircd_gpt_task_tokens tokens;

+	/// Logit softmax state
+	struct ircd_math_samax samax;

+	/// Target label loss state
+	struct ircd_math_mean loss;

+	/// Target label perplexity score state
+	struct ircd_math_mean perp;

+	/// Target label certainty difference state
+	struct ircd_math_mean cert;

+	/// PRNG xoshiro256 state. This is the de facto random seed which can be
+	/// set before cycle entry by the host. It is updated by device software
+	/// when used.
+	ulong rand[4];

+	/// Perform backprop; when set, the host generate() runs backprop() and then clears this flag.
+	bool prop;

+	/// Header magic 0xC7012C70; written by device software in ircd_gpt_leave() and asserted by the host.
+	uint magic;

+	/// The token buffer starts at offset 2048 and continues to the end of
+	/// the page; options specify the size of the tokens buffer in tokens.
+	/// Additional pages must be attached for larger buffer sizes.
+	ushort token[] __attribute__((aligned(2048)));
+}
+__attribute__((aligned(4096)));
--- a/include/ircd/gpt/task/epic.h
+++ b/include/ircd/gpt/task/epic.h
@ -0,0 +1,38 @@
+// Matrix Construct
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_GPT_TASK_EPIC_H
+
+/// Epoch Precision Interrupt Controller
+/// Cycle, epoch and training-step counters plus host timing state; embedded as `epic` in struct ircd_gpt_task.
+struct ircd_gpt_task_epic
+{
+	/// Accumulates the number of task cycles. The cycle counter is incremented
+	/// by device software after each repetition of the kernel pipeline to
+	/// produce one additional token.
+	ulong cycle;

+	/// Accumulates the epoch count for the task. The counter is incremented
+	/// by one in device software before control returns back to the host.
+	/// Several cycles may occur during each epoch.
+	ulong epoch;

+	/// Accumulates the training epoch count for the task. The counter is
+	/// incremented by one in device software for each backward propagation.
+	ulong step;

+	/// Updated by the host with the value of the timestamp register as sampled
+	/// immediately before each transfer of control to the device.
+	ulong host_tsc;

+	/// Accumulates time in milliseconds elapsed for the task (the host adds its stopwatch reading in milliseconds).
+	ulong elapsed;
+};
--- a/include/ircd/gpt/task/gate.h
+++ b/include/ircd/gpt/task/gate.h
@ -0,0 +1,30 @@
+// Matrix Construct
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_GPT_GATE_H
+
+/// Task Gate Descriptor
+/// Up to 8 token codes which generate() appends to the context after the input tokens; a zero code ends the sequence.
+struct ircd_gpt_gate
+{
+	ushort code[8];
+}
+__attribute__((aligned(16)));

+#ifdef __cplusplus
+struct ircd::gpt::gate
+:ircd_gpt_gate
+{
+	gate()
+	:ircd_gpt_gate{0}
+	{}
+};
+#endif
--- a/include/ircd/gpt/task/opts.h
+++ b/include/ircd/gpt/task/opts.h
@ -0,0 +1,123 @@
+// Matrix Construct
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_GPT_OPTS_H
+
+/// Task Options Page
+///
+/// The option block is directly shared with task software as constant data.
+/// This structure and its mutable companion in `ctrl.h` determine the outcome
+/// of the next execution cycle; options are immutable to device software but
+/// may be changed by the host between execution cycles if desired.
+///
+struct ircd_gpt_opts
+{
+	#ifdef __cplusplus
+	ircd_gpt_opts(const ircd::gpt::model::decoder * = nullptr) noexcept;
+	#endif

+	/// Reference to the model (currently not available in device software).
+	#ifndef __cplusplus
+	const intptr_t model;
+	#else
+	const ircd::gpt::model::decoder *model;
+	#endif

+	/// Limit number of output tokens. Default of -1 is unlimited; the number
+	/// of tokens generated will be limited by other factors.
+	uint limit;

+	/// Flip random coins over the top k logits each round. Setting to 1
+	/// deterministically selects the top logit.
+	uint top_k;

+	/// Specifies the token context size in tokens.
+	uint context_tokens;

+	/// Specifies the token buffer size in tokens.
+	uint buffer_tokens;

+	/// Embedding vector elements
+	uint embed_elems;

+	/// Attention unit fcon width multiple
+	uint attn_mult;

+	/// MLP unit fcon width multiple
+	uint ffnn_mult;

+	/// (computed) Attention unit width; `embed_elems` * `attn_mult`
+	uint attn_elems;

+	/// (computed) FFNN unit width; `embed_elems` * `ffnn_mult`
+	uint ffnn_elems;

+	/// SIMD lane count
+	uint lanes;

+	/// (computed) `embed_elems` / `lanes`
+	uint embed_width;

+	/// (computed) Attention unit X dimension; `attn_elems` / `lanes`
+	uint attn_width;

+	/// (computed) Attention unit Y dimension; `embed_elems` / `lanes`
+	uint attn_height;

+	/// (computed) MLP backend X dimension; `ffnn_elems` / `lanes`
+	uint ffnn_width;

+	/// (computed) MLP backend Y dimension; `embed_elems` / `lanes`
+	uint ffnn_height;

+	/// Number of possible target n-grams.
+	uint logits;

+	/// Seed for the task's PRNG.
+	ulong seed;

+	/// Training steps
+	ulong training_steps;

+	/// Validation steps
+	ulong validation_steps;

+	/// Target label
+	ushort label;

+	/// Learning rate
+	float alpha;

+	/// Decay rate
+	float beta[2];

+	/// Denorm smoothing
+	float epsilon;

+	/// Number of gate descriptors attached to this page.
+	uint gates;

+	/// The gate descriptor table starts at offset 2048 and continues to the
+	/// end of the page. For more descriptors additional pages must be
+	/// attached.
+	struct ircd_gpt_gate gate[] __attribute__((aligned(2048)));
+}
+__attribute__((aligned(4096)));

+#ifdef __cplusplus
+namespace ircd::gpt
+{
+	using opts = struct ircd_gpt_opts;
+}

+static_assert(sizeof(struct ircd_gpt_opts) == 4096);
+static_assert(std::is_standard_layout<struct ircd_gpt_opts>::value);
+#endif
--- a/include/ircd/gpt/task/task.h
+++ b/include/ircd/gpt/task/task.h
@ -0,0 +1,56 @@
+// Matrix Construct
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_GPT_TASK_H
+
+#include "epic.h"
+#include "tokens.h"
+#include "gate.h"
+#include "opts.h"
+#include "ctrl.h"
+
+#ifdef __cplusplus
+/// Task Context
+///
+/// C++-only state for a task: pointers to its options page and control page plus a status code.
+struct ircd::gpt::task
+{
+	enum status :char;

+	/// Pointer to the attached options; const, so not modified through the task.
+	const gpt::opts *opts {nullptr};

+	/// Pointer to the control page(s) shared with device software.
+	struct ircd_gpt_task *ctrl {nullptr};

+	/// Current task status; starts as '\0', which is none of the enumerated values.
+	enum status status {'\0'};

+	task(const gpt::opts *       = nullptr,
+	     struct ircd_gpt_task *  = nullptr);

+	~task() noexcept;
+};

+/// The current status of a task is indicated with intelligible characters
+enum ircd::gpt::task::status
+:char
+{
+	QUEUED    = 'Q',  ///< Queued for execution.
+	RUNNING   = 'R',  ///< Currently being executed.
+	ACCEPT    = 'A',  ///< Execution completed successfully.
+	ERROR     = 'E',  ///< Execution did not complete successfully.
+};

+static_assert(sizeof(struct ircd_gpt_task) == 4096);
+static_assert(offsetof(struct ircd_gpt_task, token) == 2048);
+static_assert(std::is_standard_layout<struct ircd_gpt_task>::value);
+#endif
--- a/include/ircd/gpt/task/tokens.h
+++ b/include/ircd/gpt/task/tokens.h
@ -0,0 +1,35 @@
+// Matrix Construct
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_GPT_TASK_TOKENS_H
+
+/// Token Context Buffer (Control Block)
+/// Ring position and counters for the token buffer; embedded as `tokens` in struct ircd_gpt_task.
+struct ircd_gpt_task_tokens
+{
+	/// Token ring head. Tokens in the ring extend behind the head for
+	/// `count`. The `head` value is automatically modulated by device
+	/// software to wrap around the ring.
+	uint head;

+	/// Token counter. The counter indicates the number of valid tokens in
+	/// the context buffer. This value must not exceed the buffer size.
+	uint count;

+	/// Accumulates the number of tokens produced by the task. Several tokens
+	/// may be produced each epoch, but currently only one token is produced
+	/// each cycle.
+	ulong produced;

+	/// Accumulates the number of tokens witnessed by the task. The number of
+	/// tokens in the context for each cycle is counted as witnessed.
+	ulong witnessed;
+};
--- a/ircd/gpt.cc
+++ b/ircd/gpt.cc
@ -88,67 +88,29 @@ ircd::gpt::generate(const vector_view<u16> &out,

 	const auto &opts(*task.opts);
 	auto &ctrl(*task.ctrl);
-	auto &errc(ctrl.error_seq);
-	auto &accc(ctrl.accept_seq);
-	ctrl.tokens = in.size();
-	ctrl.head = 0;
-
-	const size_t tmax
-	{
-		in.size() + opts.limit
-	};
-
-	const vector_view<f32> accum
-	{
-		gpt::scratch, tmax * 768
-	};
-
-	const vector_view<f32> embeds
-	{
-		gpt::embeds, tmax * 768
-	};
+	ctrl.tokens.count = 0;
+	ctrl.tokens.head = 0;

 	for(uint j(0); j < in.size(); ++j)
+		ctrl.token[ctrl.tokens.count++] = in[j];
+
+	for(uint i(0); i < opts.gates; ++i)
+		for(uint k(0); k < 8; ++k)
+		{
+			if(ctrl.tokens.count >= opts.buffer_tokens)
+				break;
+
+			if(opts.gate[i].code[k] == 0)
+				break;
+
+			ctrl.token[ctrl.tokens.count] = opts.gate[i].code[k];
+			ctrl.tokens.count++;
+		}
+
+	const size_t in_size
 	{
-		const vector_view<f32> dst
-		{
-			data(embeds) + j * 768, 768
-		};
-
-		if(ircd::cl::enable)
-			ctrl.token[j] = in[j];
-		else
-			embed(data(dst), in[j], j, opts);
-
-		#if 0 // RB_DEBUG
-		static char dbuf[512] {0};
-		char report[1536] {0};
-		char tmbuf[1][64] {{0}};
-		const size_t report_size = snprintf
-		(
-			report, sizeof(report),
-			"%-4u %4u %4u:%-4u %1u%1u  [ %6.2fL %6.2f%% ] %6.2fL %5.1f%%  %s",
-			ctrl.epoch,
-			ctrl.cycle,
-			j,
-			ctrl.tokens,
-			0,
-			0,
-			0.0,
-			0.0,
-			0.0,
-			0.0,
-			vocab::debug(dbuf, in[j]).c_str()
-		);
-
-		log::logf
-		{
-			log, log::level::DEBUG,
-			"%s",
-			string_view{report, report_size}
-		};
-		#endif
-	}
+		ctrl.tokens.count
+	};

 	uint64_t cycles(0);
 	if(ctrl.prop)
@ -170,7 +132,7 @@ ircd::gpt::generate(const vector_view<u16> &out,
 			cycles
 		};

-		backprop(task, ctrl.loss_mean, *model::default_model, momentum);
+		backprop(task, ctrl.loss.mean, *model::default_model, momentum);
 	}

 	if(ctrl.prop)
@ -178,17 +140,17 @@ ircd::gpt::generate(const vector_view<u16> &out,
 		log::debug
 		{
 			log, "Backpropagation of %2.6f in %lu cycles.",
-			ctrl.loss_mean,
+			ctrl.loss.mean,
 			cycles,
 		};

-		ctrl.epoch = 0;
-		ctrl.loss_mean = 0;
-		ctrl.loss = ctrl.loss_mean;
-		ctrl.perp_mean = 0;
-		ctrl.perp = ctrl.perp_mean;
-		ctrl.cert_mean = 0;
-		ctrl.cert = ctrl.cert_mean;
+		ctrl.epic.epoch = 0;
+		ctrl.loss.mean = 0;
+		ctrl.loss.last = ctrl.loss.mean;
+		ctrl.perp.mean = 0;
+		ctrl.perp.last = ctrl.perp.mean;
+		ctrl.cert.mean = 0;
+		ctrl.cert.last = ctrl.cert.mean;
 		ctrl.prop = false;
 		pipe::default_model->invalid = true;
 		return {};
@ -206,73 +168,49 @@ ircd::gpt::generate(const vector_view<u16> &out,
 		generate(task);
 	}
 	last_time = stopwatch.at<milliseconds>();
-	ctrl.elapsed += last_time.count();
+	ctrl.epic.elapsed += last_time.count();

-	/*
-		coil(data(scratch), tokens, *opts.model);
-		tail(logit, data(last_embed), *opts.model);
-		out[i] = argmax(logit, *opts);
-	*/
-
-	uint accc_thresh[3] {3, 3, 3};
-	for(uint i(0); i < 3; ++i)
-		for(uint j(3); j > 0; --j)
-			if(opts.accept_code[i][j - 1] == -1U)
-				--accc_thresh[i];
-			else
-				break;
-
-	uint errc_thresh[3] {3, 3, 3};
-	for(uint i(0); i < 3; ++i)
-		for(uint j(3); j > 0; --j)
-			if(opts.error_code[i][j - 1] == -1U)
-				--errc_thresh[i];
-			else
-				break;
-
-	for(auto &j(ret); j + in.size() < ctrl.tokens && j < out.size() && !halt; ++j)
+	for(uint j(0); j < ctrl.tokens.count && ret < out.size() && !halt; ++j)
 	{
-		out[j] = ctrl.token[(in.size() + j + ctrl.head) % opts.buffer_tokens];
+		const auto tok
+		{
+			ctrl.token[j]
+		};

-		for(uint j(0); j < 3; ++j)
-			errc[j] = opts.error_code[j][errc[j]] == out[j]?
-				errc[j] + 1: 0;
+		if(j >= in_size)
+			out[ret++] = tok;

-		for(uint j(0); j < 3; ++j)
-			accc[j] = opts.accept_code[j][accc[j]] == out[j]?
-				accc[j] + 1: 0;
-
-		for(uint j(0); j < 3; ++j)
-			halt |= accc_thresh[j] && accc[j] >= accc_thresh[j],
-			halt |= errc_thresh[j] && errc[j] >= errc_thresh[j];
+		if(j < in_size)
+			continue;

 		static char dbuf[512] {0};
 		char report[1536] {0};
 		char tmbuf[4][64] {0};
-		const size_t bsz(ctrl.tokens - in.size());
+		const size_t bsz(ctrl.tokens.count - in_size);
 		const size_t report_size = snprintf
 		(
 			report, sizeof(report),
-			"%4lu:%-4u %4lu:%-4lu %6.1f%% %5.1fP %6.3fL [%c%c%c] %5u %6.3fL %6.2fP  %5.1f%% %s %04x  %8s %8s | %8s",
-			j + in.size(),
-			ctrl.tokens,
-			ctrl.epoch,
-			ctrl.cycle,
-			std::clamp(ctrl.cert_mean * 100.0f, 0.0f, 100.0f),
-			std::clamp(ctrl.perp_mean, 0.0f, 100.0f),
-			std::clamp(ctrl.loss_mean, 0.0f, 99.99f),
-			opts.label == out[j]? '+': ' ',
-			accc[0] + accc[1] + accc[2] >= 3? 'A': ' ',
-			errc[0] + errc[1] + errc[2] >= 3? 'E': ' ',
+			"%-3u %4u:%-4u %4lu:%-4lu %6.1f%% %5.1fP %6.3fL [%c%c%c] %5u %6.3fL %6.2fP  %5.1f%% %s %04x  %8s %8s | %8s",
+			j,
+			ret - 1,
+			ctrl.tokens.count,
+			ctrl.epic.epoch,
+			ctrl.epic.cycle,
+			std::clamp(ctrl.cert.mean * 100.0f, 0.0f, 100.0f),
+			std::clamp(ctrl.perp.mean, 0.0f, 100.0f),
+			std::clamp(ctrl.loss.mean, 0.0f, 99.99f),
+			opts.label == tok? '+': ' ',
+			' ', // flag place
+			' ', // flag place
 			opts.label,
-			std::clamp(ctrl.loss, 0.0f, 99.99f),
-			std::clamp(ctrl.perp, 0.0f, 100.0f),
-			std::clamp(ctrl.cert * 100.0f, 0.0f, 100.0f),
-			vocab::debug(dbuf, out[j]).c_str(),
-			out[j],
+			std::clamp(ctrl.loss.last, 0.0f, 99.99f),
+			std::clamp(ctrl.perp.last, 0.0f, 100.0f),
+			std::clamp(ctrl.cert.last * 100.0f, 0.0f, 100.0f),
+			vocab::debug(dbuf, tok).c_str(),
+			tok,
 			pretty(tmbuf[0], milliseconds(last_time / bsz), 1).c_str(),
 			pretty(tmbuf[1], si(cycles / bsz), 1).c_str(),
-			pretty(tmbuf[2], milliseconds(ctrl.elapsed), 1).c_str()
+			pretty(tmbuf[2], milliseconds(ctrl.epic.elapsed), 1).c_str()
 		);

 		log::logf
@ -283,19 +221,6 @@ ircd::gpt::generate(const vector_view<u16> &out,
 		};
 	}

-	ret = ctrl.tokens - in.size();
-	if ((false)) for(uint i(0); i < 3; ++i)
-		if(accc_thresh[i] && ctrl.accept_seq[i] >= accc_thresh[i])
-		{
-			ret -= (3 - accc_thresh[i]);
-			break;
-		}
-		else if(errc_thresh[i] && ctrl.error_seq[i] >= errc_thresh[i])
-		{
-			ret -= (3 - errc_thresh[i]);
-			break;
-		}
-
 	ctx::interruption_point();
 	return vector_view<u16>
 	{
@ -689,6 +614,7 @@ ircd::gpt::gelu(f32x4 &out,
 // backside
 //

+[[gnu::noinline]]
 size_t
 ircd::gpt::backprop(task &task,
                    const f32 grad,
@ -792,6 +718,7 @@ ircd::gpt::backprop(task &task,
 	return off;
 }

+[[gnu::noinline]]
 size_t
 ircd::gpt::adamw(task &task,
                 const f32 grad,
@ -820,7 +747,7 @@ ircd::gpt::adamw(task &task,
 	};

 	for(uint i(0); i < num / 4; ++i)
-		off = adamw(p[0][i], p[1][i], p[2][i], grad, opts.alpha, opts.beta[0], opts.beta[1], ctrl.step, off);
+		off = adamw(p[0][i], p[1][i], p[2][i], grad, opts.alpha, opts.beta[0], opts.beta[1], ctrl.epic.step, off);

 	return off;
 }
@ -915,19 +842,111 @@ noexcept
 }

 //
-// hypercall
+// gpt::opts
 //

-ircd::string_view
-ircd::gpt::reflect(const enum ircd_gpt_hypercall code)
+ircd_gpt_opts::ircd_gpt_opts(const ircd::gpt::model::decoder *const model)
 noexcept
+:model
+{
+	model
+}
+,limit
+{
+	-1U // unlimited output tokens by default
+}
+,top_k
+{
+	2U
+}
+,context_tokens
+{
+	1024U
+}
+,buffer_tokens
+{
+	1024U
+}
+,embed_elems
+{
+	768U
+}
+,attn_mult
+{
+	3U
+}
+,ffnn_mult
+{
+	4U
+}
+,attn_elems
+{
+	embed_elems * attn_mult
+}
+,ffnn_elems
+{
+	embed_elems * ffnn_mult
+}
+,lanes
+{
+	4U
+}
+,embed_width
+{
+	embed_elems / lanes
+}
+,attn_width
+{
+	attn_elems / lanes
+}
+,attn_height
+{
+	embed_elems / lanes
+}
+,ffnn_width
+{
+	ffnn_elems / lanes
+}
+,ffnn_height
+{
+	embed_elems / lanes
+}
+,logits
+{
+	50257
+}
+,seed
+{
+	1234567890UL
+}
+,training_steps
+{
+	250000
+}
+,validation_steps
+{
+	5000
+}
+,label
+{
+	198
+}
+,alpha
+{
+	0.001f
+}
+,beta
+{
+	0.9f, // Beta1
+	0.999f, // Beta2
+}
+,epsilon
+{
+	0.000001
+}
+,gates
+{
+	0 // no gate descriptors attached by default
+}
 {
-	switch(code)
-	{
-		case IRCD_GPT_ACCEPT:      return "ACCEPT";
-		case IRCD_GPT_ECOMPLETE:   return "ECOMPLETE";
-		case IRCD_GPT_ETOKENS:     return "ETOKENS";
-	}
-
-	return "??????";
 }
--- a/ircd/gpt_cl.cl
+++ b/ircd/gpt_cl.cl
@ -432,7 +432,7 @@ _ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
                   const uint word_idx)
 {
 	const ushort
-	ring_idx = (ctrl->head + tok_idx) % opts->buffer_tokens,
+	ring_idx = (ctrl->tokens.head + tok_idx) % opts->buffer_tokens,
 	token = ctrl->token[ring_idx];

 	const float4
@ -454,7 +454,7 @@ ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
 	wi = get_group_id(0),
 	wn = get_num_groups(0);

-	for(uint i = 0; i < ctrl->tokens; ++i)
+	for(uint i = 0; i < ctrl->tokens.count; ++i)
 		if(i % wn == wi)
 			_ircd_gpt_lm_embed(ctrl, opts, accum, pos, vocab, i, i, li);
 }
@ -492,7 +492,7 @@ ircd_gpt_lm_logit(__global const struct ircd_gpt_task *const ctrl,
 {
 	const uint
 	gi = get_global_id(0),
-	ti = ctrl->tokens - 1,
+	ti = ctrl->tokens.count - 1,
 	words = opts->embed_width;

 	float4 acc = 0.0f;
@ -596,31 +596,16 @@ ircd_gpt_leave(__global struct ircd_gpt_task *const ctrl,
               __constant const struct ircd_gpt_opts *const opts,
               const uint li)
 {
-	// If the call value has been set to something other than default we
-	// do nothing else here.
-	if(ctrl->call != IRCD_GPT_ECOMPLETE)
-		return;
-
 	// No action for other threads right now
 	if(li != 0)
 		return;

-	// Run debug checks and assertions.
-	#ifdef RB_DEBUG
-	if(ctrl->call == IRCD_GPT_ECOMPLETE)
-		if(ctrl->tokens < 2)
-			ctrl->call = IRCD_GPT_ETOKENS;
-	#endif
-
 	// On the last cycle, with no prior call or error code set, indicate
 	// a nominal exit condition.
-	if(ctrl->cycle + 1 >= opts->limit)
-	{
-		ctrl->call = IRCD_GPT_ACCEPT;
-		ctrl->epoch += 1;
-	}
+	if(ctrl->epic.cycle + 1 >= opts->limit)
+		ctrl->epic.epoch += 1;

-	ctrl->cycle += 1;
+	ctrl->epic.cycle += 1;
 	ctrl->magic = 0xC7012C70U;
 }

@ -634,10 +619,6 @@ ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl,
                   __global const float *const restrict logexp,
                   __global const float *const restrict logit)
 {
-	// When the hypercall code is already set, bail here.
-	if(ctrl->call != IRCD_GPT_ECOMPLETE)
-		return;
-
 	// To read from cells other than idx[0] we need this barrier.
 	if(opts->top_k > 1)
 		barrier(CLK_LOCAL_MEM_FENCE);
@ -647,7 +628,7 @@ ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl,
 		return;

 	const bool
-	buffer_full = ctrl->tokens >= opts->buffer_tokens;
+	buffer_full = ctrl->tokens.count >= opts->buffer_tokens;

 	const ulong
 	rnd = opts->top_k > 1?
@ -657,20 +638,20 @@ ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl,
 	entro = max(opts->top_k, 1U),
 	select = rnd % entro,
 	token = idx[select],
-	dest = (ctrl->head + ctrl->tokens) % opts->buffer_tokens,
-	tokens = min(ctrl->tokens + 1, opts->buffer_tokens),
+	dest = (ctrl->tokens.head + ctrl->tokens.count) % opts->buffer_tokens,
+	tokens = min(ctrl->tokens.count + 1, opts->buffer_tokens),
 	head = buffer_full?
-		(ctrl->head + 1) % opts->buffer_tokens: ctrl->head;
+		(ctrl->tokens.head + 1) % opts->buffer_tokens: ctrl->tokens.head;

-	ctrl->head = head;
-	ctrl->tokens = tokens;
+	ctrl->tokens.head = head;
+	ctrl->tokens.count = tokens;
 	ctrl->token[dest] = token;

 	const ushort
 	ln = get_local_size(0),
 	next_select = (select + 1) % ln,
 	next_token = idx[next_select],
-	sum_sel = ctrl->epoch % 3;
+	sum_sel = ctrl->epic.epoch % 3;

 	const float
 	test_lsm = logexp[opts->label],
@ -737,7 +718,7 @@ ircd_gpt_prop_elem(__global const struct ircd_gpt_task *const ctrl,
 {
 	const uint
 	li = get_local_id(0),
-	step = ctrl->step;
+	step = ctrl->epic.step;

 	const float4
 	param = param_[li],
--- a/ircd/gpt_pipe.cc
+++ b/ircd/gpt_pipe.cc
@ -129,12 +129,11 @@ ircd::gpt::generate(task &task)
 		*task.ctrl
 	};

-	ctrl.cycle = 0;
-	ctrl.call = IRCD_GPT_ECOMPLETE;
-	ctrl.host_tsc = prof::cycles();
-	volatile const size_t tokens(ctrl.tokens);
-	volatile const auto epoch(ctrl.epoch);
-	volatile size_t cycle(ctrl.cycle);
+	ctrl.epic.cycle = 0;
+	ctrl.epic.host_tsc = prof::cycles();
+	volatile const size_t tokens(ctrl.tokens.count);
+	volatile const auto epoch(ctrl.epic.epoch);
+	volatile size_t cycle(ctrl.epic.cycle);

 	std::deque<pipe::exec> list;
 	for(; cycle < opts.limit; ++cycle)
@ -151,8 +150,7 @@ ircd::gpt::generate(task &task)
 			task, tokens + cycle, rel, acq
 		);

-		// Conditions for a cl::flush here; this is not default but
-		// may be configured to improve some workloads.
+		// Conditions for a cl::flush here
 		const bool flush
 		{
 			// Flushing here is enabled by the configuration
@ -194,18 +192,8 @@ ircd::gpt::generate(task &task)
 	list.clear();

 	assert(ctrl.magic == 0xC7012C70);
+	assert(ctrl.epic.cycle == cycle || ctx::interruption_requested());
 	this_ctx::interruption_point();
-
-	// Interp error codes
-	if(unlikely(ctrl.call <= 0))
-		throw error
-		{
-			"hyper (#%d) :%s",
-			abs(int(ctrl.call)),
-			reflect(ctrl.call),
-		};
-
-	assert(ctrl.cycle == cycle || ctx::interruption_requested());
 }

 void