Mirror of https://github.com/matrix-construct/construct
Synced 2024-11-17 23:40:57 +01:00

ircd::gpt: Reorganize task options and control blocks.

parent 3e9c2d1b56
commit 37b1d47c8d

12 changed files with 527 additions and 643 deletions
@@ -17,8 +17,8 @@ namespace ircd::gpt
 {
 	IRCD_EXCEPTION(ircd::error, error)
 
-	struct opts;
 	struct task;
+	struct gate;
 
 	extern log::log log;
 }
@@ -27,7 +27,6 @@ namespace ircd::gpt
 #include "vocab.h"
 #include "model.h"
 #include "token.h"
-#include "opts.h"
-#include "task.h"
+#include "task/task.h"
 #include "pipe/pipe.h"
 #include "generate.h"
Deleted file:

@@ -1,277 +0,0 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

#pragma once
#define HAVE_IRCD_GPT_OPTS_H

/// Task Options Page
///
/// The option block is directly shared with task software as constant data.
/// This structure and its mutable companion in `task.h` determine the outcome
/// of the next execution cycle; options are immutable to device software but
/// may be changed by the host between execution cycles if desired.
///
struct ircd_gpt_opts
{
	/// Specifies the nominal halting condition based on a sequence of tokens.
	/// Generation will complete with success after one of these sequences is
	/// witnessed. Set tokens to -1 starting from the back for shorter
	/// sequences; zero-length sequences (all -1's) are never matched.
	uint accept_code[4][4]
	#ifdef __cplusplus
	{
		{  13, 198, -1U, -1U, },
		{ 198, 198, -1U, -1U, },
		{ -1U, -1U, -1U, -1U, },
		{ -1U, -1U, -1U, -1U, },
	}
	#endif
	;

	/// Specifies the exceptional halting condition based on the sequence of
	/// tokens. By default, the three zeros represent three outputs of '!'
	/// which is probably an error; note that a true "!!!" is represented by
	/// token number 10185. Set tokens to -1 starting from the back to not
	/// match that token; generated output after errors is usually garbage.
	uint error_code[4][4]
	#ifdef __cplusplus
	{
		{   0,   0,   0, -1U, },
		{ -1U, -1U, -1U, -1U, },
		{ -1U, -1U, -1U, -1U, },
		{ -1U, -1U, -1U, -1U, },
	}
	#endif
	;

	/// Limit number of output tokens. Default of -1 is unlimited; the number
	/// of tokens generated will be limited by other factors.
	uint limit
	#ifdef __cplusplus
	{ 1 }
	#endif
	;

	/// Flip random coins over the top k logits each round. Setting to 1
	/// deterministically selects the top logit.
	uint top_k
	#ifdef __cplusplus
	{ 2 }
	#endif
	;

	/// Specifies the token context size in tokens.
	uint context_tokens
	#ifdef __cplusplus
	{ 1024 }
	#endif
	;

	/// Specifies the token buffer size in tokens.
	uint buffer_tokens
	#ifdef __cplusplus
	{ 1024 }
	#endif
	;

	/// Embedding vector elements
	uint embed_elems
	#ifdef __cplusplus
	{ 768 }
	#endif
	;

	/// Attention unit fcon width multiple
	uint attn_mult
	#ifdef __cplusplus
	{ 3U }
	#endif
	;

	/// MLP unit fcon width multiple
	uint ffnn_mult
	#ifdef __cplusplus
	{ 4U }
	#endif
	;

	/// Attention unit width multiple
	uint attn_elems
	#ifdef __cplusplus
	{ embed_elems * attn_mult }
	#endif
	;

	/// FFNN unit width multiple
	uint ffnn_elems
	#ifdef __cplusplus
	{ embed_elems * ffnn_mult }
	#endif
	;

	/// SIMD lane count
	uint lanes
	#ifdef __cplusplus
	{ 4U }
	#endif
	;

	uint embed_width
	#ifdef __cplusplus
	{ embed_elems / lanes }
	#endif
	;

	uint attn_width
	#ifdef __cplusplus
	{ attn_elems / lanes }
	#endif
	;

	uint attn_height
	#ifdef __cplusplus
	{ embed_elems / lanes }
	#endif
	;

	uint ffnn_width
	#ifdef __cplusplus
	{ ffnn_elems / lanes }
	#endif
	;

	uint ffnn_height
	#ifdef __cplusplus
	{ embed_elems / lanes }
	#endif
	;

	/// Number of possible target n-grams.
	uint logits
	#ifdef __cplusplus
	{ 50257 }
	#endif
	;

	/// Seed for the task's PRNG.
	ulong seed
	#ifdef __cplusplus
	{ 1234567890UL }
	#endif
	;

	/// Training steps
	ulong training_steps
	#ifdef __cplusplus
	{ 250000 }
	#endif
	;

	/// Validation steps
	ulong validation_steps
	#ifdef __cplusplus
	{ 5000 }
	#endif
	;

	ushort label
	#ifdef __cplusplus
	{ 198 }
	#endif
	;

	float alpha
	#ifdef __cplusplus
	{ 0.001 }
	#endif
	;

	float beta[2]
	#ifdef __cplusplus
	{
		0.9,   // Beta1
		0.999, // Beta2
	}
	#endif
	;

	float epsilon
	#ifdef __cplusplus
	{ 0.000001 }
	#endif
	;
}
__attribute__((aligned(4096)));

#ifdef __cplusplus
/// Generator Task Options.
///
/// Parameters for a task. Options are constant and one instance can be shared
/// between multiple task instances. This structure extends the task options
/// page, starting a new page which is not visible to device software; C++ and
/// host pointers are available.
///
struct ircd::gpt::opts
:ircd_gpt_opts
{
	/// Pointer to the model
	const model::decoder *model
	{
		model::default_model
	};
};

static_assert(sizeof(struct ircd_gpt_opts) == 4096);
static_assert(std::is_standard_layout<struct ircd_gpt_opts>::value);
#endif
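The accept_code/error_code convention above is easiest to see with a small sketch. The following is illustrative only; it mirrors the per-row sequence counters kept by the old ircd/gpt.cc (removed further below in this commit) and is not an API of the source. A row matches when its non--1U prefix is produced in order by the model, and an all--1U row never matches. It uses the same `uint` spelling as the header above.

	// Returns true when this halting row has been fully matched by the
	// stream of output tokens fed to it one at a time.
	static bool
	seq_matched(const uint (&code)[4], uint &counter, const uint out_token)
	{
		// Effective length of the row; -1U entries pad unused positions at the back.
		uint len(4);
		while(len && code[len - 1] == -1U)
			--len;

		if(!len)
			return false;   // all -1U: disabled row

		// Advance the counter while the output continues the sequence, else reset.
		counter = code[counter] == out_token? counter + 1: 0;
		return counter >= len;
	}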
Deleted file:

@@ -1,166 +0,0 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

#pragma once
#define HAVE_IRCD_GPT_TASK_H

/// Task Control Page
///
/// The control block is shared with our device software. Execution state is
/// maintained in the task control block across cycles. The control block is
/// the mutable state component for an execution; for the immutable component
/// also shared with device software see opts.h.
///
struct ircd_gpt_task
{
	/// Header magic 0xC7012C70
	uint magic;

	/// Hypercall code set by our device software upon completion and control
	/// transfer back to the host. Negative codes indicate errors, positive
	/// codes are used for status and/or procedure calls; zero is also an error.
	enum ircd_gpt_hypercall call;

	/// Token ring head. Tokens in the ring extend behind the head for
	/// `tokens`. The `head` value is automatically modulated by device
	/// software to wrap around the ring.
	uint head;

	/// Token counter. The counter indicates the number of valid tokens in
	/// the context buffer. This value must not exceed the buffer size.
	uint tokens;

	/// Accumulates the number of task cycles. The cycle counter is incremented
	/// by device software after each repetition of the kernel pipeline to
	/// produce one additional token.
	ulong cycle;

	/// Accumulates the epoch count for the task. The counter is incremented
	/// by one in device software before control returns back to the host.
	/// Several cycles may occur during each epoch.
	ulong epoch;

	/// Accumulates the training epoch count for the task. The counter is
	/// incremented by one in device software for each backward propagation.
	ulong step;

	/// Accumulates the number of tokens produced by the task. Several tokens
	/// may be produced each epoch, but currently only one token is produced
	/// each cycle.
	ulong produced;

	/// Accumulates the number of tokens witnessed by the task. The number of
	/// tokens in the context for each cycle is counted as witnessed.
	ulong witnessed;

	/// Accumulates time in microseconds elapsed for the task.
	ulong elapsed;

	/// PRNG xoshiro256 state. This is the de facto random seed which can be
	/// set before cycle entry by the host. It is updated by device software
	/// when used.
	ulong rand[4];

	/// Updated by the host with the value of the timestamp register as sampled
	/// immediately before each transfer of control to the device.
	ulong host_tsc;

	/// State counters for the accept/error sequence codes.
	uint accept_seq[4], error_seq[4];

	/// Logit softmax mu
	float samax_mu;

	/// Logit softmax sum
	float samax_sum;

	/// Logit softmax lambda
	float samax_lambda;

	/// Loss for last token of last cycle
	float loss;

	/// Sum loss over all cycles
	float loss_sum[4];

	/// Average loss over all cycles
	float loss_mean;

	/// Perplexity score for last token of last cycle
	float perp;

	/// Sum ppl over all cycles
	float perp_sum[4];

	/// Perplexity mean over context
	float perp_mean;

	/// Certainty difference score for last token of last cycle
	float cert;

	/// Sum certainty over all cycles
	float cert_sum[4];

	/// Certainty mean over context
	float cert_mean;

	/// Final loss
	float l2_loss;

	/// Final loss mean
	float l2_loss_mean;

	/// Perform backprop
	bool prop;

	/// The token buffer starts at offset 2048 and continues to the end of
	/// the page; options specify the size of the tokens buffer in tokens.
	/// Additional pages must be attached for larger buffer sizes.
	ushort token[] __attribute__((aligned(2048)));
}
__attribute__((aligned(4096)));

#ifdef __cplusplus
/// Task Context
///
/// State for a task.
struct ircd::gpt::task
{
	enum status :char;

	/// Reference to the attached options.
	const gpt::opts *opts {nullptr};

	/// Reference to control pages.
	struct ircd_gpt_task *ctrl {nullptr};

	/// Current task status.
	enum status status {'\0'};

	task(const gpt::opts * = nullptr,
	     struct ircd_gpt_task * = nullptr);

	~task() noexcept;
};

/// The current status of a task is indicated with intelligible characters
enum ircd::gpt::task::status
:char
{
	QUEUED   = 'Q',  ///< Queued for execution.
	RUNNING  = 'R',  ///< Currently being executed.
	ACCEPT   = 'A',  ///< Execution completed successfully.
	ERROR    = 'E',  ///< Execution did not complete successfully.
};

static_assert(sizeof(struct ircd_gpt_task) == 4096);
static_assert(offsetof(struct ircd_gpt_task, token) == 2048);
static_assert(std::is_standard_layout<struct ircd_gpt_task>::value);
#endif
include/ircd/gpt/task/ctrl.h (new file, 58 lines)

@@ -0,0 +1,58 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

#pragma once
#define HAVE_IRCD_GPT_TASK_CTRL_H

/// Task Control Page
///
/// The control block is shared with our device software. Execution state is
/// maintained in the task control block across cycles. The control block is
/// the mutable state component for an execution; for the immutable component
/// also shared with device software see opts.h.
///
struct ircd_gpt_task
{
	/// Epoch counting & interrupt control block.
	struct ircd_gpt_task_epic epic;

	/// Token context control block. Contains state for the token context
	/// buffer; the buffer with the tokens themselves is elsewhere.
	struct ircd_gpt_task_tokens tokens;

	/// Logit softmax state
	struct ircd_math_samax samax;

	/// Target label loss state
	struct ircd_math_mean loss;

	/// Target label perplexity score state
	struct ircd_math_mean perp;

	/// Target label certainty difference state
	struct ircd_math_mean cert;

	/// PRNG xoshiro256 state. This is the de facto random seed which can be
	/// set before cycle entry by the host. It is updated by device software
	/// when used.
	ulong rand[4];

	/// Perform backprop
	bool prop;

	/// Header magic 0xC7012C70
	uint magic;

	/// The token buffer starts at offset 2048 and continues to the end of
	/// the page; options specify the size of the tokens buffer in tokens.
	/// Additional pages must be attached for larger buffer sizes.
	ushort token[] __attribute__((aligned(2048)));
}
__attribute__((aligned(4096)));
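As a usage note (not from the source): with the new layout a host-side status dump needs only the aggregate blocks above. The sketch below assumes this header and its companions are included, and that `ircd_math_mean` exposes the `mean` member the same way ircd/gpt.cc below reads it (`ctrl.loss.mean`, `ctrl.perp.mean`); the formatting is illustrative.

	#include <cassert>
	#include <cstdio>

	// Minimal host-side inspection of a control page between cycles.
	static void
	dump_ctrl(const struct ircd_gpt_task &ctrl)
	{
		assert(ctrl.magic == 0xC7012C70U);
		std::printf("epoch:%lu cycle:%lu tokens:%u loss:%f perp:%f\n",
		            ctrl.epic.epoch,
		            ctrl.epic.cycle,
		            ctrl.tokens.count,
		            double(ctrl.loss.mean),
		            double(ctrl.perp.mean));
	}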
include/ircd/gpt/task/epic.h (new file, 38 lines)

@@ -0,0 +1,38 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

#pragma once
#define HAVE_IRCD_GPT_TASK_EPIC_H

/// Epoch Precision Interrupt Controller
///
struct ircd_gpt_task_epic
{
	/// Accumulates the number of task cycles. The cycle counter is incremented
	/// by device software after each repetition of the kernel pipeline to
	/// produce one additional token.
	ulong cycle;

	/// Accumulates the epoch count for the task. The counter is incremented
	/// by one in device software before control returns back to the host.
	/// Several cycles may occur during each epoch.
	ulong epoch;

	/// Accumulates the training epoch count for the task. The counter is
	/// incremented by one in device software for each backward propagation.
	ulong step;

	/// Updated by the host with the value of the timestamp register as sampled
	/// immediately before each transfer of control to the device.
	ulong host_tsc;

	/// Accumulates time in microseconds elapsed for the task.
	ulong elapsed;
};
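For example (illustrative only, not an API of the source), the cycle and elapsed accumulators give a rough per-cycle latency; the result is in whatever unit `elapsed` accumulates.

	// Average time per cycle from the epic counters; returns 0 before any cycle.
	static double
	avg_cycle_time(const struct ircd_gpt_task_epic &epic)
	{
		return epic.cycle?
			double(epic.elapsed) / double(epic.cycle):
			0.0;
	}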
include/ircd/gpt/task/gate.h (new file, 30 lines)

@@ -0,0 +1,30 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

#pragma once
#define HAVE_IRCD_GPT_GATE_H

/// Task Gate Descriptor
///
struct ircd_gpt_gate
{
	ushort code[8];
}
__attribute__((aligned(16)));

#ifdef __cplusplus
struct ircd::gpt::gate
:ircd_gpt_gate
{
	gate()
	:ircd_gpt_gate{0}
	{}
};
#endif
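Gate descriptors are consumed by the new prompt-assembly loop in ircd/gpt.cc below, which appends each `code[k]` into the token ring until a zero code or a full buffer. A hedged host-side sketch of attaching one descriptor to an options page; the token value and the free space in the page's gate table are assumptions of this sketch, not guarantees of the source.

	static void
	attach_gate(struct ircd_gpt_opts &opts, const ushort some_token)
	{
		struct ircd_gpt_gate gate {0};
		gate.code[0] = some_token;   // placeholder token value
		gate.code[1] = 0;            // zero terminates the code sequence

		// Assumes the gate table region at the back of the page has room.
		opts.gate[opts.gates] = gate;
		opts.gates += 1;
	}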
include/ircd/gpt/task/opts.h (new file, 123 lines)

@@ -0,0 +1,123 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

#pragma once
#define HAVE_IRCD_GPT_OPTS_H

/// Task Options Page
///
/// The option block is directly shared with task software as constant data.
/// This structure and its mutable companion in `task.h` determine the outcome
/// of the next execution cycle; options are immutable to device software but
/// may be changed by the host between execution cycles if desired.
///
struct ircd_gpt_opts
{
	#ifdef __cplusplus
	ircd_gpt_opts(const ircd::gpt::model::decoder * = nullptr) noexcept;
	#endif

	/// Reference to the model (currently not available in device software).
	#ifndef __cplusplus
	const intptr_t model;
	#else
	const ircd::gpt::model::decoder *model;
	#endif

	/// Limit number of output tokens. Default of -1 is unlimited; the number
	/// of tokens generated will be limited by other factors.
	uint limit;

	/// Flip random coins over the top k logits each round. Setting to 1
	/// deterministically selects the top logit.
	uint top_k;

	/// Specifies the token context size in tokens.
	uint context_tokens;

	/// Specifies the token buffer size in tokens.
	uint buffer_tokens;

	/// Embedding vector elements
	uint embed_elems;

	/// Attention unit fcon width multiple
	uint attn_mult;

	/// MLP unit fcon width multiple
	uint ffnn_mult;

	/// (computed) Attention unit width multiple
	uint attn_elems;

	/// (computed) FFNN unit width multiple
	uint ffnn_elems;

	/// SIMD lane count
	uint lanes;

	/// (computed) `embed_elems` / `lanes`
	uint embed_width;

	/// (computed) Attention unit X dimension
	uint attn_width;

	/// (computed) Attention unit Y dimension
	uint attn_height;

	/// (computed) MLP backend X dimension
	uint ffnn_width;

	/// (computed) MLP backend Y dimension
	uint ffnn_height;

	/// Number of possible target n-grams.
	uint logits;

	/// Seed for the task's PRNG.
	ulong seed;

	/// Training steps
	ulong training_steps;

	/// Validation steps
	ulong validation_steps;

	/// Target label
	ushort label;

	/// Learning rate
	float alpha;

	/// Decay rate
	float beta[2];

	/// Denorm smoothing
	float epsilon;

	/// Number of gate descriptors attached to this page.
	uint gates;

	/// The gate descriptor table starts at offset 2048 and continues to the
	/// end of the page. For more descriptors additional pages must be
	/// attached.
	struct ircd_gpt_gate gate[] __attribute__((aligned(2048)));
}
__attribute__((aligned(4096)));

#ifdef __cplusplus
namespace ircd::gpt
{
	using opts = struct ircd_gpt_opts;
}

static_assert(sizeof(struct ircd_gpt_opts) == 4096);
static_assert(std::is_standard_layout<struct ircd_gpt_opts>::value);
#endif
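For reference, the defaults assigned by the new `ircd_gpt_opts` constructor in ircd/gpt.cc further below (embed_elems 768, attn_mult 3, ffnn_mult 4, lanes 4) yield the following derived geometry:

	attn_elems  = embed_elems * attn_mult = 768 * 3  = 2304
	ffnn_elems  = embed_elems * ffnn_mult = 768 * 4  = 3072
	embed_width = embed_elems / lanes     = 768 / 4  = 192
	attn_width  = attn_elems / lanes      = 2304 / 4 = 576
	attn_height = embed_elems / lanes     = 768 / 4  = 192
	ffnn_width  = ffnn_elems / lanes      = 3072 / 4 = 768
	ffnn_height = embed_elems / lanes     = 768 / 4  = 192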
include/ircd/gpt/task/task.h (new file, 56 lines)

@@ -0,0 +1,56 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

#pragma once
#define HAVE_IRCD_GPT_TASK_H

#include "epic.h"
#include "tokens.h"
#include "gate.h"
#include "opts.h"
#include "ctrl.h"

#ifdef __cplusplus
/// Task Context
///
/// State for a task.
struct ircd::gpt::task
{
	enum status :char;

	/// Reference to the attached options.
	const gpt::opts *opts {nullptr};

	/// Reference to control pages.
	struct ircd_gpt_task *ctrl {nullptr};

	/// Current task status.
	enum status status {'\0'};

	task(const gpt::opts * = nullptr,
	     struct ircd_gpt_task * = nullptr);

	~task() noexcept;
};

/// The current status of a task is indicated with intelligible characters
enum ircd::gpt::task::status
:char
{
	QUEUED   = 'Q',  ///< Queued for execution.
	RUNNING  = 'R',  ///< Currently being executed.
	ACCEPT   = 'A',  ///< Execution completed successfully.
	ERROR    = 'E',  ///< Execution did not complete successfully.
};

static_assert(sizeof(struct ircd_gpt_task) == 4096);
static_assert(offsetof(struct ircd_gpt_task, token) == 2048);
static_assert(std::is_standard_layout<struct ircd_gpt_task>::value);
#endif
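A minimal construction sketch, not from the source: it assumes static storage for the two page-sized blocks is acceptable, relies on the same GNU extensions the headers themselves use (flexible array members, aligned attributes), takes the defaults from the `ircd_gpt_opts` constructor, and leaves the control page zero-initialized for the first cycle.

	using namespace ircd;

	static gpt::opts opts;                   // defaults from the ctor in ircd/gpt.cc
	static struct ircd_gpt_task ctrl;        // zeroed control page; device/host fill it in

	static gpt::task task
	{
		&opts, &ctrl
	};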
include/ircd/gpt/task/tokens.h (new file, 35 lines)

@@ -0,0 +1,35 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

#pragma once
#define HAVE_IRCD_GPT_TASK_TOKENS_H

/// Token Context Buffer (Control Block)
///
struct ircd_gpt_task_tokens
{
	/// Token ring head. Tokens in the ring extend behind the head for
	/// `tokens`. The `head` value is automatically modulated by device
	/// software to wrap around the ring.
	uint head;

	/// Token counter. The counter indicates the number of valid tokens in
	/// the context buffer. This value must not exceed the buffer size.
	uint count;

	/// Accumulates the number of tokens produced by the task. Several tokens
	/// may be produced each epoch, but currently only one token is produced
	/// each cycle.
	ulong produced;

	/// Accumulates the number of tokens witnessed by the task. The number of
	/// tokens in the context for each cycle is counted as witnessed.
	ulong witnessed;
};
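The ring convention these fields describe is the one the device code below relies on: logical position i lives at `(head + i) % buffer_tokens`, and an append either grows `count` or, once the ring is full, advances `head` over the oldest token. Note that with the default `buffer_tokens` of 1024 and 16-bit tokens the ring exactly fills the back half (offset 2048) of the 4 KiB control page. A host-side mirror of that logic, as a sketch only; the source keeps this on the device in `ircd_gpt_lm_result`.

	static uint
	ring_idx(const struct ircd_gpt_task_tokens &tokens, const uint i, const uint buffer_tokens)
	{
		return (tokens.head + i) % buffer_tokens;
	}

	static void
	ring_push(struct ircd_gpt_task_tokens &tokens,
	          ushort *const token,              // the buffer at offset 2048 of the control page
	          const ushort value,
	          const uint buffer_tokens)
	{
		const uint dest((tokens.head + tokens.count) % buffer_tokens);
		token[dest] = value;

		if(tokens.count >= buffer_tokens)
			tokens.head = (tokens.head + 1) % buffer_tokens;  // full: overwrite the oldest
		else
			tokens.count += 1;
	}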
ircd/gpt.cc (309)

@@ -88,67 +88,29 @@ ircd::gpt::generate(const vector_view<u16> &out,
 
 	const auto &opts(*task.opts);
 	auto &ctrl(*task.ctrl);
-	auto &errc(ctrl.error_seq);
-	auto &accc(ctrl.accept_seq);
-	ctrl.tokens = in.size();
-	ctrl.head = 0;
-
-	const size_t tmax
-	{
-		in.size() + opts.limit
-	};
-
-	const vector_view<f32> accum
-	{
-		gpt::scratch, tmax * 768
-	};
-
-	const vector_view<f32> embeds
-	{
-		gpt::embeds, tmax * 768
-	};
+	ctrl.tokens.count = 0;
+	ctrl.tokens.head = 0;
 
 	for(uint j(0); j < in.size(); ++j)
-	{
-		const vector_view<f32> dst
-		{
-			data(embeds) + j * 768, 768
-		};
-
-		if(ircd::cl::enable)
-			ctrl.token[j] = in[j];
-		else
-			embed(data(dst), in[j], j, opts);
-
-		#if 0 // RB_DEBUG
-		static char dbuf[512] {0};
-		char report[1536] {0};
-		char tmbuf[1][64] {{0}};
-		const size_t report_size = snprintf
-		(
-			report, sizeof(report),
-			"%-4u %4u %4u:%-4u %1u%1u [ %6.2fL %6.2f%% ] %6.2fL %5.1f%% %s",
-			ctrl.epoch,
-			ctrl.cycle,
-			j,
-			ctrl.tokens,
-			0,
-			0,
-			0.0,
-			0.0,
-			0.0,
-			0.0,
-			vocab::debug(dbuf, in[j]).c_str()
-		);
-
-		log::logf
-		{
-			log, log::level::DEBUG,
-			"%s",
-			string_view{report, report_size}
-		};
-		#endif
-	}
+		ctrl.token[ctrl.tokens.count++] = in[j];
+
+	for(uint i(0); i < opts.gates; ++i)
+		for(uint k(0); k < 8; ++k)
+		{
+			if(ctrl.tokens.count >= opts.buffer_tokens)
+				break;
+
+			if(opts.gate[i].code[k] == 0)
+				break;
+
+			ctrl.token[ctrl.tokens.count] = opts.gate[i].code[k];
+			ctrl.tokens.count++;
+		}
+
+	const size_t in_size
+	{
+		ctrl.tokens.count
+	};
 
 	uint64_t cycles(0);
 	if(ctrl.prop)
@@ -170,7 +132,7 @@ ircd::gpt::generate(const vector_view<u16> &out,
 			cycles
 		};
 
-		backprop(task, ctrl.loss_mean, *model::default_model, momentum);
+		backprop(task, ctrl.loss.mean, *model::default_model, momentum);
 	}
 
 	if(ctrl.prop)
@@ -178,17 +140,17 @@ ircd::gpt::generate(const vector_view<u16> &out,
 		log::debug
 		{
 			log, "Backpropagation of %2.6f in %lu cycles.",
-			ctrl.loss_mean,
+			ctrl.loss.mean,
 			cycles,
 		};
 
-		ctrl.epoch = 0;
-		ctrl.loss_mean = 0;
-		ctrl.loss = ctrl.loss_mean;
-		ctrl.perp_mean = 0;
-		ctrl.perp = ctrl.perp_mean;
-		ctrl.cert_mean = 0;
-		ctrl.cert = ctrl.cert_mean;
+		ctrl.epic.epoch = 0;
+		ctrl.loss.mean = 0;
+		ctrl.loss.last = ctrl.loss.mean;
+		ctrl.perp.mean = 0;
+		ctrl.perp.last = ctrl.perp.mean;
+		ctrl.cert.mean = 0;
+		ctrl.cert.last = ctrl.cert.mean;
 		ctrl.prop = false;
 		pipe::default_model->invalid = true;
 		return {};
@@ -206,73 +168,49 @@ ircd::gpt::generate(const vector_view<u16> &out,
 			generate(task);
 		}
 		last_time = stopwatch.at<milliseconds>();
-		ctrl.elapsed += last_time.count();
+		ctrl.epic.elapsed += last_time.count();
 
-	/*
-		coil(data(scratch), tokens, *opts.model);
-		tail(logit, data(last_embed), *opts.model);
-		out[i] = argmax(logit, *opts);
-	*/
-
-	uint accc_thresh[3] {3, 3, 3};
-	for(uint i(0); i < 3; ++i)
-		for(uint j(3); j > 0; --j)
-			if(opts.accept_code[i][j - 1] == -1U)
-				--accc_thresh[i];
-			else
-				break;
-
-	uint errc_thresh[3] {3, 3, 3};
-	for(uint i(0); i < 3; ++i)
-		for(uint j(3); j > 0; --j)
-			if(opts.error_code[i][j - 1] == -1U)
-				--errc_thresh[i];
-			else
-				break;
-
-	for(auto &j(ret); j + in.size() < ctrl.tokens && j < out.size() && !halt; ++j)
+	for(uint j(0); j < ctrl.tokens.count && ret < out.size() && !halt; ++j)
 	{
-		out[j] = ctrl.token[(in.size() + j + ctrl.head) % opts.buffer_tokens];
-
-		for(uint j(0); j < 3; ++j)
-			errc[j] = opts.error_code[j][errc[j]] == out[j]?
-				errc[j] + 1: 0;
-
-		for(uint j(0); j < 3; ++j)
-			accc[j] = opts.accept_code[j][accc[j]] == out[j]?
-				accc[j] + 1: 0;
-
-		for(uint j(0); j < 3; ++j)
-			halt |= accc_thresh[j] && accc[j] >= accc_thresh[j],
-			halt |= errc_thresh[j] && errc[j] >= errc_thresh[j];
+		const auto tok
+		{
+			ctrl.token[j]
+		};
+
+		if(j >= in_size)
+			out[ret++] = tok;
+
+		if(j < in_size)
+			continue;
 
 		static char dbuf[512] {0};
 		char report[1536] {0};
 		char tmbuf[4][64] {0};
-		const size_t bsz(ctrl.tokens - in.size());
+		const size_t bsz(ctrl.tokens.count - in_size);
 		const size_t report_size = snprintf
 		(
 			report, sizeof(report),
-			"%4lu:%-4u %4lu:%-4lu %6.1f%% %5.1fP %6.3fL [%c%c%c] %5u %6.3fL %6.2fP %5.1f%% %s %04x %8s %8s | %8s",
-			j + in.size(),
-			ctrl.tokens,
-			ctrl.epoch,
-			ctrl.cycle,
-			std::clamp(ctrl.cert_mean * 100.0f, 0.0f, 100.0f),
-			std::clamp(ctrl.perp_mean, 0.0f, 100.0f),
-			std::clamp(ctrl.loss_mean, 0.0f, 99.99f),
-			opts.label == out[j]? '+': ' ',
-			accc[0] + accc[1] + accc[2] >= 3? 'A': ' ',
-			errc[0] + errc[1] + errc[2] >= 3? 'E': ' ',
+			"%-3u %4u:%-4u %4lu:%-4lu %6.1f%% %5.1fP %6.3fL [%c%c%c] %5u %6.3fL %6.2fP %5.1f%% %s %04x %8s %8s | %8s",
+			j,
+			ret - 1,
+			ctrl.tokens.count,
+			ctrl.epic.epoch,
+			ctrl.epic.cycle,
+			std::clamp(ctrl.cert.mean * 100.0f, 0.0f, 100.0f),
+			std::clamp(ctrl.perp.mean, 0.0f, 100.0f),
+			std::clamp(ctrl.loss.mean, 0.0f, 99.99f),
+			opts.label == tok? '+': ' ',
+			' ', // flag place
+			' ', // flag place
 			opts.label,
-			std::clamp(ctrl.loss, 0.0f, 99.99f),
-			std::clamp(ctrl.perp, 0.0f, 100.0f),
-			std::clamp(ctrl.cert * 100.0f, 0.0f, 100.0f),
-			vocab::debug(dbuf, out[j]).c_str(),
-			out[j],
+			std::clamp(ctrl.loss.last, 0.0f, 99.99f),
+			std::clamp(ctrl.perp.last, 0.0f, 100.0f),
+			std::clamp(ctrl.cert.last * 100.0f, 0.0f, 100.0f),
+			vocab::debug(dbuf, tok).c_str(),
+			tok,
 			pretty(tmbuf[0], milliseconds(last_time / bsz), 1).c_str(),
 			pretty(tmbuf[1], si(cycles / bsz), 1).c_str(),
-			pretty(tmbuf[2], milliseconds(ctrl.elapsed), 1).c_str()
+			pretty(tmbuf[2], milliseconds(ctrl.epic.elapsed), 1).c_str()
 		);
 
 		log::logf
@@ -283,19 +221,6 @@ ircd::gpt::generate(const vector_view<u16> &out,
 		};
 	}
 
-	ret = ctrl.tokens - in.size();
-	if ((false)) for(uint i(0); i < 3; ++i)
-		if(accc_thresh[i] && ctrl.accept_seq[i] >= accc_thresh[i])
-		{
-			ret -= (3 - accc_thresh[i]);
-			break;
-		}
-		else if(errc_thresh[i] && ctrl.error_seq[i] >= errc_thresh[i])
-		{
-			ret -= (3 - errc_thresh[i]);
-			break;
-		}
-
 	ctx::interruption_point();
 	return vector_view<u16>
 	{
@@ -689,6 +614,7 @@ ircd::gpt::gelu(f32x4 &out,
 // backside
 //
 
+[[gnu::noinline]]
 size_t
 ircd::gpt::backprop(task &task,
                     const f32 grad,
@@ -792,6 +718,7 @@ ircd::gpt::backprop(task &task,
 	return off;
 }
 
+[[gnu::noinline]]
 size_t
 ircd::gpt::adamw(task &task,
                  const f32 grad,
@@ -820,7 +747,7 @@ ircd::gpt::adamw(task &task,
 	};
 
 	for(uint i(0); i < num / 4; ++i)
-		off = adamw(p[0][i], p[1][i], p[2][i], grad, opts.alpha, opts.beta[0], opts.beta[1], ctrl.step, off);
+		off = adamw(p[0][i], p[1][i], p[2][i], grad, opts.alpha, opts.beta[0], opts.beta[1], ctrl.epic.step, off);
 
 	return off;
 }
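The alpha/beta/epsilon/step values threaded through the adamw() call above are the usual Adam-family hyperparameters. As a reminder of what they conventionally compute, here is a textbook sketch, not necessarily the exact arithmetic of `ircd::gpt::adamw()`:

	#include <cmath>

	// One conventional Adam-style update for a single parameter with moment
	// accumulators m and v; `step` is a 0-based counter used for bias correction.
	static void
	adam_step(float &param, float &m, float &v,
	          const float grad, const float alpha,
	          const float beta1, const float beta2,
	          const float epsilon, const unsigned long step)
	{
		m = beta1 * m + (1.0f - beta1) * grad;            // first moment
		v = beta2 * v + (1.0f - beta2) * grad * grad;     // second moment

		const float m_hat(m / (1.0f - std::pow(beta1, step + 1)));
		const float v_hat(v / (1.0f - std::pow(beta2, step + 1)));

		param -= alpha * m_hat / (std::sqrt(v_hat) + epsilon);
	}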
@@ -915,19 +842,111 @@ noexcept
 }
 
 //
-// hypercall
+// gpt::opts
 //
 
-ircd::string_view
-ircd::gpt::reflect(const enum ircd_gpt_hypercall code)
+ircd_gpt_opts::ircd_gpt_opts(const ircd::gpt::model::decoder *const model)
 noexcept
+:model{model}
+,limit{-1U}
+,top_k{2U}
+,context_tokens{1024U}
+,buffer_tokens{1024U}
+,embed_elems{768U}
+,attn_mult{3U}
+,ffnn_mult{4U}
+,attn_elems{embed_elems * attn_mult}
+,ffnn_elems{embed_elems * ffnn_mult}
+,lanes{4U}
+,embed_width{embed_elems / lanes}
+,attn_width{attn_elems / lanes}
+,attn_height{embed_elems / lanes}
+,ffnn_width{ffnn_elems / lanes}
+,ffnn_height{embed_elems / lanes}
+,logits{50257}
+,seed{1234567890UL}
+,training_steps{250000}
+,validation_steps{5000}
+,label{198}
+,alpha{0.001f}
+,beta{0.9f, 0.999f}
+,epsilon{0.000001}
+,gates{0}
 {
-	switch(code)
-	{
-		case IRCD_GPT_ACCEPT:     return "ACCEPT";
-		case IRCD_GPT_ECOMPLETE:  return "ECOMPLETE";
-		case IRCD_GPT_ETOKENS:    return "ETOKENS";
-	}
-
-	return "??????";
 }
@@ -432,7 +432,7 @@ _ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
                    const uint word_idx)
 {
 	const ushort
-		ring_idx = (ctrl->head + tok_idx) % opts->buffer_tokens,
+		ring_idx = (ctrl->tokens.head + tok_idx) % opts->buffer_tokens,
 		token = ctrl->token[ring_idx];
 
 	const float4
@@ -454,7 +454,7 @@ ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
 		wi = get_group_id(0),
 		wn = get_num_groups(0);
 
-	for(uint i = 0; i < ctrl->tokens; ++i)
+	for(uint i = 0; i < ctrl->tokens.count; ++i)
 		if(i % wn == wi)
 			_ircd_gpt_lm_embed(ctrl, opts, accum, pos, vocab, i, i, li);
 }
@@ -492,7 +492,7 @@ ircd_gpt_lm_logit(__global const struct ircd_gpt_task *const ctrl,
 {
 	const uint
 		gi = get_global_id(0),
-		ti = ctrl->tokens - 1,
+		ti = ctrl->tokens.count - 1,
 		words = opts->embed_width;
 
 	float4 acc = 0.0f;
@@ -596,31 +596,16 @@ ircd_gpt_leave(__global struct ircd_gpt_task *const ctrl,
                __constant const struct ircd_gpt_opts *const opts,
                const uint li)
 {
-	// If the call value has been set to something other than default we
-	// do nothing else here.
-	if(ctrl->call != IRCD_GPT_ECOMPLETE)
-		return;
-
 	// No action for other threads right now
 	if(li != 0)
 		return;
 
-	// Run debug checks and assertions.
-	#ifdef RB_DEBUG
-	if(ctrl->call == IRCD_GPT_ECOMPLETE)
-		if(ctrl->tokens < 2)
-			ctrl->call = IRCD_GPT_ETOKENS;
-	#endif
-
 	// On the last cycle, with no prior call or error code set, indicate
 	// a nominal exit condition.
-	if(ctrl->cycle + 1 >= opts->limit)
-	{
-		ctrl->call = IRCD_GPT_ACCEPT;
-		ctrl->epoch += 1;
-	}
+	if(ctrl->epic.cycle + 1 >= opts->limit)
+		ctrl->epic.epoch += 1;
 
-	ctrl->cycle += 1;
+	ctrl->epic.cycle += 1;
 	ctrl->magic = 0xC7012C70U;
 }
@@ -634,10 +619,6 @@ ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl,
                    __global const float *const restrict logexp,
                    __global const float *const restrict logit)
 {
-	// When the hypercall code is already set, bail here.
-	if(ctrl->call != IRCD_GPT_ECOMPLETE)
-		return;
-
 	// To read from cells other than idx[0] we need this barrier.
 	if(opts->top_k > 1)
 		barrier(CLK_LOCAL_MEM_FENCE);
@@ -647,7 +628,7 @@ ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl,
 		return;
 
 	const bool
-		buffer_full = ctrl->tokens >= opts->buffer_tokens;
+		buffer_full = ctrl->tokens.count >= opts->buffer_tokens;
 
 	const ulong
 		rnd = opts->top_k > 1?
@@ -657,20 +638,20 @@ ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl,
 		entro = max(opts->top_k, 1U),
 		select = rnd % entro,
 		token = idx[select],
-		dest = (ctrl->head + ctrl->tokens) % opts->buffer_tokens,
-		tokens = min(ctrl->tokens + 1, opts->buffer_tokens),
+		dest = (ctrl->tokens.head + ctrl->tokens.count) % opts->buffer_tokens,
+		tokens = min(ctrl->tokens.count + 1, opts->buffer_tokens),
 		head = buffer_full?
-			(ctrl->head + 1) % opts->buffer_tokens: ctrl->head;
+			(ctrl->tokens.head + 1) % opts->buffer_tokens: ctrl->tokens.head;
 
-	ctrl->head = head;
-	ctrl->tokens = tokens;
+	ctrl->tokens.head = head;
+	ctrl->tokens.count = tokens;
 	ctrl->token[dest] = token;
 
 	const ushort
 		ln = get_local_size(0),
 		next_select = (select + 1) % ln,
 		next_token = idx[next_select],
-		sum_sel = ctrl->epoch % 3;
+		sum_sel = ctrl->epic.epoch % 3;
 
 	const float
 		test_lsm = logexp[opts->label],
@@ -737,7 +718,7 @@ ircd_gpt_prop_elem(__global const struct ircd_gpt_task *const ctrl,
 {
 	const uint
 		li = get_local_id(0),
-		step = ctrl->step;
+		step = ctrl->epic.step;
 
 	const float4
 		param = param_[li],
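The selection step in `ircd_gpt_lm_result()` above reduces to: choose uniformly among the best `top_k` candidates, or take the single best when `top_k` is 1. A host-side paraphrase follows as a sketch; the best-first ordering of `idx[]` and the PRNG draw are assumptions carried over from the kernel's context, not an API of the source.

	static ushort
	select_token(const ushort *const idx,   // candidate tokens, assumed best-first
	             const uint top_k,
	             const ulong rnd)           // one PRNG output, e.g. from xoshiro256
	{
		const ulong entro(top_k > 1? top_k: 1U);
		const ulong select(rnd % entro);
		return idx[select];
	}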
@@ -129,12 +129,11 @@ ircd::gpt::generate(task &task)
 		*task.ctrl
 	};
 
-	ctrl.cycle = 0;
-	ctrl.call = IRCD_GPT_ECOMPLETE;
-	ctrl.host_tsc = prof::cycles();
-	volatile const size_t tokens(ctrl.tokens);
-	volatile const auto epoch(ctrl.epoch);
-	volatile size_t cycle(ctrl.cycle);
+	ctrl.epic.cycle = 0;
+	ctrl.epic.host_tsc = prof::cycles();
+	volatile const size_t tokens(ctrl.tokens.count);
+	volatile const auto epoch(ctrl.epic.epoch);
+	volatile size_t cycle(ctrl.epic.cycle);
 
 	std::deque<pipe::exec> list;
 	for(; cycle < opts.limit; ++cycle)
@@ -151,8 +150,7 @@ ircd::gpt::generate(task &task)
 			task, tokens + cycle, rel, acq
 		);
 
-		// Conditions for a cl::flush here; this is not default but
-		// may be configured to improve some workloads.
+		// Conditions for a cl::flush here
 		const bool flush
 		{
 			// Flushing here is enabled by the configuration
@@ -194,18 +192,8 @@ ircd::gpt::generate(task &task)
 	list.clear();
 
 	assert(ctrl.magic == 0xC7012C70);
+	assert(ctrl.epic.cycle == cycle || ctx::interruption_requested());
 	this_ctx::interruption_point();
-
-	// Interp error codes
-	if(unlikely(ctrl.call <= 0))
-		throw error
-		{
-			"hyper (#%d) :%s",
-			abs(int(ctrl.call)),
-			reflect(ctrl.call),
-		};
-
-	assert(ctrl.cycle == cycle || ctx::interruption_requested());
 }
 
 void