ircd::gpt: Reorganize interface.

2024-12-25 23:14:13 +01:00 · 2021-04-02 13:01:38 -07:00 · 2021-04-02 13:01:38 -07:00 · 1870c364f4
commit 1870c364f4
parent f096e7fcf5
17 changed files with 1067 additions and 778 deletions
--- a/include/ircd/gpt/generate.h
+++ b/include/ircd/gpt/generate.h
@ -13,15 +13,16 @@
 namespace ircd::gpt
 {
 	void
 	generate(task &);
 	vector_view<u16>
 	generate(const vector_view<u16> &out,
 	         const vector_view<const u16> &in,
-	         const opts * = &default_opts,
+	         task &);
 	         task * = nullptr);
 	string_view
 	generate(const mutable_buffer &out,
 	         const string_view &in,
-	         const opts * = &default_opts,
+	         task &);
 	         task * = nullptr);
 }
--- a/include/ircd/gpt/gpt.h
+++ b/include/ircd/gpt/gpt.h
@ -20,63 +20,14 @@ namespace ircd::gpt
 	struct opts;
 	struct task;
 	extern const opts default_opts;
 	extern log::log log;
 }
 #include "hypercall.h"
 #include "vocab.h"
 #include "model.h"
 #include "token.h"
 #include "opts.h"
 #include "task.h"
 #include "pipe/pipe.h"
 #include "generate.h"
 /// Primary Options
 ///
 /// Use this structure to configure and control specifics of the machine.
 /// These settings are immutable for the operations. To maintain state between
 /// calls see task.h
 ///
 struct ircd::gpt::opts
 {
 	/// Specifies the nominal halting condition based on the sequence of
 	/// tokens. Generation will complete when this sequence is witnessed. Set
 	/// tokens to -1 starting from the back to not match that token. Setting
 	/// all tokens to -1 will ignore this condition.
 	uint accept_code[3][3]
 	{
 		{    13,  198,  -1U,  },
 		{   198,  198,  -1U,  },
 		{   -1U,  -1U,  -1U,  },
 	};
 	/// Specifies the exceptional halting condition based on the sequence of
 	/// tokens. By default, the three zeros represent three outputs of '!'
 	/// which is probably an error code; note that a true "!!!" is represented
 	/// by token number 10185. Set tokens to -1 starting from the back to
 	/// not match that token; generated output after errors is usually garbage.
 	uint error_code[3][3]
 	{
 		{     0,    0,    0,  },
 		{   -1U,    0,    0,  },
 		{   -1U,    0,    0,  },
 	};
 	/// Limit number of output tokens. Default of -1 is unlimited; the number
 	/// of tokens generated will be limited by other factors.
 	uint limit
 	{
 		-1U
 	};
 	/// Flip random coins over the top k logits each round. Setting to 1
 	/// deterministically selects the top logit.
 	uint top_k
 	{
 		2
 	};
 	/// Pointer to the model
 	const model::decoder *model
 	{
 		model::default_model
 	};
 };
--- a/include/ircd/gpt/hypercall.h
+++ b/include/ircd/gpt/hypercall.h
@ -0,0 +1,34 @@
 // Matrix Construct
 //
 // Copyright (C) Matrix Construct Developers, Authors & Contributors
 // Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
 //
 // Permission to use, copy, modify, and/or distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
 // copyright notice and this permission notice is present in all copies. The
 // full license for this software is available in the LICENSE file.
 #pragma once
 #define HAVE_IRCD_GPT_HYPERCALL_H
 /// Hypercalling code enumeration.
 ///
 /// Error codes are all negative values. Zero is also an error.
 enum ircd_gpt_hypercall
 {
 	/// General nominal completion code; similar to EXIT_SUCCESS, etc.
 	IRCD_GPT_ACCEPT = 1,
 	/// Failed or incomplete execution occurred. After an execution attempt
 	/// it indicates no execution likely took place. Device software never
 	/// sets this value; it is the initial value set by the host before
 	/// execution.
 	IRCD_GPT_ECOMPLETE = 0,
 };
 #ifdef __cplusplus
 namespace ircd::gpt
 {
 	string_view reflect(const enum ircd_gpt_hypercall) noexcept;
 }
 #endif
--- a/include/ircd/gpt/model.h
+++ b/include/ircd/gpt/model.h
@ -67,7 +67,8 @@ struct ircd::gpt::model::block
 	norm ln2;
 	model::ffnn ffnn;
-};
+}
 __attribute__((packed));
 /// Vocabulary embeddings
 struct ircd::gpt::model::embed
--- a/include/ircd/gpt/opts.h
+++ b/include/ircd/gpt/opts.h
@ -0,0 +1,123 @@
 // Matrix Construct
 //
 // Copyright (C) Matrix Construct Developers, Authors & Contributors
 // Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
 //
 // Permission to use, copy, modify, and/or distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
 // copyright notice and this permission notice is present in all copies. The
 // full license for this software is available in the LICENSE file.
 #pragma once
 #define HAVE_IRCD_GPT_OPTS_H
 /// Task Options Page
 ///
 /// The option block is directly shared with task software as constant data.
 /// This stucture and its mutable companion in `task.h` determine the outcome
 /// of the next execution cycle; options are immutable to device software but
 /// may be changed by the host between executions cycles if desired.
 /// 
 struct ircd_gpt_opts
 {
 	/// Specifies the nominal halting condition based on a sequence of tokens.
 	/// Generation will complete with success after one of these sequences is
 	/// witnessed. Set tokens to -1 starting from the back for shorter
 	/// sequences; zero-length sequences (all -1's) are never matched.
 	uint accept_code[4][4]
 	#ifdef __cplusplus
 	{
 		{    13,  198,  -1U,  -1U,  },
 		{   198,  198,  -1U,  -1U,  },
 		{   -1U,  -1U,  -1U,  -1U,  },
 		{   -1U,  -1U,  -1U,  -1U,  },
 	}
 	#endif
 	;
 	/// Specifies the exceptional halting condition based on the sequence of
 	/// tokens. By default, the three zeros represent three outputs of '!'
 	/// which is probably an error; note that a true "!!!" is represented by
 	/// token number 10185. Set tokens to -1 starting from the back to not
 	/// match that token; generated output after errors is usually garbage.
 	uint error_code[4][4]
 	#ifdef __cplusplus
 	{
 		{     0,    0,    0,  -1U,  },
 		{   -1U,  -1U,  -1U,  -1U,  },
 		{   -1U,  -1U,  -1U,  -1U,  },
 		{   -1U,  -1U,  -1U,  -1U,  },
 	}
 	#endif
 	;
 	/// Limit number of output tokens. Default of -1 is unlimited; the number
 	/// of tokens generated will be limited by other factors.
 	uint limit
 	#ifdef __cplusplus
 	{
 		1
 	}
 	#endif
 	;
 	/// Flip random coins over the top k logits each round. Setting to 1
 	/// deterministically selects the top logit.
 	uint top_k
 	#ifdef __cplusplus
 	{
 		2
 	}
 	#endif
 	;
 	/// Specifies the token context size in tokens.
 	uint context_tokens
 	#ifdef __cplusplus
 	{
 		1024
 	}
 	#endif
 	;
 	/// Specifies the token buffer size in tokens.
 	uint buffer_tokens
 	#ifdef __cplusplus
 	{
 		1024
 	}
 	#endif
 	;
 	/// Seed for the task's PRNG.
 	ulong seed
 	#ifdef __cplusplus
 	{
 		1234567890UL
 	}
 	#endif
 	;
 }
 __attribute__((aligned(4096)));
 #ifdef __cplusplus
 /// Generator Task Options.
 ///
 /// Parameters for a task. Options are constant and one instance can be shared
 /// between multiple task instances. This structure extends the task options
 /// page, starting a new page which is not visible to device software; C++ and
 /// host pointers are available.
 ///
 struct ircd::gpt::opts
 :ircd_gpt_opts
 {
 	/// Pointer to the model
 	const model::decoder *model
 	{
 		model::default_model
 	};
 };
 static_assert(sizeof(struct ircd_gpt_opts) == 4096);
 static_assert(std::is_standard_layout<struct ircd_gpt_opts>::value);
 #endif
--- a/include/ircd/gpt/pipe/code.h
+++ b/include/ircd/gpt/pipe/code.h
@ -0,0 +1,22 @@
 // Matrix Construct
 //
 // Copyright (C) Matrix Construct Developers, Authors & Contributors
 // Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
 //
 // Permission to use, copy, modify, and/or distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
 // copyright notice and this permission notice is present in all copies. The
 // full license for this software is available in the LICENSE file.
 #pragma once
 #define HAVE_IRCD_GPT_PIPE_CODE_H
 /// Pipe code segment.
 struct ircd::gpt::pipe::code
 :cl::code
 {
 	static const string_view compile_opts;
 	code();
 	~code() noexcept;
 };
--- a/include/ircd/gpt/pipe/desc.h
+++ b/include/ircd/gpt/pipe/desc.h
@ -0,0 +1,47 @@
 // Matrix Construct
 //
 // Copyright (C) Matrix Construct Developers, Authors & Contributors
 // Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
 //
 // Permission to use, copy, modify, and/or distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
 // copyright notice and this permission notice is present in all copies. The
 // full license for this software is available in the LICENSE file.
 #pragma once
 #define HAVE_IRCD_GPT_PIPE_DESC_H
 /// Pipe descriptor
 struct ircd::gpt::pipe::desc
 {
 	struct layer;
 	pipe::model *model;
 	pipe::code *code;
 	cl::data
 	state,         // qry/key/val projection (tokens * embed * 3 * float)
 	accum,         // accumulator (tokens * embed * float)
 	logit,         // result output vector (50257 * float)
 	ctrl,          // control page
 	opts;          // options page
 	cl::kern
 	lm_embed,
 	lm_norm,
 	lm_logit,
 	lm_select;
 	std::unique_ptr<struct desc::layer>
 	layer[12];
 	desc(pipe::code &, pipe::model &);
 };
 struct ircd::gpt::pipe::desc::layer
 {
 	cl::kern negative;
 	cl::kern positive;
 	layer(pipe::desc &, const int);
 };
--- a/include/ircd/gpt/pipe/exec.h
+++ b/include/ircd/gpt/pipe/exec.h
@ -0,0 +1,57 @@
 // Matrix Construct
 //
 // Copyright (C) Matrix Construct Developers, Authors & Contributors
 // Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
 //
 // Permission to use, copy, modify, and/or distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
 // copyright notice and this permission notice is present in all copies. The
 // full license for this software is available in the LICENSE file.
 #pragma once
 #define HAVE_IRCD_GPT_PIPE_EXEC_H
 /// Perform one task cycle on the device.
 ///
 /// Constructions of this object enqueue device commands to complete an
 /// additional epoch of the task as provided by `ctrl` and `opts`.
 ///
 /// Destructions of this object yield the ircd::ctx until those commands
 /// are complete.
 ///
 /// Consecutive cycles on the device without stopping (a.k.a. pipelining) is
 /// achieved by constructing several objects before following with destructions
 /// i.e in a std::deque.
 ///
 struct ircd::gpt::pipe::exec
 {
 	pipe::desc *desc;
 	const_buffer
 	send_opts,         // Set when sending the options page.
 	send_ctrl;         // Set when sending the control page.
 	mutable_buffer
 	recv_ctrl;         // Set when receiving the control page.
 	cl::kern::range
 	range_lm_embed,    // Dimension range of the lm_embed kernel.
 	range_negative,    // Dimension range of a layer kernel.
 	range_positive,    // Dimension range of a layer kernel.
 	range_lm_norm,     // Dimension range of the final norm kernel.
 	range_lm_logit,    // Dimension range of the language logit kernel.
 	range_lm_select;   // Dimension range of the language token kernel.
 	cl::exec
 	release_opts,      // Release the options page.
 	release_ctrl,      // Release the control page.
 	lm_embed,          // Compute token and positional embeddings.
 	coil[12 * 2],      // Pass over all layers.
 	lm_norm,           // Final normalization.
 	lm_logit,          // Compute logit result vector.
 	lm_select,         // Select next token.
 	acquire_ctrl;      // Acquire the control page.
 	exec(task &, const size_t tokens, const bool rel, const bool acq);
 	~exec() noexcept;
 };
--- a/include/ircd/gpt/pipe/model.h
+++ b/include/ircd/gpt/pipe/model.h
@ -59,11 +59,13 @@ struct ircd::gpt::pipe::model::block
 	model::attn attn;
 	model::ffnn ffnn;
 	block(cl::data &, const off_t, const gpt::model::block &, const size_t);
 	block(const gpt::model::block &, const size_t);
 };
 struct ircd::gpt::pipe::model::decoder
 {
 	cl::data master;
 	model::block block[12];
 	tensor norm;
--- a/include/ircd/gpt/pipe/pipe.h
+++ b/include/ircd/gpt/pipe/pipe.h
@ -17,7 +17,6 @@ namespace ircd::gpt::pipe
 	struct code;
 	struct desc;
 	struct exec;
 	struct bank;
 	extern model *default_model;
 	extern code *default_code;
@ -27,70 +26,6 @@ namespace ircd::gpt::pipe
 };
 #include "model.h"
-#include "ctrl.h"
+#include "code.h"
-
+#include "desc.h"
-struct ircd::gpt::pipe::code
+#include "exec.h"
 :cl::code
 {
 	static const string_view compile_opts;
 	code();
 	~code() noexcept;
 };
 struct ircd::gpt::pipe::desc
 {
 	struct layer;
 	pipe::model *model;
 	pipe::code *code;
 	cl::data opts;
 	cl::data ctrl;
 	cl::data state;
 	cl::data xattn;
 	cl::data accum;
 	cl::data logit;
 	cl::kern anode;
 	std::unique_ptr<struct desc::layer> layer[12];
 	cl::kern cathode;
 	cl::kern lmhead;
 	cl::kern lmamax;
 	desc(pipe::code &, pipe::model &);
 };
 struct ircd::gpt::pipe::desc::layer
 {
 	cl::kern negative;
 	cl::kern selfattn;
 	cl::kern positive;
 	layer(pipe::desc &, const int);
 };
 struct ircd::gpt::pipe::exec
 {
 	pipe::desc *desc;
 	mutable_buffer out_ctrl;
 	const_buffer in_ctrl, in_opts;
 	cl::kern::range range_anode;
 	cl::kern::range range_coil;
 	cl::kern::range range_negative;
 	cl::kern::range range_selfattn;
 	cl::kern::range range_positive;
 	cl::kern::range range_cathode;
 	cl::kern::range range_lmhead;
 	cl::kern::range range_lmamax;
 	cl::exec send[2];
 	cl::exec tail[1];
 	cl::exec coil[12 * 3];
 	cl::exec head[3];
 	cl::exec recv[1];
 	exec(ctor_ctrl &, const ctor_opts &);
 	~exec() noexcept;
 };
--- a/include/ircd/gpt/task.h
+++ b/include/ircd/gpt/task.h
@ -11,8 +11,77 @@
 #pragma once
 #define HAVE_IRCD_GPT_TASK_H
-/// Context to maintain state across calls.
+/// Task Control Page
 ///
 /// The control block is shared with our device software. Execution state is
 /// maintained in the task control block across cycles. The control block is
 /// the mutable state component for an execution; for the immutable component
 /// also shared with device software see opts.h.
 ///
 struct ircd_gpt_task
 {
 	/// Header magic 0xC7012C70
 	uint magic;
 	/// Hypercall code set by our device software upon completion and control
 	/// transfer back to the host. Negative codes indicate errors, positive
 	/// codes are used for status and/or procedure calls; zero is also an error.
 	enum ircd_gpt_hypercall call;
 	/// Token ring head. Tokens in the ring extend behind the head for
 	/// `tokens`. The `head` value is automatically modulated by device
 	/// software to wrap around the ring.
 	uint head;
 	/// Token counter. The counter indicates the number of valid tokens in
 	/// the context buffer. This value must not exceed the buffer size.
 	uint tokens;
 	/// Accumulates the number of task cycles. The cycle counter is incremented
 	/// by device software after each repetition of the kernel pipeline to
 	/// produce one additional token.
 	ulong cycle;
 	/// Accumulates the epoch count for the task. The counter is incremented
 	/// by one in device software before control returns back to the host.
 	/// Several cycles may occur during each epoch.
 	ulong epoch;
 	/// Accumulates the number of tokens produced by the task. Several tokens
 	/// may be produced each epoch, but currently only one token is produced
 	/// each cycle.
 	ulong produced;
 	/// Accumulates the number tokens witnessed by the task. The number of
 	/// tokens in the context for each cycle is counted as witnessed.
 	ulong witnessed;
 	/// Accumulates time in microseconds elapsed for the task.
 	ulong elapsed;
 	/// PRNG xoshiro256 state. This is the de facto random seed which can be
 	/// set before cycle entry by the host. It is updated by device software
 	/// when used.
 	ulong rand[4];
 	/// Updated by the host with the value of the timestamp register as sampled
 	/// immediately before each transfer of control to the device.
 	ulong host_tsc;
 	/// State counters for the accept/error sequence codes.
 	uint accept_seq[4], error_seq[4];
 	/// The token buffer starts at offset 2048 and continues to the end of
 	/// the page; options specify the size of the tokens buffer in tokens.
 	/// Additional pages must be attached for larger buffer sizes.
 	ushort token[] __attribute__((aligned(2048)));
 }
 __attribute__((aligned(4096)));
 #ifdef __cplusplus
 /// Task Context
 ///
 /// State for a task.
 struct ircd::gpt::task
 {
 	enum status :char;
@ -20,36 +89,16 @@ struct ircd::gpt::task
 	/// Reference to the attached options.
 	const gpt::opts *opts {nullptr};
 	/// Reference to control pages.
 	struct ircd_gpt_task *ctrl {nullptr};
 	/// Current task status.
 	enum status status {'\0'};
-	/// State counters for the accept codes specified in the options.
+	task(const gpt::opts *       = nullptr,
-	uint8_t accept_seq[3] {0};
+	     struct ircd_gpt_task *  = nullptr);
-	/// State counters for the error codes specified in the options.
+	~task() noexcept;
 	uint8_t error_seq[3] {0};
 	/// Accumulates the number of executions by the user. Each call to the
 	/// interface is an execution.
 	uint64_t epoch {0};
 	/// Accumulates the number of tokens produced by the task. Several tokens
 	/// may be produced each epoch.
 	uint64_t produced {0};
 	/// Accumulates the number tokens witnessed by the task. The number of
 	/// tokens in the context for each produced token is counted as witnessed.
 	uint64_t witnessed {0};
 	/// Accumulates the number of CPU reference cycles consumed by the task.
 	/// This counter does not reflect time when the task is queued or waiting
 	/// or offloaded to a co-processor/accelerator.
 	uint64_t cycles {0};
 	/// Accumulates the total time in milliseconds over all executions of the
 	/// task. This counter reflects total wall-clock time of all phases of
 	/// the execution.
 	milliseconds time {0ms};
 };
 /// The current status of a task is indicated with intelligible characters
@ -61,3 +110,7 @@ enum ircd::gpt::task::status
 	ACCEPT    = 'A',  ///< Execution completed successfully.
 	ERROR     = 'E',  ///< Execution did not complete successfully.
 };
 static_assert(sizeof(struct ircd_gpt_task) == 4096);
 static_assert(std::is_standard_layout<struct ircd_gpt_task>::value);
 #endif
--- a/include/ircd/gpt/pipe/ctrl.h
+++ b/include/ircd/gpt/pipe/ctrl.h
@ -9,78 +9,46 @@
 // full license for this software is available in the LICENSE file.
 #pragma once
-#define HAVE_IRCD_GPT_PIPE_CTRL_H
+#ifdef __OPENCL_C_VERSION__
 #define HAVE_IRCD_GPT_TOKEN_H
-struct ctor_ctrl
+union ircd_gpt_token
 {
 	long call;
 	ulong pc;
 	ulong tokens;
 	ulong magic;
 	uchar pad[1024 - 32];
 	union
 	{
 		char str[3072];
 		ushort token[1536];
 	}
 	body;
 }
 __attribute__((aligned(4096)));
 struct ctor_opts
 {
    uchar pad[4096];
 }
 __attribute__((aligned(4096)));
 #ifndef __OPENCL_C_VERSION__
 static_assert(sizeof(struct ctor_ctrl) == 4096);
 #endif
 #ifndef __OPENCL_C_VERSION__
 static_assert(sizeof(struct ctor_opts) == 4096);
 #endif
 #ifndef __cplusplus
 union token
 {
 	float
 	word[768],
 	attn[12][64];
 };
-union tokenv
+union ircd_gpt_tokenv
 {
 	float4
 	word[768/4],
 	attn[12][64/4];
 };
-struct qkv
+struct ircd_gpt_qkv
 {
-	union token
+	union ircd_gpt_tokenv
 	qry,
 	key,
 	val;
 };
-struct qkvv
+struct ircd_gpt_qkvv
 {
-	union tokenv
+	union ircd_gpt_tokenv
 	qry,
 	key,
 	val;
 };
-struct attn_mask
+struct ircd_gpt_attn_mask
 {
 	bool
 	token[1024];
 };
-union aperature
+union ircd_gpt_aperature
 {
 	float
 	word[768],
@ -90,7 +58,7 @@ union aperature
 	attn[12][64];
 };
-union aperaturev
+union ircd_gpt_aperaturev
 {
 	float4
 	word[768/4],
--- a/ircd/gpt.cc
+++ b/ircd/gpt.cc
@ -34,30 +34,22 @@ namespace ircd::gpt
 	static f32
 	logit alignas(64) [65536],
 	embeds alignas(64) [1024 * 768],
 	scratch alignas(64) [1024 * 768];
 }
 namespace ircd::gpt
 {
 	extern void transform(ctor_ctrl &, const ctor_opts &);
 }
 decltype(ircd::gpt::log)
 ircd::gpt::log
 {
 	"gpt"
 };
 decltype(ircd::gpt::default_opts)
 ircd::gpt::default_opts;
 ircd::string_view
 ircd::gpt::generate(const mutable_buffer &out,
                    const string_view &in,
-                    const opts *opts,
+                    task &task)
                    task *task)
 {
-	u16 buf[2][256];
+	u16 buf[2][1024];
 	const auto input_tokens
 	{
 		vocab::tokenize(buf[0], in)
@ -65,7 +57,7 @@ ircd::gpt::generate(const mutable_buffer &out,
 	const auto output_tokens
 	{
-		generate(buf[1], input_tokens, opts, task)
+		generate(buf[1], input_tokens, task)
 	};
 	const auto output
@ -79,13 +71,92 @@ ircd::gpt::generate(const mutable_buffer &out,
 ircd::vector_view<ircd::u16>
 ircd::gpt::generate(const vector_view<u16> &out,
                    const vector_view<const u16> &in,
-                    const opts *opts,
+                    task &task)
                    task *task)
 {
 	assert(task.ctrl);
 	assert(task.opts);
 	uint ret(0);
 	bool halt(false);
 	const auto &opts(*task.opts);
 	auto &ctrl(*task.ctrl);
 	auto &errc(ctrl.error_seq);
 	auto &accc(ctrl.accept_seq);
 	ctrl.tokens = in.size();
 	const size_t tmax
 	{
 		in.size() + opts.limit
 	};
 	const vector_view<f32> accum
 	{
 		gpt::scratch, tmax * 768
 	};
 	const vector_view<f32> embeds
 	{
 		gpt::embeds, tmax * 768
 	};
 	for(uint j(0); j < in.size(); ++j)
 	{
 		const vector_view<f32> dst
 		{
 			data(embeds) + j * 768, 768
 		};
 		if(ircd::cl::enable)
 			ctrl.token[j] = in[j];
 		else
 			embed(data(dst), in[j], j, opts);
 		static char dbuf[512] {0};
 		char report[1536] {0};
 		char tmbuf[1][64] {{0}};
 		const size_t report_size = snprintf
 		(
 			report, sizeof(report),
 			"%-2u -- %-3u [%5u] --- --- %s        0        0  |  %8s",
 			j,
 			ctrl.tokens,
 			ctrl.token[j],
 			vocab::debug(dbuf, ctrl.token[j]).c_str(),
 			pretty(tmbuf[0], milliseconds(ctrl.elapsed), 1).c_str()
 		);
 		log::info
 		{
 			log, "%s",
 			string_view{report, report_size}
 		};
 	}
 	uint64_t cycles(0);
 	milliseconds last_time {0};
 	util::timer stopwatch;
 	{
 		const prof::scope_cycles task_cycles
 		{
 			cycles
 		};
 		generate(task);
 	}
 	last_time = stopwatch.at<milliseconds>();
 	ctrl.elapsed += last_time.count();
 	/*
 		coil(data(scratch), tokens, *opts.model);
 		tail(logit, data(last_embed), *opts.model);
 		out[i] = argmax(logit, *opts);
 	*/
 	uint accc_thresh[3] {3, 3, 3};
 	for(uint i(0); i < 3; ++i)
 		for(uint j(3); j > 0; --j)
-			if(opts->accept_code[i][j - 1] == -1U)
+			if(opts.accept_code[i][j - 1] == -1U)
 				--accc_thresh[i];
 			else
 				break;
@ -93,99 +164,22 @@ ircd::gpt::generate(const vector_view<u16> &out,
 	uint errc_thresh[3] {3, 3, 3};
 	for(uint i(0); i < 3; ++i)
 		for(uint j(3); j > 0; --j)
-			if(opts->error_code[i][j - 1] == -1U)
+			if(opts.error_code[i][j - 1] == -1U)
 				--errc_thresh[i];
 			else
 				break;
-	uint ret(0);
+	for(auto &j(ret); j + in.size() < ctrl.tokens && j < out.size() && !halt; ++j)
 	bool halt(false);
 	auto &errc(task->error_seq);
 	auto &accc(task->accept_seq);
 	for(uint i(0); !halt && i < out.size() && ret < opts->limit; ++i)
 	{
-		ctor_ctrl ctrl alignas(4096) {0};
+		out[j] = ctrl.token[(in.size() + j + ctrl.head) % opts.buffer_tokens];
 		ctrl.pc = 1;
 		const size_t tokens
 		{
 			in.size() + i
 		};
 		const vector_view<f32> scratch
 		{
 			gpt::scratch, tokens * 768
 		};
 		for(uint j(0); j < in.size(); ++j)
 		{
 			const vector_view<f32> dst
 			{
 				data(scratch) + j * 768, 768
 			};
 			if(ircd::cl::enable)
 				ctrl.body.token[ctrl.tokens++] = in[j];
 			else
 				embed(data(dst), in[j], j, *opts);
 		}
 		for(uint j(0); j < i; ++j)
 		{
 			const vector_view<f32> dst
 			{
 				data(scratch) + (in.size() + j) * 768, 768
 			};
 			if(ircd::cl::enable)
 				ctrl.body.token[ctrl.tokens++] = out[j];
 			else
 				embed(data(dst), out[j], in.size() + j, *opts);
 		}
 		assert(!ircd::cl::enable || ctrl.tokens == tokens);
 		const vector_view<f32> last_embed
 		{
 			data(scratch) + (tokens - 1) * 768, 768
 		};
 		const auto last_cycl(task->cycles);
 		milliseconds last_time {0};
 		{
 			util::timer stopwatch;
 			const prof::scope_cycles task_cycles
 			{
 				task->cycles
 			};
 			if(ircd::cl::enable)
 			{
 				static const ctor_opts opts alignas(4096) {0};
 				transform(ctrl, opts);
 				out[i] = ctrl.body.token[ctrl.tokens - 1];
 				assert(ctrl.tokens == tokens + 1);
 			} else {
 				coil(data(scratch), tokens, *opts->model);
 				tail(logit, data(last_embed), *opts->model);
 				out[i] = argmax(logit, *opts);
 			}
 			last_time = stopwatch.at<milliseconds>();
 			task->time += last_time;
 		}
 		for(uint j(0); j < 3; ++j)
-			errc[j] =
+			errc[j] = opts.error_code[j][errc[j]] == out[j]?
-				opts->error_code[j][errc[j]] == out[i]?
+				errc[j] + 1: 0;
 					errc[j] + 1:
 					0;
 		for(uint j(0); j < 3; ++j)
-			accc[j] =
+			accc[j] = opts.accept_code[j][accc[j]] == out[j]?
-				opts->accept_code[j][accc[j]] == out[i]?
+				accc[j] + 1: 0;
 					accc[j] + 1:
 					0;
 		for(uint j(0); j < 3; ++j)
 			halt |= accc_thresh[j] && accc[j] >= accc_thresh[j],
@ -194,21 +188,23 @@ ircd::gpt::generate(const vector_view<u16> &out,
 		static char dbuf[512] {0};
 		char report[1536] {0};
 		char tmbuf[4][64] {0};
-		size_t report_size;
+		const size_t bsz(ctrl.tokens - in.size());
-		report_size = snprintf
+		const size_t report_size = snprintf
 		(
 			report, sizeof(report),
-			"%-2u %-3u %-3u [%5u] a:%u e:%u %s %8s %8s  |  %8s",
+			"%-2u %-2u %-3u %-3u %-3u [%5u] a:%u e:%u %s %8s %8s  |  %8s",
-			i,
+			j,
 			j + in.size(),
 			ctrl.tokens,
-			ret,
+			ctrl.cycle,
-			out[i],
+			ctrl.epoch,
 			out[j],
 			accc[0] + accc[1] + accc[2],
 			errc[0] + errc[1] + errc[2],
-			vocab::debug(dbuf, out[i]).c_str(),
+			vocab::debug(dbuf, out[j]).c_str(),
-			pretty(tmbuf[0], last_time, 1).c_str(),
+			pretty(tmbuf[0], milliseconds(last_time / bsz), 1).c_str(),
-			pretty(tmbuf[1], si(last_cycl), 1).c_str(),
+			pretty(tmbuf[1], si(cycles / bsz), 1).c_str(),
-			pretty(tmbuf[2], task->time, 1).c_str()
+			pretty(tmbuf[2], milliseconds(ctrl.elapsed), 1).c_str()
 		);
 		log::info
@ -216,24 +212,22 @@ ircd::gpt::generate(const vector_view<u16> &out,
 			log, "%s",
 			string_view{report, report_size}
 		};
 		++ret;
 		ctx::yield();
 		ctx::interruption_point();
 	}
 	ret = ctrl.tokens - in.size();
 	for(uint i(0); i < 3; ++i)
-		if(accc_thresh[i] && task->accept_seq[i] >= accc_thresh[i])
+		if(accc_thresh[i] && ctrl.accept_seq[i] >= accc_thresh[i])
 		{
 			ret -= (3 - accc_thresh[i]);
 			break;
 		}
-		else if(errc_thresh[i] && task->error_seq[i] >= errc_thresh[i])
+		else if(errc_thresh[i] && ctrl.error_seq[i] >= errc_thresh[i])
 		{
 			ret -= (3 - errc_thresh[i]);
 			break;
 		}
 	ctx::interruption_point();
 	return vector_view<u16>
 	{
 		out, ret
--- a/ircd/gpt_cl.cl
+++ b/ircd/gpt_cl.cl
@ -8,129 +8,27 @@
 // copyright notice and this permission notice is present in all copies. The
 // full license for this software is available in the LICENSE file.
 inline void
 ctor_local_bcast_ldr(__local float4 *const out,
                     const uint ln,
                     const uint li)
 {
 	for(uint stride = 1; stride < ln; stride <<= 1)
 	{
 		if(li < stride)
 			out[li + stride] = out[li];
 		barrier(CLK_LOCAL_MEM_FENCE);
 	}
 }
 inline void
-ctor_local_reduce_add_ldr(__local float4 *const out,
+ircd_gpt_norm_fmad(__local float4 *const out,
-                          const uint ln,
+                   __local const float4 *const in,
-                          const uint li)
+                   __global const float4 *const restrict bias,
-{
+                   __global const float4 *const restrict weight,
-	for(uint stride = ln >> 1; stride > 0; stride >>= 1)
+                   const uint i)
 	{
 		barrier(CLK_LOCAL_MEM_FENCE);
 		if(li < stride)
 			out[li] += out[li + stride];
 	}
 }
 inline void
 ctor_local_reduce_max_ldr(__local float *const out,
                          const uint ln,
                          const uint li)
 {
 	for(uint stride = ln >> 1; stride > 0; stride >>= 1)
 	{
 		barrier(CLK_LOCAL_MEM_FENCE);
 		if(li < stride)
 			out[li] = max(out[li], out[li + stride]);
 	}
 }
 inline void
 ctor_local_reduce_tournament_ldr(__local float *const best,
                                 __local ushort *const idx,
                                 const uint ln,
                                 const uint li)
 {
 	for(uint stride = ln >> 1; stride > 0; stride >>= 1)
 	{
 		barrier(CLK_LOCAL_MEM_FENCE);
 		if(li < stride && best[li] < best[li + stride])
 		{
 			best[li] = best[li + stride];
 			idx[li] = idx[li + stride];
 		}
 	}
 }
 inline void
 ctor_mean(__local float4 *const restrict out,
          __local const float4 *const restrict in,
          const uint num,
          const uint i)
 {
 	out[i] = in[i];
 	ctor_local_reduce_add_ldr(out, num, i);
 	float numerator = 0.0f;
 	float4 numeratorv = out[i];
 	for(uint k = 0; k < 4; ++k)
 		numerator += numeratorv[k];
 	out[i] = numerator;
 	ctor_local_bcast_ldr(out, num, i);
 	numeratorv = out[i];
 	out[i] = numeratorv / (num * 4);
 }
 inline void
 ctor_norm(__local float4 *const out,
          __local const float4 *const in,
          __local float4 *const restrict tmp,
          const uint num,
          const uint i)
 {
 	ctor_mean(tmp, in, num, i);
 	const float4
 	sub_mean = in[i] - tmp[i];
 	tmp[i] = pow(sub_mean, 2);
 	ctor_mean(out, tmp, num, i);
 	const float4
 	epsilon = 0.00001f,
 	s = sqrt(out[i] + epsilon);
 	out[i] = sub_mean / s;
 }
 inline void
 ctor_norm_fmad(__local float4 *const out,
               __local const float4 *const in,
               __global const float4 *const restrict bias,
               __global const float4 *const restrict weight,
               const uint i)
 {
 	out[i] = in[i] * weight[i] + bias[i];
 }
 // Matrix * Vector Multiply/Accumulate
 inline void
-ctor_sgemv(__local float4 *const restrict out,
+ircd_gpt_sgemv(__local float4 *const restrict out,
-           __local const float4 *const restrict in,
+               __local const float4 *const restrict in,
-           __global const float4 *const restrict bias,
+               __global const float4 *const restrict bias,
-           __global const float4 *const restrict weight,
+               __global const float4 *const restrict weight,
-           const uint width,
+               const uint width,
-           const uint height,
+               const uint height,
-           const uint tiles,
+               const uint tiles,
-           const uint i)
+               const uint i)
 {
 	const uint seg = height / tiles;
@ -151,9 +49,9 @@ ctor_sgemv(__local float4 *const restrict out,
 }
 inline void
-ctor_gelu(__local float4 *const out,
+ircd_gpt_gelu(__local float4 *const out,
-          __local const float4 *const in_,
+              __local const float4 *const in_,
-          const uint i)
+              const uint i)
 {
 	float4 a,
 	in = in_[i];
@ -178,14 +76,15 @@ ctor_gelu(__local float4 *const out,
 //
 __kernel void
-ctor_attn_fcon(__global const struct ctor_ctrl *const ctrl,
+ircd_gpt_ffnn(__global const struct ircd_gpt_task *const ctrl,
-               __constant const struct ctor_opts *const opts,
+              __constant const struct ircd_gpt_opts *const opts,
-               __global union aperaturev *const restrict out,
+              __global union ircd_gpt_tokenv *const restrict accum,
-               __global const union tokenv *const restrict in,
+              __global const float4 *const restrict norm_bias,
-               __global const float4 *const restrict norm_bias,
+              __global const float4 *const restrict norm_weight,
-               __global const float4 *const restrict norm_weight,
+              __global const float4 *const restrict fcon_bias,
-               __global const float4 *const restrict fcon_bias,
+              __global const float4 *const restrict fcon_weight,
-               __global const float4 *const restrict fcon_weight)
+              __global const float4 *const restrict proj_bias,
              __global const float4 *const restrict proj_weight)
 {
 	const uint
 	gi = get_global_id(0),
@ -195,31 +94,39 @@ ctor_attn_fcon(__global const struct ctor_ctrl *const ctrl,
 	wi = get_group_id(0),
 	wn = get_num_groups(0);
-	__local union aperaturev token;
+	__local union ircd_gpt_aperaturev token;
 	__local float4 tmp[768/4];
-	token.word[li] = in[wi].word[li];
+	// Fetch local copy of the global accumulator. We operate on a cached
 	// copy as input, and add our output to the global upon completion.
 	token.word[li] = accum[wi].word[li];
 	// Layer re-normalization
-	ctor_norm(token.word, token.word, tmp, ln, li);
+	ircd_simt_math_norm_f4lldr(token.word, token.word, tmp, ln, li);
-	ctor_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
+	ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
 	// Fully connected
-	for(uint i = 0; i < 3; ++i)
+	for(uint i = 0; i < 4; ++i)
-		ctor_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 2304/4, 768/4, 4, i * ln + li);
+		ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 3072/4, 768/4, 4, i * ln + li);
-	// Export queries, keys, and values.
+	// Gaussian Error Linear Unit
-	for(uint i = 0; i < 3; ++i)
+	for(uint i = 0; i < 4; ++i)
-		out[wi].proj[i][li] = token.proj[i][li];
+		ircd_gpt_gelu(token.fcon, token.fcon, i * ln + li);
 	// Projection
 	ircd_gpt_sgemv(tmp, token.fcon, proj_bias, proj_weight, 768/4, 3072/4, 4, li);
 	// Accumulation; end of layer
 	accum[wi].word[li] += tmp[li];
 }
 __kernel void
-ctor_attn_proj(__global const struct ctor_ctrl *const ctrl,
+ircd_gpt_attn_proj(__global const struct ircd_gpt_task *const ctrl,
-               __constant const struct ctor_opts *const opts,
+                   __constant const struct ircd_gpt_opts *const opts,
-               __global union tokenv *const restrict accum,
+                   __global union ircd_gpt_tokenv *const restrict accum,
-               __global const union tokenv *const restrict xattn,
+                   __local const union ircd_gpt_tokenv *const restrict xattn,
-               __global const float4 *const restrict proj_bias,
+                   __global const float4 *const restrict proj_bias,
-               __global const float4 *const restrict proj_weight)
+                   __global const float4 *const restrict proj_weight)
 {
 	const uint
 	gi = get_global_id(0),
@ -234,25 +141,24 @@ ctor_attn_proj(__global const struct ctor_ctrl *const ctrl,
 	out[768/4];
 	// Fetch
-	in[li] = xattn[wi].word[li];
+	in[li] = xattn->word[li];
 	// Need this here if xattn is __local
 	barrier(CLK_LOCAL_MEM_FENCE);
 	// Projection
-	ctor_sgemv(out, in, proj_bias, proj_weight, 768/4, 768/4, 1, li);
+	ircd_gpt_sgemv(out, in, proj_bias, proj_weight, 768/4, 768/4, 1, li);
 	// Accumulation; end of layer
 	accum[wi].word[li] += out[li];
 }
 __kernel void
-ctor_ffnn(__global const struct ctor_ctrl *const ctrl,
+ircd_gpt_attn_self(__global const struct ircd_gpt_task *const ctrl,
-          __constant const struct ctor_opts *const opts,
+                   __constant const struct ircd_gpt_opts *const opts,
-          __global union tokenv *const restrict accum,
+                   __local union ircd_gpt_tokenv *const restrict out,
-          __global const float4 *const restrict norm_bias,
+                   __global const struct ircd_gpt_qkvv *const restrict token,
-          __global const float4 *const restrict norm_weight,
+                   __global const struct ircd_gpt_attn_mask *const restrict mask)   // [1024][1024],
          __global const float4 *const restrict fcon_bias,
          __global const float4 *const restrict fcon_weight,
          __global const float4 *const restrict proj_bias,
          __global const float4 *const restrict proj_weight)
 {
 	const uint
 	gi = get_global_id(0),
@ -262,96 +168,13 @@ ctor_ffnn(__global const struct ctor_ctrl *const ctrl,
 	wi = get_group_id(0),
 	wn = get_num_groups(0);
-	__local union aperaturev token;
+	__local union
 	__local float4 tmp[768/4];
 	// Fetch local copy of the global accumulator. We operate on a cached
 	// copy as input, and add our output to the global upon completion.
 	token.word[li] = accum[wi].word[li];
 	// Layer re-normalization
 	ctor_norm(token.word, token.word, tmp, ln, li);
 	ctor_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
 	// Fully connected
 	for(uint i = 0; i < 4; ++i)
 		ctor_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 3072/4, 768/4, 4, i * ln + li);
 	// Gaussian Error Linear Unit
 	for(uint i = 0; i < 4; ++i)
 		ctor_gelu(token.fcon, token.fcon, i * ln + li);
 	// Projection
 	ctor_sgemv(tmp, token.fcon, proj_bias, proj_weight, 768/4, 3072/4, 4, li);
 	// Accumulation; end of layer
 	accum[wi].word[li] += tmp[li];
 }
 __kernel void
 ctor_backend(__global const struct ctor_ctrl *const ctrl,
             __constant const struct ctor_opts *const opts,
             __global union tokenv *const restrict accum,
             __global const union tokenv *const restrict xattn,
             __global const float4 *const restrict attn_proj_bias,
             __global const float4 *const restrict attn_proj_weight,
             __global const float4 *const restrict ffnn_norm_bias,
             __global const float4 *const restrict ffnn_norm_weight,
             __global const float4 *const restrict ffnn_fcon_bias,
             __global const float4 *const restrict ffnn_fcon_weight,
             __global const float4 *const restrict ffnn_proj_bias,
             __global const float4 *const restrict ffnn_proj_weight)
 {
 	ctor_attn_proj
 	(
 		ctrl,
 		opts,
 		accum,
 		xattn,
 		attn_proj_bias,
 		attn_proj_weight
 	);
 	ctor_ffnn
 	(
 		ctrl,
 		opts,
 		accum,
 		ffnn_norm_bias,
 		ffnn_norm_weight,
 		ffnn_fcon_bias,
 		ffnn_fcon_weight,
 		ffnn_proj_bias,
 		ffnn_proj_weight
 	);
 }
 //
 // ctrl
 //
 __kernel void
 ctor_attn_self(__global const struct ctor_ctrl *const ctrl,
               __constant const struct ctor_opts *const opts,
               __global union tokenv *const restrict out,
               __global const struct qkvv *const restrict token,
               __global const struct attn_mask *const restrict mask)   // [1024][1024],
 {
 	__local struct
 	{
 		float
-		attn[12][32];
+		attn[12][96];
 	}
 	self;
 	const uint
 	gi = get_global_id(0),
 	gn = get_global_size(0),
 	li = get_local_id(0),
 	ln = get_local_size(0),
 	wi = get_group_id(0),
 	wn = get_num_groups(0);
 	for(uint i = 0; i < wn; ++i)
 		if(mask[wi].token[i])
 			self.attn[li][i] = 0.0f;
@ -389,114 +212,167 @@ ctor_attn_self(__global const struct ctor_ctrl *const ctrl,
 		self.attn[li][i] /= acc;
 	for(uint j = 0; j < 64/4; ++j)
-		out[wi].attn[li][j] = 0.0f;
+		out->attn[li][j] = 0.0f;
 	for(uint i = 0; i < wn; ++i)
 		for(uint j = 0; j < 64/4; ++j)
-			out[wi].attn[li][j] += token[i].val.attn[li][j] * self.attn[li][i];
+			out->attn[li][j] += token[i].val.attn[li][j] * self.attn[li][i];
 }
 //
 // leads
 //
 __kernel void
-ctor_anode0(__global const struct ctor_ctrl *const ctrl,
+ircd_gpt_attn_fcon(__global const struct ircd_gpt_task *const ctrl,
-            __constant const struct ctor_opts *const opts,
+                   __constant const struct ircd_gpt_opts *const opts,
-            __global union tokenv *const restrict accum,
+                   __global union ircd_gpt_aperaturev *const restrict out,
-            __global const union tokenv *const restrict pos,
+                   __global const union ircd_gpt_tokenv *const restrict in,
-            __global const union tokenv *const restrict vocab)
+                   __global const float4 *const restrict norm_bias,
                   __global const float4 *const restrict norm_weight,
                   __global const float4 *const restrict fcon_bias,
                   __global const float4 *const restrict fcon_weight)
 {
 	const uint
 	gi = get_global_id(0),
 	gn = get_global_size(0),
 	li = get_local_id(0),
-	wi = get_group_id(0);
+	ln = get_local_size(0),
 	wi = get_group_id(0),
 	wn = get_num_groups(0);
-	const ushort
+	__local union ircd_gpt_aperaturev token;
-	token = ctrl->body.token[wi];
+	__local float4 tmp[768/4];
-	const float4
+	token.word[li] = in[wi].word[li];
 	wte = vocab[token].word[li],
 	wpe = pos[wi].word[li];
-	accum[wi].word[li] = wte + wpe;
+	// Layer re-normalization
 	ircd_simt_math_norm_f4lldr(token.word, token.word, tmp, ln, li);
 	ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
 	// Fully connected
 	for(uint i = 0; i < 3; ++i)
 		ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 2304/4, 768/4, 4, i * ln + li);
 	// Export queries, keys, and values.
 	for(uint i = 0; i < 3; ++i)
 		out[wi].proj[i][li] = token.proj[i][li];
 }
 __kernel void
-ctor_anode1(__global const struct ctor_ctrl *const ctrl,
+ircd_gpt_coil(__global const struct ircd_gpt_task *const ctrl,
-            __constant const struct ctor_opts *const opts,
+              __constant const struct ircd_gpt_opts *const opts,
-            __global union tokenv *const restrict accum,
+              __global union ircd_gpt_tokenv *const restrict accum,
-            __global const union tokenv *const restrict pos,
+              __global const struct ircd_gpt_qkvv *const restrict state,
-            __global const union tokenv *const restrict vocab)
+              __global const struct ircd_gpt_attn_mask *const restrict mask,   // [1024][1024],
              __global const float4 *const restrict attn_proj_bias,
              __global const float4 *const restrict attn_proj_weight,
              __global const float4 *const restrict ffnn_norm_bias,
              __global const float4 *const restrict ffnn_norm_weight,
              __global const float4 *const restrict ffnn_fcon_bias,
              __global const float4 *const restrict ffnn_fcon_weight,
              __global const float4 *const restrict ffnn_proj_bias,
              __global const float4 *const restrict ffnn_proj_weight)
 {
 	__local union ircd_gpt_tokenv value;
 	ircd_gpt_attn_self
 	(
 		ctrl,
 		opts,
 		&value,
 		state,
 		mask
 	);
 	ircd_gpt_attn_proj
 	(
 		ctrl,
 		opts,
 		accum,
 		&value,
 		attn_proj_bias,
 		attn_proj_weight
 	);
 	ircd_gpt_ffnn
 	(
 		ctrl,
 		opts,
 		accum,
 		ffnn_norm_bias,
 		ffnn_norm_weight,
 		ffnn_fcon_bias,
 		ffnn_fcon_weight,
 		ffnn_proj_bias,
 		ffnn_proj_weight
 	);
 }
 //
 // frontend
 //
 inline void
 _ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
                   __constant const struct ircd_gpt_opts *const opts,
                   __global union ircd_gpt_tokenv *const restrict out,
                   __global const union ircd_gpt_tokenv *const restrict pos,
                   __global const union ircd_gpt_tokenv *const restrict vocab,
                   const uint out_idx,
                   const uint tok_idx,
                   const uint word_idx)
 {
 	const ushort
 	token = ctrl->token[(ctrl->head + tok_idx) % opts->buffer_tokens];
 	const float4
 	wte = vocab[token].word[word_idx],
 	wpe = pos[tok_idx].word[word_idx];
 	out[out_idx].word[word_idx] = wte + wpe;
 }
 __kernel void
 ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
                  __constant const struct ircd_gpt_opts *const opts,
                  __global union ircd_gpt_tokenv *const restrict accum,
                  __global const union ircd_gpt_tokenv *const restrict pos,
                  __global const union ircd_gpt_tokenv *const restrict vocab)
 {
 	const uint
 	li = get_local_id(0);
 	for(uint i = 0; i < ctrl->tokens; ++i)
-	{
+		_ircd_gpt_lm_embed(ctrl, opts, accum, pos, vocab, i, i, li);
 		const ushort
 		token = ctrl->body.token[i];
 		const float4
 		wte = vocab[token].word[li],
 		wpe = pos[i].word[li];
 		accum[i].word[li] = wte + wpe;
 	}
 }
 __kernel void
-ctor_anode2(__global const struct ctor_ctrl *const ctrl,
+ircd_gpt_lm_norm(__global const struct ircd_gpt_task *const ctrl,
-            __constant const struct ctor_opts *const opts,
+                 __constant const struct ircd_gpt_opts *const opts,
-            __global union tokenv *const restrict accum,
+                 __global union ircd_gpt_tokenv *const restrict accum,
-            __global const union tokenv *const restrict pos,
+                 __global const float4 *const restrict norm_bias,
-            __global const union tokenv *const restrict vocab)
+                 __global const float4 *const restrict norm_weight)
 {
 	const uint
 	gi = get_global_id(0);
 	const ushort
 	token = ctrl->body.token[gi];
 	for(uint i = 0; i < 768/4; ++i)
 	{
 		const float4
 		wte = vocab[token].word[i],
 		wpe = pos[gi].word[i];
 		accum[gi].word[i] = wte + wpe;
 	}
 }
 __kernel void
 ctor_cathode(__global const struct ctor_ctrl *const ctrl,
             __constant const struct ctor_opts *const opts,
             __global union tokenv *const restrict accum,
             __global const float4 *const restrict norm_bias,
             __global const float4 *const restrict norm_weight)
 {
 	const uint
 	li = get_local_id(0),
 	ln = get_local_size(0),
 	wi = get_global_offset(0) / ln + get_group_id(0);
-	__local union tokenv
+	__local union ircd_gpt_tokenv
 	token, tmp;
 	token.word[li] = accum[wi].word[li];
 	// Final re-normalization
-	ctor_norm(token.word, token.word, tmp.word, ln, li);
+	ircd_simt_math_norm_f4lldr(token.word, token.word, tmp.word, ln, li);
-	ctor_norm_fmad(token.word, token.word, norm_bias, norm_weight, li);
+	ircd_gpt_norm_fmad(token.word, token.word, norm_bias, norm_weight, li);
 	accum[0].word[li] = token.word[li];
 }
 __kernel void
-ctor_lmhead(__global const struct ctor_ctrl *const ctrl,
+ircd_gpt_lm_logit(__global const struct ircd_gpt_task *const ctrl,
-            __constant const struct ctor_opts *const opts,
+                  __constant const struct ircd_gpt_opts *const opts,
-            __global float *const restrict logit,
+                  __global float *const restrict logit,
-            __global const union tokenv *const restrict accum,
+                  __global const union ircd_gpt_tokenv *const restrict accum,
-            __global const union tokenv *const restrict token)
+                  __global const union ircd_gpt_tokenv *const restrict token)
 {
 	const uint
 	gi = get_global_id(0);
@ -519,10 +395,79 @@ ctor_lmhead(__global const struct ctor_ctrl *const ctrl,
 	logit[gi] = res;
 }
 inline void
 ircd_gpt_leave(__global struct ircd_gpt_task *const ctrl,
               __constant const struct ircd_gpt_opts *const opts,
               const uint li)
 {
 	// No action for other threads right now
 	if(li != 0)
 		return;
 	// Run debug checks and assertions.
 	#ifdef RB_DEBUG
 	if(li == 0 && ctrl->call == IRCD_GPT_ECOMPLETE)
 		if(ctrl->tokens < 2)
 			ctrl->call = IRCD_GPT_ETOKENS;
 	#endif
 	// If the call value has been set to something other than default we
 	// do nothing else here.
 	if(ctrl->call != IRCD_GPT_ECOMPLETE)
 		return;
 	// On the last cycle, with no prior call or error code set, indicate
 	// a nominal exit condition.
 	if(ctrl->cycle + 1 >= opts->limit)
 	{
 		ctrl->call = IRCD_GPT_ACCEPT;
 		ctrl->epoch += 1;
 	}
 	ctrl->cycle += 1;
 }
 inline void
 ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl,
                   __constant const struct ircd_gpt_opts *const opts,
                   const uint li,
                   __local const ushort *const restrict idx)
 {
 	// To read from cells other than idx[0] we need this barrier.
 	if(opts->top_k > 1)
 		barrier(CLK_LOCAL_MEM_FENCE);
 	// No action for other threads right now
 	if(li != 0)
 		return;
 	// When the hypercall code is already set, bail here.
 	if(ctrl->call != IRCD_GPT_ECOMPLETE)
 		return;
 	const bool
 	buffer_full = ctrl->tokens >= opts->buffer_tokens;
 	const ulong
 	rnd = ircd_simt_rand_xoshiro256pg(ctrl->rand),
 	sel = rnd % max(opts->top_k, 1U);
 	const ushort
 	token = idx[sel],
 	token_idx = (ctrl->head + ctrl->tokens) % opts->buffer_tokens;
 	ctrl->token[token_idx] = token;
 	if(buffer_full)
 		ctrl->head = (ctrl->head + 1) % opts->buffer_tokens;
 	else
 		ctrl->tokens++;
 }
 __kernel void
-ctor_lmamax(__global struct ctor_ctrl *const ctrl,
+ircd_gpt_lm_select(__global struct ircd_gpt_task *const ctrl,
-            __constant const struct ctor_opts *const opts,
+                   __constant const struct ircd_gpt_opts *const opts,
-            __global const float *const restrict logit)
+                   __global const float *const restrict logit)
 {
 	const uint
 	gi = get_global_id(0),
@ -535,25 +480,13 @@ ctor_lmamax(__global struct ctor_ctrl *const ctrl,
 	ti = tn * li;
 	__local ushort idx[192];
 	__local float best[192];
 	idx[li] = ti;
 	for(uint j = ti + 1; j < ti + tn && j < 50257; ++j)
 		if(logit[j] > logit[idx[li]])
 			idx[li] = j;
-	best[li] = logit[idx[li]];
+	ircd_simt_sort_idx16_flldr(idx, logit, ln, li);
-	ctor_local_reduce_tournament_ldr(best, idx, ln, li);
+	ircd_gpt_lm_result(ctrl, opts, li, idx);
-
+	ircd_gpt_leave(ctrl, opts, li);
 	if(li == 0 && ctrl->call == -1)
 		ctrl->body.token[ctrl->tokens++] = idx[li];
 	if(li == 0 && ctrl->call == -1)
 		ctrl->call = 1;
 	#ifdef RB_DEBUG
 	if(li == 0 && ctrl->call == 1)
 		if(ctrl->tokens < 2)
 			ctrl->call = -2;
 	#endif
 }
--- a/ircd/gpt_model.cc
+++ b/ircd/gpt_model.cc
@ -142,7 +142,7 @@ ircd::gpt::model::init_from_cache(const string_view &cache_path)
 	fs::map::opts map_opts;
 	map_opts.huge2mb = true;
-	map_opts.locked = false;
+	map_opts.locked = true;
 	default_model_shm = fs::map
 	{
 		fd, map_opts, sizeof(decoder)
--- a/ircd/gpt_pipe.cc
+++ b/ircd/gpt_pipe.cc
@ -8,21 +8,33 @@
 // copyright notice and this permission notice is present in all copies. The
 // full license for this software is available in the LICENSE file.
 #include <ircd/gpt/pipe/pipe.h>
 namespace ircd::gpt
 {
 	void transform(ctor_ctrl &, const ctor_opts &);
 }
 namespace ircd::gpt::pipe
 {
-	static ircd::cl::exec::opts negative_opts, positive_opts, selfattn_opts, cathode_opts, anode_opts,
+	static void profile_dumplog(pipe::exec &);
 	lmhead_opts, lmamax_opts;
 	static ircd::cl::exec::opts
 	negative_opts, positive_opts, selfattn_opts,
 	cathode_opts, anode_opts, lmhead_opts, lmamax_opts;
 	extern conf::item<size_t> flush_cycles;
 	extern conf::item<size_t> queue_cycles;
 	extern const ircd::run::changed handle_quit;
 }
 decltype(ircd::gpt::pipe::queue_cycles)
 ircd::gpt::pipe::queue_cycles
 {
 	{ "name",     "ircd.gpt.pipe.queue" },
 	{ "default",  1L,                   },
 };
 decltype(ircd::gpt::pipe::flush_cycles)
 ircd::gpt::pipe::flush_cycles
 {
 	{ "name",     "ircd.gpt.pipe.flush" },
 	{ "default",  0L,                   },
 };
 decltype(ircd::gpt::pipe::default_model)
 ircd::gpt::pipe::default_model;
@ -82,144 +94,233 @@ noexcept
 //
 void
-ircd::gpt::transform(ctor_ctrl &ctrl,
+ircd::gpt::generate(task &task)
                     const ctor_opts &opts)
 {
 	if(unlikely(!pipe::default_model))
 		pipe::init();
-	ctrl.call = -1;
+	const auto &opts
 	pipe::exec
 	{
-		ctrl, opts
+		*task.opts
 	};
 	auto &ctrl
 	{
 		*task.ctrl
 	};
 	ctrl.call = IRCD_GPT_ECOMPLETE;
 	ctrl.host_tsc = prof::cycles();
 	size_t cycle(ctrl.cycle);
 	const size_t tokens(ctrl.tokens);
 	std::deque<pipe::exec> list;
 	for(; cycle < opts.limit; ++cycle)
 	{
 		// When the release/acquire bits are set the control pages are sent
 		// and received; only set on first and last iterations of this loop.
 		const bool
 		rel(cycle == 0),
 		acq(cycle + 1 >= opts.limit);
 		// Enqueue the cycle's commands
 		list.emplace_back
 		(
 			task, tokens + cycle, rel, acq
 		);
 		// Conditions for a cl::flush here; this is not default but
 		// may be configured to improve some workloads.
 		const bool flush
 		{
 			// Flushing here is enabled by the configuration
 			pipe::flush_cycles
 			// Skip flushing on cycles already performing IO or waiting.
 			&& !rel && !acq && list.size() <= pipe::queue_cycles
 			// The configuration item can specify an interval greater than
 			// one between flushes.
 			&& cycle % pipe::flush_cycles == 0
 		};
 		if(flush)
 			cl::flush();
 		// Enqueue consecutive repetitions of our kernel batch before waiting
 		// on the first; based on the configuration. XXX get from ircd::cl
 		if(list.size() <= pipe::queue_cycles)
 			continue;
 		// Profiling branch
 		if((false))
 		{
 			auto &ex(list.front());
 			profile_dumplog(ex);
 		}
 		// Destructing the front of the queue waits for completion by yielding
 		// this ircd::ctx.
 		list.pop_front();
 	}
 	// Wait for all unfinished
 	list.clear();
 	// Interp error codes
 	if(unlikely(ctrl.call <= 0))
 		throw error
 		{
 			"hyper (#%d) :%s",
-			abs(ctrl.call),
+			abs(int(ctrl.call)),
-			ctrl.body.str,
+			reflect(ctrl.call),
 		};
 	always_assert(ctrl.cycle == cycle);
 }
 void
 ircd::gpt::pipe::profile_dumplog(pipe::exec &exec)
 {
 	constexpr size_t coils
 	{
 		sizeof(exec.coil) / sizeof(cl::exec)
 	};
 	for(size_t i(0); i < coils; ++i)
 	{
 		exec.coil[i].wait();
 		const auto &pro
 		{
 			exec.coil[i].profile()
 		};
 		char tmbuf[4][32] {{0}};
 		log::logf
 		{
 			log, log::level::DEBUG,
 			"coil:%-2lu %8s %8s %8s %8s\n",
 			i,
 			util::pretty(tmbuf[0], si(pro[0]), 1),
 			util::pretty(tmbuf[1], si(pro[1]), 1),
 			util::pretty(tmbuf[2], si(pro[2]), 1),
 			util::pretty(tmbuf[3], si(pro[3]), 1),
 		};
 	}
 }
 //
 // pipe::exec
 //
-ircd::gpt::pipe::exec::exec(ctor_ctrl &ctrl,
+ircd::gpt::pipe::exec::exec(task &task,
-                            const ctor_opts &opts)
+                            const size_t tokens,
                            const bool release,
                            const bool acquire)
 :desc
 {
 	default_desc
 }
-,out_ctrl
+,send_opts
 {
-	reinterpret_cast<char *>(&ctrl), sizeof(ctor_ctrl)
+	reinterpret_cast<const char *>(task.opts),
 	release? sizeof(struct ircd_gpt_opts): 0
 }
-,in_ctrl
+,send_ctrl
 {
-	reinterpret_cast<const char *>(&ctrl), sizeof(ctor_ctrl)
+	reinterpret_cast<const char *>(task.ctrl),
 	release? sizeof(struct ircd_gpt_task): 0
 }
-,in_opts
+,recv_ctrl
 {
-	reinterpret_cast<const char *>(&opts), sizeof(ctor_opts)
+	reinterpret_cast<char *>(task.ctrl),
 	acquire? sizeof(struct ircd_gpt_task): 0
 }
-,range_anode
+,range_lm_embed
 {
-	{ ctrl.tokens, 0, },
+	{      1 * 192UL, 0, },
-	{           1, 0, },
+	{          192UL, 0, },
 }
 ,range_coil
 {
 	{ ctrl.tokens * 192UL, 0, },
 	{               192UL, 0, },
 }
 ,range_negative
 {
-	range_coil
+	{ tokens * 192UL, 0, },
-}
+	{          192UL, 0, },
 ,range_selfattn
 {
 	range_coil
 }
 ,range_positive
 {
-	range_coil
+	{ tokens * 192UL, 0, },
 	{          192UL, 0, },
 }
-,range_cathode
+,range_lm_norm
 {
-	{                 1 * 192UL, 0 },
+	{            1 * 192UL, 0 },
-	{                     192UL, 0 },
+	{                192UL, 0 },
-	{ (ctrl.tokens - 1) * 192UL, 0 },
+	{ (tokens - 1) * 192UL, 0 },
 }
-,range_lmhead
+,range_lm_logit
 {
 	{ 262 * 192UL, 0 },  // align_up(50257) / 192
 	{       192UL, 0 },
 }
-,range_lmamax
+,range_lm_select
 {
 	{   1 * 192UL, 0 },
 	{       192UL, 0 },
 }
-,send
+,release_opts
 {
-	{ desc->opts, in_opts },
+	desc->opts, send_opts
 	{ desc->ctrl, in_ctrl },
 }
-,tail
+,release_ctrl
 {
-	{ desc->anode,     range_anode,   anode_opts    },
+	desc->ctrl, send_ctrl
 }
 ,lm_embed
 {
 	desc->lm_embed, range_lm_embed, anode_opts
 }
 ,coil
 {
 	{ desc->layer[0x00]->negative, range_negative, negative_opts },
 	{ desc->layer[0x00]->selfattn, range_selfattn, selfattn_opts },
 	{ desc->layer[0x00]->positive, range_positive, positive_opts },
 	{ desc->layer[0x01]->negative, range_negative, negative_opts },
 	{ desc->layer[0x01]->selfattn, range_selfattn, selfattn_opts },
 	{ desc->layer[0x01]->positive, range_positive, positive_opts },
 	{ desc->layer[0x02]->negative, range_negative, negative_opts },
 	{ desc->layer[0x02]->selfattn, range_selfattn, selfattn_opts },
 	{ desc->layer[0x02]->positive, range_positive, positive_opts },
 	{ desc->layer[0x03]->negative, range_negative, negative_opts },
 	{ desc->layer[0x03]->selfattn, range_selfattn, selfattn_opts },
 	{ desc->layer[0x03]->positive, range_positive, positive_opts },
 	{ desc->layer[0x04]->negative, range_negative, negative_opts },
 	{ desc->layer[0x04]->selfattn, range_selfattn, selfattn_opts },
 	{ desc->layer[0x04]->positive, range_positive, positive_opts },
 	{ desc->layer[0x05]->negative, range_negative, negative_opts },
 	{ desc->layer[0x05]->selfattn, range_selfattn, selfattn_opts },
 	{ desc->layer[0x05]->positive, range_positive, positive_opts },
 	{ desc->layer[0x06]->negative, range_negative, negative_opts },
 	{ desc->layer[0x06]->selfattn, range_selfattn, selfattn_opts },
 	{ desc->layer[0x06]->positive, range_positive, positive_opts },
 	{ desc->layer[0x07]->negative, range_negative, negative_opts },
 	{ desc->layer[0x07]->selfattn, range_selfattn, selfattn_opts },
 	{ desc->layer[0x07]->positive, range_positive, positive_opts },
 	{ desc->layer[0x08]->negative, range_negative, negative_opts },
 	{ desc->layer[0x08]->selfattn, range_selfattn, selfattn_opts },
 	{ desc->layer[0x08]->positive, range_positive, positive_opts },
 	{ desc->layer[0x09]->negative, range_negative, negative_opts },
 	{ desc->layer[0x09]->selfattn, range_selfattn, selfattn_opts },
 	{ desc->layer[0x09]->positive, range_positive, positive_opts },
 	{ desc->layer[0x0a]->negative, range_negative, negative_opts },
 	{ desc->layer[0x0a]->selfattn, range_selfattn, selfattn_opts },
 	{ desc->layer[0x0a]->positive, range_positive, positive_opts },
 	{ desc->layer[0x0b]->negative, range_negative, negative_opts },
 	{ desc->layer[0x0b]->selfattn, range_selfattn, selfattn_opts },
 	{ desc->layer[0x0b]->positive, range_positive, positive_opts },
 }
-,head
+,lm_norm
 {
-	{ desc->cathode,  range_cathode,  cathode_opts  },
+	desc->lm_norm, range_lm_norm, cathode_opts
 	{ desc->lmhead,   range_lmhead,   lmhead_opts   },
 	{ desc->lmamax,   range_lmamax,   lmamax_opts   },
 }
-,recv
+,lm_logit
 {
-	{ desc->ctrl, out_ctrl },
+	desc->lm_logit, range_lm_logit, lmhead_opts
 }
 ,lm_select
 {
 	desc->lm_select, range_lm_select, lmamax_opts
 }
 ,acquire_ctrl
 {
 	desc->ctrl, recv_ctrl
 }
 {
 }
@ -296,29 +397,14 @@ ircd::gpt::pipe::desc::desc(pipe::code &code,
 {
 	&code
 }
 ,opts
 {
 	4_KiB,
 	const_buffer{}
 }
 ,ctrl
 {
 	4_KiB,
 	mutable_buffer{}
 }
 ,state
 {
-	32 * 3 * 768 * sizeof(float),
+	96 * 3 * 768 * sizeof(float),
 	mutable_buffer{}
 }
 ,xattn
 {
 	32 * 1 * 768 * sizeof(float),
 	mutable_buffer{}
 }
 ,accum
 {
-	32 * 768 * sizeof(float),
+	96 * 768 * sizeof(float),
 	mutable_buffer{}
 }
 ,logit
@ -326,16 +412,54 @@ ircd::gpt::pipe::desc::desc(pipe::code &code,
 	65536 * sizeof(float),
 	mutable_buffer{}
 }
-,anode
+,ctrl
 {
 	sizeof(struct ircd_gpt_task),
 	mutable_buffer{}
 }
 ,opts
 {
 	sizeof(struct ircd_gpt_opts),
 	const_buffer{}
 }
 ,lm_embed
 {
 	code,
-	"ctor_anode2",
+	"ircd_gpt_lm_embed",
 	ctrl,
 	opts,
 	accum,
 	model.embed->pos,
 	model.embed->token,
 }
 ,lm_norm
 {
 	code,
 	"ircd_gpt_lm_norm",
 	ctrl,
 	opts,
 	accum,
 	model.decode->norm.bias,
 	model.decode->norm.weight,
 }
 ,lm_logit
 {
 	code,
 	"ircd_gpt_lm_logit",
 	ctrl,
 	opts,
 	logit,
 	accum,
 	model.embed->token,
 }
 ,lm_select
 {
 	code,
 	"ircd_gpt_lm_select",
 	ctrl,
 	opts,
 	logit,
 }
 ,layer
 {
 	std::make_unique<struct desc::layer>(*this, 0x00),
@ -351,34 +475,6 @@ ircd::gpt::pipe::desc::desc(pipe::code &code,
 	std::make_unique<struct desc::layer>(*this, 0x0a),
 	std::make_unique<struct desc::layer>(*this, 0x0b),
 }
 ,cathode
 {
 	code,
 	"ctor_cathode",
 	ctrl,
 	opts,
 	accum,
 	model.decode->norm.bias,
 	model.decode->norm.weight,
 }
 ,lmhead
 {
 	code,
 	"ctor_lmhead",
 	ctrl,
 	opts,
 	logit,
 	accum,
 	model.embed->token,
 }
 ,lmamax
 {
 	code,
 	"ctor_lmamax",
 	ctrl,
 	opts,
 	logit,
 }
 {
 }
@ -391,7 +487,7 @@ ircd::gpt::pipe::desc::layer::layer(pipe::desc &desc,
 :negative
 {
 	*desc.code,
-	"ctor_attn_fcon",
+	"ircd_gpt_attn_fcon",
 	desc.ctrl,
 	desc.opts,
 	desc.state,
@ -401,24 +497,15 @@ ircd::gpt::pipe::desc::layer::layer(pipe::desc &desc,
 	desc.model->decode->block[laynum].attn.fcon.bias,
 	desc.model->decode->block[laynum].attn.fcon.weight,
 }
 ,selfattn
 {
 	*desc.code,
 	"ctor_attn_self",
 	desc.ctrl,
 	desc.opts,
 	desc.xattn,
 	desc.state,
 	desc.model->decode->block[laynum].attn.mask,
 }
 ,positive
 {
 	*desc.code,
-	"ctor_backend",
+	"ircd_gpt_coil",
 	desc.ctrl,
 	desc.opts,
 	desc.accum,
-	desc.xattn,
+	desc.state,
 	desc.model->decode->block[laynum].attn.mask,
 	desc.model->decode->block[laynum].attn.proj.bias,
 	desc.model->decode->block[laynum].attn.proj.weight,
 	desc.model->decode->block[laynum].ffnn.norm.bias,
@ -486,23 +573,33 @@ noexcept
 //
 ircd::gpt::pipe::model::decoder::decoder(const gpt::model::decoder &decoder)
-:block
+:master
 {
-	{ decoder.layer[0x00], 0x00, },
+	sizeof(gpt::model::block) * 12 + sizeof(gpt::model::norm), const_buffer
-	{ decoder.layer[0x01], 0x01, },
+	{
-	{ decoder.layer[0x02], 0x02, },
+		reinterpret_cast<const char *>(decoder.layer),
-	{ decoder.layer[0x03], 0x03, },
+		sizeof(decoder.layer) + sizeof(decoder.f)
-	{ decoder.layer[0x04], 0x04, },
+	}
-	{ decoder.layer[0x05], 0x05, },
+}
-	{ decoder.layer[0x06], 0x06, },
+,block
-	{ decoder.layer[0x07], 0x07, },
+{
-	{ decoder.layer[0x08], 0x08, },
+	{ master, sizeof(gpt::model::block) * 0x00, decoder.layer[0x00], 0x00, },
-	{ decoder.layer[0x09], 0x09, },
+	{ master, sizeof(gpt::model::block) * 0x01, decoder.layer[0x01], 0x01, },
-	{ decoder.layer[0x0a], 0x0a, },
+	{ master, sizeof(gpt::model::block) * 0x02, decoder.layer[0x02], 0x02, },
-	{ decoder.layer[0x0b], 0x0b, },
+	{ master, sizeof(gpt::model::block) * 0x03, decoder.layer[0x03], 0x03, },
 	{ master, sizeof(gpt::model::block) * 0x04, decoder.layer[0x04], 0x04, },
 	{ master, sizeof(gpt::model::block) * 0x05, decoder.layer[0x05], 0x05, },
 	{ master, sizeof(gpt::model::block) * 0x06, decoder.layer[0x06], 0x06, },
 	{ master, sizeof(gpt::model::block) * 0x07, decoder.layer[0x07], 0x07, },
 	{ master, sizeof(gpt::model::block) * 0x08, decoder.layer[0x08], 0x08, },
 	{ master, sizeof(gpt::model::block) * 0x09, decoder.layer[0x09], 0x09, },
 	{ master, sizeof(gpt::model::block) * 0x0a, decoder.layer[0x0a], 0x0a, },
 	{ master, sizeof(gpt::model::block) * 0x0b, decoder.layer[0x0b], 0x0b, },
 }
 ,norm
 {
 	master,
 	off_t(sizeof(gpt::model::block) * 12),
 	const_buffer{decoder.f.bias},
 	const_buffer{decoder.f.weight},
 }
@ -544,6 +641,27 @@ ircd::gpt::pipe::model::block::block(const gpt::model::block &block,
 {
 }
 ircd::gpt::pipe::model::block::block(cl::data &master,
                                     const off_t offset,
                                     const gpt::model::block &block,
                                     const size_t layer)
 :attn
 {
 	master,
 	offset,
 	block.ln1,
 	block.attn,
 }
 ,ffnn
 {
 	master,
 	offset + off_t(sizeof(block.ln1) + sizeof(block.attn)),
 	block.ln2,
 	block.ffnn,
 }
 {
 }
 //
 // pipe::model::ffnn
 //
@ -678,3 +796,48 @@ ircd::gpt::pipe::model::tensor::tensor(cl::data &master,
 }
 {
 }
 //
 // gpt::task
 //
 ircd::gpt::task::task(const gpt::opts *const opts,
                      struct ircd_gpt_task *const ctrl)
 :opts
 {
 	opts
 }
 ,ctrl
 {
 	ctrl
 }
 {
 	memset(this->ctrl, 0x0, sizeof(ircd_gpt_task));
    this->ctrl->rand[0] = this->opts->seed;
    this->ctrl->rand[1] = this->opts->seed;
    this->ctrl->rand[2] = -1UL;
    this->ctrl->rand[3] = -1UL;
 }
 ircd::gpt::task::~task()
 noexcept
 {
 }
 //
 // hypercall
 //
 ircd::string_view
 ircd::gpt::reflect(const enum ircd_gpt_hypercall code)
 noexcept
 {
 	switch(code)
 	{
 		case IRCD_GPT_ACCEPT:      return "ACCEPT";
 		case IRCD_GPT_ECOMPLETE:   return "ECOMPLETE";
 	}
 	return "??????";
 }
--- a/modules/console.cc
+++ b/modules/console.cc
@ -17324,10 +17324,15 @@ console_cmd__gpt__raw(opt &out, const string_view &line)
 	opts.limit = param.at<uint>("limit");
 	opts.top_k = 3;
-	gpt::task task;
+	struct ircd_gpt_task ctrl;
 	gpt::task task
 	{
 		&opts, &ctrl
 	};
 	const auto output
 	{
-		gpt::generate(buf, text, &opts, &task)
+		gpt::generate(buf, text, task)
 	};
 	out