
ircd::gpt: Reorganize interface.

Jason Volk 2021-04-02 13:01:38 -07:00
parent f096e7fcf5
commit 1870c364f4
17 changed files with 1067 additions and 778 deletions


@ -13,15 +13,16 @@
namespace ircd::gpt
{
void
generate(task &);
vector_view<u16>
generate(const vector_view<u16> &out,
const vector_view<const u16> &in,
const opts * = &default_opts,
task * = nullptr);
task &);
string_view
generate(const mutable_buffer &out,
const string_view &in,
const opts * = &default_opts,
task * = nullptr);
task &);
}


@ -20,63 +20,14 @@ namespace ircd::gpt
struct opts;
struct task;
extern const opts default_opts;
extern log::log log;
}
#include "hypercall.h"
#include "vocab.h"
#include "model.h"
#include "token.h"
#include "opts.h"
#include "task.h"
#include "pipe/pipe.h"
#include "generate.h"
/// Primary Options
///
/// Use this structure to configure and control specifics of the machine.
/// These settings are immutable for the operations. To maintain state between
/// calls see task.h
///
struct ircd::gpt::opts
{
/// Specifies the nominal halting condition based on the sequence of
/// tokens. Generation will complete when this sequence is witnessed. Set
/// tokens to -1 starting from the back to not match that token. Setting
/// all tokens to -1 will ignore this condition.
uint accept_code[3][3]
{
{ 13, 198, -1U, },
{ 198, 198, -1U, },
{ -1U, -1U, -1U, },
};
/// Specifies the exceptional halting condition based on the sequence of
/// tokens. By default, the three zeros represent three outputs of '!'
/// which is probably an error code; note that a true "!!!" is represented
/// by token number 10185. Set tokens to -1 starting from the back to
/// not match that token; generated output after errors is usually garbage.
uint error_code[3][3]
{
{ 0, 0, 0, },
{ -1U, 0, 0, },
{ -1U, 0, 0, },
};
/// Limit number of output tokens. Default of -1 is unlimited; the number
/// of tokens generated will be limited by other factors.
uint limit
{
-1U
};
/// Flip random coins over the top k logits each round. Setting to 1
/// deterministically selects the top logit.
uint top_k
{
2
};
/// Pointer to the model
const model::decoder *model
{
model::default_model
};
};


@ -0,0 +1,34 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_HYPERCALL_H
/// Hypercalling code enumeration.
///
/// Error codes are all negative values. Zero is also an error.
enum ircd_gpt_hypercall
{
/// General nominal completion code; similar to EXIT_SUCCESS, etc.
IRCD_GPT_ACCEPT = 1,
/// Failed or incomplete execution. If this value remains after an execution
/// attempt, it indicates that no execution likely took place. Device software
/// never sets this value; it is the initial value set by the host before
/// execution.
IRCD_GPT_ECOMPLETE = 0,
};
#ifdef __cplusplus
namespace ircd::gpt
{
string_view reflect(const enum ircd_gpt_hypercall) noexcept;
}
#endif
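
For reference, the host-side interpretation of the call value added later in this diff reduces to the following sketch, where `ctrl` is the ircd_gpt_task page returned by the device and `error` is ircd::error:

    // Any non-positive call value is treated as a failed hypercall.
    if(ctrl.call <= 0)
        throw error
        {
            "hyper (#%d) :%s", abs(int(ctrl.call)), reflect(ctrl.call),
        };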


@ -67,7 +67,8 @@ struct ircd::gpt::model::block
norm ln2;
model::ffnn ffnn;
};
}
__attribute__((packed));
/// Vocabulary embeddings
struct ircd::gpt::model::embed

include/ircd/gpt/opts.h (new file, +123 lines)

@ -0,0 +1,123 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_OPTS_H
/// Task Options Page
///
/// The option block is directly shared with task software as constant data.
/// This structure and its mutable companion in `task.h` determine the outcome
/// of the next execution cycle; options are immutable to device software but
/// may be changed by the host between execution cycles if desired.
///
struct ircd_gpt_opts
{
/// Specifies the nominal halting condition based on a sequence of tokens.
/// Generation will complete with success after one of these sequences is
/// witnessed. Set tokens to -1 starting from the back for shorter
/// sequences; zero-length sequences (all -1's) are never matched.
uint accept_code[4][4]
#ifdef __cplusplus
{
{ 13, 198, -1U, -1U, },
{ 198, 198, -1U, -1U, },
{ -1U, -1U, -1U, -1U, },
{ -1U, -1U, -1U, -1U, },
}
#endif
;
/// Specifies the exceptional halting condition based on the sequence of
/// tokens. By default, the three zeros represent three outputs of '!'
/// which is probably an error; note that a true "!!!" is represented by
/// token number 10185. Set tokens to -1 starting from the back to not
/// match that token; generated output after errors is usually garbage.
uint error_code[4][4]
#ifdef __cplusplus
{
{ 0, 0, 0, -1U, },
{ -1U, -1U, -1U, -1U, },
{ -1U, -1U, -1U, -1U, },
{ -1U, -1U, -1U, -1U, },
}
#endif
;
/// Limit number of output tokens. Default of -1 is unlimited; the number
/// of tokens generated will be limited by other factors.
uint limit
#ifdef __cplusplus
{
1
}
#endif
;
/// Flip random coins over the top k logits each round. Setting to 1
/// deterministically selects the top logit.
uint top_k
#ifdef __cplusplus
{
2
}
#endif
;
/// Specifies the token context size in tokens.
uint context_tokens
#ifdef __cplusplus
{
1024
}
#endif
;
/// Specifies the token buffer size in tokens.
uint buffer_tokens
#ifdef __cplusplus
{
1024
}
#endif
;
/// Seed for the task's PRNG.
ulong seed
#ifdef __cplusplus
{
1234567890UL
}
#endif
;
}
__attribute__((aligned(4096)));
#ifdef __cplusplus
/// Generator Task Options.
///
/// Parameters for a task. Options are constant and one instance can be shared
/// between multiple task instances. This structure extends the task options
/// page, starting a new page which is not visible to device software; C++ and
/// host pointers are available.
///
struct ircd::gpt::opts
:ircd_gpt_opts
{
/// Pointer to the model
const model::decoder *model
{
model::default_model
};
};
static_assert(sizeof(struct ircd_gpt_opts) == 4096);
static_assert(std::is_standard_layout<struct ircd_gpt_opts>::value);
#endif
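
As an illustration of the halting rule (a standalone sketch, not code from this commit): each row keeps a counter of consecutive matches against the latest output token, and a row halts generation once the counter reaches the count of entries before its -1U terminator. The generate() loop later in this diff implements the same idea with per-row counters and thresholds.

    // Sketch only: advance or reset one row's consecutive-match counter.
    static bool
    match_row(const unsigned (&row)[4], unsigned &counter, const unsigned token)
    {
        unsigned thresh(0);
        while(thresh < 4 && row[thresh] != -1U)
            ++thresh;                         // entries before the -1U terminator

        counter = (counter < thresh && row[counter] == token)?
            counter + 1: 0;                   // advance on match, otherwise reset

        return thresh && counter >= thresh;   // all-(-1U) rows never match
    }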


@ -0,0 +1,22 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_PIPE_CODE_H
/// Pipe code segment.
struct ircd::gpt::pipe::code
:cl::code
{
static const string_view compile_opts;
code();
~code() noexcept;
};


@ -0,0 +1,47 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_PIPE_DESC_H
/// Pipe descriptor
struct ircd::gpt::pipe::desc
{
struct layer;
pipe::model *model;
pipe::code *code;
cl::data
state, // qry/key/val projection (tokens * embed * 3 * float)
accum, // accumulator (tokens * embed * float)
logit, // result output vector (50257 * float)
ctrl, // control page
opts; // options page
cl::kern
lm_embed,
lm_norm,
lm_logit,
lm_select;
std::unique_ptr<struct desc::layer>
layer[12];
desc(pipe::code &, pipe::model &);
};
struct ircd::gpt::pipe::desc::layer
{
cl::kern negative;
cl::kern positive;
layer(pipe::desc &, const int);
};
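
The buffer comments above work out to the allocation sizes used by the desc constructor later in this diff; a worked sketch assuming 96 tokens of device-side capacity and a 768-float embedding width:

    #include <cstddef>

    // Illustrative arithmetic only; the values mirror this commit's desc ctor.
    constexpr std::size_t tokens(96), embed(768);
    constexpr std::size_t state_bytes(tokens * 3 * embed * sizeof(float)); // qry/key/val projections
    constexpr std::size_t accum_bytes(tokens * embed * sizeof(float));     // accumulator
    constexpr std::size_t logit_bytes(65536 * sizeof(float));              // 50257 vocab logits, padded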


@ -0,0 +1,57 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_PIPE_EXEC_H
/// Perform one task cycle on the device.
///
/// Constructions of this object enqueue device commands to complete an
/// additional epoch of the task as provided by `ctrl` and `opts`.
///
/// Destructions of this object yield the ircd::ctx until those commands
/// are complete.
///
/// Consecutive cycles on the device without stopping (a.k.a. pipelining) are
/// achieved by constructing several of these objects before following with
/// their destructions, e.g. by keeping them in a std::deque.
///
struct ircd::gpt::pipe::exec
{
pipe::desc *desc;
const_buffer
send_opts, // Set when sending the options page.
send_ctrl; // Set when sending the control page.
mutable_buffer
recv_ctrl; // Set when receiving the control page.
cl::kern::range
range_lm_embed, // Dimension range of the lm_embed kernel.
range_negative, // Dimension range of a layer kernel.
range_positive, // Dimension range of a layer kernel.
range_lm_norm, // Dimension range of the final norm kernel.
range_lm_logit, // Dimension range of the language logit kernel.
range_lm_select; // Dimension range of the language token kernel.
cl::exec
release_opts, // Release the options page.
release_ctrl, // Release the control page.
lm_embed, // Compute token and positional embeddings.
coil[12 * 2], // Pass over all layers.
lm_norm, // Final normalization.
lm_logit, // Compute logit result vector.
lm_select, // Select next token.
acquire_ctrl; // Acquire the control page.
exec(task &, const size_t tokens, const bool rel, const bool acq);
~exec() noexcept;
};
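
A condensed usage sketch of that pipelining, modeled on the generate(task &) loop later in this diff; `queue_depth` stands in for the new ircd.gpt.pipe.queue conf item and `tokens` for the current token count:

    std::deque<pipe::exec> list;
    for(size_t cycle(0); cycle < opts.limit; ++cycle)
    {
        const bool rel(cycle == 0);               // send the opts/ctrl pages on the first cycle
        const bool acq(cycle + 1 >= opts.limit);  // receive the ctrl page on the last cycle
        list.emplace_back(task, tokens + cycle, rel, acq);

        // Keep several cycles in flight; destroying the front yields this
        // ircd::ctx until that cycle's commands have completed.
        if(list.size() > queue_depth)
            list.pop_front();
    }
    list.clear(); // wait for anything still in flight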


@ -59,11 +59,13 @@ struct ircd::gpt::pipe::model::block
model::attn attn;
model::ffnn ffnn;
block(cl::data &, const off_t, const gpt::model::block &, const size_t);
block(const gpt::model::block &, const size_t);
};
struct ircd::gpt::pipe::model::decoder
{
cl::data master;
model::block block[12];
tensor norm;


@ -17,7 +17,6 @@ namespace ircd::gpt::pipe
struct code;
struct desc;
struct exec;
struct bank;
extern model *default_model;
extern code *default_code;
@ -27,70 +26,6 @@ namespace ircd::gpt::pipe
};
#include "model.h"
#include "ctrl.h"
struct ircd::gpt::pipe::code
:cl::code
{
static const string_view compile_opts;
code();
~code() noexcept;
};
struct ircd::gpt::pipe::desc
{
struct layer;
pipe::model *model;
pipe::code *code;
cl::data opts;
cl::data ctrl;
cl::data state;
cl::data xattn;
cl::data accum;
cl::data logit;
cl::kern anode;
std::unique_ptr<struct desc::layer> layer[12];
cl::kern cathode;
cl::kern lmhead;
cl::kern lmamax;
desc(pipe::code &, pipe::model &);
};
struct ircd::gpt::pipe::desc::layer
{
cl::kern negative;
cl::kern selfattn;
cl::kern positive;
layer(pipe::desc &, const int);
};
struct ircd::gpt::pipe::exec
{
pipe::desc *desc;
mutable_buffer out_ctrl;
const_buffer in_ctrl, in_opts;
cl::kern::range range_anode;
cl::kern::range range_coil;
cl::kern::range range_negative;
cl::kern::range range_selfattn;
cl::kern::range range_positive;
cl::kern::range range_cathode;
cl::kern::range range_lmhead;
cl::kern::range range_lmamax;
cl::exec send[2];
cl::exec tail[1];
cl::exec coil[12 * 3];
cl::exec head[3];
cl::exec recv[1];
exec(ctor_ctrl &, const ctor_opts &);
~exec() noexcept;
};
#include "code.h"
#include "desc.h"
#include "exec.h"


@ -11,8 +11,77 @@
#pragma once
#define HAVE_IRCD_GPT_TASK_H
/// Context to maintain state across calls.
/// Task Control Page
///
/// The control block is shared with our device software. Execution state is
/// maintained in the task control block across cycles. The control block is
/// the mutable state component for an execution; for the immutable component
/// also shared with device software see opts.h.
///
struct ircd_gpt_task
{
/// Header magic 0xC7012C70
uint magic;
/// Hypercall code set by our device software upon completion and control
/// transfer back to the host. Negative codes indicate errors, positive
/// codes are used for status and/or procedure calls; zero is also an error.
enum ircd_gpt_hypercall call;
/// Token ring head. Tokens in the ring extend behind the head for `tokens`
/// entries. The `head` value is automatically modulated by device software
/// to wrap around the ring.
uint head;
/// Token counter. The counter indicates the number of valid tokens in
/// the context buffer. This value must not exceed the buffer size.
uint tokens;
/// Accumulates the number of task cycles. The cycle counter is incremented
/// by device software after each repetition of the kernel pipeline to
/// produce one additional token.
ulong cycle;
/// Accumulates the epoch count for the task. The counter is incremented
/// by one in device software before control returns to the host.
/// Several cycles may occur during each epoch.
ulong epoch;
/// Accumulates the number of tokens produced by the task. Several tokens
/// may be produced each epoch, but currently only one token is produced
/// each cycle.
ulong produced;
/// Accumulates the number of tokens witnessed by the task. The number of
/// tokens in the context for each cycle is counted as witnessed.
ulong witnessed;
/// Accumulates time in microseconds elapsed for the task.
ulong elapsed;
/// PRNG xoshiro256 state. This is the de facto random seed which can be
/// set before cycle entry by the host. It is updated by device software
/// when used.
ulong rand[4];
/// Updated by the host with the value of the timestamp register as sampled
/// immediately before each transfer of control to the device.
ulong host_tsc;
/// State counters for the accept/error sequence codes.
uint accept_seq[4], error_seq[4];
/// The token buffer starts at offset 2048 and continues to the end of
/// the page; options specify the size of the tokens buffer in tokens.
/// Additional pages must be attached for larger buffer sizes.
ushort token[] __attribute__((aligned(2048)));
}
__attribute__((aligned(4096)));
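
For illustration (a sketch, not code from this commit), the ring addressing implied by `head`, `tokens` and the options' `buffer_tokens` reduces to one modular index, just as the device kernels later in this diff compute it:

    // The i-th valid token in the ring, counting from the oldest.
    static inline unsigned short
    ring_token(const struct ircd_gpt_task *const ctrl,
               const struct ircd_gpt_opts *const opts,
               const unsigned i)
    {
        return ctrl->token[(ctrl->head + i) % opts->buffer_tokens];
    }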
#ifdef __cplusplus
/// Task Context
///
/// State for a task.
struct ircd::gpt::task
{
enum status :char;
@ -20,36 +89,16 @@ struct ircd::gpt::task
/// Reference to the attached options.
const gpt::opts *opts {nullptr};
/// Reference to control pages.
struct ircd_gpt_task *ctrl {nullptr};
/// Current task status.
enum status status {'\0'};
/// State counters for the accept codes specified in the options.
uint8_t accept_seq[3] {0};
task(const gpt::opts * = nullptr,
struct ircd_gpt_task * = nullptr);
/// State counters for the error codes specified in the options.
uint8_t error_seq[3] {0};
/// Accumulates the number of executions by the user. Each call to the
/// interface is an execution.
uint64_t epoch {0};
/// Accumulates the number of tokens produced by the task. Several tokens
/// may be produced each epoch.
uint64_t produced {0};
/// Accumulates the number of tokens witnessed by the task. The number of
/// tokens in the context for each produced token is counted as witnessed.
uint64_t witnessed {0};
/// Accumulates the number of CPU reference cycles consumed by the task.
/// This counter does not reflect time when the task is queued or waiting
/// or offloaded to a co-processor/accelerator.
uint64_t cycles {0};
/// Accumulates the total time in milliseconds over all executions of the
/// task. This counter reflects total wall-clock time of all phases of
/// the execution.
milliseconds time {0ms};
~task() noexcept;
};
/// The current status of a task is indicated with intelligible characters
@ -61,3 +110,7 @@ enum ircd::gpt::task::status
ACCEPT = 'A', ///< Execution completed successfully.
ERROR = 'E', ///< Execution did not complete successfully.
};
static_assert(sizeof(struct ircd_gpt_task) == 4096);
static_assert(std::is_standard_layout<struct ircd_gpt_task>::value);
#endif
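
Tying the pages together, the call convention after this reorganization looks like the console command updated at the end of this diff; a sketch with illustrative values, where `text` is the input prompt:

    gpt::opts opts;
    opts.limit = 16;                // illustrative
    opts.top_k = 3;

    struct ircd_gpt_task ctrl;      // 4 KiB control page; zeroed by the task ctor
    gpt::task task
    {
        &opts, &ctrl
    };

    char buf[1024];
    const string_view output
    {
        gpt::generate(buf, text, task)
    };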


@ -9,78 +9,46 @@
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_PIPE_CTRL_H
#ifdef __OPENCL_C_VERSION__
#define HAVE_IRCD_GPT_TOKEN_H
struct ctor_ctrl
{
long call;
ulong pc;
ulong tokens;
ulong magic;
uchar pad[1024 - 32];
union
{
char str[3072];
ushort token[1536];
}
body;
}
__attribute__((aligned(4096)));
struct ctor_opts
{
uchar pad[4096];
}
__attribute__((aligned(4096)));
#ifndef __OPENCL_C_VERSION__
static_assert(sizeof(struct ctor_ctrl) == 4096);
#endif
#ifndef __OPENCL_C_VERSION__
static_assert(sizeof(struct ctor_opts) == 4096);
#endif
#ifndef __cplusplus
union token
union ircd_gpt_token
{
float
word[768],
attn[12][64];
};
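
Both members alias the same storage: twelve attention heads of 64 floats each span the 768-float word exactly, e.g.:

    static_assert(12 * 64 == 768); // head view and word view cover the same floats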
union tokenv
union ircd_gpt_tokenv
{
float4
word[768/4],
attn[12][64/4];
};
struct qkv
struct ircd_gpt_qkv
{
union token
union ircd_gpt_tokenv
qry,
key,
val;
};
struct qkvv
struct ircd_gpt_qkvv
{
union tokenv
union ircd_gpt_tokenv
qry,
key,
val;
};
struct attn_mask
struct ircd_gpt_attn_mask
{
bool
token[1024];
};
union aperature
union ircd_gpt_aperature
{
float
word[768],
@ -90,7 +58,7 @@ union aperature
attn[12][64];
};
union aperaturev
union ircd_gpt_aperaturev
{
float4
word[768/4],


@ -34,30 +34,22 @@ namespace ircd::gpt
static f32
logit alignas(64) [65536],
embeds alignas(64) [1024 * 768],
scratch alignas(64) [1024 * 768];
}
namespace ircd::gpt
{
extern void transform(ctor_ctrl &, const ctor_opts &);
}
decltype(ircd::gpt::log)
ircd::gpt::log
{
"gpt"
};
decltype(ircd::gpt::default_opts)
ircd::gpt::default_opts;
ircd::string_view
ircd::gpt::generate(const mutable_buffer &out,
const string_view &in,
const opts *opts,
task *task)
task &task)
{
u16 buf[2][256];
u16 buf[2][1024];
const auto input_tokens
{
vocab::tokenize(buf[0], in)
@ -65,7 +57,7 @@ ircd::gpt::generate(const mutable_buffer &out,
const auto output_tokens
{
generate(buf[1], input_tokens, opts, task)
generate(buf[1], input_tokens, task)
};
const auto output
@ -79,13 +71,92 @@ ircd::gpt::generate(const mutable_buffer &out,
ircd::vector_view<ircd::u16>
ircd::gpt::generate(const vector_view<u16> &out,
const vector_view<const u16> &in,
const opts *opts,
task *task)
task &task)
{
assert(task.ctrl);
assert(task.opts);
uint ret(0);
bool halt(false);
const auto &opts(*task.opts);
auto &ctrl(*task.ctrl);
auto &errc(ctrl.error_seq);
auto &accc(ctrl.accept_seq);
ctrl.tokens = in.size();
const size_t tmax
{
in.size() + opts.limit
};
const vector_view<f32> accum
{
gpt::scratch, tmax * 768
};
const vector_view<f32> embeds
{
gpt::embeds, tmax * 768
};
for(uint j(0); j < in.size(); ++j)
{
const vector_view<f32> dst
{
data(embeds) + j * 768, 768
};
if(ircd::cl::enable)
ctrl.token[j] = in[j];
else
embed(data(dst), in[j], j, opts);
static char dbuf[512] {0};
char report[1536] {0};
char tmbuf[1][64] {{0}};
const size_t report_size = snprintf
(
report, sizeof(report),
"%-2u -- %-3u [%5u] --- --- %s 0 0 | %8s",
j,
ctrl.tokens,
ctrl.token[j],
vocab::debug(dbuf, ctrl.token[j]).c_str(),
pretty(tmbuf[0], milliseconds(ctrl.elapsed), 1).c_str()
);
log::info
{
log, "%s",
string_view{report, report_size}
};
}
uint64_t cycles(0);
milliseconds last_time {0};
util::timer stopwatch;
{
const prof::scope_cycles task_cycles
{
cycles
};
generate(task);
}
last_time = stopwatch.at<milliseconds>();
ctrl.elapsed += last_time.count();
/*
coil(data(scratch), tokens, *opts.model);
tail(logit, data(last_embed), *opts.model);
out[i] = argmax(logit, *opts);
*/
uint accc_thresh[3] {3, 3, 3};
for(uint i(0); i < 3; ++i)
for(uint j(3); j > 0; --j)
if(opts->accept_code[i][j - 1] == -1U)
if(opts.accept_code[i][j - 1] == -1U)
--accc_thresh[i];
else
break;
@ -93,99 +164,22 @@ ircd::gpt::generate(const vector_view<u16> &out,
uint errc_thresh[3] {3, 3, 3};
for(uint i(0); i < 3; ++i)
for(uint j(3); j > 0; --j)
if(opts->error_code[i][j - 1] == -1U)
if(opts.error_code[i][j - 1] == -1U)
--errc_thresh[i];
else
break;
uint ret(0);
bool halt(false);
auto &errc(task->error_seq);
auto &accc(task->accept_seq);
for(uint i(0); !halt && i < out.size() && ret < opts->limit; ++i)
for(auto &j(ret); j + in.size() < ctrl.tokens && j < out.size() && !halt; ++j)
{
ctor_ctrl ctrl alignas(4096) {0};
ctrl.pc = 1;
const size_t tokens
{
in.size() + i
};
const vector_view<f32> scratch
{
gpt::scratch, tokens * 768
};
for(uint j(0); j < in.size(); ++j)
{
const vector_view<f32> dst
{
data(scratch) + j * 768, 768
};
if(ircd::cl::enable)
ctrl.body.token[ctrl.tokens++] = in[j];
else
embed(data(dst), in[j], j, *opts);
}
for(uint j(0); j < i; ++j)
{
const vector_view<f32> dst
{
data(scratch) + (in.size() + j) * 768, 768
};
if(ircd::cl::enable)
ctrl.body.token[ctrl.tokens++] = out[j];
else
embed(data(dst), out[j], in.size() + j, *opts);
}
assert(!ircd::cl::enable || ctrl.tokens == tokens);
const vector_view<f32> last_embed
{
data(scratch) + (tokens - 1) * 768, 768
};
const auto last_cycl(task->cycles);
milliseconds last_time {0};
{
util::timer stopwatch;
const prof::scope_cycles task_cycles
{
task->cycles
};
if(ircd::cl::enable)
{
static const ctor_opts opts alignas(4096) {0};
transform(ctrl, opts);
out[i] = ctrl.body.token[ctrl.tokens - 1];
assert(ctrl.tokens == tokens + 1);
} else {
coil(data(scratch), tokens, *opts->model);
tail(logit, data(last_embed), *opts->model);
out[i] = argmax(logit, *opts);
}
last_time = stopwatch.at<milliseconds>();
task->time += last_time;
}
out[j] = ctrl.token[(in.size() + j + ctrl.head) % opts.buffer_tokens];
for(uint j(0); j < 3; ++j)
errc[j] =
opts->error_code[j][errc[j]] == out[i]?
errc[j] + 1:
0;
errc[j] = opts.error_code[j][errc[j]] == out[j]?
errc[j] + 1: 0;
for(uint j(0); j < 3; ++j)
accc[j] =
opts->accept_code[j][accc[j]] == out[i]?
accc[j] + 1:
0;
accc[j] = opts.accept_code[j][accc[j]] == out[j]?
accc[j] + 1: 0;
for(uint j(0); j < 3; ++j)
halt |= accc_thresh[j] && accc[j] >= accc_thresh[j],
@ -194,21 +188,23 @@ ircd::gpt::generate(const vector_view<u16> &out,
static char dbuf[512] {0};
char report[1536] {0};
char tmbuf[4][64] {0};
size_t report_size;
report_size = snprintf
const size_t bsz(ctrl.tokens - in.size());
const size_t report_size = snprintf
(
report, sizeof(report),
"%-2u %-3u %-3u [%5u] a:%u e:%u %s %8s %8s | %8s",
i,
"%-2u %-2u %-3u %-3u %-3u [%5u] a:%u e:%u %s %8s %8s | %8s",
j,
j + in.size(),
ctrl.tokens,
ret,
out[i],
ctrl.cycle,
ctrl.epoch,
out[j],
accc[0] + accc[1] + accc[2],
errc[0] + errc[1] + errc[2],
vocab::debug(dbuf, out[i]).c_str(),
pretty(tmbuf[0], last_time, 1).c_str(),
pretty(tmbuf[1], si(last_cycl), 1).c_str(),
pretty(tmbuf[2], task->time, 1).c_str()
vocab::debug(dbuf, out[j]).c_str(),
pretty(tmbuf[0], milliseconds(last_time / bsz), 1).c_str(),
pretty(tmbuf[1], si(cycles / bsz), 1).c_str(),
pretty(tmbuf[2], milliseconds(ctrl.elapsed), 1).c_str()
);
log::info
@ -216,24 +212,22 @@ ircd::gpt::generate(const vector_view<u16> &out,
log, "%s",
string_view{report, report_size}
};
++ret;
ctx::yield();
ctx::interruption_point();
}
ret = ctrl.tokens - in.size();
for(uint i(0); i < 3; ++i)
if(accc_thresh[i] && task->accept_seq[i] >= accc_thresh[i])
if(accc_thresh[i] && ctrl.accept_seq[i] >= accc_thresh[i])
{
ret -= (3 - accc_thresh[i]);
break;
}
else if(errc_thresh[i] && task->error_seq[i] >= errc_thresh[i])
else if(errc_thresh[i] && ctrl.error_seq[i] >= errc_thresh[i])
{
ret -= (3 - errc_thresh[i]);
break;
}
ctx::interruption_point();
return vector_view<u16>
{
out, ret


@ -8,129 +8,27 @@
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
inline void
ctor_local_bcast_ldr(__local float4 *const out,
const uint ln,
const uint li)
{
for(uint stride = 1; stride < ln; stride <<= 1)
{
if(li < stride)
out[li + stride] = out[li];
barrier(CLK_LOCAL_MEM_FENCE);
}
}
inline void
ctor_local_reduce_add_ldr(__local float4 *const out,
const uint ln,
const uint li)
{
for(uint stride = ln >> 1; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
if(li < stride)
out[li] += out[li + stride];
}
}
inline void
ctor_local_reduce_max_ldr(__local float *const out,
const uint ln,
const uint li)
{
for(uint stride = ln >> 1; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
if(li < stride)
out[li] = max(out[li], out[li + stride]);
}
}
inline void
ctor_local_reduce_tournament_ldr(__local float *const best,
__local ushort *const idx,
const uint ln,
const uint li)
{
for(uint stride = ln >> 1; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
if(li < stride && best[li] < best[li + stride])
{
best[li] = best[li + stride];
idx[li] = idx[li + stride];
}
}
}
inline void
ctor_mean(__local float4 *const restrict out,
__local const float4 *const restrict in,
const uint num,
const uint i)
{
out[i] = in[i];
ctor_local_reduce_add_ldr(out, num, i);
float numerator = 0.0f;
float4 numeratorv = out[i];
for(uint k = 0; k < 4; ++k)
numerator += numeratorv[k];
out[i] = numerator;
ctor_local_bcast_ldr(out, num, i);
numeratorv = out[i];
out[i] = numeratorv / (num * 4);
}
inline void
ctor_norm(__local float4 *const out,
__local const float4 *const in,
__local float4 *const restrict tmp,
const uint num,
const uint i)
{
ctor_mean(tmp, in, num, i);
const float4
sub_mean = in[i] - tmp[i];
tmp[i] = pow(sub_mean, 2);
ctor_mean(out, tmp, num, i);
const float4
epsilon = 0.00001f,
s = sqrt(out[i] + epsilon);
out[i] = sub_mean / s;
}
inline void
ctor_norm_fmad(__local float4 *const out,
__local const float4 *const in,
__global const float4 *const restrict bias,
__global const float4 *const restrict weight,
const uint i)
ircd_gpt_norm_fmad(__local float4 *const out,
__local const float4 *const in,
__global const float4 *const restrict bias,
__global const float4 *const restrict weight,
const uint i)
{
out[i] = in[i] * weight[i] + bias[i];
}
// Matrix * Vector Multiply/Accumulate
inline void
ctor_sgemv(__local float4 *const restrict out,
__local const float4 *const restrict in,
__global const float4 *const restrict bias,
__global const float4 *const restrict weight,
const uint width,
const uint height,
const uint tiles,
const uint i)
ircd_gpt_sgemv(__local float4 *const restrict out,
__local const float4 *const restrict in,
__global const float4 *const restrict bias,
__global const float4 *const restrict weight,
const uint width,
const uint height,
const uint tiles,
const uint i)
{
const uint seg = height / tiles;
@ -151,9 +49,9 @@ ctor_sgemv(__local float4 *const restrict out,
}
inline void
ctor_gelu(__local float4 *const out,
__local const float4 *const in_,
const uint i)
ircd_gpt_gelu(__local float4 *const out,
__local const float4 *const in_,
const uint i)
{
float4 a,
in = in_[i];
@ -178,14 +76,15 @@ ctor_gelu(__local float4 *const out,
//
__kernel void
ctor_attn_fcon(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union aperaturev *const restrict out,
__global const union tokenv *const restrict in,
__global const float4 *const restrict norm_bias,
__global const float4 *const restrict norm_weight,
__global const float4 *const restrict fcon_bias,
__global const float4 *const restrict fcon_weight)
ircd_gpt_ffnn(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__global union ircd_gpt_tokenv *const restrict accum,
__global const float4 *const restrict norm_bias,
__global const float4 *const restrict norm_weight,
__global const float4 *const restrict fcon_bias,
__global const float4 *const restrict fcon_weight,
__global const float4 *const restrict proj_bias,
__global const float4 *const restrict proj_weight)
{
const uint
gi = get_global_id(0),
@ -195,31 +94,39 @@ ctor_attn_fcon(__global const struct ctor_ctrl *const ctrl,
wi = get_group_id(0),
wn = get_num_groups(0);
__local union aperaturev token;
__local union ircd_gpt_aperaturev token;
__local float4 tmp[768/4];
token.word[li] = in[wi].word[li];
// Fetch local copy of the global accumulator. We operate on a cached
// copy as input, and add our output to the global upon completion.
token.word[li] = accum[wi].word[li];
// Layer re-normalization
ctor_norm(token.word, token.word, tmp, ln, li);
ctor_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
ircd_simt_math_norm_f4lldr(token.word, token.word, tmp, ln, li);
ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
// Fully connected
for(uint i = 0; i < 3; ++i)
ctor_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 2304/4, 768/4, 4, i * ln + li);
for(uint i = 0; i < 4; ++i)
ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 3072/4, 768/4, 4, i * ln + li);
// Export queries, keys, and values.
for(uint i = 0; i < 3; ++i)
out[wi].proj[i][li] = token.proj[i][li];
// Gaussian Error Linear Unit
for(uint i = 0; i < 4; ++i)
ircd_gpt_gelu(token.fcon, token.fcon, i * ln + li);
// Projection
ircd_gpt_sgemv(tmp, token.fcon, proj_bias, proj_weight, 768/4, 3072/4, 4, li);
// Accumulation; end of layer
accum[wi].word[li] += tmp[li];
}
__kernel void
ctor_attn_proj(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union tokenv *const restrict accum,
__global const union tokenv *const restrict xattn,
__global const float4 *const restrict proj_bias,
__global const float4 *const restrict proj_weight)
ircd_gpt_attn_proj(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__global union ircd_gpt_tokenv *const restrict accum,
__local const union ircd_gpt_tokenv *const restrict xattn,
__global const float4 *const restrict proj_bias,
__global const float4 *const restrict proj_weight)
{
const uint
gi = get_global_id(0),
@ -234,25 +141,24 @@ ctor_attn_proj(__global const struct ctor_ctrl *const ctrl,
out[768/4];
// Fetch
in[li] = xattn[wi].word[li];
in[li] = xattn->word[li];
// Need this here if xattn is __local
barrier(CLK_LOCAL_MEM_FENCE);
// Projection
ctor_sgemv(out, in, proj_bias, proj_weight, 768/4, 768/4, 1, li);
ircd_gpt_sgemv(out, in, proj_bias, proj_weight, 768/4, 768/4, 1, li);
// Accumulation; end of layer
accum[wi].word[li] += out[li];
}
__kernel void
ctor_ffnn(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union tokenv *const restrict accum,
__global const float4 *const restrict norm_bias,
__global const float4 *const restrict norm_weight,
__global const float4 *const restrict fcon_bias,
__global const float4 *const restrict fcon_weight,
__global const float4 *const restrict proj_bias,
__global const float4 *const restrict proj_weight)
ircd_gpt_attn_self(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__local union ircd_gpt_tokenv *const restrict out,
__global const struct ircd_gpt_qkvv *const restrict token,
__global const struct ircd_gpt_attn_mask *const restrict mask) // [1024][1024],
{
const uint
gi = get_global_id(0),
@ -262,96 +168,13 @@ ctor_ffnn(__global const struct ctor_ctrl *const ctrl,
wi = get_group_id(0),
wn = get_num_groups(0);
__local union aperaturev token;
__local float4 tmp[768/4];
// Fetch local copy of the global accumulator. We operate on a cached
// copy as input, and add our output to the global upon completion.
token.word[li] = accum[wi].word[li];
// Layer re-normalization
ctor_norm(token.word, token.word, tmp, ln, li);
ctor_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
// Fully connected
for(uint i = 0; i < 4; ++i)
ctor_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 3072/4, 768/4, 4, i * ln + li);
// Gaussian Error Linear Unit
for(uint i = 0; i < 4; ++i)
ctor_gelu(token.fcon, token.fcon, i * ln + li);
// Projection
ctor_sgemv(tmp, token.fcon, proj_bias, proj_weight, 768/4, 3072/4, 4, li);
// Accumulation; end of layer
accum[wi].word[li] += tmp[li];
}
__kernel void
ctor_backend(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union tokenv *const restrict accum,
__global const union tokenv *const restrict xattn,
__global const float4 *const restrict attn_proj_bias,
__global const float4 *const restrict attn_proj_weight,
__global const float4 *const restrict ffnn_norm_bias,
__global const float4 *const restrict ffnn_norm_weight,
__global const float4 *const restrict ffnn_fcon_bias,
__global const float4 *const restrict ffnn_fcon_weight,
__global const float4 *const restrict ffnn_proj_bias,
__global const float4 *const restrict ffnn_proj_weight)
{
ctor_attn_proj
(
ctrl,
opts,
accum,
xattn,
attn_proj_bias,
attn_proj_weight
);
ctor_ffnn
(
ctrl,
opts,
accum,
ffnn_norm_bias,
ffnn_norm_weight,
ffnn_fcon_bias,
ffnn_fcon_weight,
ffnn_proj_bias,
ffnn_proj_weight
);
}
//
// ctrl
//
__kernel void
ctor_attn_self(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union tokenv *const restrict out,
__global const struct qkvv *const restrict token,
__global const struct attn_mask *const restrict mask) // [1024][1024],
{
__local struct
__local union
{
float
attn[12][32];
attn[12][96];
}
self;
const uint
gi = get_global_id(0),
gn = get_global_size(0),
li = get_local_id(0),
ln = get_local_size(0),
wi = get_group_id(0),
wn = get_num_groups(0);
for(uint i = 0; i < wn; ++i)
if(mask[wi].token[i])
self.attn[li][i] = 0.0f;
@ -389,114 +212,167 @@ ctor_attn_self(__global const struct ctor_ctrl *const ctrl,
self.attn[li][i] /= acc;
for(uint j = 0; j < 64/4; ++j)
out[wi].attn[li][j] = 0.0f;
out->attn[li][j] = 0.0f;
for(uint i = 0; i < wn; ++i)
for(uint j = 0; j < 64/4; ++j)
out[wi].attn[li][j] += token[i].val.attn[li][j] * self.attn[li][i];
out->attn[li][j] += token[i].val.attn[li][j] * self.attn[li][i];
}
//
// leads
//
__kernel void
ctor_anode0(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union tokenv *const restrict accum,
__global const union tokenv *const restrict pos,
__global const union tokenv *const restrict vocab)
ircd_gpt_attn_fcon(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__global union ircd_gpt_aperaturev *const restrict out,
__global const union ircd_gpt_tokenv *const restrict in,
__global const float4 *const restrict norm_bias,
__global const float4 *const restrict norm_weight,
__global const float4 *const restrict fcon_bias,
__global const float4 *const restrict fcon_weight)
{
const uint
gi = get_global_id(0),
gn = get_global_size(0),
li = get_local_id(0),
wi = get_group_id(0);
ln = get_local_size(0),
wi = get_group_id(0),
wn = get_num_groups(0);
const ushort
token = ctrl->body.token[wi];
__local union ircd_gpt_aperaturev token;
__local float4 tmp[768/4];
const float4
wte = vocab[token].word[li],
wpe = pos[wi].word[li];
token.word[li] = in[wi].word[li];
accum[wi].word[li] = wte + wpe;
// Layer re-normalization
ircd_simt_math_norm_f4lldr(token.word, token.word, tmp, ln, li);
ircd_gpt_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
// Fully connected
for(uint i = 0; i < 3; ++i)
ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 2304/4, 768/4, 4, i * ln + li);
// Export queries, keys, and values.
for(uint i = 0; i < 3; ++i)
out[wi].proj[i][li] = token.proj[i][li];
}
__kernel void
ctor_anode1(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union tokenv *const restrict accum,
__global const union tokenv *const restrict pos,
__global const union tokenv *const restrict vocab)
ircd_gpt_coil(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__global union ircd_gpt_tokenv *const restrict accum,
__global const struct ircd_gpt_qkvv *const restrict state,
__global const struct ircd_gpt_attn_mask *const restrict mask, // [1024][1024],
__global const float4 *const restrict attn_proj_bias,
__global const float4 *const restrict attn_proj_weight,
__global const float4 *const restrict ffnn_norm_bias,
__global const float4 *const restrict ffnn_norm_weight,
__global const float4 *const restrict ffnn_fcon_bias,
__global const float4 *const restrict ffnn_fcon_weight,
__global const float4 *const restrict ffnn_proj_bias,
__global const float4 *const restrict ffnn_proj_weight)
{
__local union ircd_gpt_tokenv value;
ircd_gpt_attn_self
(
ctrl,
opts,
&value,
state,
mask
);
ircd_gpt_attn_proj
(
ctrl,
opts,
accum,
&value,
attn_proj_bias,
attn_proj_weight
);
ircd_gpt_ffnn
(
ctrl,
opts,
accum,
ffnn_norm_bias,
ffnn_norm_weight,
ffnn_fcon_bias,
ffnn_fcon_weight,
ffnn_proj_bias,
ffnn_proj_weight
);
}
//
// frontend
//
inline void
_ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__global union ircd_gpt_tokenv *const restrict out,
__global const union ircd_gpt_tokenv *const restrict pos,
__global const union ircd_gpt_tokenv *const restrict vocab,
const uint out_idx,
const uint tok_idx,
const uint word_idx)
{
const ushort
token = ctrl->token[(ctrl->head + tok_idx) % opts->buffer_tokens];
const float4
wte = vocab[token].word[word_idx],
wpe = pos[tok_idx].word[word_idx];
out[out_idx].word[word_idx] = wte + wpe;
}
__kernel void
ircd_gpt_lm_embed(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__global union ircd_gpt_tokenv *const restrict accum,
__global const union ircd_gpt_tokenv *const restrict pos,
__global const union ircd_gpt_tokenv *const restrict vocab)
{
const uint
li = get_local_id(0);
for(uint i = 0; i < ctrl->tokens; ++i)
{
const ushort
token = ctrl->body.token[i];
const float4
wte = vocab[token].word[li],
wpe = pos[i].word[li];
accum[i].word[li] = wte + wpe;
}
_ircd_gpt_lm_embed(ctrl, opts, accum, pos, vocab, i, i, li);
}
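
On the host side (the non-cl path of generate() calls gpt::embed for the same step), the embedding is an element-wise sum of the token and position vectors; a minimal sketch with a hypothetical helper:

    // out := wte + wpe over the 768-float word (sketch; not the real gpt::embed).
    static void
    embed_word(float *const out,
               const float *const wte,  // vocab[token].word
               const float *const wpe)  // pos[index].word
    {
        for(unsigned i(0); i < 768; ++i)
            out[i] = wte[i] + wpe[i];
    }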
__kernel void
ctor_anode2(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union tokenv *const restrict accum,
__global const union tokenv *const restrict pos,
__global const union tokenv *const restrict vocab)
{
const uint
gi = get_global_id(0);
const ushort
token = ctrl->body.token[gi];
for(uint i = 0; i < 768/4; ++i)
{
const float4
wte = vocab[token].word[i],
wpe = pos[gi].word[i];
accum[gi].word[i] = wte + wpe;
}
}
__kernel void
ctor_cathode(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union tokenv *const restrict accum,
__global const float4 *const restrict norm_bias,
__global const float4 *const restrict norm_weight)
ircd_gpt_lm_norm(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__global union ircd_gpt_tokenv *const restrict accum,
__global const float4 *const restrict norm_bias,
__global const float4 *const restrict norm_weight)
{
const uint
li = get_local_id(0),
ln = get_local_size(0),
wi = get_global_offset(0) / ln + get_group_id(0);
__local union tokenv
__local union ircd_gpt_tokenv
token, tmp;
token.word[li] = accum[wi].word[li];
// Final re-normalization
ctor_norm(token.word, token.word, tmp.word, ln, li);
ctor_norm_fmad(token.word, token.word, norm_bias, norm_weight, li);
ircd_simt_math_norm_f4lldr(token.word, token.word, tmp.word, ln, li);
ircd_gpt_norm_fmad(token.word, token.word, norm_bias, norm_weight, li);
accum[0].word[li] = token.word[li];
}
__kernel void
ctor_lmhead(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global float *const restrict logit,
__global const union tokenv *const restrict accum,
__global const union tokenv *const restrict token)
ircd_gpt_lm_logit(__global const struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__global float *const restrict logit,
__global const union ircd_gpt_tokenv *const restrict accum,
__global const union ircd_gpt_tokenv *const restrict token)
{
const uint
gi = get_global_id(0);
@ -519,10 +395,79 @@ ctor_lmhead(__global const struct ctor_ctrl *const ctrl,
logit[gi] = res;
}
inline void
ircd_gpt_leave(__global struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
const uint li)
{
// No action for other threads right now
if(li != 0)
return;
// Run debug checks and assertions.
#ifdef RB_DEBUG
if(li == 0 && ctrl->call == IRCD_GPT_ECOMPLETE)
if(ctrl->tokens < 2)
ctrl->call = IRCD_GPT_ETOKENS;
#endif
// If the call value has been set to something other than default we
// do nothing else here.
if(ctrl->call != IRCD_GPT_ECOMPLETE)
return;
// On the last cycle, with no prior call or error code set, indicate
// a nominal exit condition.
if(ctrl->cycle + 1 >= opts->limit)
{
ctrl->call = IRCD_GPT_ACCEPT;
ctrl->epoch += 1;
}
ctrl->cycle += 1;
}
inline void
ircd_gpt_lm_result(__global struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
const uint li,
__local const ushort *const restrict idx)
{
// To read from cells other than idx[0] we need this barrier.
if(opts->top_k > 1)
barrier(CLK_LOCAL_MEM_FENCE);
// No action for other threads right now
if(li != 0)
return;
// When the hypercall code is already set, bail here.
if(ctrl->call != IRCD_GPT_ECOMPLETE)
return;
const bool
buffer_full = ctrl->tokens >= opts->buffer_tokens;
const ulong
rnd = ircd_simt_rand_xoshiro256pg(ctrl->rand),
sel = rnd % max(opts->top_k, 1U);
const ushort
token = idx[sel],
token_idx = (ctrl->head + ctrl->tokens) % opts->buffer_tokens;
ctrl->token[token_idx] = token;
if(buffer_full)
ctrl->head = (ctrl->head + 1) % opts->buffer_tokens;
else
ctrl->tokens++;
}
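
Stripped of the ring bookkeeping, the selection above is a uniform draw over the k best logits; a host-side sketch assuming `idx` is already sorted best-first, as ircd_gpt_lm_select arranges below:

    // k == 1 always takes the best logit; larger k flips a coin over the top k.
    static unsigned short
    select_token(const unsigned short *const idx, const unsigned k, const unsigned long rnd)
    {
        const unsigned sel(rnd % (k > 0? k: 1));
        return idx[sel];
    }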
__kernel void
ctor_lmamax(__global struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global const float *const restrict logit)
ircd_gpt_lm_select(__global struct ircd_gpt_task *const ctrl,
__constant const struct ircd_gpt_opts *const opts,
__global const float *const restrict logit)
{
const uint
gi = get_global_id(0),
@ -535,25 +480,13 @@ ctor_lmamax(__global struct ctor_ctrl *const ctrl,
ti = tn * li;
__local ushort idx[192];
__local float best[192];
idx[li] = ti;
for(uint j = ti + 1; j < ti + tn && j < 50257; ++j)
if(logit[j] > logit[idx[li]])
idx[li] = j;
best[li] = logit[idx[li]];
ctor_local_reduce_tournament_ldr(best, idx, ln, li);
if(li == 0 && ctrl->call == -1)
ctrl->body.token[ctrl->tokens++] = idx[li];
if(li == 0 && ctrl->call == -1)
ctrl->call = 1;
#ifdef RB_DEBUG
if(li == 0 && ctrl->call == 1)
if(ctrl->tokens < 2)
ctrl->call = -2;
#endif
ircd_simt_sort_idx16_flldr(idx, logit, ln, li);
ircd_gpt_lm_result(ctrl, opts, li, idx);
ircd_gpt_leave(ctrl, opts, li);
}


@ -142,7 +142,7 @@ ircd::gpt::model::init_from_cache(const string_view &cache_path)
fs::map::opts map_opts;
map_opts.huge2mb = true;
map_opts.locked = false;
map_opts.locked = true;
default_model_shm = fs::map
{
fd, map_opts, sizeof(decoder)


@ -8,21 +8,33 @@
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#include <ircd/gpt/pipe/pipe.h>
namespace ircd::gpt
{
void transform(ctor_ctrl &, const ctor_opts &);
}
namespace ircd::gpt::pipe
{
static ircd::cl::exec::opts negative_opts, positive_opts, selfattn_opts, cathode_opts, anode_opts,
lmhead_opts, lmamax_opts;
static void profile_dumplog(pipe::exec &);
static ircd::cl::exec::opts
negative_opts, positive_opts, selfattn_opts,
cathode_opts, anode_opts, lmhead_opts, lmamax_opts;
extern conf::item<size_t> flush_cycles;
extern conf::item<size_t> queue_cycles;
extern const ircd::run::changed handle_quit;
}
decltype(ircd::gpt::pipe::queue_cycles)
ircd::gpt::pipe::queue_cycles
{
{ "name", "ircd.gpt.pipe.queue" },
{ "default", 1L, },
};
decltype(ircd::gpt::pipe::flush_cycles)
ircd::gpt::pipe::flush_cycles
{
{ "name", "ircd.gpt.pipe.flush" },
{ "default", 0L, },
};
decltype(ircd::gpt::pipe::default_model)
ircd::gpt::pipe::default_model;
@ -82,144 +94,233 @@ noexcept
//
void
ircd::gpt::transform(ctor_ctrl &ctrl,
const ctor_opts &opts)
ircd::gpt::generate(task &task)
{
if(unlikely(!pipe::default_model))
pipe::init();
ctrl.call = -1;
pipe::exec
const auto &opts
{
ctrl, opts
*task.opts
};
auto &ctrl
{
*task.ctrl
};
ctrl.call = IRCD_GPT_ECOMPLETE;
ctrl.host_tsc = prof::cycles();
size_t cycle(ctrl.cycle);
const size_t tokens(ctrl.tokens);
std::deque<pipe::exec> list;
for(; cycle < opts.limit; ++cycle)
{
// When the release/acquire bits are set the control pages are sent
// and received; only set on first and last iterations of this loop.
const bool
rel(cycle == 0),
acq(cycle + 1 >= opts.limit);
// Enqueue the cycle's commands
list.emplace_back
(
task, tokens + cycle, rel, acq
);
// Conditions for a cl::flush here; this is not default but
// may be configured to improve some workloads.
const bool flush
{
// Flushing here is enabled by the configuration
pipe::flush_cycles
// Skip flushing on cycles already performing IO or waiting.
&& !rel && !acq && list.size() <= pipe::queue_cycles
// The configuration item can specify an interval greater than
// one between flushes.
&& cycle % pipe::flush_cycles == 0
};
if(flush)
cl::flush();
// Enqueue consecutive repetitions of our kernel batch before waiting
// on the first; based on the configuration. XXX get from ircd::cl
if(list.size() <= pipe::queue_cycles)
continue;
// Profiling branch
if((false))
{
auto &ex(list.front());
profile_dumplog(ex);
}
// Destructing the front of the queue waits for completion by yielding
// this ircd::ctx.
list.pop_front();
}
// Wait for all unfinished
list.clear();
// Interp error codes
if(unlikely(ctrl.call <= 0))
throw error
{
"hyper (#%d) :%s",
abs(ctrl.call),
ctrl.body.str,
abs(int(ctrl.call)),
reflect(ctrl.call),
};
always_assert(ctrl.cycle == cycle);
}
void
ircd::gpt::pipe::profile_dumplog(pipe::exec &exec)
{
constexpr size_t coils
{
sizeof(exec.coil) / sizeof(cl::exec)
};
for(size_t i(0); i < coils; ++i)
{
exec.coil[i].wait();
const auto &pro
{
exec.coil[i].profile()
};
char tmbuf[4][32] {{0}};
log::logf
{
log, log::level::DEBUG,
"coil:%-2lu %8s %8s %8s %8s\n",
i,
util::pretty(tmbuf[0], si(pro[0]), 1),
util::pretty(tmbuf[1], si(pro[1]), 1),
util::pretty(tmbuf[2], si(pro[2]), 1),
util::pretty(tmbuf[3], si(pro[3]), 1),
};
}
}
//
// pipe::exec
//
ircd::gpt::pipe::exec::exec(ctor_ctrl &ctrl,
const ctor_opts &opts)
ircd::gpt::pipe::exec::exec(task &task,
const size_t tokens,
const bool release,
const bool acquire)
:desc
{
default_desc
}
,out_ctrl
,send_opts
{
reinterpret_cast<char *>(&ctrl), sizeof(ctor_ctrl)
reinterpret_cast<const char *>(task.opts),
release? sizeof(struct ircd_gpt_opts): 0
}
,in_ctrl
,send_ctrl
{
reinterpret_cast<const char *>(&ctrl), sizeof(ctor_ctrl)
reinterpret_cast<const char *>(task.ctrl),
release? sizeof(struct ircd_gpt_task): 0
}
,in_opts
,recv_ctrl
{
reinterpret_cast<const char *>(&opts), sizeof(ctor_opts)
reinterpret_cast<char *>(task.ctrl),
acquire? sizeof(struct ircd_gpt_task): 0
}
,range_anode
,range_lm_embed
{
{ ctrl.tokens, 0, },
{ 1, 0, },
}
,range_coil
{
{ ctrl.tokens * 192UL, 0, },
{ 192UL, 0, },
{ 1 * 192UL, 0, },
{ 192UL, 0, },
}
,range_negative
{
range_coil
}
,range_selfattn
{
range_coil
{ tokens * 192UL, 0, },
{ 192UL, 0, },
}
,range_positive
{
range_coil
{ tokens * 192UL, 0, },
{ 192UL, 0, },
}
,range_cathode
,range_lm_norm
{
{ 1 * 192UL, 0 },
{ 192UL, 0 },
{ (ctrl.tokens - 1) * 192UL, 0 },
{ 1 * 192UL, 0 },
{ 192UL, 0 },
{ (tokens - 1) * 192UL, 0 },
}
,range_lmhead
,range_lm_logit
{
{ 262 * 192UL, 0 }, // align_up(50257, 192) / 192 = 262; covers all 50257 logits
{ 192UL, 0 },
}
,range_lmamax
,range_lm_select
{
{ 1 * 192UL, 0 },
{ 192UL, 0 },
}
,send
,release_opts
{
{ desc->opts, in_opts },
{ desc->ctrl, in_ctrl },
desc->opts, send_opts
}
,tail
,release_ctrl
{
{ desc->anode, range_anode, anode_opts },
desc->ctrl, send_ctrl
}
,lm_embed
{
desc->lm_embed, range_lm_embed, anode_opts
}
,coil
{
{ desc->layer[0x00]->negative, range_negative, negative_opts },
{ desc->layer[0x00]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x00]->positive, range_positive, positive_opts },
{ desc->layer[0x01]->negative, range_negative, negative_opts },
{ desc->layer[0x01]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x01]->positive, range_positive, positive_opts },
{ desc->layer[0x02]->negative, range_negative, negative_opts },
{ desc->layer[0x02]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x02]->positive, range_positive, positive_opts },
{ desc->layer[0x03]->negative, range_negative, negative_opts },
{ desc->layer[0x03]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x03]->positive, range_positive, positive_opts },
{ desc->layer[0x04]->negative, range_negative, negative_opts },
{ desc->layer[0x04]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x04]->positive, range_positive, positive_opts },
{ desc->layer[0x05]->negative, range_negative, negative_opts },
{ desc->layer[0x05]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x05]->positive, range_positive, positive_opts },
{ desc->layer[0x06]->negative, range_negative, negative_opts },
{ desc->layer[0x06]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x06]->positive, range_positive, positive_opts },
{ desc->layer[0x07]->negative, range_negative, negative_opts },
{ desc->layer[0x07]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x07]->positive, range_positive, positive_opts },
{ desc->layer[0x08]->negative, range_negative, negative_opts },
{ desc->layer[0x08]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x08]->positive, range_positive, positive_opts },
{ desc->layer[0x09]->negative, range_negative, negative_opts },
{ desc->layer[0x09]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x09]->positive, range_positive, positive_opts },
{ desc->layer[0x0a]->negative, range_negative, negative_opts },
{ desc->layer[0x0a]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x0a]->positive, range_positive, positive_opts },
{ desc->layer[0x0b]->negative, range_negative, negative_opts },
{ desc->layer[0x0b]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x0b]->positive, range_positive, positive_opts },
}
,head
,lm_norm
{
{ desc->cathode, range_cathode, cathode_opts },
{ desc->lmhead, range_lmhead, lmhead_opts },
{ desc->lmamax, range_lmamax, lmamax_opts },
desc->lm_norm, range_lm_norm, cathode_opts
}
,recv
,lm_logit
{
{ desc->ctrl, out_ctrl },
desc->lm_logit, range_lm_logit, lmhead_opts
}
,lm_select
{
desc->lm_select, range_lm_select, lmamax_opts
}
,acquire_ctrl
{
desc->ctrl, recv_ctrl
}
{
}
@ -296,29 +397,14 @@ ircd::gpt::pipe::desc::desc(pipe::code &code,
{
&code
}
,opts
{
4_KiB,
const_buffer{}
}
,ctrl
{
4_KiB,
mutable_buffer{}
}
,state
{
32 * 3 * 768 * sizeof(float),
mutable_buffer{}
}
,xattn
{
32 * 1 * 768 * sizeof(float),
96 * 3 * 768 * sizeof(float),
mutable_buffer{}
}
,accum
{
32 * 768 * sizeof(float),
96 * 768 * sizeof(float),
mutable_buffer{}
}
,logit
@ -326,16 +412,54 @@ ircd::gpt::pipe::desc::desc(pipe::code &code,
65536 * sizeof(float),
mutable_buffer{}
}
,anode
,ctrl
{
sizeof(struct ircd_gpt_task),
mutable_buffer{}
}
,opts
{
sizeof(struct ircd_gpt_opts),
const_buffer{}
}
,lm_embed
{
code,
"ctor_anode2",
"ircd_gpt_lm_embed",
ctrl,
opts,
accum,
model.embed->pos,
model.embed->token,
}
,lm_norm
{
code,
"ircd_gpt_lm_norm",
ctrl,
opts,
accum,
model.decode->norm.bias,
model.decode->norm.weight,
}
,lm_logit
{
code,
"ircd_gpt_lm_logit",
ctrl,
opts,
logit,
accum,
model.embed->token,
}
,lm_select
{
code,
"ircd_gpt_lm_select",
ctrl,
opts,
logit,
}
,layer
{
std::make_unique<struct desc::layer>(*this, 0x00),
@ -351,34 +475,6 @@ ircd::gpt::pipe::desc::desc(pipe::code &code,
std::make_unique<struct desc::layer>(*this, 0x0a),
std::make_unique<struct desc::layer>(*this, 0x0b),
}
,cathode
{
code,
"ctor_cathode",
ctrl,
opts,
accum,
model.decode->norm.bias,
model.decode->norm.weight,
}
,lmhead
{
code,
"ctor_lmhead",
ctrl,
opts,
logit,
accum,
model.embed->token,
}
,lmamax
{
code,
"ctor_lmamax",
ctrl,
opts,
logit,
}
{
}
@ -391,7 +487,7 @@ ircd::gpt::pipe::desc::layer::layer(pipe::desc &desc,
:negative
{
*desc.code,
"ctor_attn_fcon",
"ircd_gpt_attn_fcon",
desc.ctrl,
desc.opts,
desc.state,
@ -401,24 +497,15 @@ ircd::gpt::pipe::desc::layer::layer(pipe::desc &desc,
desc.model->decode->block[laynum].attn.fcon.bias,
desc.model->decode->block[laynum].attn.fcon.weight,
}
,selfattn
{
*desc.code,
"ctor_attn_self",
desc.ctrl,
desc.opts,
desc.xattn,
desc.state,
desc.model->decode->block[laynum].attn.mask,
}
,positive
{
*desc.code,
"ctor_backend",
"ircd_gpt_coil",
desc.ctrl,
desc.opts,
desc.accum,
desc.xattn,
desc.state,
desc.model->decode->block[laynum].attn.mask,
desc.model->decode->block[laynum].attn.proj.bias,
desc.model->decode->block[laynum].attn.proj.weight,
desc.model->decode->block[laynum].ffnn.norm.bias,
@ -486,23 +573,33 @@ noexcept
//
ircd::gpt::pipe::model::decoder::decoder(const gpt::model::decoder &decoder)
:block
:master
{
{ decoder.layer[0x00], 0x00, },
{ decoder.layer[0x01], 0x01, },
{ decoder.layer[0x02], 0x02, },
{ decoder.layer[0x03], 0x03, },
{ decoder.layer[0x04], 0x04, },
{ decoder.layer[0x05], 0x05, },
{ decoder.layer[0x06], 0x06, },
{ decoder.layer[0x07], 0x07, },
{ decoder.layer[0x08], 0x08, },
{ decoder.layer[0x09], 0x09, },
{ decoder.layer[0x0a], 0x0a, },
{ decoder.layer[0x0b], 0x0b, },
sizeof(gpt::model::block) * 12 + sizeof(gpt::model::norm), const_buffer
{
reinterpret_cast<const char *>(decoder.layer),
sizeof(decoder.layer) + sizeof(decoder.f)
}
}
,block
{
{ master, sizeof(gpt::model::block) * 0x00, decoder.layer[0x00], 0x00, },
{ master, sizeof(gpt::model::block) * 0x01, decoder.layer[0x01], 0x01, },
{ master, sizeof(gpt::model::block) * 0x02, decoder.layer[0x02], 0x02, },
{ master, sizeof(gpt::model::block) * 0x03, decoder.layer[0x03], 0x03, },
{ master, sizeof(gpt::model::block) * 0x04, decoder.layer[0x04], 0x04, },
{ master, sizeof(gpt::model::block) * 0x05, decoder.layer[0x05], 0x05, },
{ master, sizeof(gpt::model::block) * 0x06, decoder.layer[0x06], 0x06, },
{ master, sizeof(gpt::model::block) * 0x07, decoder.layer[0x07], 0x07, },
{ master, sizeof(gpt::model::block) * 0x08, decoder.layer[0x08], 0x08, },
{ master, sizeof(gpt::model::block) * 0x09, decoder.layer[0x09], 0x09, },
{ master, sizeof(gpt::model::block) * 0x0a, decoder.layer[0x0a], 0x0a, },
{ master, sizeof(gpt::model::block) * 0x0b, decoder.layer[0x0b], 0x0b, },
}
,norm
{
master,
off_t(sizeof(gpt::model::block) * 12),
const_buffer{decoder.f.bias},
const_buffer{decoder.f.weight},
}
@ -544,6 +641,27 @@ ircd::gpt::pipe::model::block::block(const gpt::model::block &block,
{
}
ircd::gpt::pipe::model::block::block(cl::data &master,
const off_t offset,
const gpt::model::block &block,
const size_t layer)
:attn
{
master,
offset,
block.ln1,
block.attn,
}
,ffnn
{
master,
offset + off_t(sizeof(block.ln1) + sizeof(block.attn)),
block.ln2,
block.ffnn,
}
{
}
//
// pipe::model::ffnn
//
@ -678,3 +796,48 @@ ircd::gpt::pipe::model::tensor::tensor(cl::data &master,
}
{
}
//
// gpt::task
//
ircd::gpt::task::task(const gpt::opts *const opts,
struct ircd_gpt_task *const ctrl)
:opts
{
opts
}
,ctrl
{
ctrl
}
{
memset(this->ctrl, 0x0, sizeof(ircd_gpt_task));
this->ctrl->rand[0] = this->opts->seed;
this->ctrl->rand[1] = this->opts->seed;
this->ctrl->rand[2] = -1UL;
this->ctrl->rand[3] = -1UL;
}
ircd::gpt::task::~task()
noexcept
{
}
//
// hypercall
//
ircd::string_view
ircd::gpt::reflect(const enum ircd_gpt_hypercall code)
noexcept
{
switch(code)
{
case IRCD_GPT_ACCEPT: return "ACCEPT";
case IRCD_GPT_ECOMPLETE: return "ECOMPLETE";
}
return "??????";
}


@ -17324,10 +17324,15 @@ console_cmd__gpt__raw(opt &out, const string_view &line)
opts.limit = param.at<uint>("limit");
opts.top_k = 3;
gpt::task task;
struct ircd_gpt_task ctrl;
gpt::task task
{
&opts, &ctrl
};
const auto output
{
gpt::generate(buf, text, &opts, &task)
gpt::generate(buf, text, task)
};
out