0
0
Fork 0
mirror of https://github.com/matrix-construct/construct synced 2024-06-20 10:58:20 +02:00

ircd::gpt: Transformer Pipe.

This commit is contained in:
Jason Volk 2021-03-29 18:22:42 -07:00
parent 5e52f6b97b
commit 29e74ec9e1
7 changed files with 1526 additions and 0 deletions

1
.gitignore vendored
View file

@@ -4,6 +4,7 @@ Makefile
*.o
*.so
*.lo
*.clo
*.la
*.orig
*.log

View file

@@ -0,0 +1,103 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_PIPE_CTRL_H
// Host<->device control block; exactly one page-aligned 4 KiB page is
// uploaded before a pipeline pass and read back after (see pipe::exec).
struct ctor_ctrl
{
// Call status: host sets -1 to dispatch; device sets 1 on success and a
// negative value on error (see gpt::transform / ctor_lmamax).
long call;

// Program counter / step cursor.
// NOTE(review): not referenced by any kernel in this commit — confirm use.
ulong pc;

// Number of valid entries in body.token[].
ulong tokens;

// Sanity/versioning marker.
ulong magic;

// Pads the header so body starts at a fixed 1 KiB offset.
uchar pad[1024 - 32];

// 3 KiB tail: the token id stream for the pipeline, overlaid with an
// error string when call indicates failure (see gpt::transform).
union
{
char str[3072];
ushort token[1536];
}
body;
}
__attribute__((aligned(4096)));
// Options page sent read-only to the device; currently opaque padding
// reserved for future per-pass parameters.
struct ctor_opts
{
uchar pad[4096];
}
__attribute__((aligned(4096)));
// Enforce the exact one-page layout on the host side; OpenCL C has no
// static_assert, so these are compiled out for the device.
// (Merged the two identical #ifndef guards into one.)
#ifndef __OPENCL_C_VERSION__
static_assert(sizeof(struct ctor_ctrl) == 4096);
static_assert(sizeof(struct ctor_opts) == 4096);
#endif
// Device-side (OpenCL C) tensor views; hidden from C++ translation units.
#ifndef __cplusplus
// One token embedding: 768 floats, also viewable as 12 heads x 64 dims.
union token
{
float
word[768],
attn[12][64];
};

// Vectorized (float4) view of a token embedding.
union tokenv
{
float4
word[768/4],
attn[12][64/4];
};

// Query/key/value triple for one token.
struct qkv
{
union token
qry,
key,
val;
};

// Vectorized query/key/value triple.
struct qkvv
{
union tokenv
qry,
key,
val;
};

// One row of the causal mask: token[i] true = attending to i is allowed.
struct attn_mask
{
bool
token[1024];
};

// Per-token working buffer with overlapping views of the intermediates
// used by the attention and FFN kernels.
union aperature
{
float
word[768],
fcon[2304],
proj[3][768],
qkv[3][12][64],
attn[12][64];
};

// Vectorized (float4) aperature.
union aperaturev
{
float4
word[768/4],
fcon[2304/4],
proj[3][768/4],
qkv[3][12][64/4],
attn[12][64/4];
};
#endif

View file

@@ -0,0 +1,80 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_PIPE_MODEL_H
// Device-resident copy of the GPT model: decoder stack plus embeddings.
struct ircd::gpt::pipe::model
{
struct tensor;
struct norm;
struct proj;
struct fcon;
struct attn;
struct ffnn;
struct block;
struct decoder;
struct language;

// 12-layer transformer stack
std::unique_ptr<model::decoder> decode;
// position + token embeddings
std::unique_ptr<model::language> embed;

model(const gpt::model::decoder &, const gpt::model::embed &);
~model() noexcept;
};
// Bias + weight pair on the device; either standalone buffers, or
// sub-buffers carved out of a master allocation at a given offset.
struct ircd::gpt::pipe::model::tensor
{
cl::data bias, weight;

tensor(const const_buffer &bias, const const_buffer &weight);
tensor(cl::data &, const off_t, const const_buffer &bias, const const_buffer &weight);
};
// Attention-unit weights: pre-norm, q/k/v fully-connected, output
// projection, and the causal mask buffer; all alias a master buffer.
struct ircd::gpt::pipe::model::attn
{
tensor norm, fcon, proj;
cl::data mask;

attn(cl::data &, const off_t, const gpt::model::norm &, const gpt::model::attn &);
};
// Feed-forward-unit weights: pre-norm, expansion (fcon) and projection;
// all alias a master buffer at fixed offsets.
struct ircd::gpt::pipe::model::ffnn
{
tensor norm, fcon, proj;

ffnn(cl::data &, const off_t, const gpt::model::norm &, const gpt::model::ffnn &);
};
// One transformer layer: a single master device buffer holding the whole
// layer's weights, with attn/ffnn views aliasing into it.
struct ircd::gpt::pipe::model::block
{
cl::data master;

model::attn attn;
model::ffnn ffnn;

block(const gpt::model::block &, const size_t);
};
// The full 12-layer decoder stack plus the final layer-norm tensor.
struct ircd::gpt::pipe::model::decoder
{
model::block block[12];
tensor norm;

decoder(const gpt::model::decoder &);
~decoder() noexcept;
};
// Embedding tables on the device: positional (pos) and token (wte) vectors.
struct ircd::gpt::pipe::model::language
{
cl::data pos, token;

language(const gpt::model::embed &);
~language() noexcept;
};

View file

@@ -0,0 +1,96 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#pragma once
#define HAVE_IRCD_GPT_PIPE_H
// Transformer pipeline: OpenCL dispatch state shared across passes.
namespace ircd::gpt::pipe
{
struct model;
struct code;
struct desc;
struct exec;
struct bank;

// Lazily-initialized global singletons (constructed by init()).
extern model *default_model;
extern code *default_code;
extern desc *default_desc;

void init(), fini() noexcept;
} // removed stray ';' after namespace closing brace
#include "model.h"
#include "ctrl.h"
// Compiled OpenCL program handle for the kernels in gpt_cl.cl.
struct ircd::gpt::pipe::code
:cl::code
{
// Build options passed to the OpenCL compiler (see gpt_pipe.cc).
static const string_view compile_opts;

code();
~code() noexcept;
};
// Descriptor binding one compiled program to one model: owns the i/o
// buffers and the kernel objects with their arguments pre-set.
struct ircd::gpt::pipe::desc
{
struct layer;

pipe::model *model;
pipe::code *code;

cl::data opts;   // input: ctor_opts page
cl::data ctrl;   // input/output: ctor_ctrl page
cl::data state;  // per-layer q/k/v work area
cl::data xattn;  // self-attention output staging
cl::data accum;  // residual-stream accumulator
cl::data logit;  // lm-head output scores

cl::kern anode;  // embedding front-end
std::unique_ptr<struct desc::layer> layer[12];
cl::kern cathode; // final layer-norm
cl::kern lmhead;  // logits
cl::kern lmamax;  // argmax token selection

desc(pipe::code &, pipe::model &);
};
// Kernels for one transformer layer, executed in order: attention fcon
// ("negative"), self-attention, then projection + ffnn ("positive").
struct ircd::gpt::pipe::desc::layer
{
cl::kern negative;
cl::kern selfattn;
cl::kern positive;

layer(pipe::desc &, const int);
};
// One full forward pass: builds the transfer + kernel execution graph.
// Work is issued by the constructor; see gpt::transform for usage.
struct ircd::gpt::pipe::exec
{
pipe::desc *desc;

mutable_buffer out_ctrl;        // receive the ctrl page from the device
const_buffer in_ctrl, in_opts;  // send the ctrl/opts pages to the device

cl::kern::range range_anode;
cl::kern::range range_coil;
cl::kern::range range_negative;
cl::kern::range range_selfattn;
cl::kern::range range_positive;
cl::kern::range range_cathode;
cl::kern::range range_lmhead;
cl::kern::range range_lmamax;

cl::exec send[2];      // upload opts + ctrl
cl::exec tail[1];      // embedding pass
cl::exec coil[12 * 3]; // 12 layers x 3 kernels
cl::exec head[3];      // final norm, logits, selection
cl::exec recv[1];      // download ctrl

exec(ctor_ctrl &, const ctor_opts &);
~exec() noexcept;
};

View file

@@ -218,6 +218,10 @@ if OPENCL
libircd_la_SOURCES += cl.cc
endif
libircd_la_SOURCES += gpt.cc
libircd_la_SOURCES += gpt_pipe.cc
if OPENCL
BUILT_SOURCES += gpt_cl.clo
endif
libircd_la_SOURCES += gpt_model.cc
libircd_la_SOURCES += gpt_vocab.cc
libircd_la_SOURCES += openssl.cc
@@ -348,3 +352,6 @@ endif
# LLVM PGO text to binary for -fprofile-use
default.profdata:
-$(LLVM_PROFDATA) merge -output=default.profdata default.proftext
# Compile the OpenCL kernels into an object for linking/embedding.
# NOTE(review): the compiler is hardcoded as clang++-11 — consider a
# configure-substituted variable so other toolchain versions can build this.
gpt_cl.clo: gpt_cl.cl
clang++-11 -std=CL1.1 -c -pipe -Xclang -finclude-default-header -o gpt_cl.clo $^

559
ircd/gpt_cl.cl Normal file
View file

@@ -0,0 +1,559 @@
// Matrix Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
// Broadcast out[0]'s value into all ln slots of the local buffer by
// doubling strides: each pass copies the low half onto the next half.
// Every work-item of the group must call this (barrier inside the loop).
inline void
ctor_local_bcast_ldr(__local float4 *const out,
const uint ln,
const uint li)
{
for(uint stride = 1; stride < ln; stride <<= 1)
{
if(li < stride)
out[li + stride] = out[li];
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// Parallel tree-sum over ln local slots; the total lands in out[0].
// Every work-item of the group must call this (barrier inside the loop).
inline void
ctor_local_reduce_add_ldr(__local float4 *const out,
const uint ln,
const uint li)
{
for(uint stride = ln >> 1; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
if(li < stride)
out[li] += out[li + stride];
}
}
// Parallel tree-max over ln local slots; the maximum lands in out[0].
// Every work-item of the group must call this (barrier inside the loop).
inline void
ctor_local_reduce_max_ldr(__local float *const out,
const uint ln,
const uint li)
{
for(uint stride = ln >> 1; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
if(li < stride)
out[li] = max(out[li], out[li + stride]);
}
}
// Parallel argmax: tree-reduce (value, index) pairs so that best[0]/idx[0]
// hold the winning score and its index. Used by ctor_lmamax.
inline void
ctor_local_reduce_tournament_ldr(__local float *const best,
__local ushort *const idx,
const uint ln,
const uint li)
{
for(uint stride = ln >> 1; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
if(li < stride && best[li] < best[li + stride])
{
best[li] = best[li + stride];
idx[li] = idx[li + stride];
}
}
}
// Mean over all num*4 scalars of in[]; the scalar mean ends up replicated
// in every lane of every out[i]. Uses out[] itself as reduction scratch.
inline void
ctor_mean(__local float4 *const restrict out,
__local const float4 *const restrict in,
const uint num,
const uint i)
{
out[i] = in[i];
ctor_local_reduce_add_ldr(out, num, i);
// Horizontal add; only slot 0 holds the full vector sum at this point.
float numerator = 0.0f;
float4 numeratorv = out[i];
for(uint k = 0; k < 4; ++k)
numerator += numeratorv[k];
out[i] = numerator;
// Broadcast slot 0's total to every slot, then divide by element count.
ctor_local_bcast_ldr(out, num, i);
numeratorv = out[i];
out[i] = numeratorv / (num * 4);
}
// Layer normalization: out = (in - mean(in)) / sqrt(var(in) + epsilon).
// tmp is group-shared scratch; all num work-items must participate.
inline void
ctor_norm(__local float4 *const out,
__local const float4 *const in,
__local float4 *const restrict tmp,
const uint num,
const uint i)
{
// mean(in) -> tmp (replicated into every slot)
ctor_mean(tmp, in, num, i);
const float4
sub_mean = in[i] - tmp[i];
// variance = mean of squared deviations -> out
tmp[i] = pow(sub_mean, 2);
ctor_mean(out, tmp, num, i);
const float4
epsilon = 0.00001f,
s = sqrt(out[i] + epsilon);
out[i] = sub_mean / s;
}
// Apply the learned layer-norm affine: out = in * weight + bias.
inline void
ctor_norm_fmad(__local float4 *const out,
__local const float4 *const in,
__global const float4 *const restrict bias,
__global const float4 *const restrict weight,
const uint i)
{
out[i] = in[i] * weight[i] + bias[i];
}
// Matrix * Vector Multiply/Accumulate
//
// out[i] = bias[i] + column i of (weight^T * in); width and height are in
// float4 units, with the height dimension walked as `tiles` interleaved
// segments. Weights are laid out row-major in scalar rows of `width` float4s.
inline void
ctor_sgemv(__local float4 *const restrict out,
__local const float4 *const restrict in,
__global const float4 *const restrict bias,
__global const float4 *const restrict weight,
const uint width,
const uint height,
const uint tiles,
const uint i)
{
const uint seg = height / tiles;
float4 acc = bias[i];
for(uint j = 0; j < seg; ++j)
for(uint t = 0; t < tiles; ++t)
for(uint k = 0; k < 4; ++k)
{
const uint
jidx = t * seg + j,      // input float4 index
kidx = jidx * 4 + k,     // scalar row index
widx = kidx * width + i; // row-major weight element
acc += weight[widx] * in[jidx][k];
}
out[i] = acc;
}
// Gaussian Error Linear Unit, tanh approximation:
// gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
// The operation order matches the unrolled form exactly.
inline void
ctor_gelu(__local float4 *const out,
          __local const float4 *const in_,
          const uint i)
{
	const float4 x = in_[i];
	const float4 t = tanh((((0.044715f * x) * x) + 1.0f) * 0.7978845608f * x);
	out[i] = ((t + 1.0f) * x) * 0.5f;
}
//
// core
//
// Attention front half for one token per work-group: pre-norm the token,
// then the 768->2304 q/k/v fully-connected, exporting the three projections.
__kernel void
ctor_attn_fcon(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union aperaturev *const restrict out,
__global const union tokenv *const restrict in,
__global const float4 *const restrict norm_bias,
__global const float4 *const restrict norm_weight,
__global const float4 *const restrict fcon_bias,
__global const float4 *const restrict fcon_weight)
{
const uint
gi = get_global_id(0),
gn = get_global_size(0),
li = get_local_id(0),
ln = get_local_size(0),
wi = get_group_id(0),
wn = get_num_groups(0);
__local union aperaturev token;
__local float4 tmp[768/4];
// Each work-item fetches one float4 of this group's token.
token.word[li] = in[wi].word[li];
// Layer re-normalization
ctor_norm(token.word, token.word, tmp, ln, li);
ctor_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
// Fully connected; 2304 outputs produced in three ln-wide passes.
for(uint i = 0; i < 3; ++i)
ctor_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 2304/4, 768/4, 4, i * ln + li);
// Export queries, keys, and values.
for(uint i = 0; i < 3; ++i)
out[wi].proj[i][li] = token.proj[i][li];
}
// Attention back half: project the self-attention output (768->768) and
// add the result into the residual accumulator for this token.
__kernel void
ctor_attn_proj(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union tokenv *const restrict accum,
__global const union tokenv *const restrict xattn,
__global const float4 *const restrict proj_bias,
__global const float4 *const restrict proj_weight)
{
const uint
gi = get_global_id(0),
gn = get_global_size(0),
li = get_local_id(0),
ln = get_local_size(0),
wi = get_group_id(0),
wn = get_num_groups(0);
__local float4
in[768/4],
out[768/4];
// Fetch
in[li] = xattn[wi].word[li];
// Projection
ctor_sgemv(out, in, proj_bias, proj_weight, 768/4, 768/4, 1, li);
// Accumulation; end of layer
accum[wi].word[li] += out[li];
}
// Feed-forward unit for one token per work-group: pre-norm, 768->3072
// expansion, GELU, 3072->768 projection, residual add.
__kernel void
ctor_ffnn(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union tokenv *const restrict accum,
__global const float4 *const restrict norm_bias,
__global const float4 *const restrict norm_weight,
__global const float4 *const restrict fcon_bias,
__global const float4 *const restrict fcon_weight,
__global const float4 *const restrict proj_bias,
__global const float4 *const restrict proj_weight)
{
const uint
gi = get_global_id(0),
gn = get_global_size(0),
li = get_local_id(0),
ln = get_local_size(0),
wi = get_group_id(0),
wn = get_num_groups(0);
__local union aperaturev token;
__local float4 tmp[768/4];
// Fetch local copy of the global accumulator. We operate on a cached
// copy as input, and add our output to the global upon completion.
token.word[li] = accum[wi].word[li];
// Layer re-normalization
ctor_norm(token.word, token.word, tmp, ln, li);
ctor_norm_fmad(tmp, token.word, norm_bias, norm_weight, li);
// Fully connected; 3072 outputs produced in four ln-wide passes.
// NOTE(review): aperature.fcon is declared with 2304 floats but indices
// here reach 4*ln+li float4s (3072 floats) — confirm the union sizing.
for(uint i = 0; i < 4; ++i)
ctor_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, 3072/4, 768/4, 4, i * ln + li);
// Gaussian Error Linear Unit
for(uint i = 0; i < 4; ++i)
ctor_gelu(token.fcon, token.fcon, i * ln + li);
// Projection
ctor_sgemv(tmp, token.fcon, proj_bias, proj_weight, 768/4, 3072/4, 4, li);
// Accumulation; end of layer
accum[wi].word[li] += tmp[li];
}
// Layer back half ("positive" pass): attention projection followed by the
// feed-forward unit, both accumulating into the residual stream.
__kernel void
ctor_backend(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union tokenv *const restrict accum,
__global const union tokenv *const restrict xattn,
__global const float4 *const restrict attn_proj_bias,
__global const float4 *const restrict attn_proj_weight,
__global const float4 *const restrict ffnn_norm_bias,
__global const float4 *const restrict ffnn_norm_weight,
__global const float4 *const restrict ffnn_fcon_bias,
__global const float4 *const restrict ffnn_fcon_weight,
__global const float4 *const restrict ffnn_proj_bias,
__global const float4 *const restrict ffnn_proj_weight)
{
ctor_attn_proj
(
ctrl,
opts,
accum,
xattn,
attn_proj_bias,
attn_proj_weight
);
ctor_ffnn
(
ctrl,
opts,
accum,
ffnn_norm_bias,
ffnn_norm_weight,
ffnn_fcon_bias,
ffnn_fcon_weight,
ffnn_proj_bias,
ffnn_proj_weight
);
}
//
// ctrl
//
// Masked softmax self-attention: one destination token per work-group
// (wi), one head per work-item (li), attending over all wn tokens.
// NOTE(review): self.attn is [12][32] but is indexed [li][i] with i < wn
// and li up to the local size — confirm the dispatch geometry keeps
// li < 12 and wn <= 32 here.
__kernel void
ctor_attn_self(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union tokenv *const restrict out,
__global const struct qkvv *const restrict token,
__global const struct attn_mask *const restrict mask) // [1024][1024],
{
__local struct
{
float
attn[12][32];
}
self;
const uint
gi = get_global_id(0),
gn = get_global_size(0),
li = get_local_id(0),
ln = get_local_size(0),
wi = get_group_id(0),
wn = get_num_groups(0);
// Masked-out positions get a large negative score (-> ~0 after exp).
for(uint i = 0; i < wn; ++i)
if(mask[wi].token[i])
self.attn[li][i] = 0.0f;
else
self.attn[li][i] = -10000.0f;
// Raw scores: dot(query of wi, key of i) per head.
for(uint i = 0; i < wn; ++i)
if(mask[wi].token[i])
for(uint j = 0; j < 64/4; ++j)
{
float4
qry = token[wi].qry.attn[li][j],
key = token[i].key.attn[li][j],
res = qry * key;
for(uint k = 0; k < 4; ++k)
self.attn[li][i] += res[k];
}
// Scale by sqrt(head dim) = sqrt(64) = 8.
for(uint i = 0; i < wn; ++i)
if(mask[wi].token[i])
self.attn[li][i] /= 8.0f;
// Softmax: exponentiate then normalize by the row sum.
for(uint i = 0; i < wn; ++i)
self.attn[li][i] = exp(self.attn[li][i]);
float4 vacc = 0.0f;
for(uint i = 0; i < wn; ++i)
vacc[i % 4] += self.attn[li][i];
float acc = 0.0f;
for(uint i = 0; i < 4; ++i)
acc += vacc[i];
for(uint i = 0; i < wn; ++i)
self.attn[li][i] /= acc;
// Weighted sum of value vectors.
for(uint j = 0; j < 64/4; ++j)
out[wi].attn[li][j] = 0.0f;
for(uint i = 0; i < wn; ++i)
for(uint j = 0; j < 64/4; ++j)
out[wi].attn[li][j] += token[i].val.attn[li][j] * self.attn[li][i];
}
//
// leads
//
// Embedding variant 0: one token per group, one float4 per work-item;
// accum = token embedding (wte) + positional embedding (wpe).
__kernel void
ctor_anode0(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union tokenv *const restrict accum,
__global const union tokenv *const restrict pos,
__global const union tokenv *const restrict vocab)
{
const uint
li = get_local_id(0),
wi = get_group_id(0);
const ushort
token = ctrl->body.token[wi];
const float4
wte = vocab[token].word[li],
wpe = pos[wi].word[li];
accum[wi].word[li] = wte + wpe;
}
// Embedding variant 1: one float4 lane per work-item, looping over all
// tokens in the control block serially.
__kernel void
ctor_anode1(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union tokenv *const restrict accum,
__global const union tokenv *const restrict pos,
__global const union tokenv *const restrict vocab)
{
const uint
li = get_local_id(0);
for(uint i = 0; i < ctrl->tokens; ++i)
{
const ushort
token = ctrl->body.token[i];
const float4
wte = vocab[token].word[li],
wpe = pos[i].word[li];
accum[i].word[li] = wte + wpe;
}
}
// Embedding variant 2 (the one bound by pipe::desc): one token per global
// work-item, looping over the 768-float embedding serially.
__kernel void
ctor_anode2(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union tokenv *const restrict accum,
__global const union tokenv *const restrict pos,
__global const union tokenv *const restrict vocab)
{
const uint
gi = get_global_id(0);
const ushort
token = ctrl->body.token[gi];
for(uint i = 0; i < 768/4; ++i)
{
const float4
wte = vocab[token].word[i],
wpe = pos[gi].word[i];
accum[gi].word[i] = wte + wpe;
}
}
// Final layer-norm over one selected token (chosen via the global offset,
// see exec::range_cathode); result is written to accum slot 0 for lmhead.
__kernel void
ctor_cathode(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global union tokenv *const restrict accum,
__global const float4 *const restrict norm_bias,
__global const float4 *const restrict norm_weight)
{
const uint
li = get_local_id(0),
ln = get_local_size(0),
wi = get_global_offset(0) / ln + get_group_id(0);
__local union tokenv
token, tmp;
token.word[li] = accum[wi].word[li];
// Final re-normalization
ctor_norm(token.word, token.word, tmp.word, ln, li);
ctor_norm_fmad(token.word, token.word, norm_bias, norm_weight, li);
// NOTE(review): overwrites accum[0] (not accum[wi]) as lmhead's input.
accum[0].word[li] = token.word[li];
}
// Language-model head: logit[gi] = dot(accum[0], token embedding gi),
// i.e. score every vocabulary entry against the normalized final token.
__kernel void
ctor_lmhead(__global const struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global float *const restrict logit,
__global const union tokenv *const restrict accum,
__global const union tokenv *const restrict token)
{
const uint
gi = get_global_id(0);
float4 acc = 0.0f;
for(uint j = 0; j < 768/4; ++j)
{
const float4
in = accum[0].word[j],
vocab = token[gi].word[j],
res = vocab * in;
acc += res;
}
// Horizontal add of the vector accumulator.
float res = 0.0f;
for(uint k = 0; k < 4; ++k)
res += acc[k];
logit[gi] = res;
}
// Greedy token selection: block-parallel argmax over the 50257 logits.
// Each work-item linearly scans a 262-logit slice (262 * 192 >= 50257),
// then a tournament reduction picks the winner; work-item 0 appends it to
// the token stream and flips ctrl->call from -1 to 1 (success).
__kernel void
ctor_lmamax(__global struct ctor_ctrl *const ctrl,
__constant const struct ctor_opts *const opts,
__global const float *const restrict logit)
{
const uint
gi = get_global_id(0),
gn = get_global_size(0),
li = get_local_id(0),
ln = get_local_size(0),
wi = get_group_id(0),
wn = get_num_groups(0),
tn = 262,
ti = tn * li;
__local ushort idx[192];
__local float best[192];
// Per-work-item linear argmax over its slice, clamped to the vocab size.
idx[li] = ti;
for(uint j = ti + 1; j < ti + tn && j < 50257; ++j)
if(logit[j] > logit[idx[li]])
idx[li] = j;
best[li] = logit[idx[li]];
ctor_local_reduce_tournament_ldr(best, idx, ln, li);
// Only act when the host marked this pass dispatched (call == -1).
if(li == 0 && ctrl->call == -1)
ctrl->body.token[ctrl->tokens++] = idx[li];
if(li == 0 && ctrl->call == -1)
ctrl->call = 1;
#ifdef RB_DEBUG
if(li == 0 && ctrl->call == 1)
if(ctrl->tokens < 2)
ctrl->call = -2;
#endif
}

680
ircd/gpt_pipe.cc Normal file
View file

@@ -0,0 +1,680 @@
// Tensor Construct
//
// Copyright (C) Matrix Construct Developers, Authors & Contributors
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.
#include <ircd/gpt/pipe/pipe.h>
// Host-side driver for the transformer pipeline (OpenCL dispatch).
namespace ircd::gpt
{
void transform(ctor_ctrl &, const ctor_opts &);
}
namespace ircd::gpt::pipe
{
// Per-kernel execution option sets shared by every pass.
static ircd::cl::exec::opts negative_opts, positive_opts, selfattn_opts, cathode_opts, anode_opts,
lmhead_opts, lmamax_opts;
extern const ircd::run::changed handle_quit;
}
// Global singleton definitions; constructed lazily by pipe::init().
decltype(ircd::gpt::pipe::default_model)
ircd::gpt::pipe::default_model;
decltype(ircd::gpt::pipe::default_code)
ircd::gpt::pipe::default_code;
decltype(ircd::gpt::pipe::default_desc)
ircd::gpt::pipe::default_desc;
// Tear the pipeline down when the runlevel transitions to QUIT.
decltype(ircd::gpt::pipe::handle_quit)
ircd::gpt::pipe::handle_quit
{
run::level::QUIT, pipe::fini
};
// Construct the global pipeline: upload the model weights, build the
// OpenCL program, and bind the kernel descriptor. Requires the host-side
// gpt::model to be loaded already.
void
ircd::gpt::pipe::init()
{
const auto &default_model
{
*gpt::model::default_model
};
assert(!pipe::default_model);
// NOTE(review): raw new matched by delete in fini(); these leak if a
// later construction throws — consider unique_ptr ownership.
pipe::default_model = new pipe::model
{
default_model, default_model.word
};
pipe::default_code = new pipe::code
{
};
pipe::default_desc = new pipe::desc
{
*pipe::default_code, *pipe::default_model
};
}
// Destroy the pipeline singletons in reverse order of init(); deleting a
// null pointer is a no-op so this is safe to call when never initialized.
void
ircd::gpt::pipe::fini()
noexcept
{
	const auto reset
	{
		[](auto *&ptr)
		{
			delete ptr;
			ptr = nullptr;
		}
	};

	reset(default_desc);
	reset(default_code);
	reset(default_model);
}
//
// pipe
//
// Run one full forward pass over the tokens in ctrl, appending the
// selected next token on success; throws with the device's error string
// otherwise.
void
ircd::gpt::transform(ctor_ctrl &ctrl,
const ctor_opts &opts)
{
// First use lazily constructs the device pipeline.
if(unlikely(!pipe::default_model))
pipe::init();
// -1 marks the request dispatched; ctor_lmamax flips it to 1 on success.
ctrl.call = -1;
// Temporary object: the exec constructor/destructor pair issues and
// completes the entire transfer + kernel graph.
pipe::exec
{
ctrl, opts
};
// A non-positive call after completion carries an error in body.str.
if(unlikely(ctrl.call <= 0))
throw error
{
"hyper (#%d) :%s",
abs(ctrl.call),
ctrl.body.str,
};
}
//
// pipe::exec
//
// Build the whole execution graph for one pass over the default
// descriptor: upload pages, embedding, 12x3 layer kernels, head kernels,
// then read the control page back.
ircd::gpt::pipe::exec::exec(ctor_ctrl &ctrl,
const ctor_opts &opts)
:desc
{
default_desc
}
// Host-side buffer views of the control/options pages for DMA.
,out_ctrl
{
reinterpret_cast<char *>(&ctrl), sizeof(ctor_ctrl)
}
,in_ctrl
{
reinterpret_cast<const char *>(&ctrl), sizeof(ctor_ctrl)
}
,in_opts
{
reinterpret_cast<const char *>(&opts), sizeof(ctor_opts)
}
// NDRanges: one work-item per token for the embedding; one 192-wide
// work-group per token for the layer kernels.
,range_anode
{
{ ctrl.tokens, 0, },
{ 1, 0, },
}
,range_coil
{
{ ctrl.tokens * 192UL, 0, },
{ 192UL, 0, },
}
,range_negative
{
range_coil
}
,range_selfattn
{
range_coil
}
,range_positive
{
range_coil
}
// Single group; the global offset selects the last token's group.
,range_cathode
{
{ 1 * 192UL, 0 },
{ 192UL, 0 },
{ (ctrl.tokens - 1) * 192UL, 0 },
}
,range_lmhead
{
{ 262 * 192UL, 0 }, // align_up(50257) / 192
{ 192UL, 0 },
}
,range_lmamax
{
{ 1 * 192UL, 0 },
{ 192UL, 0 },
}
// Upload the options and control pages.
,send
{
{ desc->opts, in_opts },
{ desc->ctrl, in_ctrl },
}
// Embedding pass.
,tail
{
{ desc->anode, range_anode, anode_opts },
}
// 12 layers x (attn fcon, self-attention, projection + ffnn).
,coil
{
{ desc->layer[0x00]->negative, range_negative, negative_opts },
{ desc->layer[0x00]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x00]->positive, range_positive, positive_opts },
{ desc->layer[0x01]->negative, range_negative, negative_opts },
{ desc->layer[0x01]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x01]->positive, range_positive, positive_opts },
{ desc->layer[0x02]->negative, range_negative, negative_opts },
{ desc->layer[0x02]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x02]->positive, range_positive, positive_opts },
{ desc->layer[0x03]->negative, range_negative, negative_opts },
{ desc->layer[0x03]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x03]->positive, range_positive, positive_opts },
{ desc->layer[0x04]->negative, range_negative, negative_opts },
{ desc->layer[0x04]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x04]->positive, range_positive, positive_opts },
{ desc->layer[0x05]->negative, range_negative, negative_opts },
{ desc->layer[0x05]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x05]->positive, range_positive, positive_opts },
{ desc->layer[0x06]->negative, range_negative, negative_opts },
{ desc->layer[0x06]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x06]->positive, range_positive, positive_opts },
{ desc->layer[0x07]->negative, range_negative, negative_opts },
{ desc->layer[0x07]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x07]->positive, range_positive, positive_opts },
{ desc->layer[0x08]->negative, range_negative, negative_opts },
{ desc->layer[0x08]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x08]->positive, range_positive, positive_opts },
{ desc->layer[0x09]->negative, range_negative, negative_opts },
{ desc->layer[0x09]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x09]->positive, range_positive, positive_opts },
{ desc->layer[0x0a]->negative, range_negative, negative_opts },
{ desc->layer[0x0a]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x0a]->positive, range_positive, positive_opts },
{ desc->layer[0x0b]->negative, range_negative, negative_opts },
{ desc->layer[0x0b]->selfattn, range_selfattn, selfattn_opts },
{ desc->layer[0x0b]->positive, range_positive, positive_opts },
}
// Final norm, logits, and greedy token selection.
,head
{
{ desc->cathode, range_cathode, cathode_opts },
{ desc->lmhead, range_lmhead, lmhead_opts },
{ desc->lmamax, range_lmamax, lmamax_opts },
}
// Read the control page (status + appended token) back.
,recv
{
{ desc->ctrl, out_ctrl },
}
{
}

ircd::gpt::pipe::exec::~exec()
noexcept
{
}
//
// code
//
// OpenCL build options for gpt_cl.cl; fast/relaxed math is acceptable for
// this workload.
decltype(ircd::gpt::pipe::code::compile_opts)
ircd::gpt::pipe::code::compile_opts
{
" -cl-strict-aliasing"
" -cl-no-signed-zeros"
" -cl-finite-math-only"
" -cl-unsafe-math-optimizations"
" -cl-fast-relaxed-math"
//" -cl-mad-enable"
//" -cl-single-precision-constant"
//" -cl-fp32-correctly-rounded-divide-sqrt"
};

// Read the kernel image and build the cl program.
ircd::gpt::pipe::code::code()
:cl::code{[]
{
// NOTE(review): default-constructed fd — no path is opened here, so
// fs::read() has no meaningful source; confirm where the compiled
// gpt_cl.clo image is supposed to come from.
const fs::fd fd
{
};
const std::string read
{
fs::read(fd)
};
const string_view bin
{
read
};
const vector_view<const string_view> bins
(
&bin, 1
);
return cl::code
{
bins, compile_opts
};
}()}
{
}

ircd::gpt::pipe::code::~code()
noexcept
{
}
//
// pipe::desc
//
// Allocate the pipeline's device buffers and bind every kernel with its
// arguments against the given program and model.
ircd::gpt::pipe::desc::desc(pipe::code &code,
pipe::model &model)
:model
{
&model
}
,code
{
&code
}
// One page each for the options (read-only) and control (read-write).
,opts
{
4_KiB,
const_buffer{}
}
,ctrl
{
4_KiB,
mutable_buffer{}
}
// Work areas sized for up to 32 tokens of 768 floats (x3 for q/k/v).
,state
{
32 * 3 * 768 * sizeof(float),
mutable_buffer{}
}
,xattn
{
32 * 1 * 768 * sizeof(float),
mutable_buffer{}
}
,accum
{
32 * 768 * sizeof(float),
mutable_buffer{}
}
// Logit scores; 65536 >= vocab size 50257.
,logit
{
65536 * sizeof(float),
mutable_buffer{}
}
,anode
{
code,
"ctor_anode2",
ctrl,
opts,
accum,
model.embed->pos,
model.embed->token,
}
,layer
{
std::make_unique<struct desc::layer>(*this, 0x00),
std::make_unique<struct desc::layer>(*this, 0x01),
std::make_unique<struct desc::layer>(*this, 0x02),
std::make_unique<struct desc::layer>(*this, 0x03),
std::make_unique<struct desc::layer>(*this, 0x04),
std::make_unique<struct desc::layer>(*this, 0x05),
std::make_unique<struct desc::layer>(*this, 0x06),
std::make_unique<struct desc::layer>(*this, 0x07),
std::make_unique<struct desc::layer>(*this, 0x08),
std::make_unique<struct desc::layer>(*this, 0x09),
std::make_unique<struct desc::layer>(*this, 0x0a),
std::make_unique<struct desc::layer>(*this, 0x0b),
}
,cathode
{
code,
"ctor_cathode",
ctrl,
opts,
accum,
model.decode->norm.bias,
model.decode->norm.weight,
}
,lmhead
{
code,
"ctor_lmhead",
ctrl,
opts,
logit,
accum,
model.embed->token,
}
,lmamax
{
code,
"ctor_lmamax",
ctrl,
opts,
logit,
}
{
}
//
// pipe::desc::layer
//
// Bind the three kernels for one layer against that layer's weights:
// negative = pre-norm + q/k/v fcon, selfattn = masked attention,
// positive = attention projection + feed-forward (ctor_backend).
ircd::gpt::pipe::desc::layer::layer(pipe::desc &desc,
const int laynum)
:negative
{
*desc.code,
"ctor_attn_fcon",
desc.ctrl,
desc.opts,
desc.state,
desc.accum,
desc.model->decode->block[laynum].attn.norm.bias,
desc.model->decode->block[laynum].attn.norm.weight,
desc.model->decode->block[laynum].attn.fcon.bias,
desc.model->decode->block[laynum].attn.fcon.weight,
}
,selfattn
{
*desc.code,
"ctor_attn_self",
desc.ctrl,
desc.opts,
desc.xattn,
desc.state,
desc.model->decode->block[laynum].attn.mask,
}
,positive
{
*desc.code,
"ctor_backend",
desc.ctrl,
desc.opts,
desc.accum,
desc.xattn,
desc.model->decode->block[laynum].attn.proj.bias,
desc.model->decode->block[laynum].attn.proj.weight,
desc.model->decode->block[laynum].ffnn.norm.bias,
desc.model->decode->block[laynum].ffnn.norm.weight,
desc.model->decode->block[laynum].ffnn.fcon.bias,
desc.model->decode->block[laynum].ffnn.fcon.weight,
desc.model->decode->block[laynum].ffnn.proj.bias,
desc.model->decode->block[laynum].ffnn.proj.weight,
}
{
}
///////////////////////////////////////////////////////////////////////////////
//
// model
//
//
// pipe::model::model
//
// Upload the whole model: the decoder stack and the embedding tables.
ircd::gpt::pipe::model::model(const gpt::model::decoder &decoder,
const gpt::model::embed &embed)
:decode
{
std::make_unique<model::decoder>(decoder)
}
,embed
{
std::make_unique<model::language>(embed)
}
{
}

ircd::gpt::pipe::model::~model()
noexcept
{
}
//
// pipe::model::language
//
// Upload the positional and token embedding tables as device buffers.
ircd::gpt::pipe::model::language::language(const gpt::model::embed &embed)
:pos
{
sizeof(embed.pos),
const_buffer{embed.pos}
}
,token
{
sizeof(embed.token),
const_buffer{embed.token}
}
{
}

ircd::gpt::pipe::model::language::~language()
noexcept
{
}
//
// pipe::model::decoder
//
// Upload the 12 transformer layers and the final layer-norm tensors.
ircd::gpt::pipe::model::decoder::decoder(const gpt::model::decoder &decoder)
:block
{
{ decoder.layer[0x00], 0x00, },
{ decoder.layer[0x01], 0x01, },
{ decoder.layer[0x02], 0x02, },
{ decoder.layer[0x03], 0x03, },
{ decoder.layer[0x04], 0x04, },
{ decoder.layer[0x05], 0x05, },
{ decoder.layer[0x06], 0x06, },
{ decoder.layer[0x07], 0x07, },
{ decoder.layer[0x08], 0x08, },
{ decoder.layer[0x09], 0x09, },
{ decoder.layer[0x0a], 0x0a, },
{ decoder.layer[0x0b], 0x0b, },
}
,norm
{
const_buffer{decoder.f.bias},
const_buffer{decoder.f.weight},
}
{
}

ircd::gpt::pipe::model::decoder::~decoder()
noexcept
{
}
//
// pipe::model::block
//
// Upload one whole layer into a single master buffer; the attn and ffnn
// views alias into it at offsets matching the host struct layout.
ircd::gpt::pipe::model::block::block(const gpt::model::block &block,
const size_t layer)
:master
{
sizeof(block), const_buffer
{
reinterpret_cast<const char *>(&block), sizeof(block)
}
}
// Attention unit starts at offset 0 (ln1 precedes attn in the layout).
,attn
{
master,
0,
block.ln1,
block.attn,
}
// Feed-forward unit follows the attention unit.
,ffnn
{
master,
sizeof(block.ln1) + sizeof(block.attn),
block.ln2,
block.ffnn,
}
{
}
//
// pipe::model::ffnn
//
// Carve the feed-forward unit's tensors out of the layer's master buffer
// as sub-buffers at sequential offsets: norm, fcon, proj.
ircd::gpt::pipe::model::ffnn::ffnn(cl::data &master,
const off_t offset,
const gpt::model::norm &norm,
const gpt::model::ffnn &ffnn)
:norm
{
master,
offset,
const_buffer{norm.bias},
const_buffer{norm.weight},
}
,fcon
{
master,
offset + off_t(sizeof(norm)),
const_buffer{ffnn.fc_bias},
const_buffer{ffnn.fc_weight},
}
,proj
{
master,
offset + off_t(sizeof(norm) + sizeof(ffnn.fc_bias) + sizeof(ffnn.fc_weight)),
const_buffer{ffnn.proj_bias},
const_buffer{ffnn.proj_weight},
}
{
// Verify the host structs are contiguous in the order the sub-buffer
// offsets above assume.
always_assert
(
ircd::data(const_buffer{ffnn.proj_weight})
==
ircd::data(const_buffer{norm.bias}) +
sizeof(norm) +
sizeof(ffnn.fc_bias) +
sizeof(ffnn.fc_weight) +
ircd::size(const_buffer{ffnn.proj_bias})
);
}
//
// pipe::model::attn
//
// Carve the attention unit's tensors out of the layer's master buffer:
// norm, q/k/v fcon, then the mask (attn.bias) and the projection after it.
ircd::gpt::pipe::model::attn::attn(cl::data &master,
const off_t offset,
const gpt::model::norm &norm,
const gpt::model::attn &attn)
:norm
{
master,
offset,
const_buffer{norm.bias},
const_buffer{norm.weight},
}
,fcon
{
master,
offset + off_t(sizeof(norm)),
const_buffer{attn.attn_bias},
const_buffer{attn.attn_weight},
}
// proj sits after the mask (attn.bias) in the host layout.
,proj
{
master,
offset + off_t(sizeof(norm) + sizeof(attn.attn_bias) + sizeof(attn.attn_weight) + sizeof(attn.bias)),
const_buffer{attn.proj_bias},
const_buffer{attn.proj_weight},
}
,mask
{
master,
{
sizeof(attn.bias),
offset + off_t(sizeof(norm) + sizeof(attn.attn_bias) + sizeof(attn.attn_weight)),
},
}
{
// Verify the host structs are contiguous in the order assumed above.
always_assert
(
ircd::data(const_buffer{attn.proj_weight})
==
ircd::data(const_buffer{norm.bias}) +
sizeof(norm) +
sizeof(attn.bias) +
sizeof(attn.attn_bias) +
sizeof(attn.attn_weight) +
ircd::size(const_buffer{attn.proj_bias})
);
}
//
// pipe::model::tensor
//
// Standalone tensor: allocate and upload bias and weight as independent
// device buffers.
ircd::gpt::pipe::model::tensor::tensor(const const_buffer &bias,
const const_buffer &weight)
:bias
{
ircd::size(bias),
bias,
}
,weight
{
ircd::size(weight),
weight,
}
{
}

// Aliasing tensor: create bias/weight as sub-buffers of a master buffer,
// with the weight immediately following the bias at the given offset.
ircd::gpt::pipe::model::tensor::tensor(cl::data &master,
const off_t offset,
const const_buffer &bias,
const const_buffer &weight)
:bias
{
master,
{
ircd::size(bias), // size
offset, // offset
},
}
,weight
{
master,
{
ircd::size(weight), // size
offset + ircd::size(bias), // offset
}
}
{
}