From 536594b48750d12568e31fd2a412ac0dec3786b0 Mon Sep 17 00:00:00 2001
From: Jason Volk
Date: Mon, 26 Apr 2021 20:43:21 -0700
Subject: [PATCH] ircd::gpt::pipe: Consolidate loop; improve access pattern;
 various reorg.

---
 ircd/gpt_cl.cl | 101 +++++++++++++++++++++++++++++--------------------
 1 file changed, 59 insertions(+), 42 deletions(-)

diff --git a/ircd/gpt_cl.cl b/ircd/gpt_cl.cl
index bf58ce2a6..9f5bf168c 100644
--- a/ircd/gpt_cl.cl
+++ b/ircd/gpt_cl.cl
@@ -10,6 +10,7 @@
 
 
 inline void
+__attribute__((always_inline))
 ircd_gpt_norm_fmad(__local float4 *const out,
                    __local const float4 *const in,
                    __global const float4 *const restrict bias,
@@ -19,40 +20,6 @@ ircd_gpt_norm_fmad(__local float4 *const out,
 	out[i] = in[i] * weight[i] + bias[i];
 }
 
-// Matrix * Vector Multiply/Accumulate
-inline void
-ircd_gpt_sgemv(__local float4 *const restrict out,
-               __local const float4 *const restrict in,
-               __global const float4 *const restrict bias,
-               __global const float4 *const restrict weight,
-               const uint width,
-               const uint height,
-               const uint i)
-{
-	const uint
-	lanes = 4;
-
-	float4
-	acc = bias[i];
-
-	for(uint j = 0; j < height; ++j)
-	{
-		const uint
-		tile = j * lanes;
-
-		for(uint k = 0; k < lanes; ++k)
-		{
-			const uint
-			row = tile + k,
-			cell = row * width + i;
-
-			acc += in[j][k] * weight[cell];
-		}
-	}
-
-	out[i] = acc;
-}
-
 /// Gaussian Error Linear Unit
 inline void
 ircd_gpt_ffnn_gelu(__local float4 *const out,
@@ -78,6 +45,50 @@ ircd_gpt_ffnn_gelu(__local float4 *const out,
 	out[i] = a;
 }
 
+// Matrix * Vector Multiply/Accumulate
+inline void
+ircd_gpt_sgemv(__local float4 *const restrict out,
+               __local const float4 *const restrict in,
+               __global const float4 *const restrict bias,
+               __global const float4 *const restrict weight,
+               const uint width,
+               const uint height,
+               const uint segs)
+{
+	const uint
+	li = get_local_id(0),
+	ln = get_local_size(0),
+	lanes = 4;
+
+	__attribute__((opencl_unroll_hint))
+	for(uint i = 0; i < segs; ++i)
+	{
+		const uint
+		col = i * ln + li;
+
+		out[col] = bias[col];
+	}
+
+	for(uint j = 0; j < height; ++j)
+		for(uint i = 0; i < segs; ++i)
+		{
+			const uint
+			col = i * ln + li;
+
+			float4 acc = 0.0f;
+			for(uint k = 0; k < lanes; ++k)
+			{
+				const uint
+				row = j * lanes + k,
+				cell = row * width + col;
+
+				acc += in[j][k] * weight[cell];
+			}
+
+			out[col] += acc;
+		}
+}
+
 inline void
 __attribute__((always_inline))
 ircd_gpt_ffnn_fcon(__global const struct ircd_gpt_task *const ctrl,
@@ -94,8 +105,7 @@ ircd_gpt_ffnn_fcon(__global const struct ircd_gpt_task *const ctrl,
 	height = opts->ffnn_height,
 	tiles = opts->ffnn_mult;
 
-	for(uint i = 0; i < tiles; ++i)
-		ircd_gpt_sgemv(out->fcon, in->word, bias, weight, width, height, i * ln + li);
+	ircd_gpt_sgemv(out->fcon, in->word, bias, weight, width, height, tiles);
 
 	for(uint i = 0; i < tiles; ++i)
 		ircd_gpt_ffnn_gelu(out->fcon, out->fcon, i * ln + li);
@@ -107,6 +117,8 @@ ircd_gpt_ffnn(__global const struct ircd_gpt_task *const ctrl,
               __constant const struct ircd_gpt_opts *const opts,
              __local union ircd_gpt_tokenv *const restrict token,
              __local union ircd_gpt_ffnn_aperaturev *const restrict buf,
+              __local union ircd_gpt_ffnn_aperaturev *const restrict tmp0,
+              __local union ircd_gpt_tokenv *const restrict tmp1,
              __global const float4 *const restrict norm_bias,
              __global const float4 *const restrict norm_weight,
              __global const float4 *const restrict fcon_bias,
@@ -138,7 +150,7 @@ ircd_gpt_ffnn(__global const struct ircd_gpt_task *const ctrl,
 	barrier(CLK_LOCAL_MEM_FENCE);
 
 	// Projection
-	ircd_gpt_sgemv(token->word, buf->fcon, proj_bias, proj_weight, height, width, li);
+	ircd_gpt_sgemv(token->word, buf->fcon, proj_bias, proj_weight, height, width, 1);
 }
 
 inline void
@@ -197,12 +209,14 @@ ircd_gpt_attn_self(__global const struct ircd_gpt_task *const ctrl,
 			self[i][li] = exp(self[i][li] - mu);
 
 		float sum = 0.0f;
+		__attribute__((opencl_unroll_hint))
 		for(uint i = 0; i < wn; ++i)
 			sum += self[i][li];
 
 		const float
 		lambda = 1.0f / sum;
 
+		__attribute__((opencl_unroll_hint))
 		for(uint i = 0; i < wn; ++i)
 			self[i][li] *= lambda;
 	}
@@ -211,6 +225,7 @@ ircd_gpt_attn_self(__global const struct ircd_gpt_task *const ctrl,
 	barrier(CLK_LOCAL_MEM_FENCE);
 
 	float4 acc = 0.0f;
+	__attribute__((opencl_unroll_hint))
 	for(uint i = 0; i < wn; ++i)
 	{
 		const float4
@@ -243,7 +258,7 @@ ircd_gpt_attn_proj(__global const struct ircd_gpt_task *const ctrl,
 	height = opts->attn_height;
 
 	// Projection
-	ircd_gpt_sgemv(out->word, xattn->word, bias, weight, width, height, li);
+	ircd_gpt_sgemv(out->word, xattn->word, bias, weight, width, height, 1);
 }
 
 __kernel void
@@ -263,6 +278,7 @@ ircd_gpt_coil(__global const struct ircd_gpt_task *const ctrl,
 {
 	const uint
 	li = get_local_id(0),
+	ln = get_local_size(0),
 	wi = get_group_id(0);
 
 	__local union ircd_gpt_tokenv
@@ -271,7 +287,7 @@ ircd_gpt_coil(__global const struct ircd_gpt_task *const ctrl,
 	__local union
 	{
 		union ircd_gpt_ffnn_aperaturev
-		ffnn_fcon;
+		ffnn_fcon[2];
 
 		float
 		attn_self[512][12];
@@ -319,7 +335,9 @@ ircd_gpt_coil(__global const struct ircd_gpt_task *const ctrl,
 		ctrl,
 		opts,
 		&buf0,
-		&buf.ffnn_fcon,
+		buf.ffnn_fcon + 0,
+		buf.ffnn_fcon + 1,
+		&buf1,
 		ffnn_norm_bias,
 		ffnn_norm_weight,
 		ffnn_fcon_bias,
@@ -376,8 +394,7 @@ ircd_gpt_attn_fcon(__global const struct ircd_gpt_task *const ctrl,
 	barrier(CLK_LOCAL_MEM_FENCE);
 
 	// Fully connected
-	for(uint i = 0; i < tiles; ++i)
-		ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, width, height, i * ln + li);
+	ircd_gpt_sgemv(token.fcon, tmp, fcon_bias, fcon_weight, width, height, tiles);
 
 	// Export queries, keys, and values.
 	for(uint i = 0; i < tiles; ++i)
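
Note on the consolidated SGEMV above: the per-tile dispatch loops at the call
sites are folded into ircd_gpt_sgemv() itself, which now takes a segment count
(segs) instead of a single output index. Each work-item owns the output columns
col = i * ln + li for i in [0, segs), seeds them from the bias, and accumulates
row by row, so adjacent lanes touch adjacent weight cells. The following
standalone sketch is illustrative only and not part of the patch: it uses a
scalar float element type and a hypothetical kernel name, whereas the patched
routine operates on float4 lanes over __local/__global buffers.

/*
 * Minimal sketch, not part of the patch: a scalar rendering of the
 * consolidated SGEMV access pattern added above. The kernel name and
 * the scalar float element type are assumptions for illustration; the
 * patched ircd_gpt_sgemv() works on float4 lanes in __local memory.
 */
__kernel void
sgemv_sketch(__global float *const restrict out,
             __global const float *const restrict in,
             __global const float *const restrict bias,
             __global const float *const restrict weight,
             const uint width,   // columns of weight == elements of out
             const uint height,  // rows of weight == elements of in
             const uint segs)    // columns owned by each work-item
{
	const uint
	li = get_local_id(0),
	ln = get_local_size(0);

	// Seed each owned column with its bias before accumulating.
	for(uint i = 0; i < segs; ++i)
		out[i * ln + li] = bias[i * ln + li];

	// Row-major weight: for a fixed row, work-items li, li+1, ... read
	// adjacent cells, so the global loads coalesce across the group.
	for(uint row = 0; row < height; ++row)
		for(uint i = 0; i < segs; ++i)
		{
			const uint col = i * ln + li;
			out[col] += in[row] * weight[row * width + col];
		}
}

At the call sites this turns the former per-tile loops, such as
for(uint i = 0; i < tiles; ++i) ircd_gpt_sgemv(..., i * ln + li), into a single
call passing tiles, or 1 for the attention and FFNN projections.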