From d377674748075296c8b5d0eb6206131e0eaace67 Mon Sep 17 00:00:00 2001
From: Jason Volk <jason@zemos.net>
Date: Wed, 28 Dec 2022 02:13:36 +0000
Subject: [PATCH] ircd::simt: Split vector reduce_add to hadd.

---
 include/ircd/simt/hadd.h       | 51 ++++++++++++++++++++++++++++++++++
 include/ircd/simt/mean.h       |  2 +-
 include/ircd/simt/reduce_add.h | 13 ---------
 include/ircd/simt/simt.h       |  1 +
 ircd/gpt_gpu.cl                |  4 +--
 5 files changed, 55 insertions(+), 16 deletions(-)
 create mode 100644 include/ircd/simt/hadd.h

diff --git a/include/ircd/simt/hadd.h b/include/ircd/simt/hadd.h
new file mode 100644
index 000000000..d0313f839
--- /dev/null
+++ b/include/ircd/simt/hadd.h
@@ -0,0 +1,51 @@
+// Matrix Construct
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2022 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_SIMT_HADD_H
+
+#if defined(__OPENCL_VERSION__) && defined(__SIZEOF_FLOAT4__)
+// Horizontal add: sum of in[0..3], accumulated in index order from 0.0f.
+inline float
+__attribute__((always_inline))
+ircd_simt_hadd_f4(const float4 in)
+{
+	float sum = 0.0f;
+	for(uint k = 0; k != 4; k++)
+		sum = sum + in[k];
+	return sum;
+}
+#endif
+
+#if defined(__OPENCL_VERSION__) && defined(__SIZEOF_FLOAT8__)
+// Horizontal add: sum of in[0..7], accumulated in index order from 0.0f.
+inline float
+__attribute__((always_inline))
+ircd_simt_hadd_f8(const float8 in)
+{
+	float sum = 0.0f;
+	for(uint k = 0; k != 8; k++)
+		sum = sum + in[k];
+	return sum;
+}
+#endif
+
+#if defined(__OPENCL_VERSION__) && defined(__SIZEOF_FLOAT16__)
+// Horizontal add: sum of in[0..15], accumulated in index order from 0.0f.
+inline float
+__attribute__((always_inline))
+ircd_simt_hadd_f16(const float16 in)
+{
+	float sum = 0.0f;
+	for(uint k = 0; k != 16; k++)
+		sum = sum + in[k];
+	return sum;
+}
+#endif
diff --git a/include/ircd/simt/mean.h b/include/ircd/simt/mean.h
index 2a246574c..42a4d9d34 100644
--- a/include/ircd/simt/mean.h
+++ b/include/ircd/simt/mean.h
@@ -61,7 +61,7 @@ ircd_simt_math_mean_f4lldr(__local float4 *const buf,
 	if(li == 0)
 	{
 		const float
-		sum = ircd_simt_reduce_add_f4(buf[li]),
+		sum = ircd_simt_hadd_f4(buf[li]),
 		div = ln * 4,
 		res = sum / div;
 
diff --git a/include/ircd/simt/reduce_add.h b/include/ircd/simt/reduce_add.h
index 21abc24e2..3e3c5b5ad 100644
--- a/include/ircd/simt/reduce_add.h
+++ b/include/ircd/simt/reduce_add.h
@@ -88,16 +88,3 @@ ircd_simt_reduce_add_ulldr(__local uint *const buf,
 		atomic_add(buf + 0, buf[li]);
 }
 #endif
-
-#ifdef __OPENCL_VERSION__
-inline float
-__attribute__((always_inline))
-ircd_simt_reduce_add_f4(const float4 in)
-{
-	float ret = 0.0f;
-	for(uint i = 0; i < 4; ++i)
-		ret += in[i];
-
-	return ret;
-}
-#endif
diff --git a/include/ircd/simt/simt.h b/include/ircd/simt/simt.h
index e86433514..b3e769c22 100644
--- a/include/ircd/simt/simt.h
+++ b/include/ircd/simt/simt.h
@@ -15,6 +15,7 @@
 #include "assert.h"
 #include "cycles.h"
 #include "math.h"
+#include "hadd.h"
 #include "broadcast.h"
 #include "reduce_add.h"
 #include "reduce_max.h"
diff --git a/ircd/gpt_gpu.cl b/ircd/gpt_gpu.cl
index dff08941a..5ce3ef39a 100644
--- a/ircd/gpt_gpu.cl
+++ b/ircd/gpt_gpu.cl
@@ -303,7 +303,7 @@ ircd_gpt_attn_self_keys(__global const struct ircd_gpt_ctrl *const ctrl,
 		key = token[i].key.attn[li][k],
 		res = qry * key;
 
-		self[i][li] += ircd_simt_reduce_add_f4(res);
+		self[i][li] += ircd_simt_hadd_f4(res);
 	}
 
 	self[i][li] /= 8.0f;
@@ -679,7 +679,7 @@ ircd_gpt_lm_logit(__global const struct ircd_gpt_ctrl *const ctrl,
 		wpe = pos[wi].elem[j],
 		res = in * token + wpe;
 
-		acc += ircd_simt_reduce_add_f4(res);
+		acc += ircd_simt_hadd_f4(res);
 	}
 
 	logit[gi] = acc;