ircd::simt: Split vector reduce_add to hadd.

2024-12-24 14:34:00 +01:00 · 2022-12-28 02:13:36 +00:00 · 2022-12-28 02:13:36 +00:00 · d377674748
commit d377674748
parent 1f87668a28
5 changed files with 55 additions and 16 deletions
--- a/include/ircd/simt/hadd.h
+++ b/include/ircd/simt/hadd.h
@ -0,0 +1,51 @@
+// Matrix Construct
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2022 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_SIMT_HADD_H
+
+#if defined(__OPENCL_VERSION__) && defined(__SIZEOF_FLOAT4__)
+// Horizontal add: sums the 4 components of `in` in index order, from 0.0f.
+inline float
+__attribute__((always_inline))
+ircd_simt_hadd_f4(const float4 in)
+{
+	float sum = 0.0f;
+	for(uint c = 0; c < 4; ++c)
+		sum += in[c];
+	return sum;
+}
+#endif
+
+#if defined(__OPENCL_VERSION__) && defined(__SIZEOF_FLOAT8__)
+// Horizontal add: sums the 8 components of `in` in index order, from 0.0f.
+inline float
+__attribute__((always_inline))
+ircd_simt_hadd_f8(const float8 in)
+{
+	float sum = 0.0f;
+	for(uint c = 0; c < 8; ++c)
+		sum += in[c];
+	return sum;
+}
+#endif
+
+#if defined(__OPENCL_VERSION__) && defined(__SIZEOF_FLOAT16__)
+// Horizontal add: sums the 16 components of `in` in index order, from 0.0f.
+inline float
+__attribute__((always_inline))
+ircd_simt_hadd_f16(const float16 in)
+{
+	float sum = 0.0f;
+	for(uint c = 0; c < 16; ++c)
+		sum += in[c];
+	return sum;
+}
+#endif
--- a/include/ircd/simt/mean.h
+++ b/include/ircd/simt/mean.h
@ -61,7 +61,7 @@ ircd_simt_math_mean_f4lldr(__local float4 *const buf,
 	if(li == 0)
 	{
 		const float
-		sum = ircd_simt_reduce_add_f4(buf[li]),
+		sum = ircd_simt_hadd_f4(buf[li]),
 		div = ln * 4,
 		res = sum / div;

--- a/include/ircd/simt/reduce_add.h
+++ b/include/ircd/simt/reduce_add.h
@ -88,16 +88,3 @@ ircd_simt_reduce_add_ulldr(__local uint *const buf,
 		atomic_add(buf + 0, buf[li]);
 }
 #endif
-
-#ifdef __OPENCL_VERSION__
-inline float
-__attribute__((always_inline))
-ircd_simt_reduce_add_f4(const float4 in)
-{
-	float ret = 0.0f;
-	for(uint i = 0; i < 4; ++i)
-		ret += in[i];
-
-	return ret;
-}
-#endif
--- a/include/ircd/simt/simt.h
+++ b/include/ircd/simt/simt.h
@ -15,6 +15,7 @@
 #include "assert.h"
 #include "cycles.h"
 #include "math.h"
+#include "hadd.h"
 #include "broadcast.h"
 #include "reduce_add.h"
 #include "reduce_max.h"
--- a/ircd/gpt_gpu.cl
+++ b/ircd/gpt_gpu.cl
@ -303,7 +303,7 @@ ircd_gpt_attn_self_keys(__global const struct ircd_gpt_ctrl *const ctrl,
 		key = token[i].key.attn[li][k],
 		res = qry * key;

-		self[i][li] += ircd_simt_reduce_add_f4(res);
+		self[i][li] += ircd_simt_hadd_f4(res);
 	}

 	self[i][li] /= 8.0f;
@ -679,7 +679,7 @@ ircd_gpt_lm_logit(__global const struct ircd_gpt_ctrl *const ctrl,
 		wpe = pos[wi].elem[j],
 		res = in * token + wpe;

-		acc += ircd_simt_reduce_add_f4(res);
+		acc += ircd_simt_hadd_f4(res);
 	}

 	logit[gi] = acc;