From d377674748075296c8b5d0eb6206131e0eaace67 Mon Sep 17 00:00:00 2001
From: Jason Volk
Date: Wed, 28 Dec 2022 02:13:36 +0000
Subject: [PATCH] ircd::simt: Split vector reduce_add to hadd.

---
 include/ircd/simt/hadd.h       | 51 ++++++++++++++++++++++++++++++++++
 include/ircd/simt/mean.h       |  2 +-
 include/ircd/simt/reduce_add.h | 13 ---------
 include/ircd/simt/simt.h       |  1 +
 ircd/gpt_gpu.cl                |  4 +--
 5 files changed, 55 insertions(+), 16 deletions(-)
 create mode 100644 include/ircd/simt/hadd.h

diff --git a/include/ircd/simt/hadd.h b/include/ircd/simt/hadd.h
new file mode 100644
index 000000000..d0313f839
--- /dev/null
+++ b/include/ircd/simt/hadd.h
@@ -0,0 +1,51 @@
+// Matrix Construct
+//
+// Copyright (C) Matrix Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2022 Jason Volk
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_SIMT_HADD_H
+// Horizontal add: returns the sum of the 4 components of `in` as one float.
+#if defined(__OPENCL_VERSION__) && defined(__SIZEOF_FLOAT4__) // built only when both macros are defined
+inline float
+__attribute__((always_inline))
+ircd_simt_hadd_f4(const float4 in)
+{
+	float ret = 0.0f; // running total
+	for(uint i = 0; i < 4; ++i) // add components in ascending index order
+		ret += in[i];
+
+	return ret;
+}
+#endif
+// Horizontal add: returns the sum of the 8 components of `in` as one float.
+#if defined(__OPENCL_VERSION__) && defined(__SIZEOF_FLOAT8__) // built only when both macros are defined
+inline float
+__attribute__((always_inline))
+ircd_simt_hadd_f8(const float8 in)
+{
+	float ret = 0.0f; // running total
+	for(uint i = 0; i < 8; ++i) // add components in ascending index order
+		ret += in[i];
+
+	return ret;
+}
+#endif
+// Horizontal add: returns the sum of the 16 components of `in` as one float.
+#if defined(__OPENCL_VERSION__) && defined(__SIZEOF_FLOAT16__) // built only when both macros are defined
+inline float
+__attribute__((always_inline))
+ircd_simt_hadd_f16(const float16 in)
+{
+	float ret = 0.0f; // running total
+	for(uint i = 0; i < 16; ++i) // add components in ascending index order
+		ret += in[i];
+
+	return ret;
+}
+#endif
diff --git a/include/ircd/simt/mean.h b/include/ircd/simt/mean.h
index 2a246574c..42a4d9d34 100644
--- a/include/ircd/simt/mean.h
+++ b/include/ircd/simt/mean.h
@@ -61,7 +61,7 @@ ircd_simt_math_mean_f4lldr(__local float4 *const buf,
 	if(li == 0)
 	{
 		const float
-		sum = ircd_simt_reduce_add_f4(buf[li]),
+		sum = ircd_simt_hadd_f4(buf[li]),
 		div = ln * 4,
 		res = sum / div;
 
diff --git a/include/ircd/simt/reduce_add.h b/include/ircd/simt/reduce_add.h
index 21abc24e2..3e3c5b5ad 100644
--- a/include/ircd/simt/reduce_add.h
+++ b/include/ircd/simt/reduce_add.h
@@ -88,16 +88,3 @@ ircd_simt_reduce_add_ulldr(__local uint *const buf,
 		atomic_add(buf + 0, buf[li]);
 }
 #endif
-
-#ifdef __OPENCL_VERSION__
-inline float
-__attribute__((always_inline))
-ircd_simt_reduce_add_f4(const float4 in)
-{
-	float ret = 0.0f;
-	for(uint i = 0; i < 4; ++i)
-		ret += in[i];
-
-	return ret;
-}
-#endif
diff --git a/include/ircd/simt/simt.h b/include/ircd/simt/simt.h
index e86433514..b3e769c22 100644
--- a/include/ircd/simt/simt.h
+++ b/include/ircd/simt/simt.h
@@ -15,6 +15,7 @@
 #include "assert.h"
 #include "cycles.h"
 #include "math.h"
+#include "hadd.h"
 #include "broadcast.h"
 #include "reduce_add.h"
 #include "reduce_max.h"
diff --git a/ircd/gpt_gpu.cl b/ircd/gpt_gpu.cl
index dff08941a..5ce3ef39a 100644
--- a/ircd/gpt_gpu.cl
+++ b/ircd/gpt_gpu.cl
@@ -303,7 +303,7 @@ ircd_gpt_attn_self_keys(__global const struct ircd_gpt_ctrl *const ctrl,
 			key = token[i].key.attn[li][k],
 			res = qry * key;
 
-			self[i][li] += ircd_simt_reduce_add_f4(res);
+			self[i][li] += ircd_simt_hadd_f4(res);
 		}
 
 		self[i][li] /= 8.0f;
@@ -679,7 +679,7 @@ ircd_gpt_lm_logit(__global const struct ircd_gpt_ctrl *const ctrl,
 		wpe = pos[wi].elem[j],
 		res = in * token + wpe;
 
-		acc += ircd_simt_reduce_add_f4(res);
+		acc += ircd_simt_hadd_f4(res);
 	}
 
 	logit[gi] = acc;