From 075b40400a648d239d29e57f29f472d4640b17f8 Mon Sep 17 00:00:00 2001 From: Jason Volk Date: Thu, 15 Apr 2021 09:02:44 -0700 Subject: [PATCH] ircd::simt: Split reduce tools; add float4 horizontal add. --- include/ircd/simt/mean.h | 3 +- include/ircd/simt/reduce_add.h | 38 ++++++++++++++++++++ include/ircd/simt/{reduce.h => reduce_max.h} | 16 --------- include/ircd/simt/simt.h | 3 +- 4 files changed, 41 insertions(+), 19 deletions(-) create mode 100644 include/ircd/simt/reduce_add.h rename include/ircd/simt/{reduce.h => reduce_max.h} (67%) diff --git a/include/ircd/simt/mean.h b/include/ircd/simt/mean.h index ab7eae2c9..4a27cf822 100644 --- a/include/ircd/simt/mean.h +++ b/include/ircd/simt/mean.h @@ -20,8 +20,7 @@ ircd_simt_math_mean_f4lldr(__local float4 *const restrict out, ircd_simt_reduce_add_f4lldr(out, num, i); if(i == 0) - for(uint k = 1; k < 4; ++k) - out[i][0] += out[i][k]; + out[i][0] = ircd_simt_reduce_add_f4(out[i]); if(i == 0) out[i] = out[i][0] / (num * 4); diff --git a/include/ircd/simt/reduce_add.h b/include/ircd/simt/reduce_add.h new file mode 100644 index 000000000..2a13c5166 --- /dev/null +++ b/include/ircd/simt/reduce_add.h @@ -0,0 +1,38 @@ +// Matrix Construct +// +// Copyright (C) Matrix Construct Developers, Authors & Contributors +// Copyright (C) 2016-2021 Jason Volk +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice is present in all copies. The +// full license for this software is available in the LICENSE file. + +/// Sum all elements in the buffer. All threads in the group participate; +/// result is placed in index [0], the rest of the buffer is trashed. 
+inline void +ircd_simt_reduce_add_f4lldr(__local float4 *const buf, + const uint ln, + const uint li) +{ + for(uint stride = ln >> 1; stride > 0; stride >>= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + + if(li < stride) + buf[li] += buf[li + stride]; + } +} + +/// Horizontal add: sum the four components of a single float4 and return the +/// scalar total. Only the by-value argument is read; no buffer is modified. +inline float +__attribute__((always_inline)) +ircd_simt_reduce_add_f4(const float4 in) +{ + float ret = 0.0f; + for(uint i = 0; i < 4; ++i) + ret += in[i]; + + return ret; +} diff --git a/include/ircd/simt/reduce.h b/include/ircd/simt/reduce_max.h similarity index 67% rename from include/ircd/simt/reduce.h rename to include/ircd/simt/reduce_max.h index 5e4fed8df..f8ff577de 100644 --- a/include/ircd/simt/reduce.h +++ b/include/ircd/simt/reduce_max.h @@ -8,22 +8,6 @@ // copyright notice and this permission notice is present in all copies. The // full license for this software is available in the LICENSE file. -/// Sum all elements in the buffer. All threads in the group participate; -/// result is placed in index [0], the rest of the buffer is trashed. -inline void -ircd_simt_reduce_add_f4lldr(__local float4 *const buf, - const uint ln, - const uint li) -{ - for(uint stride = ln >> 1; stride > 0; stride >>= 1) - { - barrier(CLK_LOCAL_MEM_FENCE); - - if(li < stride) - buf[li] += buf[li + stride]; - } -} - /// Find the greatest value in the buffer. All threads in the group participate; /// the greatest value is placed in index [0], the rest of the buffer is /// trashed. diff --git a/include/ircd/simt/simt.h b/include/ircd/simt/simt.h index 76bba36d4..759520424 100644 --- a/include/ircd/simt/simt.h +++ b/include/ircd/simt/simt.h @@ -12,7 +12,8 @@ #define HAVE_IRCD_SIMT_H #include "broadcast.h" -#include "reduce.h" +#include "reduce_add.h" +#include "reduce_max.h" #include "sort.h" #include "mean.h" #include "norm.h"