mirror of
https://github.com/matrix-construct/construct
synced 2024-09-27 11:18:51 +02:00
ircd::simt: Split reduce tools; add float4 horizontal add.
This commit is contained in:
parent
fac509e306
commit
075b40400a
4 changed files with 41 additions and 19 deletions
|
@ -20,8 +20,7 @@ ircd_simt_math_mean_f4lldr(__local float4 *const restrict out,
|
|||
ircd_simt_reduce_add_f4lldr(out, num, i);
|
||||
|
||||
if(i == 0)
|
||||
for(uint k = 1; k < 4; ++k)
|
||||
out[i][0] += out[i][k];
|
||||
out[i][0] = ircd_simt_reduce_add_f4(out[i]);
|
||||
|
||||
if(i == 0)
|
||||
out[i] = out[i][0] / (num * 4);
|
||||
|
|
38
include/ircd/simt/reduce_add.h
Normal file
38
include/ircd/simt/reduce_add.h
Normal file
|
@ -0,0 +1,38 @@
|
|||
// Matrix Construct
|
||||
//
|
||||
// Copyright (C) Matrix Construct Developers, Authors & Contributors
|
||||
// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
|
||||
//
|
||||
// Permission to use, copy, modify, and/or distribute this software for any
|
||||
// purpose with or without fee is hereby granted, provided that the above
|
||||
// copyright notice and this permission notice is present in all copies. The
|
||||
// full license for this software is available in the LICENSE file.
|
||||
|
||||
/// Sum all `ln` elements of `buf` lane-wise, with the whole group cooperating.
///
/// Every work-item must call this with the same `buf` and `ln`, passing its
/// own index as `li`. On return buf[0] holds the sum; the remaining elements
/// hold partial sums and must be treated as trashed.
///
/// Any `ln` is accepted, not only powers of two: each pass folds the upper
/// part of the live range onto the lower part, and when the live range is odd
/// its middle element is carried into the next pass untouched. For a
/// power-of-two `ln` the additions performed, and their order, are the same
/// as a plain halving-stride reduction.
///
/// No barrier follows the final pass, so only work-item 0 (the one which
/// wrote buf[0]) may read the result without further synchronization.
inline void
ircd_simt_reduce_add_f4lldr(__local float4 *const buf,
                            const uint ln,
                            const uint li)
{
	uint live = ln;
	while(live > 1)
	{
		// Pairs folded this pass, and the distance between the partners.
		// stride == ceil(live / 2), so li + stride < live for li < pairs and
		// the elements read never overlap the elements written.
		const uint pairs = live >> 1;
		const uint stride = live - pairs;

		// Publish the previous pass's writes before anyone reads them.
		barrier(CLK_LOCAL_MEM_FENCE);

		if(li < pairs)
			buf[li] += buf[li + stride];

		live = stride;
	}
}
|
||||
|
||||
/// Horizontal add: sum the four lanes of a single float4 and return the
/// scalar total. Only the by-value argument is read; no buffer is touched
/// and no synchronization is involved, so any work-item may call this alone.
inline float
__attribute__((always_inline))
ircd_simt_reduce_add_f4(const float4 in)
{
	// Lanes are accumulated strictly left to right onto a zero seed so the
	// rounding of the result is deterministic.
	float ret = 0.0f;
	ret += in.x;
	ret += in.y;
	ret += in.z;
	ret += in.w;

	return ret;
}
|
|
@ -8,22 +8,6 @@
|
|||
// copyright notice and this permission notice is present in all copies. The
|
||||
// full license for this software is available in the LICENSE file.
|
||||
|
||||
/// Sum all `ln` elements of `buf` lane-wise, with the whole group cooperating.
///
/// Every work-item must call this with the same `buf` and `ln`, passing its
/// own index as `li`. On return buf[0] holds the sum; the remaining elements
/// hold partial sums and must be treated as trashed.
///
/// Any `ln` is accepted, not only powers of two: each pass folds the upper
/// part of the live range onto the lower part, and when the live range is odd
/// its middle element is carried into the next pass untouched. For a
/// power-of-two `ln` the additions performed, and their order, are the same
/// as a plain halving-stride reduction.
///
/// No barrier follows the final pass, so only work-item 0 (the one which
/// wrote buf[0]) may read the result without further synchronization.
inline void
ircd_simt_reduce_add_f4lldr(__local float4 *const buf,
                            const uint ln,
                            const uint li)
{
	uint live = ln;
	while(live > 1)
	{
		// Pairs folded this pass, and the distance between the partners.
		// stride == ceil(live / 2), so li + stride < live for li < pairs and
		// the elements read never overlap the elements written.
		const uint pairs = live >> 1;
		const uint stride = live - pairs;

		// Publish the previous pass's writes before anyone reads them.
		barrier(CLK_LOCAL_MEM_FENCE);

		if(li < pairs)
			buf[li] += buf[li + stride];

		live = stride;
	}
}
|
||||
|
||||
/// Find the greatest value in the buffer. All threads in the group participate;
|
||||
/// the greatest value is placed in index [0], the rest of the buffer is
|
||||
/// trashed.
|
|
@ -12,7 +12,8 @@
|
|||
#define HAVE_IRCD_SIMT_H
|
||||
|
||||
#include "broadcast.h"
|
||||
#include "reduce.h"
|
||||
#include "reduce_add.h"
|
||||
#include "reduce_max.h"
|
||||
#include "sort.h"
|
||||
#include "mean.h"
|
||||
#include "norm.h"
|
||||
|
|
Loading…
Reference in a new issue