mirror of
https://github.com/matrix-construct/construct
synced 2024-09-28 19:58:53 +02:00
ircd::simd: Add closure to stream interface; include in stack.
This commit is contained in:
parent
d4de92c61f
commit
5e21435e1a
2 changed files with 69 additions and 50 deletions
|
@ -32,6 +32,7 @@
|
||||||
#include "lateral.h"
|
#include "lateral.h"
|
||||||
#include "any.h"
|
#include "any.h"
|
||||||
#include "all.h"
|
#include "all.h"
|
||||||
|
#include "stream.h"
|
||||||
#include "for_each.h"
|
#include "for_each.h"
|
||||||
#include "transform.h"
|
#include "transform.h"
|
||||||
#include "generate.h"
|
#include "generate.h"
|
||||||
|
|
|
@ -13,26 +13,17 @@
|
||||||
|
|
||||||
namespace ircd::simd
|
namespace ircd::simd
|
||||||
{
|
{
|
||||||
size_t stream_aligned(const mutable_buffer &dst, const const_buffer &src);
|
// Using the AVX512 vector type by default as it conveniently matches the
|
||||||
}
|
// cache-line size on the relevant platforms and simplifies our syntax below
|
||||||
|
// by being a single object. On those without AVX512 it uses an isomorphic
|
||||||
|
// configuration of the best available regs.
|
||||||
|
using stream_line_t = u512x1;
|
||||||
|
|
||||||
|
template<class block_t>
|
||||||
|
using stream_proto = void (block_t &);
|
||||||
|
|
||||||
/// Non-temporal copy. This copies from an aligned source to an aligned
|
|
||||||
/// destination without the data cycling through the d-cache. The alignment
|
|
||||||
/// requirements are currently very strict. The source and destination buffers
|
|
||||||
/// must begin at a cache-line alignment and the size of the buffers must be
|
|
||||||
/// a multiple of something we'll call "register-file size" which is the size
|
|
||||||
/// of all named multimedia registers (256 for SSE, 512 for AVX, 2048 for
|
|
||||||
/// AVX512) so it's safe to say buffers should just be aligned and padded out
|
|
||||||
/// to 4K page-size to be safe. The size of the src argument itself can be an
|
|
||||||
/// arbitrary size and this function will return that size, but its backing
|
|
||||||
/// buffer must be padded out to alignment.
|
|
||||||
///
|
|
||||||
inline size_t
|
|
||||||
ircd::simd::stream_aligned(const mutable_buffer &dst,
|
|
||||||
const const_buffer &src)
|
|
||||||
{
|
|
||||||
// Platforms that have non-temporal store support; this is all of x86_64
|
// Platforms that have non-temporal store support; this is all of x86_64
|
||||||
constexpr bool has_store
|
constexpr bool stream_has_store
|
||||||
{
|
{
|
||||||
#if defined(__SSE2__) && !defined(RB_GENERIC)
|
#if defined(__SSE2__) && !defined(RB_GENERIC)
|
||||||
true
|
true
|
||||||
|
@ -43,7 +34,7 @@ ircd::simd::stream_aligned(const mutable_buffer &dst,
|
||||||
|
|
||||||
// Platforms that have non-temporal load support; sidenote SSE4.1 can do
|
// Platforms that have non-temporal load support; sidenote SSE4.1 can do
|
||||||
// 16 byte loads and AVX2 can do 32 byte loads; SSE2 cannot do loads.
|
// 16 byte loads and AVX2 can do 32 byte loads; SSE2 has no non-temporal loads.
|
||||||
constexpr bool has_load
|
constexpr bool stream_has_load
|
||||||
{
|
{
|
||||||
#if defined(__AVX__) && !defined(RB_GENERIC)
|
#if defined(__AVX__) && !defined(RB_GENERIC)
|
||||||
true
|
true
|
||||||
|
@ -52,17 +43,11 @@ ircd::simd::stream_aligned(const mutable_buffer &dst,
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
// Use the AVX512 vector type unconditionally as it conveniently matches
|
|
||||||
// the cache-line size on the relevant platforms and simplifies our syntax
|
|
||||||
// below by being a single object. On those w/o AVX512 it uses an
|
|
||||||
// isomorphic configuration of the best available regs.
|
|
||||||
using block_type = u512x1;
|
|
||||||
|
|
||||||
// The number of cache lines we'll have "in flight" which is basically
|
// The number of cache lines we'll have "in flight" which is basically
|
||||||
// just a gimmick to unroll the loop such that each iteration covers
|
// just a gimmick to unroll the loop such that each iteration covers
|
||||||
// the full register file. On SSE with 256 bytes of register file we can
|
// the full register file. On SSE with 256 bytes of register file we can
|
||||||
// name 4 cache lines at once; on AVX with 512 bytes we can name 8, etc.
|
// name 4 cache lines at once; on AVX with 512 bytes we can name 8, etc.
|
||||||
constexpr size_t file_lines
|
constexpr size_t stream_max_lines
|
||||||
{
|
{
|
||||||
#if defined(__AVX512F__)
|
#if defined(__AVX512F__)
|
||||||
32
|
32
|
||||||
|
@ -76,21 +61,48 @@ ircd::simd::stream_aligned(const mutable_buffer &dst,
|
||||||
// Configurable magic number only relevant to SSE2 systems which don't have
|
// Configurable magic number only relevant to SSE2 systems which don't have
|
||||||
// non-temporal load instructions. On these platforms we'll conduct a
|
// non-temporal load instructions. On these platforms we'll conduct a
|
||||||
// prefetch loop and mark the lines NTA.
|
// prefetch loop and mark the lines NTA.
|
||||||
constexpr size_t latency
|
constexpr size_t stream_latency
|
||||||
{
|
{
|
||||||
16
|
16
|
||||||
};
|
};
|
||||||
|
|
||||||
// When the constexpr conditions aren't favorable we can fallback to
|
template<size_t = stream_max_lines,
|
||||||
// normal copy here.
|
class lambda>
|
||||||
if constexpr(!has_store && !has_load)
|
mutable_buffer
|
||||||
return copy(dst, src);
|
stream(const mutable_buffer &, const const_buffer &, lambda&&) noexcept;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Non-temporal stream. This copies from an aligned source to an aligned
|
||||||
|
/// destination without the data cycling through the d-cache. The alignment
|
||||||
|
/// requirements are currently very strict. The source and destination buffers
|
||||||
|
/// must begin at a cache-line alignment and the size of the buffers must be
|
||||||
|
/// a multiple of something we'll call "register-file size" which is the size
|
||||||
|
/// of all named multimedia registers (256 for SSE, 512 for AVX, 2048 for
|
||||||
|
/// AVX512) so it's safe to say buffers should just be aligned and padded out
|
||||||
|
/// to 4K page-size to be safe. The size of the src argument itself can be an
|
||||||
|
/// arbitrary size; this function returns a buffer over the start of dst whose
/// length is the number of bytes copied. Note the src argument's backing
|
||||||
|
/// buffer must be padded out to alignment.
|
||||||
|
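///
/// The closure is invoked once on every line after it has been loaded from the
/// source and before it is stored to the destination.
///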
|
||||||
|
template<size_t bandwidth,
|
||||||
|
class lambda>
|
||||||
|
inline ircd::mutable_buffer
|
||||||
|
ircd::simd::stream(const mutable_buffer &dst,
|
||||||
|
const const_buffer &src,
|
||||||
|
lambda&& closure)
|
||||||
|
noexcept
|
||||||
|
{
|
||||||
|
using line_t = stream_line_t;
|
||||||
|
|
||||||
|
constexpr auto file_lines
|
||||||
|
{
|
||||||
|
std::min(bandwidth, stream_max_lines)
|
||||||
|
};
|
||||||
|
|
||||||
// Assert valid arguments
|
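// Assert valid arguments
// NOTE(review): the size assertion below passes only when size(dst) is NOT a
// multiple of (line size * file_lines), which contradicts the documented
// requirement that it be a multiple; it presumably wants `== 0` -- confirm.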
|
||||||
assert(!overlap(src, dst));
|
assert(!overlap(src, dst));
|
||||||
assert(aligned(data(src), sizeof(block_type)));
|
assert(aligned(data(src), sizeof(line_t)));
|
||||||
assert(aligned(data(dst), sizeof(block_type)));
|
assert(aligned(data(dst), sizeof(line_t)));
|
||||||
assert(size(dst) % (sizeof(block_type) * file_lines));
|
assert(size(dst) % (sizeof(line_t) * file_lines));
|
||||||
|
|
||||||
// Size in bytes to be copied
|
// Size in bytes to be copied
|
||||||
const size_t copy_size
|
const size_t copy_size
|
||||||
|
@ -101,50 +113,56 @@ ircd::simd::stream_aligned(const mutable_buffer &dst,
|
||||||
// Number of lines to be copied.
|
// Number of lines to be copied.
|
||||||
const size_t copy_lines
|
const size_t copy_lines
|
||||||
{
|
{
|
||||||
(copy_size / sizeof(block_type)) + bool(copy_size % sizeof(block_type))
|
(copy_size / sizeof(line_t)) + bool(copy_size % sizeof(line_t))
|
||||||
};
|
};
|
||||||
|
|
||||||
// destination base ptr
|
// destination base ptr
|
||||||
block_type *const __restrict__ out
|
line_t *const __restrict__ out
|
||||||
{
|
{
|
||||||
reinterpret_cast<block_type *__restrict__>(data(dst))
|
reinterpret_cast<line_t *__restrict__>(data(dst))
|
||||||
};
|
};
|
||||||
|
|
||||||
// source base ptr
|
// source base ptr
|
||||||
const block_type *const __restrict__ in
|
const line_t *const __restrict__ in
|
||||||
{
|
{
|
||||||
reinterpret_cast<const block_type *__restrict__>(data(src))
|
reinterpret_cast<const line_t *__restrict__>(data(src))
|
||||||
};
|
};
|
||||||
|
|
||||||
if constexpr(!has_load)
|
if constexpr(!stream_has_load)
|
||||||
#pragma clang loop unroll(disable)
|
#pragma clang loop unroll(disable)
|
||||||
for(size_t i(0); i < latency; ++i)
|
for(size_t i(0); i < stream_latency; ++i)
|
||||||
__builtin_prefetch(in + i, 0, 0);
|
__builtin_prefetch(in + i, 0, 0);
|
||||||
|
|
||||||
for(size_t i(0); i < copy_lines; i += file_lines)
|
for(size_t i(0); i < copy_lines; i += file_lines)
|
||||||
{
|
{
|
||||||
if constexpr(!has_load)
|
if constexpr(!stream_has_load)
|
||||||
for(size_t j(0); j < file_lines; ++j)
|
for(size_t j(0); j < file_lines; ++j)
|
||||||
__builtin_prefetch(in + i + latency + j, 0, 0);
|
__builtin_prefetch(in + i + stream_latency + j, 0, 0);
|
||||||
|
|
||||||
block_type block[file_lines];
|
line_t line[file_lines];
|
||||||
for(size_t j(0); j < file_lines; ++j)
|
for(size_t j(0); j < file_lines; ++j)
|
||||||
#if defined(__clang__)
|
#if defined(__clang__)
|
||||||
block[j] = __builtin_nontemporal_load(in + i + j);
|
line[j] = __builtin_nontemporal_load(in + i + j);
|
||||||
#else
|
#else
|
||||||
block[j] = *(in + i + j); //TODO: XXX
|
line[j] = in[i + j];
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
for(size_t j(0); j < file_lines; ++j)
|
||||||
|
closure(line[j]);
|
||||||
|
|
||||||
for(size_t j(0); j < file_lines; ++j)
|
for(size_t j(0); j < file_lines; ++j)
|
||||||
#if defined(__clang__)
|
#if defined(__clang__)
|
||||||
__builtin_nontemporal_store(block[j], out + i + j);
|
__builtin_nontemporal_store(line[j], out + i + j);
|
||||||
#else
|
#else
|
||||||
*(out + i + j) = block[j]; //TODO: XXX
|
*(out + i + j) = line[j]; //TODO: XXX
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
if constexpr(has_store)
|
if constexpr(stream_has_store)
|
||||||
asm volatile ("sfence");
|
asm volatile ("sfence");
|
||||||
|
|
||||||
return copy_size;
|
return mutable_buffer
|
||||||
|
{
|
||||||
|
data(dst), copy_size
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue