From 5e21435e1a04659bd7b4ac6a8148cc5f22787ecf Mon Sep 17 00:00:00 2001
From: Jason Volk <jason@zemos.net>
Date: Sat, 10 Oct 2020 01:18:37 -0700
Subject: [PATCH] ircd::simd: Add closure to stream interface; include in
 stack.

---
 include/ircd/simd/simd.h   |   1 +
 include/ircd/simd/stream.h | 118 +++++++++++++++++++++----------
 2 files changed, 69 insertions(+), 50 deletions(-)

diff --git a/include/ircd/simd/simd.h b/include/ircd/simd/simd.h
index 32c8a092b..9347accdc 100644
--- a/include/ircd/simd/simd.h
+++ b/include/ircd/simd/simd.h
@@ -32,6 +32,7 @@
 #include "lateral.h"
 #include "any.h"
 #include "all.h"
+#include "stream.h"
 #include "for_each.h"
 #include "transform.h"
 #include "generate.h"
diff --git a/include/ircd/simd/stream.h b/include/ircd/simd/stream.h
index 580fe3958..2ce5cfa2a 100644
--- a/include/ircd/simd/stream.h
+++ b/include/ircd/simd/stream.h
@@ -13,26 +13,17 @@
 
 namespace ircd::simd
 {
-    size_t stream_aligned(const mutable_buffer &dst, const const_buffer &src);
-}
+    // Using the AVX512 vector type by default as it conveniently matches the
+    // cache-line size on the relevant platforms and simplifies our syntax below
+    // by being a single object. On those w/o AVX512 it uses an isomorphic
+    // configuration of the best available regs.
+    using stream_line_t = u512x1;
+
+    template<class block_t>
+    using stream_proto = void (block_t &);
 
-/// Non-temporal copy. This copies from an aligned source to an aligned
-/// destination without the data cycling through the d-cache. The alignment
-/// requirements are currently very strict. The source and destination buffers
-/// must begin at a cache-line alignment and the size of the buffers must be
-/// a multiple of something we'll call "register-file size" which is the size
-/// of all named multimedia registers (256 for SSE, 512 for AVX, 2048 for
-/// AVX512) so it's safe to say buffers should just be aligned and padded out
-/// to 4K page-size to be safe. The size of the src argument itself can be an
-/// arbitrary size and this function will return that size, but its backing
-/// buffer must be padded out to alignment.
-///
-inline size_t
-ircd::simd::stream_aligned(const mutable_buffer &dst,
-                           const const_buffer &src)
-{
     // Platforms that have non-temporal store support; this is all of x86_64
-    constexpr bool has_store
+    constexpr bool stream_has_store
     {
 #if defined(__SSE2__) && !defined(RB_GENERIC)
         true
@@ -43,7 +34,7 @@ ircd::simd::stream_aligned(const mutable_buffer &dst,
 
     // Platforms that have non-temporal load support; sidenote SSE4.1 can do
    // 16 byte loads and AVX2 can do 32 byte loads; SSE2 cannot do loads.
-    constexpr bool has_load
+    constexpr bool stream_has_load
     {
 #if defined(__AVX__) && !defined(RB_GENERIC)
         true
@@ -52,17 +43,11 @@ ircd::simd::stream_aligned(const mutable_buffer &dst,
 #endif
     };
 
-    // Use the AVX512 vector type unconditionally as it conveniently matches
-    // the cache-line size on the relevant platforms and simplifies our syntax
-    // below by being a single object. On those w/o AVX512 it uses an
-    // isomorphic configuration of the best available regs.
-    using block_type = u512x1;
-
     // The number of cache lines we'll have "in flight" which is basically
     // just a gimmick to unroll the loop such that each iteration covers
     // the full register file. On SSE with 256 bytes of register file we can
     // name 4 cache lines at once; on AVX with 512 bytes we can name 8, etc.
-    constexpr size_t file_lines
+    constexpr size_t stream_max_lines
     {
 #if defined(__AVX512F__)
         32
@@ -76,21 +61,48 @@ ircd::simd::stream_aligned(const mutable_buffer &dst,
     // Configurable magic number only relevant to SSE2 systems which don't have
     // non-temporal load instructions. On these platforms we'll conduct a
     // prefetch loop and mark the lines NTA.
-    constexpr size_t latency
+    constexpr size_t stream_latency
     {
         16
     };
 
-    // When the constexpr conditions aren't favorable we can fallback to
-    // normal copy here.
-    if constexpr(!has_store && !has_load)
-        return copy(dst, src);
+    template<size_t bandwidth = stream_max_lines,
+             class lambda = stream_proto<stream_line_t>>
+    mutable_buffer
+    stream(const mutable_buffer &, const const_buffer &, lambda&&) noexcept;
+}
+
+/// Non-temporal stream. This copies from an aligned source to an aligned
+/// destination without the data cycling through the d-cache, applying the
+/// closure to each line as it passes through the register file. The
+/// alignment requirements are currently very strict: both buffers must
+/// begin at a cache-line alignment and their sizes must be a multiple of
+/// something we'll call "register-file size" which is the size of all
+/// named multimedia registers (256 for SSE, 512 for AVX, 2048 for AVX512),
+/// so it's simplest to align and pad buffers out to the 4K page-size. The
+/// size of the src argument can be arbitrary; this function returns a
+/// buffer of that size at the destination.
+///
+template<size_t bandwidth,
+         class lambda>
+inline ircd::mutable_buffer
+ircd::simd::stream(const mutable_buffer &dst,
+                   const const_buffer &src,
+                   lambda&& closure)
+noexcept
+{
+    using line_t = stream_line_t;
+
+    constexpr auto file_lines
+    {
+        std::min(bandwidth, stream_max_lines)
+    };
 
     // Assert valid arguments
     assert(!overlap(src, dst));
-    assert(aligned(data(src), sizeof(block_type)));
-    assert(aligned(data(dst), sizeof(block_type)));
-    assert(size(dst) % (sizeof(block_type) * file_lines));
+    assert(aligned(data(src), sizeof(line_t)));
+    assert(aligned(data(dst), sizeof(line_t)));
+    assert(size(dst) % (sizeof(line_t) * file_lines) == 0);
 
     // Size in bytes to be copied
     const size_t copy_size
@@ -101,50 +113,56 @@
     // Number of lines to be copied.
     const size_t copy_lines
     {
-        (copy_size / sizeof(block_type)) + bool(copy_size % sizeof(block_type))
+        (copy_size / sizeof(line_t)) + bool(copy_size % sizeof(line_t))
     };
 
     // destination base ptr
-    block_type *const __restrict__ out
+    line_t *const __restrict__ out
     {
-        reinterpret_cast<block_type *>(data(dst))
+        reinterpret_cast<line_t *>(data(dst))
     };
 
     // source base ptr
-    const block_type *const __restrict__ in
+    const line_t *const __restrict__ in
     {
-        reinterpret_cast<const block_type *>(data(src))
+        reinterpret_cast<const line_t *>(data(src))
     };
 
-    if constexpr(!has_load)
+    if constexpr(!stream_has_load)
         #pragma clang loop unroll(disable)
-        for(size_t i(0); i < latency; ++i)
+        for(size_t i(0); i < stream_latency; ++i)
             __builtin_prefetch(in + i, 0, 0);
 
     for(size_t i(0); i < copy_lines; i += file_lines)
     {
-        if constexpr(!has_load)
+        if constexpr(!stream_has_load)
             for(size_t j(0); j < file_lines; ++j)
-                __builtin_prefetch(in + i + latency + j, 0, 0);
+                __builtin_prefetch(in + i + stream_latency + j, 0, 0);
 
-        block_type block[file_lines];
+        line_t line[file_lines];
         for(size_t j(0); j < file_lines; ++j)
 #if defined(__clang__)
-            block[j] = __builtin_nontemporal_load(in + i + j);
+            line[j] = __builtin_nontemporal_load(in + i + j);
 #else
-            block[j] = *(in + i + j); //TODO: XXX
+            line[j] = in[i + j];
 #endif
 
+        for(size_t j(0); j < file_lines; ++j)
+            closure(line[j]);
+
         for(size_t j(0); j < file_lines; ++j)
 #if defined(__clang__)
-            __builtin_nontemporal_store(block[j], out + i + j);
+            __builtin_nontemporal_store(line[j], out + i + j);
 #else
-            *(out + i + j) = block[j]; //TODO: XXX
+            *(out + i + j) = line[j]; //TODO: XXX
 #endif
     }
 
-    if constexpr(has_store)
+    if constexpr(stream_has_store)
         asm volatile ("sfence");
 
-    return copy_size;
+    return mutable_buffer
+    {
+        data(dst), copy_size
+    };
 }
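
For illustration, a minimal sketch of calling the new interface; the buffer
names and sizes here are hypothetical, and both buffers are assumed to be
cache-line aligned and padded out as the doc comment requires. A no-op
closure recovers the plain non-temporal copy previously performed by
stream_aligned().

    alignas(64) char from[4096], to[4096];

    const ircd::const_buffer src
    {
        from, sizeof(from)
    };

    const ircd::mutable_buffer dst
    {
        to, sizeof(to)
    };

    // Stream src into dst eight cache-lines per loop iteration; the closure
    // observes each 64-byte line while it is still held in registers.
    const auto copied
    {
        ircd::simd::stream<8>(dst, src, [](ircd::simd::stream_line_t &line)
        {
            // Transform the line in-register here; leaving it untouched
            // makes this a straight non-temporal copy.
        })
    };

The explicit template argument requests an unroll bandwidth of eight lines;
std::min() clamps it against stream_max_lines, so requesting more than the
platform's register file can name is harmless.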