From 5e21435e1a04659bd7b4ac6a8148cc5f22787ecf Mon Sep 17 00:00:00 2001
From: Jason Volk <jason@zemos.net>
Date: Sat, 10 Oct 2020 01:18:37 -0700
Subject: [PATCH] ircd::simd: Add closure to stream interface; include in
 stack.

---
 include/ircd/simd/simd.h   |   1 +
 include/ircd/simd/stream.h | 118 +++++++++++++++++++++----------
 2 files changed, 69 insertions(+), 50 deletions(-)

diff --git a/include/ircd/simd/simd.h b/include/ircd/simd/simd.h
index 32c8a092b..9347accdc 100644
--- a/include/ircd/simd/simd.h
+++ b/include/ircd/simd/simd.h
@@ -32,6 +32,7 @@
 #include "lateral.h"
 #include "any.h"
 #include "all.h"
+#include "stream.h"
 #include "for_each.h"
 #include "transform.h"
 #include "generate.h"
diff --git a/include/ircd/simd/stream.h b/include/ircd/simd/stream.h
index 580fe3958..2ce5cfa2a 100644
--- a/include/ircd/simd/stream.h
+++ b/include/ircd/simd/stream.h
@@ -13,26 +13,17 @@
 
 namespace ircd::simd
 {
-    size_t stream_aligned(const mutable_buffer &dst, const const_buffer &src);
-}
+    // Using the AVX512 vector type by default as it conveniently matches the
+    // cache-line size on the relevant platforms and simplifies our syntax below
+    // by being a single object. On those w/o AVX512 it uses an isomorphic
+    // configuration of the best available regs.
+    using stream_line_t = u512x1;
+
+    template<class block_t>
+    using stream_proto = void (block_t &);
 
-/// Non-temporal copy. This copies from an aligned source to an aligned
-/// destination without the data cycling through the d-cache. The alignment
-/// requirements are currently very strict. The source and destination buffers
-/// must begin at a cache-line alignment and the size of the buffers must be
-/// a multiple of something we'll call "register-file size" which is the size
-/// of all named multimedia registers (256 for SSE, 512 for AVX, 2048 for
-/// AVX512) so it's safe to say buffers should just be aligned and padded out
-/// to 4K page-size to be safe. The size of the src argument itself can be an
-/// arbitrary size and this function will return that size, but its backing
-/// buffer must be padded out to alignment.
-///
-inline size_t
-ircd::simd::stream_aligned(const mutable_buffer &dst,
-                           const const_buffer &src)
-{
     // Platforms that have non-temporal store support; this is all of x86_64
-    constexpr bool has_store
+    constexpr bool stream_has_store
     {
 #if defined(__SSE2__) && !defined(RB_GENERIC)
         true
@@ -43,7 +34,7 @@ ircd::simd::stream_aligned(const mutable_buffer &dst,
 
     // Platforms that have non-temporal load support; sidenote SSE4.1 can do
    // 16 byte loads and AVX2 can do 32 byte loads; SSE2 cannot do loads.
-    constexpr bool has_load
+    constexpr bool stream_has_load
     {
 #if defined(__AVX__) && !defined(RB_GENERIC)
         true
@@ -52,17 +43,11 @@ ircd::simd::stream_aligned(const mutable_buffer &dst,
 #endif
     };
 
-    // Use the AVX512 vector type unconditionally as it conveniently matches
-    // the cache-line size on the relevant platforms and simplifies our syntax
-    // below by being a single object. On those w/o AVX512 it uses an
-    // isomorphic configuration of the best available regs.
-    using block_type = u512x1;
-
     // The number of cache lines we'll have "in flight" which is basically
     // just a gimmick to unroll the loop such that each iteration covers
     // the full register file. On SSE with 256 bytes of register file we can
     // name 4 cache lines at once; on AVX with 512 bytes we can name 8, etc.
-    constexpr size_t file_lines
+    constexpr size_t stream_max_lines
     {
 #if defined(__AVX512F__)
         32
@@ -76,21 +61,48 @@ ircd::simd::stream_aligned(const mutable_buffer &dst,
     // Configurable magic number only relevant to SSE2 systems which don't have
     // non-temporal load instructions. On these platforms we'll conduct a
     // prefetch loop and mark the lines NTA.
-    constexpr size_t latency
+    constexpr size_t stream_latency
     {
         16
     };
 
-    // When the constexpr conditions aren't favorable we can fallback to
-    // normal copy here.
-    if constexpr(!has_store && !has_load)
-        return copy(dst, src);
+    template<size_t bandwidth = stream_max_lines,
+             class lambda = stream_proto<stream_line_t>>
+    mutable_buffer
+    stream(const mutable_buffer &, const const_buffer &, lambda&&) noexcept;
+}
+
+/// Non-temporal stream. This copies from an aligned source to an aligned
+/// destination without the data cycling through the d-cache, applying the
+/// closure to each line as it passes through the register file. The
+/// alignment requirements are currently very strict: both buffers must
+/// begin at a cache-line alignment and their sizes must be a multiple of
+/// something we'll call "register-file size" which is the size of all
+/// named multimedia registers (256 for SSE, 512 for AVX, 2048 for AVX512),
+/// so it's simplest to align and pad buffers out to the 4K page-size. The
+/// size of the src argument can be arbitrary; this function returns a
+/// buffer of that size at the destination.
+///
+template<size_t bandwidth,
+         class lambda>
+inline ircd::mutable_buffer
+ircd::simd::stream(const mutable_buffer &dst,
+                   const const_buffer &src,
+                   lambda&& closure)
+noexcept
+{
+    using line_t = stream_line_t;
+
+    constexpr auto file_lines
+    {
+        std::min(bandwidth, stream_max_lines)
+    };
 
     // Assert valid arguments
     assert(!overlap(src, dst));
-    assert(aligned(data(src), sizeof(block_type)));
-    assert(aligned(data(dst), sizeof(block_type)));
-    assert(size(dst) % (sizeof(block_type) * file_lines));
+    assert(aligned(data(src), sizeof(line_t)));
+    assert(aligned(data(dst), sizeof(line_t)));
+    assert(size(dst) % (sizeof(line_t) * file_lines) == 0);
 
     // Size in bytes to be copied
     const size_t copy_size
@@ -101,50 +113,56 @@
     // Number of lines to be copied.
     const size_t copy_lines
     {
-        (copy_size / sizeof(block_type)) + bool(copy_size % sizeof(block_type))
+        (copy_size / sizeof(line_t)) + bool(copy_size % sizeof(line_t))
     };
 
     // destination base ptr
-    block_type *const __restrict__ out
+    line_t *const __restrict__ out
     {
-        reinterpret_cast<block_type *>(data(dst))
+        reinterpret_cast<line_t *>(data(dst))
     };
 
     // source base ptr
-    const block_type *const __restrict__ in
+    const line_t *const __restrict__ in
     {
-        reinterpret_cast<const block_type *>(data(src))
+        reinterpret_cast<const line_t *>(data(src))
     };
 
-    if constexpr(!has_load)
+    if constexpr(!stream_has_load)
         #pragma clang loop unroll(disable)
-        for(size_t i(0); i < latency; ++i)
+        for(size_t i(0); i < stream_latency; ++i)
             __builtin_prefetch(in + i, 0, 0);
 
     for(size_t i(0); i < copy_lines; i += file_lines)
     {
-        if constexpr(!has_load)
+        if constexpr(!stream_has_load)
             for(size_t j(0); j < file_lines; ++j)
-                __builtin_prefetch(in + i + latency + j, 0, 0);
+                __builtin_prefetch(in + i + stream_latency + j, 0, 0);
 
-        block_type block[file_lines];
+        line_t line[file_lines];
         for(size_t j(0); j < file_lines; ++j)
 #if defined(__clang__)
-            block[j] = __builtin_nontemporal_load(in + i + j);
+            line[j] = __builtin_nontemporal_load(in + i + j);
 #else
-            block[j] = *(in + i + j); //TODO: XXX
+            line[j] = in[i + j];
 #endif
 
+        for(size_t j(0); j < file_lines; ++j)
+            closure(line[j]);
+
         for(size_t j(0); j < file_lines; ++j)
 #if defined(__clang__)
-            __builtin_nontemporal_store(block[j], out + i + j);
+            __builtin_nontemporal_store(line[j], out + i + j);
 #else
-            *(out + i + j) = block[j]; //TODO: XXX
+            *(out + i + j) = line[j]; //TODO: XXX
 #endif
     }
 
-    if constexpr(has_store)
+    if constexpr(stream_has_store)
         asm volatile ("sfence");
 
-    return copy_size;
+    return mutable_buffer
+    {
+        data(dst), copy_size
+    };
 }
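
For illustration, a minimal sketch of calling the new interface; the buffer
names and sizes here are hypothetical, and both buffers are assumed to be
cache-line aligned and padded out as the doc comment requires. A no-op
closure recovers the plain non-temporal copy previously performed by
stream_aligned().

    alignas(64) char from[4096], to[4096];

    const ircd::const_buffer src
    {
        from, sizeof(from)
    };

    const ircd::mutable_buffer dst
    {
        to, sizeof(to)
    };

    // Stream src into dst eight cache-lines per loop iteration; the closure
    // observes each 64-byte line while it is still held in registers.
    const auto copied
    {
        ircd::simd::stream<8>(dst, src, [](ircd::simd::stream_line_t &line)
        {
            // Transform the line in-register here; leaving it untouched
            // makes this a straight non-temporal copy.
        })
    };

The explicit template argument requests an unroll bandwidth of eight lines;
std::min() clamps it against stream_max_lines, so requesting more than the
platform's register file can name is harmless.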