From f1c8eb3c1ce7b280e7660cdd86e8c87c14a9937a Mon Sep 17 00:00:00 2001
From: Jason Volk
Date: Tue, 7 Jul 2020 14:53:05 -0700
Subject: [PATCH] ircd::buffer: Add experimental non-temporal copy device.

---
 include/ircd/buffer/buffer.h |   1 +
 include/ircd/buffer/stream.h | 142 +++++++++++++++++++++++++++++++++++
 2 files changed, 143 insertions(+)
 create mode 100644 include/ircd/buffer/stream.h

diff --git a/include/ircd/buffer/buffer.h b/include/ircd/buffer/buffer.h
index b8076a933..8b99bf7df 100644
--- a/include/ircd/buffer/buffer.h
+++ b/include/ircd/buffer/buffer.h
@@ -105,6 +105,7 @@ namespace ircd::buffer::buffers
 #include "const_buffer.h"
 #include "copy.h"
 #include "move.h"
+#include "stream.h"
 #include "fixed_buffer.h"
 #include "window_buffer.h"
 #include "parse_buffer.h"
diff --git a/include/ircd/buffer/stream.h b/include/ircd/buffer/stream.h
new file mode 100644
index 000000000..94393d558
--- /dev/null
+++ b/include/ircd/buffer/stream.h
@@ -0,0 +1,142 @@
+// The Construct
+//
+// Copyright (C) The Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2020 Jason Volk
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_BUFFER_STREAM_H
+
+namespace ircd::buffer
+{
+	size_t stream_aligned(const mutable_buffer &dst, const const_buffer &src);
+}
+
+/// Non-temporal copy. This copies from an aligned source to an aligned
+/// destination without the data cycling through the d-cache. The alignment
+/// requirements are currently very strict: the source and destination
+/// buffers must begin on a cache-line boundary, and the size of each buffer
+/// must be a multiple of what we'll call the "register-file size", i.e. the
+/// combined size of all named multimedia registers (256 bytes for SSE, 512
+/// for AVX, 2048 for AVX512). In practice, buffers should simply be aligned
+/// and padded out to the 4K page size. The size of the src argument itself
+/// may be arbitrary, and this function will return that size, but src's
+/// backing buffer must still be padded out to the alignment requirements.
+///
+inline size_t
+ircd::buffer::stream_aligned(const mutable_buffer &dst,
+                             const const_buffer &src)
+{
+	// Platforms that have non-temporal store support; this is all of x86_64.
+	constexpr bool has_nontemporal_store
+	{
+		#if defined(__SSE2__) && !defined(RB_GENERIC)
+			true
+		#else
+			false
+		#endif
+	};
+
+	// Platforms that have non-temporal load support. Side note: SSE4.1 can
+	// do 16-byte non-temporal loads and AVX2 can do 32-byte loads, while
+	// SSE2 has no non-temporal loads at all.
+	constexpr bool has_nontemporal_load
+	{
+		#if defined(__AVX__) && !defined(RB_GENERIC)
+			true
+		#else
+			false
+		#endif
+	};
+
+	// Use the AVX512 vector type unconditionally as it conveniently matches
+	// the cache-line size on the relevant platforms and simplifies our
+	// syntax below by being a single object. On platforms without AVX512
+	// the compiler lowers it to an isomorphic configuration of the best
+	// available registers.
+	using block_type = u512x1;
+
+	// The number of cache lines we'll have "in flight," which is basically
+	// just a gimmick to unroll the loop such that each iteration covers the
+	// full register file. On SSE with 256 bytes of register file we can
+	// name 4 cache lines at once; on AVX with 512 bytes we can name 8, etc.
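+	// (For AVX512 the register file is 2048 bytes, naming 32 lines at
+	// once: 2048 / 64 = 32, the figure selected below.)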
+	constexpr size_t file_lines
+	{
+		#if defined(__AVX512F__)
+			32
+		#elif defined(__AVX__)
+			8
+		#else
+			4
+		#endif
+	};
+
+	// Configurable magic number only relevant to SSE2 systems, which lack
+	// non-temporal load instructions. On those platforms we conduct a
+	// prefetch loop running this many lines ahead of the copy, marking the
+	// lines NTA (non-temporal with respect to all cache levels).
+	constexpr size_t latency
+	{
+		16
+	};
+
+	// When the constexpr conditions aren't favorable we fall back to a
+	// normal copy here.
+	if constexpr(!has_nontemporal_store && !has_nontemporal_load)
+		return copy(dst, src);
+
+	// Assert valid arguments: no overlap, cache-line alignment, and a
+	// destination padded to a multiple of the register-file size.
+	assert(!overlap(src, dst));
+	assert(aligned(data(src), sizeof(block_type)));
+	assert(aligned(data(dst), sizeof(block_type)));
+	assert(size(dst) % (sizeof(block_type) * file_lines) == 0);
+
+	// Size in bytes to be copied
+	const size_t copy_size
+	{
+		std::min(size(src), size(dst))
+	};
+
+	// Number of cache lines to be copied, rounded up.
+	const size_t copy_lines
+	{
+		(copy_size / sizeof(block_type)) + bool(copy_size % sizeof(block_type))
+	};
+
+	// Destination base pointer
+	block_type *const __restrict__ out
+	{
+		reinterpret_cast<block_type *>(data(dst))
+	};
+
+	// Source base pointer
+	const block_type *const __restrict__ in
+	{
+		reinterpret_cast<const block_type *>(data(src))
+	};
+
+	// Prime the prefetcher before entering the loop on platforms without
+	// non-temporal loads.
+	if constexpr(!has_nontemporal_load)
+		#pragma clang loop unroll(disable)
+		for(size_t i(0); i < latency; ++i)
+			__builtin_prefetch(in + i, 0, 0);
+
+	for(size_t i(0); i < copy_lines; i += file_lines)
+	{
+		// Prefetch the lines `latency` ahead of this iteration, marked NTA.
+		if constexpr(!has_nontemporal_load)
+			for(size_t j(0); j < file_lines; ++j)
+				__builtin_prefetch(in + i + latency + j, 0, 0);
+
+		// Fill the register file from the source...
+		block_type block[file_lines];
+		for(size_t j(0); j < file_lines; ++j)
+			block[j] = __builtin_nontemporal_load(in + i + j);
+
+		// ...then stream it out to the destination.
+		for(size_t j(0); j < file_lines; ++j)
+			__builtin_nontemporal_store(block[j], out + i + j);
+	}
+
+	// Non-temporal stores are weakly ordered; fence before returning so the
+	// copied data is globally visible to subsequent operations.
+	if constexpr(has_nontemporal_store)
+		asm volatile ("sfence" ::: "memory");
+
+	return copy_size;
+}
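
---

Editor's usage sketch (commentary only; not part of the applied diff). The
contract above puts all alignment responsibility on the caller: both buffers
must start on a cache-line boundary and their backing allocations must be
padded to a register-file multiple, so the simplest safe caller allocates
page-aligned, page-sized storage. The example below is a minimal sketch
assuming the <ircd/ircd.h> umbrella header and the pointer-plus-size buffer
constructors; src_mem, dst_mem, and page_size are illustrative names, not
part of ircd.

	#include <cassert>
	#include <cstdlib>
	#include <cstring>
	#include <ircd/ircd.h> // assumption: umbrella header providing ircd::buffer

	int main()
	{
		// Page-size alignment and padding satisfy the strictest case in the
		// doc comment: cache-line start and register-file-multiple size.
		constexpr std::size_t page_size {4096};

		// Page-aligned, page-padded backing allocations.
		char *const src_mem {static_cast<char *>(std::aligned_alloc(page_size, page_size))};
		char *const dst_mem {static_cast<char *>(std::aligned_alloc(page_size, page_size))};
		assert(src_mem && dst_mem);
		std::memset(src_mem, 'A', page_size);

		// The src view itself may be an arbitrary size; only its backing
		// allocation must be padded out to the alignment requirements.
		const ircd::buffer::const_buffer src {src_mem, 1000};
		const ircd::buffer::mutable_buffer dst {dst_mem, page_size};

		// Streams min(size(src), size(dst)) bytes past the d-cache and
		// returns that count; 1000 here.
		const std::size_t copied {ircd::buffer::stream_aligned(dst, src)};

		std::free(src_mem);
		std::free(dst_mem);
		return copied == 1000 ? 0 : 1;
	}

Note that the alignment contract is enforced only by assert(), so a release
build compiled with NDEBUG will not diagnose an unaligned or unpadded caller.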