ircd::simd: Split and improve stream template util naming.

2025-01-13 16:33:53 +01:00 · 2020-10-09 00:39:10 -07:00 · 2020-10-09 00:39:10 -07:00 · 20f9301158
commit 20f9301158
parent 21d681d59e
5 changed files with 552 additions and 551 deletions
--- a/include/ircd/simd/accumulate.h
+++ b/include/ircd/simd/accumulate.h
@ -16,9 +16,7 @@ namespace ircd::simd
 	/// Transform block_t by pseudo-reference. The closure has an opportunity
 	/// to modify the block while it is being streamed from the source to the
 	/// destination. The mask indicates which elements of the block are valid
-	/// if the input is smaller than the block size. This function returns
-	/// a pair of integers which advance the output and input positions of the
-	/// streams for the next iteration.
+	/// if the input is smaller than the block size.
 	template<class block_t>
 	using accumulate_prototype = void (block_t &, block_t, block_t mask);

--- a/include/ircd/simd/iostream.h
+++ b/include/ircd/simd/iostream.h
@ -0,0 +1,255 @@
+// The Construct
+//
+// Copyright (C) The Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2020 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_SIMD_IOSTREAM_H
+
+namespace ircd::simd
+{
+	template<class block_t>
+	using iostream_fixed_proto = void (block_t &, block_t mask);
+
+	template<class block_t>
+	using iostream_variable_proto = u64x2 (block_t &, block_t mask);
+
+	template<class block_t,
+	         class lambda>
+	using iostream_is_fixed_stride = std::is_same
+	<
+		std::invoke_result_t<lambda, block_t &, block_t>, void
+	>;
+
+	template<class block_t,
+	         class lambda>
+	using iostream_is_variable_stride = std::is_same
+	<
+		std::invoke_result_t<lambda, block_t &, block_t>, u64x2
+	>;
+
+	template<class block_t,
+	         class lambda>
+	using iostream_fixed_stride = std::enable_if
+	<
+		iostream_is_fixed_stride<block_t, lambda>::value, u64x2
+	>;
+
+	template<class block_t,
+	         class lambda>
+	using iostream_variable_stride = std::enable_if
+	<
+		iostream_is_variable_stride<block_t, lambda>::value, u64x2
+	>;
+
+	template<class block_t,
+	         class lambda>
+	typename iostream_fixed_stride<block_t, lambda>::type
+	stream(char *, const char *, const u64x2, lambda&&) noexcept;
+
+	template<class block_t,
+	         class lambda>
+	typename iostream_variable_stride<block_t, lambda>::type
+	stream(char *, const char *, const u64x2, lambda&&) noexcept;
+}
+
+/// Streaming transform
+///
+/// This template performs the loop boiler-plate for the developer who can
+/// simply supply a conforming closure. Characteristics:
+///
+/// * byte-aligned (unaligned): the input and output buffers do not have to
+/// be aligned and can be any size.
+///
+/// * full-duplex: the operation involves both input and output and there are
+/// separate pointers for progress across the input and output buffers which
+/// are incremented independently.
+///
+/// * variable-stride: progress for each iteration of the loop across the input
+/// and output buffers is not fixed; the transform function may advance either
+/// pointer zero to sizeof(block_t) bytes each iteration. Due to these
+/// characteristics, unaligned bytes may be redundantly loaded or stored and
+/// non-temporal features are not used to optimize the operation.
+///
+/// u64x2 counter lanes = { output_length, input_length }; The argument `max`
+/// gives the buffer size in that format. The return value is the consumed
+/// bytes (final counter value) in that format.
+///
+template<class block_t,
+         class lambda>
+inline typename ircd::simd::iostream_variable_stride<block_t, lambda>::type
+ircd::simd::stream(char *const __restrict__ out,
+                   const char *const __restrict__ in,
+                   const u64x2 max,
+                   lambda&& closure)
+noexcept
+{
+	using block_t_u = unaligned<block_t>;
+
+	u64x2 count
+	{
+		0, // output pos
+		0, // input pos
+	};
+
+	// primary broadband loop
+	while(count[1] + sizeof(block_t) <= max[1] && count[0] + sizeof(block_t) <= max[0])
+	{
+		static const auto mask
+		{
+			~block_t{0}
+		};
+
+		const auto di
+		{
+			reinterpret_cast<block_t_u *>(out + count[0])
+		};
+
+		const auto si
+		{
+			reinterpret_cast<const block_t_u *>(in + count[1])
+		};
+
+		block_t block
+		(
+			*si
+		);
+
+		const auto consume
+		{
+			closure(block, mask)
+		};
+
+		count += consume;
+		*di = block;
+	}
+
+	// trailing narrowband loop
+	while(count[1] < max[1])
+	{
+		block_t block {0}, mask {0};
+		for(size_t i(0); count[1] + i < max[1] && i < sizeof(block_t); ++i)
+		{
+			block[i] = in[count[1] + i];
+			mask[i] = 0xff;
+		}
+
+		const auto consume
+		{
+			closure(block, mask)
+		};
+
+		for(size_t i(0); i < consume[0] && count[0] + i < max[0]; ++i)
+			out[count[0] + i] = block[i];
+
+		count += consume;
+	}
+
+	return u64x2
+	{
+		std::min(count[0], max[0]),
+		std::min(count[1], max[1]),
+	};
+}
+
+/// Streaming transform
+///
+/// This template performs the loop boiler-plate for the developer who can
+/// simply supply a conforming closure. Characteristics:
+///
+/// * byte-aligned (unaligned): the input and output buffers do not have to
+/// be aligned and can be any size.
+///
+/// * full-duplex: the operation involves both input and output and there are
+/// separate pointers for progress across the input and output buffers which
+/// are incremented independently.
+///
+/// * fixed-stride: progress for each iteration of the loop across the input
+/// and output buffers is fixed at the block width; the closure returns void.
+///
+/// u64x2 counter lanes = { output_length, input_length }; The argument `max`
+/// gives the buffer size in that format. The return value is the consumed
+/// bytes (final counter value) in that format.
+///
+template<class block_t,
+         class lambda>
+inline typename ircd::simd::iostream_fixed_stride<block_t, lambda>::type
+ircd::simd::stream(char *const __restrict__ out,
+                   const char *const __restrict__ in,
+                   const u64x2 max,
+                   lambda&& closure)
+noexcept
+{
+	using block_t_u = unaligned<block_t>;
+
+	u64x2 count
+	{
+		0, // output pos
+		0, // input pos
+	};
+
+	// primary broadband loop
+	while(count[1] + sizeof(block_t) <= max[1] && count[0] + sizeof(block_t) <= max[0])
+	{
+		static const u64x2 consume
+		{
+			sizeof(block_t),
+			sizeof(block_t),
+		};
+
+		static const auto mask
+		{
+			~block_t{0}
+		};
+
+		const auto di
+		{
+			reinterpret_cast<block_t_u *>(out + count[0])
+		};
+
+		const auto si
+		{
+			reinterpret_cast<const block_t_u *>(in + count[1])
+		};
+
+		block_t block
+		(
+			*si
+		);
+
+		closure(block, mask);
+		count += consume;
+		*di = block;
+	}
+
+	// trailing narrowband loop
+	assert(count[1] + sizeof(block_t) > max[1]);
+	if(likely(count[1] < max[1]))
+	{
+		u64 i[2] {0};
+		block_t block {0}, mask {0};
+		for(; count[1] + i[1] < max[1]; ++i[1])
+		{
+			block[i[1]] = in[count[1] + i[1]];
+			mask[i[1]] = 0xff;
+		}
+
+		closure(block, mask);
+		for(; i[0] < i[1] && count[0] + i[0] < max[0]; ++i[0])
+			out[count[0] + i[0]] = block[i[0]];
+
+		count += u64x2
+		{
+			i[0], i[1]
+		};
+	}
+
+	assert(count[0] == max[0]);
+	assert(count[1] == max[1]);
+	return count;
+}
--- a/include/ircd/simd/istream.h
+++ b/include/ircd/simd/istream.h
@ -0,0 +1,294 @@
+// The Construct
+//
+// Copyright (C) The Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2020 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_SIMD_ISTREAM_H
+
+namespace ircd::simd
+{
+	template<class block_t>
+	using istream_fixed_proto = void (block_t, block_t mask);
+
+	template<class block_t>
+	using istream_variable_proto = u64x2 (block_t, block_t mask);
+
+	template<class block_t,
+	         class lambda>
+	using istream_is_fixed_stride = std::is_same
+	<
+		std::invoke_result_t<lambda, block_t, block_t>, void
+	>;
+
+	template<class block_t,
+	         class lambda>
+	using istream_is_variable_stride = std::is_same
+	<
+		std::invoke_result_t<lambda, block_t, block_t>, u64x2
+	>;
+
+	template<class block_t,
+	         class lambda>
+	using istream_fixed_stride = std::enable_if
+	<
+		istream_is_fixed_stride<block_t, lambda>::value, u64x2
+	>;
+
+	template<class block_t,
+	         class lambda>
+	using istream_variable_stride = std::enable_if
+	<
+		istream_is_variable_stride<block_t, lambda>::value, u64x2
+	>;
+
+	template<class block_t,
+	         class lambda>
+	typename istream_fixed_stride<block_t, lambda>::type
+	stream(const block_t *, const u64x2, lambda&&) noexcept;
+
+	template<class block_t,
+	         class lambda>
+	typename istream_fixed_stride<block_t, lambda>::type
+	stream(const char *, const u64x2, lambda&&) noexcept;
+
+	template<class block_t,
+	         class lambda>
+	typename istream_variable_stride<block_t, lambda>::type
+	stream(const char *, const u64x2, lambda&&) noexcept;
+}
+
+/// Streaming consumer
+///
+/// This template performs the loop boiler-plate for the developer who can
+/// simply supply a conforming closure. Characteristics:
+///
+/// * byte-aligned (unaligned): the input buffer does not have to be aligned
+/// and can be any size.
+///
+/// * variable-stride: progress for each iteration of the loop across the input
+/// buffer is not fixed; the closure may advance the input pointer by one to
+/// sizeof(block_t) bytes each iteration. Due to these characteristics,
+/// unaligned bytes may be redundantly loaded and non-temporal features are
+/// not used to optimize the operation.
+///
+/// u64x2 counter lanes = { available_to_user, input_length }; The argument
+/// `max` gives the buffer size in that format. The return value is the
+/// consumed bytes (final counter value) in that format. The first lane is
+/// available to the user: it starts at max[0] (not otherwise used here), the
+/// first lane of each closure return value is added to it, and its final
+/// value is returned in lane [0] of the return value.
+///
+/// Note that the closure must advance the stream by one or more bytes each
+/// iteration; returning zero in the input lane [1] is reserved for loop
+/// control: the loop breaks without another iteration.
+///
+template<class block_t,
+         class lambda>
+inline typename ircd::simd::istream_variable_stride<block_t, lambda>::type
+ircd::simd::stream(const char *const __restrict__ in,
+                   const u64x2 max,
+                   lambda&& closure)
+noexcept
+{
+	using block_t_u = unaligned<block_t>;
+
+	u64x2 count
+	{
+		max[0], // preserved for caller
+		0,      // input pos
+	};
+
+	u64x2 consume
+	{
+		0,
+		-1UL    // non-zero to start loop
+	};
+
+	// primary broadband loop
+	while(consume[1] && count[1] + sizeof(block_t) <= max[1])
+	{
+		static const auto mask
+		{
+			~block_t{0}
+		};
+
+		const auto si
+		{
+			reinterpret_cast<const block_t_u *>(in + count[1])
+		};
+
+		const block_t block
+		(
+			*si
+		);
+
+		consume = closure(block, mask);
+		count += consume;
+	}
+
+	// trailing narrowband loop
+	while(consume[1] && count[1] < max[1])
+	{
+		block_t block {0}, mask {0};
+		for(size_t i(0); count[1] + i < max[1] && i < sizeof(block_t); ++i)
+		{
+			block[i] = in[count[1] + i];
+			mask[i] = 0xff;
+		}
+
+		consume = closure(block, mask);
+		count += consume;
+	}
+
+	return u64x2
+	{
+		count[0],
+		std::min(count[1], max[1])
+	};
+}
+
+/// Streaming consumer
+///
+/// This template performs the loop boiler-plate for the developer who can
+/// simply supply a conforming closure. Characteristics:
+///
+/// * byte-aligned (unaligned): the input buffer does not have to be aligned
+/// and can be any size.
+///
+/// * fixed-stride: progress for each iteration of the loop across the input
+/// buffer is fixed at the block width; the closure does not control the
+/// iteration.
+///
+/// u64x2 counter lanes = { available_to_user, input_length }; The argument
+/// `max` gives the buffer size in that format. The return value is the
+/// consumed bytes (final counter value) in that format. The first lane is
+/// not used by the loop; max[0] is returned unchanged in lane [0].
+///
+template<class block_t,
+         class lambda>
+inline typename ircd::simd::istream_fixed_stride<block_t, lambda>::type
+ircd::simd::stream(const char *const __restrict__ in,
+                   const u64x2 max,
+                   lambda&& closure)
+noexcept
+{
+	using block_t_u = unaligned<block_t>;
+
+	u64x2 count
+	{
+		max[0], // preserved for caller
+		0,      // input pos
+	};
+
+	// primary broadband loop
+	while(count[1] + sizeof(block_t) <= max[1])
+	{
+		static const u64x2 consume
+		{
+			0, sizeof(block_t)
+		};
+
+		static const auto mask
+		{
+			~block_t{0}
+		};
+
+		const auto si
+		{
+			reinterpret_cast<const block_t_u *>(in + count[1])
+		};
+
+		const block_t block
+		(
+			*si
+		);
+
+		closure(block, mask);
+		count += consume;
+	}
+
+	// trailing narrowband loop
+	assert(count[1] + sizeof(block_t) > max[1]);
+	if(likely(count[1] < max[1]))
+	{
+		size_t i(0);
+		block_t block {0}, mask {0};
+		for(; count[1] + i < max[1]; ++i)
+		{
+			block[i] = in[count[1] + i];
+			mask[i] = 0xff;
+		}
+
+		closure(block, mask);
+		count += u64x2 // consume remainder
+		{
+			0, i
+		};
+	}
+
+	// return value is pure
+	assert(count[0] == max[0]);
+	assert(count[1] == max[1]);
+	return count;
+}
+
+/// Streaming consumer
+///
+/// This template performs the loop boiler-plate for the developer who can
+/// simply supply a conforming closure. Characteristics:
+///
+/// * block-aligned
+/// * fixed-stride
+///
+template<class block_t,
+         class lambda>
+inline typename ircd::simd::istream_fixed_stride<block_t, lambda>::type
+ircd::simd::stream(const block_t *const __restrict__ in,
+                   const u64x2 max,
+                   lambda&& closure)
+noexcept
+{
+	u64x2 count
+	{
+		max[0], // preserved for caller
+		0,      // input pos
+	};
+
+	// primary broadband loop
+	while(count[1] < max[1])
+	{
+		static const u64x2 consume
+		{
+			0, sizeof(block_t)
+		};
+
+		static const auto mask
+		{
+			~block_t{0}
+		};
+
+		const auto si
+		{
+			in + count[1] / sizeof(block_t)
+		};
+
+		const block_t block
+		(
+			*si
+		);
+
+		closure(block, mask);
+		count += consume;
+	}
+
+	assert(count[1] + sizeof(block_t) > max[1]);
+	assert(count[0] == max[0]);
+	assert(count[1] == max[1]);
+	return count;
+}
--- a/include/ircd/simd/simd.h
+++ b/include/ircd/simd/simd.h
@ -32,7 +32,8 @@
 #include "lateral.h"
 #include "any.h"
 #include "all.h"
-#include "stream.h"
+#include "istream.h"
+#include "iostream.h"
 #include "accumulate.h"
 #include "print.h"

--- a/include/ircd/simd/stream.h
+++ b/include/ircd/simd/stream.h
@ -1,547 +0,0 @@
-// The Construct
-//
-// Copyright (C) The Construct Developers, Authors & Contributors
-// Copyright (C) 2016-2020 Jason Volk <jason@zemos.net>
-//
-// Permission to use, copy, modify, and/or distribute this software for any
-// purpose with or without fee is hereby granted, provided that the above
-// copyright notice and this permission notice is present in all copies. The
-// full license for this software is available in the LICENSE file.
-
-#pragma once
-#define HAVE_IRCD_SIMD_STREAM_H
-
-// half-duplex fixed stride
-namespace ircd::simd
-{
-	template<class block_t>
-	using stream_half_fixed_proto = void (block_t, block_t mask);
-
-	template<class block_t,
-	         class lambda>
-	using stream_is_half_fixed_stride = std::is_same
-	<
-		std::invoke_result_t<lambda, block_t, block_t>, void
-	>;
-
-	template<class block_t,
-	         class lambda>
-	using stream_half_fixed_stride = std::enable_if
-	<
-		stream_is_half_fixed_stride<block_t, lambda>::value, u64x2
-	>;
-
-	template<class block_t,
-	         class lambda>
-	typename stream_half_fixed_stride<block_t, lambda>::type
-	stream(const block_t *, const u64x2, lambda&&) noexcept;
-
-	template<class block_t,
-	         class lambda>
-	typename stream_half_fixed_stride<block_t, lambda>::type
-	stream(const char *, const u64x2, lambda&&) noexcept;
-}
-
-// half-duplex variable stride
-namespace ircd::simd
-{
-	template<class block_t>
-	using stream_half_variable_proto = u64x2 (block_t, block_t mask);
-
-	template<class block_t,
-	         class lambda>
-	using stream_is_half_variable_stride = std::is_same
-	<
-		std::invoke_result_t<lambda, block_t, block_t>, u64x2
-	>;
-
-	template<class block_t,
-	         class lambda>
-	using stream_half_variable_stride = std::enable_if
-	<
-		stream_is_half_variable_stride<block_t, lambda>::value, u64x2
-	>;
-
-	template<class block_t,
-	         class lambda>
-	typename stream_half_variable_stride<block_t, lambda>::type
-	stream(const char *, const u64x2, lambda&&) noexcept;
-}
-
-// full-duplex fixed stride
-namespace ircd::simd
-{
-	template<class block_t>
-	using stream_full_fixed_proto = void (block_t &, block_t mask);
-
-	template<class block_t,
-	         class lambda>
-	using stream_is_full_fixed_stride = std::is_same
-	<
-		std::invoke_result_t<lambda, block_t &, block_t>, void
-	>;
-
-	template<class block_t,
-	         class lambda>
-	using stream_full_fixed_stride = std::enable_if
-	<
-		stream_is_full_fixed_stride<block_t, lambda>::value, u64x2
-	>;
-
-	template<class block_t,
-	         class lambda>
-	typename stream_full_fixed_stride<block_t, lambda>::type
-	stream(char *, const char *, const u64x2, lambda&&) noexcept;
-}
-
-// full-duplex variable stride
-namespace ircd::simd
-{
-	template<class block_t>
-	using stream_full_variable_proto = u64x2 (block_t &, block_t mask);
-
-	template<class block_t,
-	         class lambda>
-	using stream_is_full_variable_stride = std::is_same
-	<
-		std::invoke_result_t<lambda, block_t &, block_t>, u64x2
-	>;
-
-	template<class block_t,
-	         class lambda>
-	using stream_full_variable_stride = std::enable_if
-	<
-		stream_is_full_variable_stride<block_t, lambda>::value, u64x2
-	>;
-
-	template<class block_t,
-	         class lambda>
-	typename stream_full_variable_stride<block_t, lambda>::type
-	stream(char *, const char *, const u64x2, lambda&&) noexcept;
-}
-
-/// Streaming transform
-///
-/// This template performs the loop boiler-plate for the developer who can
-/// simply supply a conforming closure. Characteristics:
-///
-/// * byte-aligned (unaligned): the input and output buffers do not have to
-/// be aligned and can be any size.
-///
-/// * full-duplex: the operation involves both input and output and there are
-/// separate pointers for progress across the input and output buffers which
-/// are incremented independently.
-///
-/// * variable-stride: progress for each iteration of the loop across the input
-/// and output buffers is not fixed; the transform function may advance either
-/// pointer zero to sizeof(block_t) bytes each iteration. Due to these
-/// characteristics, unaligned bytes may be redundantly loaded or stored and
-/// non-temporal features are not used to optimize the operation.
-///
-/// u64x2 counter lanes = { output_length, input_length }; The argument `max`
-/// gives the buffer size in that format. The return value is the consumed
-/// bytes (final counter value) in that format.
-///
-template<class block_t,
-         class lambda>
-inline typename ircd::simd::stream_full_variable_stride<block_t, lambda>::type
-ircd::simd::stream(char *const __restrict__ out,
-                   const char *const __restrict__ in,
-                   const u64x2 max,
-                   lambda&& closure)
-noexcept
-{
-	using block_t_u = unaligned<block_t>;
-
-	u64x2 count
-	{
-		0, // output pos
-		0, // input pos
-	};
-
-	// primary broadband loop
-	while(count[1] + sizeof(block_t) <= max[1] && count[0] + sizeof(block_t) <= max[0])
-	{
-		static const auto mask
-		{
-			~block_t{0}
-		};
-
-		const auto di
-		{
-			reinterpret_cast<block_t_u *>(out + count[0])
-		};
-
-		const auto si
-		{
-			reinterpret_cast<const block_t_u *>(in + count[1])
-		};
-
-		block_t block
-		(
-			*si
-		);
-
-		const auto consume
-		{
-			closure(block, mask)
-		};
-
-		count += consume;
-		*di = block;
-	}
-
-	// trailing narrowband loop
-	while(count[1] < max[1])
-	{
-		block_t block {0}, mask {0};
-		for(size_t i(0); count[1] + i < max[1] && i < sizeof(block_t); ++i)
-		{
-			block[i] = in[count[1] + i];
-			mask[i] = 0xff;
-		}
-
-		const auto consume
-		{
-			closure(block, mask)
-		};
-
-		for(size_t i(0); i < consume[0] && count[0] + i < max[0]; ++i)
-			out[count[0] + i] = block[i];
-
-		count += consume;
-	}
-
-	return u64x2
-	{
-		std::min(count[0], max[0]),
-		std::min(count[1], max[1]),
-	};
-}
-
-/// Streaming transform
-///
-/// This template performs the loop boiler-plate for the developer who can
-/// simply supply a conforming closure. Characteristics:
-///
-/// * byte-aligned (unaligned): the input and output buffers do not have to
-/// be aligned and can be any size.
-///
-/// * full-duplex: the operation involves both input and output and there are
-/// separate pointers for progress across the input and output buffers which
-/// are incremented independently.
-///
-/// * fixed-stride: progress for each iteration of the loop across the input
-/// and output buffers is fixed.
-///
-/// u64x2 counter lanes = { output_length, input_length }; The argument `max`
-/// gives the buffer size in that format. The return value is the consumed
-/// bytes (final counter value) in that format.
-///
-template<class block_t,
-         class lambda>
-inline typename ircd::simd::stream_full_fixed_stride<block_t, lambda>::type
-ircd::simd::stream(char *const __restrict__ out,
-                   const char *const __restrict__ in,
-                   const u64x2 max,
-                   lambda&& closure)
-noexcept
-{
-	using block_t_u = unaligned<block_t>;
-
-	u64x2 count
-	{
-		0, // output pos
-		0, // input pos
-	};
-
-	// primary broadband loop
-	while(count[1] + sizeof(block_t) <= max[1] && count[0] + sizeof(block_t) <= max[0])
-	{
-		static const u64x2 consume
-		{
-			sizeof(block_t),
-			sizeof(block_t),
-		};
-
-		static const auto mask
-		{
-			~block_t{0}
-		};
-
-		const auto di
-		{
-			reinterpret_cast<block_t_u *>(out + count[0])
-		};
-
-		const auto si
-		{
-			reinterpret_cast<const block_t_u *>(in + count[1])
-		};
-
-		block_t block
-		(
-			*si
-		);
-
-		closure(block, mask);
-		count += consume;
-		*di = block;
-	}
-
-	// trailing narrowband loop
-	assert(count[1] + sizeof(block_t) > max[1]);
-	if(likely(count[1] < max[1]))
-	{
-		u64 i[2] {0};
-		block_t block {0}, mask {0};
-		for(; count[1] + i[1] < max[1]; ++i[1])
-		{
-			block[i[1]] = in[count[1] + i[1]];
-			mask[i[1]] = 0xff;
-		}
-
-		closure(block, mask);
-		for(; i[0] < i[1] && count[0] + i[0] < max[0]; ++i[0])
-			out[count[0] + i[0]] = block[i[0]];
-
-		count += u64x2
-		{
-			i[0], i[1]
-		};
-	}
-
-	assert(count[0] == max[0]);
-	assert(count[1] == max[1]);
-	return count;
-}
-
-/// Streaming consumer
-///
-/// This template performs the loop boiler-plate for the developer who can
-/// simply supply a conforming closure. Characteristics:
-///
-/// * byte-aligned (unaligned): the input buffer does not have to be aligned
-/// and can be any size.
-///
-/// * variable-stride: progress for each iteration of the loop across the input
-/// and buffer is not fixed; the transform function may advance the pointer
-/// one to sizeof(block_t) bytes each iteration. Due to these characteristics,
-/// unaligned bytes may be redundantly loaded and non-temporal features are
-/// not used to optimize the operation.
-///
-/// u64x2 counter lanes = { available_to_user, input_length }; The argument
-/// `max` gives the buffer size in that format. The return value is the
-/// consumed bytes (final counter value) in that format. The first lane is
-/// available to the user; its initial value is max[0] (also unused); it is
-/// then accumulated with the first lane of the closure's return value; its
-/// final value is returned in [0] of the return value.
-///
-/// Note that the closure must advance the stream one or more bytes each
-/// iteration; a zero value is available for loop control: the loop will
-/// break without another iteration.
-///
-template<class block_t,
-         class lambda>
-inline typename ircd::simd::stream_half_variable_stride<block_t, lambda>::type
-ircd::simd::stream(const char *const __restrict__ in,
-                   const u64x2 max,
-                   lambda&& closure)
-noexcept
-{
-	using block_t_u = unaligned<block_t>;
-
-	u64x2 count
-	{
-		max[0], // preserved for caller
-		0,      // input pos
-	};
-
-	u64x2 consume
-	{
-		0,
-		-1UL    // non-zero to start loop
-	};
-
-	// primary broadband loop
-	while(consume[1] && count[1] + sizeof(block_t) <= max[1])
-	{
-		static const auto mask
-		{
-			~block_t{0}
-		};
-
-		const auto si
-		{
-			reinterpret_cast<const block_t_u *>(in + count[1])
-		};
-
-		const block_t block
-		(
-			*si
-		);
-
-		consume = closure(block, mask);
-		count += consume;
-	}
-
-	// trailing narrowband loop
-	while(consume[1] && count[1] < max[1])
-	{
-		block_t block {0}, mask {0};
-		for(size_t i(0); count[1] + i < max[1] && i < sizeof(block_t); ++i)
-		{
-			block[i] = in[count[1] + i];
-			mask[i] = 0xff;
-		}
-
-		consume = closure(block, mask);
-		count += consume;
-	}
-
-	return u64x2
-	{
-		count[0],
-		std::min(count[1], max[1])
-	};
-}
-
-/// Streaming consumer
-///
-/// This template performs the loop boiler-plate for the developer who can
-/// simply supply a conforming closure. Characteristics:
-///
-/// * byte-aligned (unaligned): the input buffer does not have to be aligned
-/// and can be any size.
-///
-/// * fixed-stride: progress for each iteration of the loop across the input
-/// and buffer is fixed at the block width; the transform function does not
-/// control the iteration.
-///
-/// u64x2 counter lanes = { available_to_user, input_length }; The argument
-/// `max` gives the buffer size in that format. The return value is the
-/// consumed bytes (final counter value) in that format. The first lane is
-/// available to the user; its initial value is max[0] (also unused).
-///
-template<class block_t,
-         class lambda>
-inline typename ircd::simd::stream_half_fixed_stride<block_t, lambda>::type
-ircd::simd::stream(const char *const __restrict__ in,
-                   const u64x2 max,
-                   lambda&& closure)
-noexcept
-{
-	using block_t_u = unaligned<block_t>;
-
-	u64x2 count
-	{
-		max[0], // preserved for caller
-		0,      // input pos
-	};
-
-	// primary broadband loop
-	while(count[1] + sizeof(block_t) <= max[1])
-	{
-		static const u64x2 consume
-		{
-			0, sizeof(block_t)
-		};
-
-		static const auto mask
-		{
-			~block_t{0}
-		};
-
-		const auto si
-		{
-			reinterpret_cast<const block_t_u *>(in + count[1])
-		};
-
-		const block_t block
-		(
-			*si
-		);
-
-		closure(block, mask);
-		count += consume;
-	}
-
-	// trailing narrowband loop
-	assert(count[1] + sizeof(block_t) > max[1]);
-	if(likely(count[1] < max[1]))
-	{
-		size_t i(0);
-		block_t block {0}, mask {0};
-		for(; count[1] + i < max[1]; ++i)
-		{
-			block[i] = in[count[1] + i];
-			mask[i] = 0xff;
-		}
-
-		closure(block, mask);
-		count += u64x2 // consume remainder
-		{
-			0, i
-		};
-	}
-
-	// return value is pure
-	assert(count[0] == max[0]);
-	assert(count[1] == max[1]);
-	return count;
-}
-
-/// Streaming consumer
-///
-/// This template performs the loop boiler-plate for the developer who can
-/// simply supply a conforming closure. Characteristics:
-///
-/// * block-aligned
-/// * fixed-stride
-////
-template<class block_t,
-         class lambda>
-inline typename ircd::simd::stream_half_fixed_stride<block_t, lambda>::type
-ircd::simd::stream(const block_t *const __restrict__ in,
-                   const u64x2 max,
-                   lambda&& closure)
-noexcept
-{
-	u64x2 count
-	{
-		max[0], // preserved for caller
-		0,      // input pos
-	};
-
-	// primary broadband loop
-	while(count[1] < max[1])
-	{
-		static const u64x2 consume
-		{
-			0, sizeof(block_t)
-		};
-
-		static const auto mask
-		{
-			~block_t{0}
-		};
-
-		const auto si
-		{
-			in + count[1] / sizeof(block_t)
-		};
-
-		const block_t block
-		(
-			*si
-		);
-
-		closure(block, mask);
-		count += consume;
-	}
-
-	assert(count[1] + sizeof(block_t) > max[1]);
-	assert(count[0] == max[0]);
-	assert(count[1] == max[1]);
-	return count;
-}