ircd::simd: Make stream template into enable_if complex w/ closure deductions.

2025-03-14 05:20:17 +01:00 · 2020-10-06 19:41:47 -07:00 · 2020-10-06 19:41:47 -07:00 · 638efbc0e9
commit 638efbc0e9
parent ef715a637d
2 changed files with 278 additions and 18 deletions
--- a/include/ircd/simd/accumulate.h
+++ b/include/ircd/simd/accumulate.h
@ -38,18 +38,12 @@ ircd::simd::accumulate(const char *const __restrict__ in,
                       lambda&& closure)
 noexcept
 {
-	static const u64x2 block_stride
-	{
-		max[0], sizeof(block_t)
-	};
-
 	const u64x2 res
 	{
 		stream<block_t>(in, max, [&val, &closure]
 		(const auto block, const auto mask)
 		{
 			closure(val, block, mask);
-			return block_stride;
 		})
 	};

--- a/include/ircd/simd/stream.h
+++ b/include/ircd/simd/stream.h
@ -11,24 +11,108 @@
 #pragma once
 #define HAVE_IRCD_SIMD_STREAM_H

+// half-duplex fixed stride
 namespace ircd::simd
 {
-	/// Transform block_t by pseudo-reference. The closure has an opportunity
-	/// to modify the block while it is being streamed from the source to the
-	/// destination. The mask indicates which elements of the block are valid
-	/// if the input is smaller than the block size. This function returns
-	/// a pair of integers which advance the output and input positions of the
-	/// streams for the next iteration.
 	template<class block_t>
-	using transform_prototype = u64x2 (block_t &, block_t mask);
+	using stream_half_fixed_proto = void (block_t, block_t mask);

 	template<class block_t,
 	         class lambda>
-	u64x2 stream(const char *, const u64x2, lambda&&) noexcept;
+	using stream_is_half_fixed_stride = std::is_same
+	<
+		std::invoke_result_t<lambda, block_t, block_t>, void
+	>;

 	template<class block_t,
 	         class lambda>
-	u64x2 stream(char *, const char *, const u64x2, lambda&&) noexcept;
+	using stream_half_fixed_stride = std::enable_if
+	<
+		stream_is_half_fixed_stride<block_t, lambda>::value, u64x2
+	>;
+
+	template<class block_t,
+	         class lambda>
+	typename stream_half_fixed_stride<block_t, lambda>::type
+	stream(const char *, const u64x2, lambda&&) noexcept;
+}
+
+// half-duplex variable stride
+namespace ircd::simd
+{
+	template<class block_t>
+	using stream_half_variable_proto = u64x2 (block_t, block_t mask);
+
+	template<class block_t,
+	         class lambda>
+	using stream_is_half_variable_stride = std::is_same
+	<
+		std::invoke_result_t<lambda, block_t, block_t>, u64x2
+	>;
+
+	template<class block_t,
+	         class lambda>
+	using stream_half_variable_stride = std::enable_if
+	<
+		stream_is_half_variable_stride<block_t, lambda>::value, u64x2
+	>;
+
+	template<class block_t,
+	         class lambda>
+	typename stream_half_variable_stride<block_t, lambda>::type
+	stream(const char *, const u64x2, lambda&&) noexcept;
+}
+
+// full-duplex fixed stride
+namespace ircd::simd
+{
+	template<class block_t>
+	using stream_full_fixed_proto = void (block_t &, block_t mask);
+
+	template<class block_t,
+	         class lambda>
+	using stream_is_full_fixed_stride = std::is_same
+	<
+		std::invoke_result_t<lambda, block_t &, block_t>, void
+	>;
+
+	template<class block_t,
+	         class lambda>
+	using stream_full_fixed_stride = std::enable_if
+	<
+		stream_is_full_fixed_stride<block_t, lambda>::value, u64x2
+	>;
+
+	template<class block_t,
+	         class lambda>
+	typename stream_full_fixed_stride<block_t, lambda>::type
+	stream(char *, const char *, const u64x2, lambda&&) noexcept;
+}
+
+// full-duplex variable stride
+namespace ircd::simd
+{
+	template<class block_t>
+	using stream_full_variable_proto = u64x2 (block_t &, block_t mask);
+
+	template<class block_t,
+	         class lambda>
+	using stream_is_full_variable_stride = std::is_same
+	<
+		std::invoke_result_t<lambda, block_t &, block_t>, u64x2
+	>;
+
+	template<class block_t,
+	         class lambda>
+	using stream_full_variable_stride = std::enable_if
+	<
+		stream_is_full_variable_stride<block_t, lambda>::value, u64x2
+	>;
+
+	template<class block_t,
+	         class lambda>
+	typename stream_full_variable_stride<block_t, lambda>::type
+	stream(char *, const char *, const u64x2, lambda&&) noexcept;
 }

 /// Streaming transform
@ -55,7 +139,7 @@ namespace ircd::simd
 ///
 template<class block_t,
         class lambda>
-inline ircd::u64x2
+inline typename ircd::simd::stream_full_variable_stride<block_t, lambda>::type
 ircd::simd::stream(char *const __restrict__ out,
                   const char *const __restrict__ in,
                   const u64x2 max,
@ -98,8 +182,8 @@ noexcept
 			closure(block, mask)
 		};

-		*di = block;
 		count += consume;
+		*di = block;
 	}

 	// trailing narrowband loop
@ -130,6 +214,103 @@ noexcept
 	};
 }

+/// Streaming transform
+///
+/// This template performs the loop boiler-plate for the developer who can
+/// simply supply a conforming closure. Characteristics:
+///
+/// * byte-aligned (unaligned): the input and output buffers do not have to
+/// be aligned and can be any size.
+///
+/// * full-duplex: the operation involves both input and output and there are
+/// separate pointers for progress across the input and output buffers which
+/// are incremented independently.
+///
+/// * fixed-stride: progress for each iteration of the loop across the input
+/// and output buffers is fixed.
+///
+/// u64x2 counter lanes = { output_length, input_length }; The argument `max`
+/// gives the buffer size in that format. The return value is the consumed
+/// bytes (final counter value) in that format.
+///
+template<class block_t,
+         class lambda>
+inline typename ircd::simd::stream_full_fixed_stride<block_t, lambda>::type
+ircd::simd::stream(char *const __restrict__ out,
+                   const char *const __restrict__ in,
+                   const u64x2 max,
+                   lambda&& closure)
+noexcept
+{
+	using block_t_u = unaligned<block_t>;
+
+	u64x2 count
+	{
+		0, // output pos
+		0, // input pos
+	};
+
+	// primary broadband loop
+	while(count[1] + sizeof(block_t) <= max[1] && count[0] + sizeof(block_t) <= max[0])
+	{
+		static const u64x2 consume
+		{
+			sizeof(block_t),
+			sizeof(block_t),
+		};
+
+		static const auto mask
+		{
+			~block_t{0}
+		};
+
+		const auto di
+		{
+			reinterpret_cast<block_t_u *>(out + count[0])
+		};
+
+		const auto si
+		{
+			reinterpret_cast<const block_t_u *>(in + count[1])
+		};
+
+		block_t block
+		(
+			*si
+		);
+
+		closure(block, mask);
+		count += consume;
+		*di = block;
+	}
+
+	// trailing narrowband loop
+	assert(count[1] + sizeof(block_t) > max[1]);
+	if(likely(count[1] < max[1]))
+	{
+		u64 i[2] {0};
+		block_t block {0}, mask {0};
+		for(; count[1] + i[1] < max[1]; ++i[1])
+		{
+			block[i[1]] = in[count[1] + i[1]];
+			mask[i[1]] = 0xff;
+		}
+
+		closure(block, mask);
+		for(; i[0] < i[1] && count[0] + i[0] < max[0]; ++i[0])
+			out[count[0] + i[0]] = block[i[0]];
+
+		count += u64x2
+		{
+			i[0], i[1]
+		};
+	}
+
+	assert(count[0] == max[0]);
+	assert(count[1] == max[1]);
+	return count;
+}
+
 /// Streaming consumer
 ///
 /// This template performs the loop boiler-plate for the developer who can
@ -157,7 +338,7 @@ noexcept
 ///
 template<class block_t,
         class lambda>
-inline ircd::u64x2
+inline typename ircd::simd::stream_half_variable_stride<block_t, lambda>::type
 ircd::simd::stream(const char *const __restrict__ in,
                   const u64x2 max,
                   lambda&& closure)
@ -219,3 +400,88 @@ noexcept
 		std::min(count[1], max[1])
 	};
 }
+
+/// Streaming consumer
+///
+/// This template performs the loop boiler-plate for the developer who can
+/// simply supply a conforming closure. Characteristics:
+///
+/// * byte-aligned (unaligned): the input buffer does not have to be aligned
+/// and can be any size.
+///
+/// * fixed-stride: progress for each iteration of the loop across the input
+/// and buffer is fixed at the block width; the transform function does not
+/// control the iteration.
+///
+/// u64x2 counter lanes = { available_to_user, input_length }; The argument
+/// `max` gives the buffer size in that format. The return value is the
+/// consumed bytes (final counter value) in that format. The first lane is
+/// available to the user; its initial value is max[0] (also unused).
+///
+template<class block_t,
+         class lambda>
+inline typename ircd::simd::stream_half_fixed_stride<block_t, lambda>::type
+ircd::simd::stream(const char *const __restrict__ in,
+                   const u64x2 max,
+                   lambda&& closure)
+noexcept
+{
+	using block_t_u = unaligned<block_t>;
+
+	u64x2 count
+	{
+		max[0], // preserved for caller
+		0,      // input pos
+	};
+
+	// primary broadband loop
+	while(count[1] + sizeof(block_t) <= max[1])
+	{
+		static const u64x2 consume
+		{
+			0, sizeof(block_t)
+		};
+
+		static const auto mask
+		{
+			~block_t{0}
+		};
+
+		const auto si
+		{
+			reinterpret_cast<const block_t_u *>(in + count[1])
+		};
+
+		const block_t block
+		(
+			*si
+		);
+
+		closure(block, mask);
+		count += consume;
+	}
+
+	// trailing narrowband loop
+	assert(count[1] + sizeof(block_t) > max[1]);
+	if(likely(count[1] < max[1]))
+	{
+		size_t i(0);
+		block_t block {0}, mask {0};
+		for(; count[1] + i < max[1]; ++i)
+		{
+			block[i] = in[count[1] + i];
+			mask[i] = 0xff;
+		}
+
+		closure(block, mask);
+		count += u64x2 // consume remainder
+		{
+			0, i
+		};
+	}
+
+	// return value is pure
+	assert(count[0] == max[0]);
+	assert(count[1] == max[1]);
+	return count;
+}