construct/include/ircd/simd/stream.h

// The Construct
//
// Copyright (C) The Construct Developers, Authors & Contributors
// Copyright (C) 2016-2020 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

#pragma once
#define HAVE_IRCD_SIMD_STREAM_H

// half-duplex fixed stride
namespace ircd::simd
{
	/// Call signature of a closure selecting the half-duplex, fixed-stride
	/// overload: takes the input block and a mask by value; returns nothing.
	template<class block_t>
	using stream_half_fixed_proto = void (block_t, block_t mask);

	/// Trait whose ::value is true when invoking `lambda` with two block_t
	/// arguments yields exactly `void`.
	template<class block_t, class lambda>
	using stream_is_half_fixed_stride =
		std::is_same<std::invoke_result_t<lambda, block_t, block_t>, void>;

	/// Overload gate; ::type is u64x2 only when the trait above is satisfied,
	/// otherwise there is no ::type and the overload is discarded.
	template<class block_t, class lambda>
	using stream_half_fixed_stride =
		std::enable_if<stream_is_half_fixed_stride<block_t, lambda>::value, u64x2>;

	/// Input-only stream over (in, max) driven by a void-returning closure.
	template<class block_t, class lambda>
	typename stream_half_fixed_stride<block_t, lambda>::type
	stream(const char *, const u64x2, lambda&&) noexcept;
}

// half-duplex variable stride
namespace ircd::simd
{
	/// Call signature of a closure selecting the half-duplex, variable-stride
	/// overload: takes the input block and a mask by value; returns a u64x2.
	template<class block_t>
	using stream_half_variable_proto = u64x2 (block_t, block_t mask);

	/// Trait whose ::value is true when invoking `lambda` with two block_t
	/// arguments yields exactly `u64x2`.
	template<class block_t, class lambda>
	using stream_is_half_variable_stride =
		std::is_same<std::invoke_result_t<lambda, block_t, block_t>, u64x2>;

	/// Overload gate; ::type is u64x2 only when the trait above is satisfied,
	/// otherwise there is no ::type and the overload is discarded.
	template<class block_t, class lambda>
	using stream_half_variable_stride =
		std::enable_if<stream_is_half_variable_stride<block_t, lambda>::value, u64x2>;

	/// Input-only stream over (in, max) driven by a u64x2-returning closure.
	template<class block_t, class lambda>
	typename stream_half_variable_stride<block_t, lambda>::type
	stream(const char *, const u64x2, lambda&&) noexcept;
}

// full-duplex fixed stride
namespace ircd::simd
{
	/// Call signature of a closure selecting the full-duplex, fixed-stride
	/// overload: takes the block by mutable reference and a mask by value;
	/// returns nothing.
	template<class block_t>
	using stream_full_fixed_proto = void (block_t &, block_t mask);

	/// Trait whose ::value is true when invoking `lambda` with
	/// (block_t &, block_t) yields exactly `void`.
	template<class block_t, class lambda>
	using stream_is_full_fixed_stride =
		std::is_same<std::invoke_result_t<lambda, block_t &, block_t>, void>;

	/// Overload gate; ::type is u64x2 only when the trait above is satisfied,
	/// otherwise there is no ::type and the overload is discarded.
	template<class block_t, class lambda>
	using stream_full_fixed_stride =
		std::enable_if<stream_is_full_fixed_stride<block_t, lambda>::value, u64x2>;

	/// Transforming stream over (out, in, max) driven by a void-returning
	/// closure.
	template<class block_t, class lambda>
	typename stream_full_fixed_stride<block_t, lambda>::type
	stream(char *, const char *, const u64x2, lambda&&) noexcept;
}

// full-duplex variable stride
namespace ircd::simd
{
	/// Call signature of a closure selecting the full-duplex, variable-stride
	/// overload: takes the block by mutable reference and a mask by value;
	/// returns a u64x2.
	template<class block_t>
	using stream_full_variable_proto = u64x2 (block_t &, block_t mask);

	/// Trait whose ::value is true when invoking `lambda` with
	/// (block_t &, block_t) yields exactly `u64x2`.
	template<class block_t, class lambda>
	using stream_is_full_variable_stride =
		std::is_same<std::invoke_result_t<lambda, block_t &, block_t>, u64x2>;

	/// Overload gate; ::type is u64x2 only when the trait above is satisfied,
	/// otherwise there is no ::type and the overload is discarded.
	template<class block_t, class lambda>
	using stream_full_variable_stride =
		std::enable_if<stream_is_full_variable_stride<block_t, lambda>::value, u64x2>;

	/// Transforming stream over (out, in, max) driven by a u64x2-returning
	/// closure.
	template<class block_t, class lambda>
	typename stream_full_variable_stride<block_t, lambda>::type
	stream(char *, const char *, const u64x2, lambda&&) noexcept;
}

/// Streaming transform (full-duplex, variable stride)
///
/// Drives a user closure across an input buffer while writing an output
/// buffer, so the caller only supplies the per-block work.
///
/// * Neither buffer has to be aligned; blocks are loaded and stored through
/// the unaligned<block_t> type.
///
/// * Counter lanes are { output_position, input_position }. `max` supplies
/// the two buffer sizes in that layout, the closure returns how far to
/// advance each lane, and the final counters (clamped to `max`) are returned.
///
/// * While a whole block still fits in both the remaining input and the
/// remaining output, the closure receives a block loaded straight from the
/// input with an all-bits-set mask; afterwards the whole block is stored at
/// the output position held before the advance.
///
/// * Once that no longer holds, and for as long as input remains, the closure
/// receives a zero-filled block carrying up to sizeof(block_t) remaining input
/// bytes, with the mask set to 0xff on the populated lanes. The number of
/// bytes given by lane [0] of its return value is copied out, clipped to the
/// output size.
///
/// NOTE(review): the trailing loop is conditioned on the input counter only;
/// a closure which never advances lane [1] there would keep it running.
///
template<class block_t,
         class lambda>
inline typename ircd::simd::stream_full_variable_stride<block_t, lambda>::type
ircd::simd::stream(char *const __restrict__ out,
                   const char *const __restrict__ in,
                   const u64x2 max,
                   lambda&& closure)
noexcept
{
	using block_t_u = unaligned<block_t>;

	constexpr size_t width
	{
		sizeof(block_t)
	};

	// every lane selected; handed to the closure for whole blocks
	const auto all_lanes
	{
		~block_t{0}
	};

	u64x2 count
	{
		0, // bytes produced
		0, // bytes consumed
	};

	// whole-block phase
	for(;;)
	{
		const bool src_fits(count[1] + width <= max[1]);
		const bool dst_fits(count[0] + width <= max[0]);
		if(!src_fits || !dst_fits)
			break;

		const auto src
		{
			reinterpret_cast<const block_t_u *>(in + count[1])
		};

		// destination is fixed before the counters move
		const auto dst
		{
			reinterpret_cast<block_t_u *>(out + count[0])
		};

		block_t block
		(
			*src
		);

		const auto advance
		{
			closure(block, all_lanes)
		};

		*dst = block;
		count += advance;
	}

	// byte-wise phase for whatever input is left
	while(count[1] < max[1])
	{
		const auto remain
		{
			max[1] - count[1]
		};

		block_t block {0};
		block_t mask {0};
		for(size_t i(0); i < width && i < remain; ++i)
		{
			block[i] = in[count[1] + i];
			mask[i] = 0xff;
		}

		const auto advance
		{
			closure(block, mask)
		};

		// emit what the closure claims to have produced, never past the end
		for(size_t i(0); i < advance[0] && count[0] + i < max[0]; ++i)
			out[count[0] + i] = block[i];

		count += advance;
	}

	// the counters may have run past the limits; report no more than max
	return u64x2
	{
		std::min(count[0], max[0]),
		std::min(count[1], max[1]),
	};
}

/// Streaming transform
///
/// This template performs the loop boiler-plate for the developer who can
/// simply supply a conforming closure. Characteristics:
///
/// * byte-aligned (unaligned): the input and output buffers do not have to
/// be aligned; blocks are loaded and stored through unaligned<block_t>.
///
/// * full-duplex: the operation involves both input and output and there are
/// separate counters for progress across the input and output buffers.
///
/// * fixed-stride: every whole-block iteration advances both counters by
/// sizeof(block_t); the closure returns nothing and cannot steer the loop.
/// For whole blocks the closure receives the loaded block and an
/// all-bits-set mask, and the block is stored back to the output afterwards.
///
/// * trailing block: any remaining input shorter than a block is gathered
/// into a zero-filled block whose mask has 0xff on the populated lanes; the
/// closure is called once more and the populated bytes are copied out,
/// clipped to the output size.
///
/// u64x2 counter lanes = { output_length, input_length }; The argument `max`
/// gives the buffer size in that format. The return value is the consumed
/// bytes (final counter value) in that format.
///
/// The assertions below expect less than one block of input to remain after
/// the primary loop and both counters to finish equal to `max`; that is, the
/// output must not be what stops the primary loop. The trailing gather is
/// additionally bounded by the block width so that, when assertions are
/// disabled and that expectation is violated, lanes past the end of the block
/// are never indexed.
///
template<class block_t,
         class lambda>
inline typename ircd::simd::stream_full_fixed_stride<block_t, lambda>::type
ircd::simd::stream(char *const __restrict__ out,
                   const char *const __restrict__ in,
                   const u64x2 max,
                   lambda&& closure)
noexcept
{
	using block_t_u = unaligned<block_t>;

	u64x2 count
	{
		0, // output pos
		0, // input pos
	};

	// primary broadband loop
	while(count[1] + sizeof(block_t) <= max[1] && count[0] + sizeof(block_t) <= max[0])
	{
		static const u64x2 consume
		{
			sizeof(block_t),
			sizeof(block_t),
		};

		static const auto mask
		{
			~block_t{0}
		};

		const auto di
		{
			reinterpret_cast<block_t_u *>(out + count[0])
		};

		const auto si
		{
			reinterpret_cast<const block_t_u *>(in + count[1])
		};

		block_t block
		(
			*si
		);

		closure(block, mask);
		count += consume;
		*di = block; // di was computed before the counters advanced
	}

	// trailing narrowband loop
	assert(count[1] + sizeof(block_t) > max[1]);
	if(likely(count[1] < max[1]))
	{
		u64 i[2] {0}; // { bytes copied out, bytes gathered in }
		block_t block {0}, mask {0};

		// Bounded by the block width as well as by the remaining input, as in
		// the variable-stride transform, so block[] and mask[] cannot be
		// indexed beyond their last lane.
		for(; i[1] < sizeof(block_t) && count[1] + i[1] < max[1]; ++i[1])
		{
			block[i[1]] = in[count[1] + i[1]];
			mask[i[1]] = 0xff;
		}

		closure(block, mask);

		// emit only the populated bytes and never write past the output
		for(; i[0] < i[1] && count[0] + i[0] < max[0]; ++i[0])
			out[count[0] + i[0]] = block[i[0]];

		count += u64x2
		{
			i[0], i[1]
		};
	}

	assert(count[0] == max[0]);
	assert(count[1] == max[1]);
	return count;
}

/// Streaming consumer (half-duplex, variable stride)
///
/// Drives a user closure across an input buffer; there is no output buffer.
///
/// * The input does not have to be aligned; whole blocks are loaded through
/// the unaligned<block_t> type.
///
/// * Counter lanes are { available_to_user, input_position }. Lane [0] starts
/// at max[0] and lane [1] starts at zero; the closure's return value is added
/// to both lanes after every call. The result carries lane [0] as accumulated
/// and lane [1] clamped to max[1].
///
/// * While a whole block of input remains the closure receives a block loaded
/// straight from the input with an all-bits-set mask. Afterwards it receives
/// a zero-filled block carrying up to sizeof(block_t) remaining bytes, with
/// the mask set to 0xff on the populated lanes.
///
/// * Loop control: when the closure returns zero in lane [1], no further
/// iteration of either loop takes place.
///
template<class block_t,
         class lambda>
inline typename ircd::simd::stream_half_variable_stride<block_t, lambda>::type
ircd::simd::stream(const char *const __restrict__ in,
                   const u64x2 max,
                   lambda&& closure)
noexcept
{
	using block_t_u = unaligned<block_t>;

	constexpr size_t width
	{
		sizeof(block_t)
	};

	// every lane selected; handed to the closure for whole blocks
	const auto all_lanes
	{
		~block_t{0}
	};

	u64x2 count
	{
		max[0], // user lane, seeded from the caller
		0,      // bytes consumed
	};

	// cleared as soon as the closure reports no input progress
	bool advancing
	{
		true
	};

	// whole-block phase
	while(advancing && count[1] + width <= max[1])
	{
		const auto src
		{
			reinterpret_cast<const block_t_u *>(in + count[1])
		};

		const block_t block
		(
			*src
		);

		const auto advance
		{
			closure(block, all_lanes)
		};

		count += advance;
		advancing = advance[1] != 0;
	}

	// byte-wise phase for whatever input is left
	while(advancing && count[1] < max[1])
	{
		const auto remain
		{
			max[1] - count[1]
		};

		block_t block {0};
		block_t mask {0};
		for(size_t i(0); i < width && i < remain; ++i)
		{
			block[i] = in[count[1] + i];
			mask[i] = 0xff;
		}

		const auto advance
		{
			closure(block, mask)
		};

		count += advance;
		advancing = advance[1] != 0;
	}

	// the input counter may have run past the limit; the user lane is as-is
	return u64x2
	{
		count[0],
		std::min(count[1], max[1])
	};
}

/// Streaming consumer (half-duplex, fixed stride)
///
/// Drives a user closure across an input buffer; there is no output buffer.
///
/// * The input does not have to be aligned; whole blocks are loaded through
/// the unaligned<block_t> type.
///
/// * Every whole-block iteration advances the input position by
/// sizeof(block_t); the closure returns nothing and cannot steer the loop.
/// It receives the loaded block and an all-bits-set mask.
///
/// * Any remaining input shorter than a block is gathered into a zero-filled
/// block whose mask has 0xff on the populated lanes, and the closure is
/// called one final time.
///
/// * Counter lanes are { available_to_user, input_position }. Lane [0] is
/// copied from max[0] and never modified here; lane [1] ends at max[1]. The
/// final counters are returned.
///
template<class block_t,
         class lambda>
inline typename ircd::simd::stream_half_fixed_stride<block_t, lambda>::type
ircd::simd::stream(const char *const __restrict__ in,
                   const u64x2 max,
                   lambda&& closure)
noexcept
{
	using block_t_u = unaligned<block_t>;

	constexpr size_t width
	{
		sizeof(block_t)
	};

	// every lane selected; handed to the closure for whole blocks
	const auto all_lanes
	{
		~block_t{0}
	};

	u64x2 count
	{
		max[0], // user lane, passed through
		0,      // bytes consumed
	};

	// whole-block phase
	for(; count[1] + width <= max[1]; count[1] += width)
	{
		const auto src
		{
			reinterpret_cast<const block_t_u *>(in + count[1])
		};

		const block_t block
		(
			*src
		);

		closure(block, all_lanes);
	}

	// final partial block, if any
	assert(count[1] + sizeof(block_t) > max[1]);
	if(likely(count[1] < max[1]))
	{
		const size_t remain
		(
			max[1] - count[1]
		);

		block_t block {0};
		block_t mask {0};
		for(size_t i(0); i < remain; ++i)
		{
			block[i] = in[count[1] + i];
			mask[i] = 0xff;
		}

		closure(block, mask);
		count[1] += remain; // the remainder is consumed in one step
	}

	// both lanes now equal the caller's limits
	assert(count[0] == max[0]);
	assert(count[1] == max[1]);
	return count;
}
ircd::simd: Add a streaming transform boilerplate template for the common pattern. ircd::simd: Add a streaming consumer boilerplate template for the common pattern. ircd::json: Simplify w/ stream templates; update counter lane convention. 2020-09-09 09:27:29 +02:00			`// The Construct`
			`//`
			`// Copyright (C) The Construct Developers, Authors & Contributors`
			`// Copyright (C) 2016-2020 Jason Volk <jason@zemos.net>`
			`//`
			`// Permission to use, copy, modify, and/or distribute this software for any`
			`// purpose with or without fee is hereby granted, provided that the above`
			`// copyright notice and this permission notice is present in all copies. The`
			`// full license for this software is available in the LICENSE file.`

			`#pragma once`
			`#define HAVE_IRCD_SIMD_STREAM_H`

ircd::simd: Make stream template into enable_if complex w/ closure deductions. 2020-10-07 04:41:47 +02:00			`// half-duplex fixed stride`
ircd::simd: Add a streaming transform boilerplate template for the common pattern. ircd::simd: Add a streaming consumer boilerplate template for the common pattern. ircd::json: Simplify w/ stream templates; update counter lane convention. 2020-09-09 09:27:29 +02:00			`namespace ircd::simd`
			`{`
			`template<class block_t>`
ircd::simd: Make stream template into enable_if complex w/ closure deductions. 2020-10-07 04:41:47 +02:00			`using stream_half_fixed_proto = void (block_t, block_t mask);`
ircd::simd: Add a streaming transform boilerplate template for the common pattern. ircd::simd: Add a streaming consumer boilerplate template for the common pattern. ircd::json: Simplify w/ stream templates; update counter lane convention. 2020-09-09 09:27:29 +02:00
ircd::simd: Simplify stream template; deduce unaligned type. 2020-10-07 02:49:30 +02:00			`template<class block_t,`
ircd::simd: Add a streaming transform boilerplate template for the common pattern. ircd::simd: Add a streaming consumer boilerplate template for the common pattern. ircd::json: Simplify w/ stream templates; update counter lane convention. 2020-09-09 09:27:29 +02:00			`class lambda>`
ircd::simd: Make stream template into enable_if complex w/ closure deductions. 2020-10-07 04:41:47 +02:00			`using stream_is_half_fixed_stride = std::is_same`
			`<`
			`std::invoke_result_t<lambda, block_t, block_t>, void`
			`>;`
ircd::simd: Add a streaming transform boilerplate template for the common pattern. ircd::simd: Add a streaming consumer boilerplate template for the common pattern. ircd::json: Simplify w/ stream templates; update counter lane convention. 2020-09-09 09:27:29 +02:00
ircd::simd: Simplify stream template; deduce unaligned type. 2020-10-07 02:49:30 +02:00			`template<class block_t,`
ircd::simd: Add a streaming transform boilerplate template for the common pattern. ircd::simd: Add a streaming consumer boilerplate template for the common pattern. ircd::json: Simplify w/ stream templates; update counter lane convention. 2020-09-09 09:27:29 +02:00			`class lambda>`
ircd::simd: Make stream template into enable_if complex w/ closure deductions. 2020-10-07 04:41:47 +02:00			`using stream_half_fixed_stride = std::enable_if`
			`<`
			`stream_is_half_fixed_stride<block_t, lambda>::value, u64x2`
			`>;`

			`template<class block_t,`
			`class lambda>`
			`typename stream_half_fixed_stride<block_t, lambda>::type`
			`stream(const char *, const u64x2, lambda&&) noexcept;`
			`}`

			`// half-duplex variable stride`
			`namespace ircd::simd`
			`{`
			`template<class block_t>`
			`using stream_half_variable_proto = u64x2 (block_t, block_t mask);`

			`template<class block_t,`
			`class lambda>`
			`using stream_is_half_variable_stride = std::is_same`
			`<`
			`std::invoke_result_t<lambda, block_t, block_t>, u64x2`
			`>;`

			`template<class block_t,`
			`class lambda>`
			`using stream_half_variable_stride = std::enable_if`
			`<`
			`stream_is_half_variable_stride<block_t, lambda>::value, u64x2`
			`>;`

			`template<class block_t,`
			`class lambda>`
			`typename stream_half_variable_stride<block_t, lambda>::type`
			`stream(const char *, const u64x2, lambda&&) noexcept;`
			`}`

			`// full-duplex fixed stride`
			`namespace ircd::simd`
			`{`
			`template<class block_t>`
			`using stream_full_fixed_proto = void (block_t &, block_t mask);`

			`template<class block_t,`
			`class lambda>`
			`using stream_is_full_fixed_stride = std::is_same`
			`<`
			`std::invoke_result_t<lambda, block_t &, block_t>, void`
			`>;`

			`template<class block_t,`
			`class lambda>`
			`using stream_full_fixed_stride = std::enable_if`
			`<`
			`stream_is_full_fixed_stride<block_t, lambda>::value, u64x2`
			`>;`

			`template<class block_t,`
			`class lambda>`
			`typename stream_full_fixed_stride<block_t, lambda>::type`
			`stream(char *, const char *, const u64x2, lambda&&) noexcept;`
			`}`

			`// full-duplex variable stride`
			`namespace ircd::simd`
			`{`
			`template<class block_t>`
			`using stream_full_variable_proto = u64x2 (block_t &, block_t mask);`

			`template<class block_t,`
			`class lambda>`
			`using stream_is_full_variable_stride = std::is_same`
			`<`
			`std::invoke_result_t<lambda, block_t &, block_t>, u64x2`
			`>;`

			`template<class block_t,`
			`class lambda>`
			`using stream_full_variable_stride = std::enable_if`
			`<`
			`stream_is_full_variable_stride<block_t, lambda>::value, u64x2`
			`>;`

			`template<class block_t,`
			`class lambda>`
			`typename stream_full_variable_stride<block_t, lambda>::type`
			`stream(char *, const char *, const u64x2, lambda&&) noexcept;`
ircd::simd: Add a streaming transform boilerplate template for the common pattern. ircd::simd: Add a streaming consumer boilerplate template for the common pattern. ircd::json: Simplify w/ stream templates; update counter lane convention. 2020-09-09 09:27:29 +02:00			`}`

			`/// Streaming transform`
			`///`
			`/// This template performs the loop boiler-plate for the developer who can`
			`/// simply supply a conforming closure. Characteristics:`
			`///`
			`/// * byte-aligned (unaligned): the input and output buffers do not have to`
			`/// be aligned and can be any size.`
			`///`
			`/// * full-duplex: the operation involves both input and output and there are`
			`/// separate pointers for progress across the input and output buffers which`
			`/// are incremented independently.`
			`///`
			`/// * variable-stride: progress for each iteration of the loop across the input`
			`/// and output buffers is not fixed; the transform function may advance either`
			`/// pointer zero to sizeof(block_t) bytes each iteration. Due to these`
			`/// characteristics, unaligned bytes may be redundantly loaded or stored and`
			`/// non-temporal features are not used to optimize the operation.`
			`///`
			/// u64x2 counter lanes = { output_length, input_length }; The argument `max`
			`/// gives the buffer size in that format. The return value is the consumed`
			`/// bytes (final counter value) in that format.`
			`///`
ircd::simd: Simplify stream template; deduce unaligned type. 2020-10-07 02:49:30 +02:00			`template<class block_t,`
ircd::simd: Add a streaming transform boilerplate template for the common pattern. ircd::simd: Add a streaming consumer boilerplate template for the common pattern. ircd::json: Simplify w/ stream templates; update counter lane convention. 2020-09-09 09:27:29 +02:00			`class lambda>`
ircd::simd: Make stream template into enable_if complex w/ closure deductions. 2020-10-07 04:41:47 +02:00			`inline typename ircd::simd::stream_full_variable_stride<block_t, lambda>::type`
ircd::simd: Add a streaming transform boilerplate template for the common pattern. ircd::simd: Add a streaming consumer boilerplate template for the common pattern. ircd::json: Simplify w/ stream templates; update counter lane convention. 2020-09-09 09:27:29 +02:00			`ircd::simd::stream(char *const __restrict__ out,`
			`const char *const __restrict__ in,`
			`const u64x2 max,`
			`lambda&& closure)`
			`noexcept`
			`{`
ircd::simd: Simplify stream template; deduce unaligned type. 2020-10-07 02:49:30 +02:00			`using block_t_u = unaligned<block_t>;`

ircd::simd: Add a streaming transform boilerplate template for the common pattern. ircd::simd: Add a streaming consumer boilerplate template for the common pattern. ircd::json: Simplify w/ stream templates; update counter lane convention. 2020-09-09 09:27:29 +02:00			`u64x2 count`
			`{`
			`0, // output pos`
			`0, // input pos`
			`};`

			`// primary broadband loop`
			`while(count[1] + sizeof(block_t) <= max[1] && count[0] + sizeof(block_t) <= max[0])`
			`{`
			`static const auto mask`
			`{`
			`~block_t{0}`
			`};`

			`const auto di`
			`{`
			`reinterpret_cast<block_t_u *>(out + count[0])`
			`};`

			`const auto si`
			`{`
			`reinterpret_cast<const block_t_u *>(in + count[1])`
			`};`

			`block_t block`
			`(`
			`*si`
			`);`

			`const auto consume`
			`{`
			`closure(block, mask)`
			`};`

			`count += consume;`
ircd::simd: Make stream template into enable_if complex w/ closure deductions. 2020-10-07 04:41:47 +02:00			`*di = block;`
ircd::simd: Add a streaming transform boilerplate template for the common pattern. ircd::simd: Add a streaming consumer boilerplate template for the common pattern. ircd::json: Simplify w/ stream templates; update counter lane convention. 2020-09-09 09:27:29 +02:00			`}`

			`// trailing narrowband loop`
			`while(count[1] < max[1])`
			`{`
			`block_t block {0}, mask {0};`
			`for(size_t i(0); count[1] + i < max[1] && i < sizeof(block_t); ++i)`
			`{`
			`block[i] = in[count[1] + i];`
			`mask[i] = 0xff;`
			`}`

			`const auto consume`
			`{`
			`closure(block, mask)`
			`};`

			`for(size_t i(0); i < consume[0] && count[0] + i < max[0]; ++i)`
			`out[count[0] + i] = block[i];`

			`count += consume;`
			`}`

ircd::simd: Ensure return counts are bound to max values. 2020-09-13 11:56:24 +02:00			`return u64x2`
			`{`
			`std::min(count[0], max[0]),`
			`std::min(count[1], max[1]),`
			`};`
ircd::simd: Add a streaming transform boilerplate template for the common pattern. ircd::simd: Add a streaming consumer boilerplate template for the common pattern. ircd::json: Simplify w/ stream templates; update counter lane convention. 2020-09-09 09:27:29 +02:00			`}`

ircd::simd: Make stream template into enable_if complex w/ closure deductions. 2020-10-07 04:41:47 +02:00			`/// Streaming transform`
			`///`
			`/// This template performs the loop boiler-plate for the developer who can`
			`/// simply supply a conforming closure. Characteristics:`
			`///`
			`/// * byte-aligned (unaligned): the input and output buffers do not have to`
			`/// be aligned and can be any size.`
			`///`
			`/// * full-duplex: the operation involves both input and output and there are`
			`/// separate pointers for progress across the input and output buffers which`
			`/// are incremented independently.`
			`///`
			`/// * fixed-stride: progress for each iteration of the loop across the input`
			`/// and output buffers is fixed.`
			`///`
			/// u64x2 counter lanes = { output_length, input_length }; The argument `max`
			`/// gives the buffer size in that format. The return value is the consumed`
			`/// bytes (final counter value) in that format.`
			`///`
			`template<class block_t,`
			`class lambda>`
			`inline typename ircd::simd::stream_full_fixed_stride<block_t, lambda>::type`
			`ircd::simd::stream(char *const __restrict__ out,`
			`const char *const __restrict__ in,`
			`const u64x2 max,`
			`lambda&& closure)`
			`noexcept`
			`{`
			`using block_t_u = unaligned<block_t>;`

			`u64x2 count`
			`{`
			`0, // output pos`
			`0, // input pos`
			`};`

			`// primary broadband loop`
			`while(count[1] + sizeof(block_t) <= max[1] && count[0] + sizeof(block_t) <= max[0])`
			`{`
			`static const u64x2 consume`
			`{`
			`sizeof(block_t),`
			`sizeof(block_t),`
			`};`

			`static const auto mask`
			`{`
			`~block_t{0}`
			`};`

			`const auto di`
			`{`
			`reinterpret_cast<block_t_u *>(out + count[0])`
			`};`

			`const auto si`
			`{`
			`reinterpret_cast<const block_t_u *>(in + count[1])`
			`};`

			`block_t block`
			`(`
			`*si`
			`);`

			`closure(block, mask);`
			`count += consume;`
			`*di = block;`
			`}`

			`// trailing narrowband loop`
			`assert(count[1] + sizeof(block_t) > max[1]);`
			`if(likely(count[1] < max[1]))`
			`{`
			`u64 i[2] {0};`
			`block_t block {0}, mask {0};`
			`for(; count[1] + i[1] < max[1]; ++i[1])`
			`{`
			`block[i[1]] = in[count[1] + i[1]];`
			`mask[i[1]] = 0xff;`
			`}`

			`closure(block, mask);`
			`for(; i[0] < i[1] && count[0] + i[0] < max[0]; ++i[0])`
			`out[count[0] + i[0]] = block[i[0]];`

			`count += u64x2`
			`{`
			`i[0], i[1]`
			`};`
			`}`

			`assert(count[0] == max[0]);`
			`assert(count[1] == max[1]);`
			`return count;`
			`}`

ircd::simd: Add a streaming transform boilerplate template for the common pattern. ircd::simd: Add a streaming consumer boilerplate template for the common pattern. ircd::json: Simplify w/ stream templates; update counter lane convention. 2020-09-09 09:27:29 +02:00			`/// Streaming consumer`
			`///`
			`/// This template performs the loop boiler-plate for the developer who can`
			`/// simply supply a conforming closure. Characteristics:`
			`///`
			`/// * byte-aligned (unaligned): the input buffer does not have to be aligned`
			`/// and can be any size.`
			`///`
			`/// * variable-stride: progress for each iteration of the loop across the input`
ircd::simd: Fix comment. [ci skip] 2020-09-10 22:23:37 +02:00			`/// and buffer is not fixed; the transform function may advance the pointer`
			`/// one to sizeof(block_t) bytes each iteration. Due to these characteristics,`
			`/// unaligned bytes may be redundantly loaded and non-temporal features are`
			`/// not used to optimize the operation.`
ircd::simd: Add a streaming transform boilerplate template for the common pattern. ircd::simd: Add a streaming consumer boilerplate template for the common pattern. ircd::json: Simplify w/ stream templates; update counter lane convention. 2020-09-09 09:27:29 +02:00			`///`
			`/// u64x2 counter lanes = { available_to_user, input_length }; The argument`
			/// `max` gives the buffer size in that format. The return value is the
			`/// consumed bytes (final counter value) in that format. The first lane is`
			`/// available to the user; its initial value is max[0] (also unused); it is`
			`/// then accumulated with the first lane of the closure's return value; its`
			`/// final value is returned in [0] of the return value.`
			`///`
			`/// Note that the closure must advance the stream one or more bytes each`
			`/// iteration; a zero value is available for loop control: the loop will`
			`/// break without another iteration.`
			`///`
ircd::simd: Simplify stream template; deduce unaligned type. 2020-10-07 02:49:30 +02:00			`template<class block_t,`
ircd::simd: Add a streaming transform boilerplate template for the common pattern. ircd::simd: Add a streaming consumer boilerplate template for the common pattern. ircd::json: Simplify w/ stream templates; update counter lane convention. 2020-09-09 09:27:29 +02:00			`class lambda>`
ircd::simd: Make stream template into enable_if complex w/ closure deductions. 2020-10-07 04:41:47 +02:00			`inline typename ircd::simd::stream_half_variable_stride<block_t, lambda>::type`
ircd::simd: Add a streaming transform boilerplate template for the common pattern. ircd::simd: Add a streaming consumer boilerplate template for the common pattern. ircd::json: Simplify w/ stream templates; update counter lane convention. 2020-09-09 09:27:29 +02:00			`ircd::simd::stream(const char *const __restrict__ in,`
			`const u64x2 max,`
			`lambda&& closure)`
			`noexcept`
			`{`
ircd::simd: Simplify stream template; deduce unaligned type. 2020-10-07 02:49:30 +02:00			`using block_t_u = unaligned<block_t>;`

ircd::simd: Add a streaming transform boilerplate template for the common pattern. ircd::simd: Add a streaming consumer boilerplate template for the common pattern. ircd::json: Simplify w/ stream templates; update counter lane convention. 2020-09-09 09:27:29 +02:00			`u64x2 count`
			`{`
			`max[0], // preserved for caller`
			`0, // input pos`
			`};`

			`u64x2 consume`
			`{`
			`0,`
			`-1UL // non-zero to start loop`
			`};`

			`// primary broadband loop`
			`while(consume[1] && count[1] + sizeof(block_t) <= max[1])`
			`{`
			`static const auto mask`
			`{`
			`~block_t{0}`
			`};`

			`const auto si`
			`{`
			`reinterpret_cast<const block_t_u *>(in + count[1])`
			`};`

			`const block_t block`
			`(`
			`*si`
			`);`

			`consume = closure(block, mask);`
			`count += consume;`
			`}`

			`// trailing narrowband loop`
			`while(consume[1] && count[1] < max[1])`
			`{`
			`block_t block {0}, mask {0};`
			`for(size_t i(0); count[1] + i < max[1] && i < sizeof(block_t); ++i)`
			`{`
			`block[i] = in[count[1] + i];`
			`mask[i] = 0xff;`
			`}`

			`consume = closure(block, mask);`
			`count += consume;`
			`}`

ircd::simd: Ensure return counts are bound to max values. 2020-09-13 11:56:24 +02:00			`return u64x2`
			`{`
			`count[0],`
			`std::min(count[1], max[1])`
			`};`
ircd::simd: Add a streaming transform boilerplate template for the common pattern. ircd::simd: Add a streaming consumer boilerplate template for the common pattern. ircd::json: Simplify w/ stream templates; update counter lane convention. 2020-09-09 09:27:29 +02:00			`}`
ircd::simd: Make stream template into enable_if complex w/ closure deductions. 2020-10-07 04:41:47 +02:00
			`/// Streaming consumer`
			`///`
			`/// This template performs the loop boiler-plate for the developer who can`
			`/// simply supply a conforming closure. Characteristics:`
			`///`
			`/// * byte-aligned (unaligned): the input buffer does not have to be aligned`
			`/// and can be any size.`
			`///`
			`/// * fixed-stride: progress for each iteration of the loop across the input`
			`/// and buffer is fixed at the block width; the transform function does not`
			`/// control the iteration.`
			`///`
			`/// u64x2 counter lanes = { available_to_user, input_length }; The argument`
			/// `max` gives the buffer size in that format. The return value is the
			`/// consumed bytes (final counter value) in that format. The first lane is`
			`/// available to the user; its initial value is max[0] (also unused).`
			`///`
			`template<class block_t,`
			`class lambda>`
			`inline typename ircd::simd::stream_half_fixed_stride<block_t, lambda>::type`
			`ircd::simd::stream(const char *const __restrict__ in,`
			`const u64x2 max,`
			`lambda&& closure)`
			`noexcept`
			`{`
			`using block_t_u = unaligned<block_t>;`

			`u64x2 count`
			`{`
			`max[0], // preserved for caller`
			`0, // input pos`
			`};`

			`// primary broadband loop`
			`while(count[1] + sizeof(block_t) <= max[1])`
			`{`
			`static const u64x2 consume`
			`{`
			`0, sizeof(block_t)`
			`};`

			`static const auto mask`
			`{`
			`~block_t{0}`
			`};`

			`const auto si`
			`{`
			`reinterpret_cast<const block_t_u *>(in + count[1])`
			`};`

			`const block_t block`
			`(`
			`*si`
			`);`

			`closure(block, mask);`
			`count += consume;`
			`}`

			`// trailing narrowband loop`
			`assert(count[1] + sizeof(block_t) > max[1]);`
			`if(likely(count[1] < max[1]))`
			`{`
			`size_t i(0);`
			`block_t block {0}, mask {0};`
			`for(; count[1] + i < max[1]; ++i)`
			`{`
			`block[i] = in[count[1] + i];`
			`mask[i] = 0xff;`
			`}`

			`closure(block, mask);`
			`count += u64x2 // consume remainder`
			`{`
			`0, i`
			`};`
			`}`

			`// return value is pure`
			`assert(count[0] == max[0]);`
			`assert(count[1] == max[1]);`
			`return count;`
			`}`