// The Construct // // Copyright (C) The Construct Developers, Authors & Contributors // Copyright (C) 2016-2020 Jason Volk // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice is present in all copies. The // full license for this software is available in the LICENSE file. #pragma once #define HAVE_IRCD_SIMD_STREAM_H // half-duplex fixed stride namespace ircd::simd { template using stream_half_fixed_proto = void (block_t, block_t mask); template using stream_is_half_fixed_stride = std::is_same < std::invoke_result_t, void >; template using stream_half_fixed_stride = std::enable_if < stream_is_half_fixed_stride::value, u64x2 >; template typename stream_half_fixed_stride::type stream(const char *, const u64x2, lambda&&) noexcept; } // half-duplex variable stride namespace ircd::simd { template using stream_half_variable_proto = u64x2 (block_t, block_t mask); template using stream_is_half_variable_stride = std::is_same < std::invoke_result_t, u64x2 >; template using stream_half_variable_stride = std::enable_if < stream_is_half_variable_stride::value, u64x2 >; template typename stream_half_variable_stride::type stream(const char *, const u64x2, lambda&&) noexcept; } // full-duplex fixed stride namespace ircd::simd { template using stream_full_fixed_proto = void (block_t &, block_t mask); template using stream_is_full_fixed_stride = std::is_same < std::invoke_result_t, void >; template using stream_full_fixed_stride = std::enable_if < stream_is_full_fixed_stride::value, u64x2 >; template typename stream_full_fixed_stride::type stream(char *, const char *, const u64x2, lambda&&) noexcept; } // full-duplex variable stride namespace ircd::simd { template using stream_full_variable_proto = u64x2 (block_t &, block_t mask); template using stream_is_full_variable_stride = std::is_same < std::invoke_result_t, u64x2 >; template using stream_full_variable_stride = std::enable_if < stream_is_full_variable_stride::value, u64x2 >; template typename stream_full_variable_stride::type stream(char *, const char *, const u64x2, lambda&&) noexcept; } /// Streaming transform /// /// This template performs the loop boiler-plate for the developer who can /// simply supply a conforming closure. Characteristics: /// /// * byte-aligned (unaligned): the input and output buffers do not have to /// be aligned and can be any size. /// /// * full-duplex: the operation involves both input and output and there are /// separate pointers for progress across the input and output buffers which /// are incremented independently. /// /// * variable-stride: progress for each iteration of the loop across the input /// and output buffers is not fixed; the transform function may advance either /// pointer zero to sizeof(block_t) bytes each iteration. Due to these /// characteristics, unaligned bytes may be redundantly loaded or stored and /// non-temporal features are not used to optimize the operation. /// /// u64x2 counter lanes = { output_length, input_length }; The argument `max` /// gives the buffer size in that format. The return value is the consumed /// bytes (final counter value) in that format. /// template inline typename ircd::simd::stream_full_variable_stride::type ircd::simd::stream(char *const __restrict__ out, const char *const __restrict__ in, const u64x2 max, lambda&& closure) noexcept { using block_t_u = unaligned; u64x2 count { 0, // output pos 0, // input pos }; // primary broadband loop while(count[1] + sizeof(block_t) <= max[1] && count[0] + sizeof(block_t) <= max[0]) { static const auto mask { ~block_t{0} }; const auto di { reinterpret_cast(out + count[0]) }; const auto si { reinterpret_cast(in + count[1]) }; block_t block ( *si ); const auto consume { closure(block, mask) }; count += consume; *di = block; } // trailing narrowband loop while(count[1] < max[1]) { block_t block {0}, mask {0}; for(size_t i(0); count[1] + i < max[1] && i < sizeof(block_t); ++i) { block[i] = in[count[1] + i]; mask[i] = 0xff; } const auto consume { closure(block, mask) }; for(size_t i(0); i < consume[0] && count[0] + i < max[0]; ++i) out[count[0] + i] = block[i]; count += consume; } return u64x2 { std::min(count[0], max[0]), std::min(count[1], max[1]), }; } /// Streaming transform /// /// This template performs the loop boiler-plate for the developer who can /// simply supply a conforming closure. Characteristics: /// /// * byte-aligned (unaligned): the input and output buffers do not have to /// be aligned and can be any size. /// /// * full-duplex: the operation involves both input and output and there are /// separate pointers for progress across the input and output buffers which /// are incremented independently. /// /// * fixed-stride: progress for each iteration of the loop across the input /// and output buffers is fixed. /// /// u64x2 counter lanes = { output_length, input_length }; The argument `max` /// gives the buffer size in that format. The return value is the consumed /// bytes (final counter value) in that format. /// template inline typename ircd::simd::stream_full_fixed_stride::type ircd::simd::stream(char *const __restrict__ out, const char *const __restrict__ in, const u64x2 max, lambda&& closure) noexcept { using block_t_u = unaligned; u64x2 count { 0, // output pos 0, // input pos }; // primary broadband loop while(count[1] + sizeof(block_t) <= max[1] && count[0] + sizeof(block_t) <= max[0]) { static const u64x2 consume { sizeof(block_t), sizeof(block_t), }; static const auto mask { ~block_t{0} }; const auto di { reinterpret_cast(out + count[0]) }; const auto si { reinterpret_cast(in + count[1]) }; block_t block ( *si ); closure(block, mask); count += consume; *di = block; } // trailing narrowband loop assert(count[1] + sizeof(block_t) > max[1]); if(likely(count[1] < max[1])) { u64 i[2] {0}; block_t block {0}, mask {0}; for(; count[1] + i[1] < max[1]; ++i[1]) { block[i[1]] = in[count[1] + i[1]]; mask[i[1]] = 0xff; } closure(block, mask); for(; i[0] < i[1] && count[0] + i[0] < max[0]; ++i[0]) out[count[0] + i[0]] = block[i[0]]; count += u64x2 { i[0], i[1] }; } assert(count[0] == max[0]); assert(count[1] == max[1]); return count; } /// Streaming consumer /// /// This template performs the loop boiler-plate for the developer who can /// simply supply a conforming closure. Characteristics: /// /// * byte-aligned (unaligned): the input buffer does not have to be aligned /// and can be any size. /// /// * variable-stride: progress for each iteration of the loop across the input /// and buffer is not fixed; the transform function may advance the pointer /// one to sizeof(block_t) bytes each iteration. Due to these characteristics, /// unaligned bytes may be redundantly loaded and non-temporal features are /// not used to optimize the operation. /// /// u64x2 counter lanes = { available_to_user, input_length }; The argument /// `max` gives the buffer size in that format. The return value is the /// consumed bytes (final counter value) in that format. The first lane is /// available to the user; its initial value is max[0] (also unused); it is /// then accumulated with the first lane of the closure's return value; its /// final value is returned in [0] of the return value. /// /// Note that the closure must advance the stream one or more bytes each /// iteration; a zero value is available for loop control: the loop will /// break without another iteration. /// template inline typename ircd::simd::stream_half_variable_stride::type ircd::simd::stream(const char *const __restrict__ in, const u64x2 max, lambda&& closure) noexcept { using block_t_u = unaligned; u64x2 count { max[0], // preserved for caller 0, // input pos }; u64x2 consume { 0, -1UL // non-zero to start loop }; // primary broadband loop while(consume[1] && count[1] + sizeof(block_t) <= max[1]) { static const auto mask { ~block_t{0} }; const auto si { reinterpret_cast(in + count[1]) }; const block_t block ( *si ); consume = closure(block, mask); count += consume; } // trailing narrowband loop while(consume[1] && count[1] < max[1]) { block_t block {0}, mask {0}; for(size_t i(0); count[1] + i < max[1] && i < sizeof(block_t); ++i) { block[i] = in[count[1] + i]; mask[i] = 0xff; } consume = closure(block, mask); count += consume; } return u64x2 { count[0], std::min(count[1], max[1]) }; } /// Streaming consumer /// /// This template performs the loop boiler-plate for the developer who can /// simply supply a conforming closure. Characteristics: /// /// * byte-aligned (unaligned): the input buffer does not have to be aligned /// and can be any size. /// /// * fixed-stride: progress for each iteration of the loop across the input /// and buffer is fixed at the block width; the transform function does not /// control the iteration. /// /// u64x2 counter lanes = { available_to_user, input_length }; The argument /// `max` gives the buffer size in that format. The return value is the /// consumed bytes (final counter value) in that format. The first lane is /// available to the user; its initial value is max[0] (also unused). /// template inline typename ircd::simd::stream_half_fixed_stride::type ircd::simd::stream(const char *const __restrict__ in, const u64x2 max, lambda&& closure) noexcept { using block_t_u = unaligned; u64x2 count { max[0], // preserved for caller 0, // input pos }; // primary broadband loop while(count[1] + sizeof(block_t) <= max[1]) { static const u64x2 consume { 0, sizeof(block_t) }; static const auto mask { ~block_t{0} }; const auto si { reinterpret_cast(in + count[1]) }; const block_t block ( *si ); closure(block, mask); count += consume; } // trailing narrowband loop assert(count[1] + sizeof(block_t) > max[1]); if(likely(count[1] < max[1])) { size_t i(0); block_t block {0}, mask {0}; for(; count[1] + i < max[1]; ++i) { block[i] = in[count[1] + i]; mask[i] = 0xff; } closure(block, mask); count += u64x2 // consume remainder { 0, i }; } // return value is pure assert(count[0] == max[0]); assert(count[1] == max[1]); return count; }