0
0
Fork 0
mirror of https://github.com/matrix-construct/construct synced 2025-03-13 21:10:32 +01:00

ircd::simd: Make stream template into enable_if complex w/ closure deductions.

This commit is contained in:
Jason Volk 2020-10-06 19:41:47 -07:00
parent ef715a637d
commit 638efbc0e9
2 changed files with 278 additions and 18 deletions

View file

@ -38,18 +38,12 @@ ircd::simd::accumulate(const char *const __restrict__ in,
lambda&& closure)
noexcept
{
static const u64x2 block_stride
{
max[0], sizeof(block_t)
};
const u64x2 res
{
stream<block_t>(in, max, [&val, &closure]
(const auto block, const auto mask)
{
closure(val, block, mask);
return block_stride;
})
};

View file

@ -11,24 +11,108 @@
#pragma once
#define HAVE_IRCD_SIMD_STREAM_H
// half-duplex fixed stride
namespace ircd::simd
{
/// Transform block_t by pseudo-reference. The closure has an opportunity
/// to modify the block while it is being streamed from the source to the
/// destination. The mask indicates which elements of the block are valid
/// if the input is smaller than the block size. This function returns
/// a pair of integers which advance the output and input positions of the
/// streams for the next iteration.
template<class block_t>
using transform_prototype = u64x2 (block_t &, block_t mask);
using stream_half_fixed_proto = void (block_t, block_t mask);
template<class block_t,
class lambda>
u64x2 stream(const char *, const u64x2, lambda&&) noexcept;
using stream_is_half_fixed_stride = std::is_same
<
std::invoke_result_t<lambda, block_t, block_t>, void
>;
template<class block_t,
class lambda>
u64x2 stream(char *, const char *, const u64x2, lambda&&) noexcept;
using stream_half_fixed_stride = std::enable_if
<
stream_is_half_fixed_stride<block_t, lambda>::value, u64x2
>;
template<class block_t,
class lambda>
typename stream_half_fixed_stride<block_t, lambda>::type
stream(const char *, const u64x2, lambda&&) noexcept;
}
// half-duplex variable stride
namespace ircd::simd
{
template<class block_t>
using stream_half_variable_proto = u64x2 (block_t, block_t mask);
template<class block_t,
class lambda>
using stream_is_half_variable_stride = std::is_same
<
std::invoke_result_t<lambda, block_t, block_t>, u64x2
>;
template<class block_t,
class lambda>
using stream_half_variable_stride = std::enable_if
<
stream_is_half_variable_stride<block_t, lambda>::value, u64x2
>;
template<class block_t,
class lambda>
typename stream_half_variable_stride<block_t, lambda>::type
stream(const char *, const u64x2, lambda&&) noexcept;
}
// full-duplex fixed stride
namespace ircd::simd
{
template<class block_t>
using stream_full_fixed_proto = void (block_t &, block_t mask);
template<class block_t,
class lambda>
using stream_is_full_fixed_stride = std::is_same
<
std::invoke_result_t<lambda, block_t &, block_t>, void
>;
template<class block_t,
class lambda>
using stream_full_fixed_stride = std::enable_if
<
stream_is_full_fixed_stride<block_t, lambda>::value, u64x2
>;
template<class block_t,
class lambda>
typename stream_full_fixed_stride<block_t, lambda>::type
stream(char *, const char *, const u64x2, lambda&&) noexcept;
}
// full-duplex variable stride
namespace ircd::simd
{
template<class block_t>
using stream_full_variable_proto = u64x2 (block_t &, block_t mask);
template<class block_t,
class lambda>
using stream_is_full_variable_stride = std::is_same
<
std::invoke_result_t<lambda, block_t &, block_t>, u64x2
>;
template<class block_t,
class lambda>
using stream_full_variable_stride = std::enable_if
<
stream_is_full_variable_stride<block_t, lambda>::value, u64x2
>;
template<class block_t,
class lambda>
typename stream_full_variable_stride<block_t, lambda>::type
stream(char *, const char *, const u64x2, lambda&&) noexcept;
}
/// Streaming transform
@ -55,7 +139,7 @@ namespace ircd::simd
///
template<class block_t,
class lambda>
inline ircd::u64x2
inline typename ircd::simd::stream_full_variable_stride<block_t, lambda>::type
ircd::simd::stream(char *const __restrict__ out,
const char *const __restrict__ in,
const u64x2 max,
@ -98,8 +182,8 @@ noexcept
closure(block, mask)
};
*di = block;
count += consume;
*di = block;
}
// trailing narrowband loop
@ -130,6 +214,103 @@ noexcept
};
}
/// Streaming transform
///
/// This template performs the loop boiler-plate for the developer who can
/// simply supply a conforming closure. Characteristics:
///
/// * byte-aligned (unaligned): the input and output buffers do not have to
/// be aligned and can be any size.
///
/// * full-duplex: the operation involves both input and output and there are
/// separate pointers for progress across the input and output buffers which
/// are incremented independently.
///
/// * fixed-stride: progress for each iteration of the loop across the input
/// and output buffers is fixed.
///
/// u64x2 counter lanes = { output_length, input_length }; The argument `max`
/// gives the buffer size in that format. The return value is the consumed
/// bytes (final counter value) in that format.
///
template<class block_t,
class lambda>
inline typename ircd::simd::stream_full_fixed_stride<block_t, lambda>::type
ircd::simd::stream(char *const __restrict__ out,
const char *const __restrict__ in,
const u64x2 max,
lambda&& closure)
noexcept
{
using block_t_u = unaligned<block_t>;
u64x2 count
{
0, // output pos
0, // input pos
};
// primary broadband loop
while(count[1] + sizeof(block_t) <= max[1] && count[0] + sizeof(block_t) <= max[0])
{
static const u64x2 consume
{
sizeof(block_t),
sizeof(block_t),
};
static const auto mask
{
~block_t{0}
};
const auto di
{
reinterpret_cast<block_t_u *>(out + count[0])
};
const auto si
{
reinterpret_cast<const block_t_u *>(in + count[1])
};
block_t block
(
*si
);
closure(block, mask);
count += consume;
*di = block;
}
// trailing narrowband loop
assert(count[1] + sizeof(block_t) > max[1]);
if(likely(count[1] < max[1]))
{
u64 i[2] {0};
block_t block {0}, mask {0};
for(; count[1] + i[1] < max[1]; ++i[1])
{
block[i[1]] = in[count[1] + i[1]];
mask[i[1]] = 0xff;
}
closure(block, mask);
for(; i[0] < i[1] && count[0] + i[0] < max[0]; ++i[0])
out[count[0] + i[0]] = block[i[0]];
count += u64x2
{
i[0], i[1]
};
}
assert(count[0] == max[0]);
assert(count[1] == max[1]);
return count;
}
/// Streaming consumer
///
/// This template performs the loop boiler-plate for the developer who can
@ -157,7 +338,7 @@ noexcept
///
template<class block_t,
class lambda>
inline ircd::u64x2
inline typename ircd::simd::stream_half_variable_stride<block_t, lambda>::type
ircd::simd::stream(const char *const __restrict__ in,
const u64x2 max,
lambda&& closure)
@ -219,3 +400,88 @@ noexcept
std::min(count[1], max[1])
};
}
/// Streaming consumer
///
/// This template performs the loop boiler-plate for the developer who can
/// simply supply a conforming closure. Characteristics:
///
/// * byte-aligned (unaligned): the input buffer does not have to be aligned
/// and can be any size.
///
/// * fixed-stride: progress for each iteration of the loop across the input
/// and buffer is fixed at the block width; the transform function does not
/// control the iteration.
///
/// u64x2 counter lanes = { available_to_user, input_length }; The argument
/// `max` gives the buffer size in that format. The return value is the
/// consumed bytes (final counter value) in that format. The first lane is
/// available to the user; its initial value is max[0] (also unused).
///
template<class block_t,
class lambda>
inline typename ircd::simd::stream_half_fixed_stride<block_t, lambda>::type
ircd::simd::stream(const char *const __restrict__ in,
const u64x2 max,
lambda&& closure)
noexcept
{
using block_t_u = unaligned<block_t>;
u64x2 count
{
max[0], // preserved for caller
0, // input pos
};
// primary broadband loop
while(count[1] + sizeof(block_t) <= max[1])
{
static const u64x2 consume
{
0, sizeof(block_t)
};
static const auto mask
{
~block_t{0}
};
const auto si
{
reinterpret_cast<const block_t_u *>(in + count[1])
};
const block_t block
(
*si
);
closure(block, mask);
count += consume;
}
// trailing narrowband loop
assert(count[1] + sizeof(block_t) > max[1]);
if(likely(count[1] < max[1]))
{
size_t i(0);
block_t block {0}, mask {0};
for(; count[1] + i < max[1]; ++i)
{
block[i] = in[count[1] + i];
mask[i] = 0xff;
}
closure(block, mask);
count += u64x2 // consume remainder
{
0, i
};
}
// return value is pure
assert(count[0] == max[0]);
assert(count[1] == max[1]);
return count;
}