mirror of
https://github.com/matrix-construct/construct
synced 2024-10-31 10:58:54 +01:00
292 lines
6.8 KiB
C
292 lines
6.8 KiB
C
|
// The Construct
|
||
|
//
|
||
|
// Copyright (C) The Construct Developers, Authors & Contributors
|
||
|
// Copyright (C) 2016-2020 Jason Volk <jason@zemos.net>
|
||
|
//
|
||
|
// Permission to use, copy, modify, and/or distribute this software for any
|
||
|
// purpose with or without fee is hereby granted, provided that the above
|
||
|
// copyright notice and this permission notice is present in all copies. The
|
||
|
// full license for this software is available in the LICENSE file.
|
||
|
|
||
|
#pragma once
|
||
|
#define HAVE_IRCD_SIMD_ISTREAM_H
|
||
|
|
||
|
namespace ircd::simd
|
||
|
{
|
||
|
template<class block_t>
|
||
|
using istream_fixed_proto = void (block_t, block_t mask);
|
||
|
|
||
|
template<class block_t>
|
||
|
using istream_variable_proto = u64x2 (block_t, block_t mask);
|
||
|
|
||
|
template<class block_t,
|
||
|
class lambda>
|
||
|
using istream_is_fixed_stride = std::is_same
|
||
|
<
|
||
|
std::invoke_result_t<lambda, block_t, block_t>, void
|
||
|
>;
|
||
|
|
||
|
template<class block_t,
|
||
|
class lambda>
|
||
|
using istream_is_variable_stride = std::is_same
|
||
|
<
|
||
|
std::invoke_result_t<lambda, block_t, block_t>, u64x2
|
||
|
>;
|
||
|
|
||
|
template<class block_t,
|
||
|
class lambda>
|
||
|
using istream_fixed_stride = std::enable_if
|
||
|
<
|
||
|
istream_is_fixed_stride<block_t, lambda>::value, u64x2
|
||
|
>;
|
||
|
|
||
|
template<class block_t,
|
||
|
class lambda>
|
||
|
using istream_variable_stride = std::enable_if
|
||
|
<
|
||
|
istream_is_variable_stride<block_t, lambda>::value, u64x2
|
||
|
>;
|
||
|
|
||
|
template<class block_t,
|
||
|
class lambda>
|
||
|
typename istream_fixed_stride<block_t, lambda>::type
|
||
|
stream(const block_t *, const u64x2, lambda&&) noexcept;
|
||
|
|
||
|
template<class block_t,
|
||
|
class lambda>
|
||
|
typename istream_fixed_stride<block_t, lambda>::type
|
||
|
stream(const char *, const u64x2, lambda&&) noexcept;
|
||
|
|
||
|
template<class block_t,
|
||
|
class lambda>
|
||
|
typename istream_variable_stride<block_t, lambda>::type
|
||
|
stream(const char *, const u64x2, lambda&&) noexcept;
|
||
|
}
|
||
|
|
||
|
/// Streaming consumer
|
||
|
///
|
||
|
/// This template performs the loop boiler-plate for the developer who can
|
||
|
/// simply supply a conforming closure. Characteristics:
|
||
|
///
|
||
|
/// * byte-aligned (unaligned): the input buffer does not have to be aligned
|
||
|
/// and can be any size.
|
||
|
///
|
||
|
/// * variable-stride: progress for each iteration of the loop across the input
|
||
|
/// and buffer is not fixed; the transform function may advance the pointer
|
||
|
/// one to sizeof(block_t) bytes each iteration. Due to these characteristics,
|
||
|
/// unaligned bytes may be redundantly loaded and non-temporal features are
|
||
|
/// not used to optimize the operation.
|
||
|
///
|
||
|
/// u64x2 counter lanes = { available_to_user, input_length }; The argument
|
||
|
/// `max` gives the buffer size in that format. The return value is the
|
||
|
/// consumed bytes (final counter value) in that format. The first lane is
|
||
|
/// available to the user; its initial value is max[0] (also unused); it is
|
||
|
/// then accumulated with the first lane of the closure's return value; its
|
||
|
/// final value is returned in [0] of the return value.
|
||
|
///
|
||
|
/// Note that the closure must advance the stream one or more bytes each
|
||
|
/// iteration; a zero value is available for loop control: the loop will
|
||
|
/// break without another iteration.
|
||
|
///
|
||
|
template<class block_t,
|
||
|
class lambda>
|
||
|
inline typename ircd::simd::istream_variable_stride<block_t, lambda>::type
|
||
|
ircd::simd::stream(const char *const __restrict__ in,
|
||
|
const u64x2 max,
|
||
|
lambda&& closure)
|
||
|
noexcept
|
||
|
{
|
||
|
using block_t_u = unaligned<block_t>;
|
||
|
|
||
|
u64x2 count
|
||
|
{
|
||
|
max[0], // preserved for caller
|
||
|
0, // input pos
|
||
|
};
|
||
|
|
||
|
u64x2 consume
|
||
|
{
|
||
|
0,
|
||
|
-1UL // non-zero to start loop
|
||
|
};
|
||
|
|
||
|
// primary broadband loop
|
||
|
while(consume[1] && count[1] + sizeof(block_t) <= max[1])
|
||
|
{
|
||
|
static const auto mask
|
||
|
{
|
||
|
~block_t{0}
|
||
|
};
|
||
|
|
||
|
const auto si
|
||
|
{
|
||
|
reinterpret_cast<const block_t_u *>(in + count[1])
|
||
|
};
|
||
|
|
||
|
const block_t block
|
||
|
(
|
||
|
*si
|
||
|
);
|
||
|
|
||
|
consume = closure(block, mask);
|
||
|
count += consume;
|
||
|
}
|
||
|
|
||
|
// trailing narrowband loop
|
||
|
while(consume[1] && count[1] < max[1])
|
||
|
{
|
||
|
block_t block {0}, mask {0};
|
||
|
for(size_t i(0); count[1] + i < max[1] && i < sizeof(block_t); ++i)
|
||
|
{
|
||
|
block[i] = in[count[1] + i];
|
||
|
mask[i] = 0xff;
|
||
|
}
|
||
|
|
||
|
consume = closure(block, mask);
|
||
|
count += consume;
|
||
|
}
|
||
|
|
||
|
return u64x2
|
||
|
{
|
||
|
count[0],
|
||
|
std::min(count[1], max[1])
|
||
|
};
|
||
|
}
|
||
|
|
||
|
/// Streaming consumer
|
||
|
///
|
||
|
/// This template performs the loop boiler-plate for the developer who can
|
||
|
/// simply supply a conforming closure. Characteristics:
|
||
|
///
|
||
|
/// * byte-aligned (unaligned): the input buffer does not have to be aligned
|
||
|
/// and can be any size.
|
||
|
///
|
||
|
/// * fixed-stride: progress for each iteration of the loop across the input
|
||
|
/// and buffer is fixed at the block width; the transform function does not
|
||
|
/// control the iteration.
|
||
|
///
|
||
|
/// u64x2 counter lanes = { available_to_user, input_length }; The argument
|
||
|
/// `max` gives the buffer size in that format. The return value is the
|
||
|
/// consumed bytes (final counter value) in that format. The first lane is
|
||
|
/// available to the user; its initial value is max[0] (also unused).
|
||
|
///
|
||
|
template<class block_t,
|
||
|
class lambda>
|
||
|
inline typename ircd::simd::istream_fixed_stride<block_t, lambda>::type
|
||
|
ircd::simd::stream(const char *const __restrict__ in,
|
||
|
const u64x2 max,
|
||
|
lambda&& closure)
|
||
|
noexcept
|
||
|
{
|
||
|
using block_t_u = unaligned<block_t>;
|
||
|
|
||
|
u64x2 count
|
||
|
{
|
||
|
max[0], // preserved for caller
|
||
|
0, // input pos
|
||
|
};
|
||
|
|
||
|
// primary broadband loop
|
||
|
while(count[1] + sizeof(block_t) <= max[1])
|
||
|
{
|
||
|
static const u64x2 consume
|
||
|
{
|
||
|
0, sizeof(block_t)
|
||
|
};
|
||
|
|
||
|
static const auto mask
|
||
|
{
|
||
|
~block_t{0}
|
||
|
};
|
||
|
|
||
|
const auto si
|
||
|
{
|
||
|
reinterpret_cast<const block_t_u *>(in + count[1])
|
||
|
};
|
||
|
|
||
|
const block_t block
|
||
|
(
|
||
|
*si
|
||
|
);
|
||
|
|
||
|
closure(block, mask);
|
||
|
count += consume;
|
||
|
}
|
||
|
|
||
|
// trailing narrowband loop
|
||
|
assert(count[1] + sizeof(block_t) > max[1]);
|
||
|
if(likely(count[1] < max[1]))
|
||
|
{
|
||
|
size_t i(0);
|
||
|
block_t block {0}, mask {0};
|
||
|
for(; count[1] + i < max[1]; ++i)
|
||
|
{
|
||
|
block[i] = in[count[1] + i];
|
||
|
mask[i] = 0xff;
|
||
|
}
|
||
|
|
||
|
closure(block, mask);
|
||
|
count += u64x2 // consume remainder
|
||
|
{
|
||
|
0, i
|
||
|
};
|
||
|
}
|
||
|
|
||
|
assert(count[0] == max[0]);
|
||
|
return count;
|
||
|
}
|
||
|
|
||
|
/// Streaming consumer
|
||
|
///
|
||
|
/// This template performs the loop boiler-plate for the developer who can
|
||
|
/// simply supply a conforming closure. Characteristics:
|
||
|
///
|
||
|
/// * block-aligned
|
||
|
/// * fixed-stride
|
||
|
////
|
||
|
template<class block_t,
|
||
|
class lambda>
|
||
|
inline typename ircd::simd::istream_fixed_stride<block_t, lambda>::type
|
||
|
ircd::simd::stream(const block_t *const __restrict__ in,
|
||
|
const u64x2 max,
|
||
|
lambda&& closure)
|
||
|
noexcept
|
||
|
{
|
||
|
u64x2 count
|
||
|
{
|
||
|
max[0], // preserved for caller
|
||
|
0, // input pos
|
||
|
};
|
||
|
|
||
|
// primary broadband loop
|
||
|
while(count[1] < max[1])
|
||
|
{
|
||
|
static const u64x2 consume
|
||
|
{
|
||
|
0, sizeof(block_t)
|
||
|
};
|
||
|
|
||
|
static const auto mask
|
||
|
{
|
||
|
~block_t{0}
|
||
|
};
|
||
|
|
||
|
const auto si
|
||
|
{
|
||
|
in + count[1] / sizeof(block_t)
|
||
|
};
|
||
|
|
||
|
const block_t block
|
||
|
(
|
||
|
*si
|
||
|
);
|
||
|
|
||
|
closure(block, mask);
|
||
|
count += consume;
|
||
|
}
|
||
|
|
||
|
assert(count[1] + sizeof(block_t) > max[1]);
|
||
|
assert(count[0] == max[0]);
|
||
|
return count;
|
||
|
}
|