0
0
Fork 0
mirror of https://github.com/matrix-construct/construct synced 2024-05-20 03:43:47 +02:00

ircd::b64: Optimize outer loop vectorized codegen. (clang/AVX512)

This commit is contained in:
Jason Volk 2023-03-26 14:16:38 -07:00
parent 38e77c64c6
commit d4b3a0db66
2 changed files with 60 additions and 57 deletions

View file

@ -44,8 +44,8 @@ namespace ircd::b64
size_t encode_unpadded_size(const const_buffer &in) noexcept; size_t encode_unpadded_size(const const_buffer &in) noexcept;
const_buffer decode(const mutable_buffer out, const string_view in); const_buffer decode(const mutable_buffer out, const string_view in);
string_view encode_unpadded(const mutable_buffer out, const const_buffer in, const dictionary & = dict_rfc1421) noexcept; string_view encode_unpadded(const mutable_buffer out, const const_buffer in, const dictionary = dict_rfc1421) noexcept;
string_view encode(const mutable_buffer out, const const_buffer in, const dictionary & = dict_rfc1421) noexcept; string_view encode(const mutable_buffer out, const const_buffer in, const dictionary = dict_rfc1421) noexcept;
} }
inline size_t inline size_t

View file

@ -29,7 +29,7 @@ namespace ircd::b64
static u8x64 decode_block(const u8x64 block, i64x8 &__restrict__ err) noexcept; static u8x64 decode_block(const u8x64 block, i64x8 &__restrict__ err) noexcept;
[[IRCD_CLONES(IRCD_B64_TARGETS)]] [[IRCD_CLONES(IRCD_B64_TARGETS)]]
static u8x64 encode_block(const u8x64 block, const dictionary &) noexcept; static u8x64 encode_block(const u8x64 block, const dictionary) noexcept;
} }
#pragma GCC visibility pop #pragma GCC visibility pop
@ -157,7 +157,7 @@ alignas(64)
ircd::string_view ircd::string_view
ircd::b64::encode(const mutable_buffer out, ircd::b64::encode(const mutable_buffer out,
const const_buffer in, const const_buffer in,
const dictionary &dict) const dictionary dict)
noexcept noexcept
{ {
const auto pads const auto pads
@ -193,19 +193,9 @@ noexcept
ircd::string_view ircd::string_view
ircd::b64::encode_unpadded(const mutable_buffer out, ircd::b64::encode_unpadded(const mutable_buffer out,
const const_buffer in, const const_buffer in,
const dictionary &dict) const dictionary dict)
noexcept noexcept
{ {
char *const __restrict__ dst
{
data(out)
};
const char *const __restrict__ src
{
data(in)
};
const size_t res_len const size_t res_len
{ {
encode_unpadded_size(in) encode_unpadded_size(in)
@ -216,38 +206,49 @@ noexcept
std::min(res_len, size(out)) std::min(res_len, size(out))
}; };
u8x64 block {0}; uint i;
size_t i(0), j(0); for(i = 0; i < size(in) / 48 && i < out_len / 64; ++i)
for(; i < size(in) / 48 && i < out_len / 64; ++i)
{ {
// Destination is indexed at 64 byte stride // Destination is indexed at 64 byte stride
const auto di u512x1_u *const __restrict__ dx
{ {
reinterpret_cast<u512x1_u *__restrict__>(dst + (i * 64)) reinterpret_cast<u512x1_u *>(data(out))
}; };
// Source is indexed at 48 byte stride // Source is indexed at 48 byte stride
const auto si const auto *const __restrict__ si
{ {
reinterpret_cast<const u512x1_u *__restrict__>(src + (i * 48)) data(in) + i * 48
}; };
block = *si; u8x64 block {0};
#pragma clang loop vectorize(enable) unroll(full)
for(uint j(0); j < 48; ++j)
block[j] = si[j];
block = encode_block(block, dict); block = encode_block(block, dict);
*di = block; dx[i] = block;
} }
for(; i * 48 < size(in) && i * 64 < out_len; ++i) for(; i * 48 < size(in) && i * 64 < out_len; ++i)
{ {
#if !defined(__AVX__) auto *const __restrict__ di
#pragma clang loop unroll_count(2) {
#endif data(out) + i * 64
for(j = 0; j < 48 && i * 48 + j < size(in); ++j) };
block[j] = src[i * 48 + j];
const auto *const __restrict__ si
{
data(in) + i * 48
};
u8x64 block {0};
for(uint j(0); j < 48 && i * 48 + j < size(in); ++j)
block[j] = si[j];
block = encode_block(block, dict); block = encode_block(block, dict);
for(j = 0; j < 64 && i * 64 + j < out_len; ++j) for(uint j(0); j < 64 && i * 64 + j < out_len; ++j)
dst[i * 64 + j] = block[j]; di[j] = block[j];
} }
return string_view return string_view
@ -270,7 +271,7 @@ noexcept
[[IRCD_CLONES(IRCD_B64_TARGETS)]] [[IRCD_CLONES(IRCD_B64_TARGETS)]]
ircd::u8x64 ircd::u8x64
ircd::b64::encode_block(const u8x64 in, ircd::b64::encode_block(const u8x64 in,
const dictionary &dict) const dictionary dict)
noexcept noexcept
{ {
size_t i, j, k; size_t i, j, k;
@ -319,16 +320,6 @@ ircd::const_buffer
ircd::b64::decode(const mutable_buffer out, ircd::b64::decode(const mutable_buffer out,
const string_view in) const string_view in)
{ {
char *const __restrict__ dst
{
data(out)
};
const char *const __restrict__ src
{
data(in)
};
const size_t pads const size_t pads
{ {
endswith_count(in, '=') endswith_count(in, '=')
@ -344,39 +335,51 @@ ircd::b64::decode(const mutable_buffer out,
std::min(decode_size(in_len), size(out)) std::min(decode_size(in_len), size(out))
}; };
uint i;
i64x8 err {0}; i64x8 err {0};
u8x64 block {0}; for(i = 0; i < in_len / 64 && i < out_len / 48; ++i)
size_t i(0), j(0);
for(; i < in_len / 64 && i < out_len / 48; ++i)
{ {
// Destination is indexed at 48 byte stride // Destination is indexed at 48 byte stride
const auto di auto *const __restrict__ di
{ {
reinterpret_cast<u512x1_u *__restrict__>(dst + (i * 48)) data(out) + i * 48
}; };
// Source is indexed at 64 byte stride // Source is indexed at 64 byte stride
const auto si const u512x1_u *const __restrict__ sx
{ {
reinterpret_cast<const u512x1_u *__restrict__>(src + (i * 64)) reinterpret_cast<const u512x1_u *>(data(in))
}; };
block = *si; u8x64 block;
block = sx[i];
block = decode_block(block, err); block = decode_block(block, err);
*di = block; #pragma clang loop vectorize(enable) unroll(full)
for(uint j(0); j < 48; ++j)
di[j] = block[j];
} }
for(; i * 64 < in_len && i * 48 < out_len; ++i) for(; i * 64 < in_len && i * 48 < out_len; ++i)
{ {
u8x64 mask {0}; auto *const __restrict__ di
for(j = 0; j < 64 && i * 64 + j < in_len; ++j) {
block[j] = src[i * 64 + j], data(out) + i * 48
};
const auto *const __restrict__ si
{
data(in) + i * 64
};
u8x64 block {0}, mask {0};
for(uint j(0); j < 64 && i * 64 + j < in_len; ++j)
block[j] = si[j],
mask[j] = 0xff; mask[j] = 0xff;
i64x8 _err {0}; i64x8 _err {0};
block = decode_block(block, _err); block = decode_block(block, _err);
for(j = 0; j < 48 && i * 48 + j < out_len; ++j) for(uint j(0); j < 48 && i * 48 + j < out_len; ++j)
dst[i * 48 + j] = block[j]; di[j] = block[j];
err |= _err & i64x8(mask); err |= _err & i64x8(mask);
} }
@ -438,7 +441,7 @@ noexcept
u8x64 c(b), ret; u8x64 c(b), ret;
#pragma clang loop vectorize(enable) unroll(full) #pragma clang loop vectorize(enable) unroll(full)
for(i = 0; i < 64; ++i) for(i = 0; i < 48; ++i)
ret[i] = c[decode_permute_tab_le[i]]; ret[i] = c[decode_permute_tab_le[i]];
err |= i64x8(_err); err |= i64x8(_err);