ircd::b64: Optimize outer loop vectorized codegen. (clang/AVX512)

This commit is contained in:
Jason Volk 2023-03-26 14:16:38 -07:00
parent 38e77c64c6
commit d4b3a0db66
2 changed files with 60 additions and 57 deletions

View File

@ -44,8 +44,8 @@ namespace ircd::b64
size_t encode_unpadded_size(const const_buffer &in) noexcept;
const_buffer decode(const mutable_buffer out, const string_view in);
string_view encode_unpadded(const mutable_buffer out, const const_buffer in, const dictionary & = dict_rfc1421) noexcept;
string_view encode(const mutable_buffer out, const const_buffer in, const dictionary & = dict_rfc1421) noexcept;
string_view encode_unpadded(const mutable_buffer out, const const_buffer in, const dictionary = dict_rfc1421) noexcept;
string_view encode(const mutable_buffer out, const const_buffer in, const dictionary = dict_rfc1421) noexcept;
}
inline size_t

View File

@ -29,7 +29,7 @@ namespace ircd::b64
static u8x64 decode_block(const u8x64 block, i64x8 &__restrict__ err) noexcept;
[[IRCD_CLONES(IRCD_B64_TARGETS)]]
static u8x64 encode_block(const u8x64 block, const dictionary &) noexcept;
static u8x64 encode_block(const u8x64 block, const dictionary) noexcept;
}
#pragma GCC visibility pop
@ -157,7 +157,7 @@ alignas(64)
ircd::string_view
ircd::b64::encode(const mutable_buffer out,
const const_buffer in,
const dictionary &dict)
const dictionary dict)
noexcept
{
const auto pads
@ -193,19 +193,9 @@ noexcept
ircd::string_view
ircd::b64::encode_unpadded(const mutable_buffer out,
const const_buffer in,
const dictionary &dict)
const dictionary dict)
noexcept
{
char *const __restrict__ dst
{
data(out)
};
const char *const __restrict__ src
{
data(in)
};
const size_t res_len
{
encode_unpadded_size(in)
@ -216,38 +206,49 @@ noexcept
std::min(res_len, size(out))
};
u8x64 block {0};
size_t i(0), j(0);
for(; i < size(in) / 48 && i < out_len / 64; ++i)
uint i;
for(i = 0; i < size(in) / 48 && i < out_len / 64; ++i)
{
// Destination is indexed at 64 byte stride
const auto di
u512x1_u *const __restrict__ dx
{
reinterpret_cast<u512x1_u *__restrict__>(dst + (i * 64))
reinterpret_cast<u512x1_u *>(data(out))
};
// Source is indexed at 48 byte stride
const auto si
const auto *const __restrict__ si
{
reinterpret_cast<const u512x1_u *__restrict__>(src + (i * 48))
data(in) + i * 48
};
block = *si;
u8x64 block {0};
#pragma clang loop vectorize(enable) unroll(full)
for(uint j(0); j < 48; ++j)
block[j] = si[j];
block = encode_block(block, dict);
*di = block;
dx[i] = block;
}
for(; i * 48 < size(in) && i * 64 < out_len; ++i)
{
#if !defined(__AVX__)
#pragma clang loop unroll_count(2)
#endif
for(j = 0; j < 48 && i * 48 + j < size(in); ++j)
block[j] = src[i * 48 + j];
auto *const __restrict__ di
{
data(out) + i * 64
};
const auto *const __restrict__ si
{
data(in) + i * 48
};
u8x64 block {0};
for(uint j(0); j < 48 && i * 48 + j < size(in); ++j)
block[j] = si[j];
block = encode_block(block, dict);
for(j = 0; j < 64 && i * 64 + j < out_len; ++j)
dst[i * 64 + j] = block[j];
for(uint j(0); j < 64 && i * 64 + j < out_len; ++j)
di[j] = block[j];
}
return string_view
@ -270,7 +271,7 @@ noexcept
[[IRCD_CLONES(IRCD_B64_TARGETS)]]
ircd::u8x64
ircd::b64::encode_block(const u8x64 in,
const dictionary &dict)
const dictionary dict)
noexcept
{
size_t i, j, k;
@ -319,16 +320,6 @@ ircd::const_buffer
ircd::b64::decode(const mutable_buffer out,
const string_view in)
{
char *const __restrict__ dst
{
data(out)
};
const char *const __restrict__ src
{
data(in)
};
const size_t pads
{
endswith_count(in, '=')
@ -344,39 +335,51 @@ ircd::b64::decode(const mutable_buffer out,
std::min(decode_size(in_len), size(out))
};
uint i;
i64x8 err {0};
u8x64 block {0};
size_t i(0), j(0);
for(; i < in_len / 64 && i < out_len / 48; ++i)
for(i = 0; i < in_len / 64 && i < out_len / 48; ++i)
{
// Destination is indexed at 48 byte stride
const auto di
auto *const __restrict__ di
{
reinterpret_cast<u512x1_u *__restrict__>(dst + (i * 48))
data(out) + i * 48
};
// Source is indexed at 64 byte stride
const auto si
const u512x1_u *const __restrict__ sx
{
reinterpret_cast<const u512x1_u *__restrict__>(src + (i * 64))
reinterpret_cast<const u512x1_u *>(data(in))
};
block = *si;
u8x64 block;
block = sx[i];
block = decode_block(block, err);
*di = block;
#pragma clang loop vectorize(enable) unroll(full)
for(uint j(0); j < 48; ++j)
di[j] = block[j];
}
for(; i * 64 < in_len && i * 48 < out_len; ++i)
{
u8x64 mask {0};
for(j = 0; j < 64 && i * 64 + j < in_len; ++j)
block[j] = src[i * 64 + j],
auto *const __restrict__ di
{
data(out) + i * 48
};
const auto *const __restrict__ si
{
data(in) + i * 64
};
u8x64 block {0}, mask {0};
for(uint j(0); j < 64 && i * 64 + j < in_len; ++j)
block[j] = si[j],
mask[j] = 0xff;
i64x8 _err {0};
block = decode_block(block, _err);
for(j = 0; j < 48 && i * 48 + j < out_len; ++j)
dst[i * 48 + j] = block[j];
for(uint j(0); j < 48 && i * 48 + j < out_len; ++j)
di[j] = block[j];
err |= _err & i64x8(mask);
}
@ -438,7 +441,7 @@ noexcept
u8x64 c(b), ret;
#pragma clang loop vectorize(enable) unroll(full)
for(i = 0; i < 64; ++i)
for(i = 0; i < 48; ++i)
ret[i] = c[decode_permute_tab_le[i]];
err |= i64x8(_err);