mirror of
https://github.com/matrix-construct/construct
synced 2024-05-20 03:43:47 +02:00
ircd::b64: Optimize outer loop vectorized codegen. (clang/AVX512)
This commit is contained in:
parent
38e77c64c6
commit
d4b3a0db66
|
@ -44,8 +44,8 @@ namespace ircd::b64
|
||||||
size_t encode_unpadded_size(const const_buffer &in) noexcept;
|
size_t encode_unpadded_size(const const_buffer &in) noexcept;
|
||||||
|
|
||||||
const_buffer decode(const mutable_buffer out, const string_view in);
|
const_buffer decode(const mutable_buffer out, const string_view in);
|
||||||
string_view encode_unpadded(const mutable_buffer out, const const_buffer in, const dictionary & = dict_rfc1421) noexcept;
|
string_view encode_unpadded(const mutable_buffer out, const const_buffer in, const dictionary = dict_rfc1421) noexcept;
|
||||||
string_view encode(const mutable_buffer out, const const_buffer in, const dictionary & = dict_rfc1421) noexcept;
|
string_view encode(const mutable_buffer out, const const_buffer in, const dictionary = dict_rfc1421) noexcept;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline size_t
|
inline size_t
|
||||||
|
|
113
ircd/b64.cc
113
ircd/b64.cc
|
@ -29,7 +29,7 @@ namespace ircd::b64
|
||||||
static u8x64 decode_block(const u8x64 block, i64x8 &__restrict__ err) noexcept;
|
static u8x64 decode_block(const u8x64 block, i64x8 &__restrict__ err) noexcept;
|
||||||
|
|
||||||
[[IRCD_CLONES(IRCD_B64_TARGETS)]]
|
[[IRCD_CLONES(IRCD_B64_TARGETS)]]
|
||||||
static u8x64 encode_block(const u8x64 block, const dictionary &) noexcept;
|
static u8x64 encode_block(const u8x64 block, const dictionary) noexcept;
|
||||||
}
|
}
|
||||||
#pragma GCC visibility pop
|
#pragma GCC visibility pop
|
||||||
|
|
||||||
|
@ -157,7 +157,7 @@ alignas(64)
|
||||||
ircd::string_view
|
ircd::string_view
|
||||||
ircd::b64::encode(const mutable_buffer out,
|
ircd::b64::encode(const mutable_buffer out,
|
||||||
const const_buffer in,
|
const const_buffer in,
|
||||||
const dictionary &dict)
|
const dictionary dict)
|
||||||
noexcept
|
noexcept
|
||||||
{
|
{
|
||||||
const auto pads
|
const auto pads
|
||||||
|
@ -193,19 +193,9 @@ noexcept
|
||||||
ircd::string_view
|
ircd::string_view
|
||||||
ircd::b64::encode_unpadded(const mutable_buffer out,
|
ircd::b64::encode_unpadded(const mutable_buffer out,
|
||||||
const const_buffer in,
|
const const_buffer in,
|
||||||
const dictionary &dict)
|
const dictionary dict)
|
||||||
noexcept
|
noexcept
|
||||||
{
|
{
|
||||||
char *const __restrict__ dst
|
|
||||||
{
|
|
||||||
data(out)
|
|
||||||
};
|
|
||||||
|
|
||||||
const char *const __restrict__ src
|
|
||||||
{
|
|
||||||
data(in)
|
|
||||||
};
|
|
||||||
|
|
||||||
const size_t res_len
|
const size_t res_len
|
||||||
{
|
{
|
||||||
encode_unpadded_size(in)
|
encode_unpadded_size(in)
|
||||||
|
@ -216,38 +206,49 @@ noexcept
|
||||||
std::min(res_len, size(out))
|
std::min(res_len, size(out))
|
||||||
};
|
};
|
||||||
|
|
||||||
u8x64 block {0};
|
uint i;
|
||||||
size_t i(0), j(0);
|
for(i = 0; i < size(in) / 48 && i < out_len / 64; ++i)
|
||||||
for(; i < size(in) / 48 && i < out_len / 64; ++i)
|
|
||||||
{
|
{
|
||||||
// Destination is indexed at 64 byte stride
|
// Destination is indexed at 64 byte stride
|
||||||
const auto di
|
u512x1_u *const __restrict__ dx
|
||||||
{
|
{
|
||||||
reinterpret_cast<u512x1_u *__restrict__>(dst + (i * 64))
|
reinterpret_cast<u512x1_u *>(data(out))
|
||||||
};
|
};
|
||||||
|
|
||||||
// Source is indexed at 48 byte stride
|
// Source is indexed at 48 byte stride
|
||||||
const auto si
|
const auto *const __restrict__ si
|
||||||
{
|
{
|
||||||
reinterpret_cast<const u512x1_u *__restrict__>(src + (i * 48))
|
data(in) + i * 48
|
||||||
};
|
};
|
||||||
|
|
||||||
block = *si;
|
u8x64 block {0};
|
||||||
|
#pragma clang loop vectorize(enable) unroll(full)
|
||||||
|
for(uint j(0); j < 48; ++j)
|
||||||
|
block[j] = si[j];
|
||||||
|
|
||||||
block = encode_block(block, dict);
|
block = encode_block(block, dict);
|
||||||
*di = block;
|
dx[i] = block;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(; i * 48 < size(in) && i * 64 < out_len; ++i)
|
for(; i * 48 < size(in) && i * 64 < out_len; ++i)
|
||||||
{
|
{
|
||||||
#if !defined(__AVX__)
|
auto *const __restrict__ di
|
||||||
#pragma clang loop unroll_count(2)
|
{
|
||||||
#endif
|
data(out) + i * 64
|
||||||
for(j = 0; j < 48 && i * 48 + j < size(in); ++j)
|
};
|
||||||
block[j] = src[i * 48 + j];
|
|
||||||
|
const auto *const __restrict__ si
|
||||||
|
{
|
||||||
|
data(in) + i * 48
|
||||||
|
};
|
||||||
|
|
||||||
|
u8x64 block {0};
|
||||||
|
for(uint j(0); j < 48 && i * 48 + j < size(in); ++j)
|
||||||
|
block[j] = si[j];
|
||||||
|
|
||||||
block = encode_block(block, dict);
|
block = encode_block(block, dict);
|
||||||
for(j = 0; j < 64 && i * 64 + j < out_len; ++j)
|
for(uint j(0); j < 64 && i * 64 + j < out_len; ++j)
|
||||||
dst[i * 64 + j] = block[j];
|
di[j] = block[j];
|
||||||
}
|
}
|
||||||
|
|
||||||
return string_view
|
return string_view
|
||||||
|
@ -270,7 +271,7 @@ noexcept
|
||||||
[[IRCD_CLONES(IRCD_B64_TARGETS)]]
|
[[IRCD_CLONES(IRCD_B64_TARGETS)]]
|
||||||
ircd::u8x64
|
ircd::u8x64
|
||||||
ircd::b64::encode_block(const u8x64 in,
|
ircd::b64::encode_block(const u8x64 in,
|
||||||
const dictionary &dict)
|
const dictionary dict)
|
||||||
noexcept
|
noexcept
|
||||||
{
|
{
|
||||||
size_t i, j, k;
|
size_t i, j, k;
|
||||||
|
@ -319,16 +320,6 @@ ircd::const_buffer
|
||||||
ircd::b64::decode(const mutable_buffer out,
|
ircd::b64::decode(const mutable_buffer out,
|
||||||
const string_view in)
|
const string_view in)
|
||||||
{
|
{
|
||||||
char *const __restrict__ dst
|
|
||||||
{
|
|
||||||
data(out)
|
|
||||||
};
|
|
||||||
|
|
||||||
const char *const __restrict__ src
|
|
||||||
{
|
|
||||||
data(in)
|
|
||||||
};
|
|
||||||
|
|
||||||
const size_t pads
|
const size_t pads
|
||||||
{
|
{
|
||||||
endswith_count(in, '=')
|
endswith_count(in, '=')
|
||||||
|
@ -344,39 +335,51 @@ ircd::b64::decode(const mutable_buffer out,
|
||||||
std::min(decode_size(in_len), size(out))
|
std::min(decode_size(in_len), size(out))
|
||||||
};
|
};
|
||||||
|
|
||||||
|
uint i;
|
||||||
i64x8 err {0};
|
i64x8 err {0};
|
||||||
u8x64 block {0};
|
for(i = 0; i < in_len / 64 && i < out_len / 48; ++i)
|
||||||
size_t i(0), j(0);
|
|
||||||
for(; i < in_len / 64 && i < out_len / 48; ++i)
|
|
||||||
{
|
{
|
||||||
// Destination is indexed at 48 byte stride
|
// Destination is indexed at 48 byte stride
|
||||||
const auto di
|
auto *const __restrict__ di
|
||||||
{
|
{
|
||||||
reinterpret_cast<u512x1_u *__restrict__>(dst + (i * 48))
|
data(out) + i * 48
|
||||||
};
|
};
|
||||||
|
|
||||||
// Source is indexed at 64 byte stride
|
// Source is indexed at 64 byte stride
|
||||||
const auto si
|
const u512x1_u *const __restrict__ sx
|
||||||
{
|
{
|
||||||
reinterpret_cast<const u512x1_u *__restrict__>(src + (i * 64))
|
reinterpret_cast<const u512x1_u *>(data(in))
|
||||||
};
|
};
|
||||||
|
|
||||||
block = *si;
|
u8x64 block;
|
||||||
|
block = sx[i];
|
||||||
block = decode_block(block, err);
|
block = decode_block(block, err);
|
||||||
*di = block;
|
#pragma clang loop vectorize(enable) unroll(full)
|
||||||
|
for(uint j(0); j < 48; ++j)
|
||||||
|
di[j] = block[j];
|
||||||
}
|
}
|
||||||
|
|
||||||
for(; i * 64 < in_len && i * 48 < out_len; ++i)
|
for(; i * 64 < in_len && i * 48 < out_len; ++i)
|
||||||
{
|
{
|
||||||
u8x64 mask {0};
|
auto *const __restrict__ di
|
||||||
for(j = 0; j < 64 && i * 64 + j < in_len; ++j)
|
{
|
||||||
block[j] = src[i * 64 + j],
|
data(out) + i * 48
|
||||||
|
};
|
||||||
|
|
||||||
|
const auto *const __restrict__ si
|
||||||
|
{
|
||||||
|
data(in) + i * 64
|
||||||
|
};
|
||||||
|
|
||||||
|
u8x64 block {0}, mask {0};
|
||||||
|
for(uint j(0); j < 64 && i * 64 + j < in_len; ++j)
|
||||||
|
block[j] = si[j],
|
||||||
mask[j] = 0xff;
|
mask[j] = 0xff;
|
||||||
|
|
||||||
i64x8 _err {0};
|
i64x8 _err {0};
|
||||||
block = decode_block(block, _err);
|
block = decode_block(block, _err);
|
||||||
for(j = 0; j < 48 && i * 48 + j < out_len; ++j)
|
for(uint j(0); j < 48 && i * 48 + j < out_len; ++j)
|
||||||
dst[i * 48 + j] = block[j];
|
di[j] = block[j];
|
||||||
|
|
||||||
err |= _err & i64x8(mask);
|
err |= _err & i64x8(mask);
|
||||||
}
|
}
|
||||||
|
@ -438,7 +441,7 @@ noexcept
|
||||||
|
|
||||||
u8x64 c(b), ret;
|
u8x64 c(b), ret;
|
||||||
#pragma clang loop vectorize(enable) unroll(full)
|
#pragma clang loop vectorize(enable) unroll(full)
|
||||||
for(i = 0; i < 64; ++i)
|
for(i = 0; i < 48; ++i)
|
||||||
ret[i] = c[decode_permute_tab_le[i]];
|
ret[i] = c[decode_permute_tab_le[i]];
|
||||||
|
|
||||||
err |= i64x8(_err);
|
err |= i64x8(_err);
|
||||||
|
|
Loading…
Reference in a new issue