From 3e6fcf3a479384a73e5091c3dedaf73c4af5e675 Mon Sep 17 00:00:00 2001 From: Jason Volk Date: Wed, 11 Aug 2021 04:31:32 -0700 Subject: [PATCH] ircd::utf: Split header; improve decode codegen; inline length. --- include/ircd/ircd.h | 3 +- include/ircd/{utf.h => utf16.h} | 23 +----- include/ircd/utf8.h | 57 +++++++++++++ ircd/utf.cc | 141 ++++++++++++++++++-------------- 4 files changed, 139 insertions(+), 85 deletions(-) rename include/ircd/{utf.h => utf16.h} (68%) create mode 100644 include/ircd/utf8.h diff --git a/include/ircd/ircd.h b/include/ircd/ircd.h index 80f9fb9f1..56ad52158 100644 --- a/include/ircd/ircd.h +++ b/include/ircd/ircd.h @@ -72,7 +72,8 @@ #include "crh.h" #include "fpe.h" #include "icu.h" -#include "utf.h" +#include "utf8.h" +#include "utf16.h" #include "b64.h" #include "b58.h" #include "iov.h" diff --git a/include/ircd/utf.h b/include/ircd/utf16.h similarity index 68% rename from include/ircd/utf.h rename to include/ircd/utf16.h index dddf01501..a5fce4af4 100644 --- a/include/ircd/utf.h +++ b/include/ircd/utf16.h @@ -1,7 +1,7 @@ // The Construct // // Copyright (C) The Construct Developers, Authors & Contributors -// Copyright (C) 2016-2020 Jason Volk +// Copyright (C) 2016-2021 Jason Volk // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted, provided that the above @@ -9,26 +9,7 @@ // full license for this software is available in the LICENSE file. #pragma once -#define HAVE_IRCD_UTF_H - -/// Unicode Transformation Format -namespace ircd::utf -{ - IRCD_EXCEPTION(ircd::error, error) -} - -/// Unicode Transformation Format (8-bit) -namespace ircd::utf8 -{ - // Get the utf8-encoded length from char32_t (decoded) codepoints - template u32xN length(const u32xN codepoints) noexcept; - - // Encode char32_t codepoints into respective utf-8 encodings - template u32xN encode_sparse(const u32xN codepoints) noexcept; - - // Decode utf-8 string into char32_t unicode codepoints - u32x16 decode(const u8x16 string) noexcept; -} +#define HAVE_IRCD_UTF16_H /// Unicode Transformation Format (16-bit) namespace ircd::utf16 diff --git a/include/ircd/utf8.h b/include/ircd/utf8.h new file mode 100644 index 000000000..d87ae42bc --- /dev/null +++ b/include/ircd/utf8.h @@ -0,0 +1,57 @@ +// The Construct +// +// Copyright (C) The Construct Developers, Authors & Contributors +// Copyright (C) 2016-2021 Jason Volk +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice is present in all copies. The +// full license for this software is available in the LICENSE file. + +#pragma once +#define HAVE_IRCD_UTF8_H + +/// Unicode Transformation Format (8-bit) +namespace ircd::utf8 +{ + // Get the utf8-encoded length from char32_t (decoded) codepoints + template u32xN length(const u32xN codepoints) noexcept; + + // Get the utf8 length at the first byte of each utf8-codepoint; 0x00 sep + template<> u8x16 length(const u8x16 string) noexcept; + + // Encode char32_t codepoints into respective utf-8 encodings + template u32xN encode_sparse(const u32xN codepoints) noexcept; + + // Decode utf-8 string into char32_t unicode codepoints + u32x16 decode_sparse(const u8x16 string) noexcept; + + // Decode utf-8 string into char32_t unicode codepoints packed left. + u32x16 decode(const u8x16 string) noexcept; +} + +template<> +inline ircd::u8x16 +ircd::utf8::length(const u8x16 string) +noexcept +{ + const u8x16 is_single + ( + string < 0x80 + ); + + const u8x16 is_multi + ( + (string - 0xc2) <= 0x32 + ); + + const u8x16 num_trail + ( + 1 + + ((string >= 0xc0) & 1) + + ((string >= 0xe0) & 1) + + ((string >= 0xf0) & 1) + ); + + return (is_single & 1) | (is_multi & num_trail); +} diff --git a/ircd/utf.cc b/ircd/utf.cc index ed3344b52..3a5705379 100644 --- a/ircd/utf.cc +++ b/ircd/utf.cc @@ -302,100 +302,115 @@ noexcept // ircd::u32x16 -ircd::utf8::decode(const u8x16 in) +ircd::utf8::decode(const u8x16 string) noexcept { - const u8x16 is_single + const u32x16 codepoints ( - (in & 0x80) == 0 + decode_sparse(string) ); - const u8x16 is_lead + const i32x16 zero_lane ( - (in - 0xc2) <= 0x32 + codepoints == 0 ); - const u8x16 is_trail + // Lanes separating sparsely decoded codepoints are zero. + const i8x16 skip_lane ( - in >= 0x80 && in < 0xbf + lane_cast(zero_lane) ); - const u8x16 is_head + // Actual NUL codepoints weren't altered by decode. + const i8x16 null_code ( - is_lead | is_single + string == 0 ); - const u8x16 len_mask[3] - { - in >= 0xc0, in >= 0xe0, in >= 0xf0, - }; - - const u8x16 expect_trail + // The pack will eliminate zero-value lanes except for legitimate NULs. + const i8x16 pack_mask ( - 1 + (len_mask[0] & 1) + (len_mask[1] & 1) + (len_mask[2] & 1) + ~null_code ^ skip_lane ); + const u32x16 ret + ( + simd::pack(codepoints, pack_mask) + ); + + return ret; +} + +ircd::u32x16 +ircd::utf8::decode_sparse(const u8x16 string) +noexcept +{ const u8x16 len ( - (is_single & 1) | (is_lead & expect_trail) + length(string) ); - const u8x16 head + const u8x16 rem ( - in & is_head + len + | ((shl<0x18>(len) == 4) & 1) + | ((shl<0x10>(len) == 4) & 2) + | ((shl<0x10>(len) == 3) & 1) + | ((shl<0x08>(len) == 4) & 3) + | ((shl<0x08>(len) == 3) & 2) + | ((shl<0x08>(len) == 2) & 1) ); - const u8x16 lead[] + const u8x16 bank[] { - 0x3f & in & is_trail, - 0xff & head & is_single, - 0x1f & head & len_mask[0] & ~len_mask[1], - 0x0f & head & len_mask[1] & ~len_mask[2], - 0x07 & head & len_mask[2], + string & 0x3f, + string & 0xff, + string & 0x1f, + string & 0x0f, + string & 0x07, }; - u8x16 full; - for(uint i(0); i < 16; ++i) - full[i] = lead[len[i]][i]; - - u8x16 shift {len & is_head}; - shift |= (shl<0x20>(len) == 4) & 0; - shift |= (shl<0x20>(len) == 3) & 0; - shift |= (shl<0x20>(len) == 2) & 0; - shift |= (shl<0x20>(len) == 1) & 0; - shift |= (shl<0x18>(len) == 4) & 1; - shift |= (shl<0x18>(len) == 3) & 0; - shift |= (shl<0x18>(len) == 2) & 0; - shift |= (shl<0x18>(len) == 1) & 0; - shift |= (shl<0x10>(len) == 4) & 2; - shift |= (shl<0x10>(len) == 3) & 1; - shift |= (shl<0x10>(len) == 2) & 0; - shift |= (shl<0x10>(len) == 1) & 0; - shift |= (shl<0x08>(len) == 4) & 3; - shift |= (shl<0x08>(len) == 3) & 2; - shift |= (shl<0x08>(len) == 2) & 1; - shift |= (shl<0x08>(len) == 1) & 0; - shift -= 1U; - shift &= 0x03U; - shift *= 6U; - - const u32x16 val - { - lane_cast(full) << lane_cast(shift) - }; - - const u8x16 incr + const u8x16 select ( - (shift == 0) & 1U + 0 + | (bank[0] & (len == 0)) + | (bank[1] & (len == 1)) + | (bank[2] & (len == 2)) + | (bank[3] & (len == 3)) + | (bank[4] & (len == 4)) ); - u8x16 idx {0}; - for(uint i(1); i < 16; ++i) - idx[i] = idx[i - 1] + incr[i - 1]; + const u8x16 byte[] + { + select & (rem == 1), + select & (rem == 2), + select & (rem == 3), + select & (rem == 4), + }; - u32x16 ret {0}; - for(uint i(0); i < 16; ++i) - ret[idx[i]] |= val[i]; + const u8x16 move[] + { + shl<8 * 0>(byte[0]), + shl<8 * 1>(byte[1]), + shl<8 * 2>(byte[2]), + shl<8 * 3>(byte[3]), + }; + + const u32x16 pack[] + { + lane_cast(move[0]) << 0x00, + lane_cast(move[1]) << 0x06, + lane_cast(move[2]) << 0x0c, + lane_cast(move[3]) << 0x12, + }; + + const u32x16 ret + ( + lane_cast(byte[0]) // pack[0] constrains clang opt + | pack[1] + | pack[2] + | pack[3] + ); return ret; }