diff --git a/include/ircd/utf.h b/include/ircd/utf.h
index 681261ace..bff2bf2ce 100644
--- a/include/ircd/utf.h
+++ b/include/ircd/utf.h
@@ -20,7 +20,11 @@ namespace ircd::utf
 /// Unicode Transformation Format (8-bit)
 namespace ircd::utf8
 {
+	// Get the utf8-encoded length from decoded codepoints.
 	u32x16 length(const u32x16 codepoints) noexcept;
+	u32x8 length(const u32x8 codepoints) noexcept;
+	u32x4 length(const u32x4 codepoints) noexcept;
+
 	u32x16 encode(const u32x16 codepoints) noexcept;
 	u32x16 decode(const u8x16 string) noexcept;
 }
@@ -38,7 +42,7 @@ namespace ircd::utf16
 	u8x16 find_surrogate(const u8x16 input) noexcept;
 
 	// scan for utf-16 surrogates including incomplete sequences truncated
-	// by the end of the input; also matches a single trailing slash.
+	// by the end of the input.
 	u8x16 find_surrogate_partial(const u8x16 input) noexcept;
 }
 
diff --git a/ircd/utf.cc b/ircd/utf.cc
index 29544d5f5..86d4fef88 100644
--- a/ircd/utf.cc
+++ b/ircd/utf.cc
@@ -29,6 +29,8 @@ ircd::utf16::truncation_table
 	~shl<0x08>(~full_mask),
 };
 
+/// scan for utf-16 surrogates including incomplete sequences truncated
+/// by the end of the input; also matches a single trailing slash.
 ircd::u8x16
 ircd::utf16::find_surrogate_partial(const u8x16 input)
 noexcept
@@ -264,21 +266,52 @@ noexcept
 	;
 }
 
-/// Determine the utf-8 encoding length of multiple codepoints in parallel.
-/// The input vector char32_t codepoints and the output yields an integer
-/// of 0-4 for each lane.
+namespace ircd::utf8
+{
+	// Generic implementation shared by every length() overload below.
+	template<class u32xN> static u32xN _length(const u32xN codepoint) noexcept;
+}
+
+/// u32x4 overload; forwards to the generic _length() template.
+ircd::u32x4
+ircd::utf8::length(const u32x4 codepoint)
+noexcept
+{
+	return _length(codepoint);
+}
+
+/// u32x8 overload; forwards to the generic _length() template.
+ircd::u32x8
+ircd::utf8::length(const u32x8 codepoint)
+noexcept
+{
+	return _length(codepoint);
+}
+
+/// u32x16 overload; forwards to the generic _length() template.
 ircd::u32x16
 ircd::utf8::length(const u32x16 codepoint)
 noexcept
 {
-	const u32x16
+	return _length(codepoint);
+}
+
+/// Determine the utf-8 encoding length of multiple codepoints in parallel.
+/// The input vector char32_t codepoints and the output yields an integer +/// of 0-4 for each lane. +template +u32xN +ircd::utf8::_length(const u32xN codepoint) +noexcept +{ + const u32xN length_1 { codepoint <= 0x7f }, length_2 { codepoint <= 0x7ff && codepoint > 0x7f }, length_3_lo { codepoint <= 0xd7ff && codepoint > 0x7ff }, length_3_hi { codepoint <= 0xffff && codepoint > 0xdfff }, length_4 { codepoint <= 0x10ffff && codepoint > 0xffff }; - [[gnu::unused]] const u32x16 // Preserved here for future reference + [[gnu::unused]] const u32xN // Preserved here for future reference length_3_err { codepoint <= 0xdfff && codepoint > 0xd7ff }, length_err { (codepoint > 0x10ffff) | length_3_err };