utf8: add u32x8 and u32x4 overloads of encode().

The header gains the two narrower encode() declarations. In utf.cc every
encode() overload now returns the result of one shared function template,
_encode(), which is declared ahead of the overloads and defined after them.

diff --git a/include/ircd/utf.h b/include/ircd/utf.h
index 0eb74ef00..b7ddbbf22 100644
--- a/include/ircd/utf.h
+++ b/include/ircd/utf.h
@@ -20,12 +20,17 @@ namespace ircd::utf
 /// Unicode Transformation Format (8-bit)
 namespace ircd::utf8
 {
-	// Get the utf8-encoded length from decoded codepoints.
+	// Get the utf8-encoded length from char32_t (decoded) codepoints
 	u32x16 length(const u32x16 codepoints) noexcept;
 	u32x8 length(const u32x8 codepoints) noexcept;
 	u32x4 length(const u32x4 codepoints) noexcept;
 
+	// Encode char32_t codepoints into respective utf-8 encodings
 	u32x16 encode(const u32x16 codepoints) noexcept;
+	u32x8 encode(const u32x8 codepoints) noexcept;
+	u32x4 encode(const u32x4 codepoints) noexcept;
+
+	// Decode utf-8 string into char32_t unicode codepoints
 	u32x16 decode(const u8x16 string) noexcept;
 }
 
diff --git a/ircd/utf.cc b/ircd/utf.cc
index be412dea5..ef8794761 100644
--- a/ircd/utf.cc
+++ b/ircd/utf.cc
@@ -296,32 +296,59 @@ noexcept
 	return integers;
 }
 
-/// Transform multiple char32_t codepoints to their utf-8 encodings in
-/// parallel, returning a sparse result in each char32_t (this does not
-/// compress the result down).
+namespace ircd::utf8
+{
+	template<class u32xN> static u32xN _encode(const u32xN codepoint) noexcept;
+}
+
+ircd::u32x4
+ircd::utf8::encode(const u32x4 codepoint)
+noexcept
+{
+	return _encode(codepoint);
+}
+
+ircd::u32x8
+ircd::utf8::encode(const u32x8 codepoint)
+noexcept
+{
+	return _encode(codepoint);
+}
+
 ircd::u32x16
 ircd::utf8::encode(const u32x16 codepoint)
 noexcept
 {
-	const u32x16 len
+	return _encode(codepoint);
+}
+
+/// Transform multiple char32_t codepoints to their utf-8 encodings in
+/// parallel, returning a sparse result in each char32_t (this does not
+/// compress the result down).
+template +u32xN +ircd::utf8::_encode(const u32xN codepoint) +noexcept +{ + const u32xN len { length(codepoint) }; - const u32x16 enc_2 + const u32xN enc_2 { (((codepoint >> 6) | 0xc0) & 0xff) // byte[0] | ((((codepoint & 0x3f) | 0x80) &0xff) << 8) // byte[1] }; - const u32x16 enc_3 + const u32xN enc_3 { (((codepoint >> 12) | 0xe0) & 0xff) | // byte[0] (((((codepoint >> 6) & 0x3f) | 0x80) & 0xff) << 8) | // byte[1] ((((codepoint & 0x3f) | 0x80) & 0xff) << 16) // byte[3] }; - const u32x16 enc_4 + const u32xN enc_4 { (((codepoint >> 18) | 0xf0) & 0xff) | // byte[0] (((((codepoint >> 12) & 0x3f) | 0x80) & 0xff) << 8) | // byte[1]