diff --git a/ircd/json.cc b/ircd/json.cc index 6813dfc07..1b80f57a0 100644 --- a/ircd/json.cc +++ b/ircd/json.cc @@ -3558,21 +3558,26 @@ ircd::u64x2 ircd::json::string_unescape_utf16(u8x16 &block, const u8x16 block_mask) { - const u8x16 surr_mark - { - utf16::find_surrogate(block) & block_mask - }; - - const u8x16 surr_mask - { - utf16::mask_surrogate(surr_mark) - }; - const u32x4 unicode { - utf16::decode_surrogate_aligned_next(block) + utf16::decode_surrogate_aligned_next(block & block_mask) }; + const u32x4 length + { + utf8::length(unicode) + }; + + const u8x16 pair_mask + ( + length != 0 || shl<32>(length) == 4 + ); + + const u8x16 is_surrogate + ( + utf16::find_surrogate(block & block_mask) & pair_mask + ); + const u32x4 encoded_sparse { utf8::encode_sparse(unicode) @@ -3583,32 +3588,22 @@ ircd::json::string_unescape_utf16(u8x16 &block, encoded_sparse ); - u32x4 is_surrogate - { - -1U, -1U, 0, 0 - }; - - for(size_t i(0); i < 6; ++i) - { - is_surrogate[0] &= surr_mask[i]; - is_surrogate[1] &= surr_mask[i + 6]; - } - - const u32x4 length - { - utf8::length(unicode) & is_surrogate - }; - size_t di(0), i(0); for(; i < 2 && length[i] > 0; ++i) for(size_t j(0); j < length[i]; ++j) block[di++] = encoded[i * 4 + j]; + const auto surrogates + { + simd::popcnt(u64x2(popmask(is_surrogate))) + }; + + assert(surrogates > 0 && surrogates <= 2); assert(di == length[0] + length[1]); assert(i >= 1 && i <= 2); return u64x2 { - di, 6U * i + di, 6U * surrogates }; } @@ -3807,14 +3802,19 @@ ircd::json::string_stringify_utf16(u8x16 &block, utf16::decode_surrogate_aligned_next(block & block_mask) }; - const u32x4 is_surrogate + const u32x4 length_encoded + { + utf8::length(unicode) + }; + + const u8x16 pair_mask ( - utf16::find_surrogate(block & block_mask) + length_encoded != 0 || shl<32>(length_encoded) == 4 ); - const u32x4 surrogate_mask + const u8x16 is_surrogate ( - is_surrogate != 0U + utf16::find_surrogate(block & block_mask) & pair_mask ); const u32x4 is_ctrl @@ -3822,11 +3822,6 @@ ircd::json::string_stringify_utf16(u8x16 &block, unicode < 0x20 ); - const u32x4 length_encoded - { - utf8::length(unicode) - }; - const u32x4 ctrl_idx { unicode & is_ctrl @@ -3838,23 +3833,10 @@ ircd::json::string_stringify_utf16(u8x16 &block, u32(ctrl_tab_len[ctrl_idx[1]]), }; - const u32x4 is_non_bmp - ( - unicode >= 0x10000U - ); - - const u32x4 is_surrogate_pair - { - (is_non_bmp | shl<32>(is_non_bmp)) & - (surrogate_mask | shr<32>(surrogate_mask)) - }; - - // Determine the utf-8 encoding length for each codepoint... // Supplement the escaped surrogate length for excluded codepoints. const u32x4 length { - (length_encoded & ~is_ctrl) | - (length_surrogate & is_ctrl & ~is_surrogate_pair & surrogate_mask) + (length_encoded & ~is_ctrl) | (length_surrogate & is_ctrl) }; const u32x4 encoded_sparse @@ -3876,7 +3858,7 @@ ircd::json::string_stringify_utf16(u8x16 &block, const auto surrogates { - simd::popcnt(u64x2(popmask(u8x16(is_surrogate)))) + simd::popcnt(u64x2(popmask(is_surrogate))) }; assert(di == length[0] + length[1]); @@ -4014,31 +3996,31 @@ ircd::u64x2 ircd::json::string_serialized_utf16(const u8x16 block, const u8x16 block_mask) { - const u32x4 is_surrogate - ( - utf16::find_surrogate(block & block_mask) - ); - - const u32x4 surrogate_mask - ( - is_surrogate != 0U - ); - const u32x4 unicode { utf16::decode_surrogate_aligned_next(block & block_mask) }; - const u32x4 is_ctrl - ( - unicode < 0x20 - ); - const u32x4 length_encoded { utf8::length(unicode) }; + const u8x16 pair_mask + ( + length_encoded != 0 || shl<32>(length_encoded) == 4 + ); + + const u8x16 is_surrogate + ( + utf16::find_surrogate(block & block_mask) & pair_mask + ); + + const u32x4 is_ctrl + ( + unicode < 0x20 + ); + const u32x4 ctrl_idx { unicode & is_ctrl @@ -4050,23 +4032,10 @@ ircd::json::string_serialized_utf16(const u8x16 block, ctrl_tab_len[ctrl_idx[1]], }; - const u32x4 is_non_bmp - ( - unicode >= 0x10000U - ); - - const u32x4 is_surrogate_pair - { - (is_non_bmp | shl<32>(is_non_bmp)) & - (surrogate_mask | shr<32>(surrogate_mask)) - }; - - // Determine the utf-8 encoding length for each codepoint... // Supplement the escaped surrogate length for excluded codepoints. const u32x4 length { - (length_encoded & ~is_ctrl) | - (length_surrogate & is_ctrl & ~is_surrogate_pair & surrogate_mask) + (length_encoded & ~is_ctrl) | (length_surrogate & is_ctrl) }; const auto total_length @@ -4076,7 +4045,7 @@ ircd::json::string_serialized_utf16(const u8x16 block, const auto surrogates { - popcnt(u64x2(popmask(u8x16(is_surrogate)))) + popcnt(u64x2(popmask(is_surrogate))) }; return u64x2 diff --git a/ircd/utf.cc b/ircd/utf.cc index b6c30417a..0e6b08d88 100644 --- a/ircd/utf.cc +++ b/ircd/utf.cc @@ -12,6 +12,13 @@ // utf16 // +namespace ircd::utf16 +{ + static const u32x4 + mask_one { -1U, 0U, 0U, 0U, }, + mask_two { -1U, -1U, 0U, 0U, }; +} + /// Decodes one or two escaped surrogates (surrogate pair) aligned to the /// front of the input block. If the surrogates are a pair which decode into /// a single codepoint, only the first element of the return vector is used; @@ -137,33 +144,55 @@ noexcept (codepoint_paired <= 0xffffU) & ~(shl<32>(codepoint_high)) }; - // When one surrogate is input, only lane[0] - const u32x4 ret_codepoint_single + const u32x4 single_mask { - codepoint_unpaired & ~surrogate_pair_range & ~surrogate_deuce + ~surrogate_pair_range & ~surrogate_deuce & mask_one + }; + + const u32x4 paired_mask + { + surrogate_paired & surrogate_deuce & mask_one + }; + + const u32x4 unpaired_mask + { + ~surrogate_pair_range & surrogate_deuce & mask_two + }; + + // When one surrogate is input, only lane[0] + const u32x4 single_codepoint + { + codepoint_unpaired & single_mask }; // When two surrogates in a pair are input, lane[0] only - const u32x4 ret_codepoint_paired + const u32x4 paired_codepoint { - codepoint_paired & (surrogate_paired & surrogate_deuce) + codepoint_paired & paired_mask }; // When two unrelated surrogates are input, lane[0] and lane[1] - const u32x4 ret_codepoint_unpaired + const u32x4 unpaired_codepoint { - codepoint_unpaired & ~surrogate_pair_range & surrogate_deuce + codepoint_unpaired & unpaired_mask }; - static const u32x4 - mask_one { -1U, 0U, 0U, 0U, }, - mask_two { -1U, -1U, 0U, 0U, }; + const u32x4 codepoint + { + single_codepoint | paired_codepoint | unpaired_codepoint + }; - return 0 - | (ret_codepoint_single & mask_one) - | (ret_codepoint_paired & mask_one) - | (ret_codepoint_unpaired & mask_two) - ; + const u32x4 ret_mask + { + single_mask | paired_mask | unpaired_mask + }; + + const u32x4 ret + { + codepoint | ~ret_mask + }; + + return ret; } namespace ircd::utf16