0
0
Fork 0
mirror of https://github.com/matrix-construct/construct synced 2024-05-19 19:33:45 +02:00

ircd::json: Fix utf16 surrogate pairing discernment.

ircd::utf16: Fix return value for unused lanes; use sentinel.
This commit is contained in:
Jason Volk 2023-01-31 00:51:18 -08:00
parent fc0b6bbecb
commit a308010a1b
2 changed files with 95 additions and 97 deletions

View file

@ -3558,21 +3558,26 @@ ircd::u64x2
ircd::json::string_unescape_utf16(u8x16 &block, ircd::json::string_unescape_utf16(u8x16 &block,
const u8x16 block_mask) const u8x16 block_mask)
{ {
const u8x16 surr_mark
{
utf16::find_surrogate(block) & block_mask
};
const u8x16 surr_mask
{
utf16::mask_surrogate(surr_mark)
};
const u32x4 unicode const u32x4 unicode
{ {
utf16::decode_surrogate_aligned_next(block) utf16::decode_surrogate_aligned_next(block & block_mask)
}; };
const u32x4 length
{
utf8::length(unicode)
};
const u8x16 pair_mask
(
length != 0 || shl<32>(length) == 4
);
const u8x16 is_surrogate
(
utf16::find_surrogate(block & block_mask) & pair_mask
);
const u32x4 encoded_sparse const u32x4 encoded_sparse
{ {
utf8::encode_sparse(unicode) utf8::encode_sparse(unicode)
@ -3583,32 +3588,22 @@ ircd::json::string_unescape_utf16(u8x16 &block,
encoded_sparse encoded_sparse
); );
u32x4 is_surrogate
{
-1U, -1U, 0, 0
};
for(size_t i(0); i < 6; ++i)
{
is_surrogate[0] &= surr_mask[i];
is_surrogate[1] &= surr_mask[i + 6];
}
const u32x4 length
{
utf8::length(unicode) & is_surrogate
};
size_t di(0), i(0); size_t di(0), i(0);
for(; i < 2 && length[i] > 0; ++i) for(; i < 2 && length[i] > 0; ++i)
for(size_t j(0); j < length[i]; ++j) for(size_t j(0); j < length[i]; ++j)
block[di++] = encoded[i * 4 + j]; block[di++] = encoded[i * 4 + j];
const auto surrogates
{
simd::popcnt(u64x2(popmask(is_surrogate)))
};
assert(surrogates > 0 && surrogates <= 2);
assert(di == length[0] + length[1]); assert(di == length[0] + length[1]);
assert(i >= 1 && i <= 2); assert(i >= 1 && i <= 2);
return u64x2 return u64x2
{ {
di, 6U * i di, 6U * surrogates
}; };
} }
@ -3807,14 +3802,19 @@ ircd::json::string_stringify_utf16(u8x16 &block,
utf16::decode_surrogate_aligned_next(block & block_mask) utf16::decode_surrogate_aligned_next(block & block_mask)
}; };
const u32x4 is_surrogate const u32x4 length_encoded
{
utf8::length(unicode)
};
const u8x16 pair_mask
( (
utf16::find_surrogate(block & block_mask) length_encoded != 0 || shl<32>(length_encoded) == 4
); );
const u32x4 surrogate_mask const u8x16 is_surrogate
( (
is_surrogate != 0U utf16::find_surrogate(block & block_mask) & pair_mask
); );
const u32x4 is_ctrl const u32x4 is_ctrl
@ -3822,11 +3822,6 @@ ircd::json::string_stringify_utf16(u8x16 &block,
unicode < 0x20 unicode < 0x20
); );
const u32x4 length_encoded
{
utf8::length(unicode)
};
const u32x4 ctrl_idx const u32x4 ctrl_idx
{ {
unicode & is_ctrl unicode & is_ctrl
@ -3838,23 +3833,10 @@ ircd::json::string_stringify_utf16(u8x16 &block,
u32(ctrl_tab_len[ctrl_idx[1]]), u32(ctrl_tab_len[ctrl_idx[1]]),
}; };
const u32x4 is_non_bmp
(
unicode >= 0x10000U
);
const u32x4 is_surrogate_pair
{
(is_non_bmp | shl<32>(is_non_bmp)) &
(surrogate_mask | shr<32>(surrogate_mask))
};
// Determine the utf-8 encoding length for each codepoint...
// Supplement the escaped surrogate length for excluded codepoints. // Supplement the escaped surrogate length for excluded codepoints.
const u32x4 length const u32x4 length
{ {
(length_encoded & ~is_ctrl) | (length_encoded & ~is_ctrl) | (length_surrogate & is_ctrl)
(length_surrogate & is_ctrl & ~is_surrogate_pair & surrogate_mask)
}; };
const u32x4 encoded_sparse const u32x4 encoded_sparse
@ -3876,7 +3858,7 @@ ircd::json::string_stringify_utf16(u8x16 &block,
const auto surrogates const auto surrogates
{ {
simd::popcnt(u64x2(popmask(u8x16(is_surrogate)))) simd::popcnt(u64x2(popmask(is_surrogate)))
}; };
assert(di == length[0] + length[1]); assert(di == length[0] + length[1]);
@ -4014,31 +3996,31 @@ ircd::u64x2
ircd::json::string_serialized_utf16(const u8x16 block, ircd::json::string_serialized_utf16(const u8x16 block,
const u8x16 block_mask) const u8x16 block_mask)
{ {
const u32x4 is_surrogate
(
utf16::find_surrogate(block & block_mask)
);
const u32x4 surrogate_mask
(
is_surrogate != 0U
);
const u32x4 unicode const u32x4 unicode
{ {
utf16::decode_surrogate_aligned_next(block & block_mask) utf16::decode_surrogate_aligned_next(block & block_mask)
}; };
const u32x4 is_ctrl
(
unicode < 0x20
);
const u32x4 length_encoded const u32x4 length_encoded
{ {
utf8::length(unicode) utf8::length(unicode)
}; };
const u8x16 pair_mask
(
length_encoded != 0 || shl<32>(length_encoded) == 4
);
const u8x16 is_surrogate
(
utf16::find_surrogate(block & block_mask) & pair_mask
);
const u32x4 is_ctrl
(
unicode < 0x20
);
const u32x4 ctrl_idx const u32x4 ctrl_idx
{ {
unicode & is_ctrl unicode & is_ctrl
@ -4050,23 +4032,10 @@ ircd::json::string_serialized_utf16(const u8x16 block,
ctrl_tab_len[ctrl_idx[1]], ctrl_tab_len[ctrl_idx[1]],
}; };
const u32x4 is_non_bmp
(
unicode >= 0x10000U
);
const u32x4 is_surrogate_pair
{
(is_non_bmp | shl<32>(is_non_bmp)) &
(surrogate_mask | shr<32>(surrogate_mask))
};
// Determine the utf-8 encoding length for each codepoint...
// Supplement the escaped surrogate length for excluded codepoints. // Supplement the escaped surrogate length for excluded codepoints.
const u32x4 length const u32x4 length
{ {
(length_encoded & ~is_ctrl) | (length_encoded & ~is_ctrl) | (length_surrogate & is_ctrl)
(length_surrogate & is_ctrl & ~is_surrogate_pair & surrogate_mask)
}; };
const auto total_length const auto total_length
@ -4076,7 +4045,7 @@ ircd::json::string_serialized_utf16(const u8x16 block,
const auto surrogates const auto surrogates
{ {
popcnt(u64x2(popmask(u8x16(is_surrogate)))) popcnt(u64x2(popmask(is_surrogate)))
}; };
return u64x2 return u64x2

View file

@ -12,6 +12,13 @@
// utf16 // utf16
// //
namespace ircd::utf16
{
/// Lane-select masks used by the surrogate decoder in this file. Each lane
/// initialized with -1U has all of its bits set and each lane initialized
/// with 0U has none, so and-ing a u32x4 with one of these keeps the selected
/// lanes and clears the others.
static const u32x4
// Selects lane[0] only; applied to the single-surrogate and paired cases.
mask_one { -1U, 0U, 0U, 0U, },
// Selects lane[0] and lane[1]; applied to the two-unrelated-surrogates case.
mask_two { -1U, -1U, 0U, 0U, };
}
/// Decodes one or two escaped surrogates (surrogate pair) aligned to the /// Decodes one or two escaped surrogates (surrogate pair) aligned to the
/// front of the input block. If the surrogates are a pair which decode into /// front of the input block. If the surrogates are a pair which decode into
/// a single codepoint, only the first element of the return vector is used; /// a single codepoint, only the first element of the return vector is used;
@ -137,33 +144,55 @@ noexcept
(codepoint_paired <= 0xffffU) & ~(shl<32>(codepoint_high)) (codepoint_paired <= 0xffffU) & ~(shl<32>(codepoint_high))
}; };
// When one surrogate is input, only lane[0] const u32x4 single_mask
const u32x4 ret_codepoint_single
{ {
codepoint_unpaired & ~surrogate_pair_range & ~surrogate_deuce ~surrogate_pair_range & ~surrogate_deuce & mask_one
};
const u32x4 paired_mask
{
surrogate_paired & surrogate_deuce & mask_one
};
const u32x4 unpaired_mask
{
~surrogate_pair_range & surrogate_deuce & mask_two
};
// When one surrogate is input, only lane[0]
const u32x4 single_codepoint
{
codepoint_unpaired & single_mask
}; };
// When two surrogates in a pair are input, lane[0] only // When two surrogates in a pair are input, lane[0] only
const u32x4 ret_codepoint_paired const u32x4 paired_codepoint
{ {
codepoint_paired & (surrogate_paired & surrogate_deuce) codepoint_paired & paired_mask
}; };
// When two unrelated surrogates are input, lane[0] and lane[1] // When two unrelated surrogates are input, lane[0] and lane[1]
const u32x4 ret_codepoint_unpaired const u32x4 unpaired_codepoint
{ {
codepoint_unpaired & ~surrogate_pair_range & surrogate_deuce codepoint_unpaired & unpaired_mask
}; };
static const u32x4 const u32x4 codepoint
mask_one { -1U, 0U, 0U, 0U, }, {
mask_two { -1U, -1U, 0U, 0U, }; single_codepoint | paired_codepoint | unpaired_codepoint
};
return 0 const u32x4 ret_mask
| (ret_codepoint_single & mask_one) {
| (ret_codepoint_paired & mask_one) single_mask | paired_mask | unpaired_mask
| (ret_codepoint_unpaired & mask_two) };
;
const u32x4 ret
{
codepoint | ~ret_mask
};
return ret;
} }
namespace ircd::utf16 namespace ircd::utf16