ircd::json: Fix utf16 surrogate pairing discernment.

ircd::utf16: Fix return value for unused lanes; use sentinel.
This commit is contained in:
Jason Volk 2023-01-31 00:51:18 -08:00
parent fc0b6bbecb
commit a308010a1b
2 changed files with 95 additions and 97 deletions

View File

@ -3558,21 +3558,26 @@ ircd::u64x2
ircd::json::string_unescape_utf16(u8x16 &block,
const u8x16 block_mask)
{
const u8x16 surr_mark
{
utf16::find_surrogate(block) & block_mask
};
const u8x16 surr_mask
{
utf16::mask_surrogate(surr_mark)
};
const u32x4 unicode
{
utf16::decode_surrogate_aligned_next(block)
utf16::decode_surrogate_aligned_next(block & block_mask)
};
const u32x4 length
{
utf8::length(unicode)
};
const u8x16 pair_mask
(
length != 0 || shl<32>(length) == 4
);
const u8x16 is_surrogate
(
utf16::find_surrogate(block & block_mask) & pair_mask
);
const u32x4 encoded_sparse
{
utf8::encode_sparse(unicode)
@ -3583,32 +3588,22 @@ ircd::json::string_unescape_utf16(u8x16 &block,
encoded_sparse
);
u32x4 is_surrogate
{
-1U, -1U, 0, 0
};
for(size_t i(0); i < 6; ++i)
{
is_surrogate[0] &= surr_mask[i];
is_surrogate[1] &= surr_mask[i + 6];
}
const u32x4 length
{
utf8::length(unicode) & is_surrogate
};
size_t di(0), i(0);
for(; i < 2 && length[i] > 0; ++i)
for(size_t j(0); j < length[i]; ++j)
block[di++] = encoded[i * 4 + j];
const auto surrogates
{
simd::popcnt(u64x2(popmask(is_surrogate)))
};
assert(surrogates > 0 && surrogates <= 2);
assert(di == length[0] + length[1]);
assert(i >= 1 && i <= 2);
return u64x2
{
di, 6U * i
di, 6U * surrogates
};
}
@ -3807,14 +3802,19 @@ ircd::json::string_stringify_utf16(u8x16 &block,
utf16::decode_surrogate_aligned_next(block & block_mask)
};
const u32x4 is_surrogate
const u32x4 length_encoded
{
utf8::length(unicode)
};
const u8x16 pair_mask
(
utf16::find_surrogate(block & block_mask)
length_encoded != 0 || shl<32>(length_encoded) == 4
);
const u32x4 surrogate_mask
const u8x16 is_surrogate
(
is_surrogate != 0U
utf16::find_surrogate(block & block_mask) & pair_mask
);
const u32x4 is_ctrl
@ -3822,11 +3822,6 @@ ircd::json::string_stringify_utf16(u8x16 &block,
unicode < 0x20
);
const u32x4 length_encoded
{
utf8::length(unicode)
};
const u32x4 ctrl_idx
{
unicode & is_ctrl
@ -3838,23 +3833,10 @@ ircd::json::string_stringify_utf16(u8x16 &block,
u32(ctrl_tab_len[ctrl_idx[1]]),
};
const u32x4 is_non_bmp
(
unicode >= 0x10000U
);
const u32x4 is_surrogate_pair
{
(is_non_bmp | shl<32>(is_non_bmp)) &
(surrogate_mask | shr<32>(surrogate_mask))
};
// Determine the utf-8 encoding length for each codepoint...
// Supplement the escaped surrogate length for excluded codepoints.
const u32x4 length
{
(length_encoded & ~is_ctrl) |
(length_surrogate & is_ctrl & ~is_surrogate_pair & surrogate_mask)
(length_encoded & ~is_ctrl) | (length_surrogate & is_ctrl)
};
const u32x4 encoded_sparse
@ -3876,7 +3858,7 @@ ircd::json::string_stringify_utf16(u8x16 &block,
const auto surrogates
{
simd::popcnt(u64x2(popmask(u8x16(is_surrogate))))
simd::popcnt(u64x2(popmask(is_surrogate)))
};
assert(di == length[0] + length[1]);
@ -4014,31 +3996,31 @@ ircd::u64x2
ircd::json::string_serialized_utf16(const u8x16 block,
const u8x16 block_mask)
{
const u32x4 is_surrogate
(
utf16::find_surrogate(block & block_mask)
);
const u32x4 surrogate_mask
(
is_surrogate != 0U
);
const u32x4 unicode
{
utf16::decode_surrogate_aligned_next(block & block_mask)
};
const u32x4 is_ctrl
(
unicode < 0x20
);
const u32x4 length_encoded
{
utf8::length(unicode)
};
const u8x16 pair_mask
(
length_encoded != 0 || shl<32>(length_encoded) == 4
);
const u8x16 is_surrogate
(
utf16::find_surrogate(block & block_mask) & pair_mask
);
const u32x4 is_ctrl
(
unicode < 0x20
);
const u32x4 ctrl_idx
{
unicode & is_ctrl
@ -4050,23 +4032,10 @@ ircd::json::string_serialized_utf16(const u8x16 block,
ctrl_tab_len[ctrl_idx[1]],
};
const u32x4 is_non_bmp
(
unicode >= 0x10000U
);
const u32x4 is_surrogate_pair
{
(is_non_bmp | shl<32>(is_non_bmp)) &
(surrogate_mask | shr<32>(surrogate_mask))
};
// Determine the utf-8 encoding length for each codepoint...
// Supplement the escaped surrogate length for excluded codepoints.
const u32x4 length
{
(length_encoded & ~is_ctrl) |
(length_surrogate & is_ctrl & ~is_surrogate_pair & surrogate_mask)
(length_encoded & ~is_ctrl) | (length_surrogate & is_ctrl)
};
const auto total_length
@ -4076,7 +4045,7 @@ ircd::json::string_serialized_utf16(const u8x16 block,
const auto surrogates
{
popcnt(u64x2(popmask(u8x16(is_surrogate))))
popcnt(u64x2(popmask(is_surrogate)))
};
return u64x2

View File

@ -12,6 +12,13 @@
// utf16
//
namespace ircd::utf16
{
	/// Lane-select mask with every bit set in lane[0] and all other lanes
	/// zero. AND-ing a u32x4 with this keeps only its first lane.
	static const u32x4 mask_one
	{
		-1U, 0U, 0U, 0U
	};

	/// Lane-select mask with every bit set in lane[0] and lane[1] and the
	/// remaining lanes zero. AND-ing a u32x4 with this keeps its first two
	/// lanes.
	static const u32x4 mask_two
	{
		-1U, -1U, 0U, 0U
	};
}
/// Decodes one or two escaped surrogates (surrogate pair) aligned to the
/// front of the input block. If the surrogates are a pair which decode into
/// a single codepoint, only the first element of the return vector is used;
@ -137,33 +144,55 @@ noexcept
(codepoint_paired <= 0xffffU) & ~(shl<32>(codepoint_high))
};
// When one surrogate is input, only lane[0]
const u32x4 ret_codepoint_single
const u32x4 single_mask
{
codepoint_unpaired & ~surrogate_pair_range & ~surrogate_deuce
~surrogate_pair_range & ~surrogate_deuce & mask_one
};
const u32x4 paired_mask
{
surrogate_paired & surrogate_deuce & mask_one
};
const u32x4 unpaired_mask
{
~surrogate_pair_range & surrogate_deuce & mask_two
};
// When one surrogate is input, only lane[0]
const u32x4 single_codepoint
{
codepoint_unpaired & single_mask
};
// When two surrogates in a pair are input, lane[0] only
const u32x4 ret_codepoint_paired
const u32x4 paired_codepoint
{
codepoint_paired & (surrogate_paired & surrogate_deuce)
codepoint_paired & paired_mask
};
// When two unrelated surrogates are input, lane[0] and lane[1]
const u32x4 ret_codepoint_unpaired
const u32x4 unpaired_codepoint
{
codepoint_unpaired & ~surrogate_pair_range & surrogate_deuce
codepoint_unpaired & unpaired_mask
};
static const u32x4
mask_one { -1U, 0U, 0U, 0U, },
mask_two { -1U, -1U, 0U, 0U, };
const u32x4 codepoint
{
single_codepoint | paired_codepoint | unpaired_codepoint
};
return 0
| (ret_codepoint_single & mask_one)
| (ret_codepoint_paired & mask_one)
| (ret_codepoint_unpaired & mask_two)
;
const u32x4 ret_mask
{
single_mask | paired_mask | unpaired_mask
};
const u32x4 ret
{
codepoint | ~ret_mask
};
return ret;
}
namespace ircd::utf16