ircd::json: Fix utf16 surrogate pairing discernment.

ircd::utf16: Fix return value for unused lanes; use sentinel.
2024-05-19 19:33:45 +02:00 · 2023-01-31 00:51:18 -08:00 · 2023-01-31 00:51:18 -08:00 · a308010a1b
parent fc0b6bbecb
commit a308010a1b
2 changed files with 95 additions and 97 deletions
--- a/ircd/json.cc
+++ b/ircd/json.cc
@ -3558,21 +3558,26 @@ ircd::u64x2
 ircd::json::string_unescape_utf16(u8x16 &block,
                                  const u8x16 block_mask)
 {
 	const u8x16 surr_mark
 	{
 		utf16::find_surrogate(block) & block_mask
 	};
 	const u8x16 surr_mask
 	{
 		utf16::mask_surrogate(surr_mark)
 	};
 	const u32x4 unicode
 	{
-		utf16::decode_surrogate_aligned_next(block)
+		utf16::decode_surrogate_aligned_next(block & block_mask)
 	};
 	const u32x4 length
 	{
 		utf8::length(unicode)
 	};
 	const u8x16 pair_mask
 	(
 		length != 0 || shl<32>(length) == 4
 	);
 	const u8x16 is_surrogate
 	(
 		utf16::find_surrogate(block & block_mask) & pair_mask
 	);
 	const u32x4 encoded_sparse
 	{
 		utf8::encode_sparse(unicode)
@ -3583,32 +3588,22 @@ ircd::json::string_unescape_utf16(u8x16 &block,
 		encoded_sparse
 	);
 	u32x4 is_surrogate
 	{
 		-1U, -1U, 0, 0
 	};
 	for(size_t i(0); i < 6; ++i)
 	{
 		is_surrogate[0] &= surr_mask[i];
 		is_surrogate[1] &= surr_mask[i + 6];
 	}
 	const u32x4 length
 	{
 		utf8::length(unicode) & is_surrogate
 	};
 	size_t di(0), i(0);
 	for(; i < 2 && length[i] > 0; ++i)
 		for(size_t j(0); j < length[i]; ++j)
 			block[di++] = encoded[i * 4 + j];
 	const auto surrogates
 	{
 		simd::popcnt(u64x2(popmask(is_surrogate)))
 	};
 	assert(surrogates > 0 && surrogates <= 2);
 	assert(di == length[0] + length[1]);
 	assert(i >= 1 && i <= 2);
 	return u64x2
 	{
-		di, 6U * i
+		di, 6U * surrogates
 	};
 }
@ -3807,14 +3802,19 @@ ircd::json::string_stringify_utf16(u8x16 &block,
 		utf16::decode_surrogate_aligned_next(block & block_mask)
 	};
-	const u32x4 is_surrogate
+	const u32x4 length_encoded
 	{
 		utf8::length(unicode)
 	};
 	const u8x16 pair_mask
 	(
-		utf16::find_surrogate(block & block_mask)
+		length_encoded != 0 || shl<32>(length_encoded) == 4
 	);
-	const u32x4 surrogate_mask
+	const u8x16 is_surrogate
 	(
-		is_surrogate != 0U
+		utf16::find_surrogate(block & block_mask) & pair_mask
 	);
 	const u32x4 is_ctrl
@ -3822,11 +3822,6 @@ ircd::json::string_stringify_utf16(u8x16 &block,
 		unicode < 0x20
 	);
 	const u32x4 length_encoded
 	{
 		utf8::length(unicode)
 	};
 	const u32x4 ctrl_idx
 	{
 		unicode & is_ctrl
@ -3838,23 +3833,10 @@ ircd::json::string_stringify_utf16(u8x16 &block,
 		u32(ctrl_tab_len[ctrl_idx[1]]),
 	};
 	const u32x4 is_non_bmp
 	(
 		unicode >= 0x10000U
 	);
 	const u32x4 is_surrogate_pair
 	{
 		(is_non_bmp | shl<32>(is_non_bmp)) &
 		(surrogate_mask | shr<32>(surrogate_mask))
 	};
 	// Determine the utf-8 encoding length for each codepoint...
 	// Supplement the escaped surrogate length for excluded codepoints.
 	const u32x4 length
 	{
-		(length_encoded & ~is_ctrl) |
+		(length_encoded & ~is_ctrl) | (length_surrogate & is_ctrl)
 		(length_surrogate & is_ctrl & ~is_surrogate_pair & surrogate_mask)
 	};
 	const u32x4 encoded_sparse
@ -3876,7 +3858,7 @@ ircd::json::string_stringify_utf16(u8x16 &block,
 	const auto surrogates
 	{
-		simd::popcnt(u64x2(popmask(u8x16(is_surrogate))))
+		simd::popcnt(u64x2(popmask(is_surrogate)))
 	};
 	assert(di == length[0] + length[1]);
@ -4014,31 +3996,31 @@ ircd::u64x2
 ircd::json::string_serialized_utf16(const u8x16 block,
                                    const u8x16 block_mask)
 {
 	const u32x4 is_surrogate
 	(
 		utf16::find_surrogate(block & block_mask)
 	);
 	const u32x4 surrogate_mask
 	(
 		is_surrogate != 0U
 	);
 	const u32x4 unicode
 	{
 		utf16::decode_surrogate_aligned_next(block & block_mask)
 	};
 	const u32x4 is_ctrl
 	(
 		unicode < 0x20
 	);
 	const u32x4 length_encoded
 	{
 		utf8::length(unicode)
 	};
 	const u8x16 pair_mask
 	(
 		length_encoded != 0 || shl<32>(length_encoded) == 4
 	);
 	const u8x16 is_surrogate
 	(
 		utf16::find_surrogate(block & block_mask) & pair_mask
 	);
 	const u32x4 is_ctrl
 	(
 		unicode < 0x20
 	);
 	const u32x4 ctrl_idx
 	{
 		unicode & is_ctrl
@ -4050,23 +4032,10 @@ ircd::json::string_serialized_utf16(const u8x16 block,
 		ctrl_tab_len[ctrl_idx[1]],
 	};
 	const u32x4 is_non_bmp
 	(
 		unicode >= 0x10000U
 	);
 	const u32x4 is_surrogate_pair
 	{
 		(is_non_bmp | shl<32>(is_non_bmp)) &
 		(surrogate_mask | shr<32>(surrogate_mask))
 	};
 	// Determine the utf-8 encoding length for each codepoint...
 	// Supplement the escaped surrogate length for excluded codepoints.
 	const u32x4 length
 	{
-		(length_encoded & ~is_ctrl) |
+		(length_encoded & ~is_ctrl) | (length_surrogate & is_ctrl)
 		(length_surrogate & is_ctrl & ~is_surrogate_pair & surrogate_mask)
 	};
 	const auto total_length
@ -4076,7 +4045,7 @@ ircd::json::string_serialized_utf16(const u8x16 block,
 	const auto surrogates
 	{
-		popcnt(u64x2(popmask(u8x16(is_surrogate))))
+		popcnt(u64x2(popmask(is_surrogate)))
 	};
 	return u64x2
--- a/ircd/utf.cc
+++ b/ircd/utf.cc
@ -12,6 +12,13 @@
 // utf16
 //
 namespace ircd::utf16
 {
 	// Lane-selection masks for four-lane u32 vectors: mask_one has every bit
 	// set in lane[0] only; mask_two has every bit set in lane[0] and lane[1].
 	// ANDing a vector with one of these zeroes the remaining lanes.
 	static const u32x4
 	mask_one { -1U,  0U,  0U,  0U, },
 	mask_two { -1U, -1U,  0U,  0U, };
 }
 /// Decodes one or two escaped surrogates (surrogate pair) aligned to the
 /// front of the input block. If the surrogates are a pair which decode into
 /// a single codepoint, only the first element of the return vector is used;
@ -137,33 +144,55 @@ noexcept
 		(codepoint_paired <= 0xffffU) & ~(shl<32>(codepoint_high))
 	};
-	// When one surrogate is input, only lane[0]
+	const u32x4 single_mask
 	const u32x4 ret_codepoint_single
 	{
-		codepoint_unpaired & ~surrogate_pair_range & ~surrogate_deuce
+		~surrogate_pair_range & ~surrogate_deuce & mask_one
 	};
 	const u32x4 paired_mask
 	{
 		surrogate_paired & surrogate_deuce & mask_one
 	};
 	const u32x4 unpaired_mask
 	{
 		~surrogate_pair_range & surrogate_deuce & mask_two
 	};
 	// When one surrogate is input, only lane[0]
 	const u32x4 single_codepoint
 	{
 		codepoint_unpaired & single_mask
 	};
 	// When two surrogates in a pair are input, lane[0] only
-	const u32x4 ret_codepoint_paired
+	const u32x4 paired_codepoint
 	{
-		codepoint_paired & (surrogate_paired & surrogate_deuce)
+		codepoint_paired & paired_mask
 	};
 	// When two unrelated surrogates are input, lane[0] and lane[1]
-	const u32x4 ret_codepoint_unpaired
+	const u32x4 unpaired_codepoint
 	{
-		codepoint_unpaired & ~surrogate_pair_range & surrogate_deuce
+		codepoint_unpaired & unpaired_mask
 	};
-	static const u32x4
+	const u32x4 codepoint
-	mask_one { -1U,  0U,  0U,  0U, },
+	{
-	mask_two { -1U, -1U,  0U,  0U, };
+		single_codepoint | paired_codepoint | unpaired_codepoint
 	};
-	return 0
+	const u32x4 ret_mask
-	| (ret_codepoint_single & mask_one)
+	{
-	| (ret_codepoint_paired & mask_one)
+		single_mask | paired_mask | unpaired_mask
-	| (ret_codepoint_unpaired & mask_two)
+	};
-	;
+
 	const u32x4 ret
 	{
 		codepoint | ~ret_mask
 	};
 	return ret;
 }
 namespace ircd::utf16