construct/ircd/utf.cc

// The Construct
//
// Copyright (C) The Construct Developers, Authors & Contributors
// Copyright (C) 2016-2020 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

//
// utf16
//

/// Decodes one or two escaped surrogates (surrogate pair) aligned to the
/// front of the input block. If the surrogates are a pair which decode into
/// a single codepoint, only the first element of the return vector is used;
/// otherwise each surrogate decodes into each element. Three surrogates
/// cannot be decoded at once, so the last two elements are never used.
///
/// The escaped form examined here is the six-byte text `\uXXXX` where each X
/// is an ASCII hex digit of either case; the first escape is expected at
/// byte[0] and the second, if any, at byte[6].
ircd::u32x4
ircd::utf16::decode_surrogate_aligned_next(const u8x16 input)
noexcept
{
	// Per-byte masks for the three ASCII ranges which spell a hex digit.
	const u8x16 is_hex[3]
	{
		input >= '0' && input <= '9',
		input >= 'A' && input <= 'F',
		input >= 'a' && input <= 'f',
	};

	// Numeric value (0-15) of every byte that is a hex digit; bytes which are
	// not hex digits contribute zero because each term is masked by its range.
	const u8x16 hex_nibble
	{
		((input - 0x30) & is_hex[0])
		| ((input - 0x41 + 0x0a) & is_hex[1])
		| ((input - 0x61 + 0x0a) & is_hex[2])
	};

	// Union of the three range masks, viewed as a single 128-bit value so it
	// can be shifted as a whole below.
	const u128x1 is_hex_nibble
	{
		is_hex[0] | is_hex[1] | is_hex[2]
	};

	// Masks the starting byte (the '\' char) of each valid surrogate.
	// NOTE(review): this reads as if shr<N> moves byte[i + N/8] down onto
	// byte[i], lining each following character up with the backslash --
	// confirm against the simd shift helper.
	const u32x4 is_surrogate
	{
		u128x1(input == '\\') &
		shr<8>(u128x1(input == 'u')) &
		shr<16>(is_hex_nibble) &
		shr<24>(is_hex_nibble) &
		shr<32>(is_hex_nibble) &
		shr<40>(is_hex_nibble)
	};

	// is_surrogate may leave byte[0] and byte[6] (and possibly byte[12] which
	// we don't care about here) as 0xff. Our result will be 4 byte codepoints
	// matching those 6 byte inputs, so we shift the byte[6] over to byte[4]
	// and stiffen the mask about to be generated.
	const u32x4 surrogate_mask
	{
		((is_surrogate & 0xff) | (is_surrogate >> 16)) == 0xffU
	};

	// Decide if one or two surrogates were actually input and assert that
	// between both lanes if so.
	const u32x4 surrogate_deuce
	{
		(surrogate_mask & shr<32>(u128x1(surrogate_mask))) |
		(surrogate_mask & shl<32>(u128x1(surrogate_mask)))
	};

	// ASCII to integral conversion of the upper nibbles (digit values moved
	// over by two bytes).
	const u8x16 hex_upper
	{
		shr<16>(u128x1(hex_nibble))
	};

	// ASCII to integral conversion of the lower nibbles (digit values moved
	// over by three bytes).
	const u8x16 hex_lower
	{
		shr<24>(u128x1(hex_nibble))
	};

	// pack upper and lower nibbles into bytes, though these have a space
	// between them when 4 nibbles becomes 2 bytes
	const u8x16 hex_byte
	{
		(hex_upper << 4) | hex_lower
	};

	// Result for one or two unpaired surrogates. Elements [0],[2] of hex_byte
	// are taken for the first escape and [6],[8] for the second; within each
	// 32-bit lane the later pair of digits is placed in the first byte
	// (assumes a little-endian lane layout -- TODO confirm).
	const u32x4 codepoint_unpaired
	{
		u8x16
		{
			hex_byte[2], hex_byte[0], 0, 0,
			hex_byte[8], hex_byte[6], 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0,
		}
	};

	// Determine if the unpaired codepoints can make a surrogate pair. The test
	// covers the whole 0xd800-0xdfff range and does not distinguish a leading
	// surrogate from a trailing one.
	const u32x4 surrogate_pair_range
	{
		codepoint_unpaired >= 0xd800U && codepoint_unpaired <= 0xdfffU
	};

	// Mask lane[0] if the codepoints are actually a surrogate pair
	const u32x4 surrogate_paired
	{
		surrogate_pair_range & shr<32>(u128x1(surrogate_pair_range))
	};

	// Pre-processing shuffle for surrogate pair decode: merges the value of
	// the following lane into the upper half of each lane.
	const u32x4 codepoint_pre_paired
	{
		shr<16>(u128x1(codepoint_unpaired)) | codepoint_unpaired
	};

	// Decode surrogate pair: ten bits from the lower half become the high
	// bits, ten bits from the upper half become the low bits, offset 0x10000.
	const u32x4 codepoint_paired
	{
		0x10000U +
		((codepoint_pre_paired & 0x000003ffU) << 10) +
		((codepoint_pre_paired & 0x03ff0000U) >> 16)
	};

	// Decide if the codepoint is in the supplementary plane (3+ bytes)
	// NOTE(review): only consumed by codepoint_low below.
	const u32x4 codepoint_high
	{
		(codepoint_paired > 0xffffU) & surrogate_paired
	};

	// Decide if the codepoint is in the BMP (2- bytes)
	// NOTE(review): not referenced again in this function.
	const u32x4 codepoint_low
	{
		(codepoint_paired <= 0xffffU) & ~(shl<32>(u128x1(codepoint_high)))
	};

	// When two surrogates in a pair are input, lane[0] only
	const u32x4 ret_codepoint_paired
	{
		codepoint_paired & (surrogate_paired & surrogate_deuce)
	};

	// When two unrelated surrogates are input, lane[0] and lane[1]. Any lane
	// whose value falls in the surrogate range is cleared here.
	const u32x4 ret_codepoint_unpaired
	{
		codepoint_unpaired & ~surrogate_pair_range & surrogate_deuce
	};

	// When one surrogate is input, only lane[0]
	// NOTE(review): this term is gated by ~surrogate_deuce but not by
	// surrogate_mask itself -- confirm callers guarantee a valid escape at
	// byte[0] and nothing hex-like at bytes 8-11 in that case.
	const u32x4 ret_codepoint_single
	{
		codepoint_unpaired & ~surrogate_pair_range & ~surrogate_deuce
	};

	// The three cases are merged into the final result.
	return 0
	| ret_codepoint_paired
	| ret_codepoint_unpaired
	| ret_codepoint_single
	;
}

namespace ircd::utf16
{
	// All 128 bits set.
	static const u128x1 full_mask {~u128x1{0}};

	// One mask per candidate length used by find_surrogate_partial(); index 0
	// pairs with the complete six-character form and index 5 with the lone
	// backslash.
	extern const u128x1 truncation_table[6];
}

// NOTE(review): ~full_mask is all-zero. If shl<N> is a plain logical shift of
// the 128-bit value, shifting zero stays zero and the outer complement makes
// every entry all-ones, i.e. the table would not restrict any position.
// Confirm whether `~shl<N>(full_mask)` (or another form) was intended.
decltype(ircd::utf16::truncation_table)
ircd::utf16::truncation_table
{
	~shl<0x30>(~full_mask),
	~shl<0x28>(~full_mask),
	~shl<0x20>(~full_mask),
	~shl<0x18>(~full_mask),
	~shl<0x10>(~full_mask),
	~shl<0x08>(~full_mask),
};

/// Scan for escaped utf-16 surrogates (`\uXXXX`), additionally accepting
/// every shorter prefix of that form down to a lone backslash so that a
/// sequence cut off by the end of the input can still be reported. Each
/// prefix length is filtered through its own truncation_table entry before
/// the candidates are merged; positional restriction of the shorter forms
/// is delegated entirely to that table. Matches are flagged at the byte of
/// the leading backslash.
ircd::u8x16
ircd::utf16::find_surrogate_partial(const u8x16 input)
noexcept
{
	const u128x1 backslash
	{
		input == '\\'
	};

	const u128x1 letter_u
	{
		input == 'u'
	};

	const u128x1 hex_digit
	{
		(input >= '0' && input <= '9') ||
		(input >= 'A' && input <= 'F') ||
		(input >= 'a' && input <= 'f')
	};

	// Grow the candidate one character at a time; prefix_N requires the
	// first N characters of `\uXXXX` to be present starting at a given byte.
	const u128x1 prefix_1 { backslash };
	const u128x1 prefix_2 { prefix_1 & shr<8>(letter_u) };
	const u128x1 prefix_3 { prefix_2 & shr<16>(hex_digit) };
	const u128x1 prefix_4 { prefix_3 & shr<24>(hex_digit) };
	const u128x1 prefix_5 { prefix_4 & shr<32>(hex_digit) };
	const u128x1 prefix_6 { prefix_5 & shr<40>(hex_digit) };

	// Table index k pairs with the candidate that is missing k characters.
	u128x1 matched { prefix_6 & truncation_table[0] };
	matched |= prefix_5 & truncation_table[1];
	matched |= prefix_4 & truncation_table[2];
	matched |= prefix_3 & truncation_table[3];
	matched |= prefix_2 & truncation_table[4];
	matched |= prefix_1 & truncation_table[5];
	return matched;
}

/// Scan for complete escaped utf-16 surrogates of the form `\uXXXX`, where
/// each X is an ASCII hex digit of either case. Matches are flagged at the
/// byte of the leading backslash; all other bytes of the result are zero.
ircd::u8x16
ircd::utf16::find_surrogate(const u8x16 input)
noexcept
{
	const u128x1 backslash
	{
		input == '\\'
	};

	const u128x1 letter_u
	{
		input == 'u'
	};

	const u128x1 hex_digit
	{
		(input >= '0' && input <= '9') ||
		(input >= 'A' && input <= 'F') ||
		(input >= 'a' && input <= 'f')
	};

	// Four consecutive hex digits, lined up with the backslash two bytes
	// before the first of them.
	const u128x1 hex_quad
	{
		shr<16>(hex_digit) & shr<24>(hex_digit) &
		shr<32>(hex_digit) & shr<40>(hex_digit)
	};

	const u128x1 found
	{
		backslash & shr<8>(letter_u) & hex_quad
	};

	return found;
}

/// Widen eight big-endian utf-16 code units (sixteen bytes) into eight
/// char32_t lanes in parallel. Each output lane is formed from one pair of
/// adjacent input bytes with the first byte as the high-order half. The
/// result vector is twice the size of the input; no template is offered
/// yet, only this one set of dimensions.
ircd::u32x8
ircd::utf16::convert_u32x8(const u8x16 string)
noexcept
{
	// Combine the byte at `hi` (high-order) with the byte after it.
	const auto unit{[&string]
	(const unsigned hi) -> u32
	{
		return (u32(string[hi]) << 8) | string[hi + 1];
	}};

	return u32x8
	{
		unit(0x00), unit(0x02), unit(0x04), unit(0x06),
		unit(0x08), unit(0x0a), unit(0x0c), unit(0x0e),
	};
}

//
// utf8
//

/// Unpack sixteen bytes of utf-8 into sixteen 32-bit lanes, one lane per
/// input byte position.
///
/// - A lane whose byte has the high bit clear yields that byte's value.
/// - A lane whose byte is a lead byte (0xc2..0xf4) yields the bytes of its
///   sequence packed together: the lead byte in bits 0-7 followed by the
///   next one to three input bytes in bits 8-15, 16-23 and 24-31. The payload
///   bits are not extracted; the lane holds the still-encoded sequence.
/// - Every other lane (continuation bytes and the remaining high bytes) is
///   zero.
///
/// The bytes following a lead byte are not checked to be continuation bytes.
/// NOTE(review): simd::lane_cast is presumed to zero-extend each byte into
/// its own lane -- confirm against the simd header.
ircd::u32x16
ircd::utf8::decode(const u8x16 string)
noexcept
{
	const u32x16 in
	{
		simd::lane_cast<u32x16>(string)
	};

	// High bit clear: a one-byte sequence.
	const u32x16 is_single
	{
		(in & 0x80) == 0
	};

	// 0xc2..0xf4 inclusive; the unsigned subtraction wraps for anything
	// below 0xc2 so a single compare covers both ends of the range.
	const u32x16 is_lead
	{
		(in - 0xc2) <= 0x32
	};

	// Continuation bytes are 0x80..0xbf inclusive. Not consumed yet; kept for
	// reference alongside the other classifications.
	[[gnu::unused]] const u32x16 is_trail
	{
		in >= 0x80 && in <= 0xbf
	};

	// Number of bytes expected after a lead byte: 1 below 0xe0, 2 below 0xf0,
	// otherwise 3. Zero for lanes which are not lead bytes.
	const u32x16 expect_trail
	{
		(((in >= 0xe0) & 1) + ((in >= 0xf0) & 1) + 1) & is_lead
	};

	const u32x16 expect_length
	{
		expect_trail + 1
	};

	// Each byte pre-positioned for slot 0, 1, 2 or 3 of a packed lane.
	const u32x16 shift[4]
	{
		in << 0,
		in << 8,
		in << 16,
		in << 24,
	};

	// Lane i gathers input bytes i, i+1, i+2, i+3 into one word.
	// NOTE(review): in the last three rows, positions past the end of the
	// block are substituted with index 0x0f, so a sequence which runs off the
	// end packs repeated copies of the final byte rather than zero -- confirm
	// callers never consume those lanes for truncated sequences.
	const u32x16 multibyte_packs
	{
		in[0x00] | shift[0x01][0x01] | shift[0x02][0x02] | shift[0x03][0x03],
		in[0x01] | shift[0x01][0x02] | shift[0x02][0x03] | shift[0x03][0x04],
		in[0x02] | shift[0x01][0x03] | shift[0x02][0x04] | shift[0x03][0x05],
		in[0x03] | shift[0x01][0x04] | shift[0x02][0x05] | shift[0x03][0x06],
		in[0x04] | shift[0x01][0x05] | shift[0x02][0x06] | shift[0x03][0x07],
		in[0x05] | shift[0x01][0x06] | shift[0x02][0x07] | shift[0x03][0x08],
		in[0x06] | shift[0x01][0x07] | shift[0x02][0x08] | shift[0x03][0x09],
		in[0x07] | shift[0x01][0x08] | shift[0x02][0x09] | shift[0x03][0x0a],
		in[0x08] | shift[0x01][0x09] | shift[0x02][0x0a] | shift[0x03][0x0b],
		in[0x09] | shift[0x01][0x0a] | shift[0x02][0x0b] | shift[0x03][0x0c],
		in[0x0a] | shift[0x01][0x0b] | shift[0x02][0x0c] | shift[0x03][0x0d],
		in[0x0b] | shift[0x01][0x0c] | shift[0x02][0x0d] | shift[0x03][0x0e],
		in[0x0c] | shift[0x01][0x0d] | shift[0x02][0x0e] | shift[0x03][0x0f],
		in[0x0d] | shift[0x01][0x0e] | shift[0x02][0x0f] | shift[0x03][0x0f],
		in[0x0e] | shift[0x01][0x0f] | shift[0x02][0x0f] | shift[0x03][0x0f],
		in[0x0f] | shift[0x01][0x0f] | shift[0x02][0x0f] | shift[0x03][0x0f],
	};

	// Trim each packed lane down to the number of bytes its first byte
	// announces.
	const u32x16 multibyte
	{
		0
		| (multibyte_packs & (expect_length == 1) & 0x000000ffU)
		| (multibyte_packs & (expect_length == 2) & 0x0000ffffU)
		| (multibyte_packs & (expect_length == 3) & 0x00ffffffU)
		| (multibyte_packs & (expect_length == 4) & 0xffffffffU)
	};

	// Only single-byte lanes and lead-byte lanes survive.
	const u32x16 integers
	{
		(in & is_single) | (multibyte & is_lead)
	};

	return integers;
}

namespace ircd::utf8
{
	// Lane-count-generic implementation shared by the encode() overloads;
	// the definition appears later in this file.
	template<class u32xN> static u32xN _encode(const u32xN codepoint) noexcept;
}

/// Four-lane entry point; forwards to the shared _encode() template.
ircd::u32x4
ircd::utf8::encode(const u32x4 codepoint)
noexcept
{
	return _encode<u32x4>(codepoint);
}

/// Eight-lane entry point; forwards to the shared _encode() template.
ircd::u32x8
ircd::utf8::encode(const u32x8 codepoint)
noexcept
{
	return _encode<u32x8>(codepoint);
}

/// Sixteen-lane entry point; forwards to the shared _encode() template.
ircd::u32x16
ircd::utf8::encode(const u32x16 codepoint)
noexcept
{
	return _encode<u32x16>(codepoint);
}

/// Encode several char32_t codepoints as utf-8 in parallel. Every lane
/// receives the encoding of its own codepoint with the first encoded byte in
/// bits 0-7 and later bytes in successively higher bits; unused high bytes
/// of a lane stay zero (the result is sparse, nothing is compacted across
/// lanes). The encoding width per lane is whatever length() reports.
template<class u32xN>
u32xN
ircd::utf8::_encode(const u32xN codepoint)
noexcept
{
	const u32xN len
	{
		length(codepoint)
	};

	// Continuation bytes (10xxxxxx), six payload bits apiece.
	const u32xN
	tail_0 { (codepoint & 0x3f) | 0x80 },          // codepoint bits 0-5
	tail_1 { ((codepoint >> 6) & 0x3f) | 0x80 },   // codepoint bits 6-11
	tail_2 { ((codepoint >> 12) & 0x3f) | 0x80 };  // codepoint bits 12-17

	// Lead bytes for the two, three and four byte forms.
	const u32xN
	lead_2 { ((codepoint >> 6) | 0xc0) & 0xff },
	lead_3 { ((codepoint >> 12) | 0xe0) & 0xff },
	lead_4 { ((codepoint >> 18) | 0xf0) & 0xff };

	// Candidate encodings for every width; only the one matching len is kept.
	const u32xN
	enc_2 { lead_2 | (tail_0 << 8) },                                   // byte[0..1]
	enc_3 { lead_3 | (tail_1 << 8) | (tail_0 << 16) },                  // byte[0..2]
	enc_4 { lead_4 | (tail_2 << 8) | (tail_1 << 16) | (tail_0 << 24) }; // byte[0..3]

	// NOTE(review): for lanes where length() reports 0 the value placed in
	// the lane is the plain number 0xFFFD, not the utf-8 byte sequence of
	// U+FFFD -- confirm how callers treat zero-length lanes.
	const u32xN
	pick_0 { (len == 0) & 0xFFFD },
	pick_1 { (len == 1) & codepoint },
	pick_2 { (len == 2) & enc_2 },
	pick_3 { (len == 3) & enc_3 },
	pick_4 { (len == 4) & enc_4 };

	return pick_0 | pick_1 | pick_2 | pick_3 | pick_4;
}

namespace ircd::utf8
{
	// Lane-count-generic implementation shared by the length() overloads;
	// the definition appears later in this file.
	template<class u32xN> static u32xN _length(const u32xN codepoint) noexcept;
}

/// Four-lane entry point; forwards to the shared _length() template.
ircd::u32x4
ircd::utf8::length(const u32x4 codepoint)
noexcept
{
	return _length<u32x4>(codepoint);
}

/// Eight-lane entry point; forwards to the shared _length() template.
ircd::u32x8
ircd::utf8::length(const u32x8 codepoint)
noexcept
{
	return _length<u32x8>(codepoint);
}

/// Sixteen-lane entry point; forwards to the shared _length() template.
ircd::u32x16
ircd::utf8::length(const u32x16 codepoint)
noexcept
{
	return _length<u32x16>(codepoint);
}

/// Compute, for several char32_t codepoints in parallel, how many bytes the
/// utf-8 encoding of each one occupies. Every output lane holds 1, 2, 3 or
/// 4; a lane holds 0 when its codepoint lies in 0xd800-0xdfff or above
/// 0x10ffff, since none of the ranges below cover those values.
template<class u32xN>
u32xN
ircd::utf8::_length(const u32xN codepoint)
noexcept
{
	// One lane mask per encodable range; the ranges do not overlap.
	const u32xN
	one_byte      { codepoint <= 0x7f                               },
	two_bytes     { codepoint > 0x7f && codepoint <= 0x7ff          },
	three_below   { codepoint > 0x7ff && codepoint <= 0xd7ff        },
	three_above   { codepoint > 0xdfff && codepoint <= 0xffff       },
	four_bytes    { codepoint > 0xffff && codepoint <= 0x10ffff     };

	// The excluded ranges; not consumed, retained for future reference.
	[[gnu::unused]] const u32xN
	surrogate_gap { codepoint > 0xd7ff && codepoint <= 0xdfff       },
	unencodable   { (codepoint > 0x10ffff) | surrogate_gap          };

	// Both three-byte ranges (either side of the surrogate gap) share a width.
	const u32xN three_bytes
	{
		three_below | three_above
	};

	return (one_byte & 1)
	| (two_bytes & 2)
	| (three_bytes & 3)
	| (four_bytes & 4)
	;
}
ircd::utf: Move definitions out-of-line for now. 2020-06-28 03:10:36 +02:00			`// The Construct`
			`//`
			`// Copyright (C) The Construct Developers, Authors & Contributors`
			`// Copyright (C) 2016-2020 Jason Volk <jason@zemos.net>`
			`//`
			`// Permission to use, copy, modify, and/or distribute this software for any`
			`// purpose with or without fee is hereby granted, provided that the above`
			`// copyright notice and this permission notice is present in all copies. The`
			`// full license for this software is available in the LICENSE file.`

ircd::utf: Add unpacked decode; de-template for linkage; minor reorg. 2020-06-28 04:59:41 +02:00			`//`
			`// utf16`
			`//`

ircd::utf16: Add aligned escaped hex surrogate decoder. 2020-07-03 01:39:14 +02:00			`/// Decodes one or two escaped surrogates (surrogate pair) aligned to the`
			`/// front of the input block. If the surrogates are a pair which decode into`
			`/// a single codepoint, only the first element of the return vector is used;`
			`/// otherwise each surrogate decodes into each element. Three surrogates`
			`/// cannot be decoded at once, so the last two elements are never used.`
			`ircd::u32x4`
			`ircd::utf16::decode_surrogate_aligned_next(const u8x16 input)`
			`noexcept`
			`{`
			`const u8x16 is_hex[3]`
			`{`
			`input >= '0' && input <= '9',`
			`input >= 'A' && input <= 'F',`
			`input >= 'a' && input <= 'f',`
			`};`

			`const u8x16 hex_nibble`
			`{`
			`((input - 0x30) & is_hex[0])`
			`\| ((input - 0x41 + 0x0a) & is_hex[1])`
			`\| ((input - 0x61 + 0x0a) & is_hex[2])`
			`};`

			`const u128x1 is_hex_nibble`
			`{`
			`is_hex[0] \| is_hex[1] \| is_hex[2]`
			`};`

ircd::utf16: Add surrogate pair decode to aligned_next routine. 2020-07-05 04:56:06 +02:00			`// Masks the starting byte (the '\' char) of each valid surrogate.`
			`const u32x4 is_surrogate`
ircd::utf16: Add aligned escaped hex surrogate decoder. 2020-07-03 01:39:14 +02:00			`{`
			`u128x1(input == '\\') &`
			`shr<8>(u128x1(input == 'u')) &`
			`shr<16>(is_hex_nibble) &`
			`shr<24>(is_hex_nibble) &`
			`shr<32>(is_hex_nibble) &`
			`shr<40>(is_hex_nibble)`
			`};`

ircd::utf16: Add surrogate pair decode to aligned_next routine. 2020-07-05 04:56:06 +02:00			`// is_surrogate may leave byte[0] and byte[6] (and possibly byte[12] which`
			`// we don't care about here) as 0xff. Our result will be 4 byte codepoints`
			`// matching those 6 byte inputs, so we shift the byte[6] over to byte[4]`
			`// and stiffen the mask about to be generated.`
ircd::utf16: Add aligned escaped hex surrogate decoder. 2020-07-03 01:39:14 +02:00			`const u32x4 surrogate_mask`
			`{`
ircd::utf16: Add surrogate pair decode to aligned_next routine. 2020-07-05 04:56:06 +02:00			`((is_surrogate & 0xff) \| (is_surrogate >> 16)) == 0xffU`
			`};`

			`// Decide if one or two surrogates were actually input and assert that`
			`// between both lanes if so.`
			`const u32x4 surrogate_deuce`
			`{`
			`(surrogate_mask & shr<32>(u128x1(surrogate_mask))) \|`
			`(surrogate_mask & shl<32>(u128x1(surrogate_mask)))`
			`};`

			`// ASCII to integral conversion of the upper nibbles`
			`const u8x16 hex_upper`
			`{`
			`shr<16>(u128x1(hex_nibble))`
			`};`

			`// ASCII to integral conversion of the lower nibbles`
			`const u8x16 hex_lower`
			`{`
			`shr<24>(u128x1(hex_nibble))`
			`};`

			`// pack upper and lower nibbles into bytes, though these have a space`
			`// between them when 4 nibbles becomes 2 bytes`
			`const u8x16 hex_byte`
			`{`
			`(hex_upper << 4) \| hex_lower`
			`};`

			`// Result for one or two unpaired surrogates`
			`const u32x4 codepoint_unpaired`
			`{`
			`u8x16`
			`{`
			`hex_byte[2], hex_byte[0], 0, 0,`
			`hex_byte[8], hex_byte[6], 0, 0,`
			`0, 0, 0, 0,`
			`0, 0, 0, 0,`
			`}`
			`};`

			`// Determine if the unpaired codepoints can make a surrogate pair`
			`const u32x4 surrogate_pair_range`
			`{`
			`codepoint_unpaired >= 0xd800U && codepoint_unpaired <= 0xdfffU`
			`};`

			`// Mask lane[0] if the codepoints are actually a surrogate pair`
			`const u32x4 surrogate_paired`
			`{`
			`surrogate_pair_range & shr<32>(u128x1(surrogate_pair_range))`
			`};`

			`// Pre-processing shuffle for surrogate pair decode`
			`const u32x4 codepoint_pre_paired`
			`{`
			`shr<16>(u128x1(codepoint_unpaired)) \| codepoint_unpaired`
			`};`

			`// Decode surrogate pair`
			`const u32x4 codepoint_paired`
			`{`
			`0x10000U +`
			`((codepoint_pre_paired & 0x000003ffU) << 10) +`
			`((codepoint_pre_paired & 0x03ff0000U) >> 16)`
			`};`

			`// Decide if the codepoint is in the supplementary plane (3+ bytes)`
			`const u32x4 codepoint_high`
			`{`
			`(codepoint_paired > 0xffffU) & surrogate_paired`
			`};`

			`// Decide if the codepoint is in the BMP (2- bytes)`
			`const u32x4 codepoint_low`
			`{`
			`(codepoint_paired <= 0xffffU) & ~(shl<32>(u128x1(codepoint_high)))`
			`};`

			`// When two surrogates in a pair are input, lane[0] only`
			`const u32x4 ret_codepoint_paired`
			`{`
			`codepoint_paired & (surrogate_paired & surrogate_deuce)`
ircd::utf16: Add aligned escaped hex surrogate decoder. 2020-07-03 01:39:14 +02:00			`};`

ircd::utf16: Add surrogate pair decode to aligned_next routine. 2020-07-05 04:56:06 +02:00			`// When two unrelated surrogates are input, lane[0] and lane[1]`
			`const u32x4 ret_codepoint_unpaired`
ircd::utf16: Add aligned escaped hex surrogate decoder. 2020-07-03 01:39:14 +02:00			`{`
ircd::utf16: Add surrogate pair decode to aligned_next routine. 2020-07-05 04:56:06 +02:00			`codepoint_unpaired & ~surrogate_pair_range & surrogate_deuce`
ircd::utf16: Add aligned escaped hex surrogate decoder. 2020-07-03 01:39:14 +02:00			`};`

ircd::utf16: Add surrogate pair decode to aligned_next routine. 2020-07-05 04:56:06 +02:00			`// When one surrogate is input, only lane[0]`
			`const u32x4 ret_codepoint_single`
ircd::utf16: Add aligned escaped hex surrogate decoder. 2020-07-03 01:39:14 +02:00			`{`
ircd::utf16: Add surrogate pair decode to aligned_next routine. 2020-07-05 04:56:06 +02:00			`codepoint_unpaired & ~surrogate_pair_range & ~surrogate_deuce`
ircd::utf16: Add aligned escaped hex surrogate decoder. 2020-07-03 01:39:14 +02:00			`};`

ircd::utf16: Add surrogate pair decode to aligned_next routine. 2020-07-05 04:56:06 +02:00			`return 0`
			`\| ret_codepoint_paired`
			`\| ret_codepoint_unpaired`
			`\| ret_codepoint_single`
			`;`
ircd::utf16: Add aligned escaped hex surrogate decoder. 2020-07-03 01:39:14 +02:00			`}`

ircd::utf16: Additional surrogate scan tools; minor reorg interface. 2020-06-30 04:23:54 +02:00			`namespace ircd::utf16`
ircd::utf: Move definitions out-of-line for now. 2020-06-28 03:10:36 +02:00			`{`
ircd::utf16: Additional surrogate scan tools; minor reorg interface. 2020-06-30 04:23:54 +02:00			`static const u128x1 full_mask {~u128x1{0}};`
			`extern const u128x1 truncation_table[6];`
ircd::utf: Move definitions out-of-line for now. 2020-06-28 03:10:36 +02:00			`}`

ircd::utf16: Additional surrogate scan tools; minor reorg interface. 2020-06-30 04:23:54 +02:00			`decltype(ircd::utf16::truncation_table)`
			`ircd::utf16::truncation_table`
			`{`
			`~shl<0x30>(~full_mask),`
			`~shl<0x28>(~full_mask),`
			`~shl<0x20>(~full_mask),`
			`~shl<0x18>(~full_mask),`
			`~shl<0x10>(~full_mask),`
			`~shl<0x08>(~full_mask),`
			`};`

ircd::utf8: Add multiple lane overloads for length(); internal template; minor comments. 2020-07-03 00:27:43 +02:00			`/// scan for utf-16 surrogates including incomplete sequences truncated`
			`/// by the end of the input; also matches a single trailing slash.`
ircd::utf16: Add escaped surrogate scan tool. ircd::utf16: Add surrogate mask routine; minor interface rename. 2020-06-29 01:41:23 +02:00			`ircd::u8x16`
ircd::utf16: Additional surrogate scan tools; minor reorg interface. 2020-06-30 04:23:54 +02:00			`ircd::utf16::find_surrogate_partial(const u8x16 input)`
ircd::utf16: Add escaped surrogate scan tool. ircd::utf16: Add surrogate mask routine; minor interface rename. 2020-06-29 01:41:23 +02:00			`noexcept`
			`{`
ircd::utf16: Additional surrogate scan tools; minor reorg interface. 2020-06-30 04:23:54 +02:00			`const u128x1 is_esc`
			`{`
			`input == '\\'`
			`};`

			`const u128x1 is_u`
			`{`
			`input == 'u'`
			`};`

			`const u128x1 is_hex_nibble`
ircd::utf16: Add escaped surrogate scan tool. ircd::utf16: Add surrogate mask routine; minor interface rename. 2020-06-29 01:41:23 +02:00			`{`
ircd::utf16: Additional surrogate scan tools; minor reorg interface. 2020-06-30 04:23:54 +02:00			`(input >= '0' && input <= '9') \|\|`
			`(input >= 'A' && input <= 'F') \|\|`
			`(input >= 'a' && input <= 'f')`
ircd::utf16: Add escaped surrogate scan tool. ircd::utf16: Add surrogate mask routine; minor interface rename. 2020-06-29 01:41:23 +02:00			`};`

ircd::utf16: Additional surrogate scan tools; minor reorg interface. 2020-06-30 04:23:54 +02:00			`const u128x1 surrogate_sans[6]`
ircd::utf16: Add escaped surrogate scan tool. ircd::utf16: Add surrogate mask routine; minor interface rename. 2020-06-29 01:41:23 +02:00			`{`
ircd::utf16: Additional surrogate scan tools; minor reorg interface. 2020-06-30 04:23:54 +02:00			`// complete`
			`is_esc`
			`& shr<8>(is_u)`
			`& shr<16>(is_hex_nibble) & shr<24>(is_hex_nibble)`
			`& shr<32>(is_hex_nibble) & shr<40>(is_hex_nibble),`

			`// sans 1`
			`is_esc`
			`& shr<8>(is_u)`
			`& shr<16>(is_hex_nibble) & shr<24>(is_hex_nibble)`
			`& shr<32>(is_hex_nibble),`

			`// sans 2`
			`is_esc`
			`& shr<8>(is_u)`
			`& shr<16>(is_hex_nibble) & shr<24>(is_hex_nibble),`

			`// sans 3`
			`is_esc`
			`& shr<8>(is_u)`
			`& shr<16>(is_hex_nibble),`

			`// sans 4`
			`is_esc`
			`& shr<8>(is_u),`

			`// sans 5`
			`is_esc,`
ircd::utf16: Add escaped surrogate scan tool. ircd::utf16: Add surrogate mask routine; minor interface rename. 2020-06-29 01:41:23 +02:00			`};`

ircd::utf16: Additional surrogate scan tools; minor reorg interface. 2020-06-30 04:23:54 +02:00			`const u128x1 ret`
			`{`
			`(surrogate_sans[0] & truncation_table[0]) \|`
			`(surrogate_sans[1] & truncation_table[1]) \|`
			`(surrogate_sans[2] & truncation_table[2]) \|`
			`(surrogate_sans[3] & truncation_table[3]) \|`
			`(surrogate_sans[4] & truncation_table[4]) \|`
			`(surrogate_sans[5] & truncation_table[5])`
			`};`

			`return ret;`
ircd::utf16: Add escaped surrogate scan tool. ircd::utf16: Add surrogate mask routine; minor interface rename. 2020-06-29 01:41:23 +02:00			`}`

			`ircd::u8x16`
			`ircd::utf16::find_surrogate(const u8x16 input)`
			`noexcept`
			`{`
			`const u128x1 is_hex_nibble`
			`{`
			`(input >= '0' && input <= '9') \|\|`
			`(input >= 'A' && input <= 'F') \|\|`
			`(input >= 'a' && input <= 'f')`
			`};`

			`const auto is_surrogate`
			`{`
			`u128x1(input == '\\') &`
			`shr<8>(u128x1(input == 'u')) &`
			`shr<16>(is_hex_nibble) &`
			`shr<24>(is_hex_nibble) &`
			`shr<32>(is_hex_nibble) &`
			`shr<40>(is_hex_nibble)`
			`};`

			`return is_surrogate;`
			`}`

ircd::utf16: Additional surrogate scan tools; minor reorg interface. 2020-06-30 04:23:54 +02:00			`/// Convert utf-16 two-byte surrogates (in big-endian) to char32_t codepoints`
			`/// in parallel. The result vector is twice the size as the input; no template`
			`/// is offered yet, just the dimensions someone needed for somewhere.`
			`ircd::u32x8`
			`ircd::utf16::convert_u32x8(const u8x16 string)`
			`noexcept`
			`{`
			`return u32x8`
			`{`
			`string[0x01] \| (u32(string[0x00]) << 8),`
			`string[0x03] \| (u32(string[0x02]) << 8),`
			`string[0x05] \| (u32(string[0x04]) << 8),`
			`string[0x07] \| (u32(string[0x06]) << 8),`
			`string[0x09] \| (u32(string[0x08]) << 8),`
			`string[0x0b] \| (u32(string[0x0a]) << 8),`
			`string[0x0d] \| (u32(string[0x0c]) << 8),`
			`string[0x0f] \| (u32(string[0x0e]) << 8),`
			`};`
			`}`

ircd::utf: Add unpacked decode; de-template for linkage; minor reorg. 2020-06-28 04:59:41 +02:00			`//`
			`// utf8`
			`//`

			`ircd::u32x16`
ircd::simd: Use value arguments for optimal calling convention. 2020-06-29 02:04:27 +02:00			`ircd::utf8::decode(const u8x16 string)`
ircd::utf: Add unpacked decode; de-template for linkage; minor reorg. 2020-06-28 04:59:41 +02:00			`noexcept`
			`{`
			`const u32x16 in`
			`{`
			`simd::lane_cast<u32x16>(string)`
			`};`

			`const u32x16 is_single`
			`{`
			`(in & 0x80) == 0`
			`};`

			`const u32x16 is_lead`
			`{`
			`(in - 0xc2) <= 0x32`
			`};`

			`const u32x16 is_trail`
			`{`
			`in >= 0x80 && in < 0xbf`
			`};`

			`const u32x16 expect_trail`
			`{`
			`(((in >= 0xe0) & 1) + ((in >= 0xf0) & 1) + 1) & is_lead`
			`};`

			`const u32x16 expect_length`
			`{`
			`expect_trail + 1`
			`};`

			`const u32x16 shift[4]`
			`{`
			`in << 0,`
			`in << 8,`
			`in << 16,`
			`in << 24,`
			`};`

			`const u32x16 multibyte_packs`
			`{`
			`in[0x00] \| shift[0x01][0x01] \| shift[0x02][0x02] \| shift[0x03][0x03],`
			`in[0x01] \| shift[0x01][0x02] \| shift[0x02][0x03] \| shift[0x03][0x04],`
			`in[0x02] \| shift[0x01][0x03] \| shift[0x02][0x04] \| shift[0x03][0x05],`
			`in[0x03] \| shift[0x01][0x04] \| shift[0x02][0x05] \| shift[0x03][0x06],`
			`in[0x04] \| shift[0x01][0x05] \| shift[0x02][0x06] \| shift[0x03][0x07],`
			`in[0x05] \| shift[0x01][0x06] \| shift[0x02][0x07] \| shift[0x03][0x08],`
			`in[0x06] \| shift[0x01][0x07] \| shift[0x02][0x08] \| shift[0x03][0x09],`
			`in[0x07] \| shift[0x01][0x08] \| shift[0x02][0x09] \| shift[0x03][0x0a],`
			`in[0x08] \| shift[0x01][0x09] \| shift[0x02][0x0a] \| shift[0x03][0x0b],`
			`in[0x09] \| shift[0x01][0x0a] \| shift[0x02][0x0b] \| shift[0x03][0x0c],`
			`in[0x0a] \| shift[0x01][0x0b] \| shift[0x02][0x0c] \| shift[0x03][0x0d],`
			`in[0x0b] \| shift[0x01][0x0c] \| shift[0x02][0x0d] \| shift[0x03][0x0e],`
			`in[0x0c] \| shift[0x01][0x0d] \| shift[0x02][0x0e] \| shift[0x03][0x0f],`
			`in[0x0d] \| shift[0x01][0x0e] \| shift[0x02][0x0f] \| shift[0x03][0x0f],`
			`in[0x0e] \| shift[0x01][0x0f] \| shift[0x02][0x0f] \| shift[0x03][0x0f],`
			`in[0x0f] \| shift[0x01][0x0f] \| shift[0x02][0x0f] \| shift[0x03][0x0f],`
			`};`

			`const u32x16 multibyte`
			`{`
			`0`
			`\| (multibyte_packs & (expect_length == 1) & 0x000000ffU)`
			`\| (multibyte_packs & (expect_length == 2) & 0x0000ffffU)`
			`\| (multibyte_packs & (expect_length == 3) & 0x00ffffffU)`
			`\| (multibyte_packs & (expect_length == 4) & 0xffffffffU)`
			`};`

			`const u32x16 integers`
			`{`
			`(in & is_single) \| (multibyte & is_lead)`
			`};`

			`return integers;`
			`}`

ircd::utf: Internal template for utf8::encode() w/ additional lane choices for interface. 2020-07-03 23:00:09 +02:00			`namespace ircd::utf8`
			`{`
			`template<class u32xN> static u32xN _encode(const u32xN codepoint) noexcept;`
			`}`

			`ircd::u32x4`
			`ircd::utf8::encode(const u32x4 codepoint)`
			`noexcept`
			`{`
			`return _encode(codepoint);`
			`}`

			`ircd::u32x8`
			`ircd::utf8::encode(const u32x8 codepoint)`
			`noexcept`
			`{`
			`return _encode(codepoint);`
			`}`

			`ircd::u32x16`
			`ircd::utf8::encode(const u32x16 codepoint)`
			`noexcept`
			`{`
			`return _encode(codepoint);`
			`}`

ircd::utf: Move definitions out-of-line for now. 2020-06-28 03:10:36 +02:00			`/// Transform multiple char32_t codepoints to their utf-8 encodings in`
			`/// parallel, returning a sparse result in each char32_t (this does not`
			`/// compress the result down).`
ircd::utf: Internal template for utf8::encode() w/ additional lane choices for interface. 2020-07-03 23:00:09 +02:00			`template<class u32xN>`
			`u32xN`
			`ircd::utf8::_encode(const u32xN codepoint)`
ircd::utf: Move definitions out-of-line for now. 2020-06-28 03:10:36 +02:00			`noexcept`
			`{`
ircd::utf: Internal template for utf8::encode() w/ additional lane choices for interface. 2020-07-03 23:00:09 +02:00			`const u32xN len`
ircd::utf: Move definitions out-of-line for now. 2020-06-28 03:10:36 +02:00			`{`
			`length(codepoint)`
			`};`

ircd::utf: Internal template for utf8::encode() w/ additional lane choices for interface. 2020-07-03 23:00:09 +02:00			`const u32xN enc_2`
ircd::utf: Move definitions out-of-line for now. 2020-06-28 03:10:36 +02:00			`{`
			`(((codepoint >> 6) \| 0xc0) & 0xff) // byte[0]`
			`\| ((((codepoint & 0x3f) \| 0x80) &0xff) << 8) // byte[1]`
			`};`

ircd::utf: Internal template for utf8::encode() w/ additional lane choices for interface. 2020-07-03 23:00:09 +02:00			`const u32xN enc_3`
ircd::utf: Move definitions out-of-line for now. 2020-06-28 03:10:36 +02:00			`{`
			`(((codepoint >> 12) \| 0xe0) & 0xff) \| // byte[0]`
			`(((((codepoint >> 6) & 0x3f) \| 0x80) & 0xff) << 8) \| // byte[1]`
			`((((codepoint & 0x3f) \| 0x80) & 0xff) << 16) // byte[3]`
			`};`

ircd::utf: Internal template for utf8::encode() w/ additional lane choices for interface. 2020-07-03 23:00:09 +02:00			`const u32xN enc_4`
ircd::utf: Move definitions out-of-line for now. 2020-06-28 03:10:36 +02:00			`{`
			`(((codepoint >> 18) \| 0xf0) & 0xff) \| // byte[0]`
			`(((((codepoint >> 12) & 0x3f) \| 0x80) & 0xff) << 8) \| // byte[1]`
			`(((((codepoint >> 6) & 0x3f) \| 0x80) & 0xff) << 16) \| // byte[2]`
			`((((codepoint & 0x3f) \| 0x80) & 0xff) << 24) // byte[3]`
			`};`

			`return 0`
			`\| ((len == 0) & 0xFFFD)`
			`\| ((len == 1) & codepoint)`
			`\| ((len == 2) & enc_2)`
			`\| ((len == 3) & enc_3)`
			`\| ((len == 4) & enc_4)`
			`;`
			`}`

ircd::utf8: Add multiple lane overloads for length(); internal template; minor comments. 2020-07-03 00:27:43 +02:00			`namespace ircd::utf8`
			`{`
			`template<class u32xN> static u32xN _length(const u32xN codepoint) noexcept;`
			`}`

			`ircd::u32x4`
			`ircd::utf8::length(const u32x4 codepoint)`
			`noexcept`
			`{`
			`return _length(codepoint);`
			`}`

			`ircd::u32x8`
			`ircd::utf8::length(const u32x8 codepoint)`
			`noexcept`
			`{`
			`return _length(codepoint);`
			`}`

			`ircd::u32x16`
			`ircd::utf8::length(const u32x16 codepoint)`
			`noexcept`
			`{`
			`return _length(codepoint);`
			`}`

ircd::utf: Move definitions out-of-line for now. 2020-06-28 03:10:36 +02:00			`/// Determine the utf-8 encoding length of multiple codepoints in parallel.`
			`/// The input vector char32_t codepoints and the output yields an integer`
			`/// of 0-4 for each lane.`
ircd::utf8: Add multiple lane overloads for length(); internal template; minor comments. 2020-07-03 00:27:43 +02:00			`template<class u32xN>`
			`u32xN`
			`ircd::utf8::_length(const u32xN codepoint)`
ircd::utf: Move definitions out-of-line for now. 2020-06-28 03:10:36 +02:00			`noexcept`
			`{`
ircd::utf8: Add multiple lane overloads for length(); internal template; minor comments. 2020-07-03 00:27:43 +02:00			`const u32xN`
ircd::utf: Move definitions out-of-line for now. 2020-06-28 03:10:36 +02:00			`length_1 { codepoint <= 0x7f },`
			`length_2 { codepoint <= 0x7ff && codepoint > 0x7f },`
			`length_3_lo { codepoint <= 0xd7ff && codepoint > 0x7ff },`
			`length_3_hi { codepoint <= 0xffff && codepoint > 0xdfff },`
			`length_4 { codepoint <= 0x10ffff && codepoint > 0xffff };`

ircd::utf8: Add multiple lane overloads for length(); internal template; minor comments. 2020-07-03 00:27:43 +02:00			`[[gnu::unused]] const u32xN // Preserved here for future reference`
ircd::utf: Move definitions out-of-line for now. 2020-06-28 03:10:36 +02:00			`length_3_err { codepoint <= 0xdfff && codepoint > 0xd7ff },`
			`length_err { (codepoint > 0x10ffff) \| length_3_err };`

			`return 0`
			`\| (length_1 & 1)`
			`\| (length_2 & 2)`
			`\| (length_3_lo & 3)`
			`\| (length_3_hi & 3)`
			`\| (length_4 & 4)`
			`;`
			`}`