construct/include/ircd/utf.h

// The Construct
//
// Copyright (C) The Construct Developers, Authors & Contributors
// Copyright (C) 2016-2020 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

#pragma once
#define HAVE_IRCD_UTF_H

/// Unicode Transformation Format
///
/// Root namespace of the utf toolset. Only the error type is introduced
/// here; the 8-bit and 16-bit interfaces are declared in ircd::utf8 and
/// ircd::utf16 respectively.
namespace ircd::utf
{
	// NOTE(review): IRCD_EXCEPTION is defined outside this header; presumably
	// it declares the exception type `error` in this namespace with
	// ircd::error as its parent -- confirm against the macro definition.
	IRCD_EXCEPTION(ircd::error, error)
}

/// Unicode Transformation Format (8-bit)
///
/// Interface declarations only. Every function takes its vector argument by
/// value and is declared noexcept.
namespace ircd::utf8
{
	// Get the utf8-encoded length from char32_t (decoded) codepoints. The
	// return type is the same vector type as the argument; presumably each
	// lane of the result corresponds to the same lane of the input -- confirm
	// against the definition.
	template<class u32xN> u32xN length(const u32xN codepoints) noexcept;

	// Encode char32_t codepoints into respective utf-8 encodings. The return
	// type is the same vector type as the argument; presumably each encoding
	// is packed into the lane of its codepoint -- confirm against the
	// definition.
	template<class u32xN> u32xN encode(const u32xN codepoints) noexcept;

	// Decode utf-8 string into char32_t unicode codepoints. Unlike the
	// templates above this is fixed to one width: u8x16 in, u32x16 out.
	u32x16 decode(const u8x16 string) noexcept;
}

/// Unicode Transformation Format (16-bit)
///
/// Tools for scanning and decoding utf-16 surrogates found in character
/// input. Every function takes its vector argument by value and is declared
/// noexcept.
namespace ircd::utf16
{
	// mask all surrogate characters from find_() result; i.e. takes the
	// vector returned by find_surrogate() and returns a vector of the same
	// type covering each surrogate in full. Defined inline in this header.
	template<class u8xN> u8xN mask_surrogate(const u8xN found) noexcept;

	// scan for utf-16 surrogates; the result has the same vector type as
	// the input and is the expected argument to mask_surrogate().
	template<class u8xN> u8xN find_surrogate(const u8xN input) noexcept;

	// scan for utf-16 surrogates including incomplete sequences truncated
	// (presumably those cut off by the end of the 16-byte input -- confirm
	// against the definition).
	u8x16 find_surrogate_partial(const u8x16 input) noexcept;

	// decodes one or two surrogates at the front into one or two codepoints;
	// "aligned" presumably means the surrogate must begin at the first
	// character of the input -- confirm against the definition.
	u32x4 decode_surrogate_aligned_next(const u8x16 input) noexcept;
}

/// Convenience for widening a find_surrogate() result. That result masks only
/// the leading character of every valid surrogate (i.e. the '\'); this
/// returns a mask covering the full surrogate instead, formed by combining
/// `found` with five copies of itself shifted by 0x08 through 0x28 in steps
/// of 0x08.
template<class u8xN>
inline u8xN
ircd::utf16::mask_surrogate(const u8xN found)
noexcept
{
	// Start from the leading-character mask exactly as given.
	u8xN ret = found;

	// Fold in each shifted copy. Bitwise OR is commutative and associative,
	// so accumulating one term at a time yields the same value as a single
	// combined expression.
	ret |= shl<0x08>(found);
	ret |= shl<0x10>(found);
	ret |= shl<0x18>(found);
	ret |= shl<0x20>(found);
	ret |= shl<0x28>(found);

	return ret;
}
ircd::utf: Start an inline utf toolset. 2020-06-27 23:27:16 +02:00			`// The Construct`
			`//`
			`// Copyright (C) The Construct Developers, Authors & Contributors`
			`// Copyright (C) 2016-2020 Jason Volk <jason@zemos.net>`
			`//`
			`// Permission to use, copy, modify, and/or distribute this software for any`
			`// purpose with or without fee is hereby granted, provided that the above`
			`// copyright notice and this permission notice is present in all copies. The`
			`// full license for this software is available in the LICENSE file.`

			`#pragma once`
			`#define HAVE_IRCD_UTF_H`

			`/// Unicode Transformation Format`
			`namespace ircd::utf`
			`{`
			`IRCD_EXCEPTION(ircd::error, error)`
			`}`

			`/// Unicode Transformation Format (8-bit)`
			`namespace ircd::utf8`
			`{`
ircd::utf: Internal template for utf8::encode() w/ additional lane choices for interface. 2020-07-03 23:00:09 +02:00			`// Get the utf8-encoded length from char32_t (decoded) codepoints`
ircd::utf: Simplify interfaces with weak specializations for vector widths; template inline. 2020-09-04 02:44:39 +02:00			`template<class u32xN> u32xN length(const u32xN codepoints) noexcept;`
ircd::utf8: Add multiple lane overloads for length(); internal template; minor comments. 2020-07-03 00:27:43 +02:00
ircd::utf: Internal template for utf8::encode() w/ additional lane choices for interface. 2020-07-03 23:00:09 +02:00			`// Encode char32_t codepoints into respective utf-8 encodings`
ircd::utf: Simplify interfaces with weak specializations for vector widths; template inline. 2020-09-04 02:44:39 +02:00			`template<class u32xN> u32xN encode(const u32xN codepoints) noexcept;`
ircd::utf: Internal template for utf8::encode() w/ additional lane choices for interface. 2020-07-03 23:00:09 +02:00
			`// Decode utf-8 string into char32_t unicode codepoints`
ircd::simd: Use value arguments for optimal calling convention. 2020-06-29 02:04:27 +02:00			`u32x16 decode(const u8x16 string) noexcept;`
ircd::utf: Start an inline utf toolset. 2020-06-27 23:27:16 +02:00			`}`

			`/// Unicode Transformation Format (16-bit)`
			`namespace ircd::utf16`
			`{`
ircd::utf16: Additional surrogate scan tools; minor reorg interface. 2020-06-30 04:23:54 +02:00			`// mask all surrogate characters from find_() result`
ircd::utf: Simplify interfaces with weak specializations for vector widths; template inline. 2020-09-04 02:44:39 +02:00			`template<class u8xN> u8xN mask_surrogate(const u8xN found) noexcept;`
ircd::utf16: Additional surrogate scan tools; minor reorg interface. 2020-06-30 04:23:54 +02:00
			`// scan for utf-16 surrogates`
ircd::utf: Simplify interfaces with weak specializations for vector widths; template inline. 2020-09-04 02:44:39 +02:00			`template<class u8xN> u8xN find_surrogate(const u8xN input) noexcept;`
ircd::utf16: Additional surrogate scan tools; minor reorg interface. 2020-06-30 04:23:54 +02:00
			`// scan for utf-16 surrogates including incomplete sequences truncated`
			`u8x16 find_surrogate_partial(const u8x16 input) noexcept;`
ircd::utf16: Add aligned escaped hex surrogate decoder. 2020-07-03 01:39:14 +02:00
			`// decodes one or two surrogates at the front into one or two codepoints`
			`u32x4 decode_surrogate_aligned_next(const u8x16 input) noexcept;`
ircd::utf16: Additional surrogate scan tools; minor reorg interface. 2020-06-30 04:23:54 +02:00			`}`

ircd::utf: Simplify interfaces with weak specializations for vector widths; template inline. 2020-09-04 02:44:39 +02:00			`/// The vector returned by find_surrogate() masks the leading character of`
			`/// every valid surrogate (i.e. the '\'). This is a convenience to mask`
			`/// the full surrogate from such a result.`
			`template<class u8xN>`
			`inline u8xN`
			`ircd::utf16::mask_surrogate(const u8xN found)`
ircd::utf16: Additional surrogate scan tools; minor reorg interface. 2020-06-30 04:23:54 +02:00			`noexcept`
			`{`
ircd::utf: Simplify interfaces with weak specializations for vector widths; template inline. 2020-09-04 02:44:39 +02:00			`return u8xN`
ircd::utf16: Additional surrogate scan tools; minor reorg interface. 2020-06-30 04:23:54 +02:00			`{`
ircd::utf: Eliminate recasting for shift. 2020-07-09 02:26:48 +02:00			`shl<0x08>(found) \|`
			`shl<0x10>(found) \|`
			`shl<0x18>(found) \|`
			`shl<0x20>(found) \|`
			`shl<0x28>(found) \|`
ircd::utf16: Additional surrogate scan tools; minor reorg interface. 2020-06-30 04:23:54 +02:00			`found`
			`};`
ircd::utf: Start an inline utf toolset. 2020-06-27 23:27:16 +02:00			`}`