From 3e6fcf3a479384a73e5091c3dedaf73c4af5e675 Mon Sep 17 00:00:00 2001
From: Jason Volk <jason@zemos.net>
Date: Wed, 11 Aug 2021 04:31:32 -0700
Subject: [PATCH] ircd::utf: Split header; improve decode codegen; inline
 length.

---
 include/ircd/ircd.h             |   3 +-
 include/ircd/{utf.h => utf16.h} |  23 +-----
 include/ircd/utf8.h             |  57 +++++++++++++
 ircd/utf.cc                     | 141 ++++++++++++++++++--------------
 4 files changed, 139 insertions(+), 85 deletions(-)
 rename include/ircd/{utf.h => utf16.h} (68%)
 create mode 100644 include/ircd/utf8.h
diff --git a/include/ircd/ircd.h b/include/ircd/ircd.h
index 80f9fb9f1..56ad52158 100644
--- a/include/ircd/ircd.h
+++ b/include/ircd/ircd.h
@@ -72,7 +72,8 @@
 #include "crh.h"
 #include "fpe.h"
 #include "icu.h"
-#include "utf.h"
+#include "utf8.h"
+#include "utf16.h"
 #include "b64.h"
 #include "b58.h"
 #include "iov.h"
diff --git a/include/ircd/utf.h b/include/ircd/utf16.h
similarity index 68%
rename from include/ircd/utf.h
rename to include/ircd/utf16.h
index dddf01501..a5fce4af4 100644
--- a/include/ircd/utf.h
+++ b/include/ircd/utf16.h
@@ -1,7 +1,7 @@
 // The Construct
 //
 // Copyright (C) The Construct Developers, Authors & Contributors
-// Copyright (C) 2016-2020 Jason Volk <jason@zemos.net>
+// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
 //
 // Permission to use, copy, modify, and/or distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
@@ -9,26 +9,7 @@
 // full license for this software is available in the LICENSE file.
 
 #pragma once
-#define HAVE_IRCD_UTF_H
-
-/// Unicode Transformation Format
-namespace ircd::utf
-{
-	IRCD_EXCEPTION(ircd::error, error)
-}
-
-/// Unicode Transformation Format (8-bit)
-namespace ircd::utf8
-{
-	// Get the utf8-encoded length from char32_t (decoded) codepoints
-	template<class u32xN> u32xN length(const u32xN codepoints) noexcept;
-
-	// Encode char32_t codepoints into respective utf-8 encodings
-	template<class u32xN> u32xN encode_sparse(const u32xN codepoints) noexcept;
-
-	// Decode utf-8 string into char32_t unicode codepoints
-	u32x16 decode(const u8x16 string) noexcept;
-}
+#define HAVE_IRCD_UTF16_H
 
 /// Unicode Transformation Format (16-bit)
 namespace ircd::utf16
diff --git a/include/ircd/utf8.h b/include/ircd/utf8.h
new file mode 100644
index 000000000..d87ae42bc
--- /dev/null
+++ b/include/ircd/utf8.h
@@ -0,0 +1,57 @@
+// The Construct
+//
+// Copyright (C) The Construct Developers, Authors & Contributors
+// Copyright (C) 2016-2021 Jason Volk <jason@zemos.net>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice is present in all copies. The
+// full license for this software is available in the LICENSE file.
+
+#pragma once
+#define HAVE_IRCD_UTF8_H
+
+/// Unicode Transformation Format (8-bit)
+namespace ircd::utf8
+{
+	// Get the utf8-encoded length from char32_t (decoded) codepoints
+	template<class u32xN> u32xN length(const u32xN codepoints) noexcept;
+
+	// Get the utf8 length at the first byte of each utf8-codepoint; other lanes are 0x00
+	template<> u8x16 length(const u8x16 string) noexcept;
+
+	// Encode char32_t codepoints into respective utf-8 encodings
+	template<class u32xN> u32xN encode_sparse(const u32xN codepoints) noexcept;
+
+	// Decode utf-8 string into sparse char32_t codepoints; separating lanes are zero.
+	u32x16 decode_sparse(const u8x16 string) noexcept;
+
+	// Decode utf-8 string into char32_t unicode codepoints packed left.
+	u32x16 decode(const u8x16 string) noexcept;
+}
+
+template<>
+inline ircd::u8x16
+ircd::utf8::length(const u8x16 string)
+noexcept
+{
+	const u8x16 is_single
+	(
+		string < 0x80
+	);
+
+	const u8x16 is_multi
+	(
+		(string - 0xc2) <= 0x32
+	);
+
+	const u8x16 num_trail
+	(
+		1
+		+ ((string >= 0xc0) & 1)
+		+ ((string >= 0xe0) & 1)
+		+ ((string >= 0xf0) & 1)
+	);
+
+	return (is_single & 1) | (is_multi & num_trail);
+}
diff --git a/ircd/utf.cc b/ircd/utf.cc
index ed3344b52..3a5705379 100644
--- a/ircd/utf.cc
+++ b/ircd/utf.cc
@@ -302,100 +302,115 @@ noexcept
 //
 
 ircd::u32x16
-ircd::utf8::decode(const u8x16 in)
+ircd::utf8::decode(const u8x16 string)
 noexcept
 {
-	const u8x16 is_single
+	const u32x16 codepoints
 	(
-		(in & 0x80) == 0
+		decode_sparse(string)
 	);
 
-	const u8x16 is_lead
+	const i32x16 zero_lane
 	(
-		(in - 0xc2) <= 0x32
+		codepoints == 0
 	);
 
-	const u8x16 is_trail
+	// Lanes separating sparsely decoded codepoints are zero.
+	const i8x16 skip_lane
 	(
-		in >= 0x80 && in < 0xbf
+		lane_cast<i8x16>(zero_lane)
 	);
 
-	const u8x16 is_head
+	// Actual NUL codepoints weren't altered by decode.
+	const i8x16 null_code
 	(
-		is_lead | is_single
+		string == 0
 	);
 
-	const u8x16 len_mask[3]
-	{
-		in >= 0xc0, in >= 0xe0, in >= 0xf0,
-	};
-
-	const u8x16 expect_trail
+	// The pack will eliminate zero-value lanes except for legitimate NULs.
+	const i8x16 pack_mask
 	(
-		1 + (len_mask[0] & 1) + (len_mask[1] & 1) + (len_mask[2] & 1)
+		~null_code ^ skip_lane
 	);
 
+	const u32x16 ret
+	(
+		simd::pack(codepoints, pack_mask)
+	);
+
+	return ret;
+}
+
+ircd::u32x16
+ircd::utf8::decode_sparse(const u8x16 string)
+noexcept
+{
 	const u8x16 len
 	(
-		(is_single & 1) | (is_lead & expect_trail)
+		length(string)
 	);
 
-	const u8x16 head
+	const u8x16 rem
 	(
-		in & is_head
+		len
+		| ((shl<0x18>(len) == 4) & 1)
+		| ((shl<0x10>(len) == 4) & 2)
+		| ((shl<0x10>(len) == 3) & 1)
+		| ((shl<0x08>(len) == 4) & 3)
+		| ((shl<0x08>(len) == 3) & 2)
+		| ((shl<0x08>(len) == 2) & 1)
 	);
 
-	const u8x16 lead[]
+	const u8x16 bank[]
 	{
-		0x3f & in & is_trail,
-		0xff & head & is_single,
-		0x1f & head & len_mask[0] & ~len_mask[1],
-		0x0f & head & len_mask[1] & ~len_mask[2],
-		0x07 & head & len_mask[2],
+		string & 0x3f,
+		string & 0xff,
+		string & 0x1f,
+		string & 0x0f,
+		string & 0x07,
 	};
 
-	u8x16 full;
-	for(uint i(0); i < 16; ++i)
-		full[i] = lead[len[i]][i];
-
-	u8x16 shift {len & is_head};
-	shift |= (shl<0x20>(len) == 4) & 0;
-	shift |= (shl<0x20>(len) == 3) & 0;
-	shift |= (shl<0x20>(len) == 2) & 0;
-	shift |= (shl<0x20>(len) == 1) & 0;
-	shift |= (shl<0x18>(len) == 4) & 1;
-	shift |= (shl<0x18>(len) == 3) & 0;
-	shift |= (shl<0x18>(len) == 2) & 0;
-	shift |= (shl<0x18>(len) == 1) & 0;
-	shift |= (shl<0x10>(len) == 4) & 2;
-	shift |= (shl<0x10>(len) == 3) & 1;
-	shift |= (shl<0x10>(len) == 2) & 0;
-	shift |= (shl<0x10>(len) == 1) & 0;
-	shift |= (shl<0x08>(len) == 4) & 3;
-	shift |= (shl<0x08>(len) == 3) & 2;
-	shift |= (shl<0x08>(len) == 2) & 1;
-	shift |= (shl<0x08>(len) == 1) & 0;
-	shift -= 1U;
-	shift &= 0x03U;
-	shift *= 6U;
-
-	const u32x16 val
-	{
-		lane_cast<u32x16>(full) << lane_cast<u32x16>(shift)
-	};
-
-	const u8x16 incr
+	const u8x16 select
 	(
-		(shift == 0) & 1U
+		0
+		| (bank[0] & (len == 0))
+		| (bank[1] & (len == 1))
+		| (bank[2] & (len == 2))
+		| (bank[3] & (len == 3))
+		| (bank[4] & (len == 4))
 	);
 
-	u8x16 idx {0};
-	for(uint i(1); i < 16; ++i)
-		idx[i] = idx[i - 1] + incr[i - 1];
+	const u8x16 byte[]
+	{
+		select & (rem == 1),
+		select & (rem == 2),
+		select & (rem == 3),
+		select & (rem == 4),
+	};
 
-	u32x16 ret {0};
-	for(uint i(0); i < 16; ++i)
-		ret[idx[i]] |= val[i];
+	const u8x16 move[]
+	{
+		shl<8 * 0>(byte[0]),
+		shl<8 * 1>(byte[1]),
+		shl<8 * 2>(byte[2]),
+		shl<8 * 3>(byte[3]),
+	};
+
+	const u32x16 pack[]
+	{
+		lane_cast<u32x16>(move[0]) << 0x00,
+		lane_cast<u32x16>(move[1]) << 0x06,
+		lane_cast<u32x16>(move[2]) << 0x0c,
+		lane_cast<u32x16>(move[3]) << 0x12,
+	};
+
+	const u32x16 ret
+	(
+		lane_cast<u32x16>(byte[0]) // used in place of pack[0], which constrains clang's optimizer
+		| pack[1]
+		| pack[2]
+		| pack[3]
+	);
 
 	return ret;
 }