construct/ircd/b64.cc

// The Construct
//
// Copyright (C) The Construct Developers, Authors & Contributors
// Copyright (C) 2016-2020 Jason Volk <jason@zemos.net>
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice is present in all copies. The
// full license for this software is available in the LICENSE file.

// Declarations private to this translation unit. The surrounding pragma pair
// marks everything declared in between with internal symbol visibility.
#pragma GCC visibility push(internal)
namespace ircd::b64
{
	/// Character used by encode() to pad its output.
	constexpr char pad
	{
		'='
	};

	/// 64-entry lookup tables driving the block encoder and decoder; their
	/// definitions (with values) appear later in this file.
	extern const u8
	encode_permute_tab[64],
	encode_shift_ctrl[64],
	decode_permute_tab[64],
	decode_permute_tab_le[64];

	/// Maps an input byte value to its 6-bit base64 value. Entries holding
	/// 0x40 mark bytes which are not base64 characters.
	extern const i32
	decode_tab[256];

	// NOTE(review): IRCD_CLONES presumably expands to a function
	// multi-versioning attribute for the listed targets -- confirm against
	// the macro's definition, which is not visible here.

	/// Decodes one block of 64 base64 characters; error flags for invalid
	/// characters are OR'ed into err.
	[[IRCD_CLONES(IRCD_B64_TARGETS)]]
	static u8x64 decode_block(const u8x64 block, i64x8 &__restrict__ err) noexcept;

	/// Encodes one block of 48 input bytes into 64 characters taken from
	/// the given dictionary.
	[[IRCD_CLONES(IRCD_B64_TARGETS)]]
	static u8x64 encode_block(const u8x64 block, const dictionary) noexcept;
}
#pragma GCC visibility pop

/// Encoding alphabet indexed by 6-bit value: 'A'-'Z', 'a'-'z', '0'-'9', then
/// '+' for 62 and '/' for 63. The table is aligned to 64 bytes.
decltype(ircd::b64::dict_rfc1421)
ircd::b64::dict_rfc1421
alignas(64)
{
	'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
	'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
	'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
	'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/',
};

/// Encoding alphabet indexed by 6-bit value: 'A'-'Z', 'a'-'z', '0'-'9', then
/// '+' for 62 and ',' for 63 (only the final character differs from
/// dict_rfc1421). The table is aligned to 64 bytes.
decltype(ircd::b64::dict_rfc3501)
ircd::b64::dict_rfc3501
alignas(64)
{
	'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
	'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
	'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
	'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', ',',
};

/// Encoding alphabet indexed by 6-bit value: 'A'-'Z', 'a'-'z', '0'-'9', then
/// '-' for 62 and '_' for 63 (the final two characters differ from
/// dict_rfc1421). The table is aligned to 64 bytes.
decltype(ircd::b64::dict_rfc4648)
ircd::b64::dict_rfc4648
alignas(64)
{
	'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
	'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
	'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
	'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_',
};

/// Decoding table indexed by input byte value, yielding the 6-bit value of
/// that character. The characters of all three dictionaries in this file are
/// accepted at once: '+' and '-' both yield 62; ',', '/' and '_' all yield 63.
/// Every other byte yields 0x40, which decode_block() flags as an error since
/// it tests for values of 64 or more. The trailing comment on a row gives the
/// index of the last entry in that row.
decltype(ircd::b64::decode_tab)
ircd::b64::decode_tab
alignas(64)
{
	0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 7
	0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 15
	0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 23
	0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 31
	0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 39
	0x40, 0x40, 0x40,   62,   63,   62, 0x40,   63, // 47  ('+' ',' '-' and '/')
	  52,   53,   54,   55,   56,   57,   58,   59, // 55  ('0' - '7')
	  60,   61, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 63  ('8' '9'; '=' is invalid here)
	0x40,    0,    1,    2,    3,    4,    5,    6, // 71  ('A' - 'G')
	   7,    8,    9,   10,   11,   12,   13,   14, // 79
	  15,   16,   17,   18,   19,   20,   21,   22, // 87
	  23,   24,   25, 0x40, 0x40, 0x40, 0x40,   63, // 95  ('X' 'Y' 'Z' and '_')
	0x40,   26,   27,   28,   29,   30,   31,   32, // 103 ('a' - 'g')
	  33,   34,   35,   36,   37,   38,   39,   40, // 111
	  41,   42,   43,   44,   45,   46,   47,   48, // 119
	  49,   50,   51, 0x40, 0x40, 0x40, 0x40, 0x40, // 127 ('x' 'y' 'z')
	0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
	0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
	0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
	0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
	0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
	0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
	0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
	0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
	0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
	0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
	0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
	0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
	0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
	0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
	0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
	0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 255
};

/// Byte selection indices for the decoder: 48 meaningful entries followed by
/// 16 zero entries. decode_permute_tab_le below holds the same indices with
/// each group of four entries reversed.
/// NOTE(review): nothing in the visible part of this file reads this table;
/// decode_block() uses decode_permute_tab_le -- confirm whether it is still
/// needed elsewhere.
decltype(ircd::b64::decode_permute_tab)
ircd::b64::decode_permute_tab
alignas(64)
{
	 6,  0,  1,  2,  9, 10,  4,  5, 12, 13, 14,  8, 22, 16, 17, 18,
	25, 26, 20, 21, 28, 29, 30, 24, 38, 32, 33, 34, 41, 42, 36, 37,
	44, 45, 46, 40, 54, 48, 49, 50, 57, 58, 52, 53, 60, 61, 62, 56,
	 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
};

/// The entries of decode_permute_tab with the order reversed within each
/// group of four (i.e. byte-swapped per 32-bit word). decode_block() uses the
/// first 48 entries to gather its result: for every 32-bit lane holding a
/// decoded 24-bit value the indices select bytes 2, 1, 0 of that lane, in
/// that order. The last 16 entries are zero and are not read by
/// decode_block().
decltype(ircd::b64::decode_permute_tab_le)
ircd::b64::decode_permute_tab_le
alignas(64)
{
	 2,  1,  0,  6,  5,  4, 10,  9,  8, 14, 13, 12, 18, 17, 16, 22,
	21, 20, 26, 25, 24, 30, 29, 28, 34, 33, 32, 38, 37, 36, 42, 41,
	40, 46, 45, 44, 50, 49, 48, 54, 53, 52, 58, 57, 56, 62, 61, 60,
	 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
};

/// For vpermb. Input byte indices gathered by encode_block() before shifting:
/// every three input bytes starting at offset n (written as "n + k" below)
/// fill four result bytes with the input bytes n+1, n+0, n+2, n+1. Sixteen
/// such groups cover the 48 input bytes of a block.
decltype(ircd::b64::encode_permute_tab)
ircd::b64::encode_permute_tab
alignas(64)
{
	 0 + 1,    0 + 0,    0 + 2,    0 + 1,    3 + 1,    3 + 0,    3 + 2,    3 + 1,
	 6 + 1,    6 + 0,    6 + 2,    6 + 1,    9 + 1,    9 + 0,    9 + 2,    9 + 1,
	12 + 1,   12 + 0,   12 + 2,   12 + 1,   15 + 1,   15 + 0,   15 + 2,   15 + 1,
	18 + 1,   18 + 0,   18 + 2,   18 + 1,   21 + 1,   21 + 0,   21 + 2,   21 + 1,
	24 + 1,   24 + 0,   24 + 2,   24 + 1,   27 + 1,   27 + 0,   27 + 2,   27 + 1,
	30 + 1,   30 + 0,   30 + 2,   30 + 1,   33 + 1,   33 + 0,   33 + 2,   33 + 1,
	36 + 1,   36 + 0,   36 + 2,   36 + 1,   39 + 1,   39 + 0,   39 + 2,   39 + 1,
	42 + 1,   42 + 0,   42 + 2,   42 + 1,   45 + 1,   45 + 0,   45 + 2,   45 + 1,
};

/// For vpmultishiftqb. Right-shift amounts used by encode_block(): entry
/// (i * 8 + j) is the number of bits by which 64-bit lane i is shifted to
/// produce output character j of that lane. Each row serves one lane; the
/// amounts 10, 4, 22, 16 address the lower 32 bits of the lane and the same
/// amounts plus 32 address the upper 32 bits.
decltype(ircd::b64::encode_shift_ctrl)
ircd::b64::encode_shift_ctrl
alignas(64)
{
	(10 +  0),  ( 4 +  0),  (22 +  0),  (16 +  0),  (10 + 32),  ( 4 + 32),  (22 + 32),  (16 + 32),
	(10 +  0),  ( 4 +  0),  (22 +  0),  (16 +  0),  (10 + 32),  ( 4 + 32),  (22 + 32),  (16 + 32),
	(10 +  0),  ( 4 +  0),  (22 +  0),  (16 +  0),  (10 + 32),  ( 4 + 32),  (22 + 32),  (16 + 32),
	(10 +  0),  ( 4 +  0),  (22 +  0),  (16 +  0),  (10 + 32),  ( 4 + 32),  (22 + 32),  (16 + 32),
	(10 +  0),  ( 4 +  0),  (22 +  0),  (16 +  0),  (10 + 32),  ( 4 + 32),  (22 + 32),  (16 + 32),
	(10 +  0),  ( 4 +  0),  (22 +  0),  (16 +  0),  (10 + 32),  ( 4 + 32),  (22 + 32),  (16 + 32),
	(10 +  0),  ( 4 +  0),  (22 +  0),  (16 +  0),  (10 + 32),  ( 4 + 32),  (22 + 32),  (16 + 32),
	(10 +  0),  ( 4 +  0),  (22 +  0),  (16 +  0),  (10 + 32),  ( 4 + 32),  (22 + 32),  (16 + 32),
};

/// Encode `in` as base64 into `out` using the alphabet `dict`, then append
/// one '=' for every byte missing from the final three-byte group of the
/// input. Out must be 1.33+ larger than in. The returned view covers the
/// encoded characters together with whatever padding was written to `out`.
ircd::string_view
ircd::b64::encode(const mutable_buffer out,
                  const const_buffer in,
                  const dictionary dict)
noexcept
{
	// Bytes occupying the last, possibly incomplete, input triple.
	const auto remain
	{
		size(in) % 3
	};

	// Zero, one or two pad characters are wanted.
	const auto npad
	{
		(3 - remain) % 3
	};

	// Candidate pad characters; a slot which is not wanted holds a null
	// character so both copies below are issued without branching.
	const char tail[2]
	{
		npad > 0? pad: '\0',
		npad > 1? pad: '\0',
	};

	auto total
	{
		size(encode_unpadded(out, in, dict))
	};

	// The result of each copy() only counts toward the length when the
	// corresponding pad character is actually wanted.
	total += copy(out + total, tail[0]) & (npad > 0);
	total += copy(out + total, tail[1]) & (npad > 1);
	return string_view
	{
		data(out), total
	};
}

/// Encoding in to base64 at out. Out must be 1.33+ larger than in. Padding
/// is not present in the returned view.
///
/// The input is consumed in blocks of 48 bytes, each producing 64 output
/// characters from `dict`. Whole blocks are stored directly; the final
/// partial block (or any block which does not fit entirely in `out`) is
/// staged through a zero-filled vector and copied out byte by byte. When
/// `out` is smaller than the full result the output is truncated to
/// size(out). The returned view starts at data(out).
ircd::string_view
ircd::b64::encode_unpadded(const mutable_buffer out,
                           const const_buffer in,
                           const dictionary dict)
noexcept
{
	const size_t res_len
	{
		encode_unpadded_size(in)
	};

	// Never write more than the destination can hold.
	const size_t out_len
	{
		std::min(res_len, size(out))
	};

	// Block counter. This is a size_t rather than a 32-bit uint so the byte
	// offsets (i * 48) and (i * 64) below are computed at the same width as
	// the buffer sizes and cannot wrap around on multi-gigabyte buffers.
	size_t i;
	for(i = 0; i < size(in) / 48 && i < out_len / 64; ++i)
	{
		// Destination is indexed at 64 byte stride
		u512x1_u *const __restrict__ dx
		{
			reinterpret_cast<u512x1_u *>(data(out))
		};

		// Source is indexed at 48 byte stride
		const auto *const __restrict__ si
		{
			data(in) + i * 48
		};

		// Only 48 of the 64 lanes carry input; the rest stay zero.
		u8x64 block {0};
		#pragma clang loop vectorize(enable) unroll(full)
		for(uint j(0); j < 48; ++j)
			block[j] = si[j];

		block = encode_block(block, dict);
		dx[i] = block;
	}

	// Remainder: input shorter than a full block and/or output space shorter
	// than a full block. Both the load and the store are bounds-checked.
	for(; i * 48 < size(in) && i * 64 < out_len; ++i)
	{
		auto *const __restrict__ di
		{
			data(out) + i * 64
		};

		const auto *const __restrict__ si
		{
			data(in) + i * 48
		};

		u8x64 block {0};
		for(uint j(0); j < 48 && i * 48 + j < size(in); ++j)
			block[j] = si[j];

		block = encode_block(block, dict);
		for(uint j(0); j < 64 && i * 64 + j < out_len; ++j)
			di[j] = block[j];
	}

	return string_view
	{
		data(out), out_len
	};
}

/// Returns 64 base64-encoded characters from 48 input characters. For any
/// inputs less than 48 characters trail with null characters; caller computes
/// result size. The following operations are performed on each triple of input
/// characters resulting in four output characters:
/// 0.  in[0] / 4;
/// 1.  (in[1] / 16) + ((in[0] * 16) % 64);
/// 2.  ((in[1] * 4) % 64) + (in[2] / 64);
/// 3.  in[2] % 64;
/// Each resulting 6-bit value is then replaced by its character in `dict`.
/// Based on https://arxiv.org/pdf/1910.05109 (and earlier work). No specific
/// intrinsics are used here; instead we recite a kotodama divination known
/// as "vector extensions" which by chance is visible to humans as C syntax.
[[IRCD_CLONES(IRCD_B64_TARGETS)]]
ircd::u8x64
ircd::b64::encode_block(const u8x64 in,
                        const dictionary dict)
noexcept
{
	size_t i, j, k;

	// vpermb: gather the input bytes according to encode_permute_tab so the
	// bytes of each input triple sit together for the shifts below.
	u8x64 _perm;
	#pragma clang loop vectorize(enable) unroll(full)
	for(k = 0; k < 64; ++k)
		_perm[k] = in[encode_permute_tab[k]];

	// View the gathered bytes as eight 64-bit lanes. For each of the eight
	// output characters belonging to a lane, shift the lane right by the
	// amount listed in encode_shift_ctrl so the wanted six bits end up in
	// the low bits.
	// TODO: currently does not achieve vpmultishiftqb on avx512vbmi
	u64x8 sh[8], perm(_perm);
	#pragma clang loop vectorize(enable)
	for(i = 0; i < 8; ++i)
		for(j = 0; j < 8; ++j)
			sh[i][j] = perm[i] >> encode_shift_ctrl[i * 8 + j];

	// Discard everything above the low six bits of each shifted value.
	// TODO: not needed if vpmultishiftqb is emitted.
	#pragma clang loop vectorize(enable) vectorize_predicate(enable)
	for(i = 0; i < 8; ++i)
		for(j = 0; j < 8; ++j)
			sh[i][j] &= 0x3f;

	// Translate each 6-bit value into its character from the dictionary.
	u32x8 res[8];
	#pragma clang loop vectorize(enable) unroll(full)
	for(i = 0; i < 8; ++i)
		for(j = 0; j < 8; ++j)
			res[i][j] = dict[sh[i][j]];

	// Narrow the 64 characters back to bytes in output order.
	u8x64 ret;
	#pragma clang loop vectorize(enable) unroll(full)
	for(i = 0, k = 0; i < 8; ++i)
		for(j = 0; j < 8; ++j)
			ret[k++] = res[i][j];

	return ret;
}

//
// Base64 decode
//

/// Decode base64 from in to the buffer at out; out can be 75% of the size
/// of in.
///
/// Trailing '=' characters are excluded from the input before decoding
/// (endswith_count() is taken to report how many there are). Characters of
/// any of the dictionaries in this file are accepted, as decode_tab covers
/// them all. The input is consumed in blocks of 64 characters, each producing
/// 48 output bytes; when `out` is too small the output is truncated to
/// size(out).
///
/// Returns a buffer starting at data(out) covering the decoded bytes.
/// Throws invalid_encoding if any examined character is not a base64
/// character; note that output has already been written to `out` by then.
ircd::const_buffer
ircd::b64::decode(const mutable_buffer out,
                  const string_view in)
{
	const size_t pads
	{
		endswith_count(in, '=')
	};

	const size_t in_len
	{
		size(in) - pads
	};

	// Never write more than the destination can hold.
	const size_t out_len
	{
		std::min(decode_size(in_len), size(out))
	};

	// Block counter. This is a size_t rather than a 32-bit uint so the byte
	// offsets (i * 64) and (i * 48) below are computed at the same width as
	// the buffer sizes and cannot wrap around on multi-gigabyte buffers.
	size_t i;

	// Accumulated error flags; any non-zero lane means invalid input.
	i64x8 err {0};
	for(i = 0; i < in_len / 64 && i < out_len / 48; ++i)
	{
		// Destination is indexed at 48 byte stride
		auto *const __restrict__ di
		{
			data(out) + i * 48
		};

		// Source is indexed at 64 byte stride
		const u512x1_u *const __restrict__ sx
		{
			reinterpret_cast<const u512x1_u *>(data(in))
		};

		u8x64 block;
		block = sx[i];
		block = decode_block(block, err);
		#pragma clang loop vectorize(enable) unroll(full)
		for(uint j(0); j < 48; ++j)
			di[j] = block[j];
	}

	// Remainder: input shorter than a full block and/or output space shorter
	// than a full block. Both the load and the store are bounds-checked.
	for(; i * 64 < in_len && i * 48 < out_len; ++i)
	{
		auto *const __restrict__ di
		{
			data(out) + i * 48
		};

		const auto *const __restrict__ si
		{
			data(in) + i * 64
		};

		// The mask records which lanes were loaded from real input. Lanes
		// left at zero look up as invalid in decode_tab, so their error
		// flags must not be counted.
		u8x64 block {0}, mask {0};
		for(uint j(0); j < 64 && i * 64 + j < in_len; ++j)
			block[j] = si[j],
			mask[j] = 0xff;

		i64x8 _err {0};
		block = decode_block(block, _err);
		for(uint j(0); j < 48 && i * 48 + j < out_len; ++j)
			di[j] = block[j];

		err |= _err & i64x8(mask);
	}

	if(unlikely(simd::any(u64x8(err))))
		throw invalid_encoding
		{
			"base64 encoding contained invalid characters."
		};

	return string_view
	{
		data(out), out_len
	};
}

/// Decode 64 base64 characters into a 48 byte result. The last 16 bytes of
/// the returned vector are undefined for the caller. For every input
/// character which decode_tab does not recognize, the corresponding byte of
/// `err` (viewed as 64 bytes) is set; `err` is only ever OR'ed into, so it
/// may accumulate over several calls.
[[IRCD_CLONES(IRCD_B64_TARGETS)]]
ircd::u8x64
ircd::b64::decode_block(const u8x64 block,
                        i64x8 &__restrict__ err)
noexcept
{
	size_t i, j;

	// Widen each input character to 32 bits and replace it by its 6-bit
	// value from decode_tab; unrecognized characters yield 0x40.
	i32x16 vals[4];
	#pragma clang loop vectorize(enable) unroll(full)
	for(i = 0; i < 4; ++i)
		#pragma clang loop vectorize(enable) unroll(full)
		for(j = 0; j < 16; ++j)
			vals[i][j] = block[i * 16 + j],
			vals[i][j] = decode_tab[vals[i][j]];

	// Per-character error flags: the vector comparison sets every bit of a
	// lane whose value is 64 or more, which narrows to 0xff in _err.
	u8x64 _err;
	i32x16 errs;
	#pragma clang loop vectorize(enable) unroll(full)
	for(i = 0; i < 4; ++i)
		#pragma clang loop vectorize(enable) unroll(full)
		for(j = 0, errs = vals[i] >= 64; j < 16; ++j)
			_err[i * 16 + j] = errs[j];

	// Split consecutive pairs of values: the first of each pair goes to ah
	// (high part), the second to al (low part).
	u16x32 al, ah;
	#pragma clang loop vectorize(enable) unroll(full)
	for(i = 0; i < 4; ++i)
		for(j = 0; j < 8; ++j)
			ah[i * 8 + j] = vals[i][j * 2 + 0],
			al[i * 8 + j] = vals[i][j * 2 + 1];

	// Join each pair of 6-bit values into one 12-bit value.
	u16x32 a;
	#pragma clang loop vectorize(enable) unroll(full)
	for(i = 0; i < 32; ++i)
		a[i] = ah[i] * 64U + al[i];

	// Join each pair of 12-bit values into one 24-bit value per 32-bit lane;
	// a lane now holds the three decoded bytes of four input characters.
	i32x16 b;
	#pragma clang loop vectorize(enable) unroll(full)
	for(i = 0, j = 0; i < 16; ++i, j += 2)
		b[i] = a[j] * 4096U + a[j + 1];

	// View the lanes as bytes and gather bytes 2, 1, 0 of every lane, as
	// listed in decode_permute_tab_le, into the first 48 result bytes.
	u8x64 c(b), ret;
	#pragma clang loop vectorize(enable) unroll(full)
	for(i = 0; i < 48; ++i)
		ret[i] = c[decode_permute_tab_le[i]];

	// Merge this block's error flags into the caller's accumulator.
	err |= i64x8(_err);
	return ret;
}