mirror of
https://github.com/matrix-construct/construct
synced 2025-03-14 05:20:17 +01:00
ircd::utf: Start an inline utf toolset.
This commit is contained in:
parent
ac21ebdc90
commit
73ee602542
4 changed files with 129 additions and 8 deletions
|
@ -62,9 +62,3 @@ namespace ircd::icu::utf16
|
|||
char32_t get_or_fffd(const string_view &) noexcept; // error = U+FFFD
|
||||
char32_t get_unsafe(const string_view &) noexcept; // error undefined
|
||||
}
|
||||
|
||||
namespace ircd
|
||||
{
|
||||
namespace utf8 = icu::utf8;
|
||||
namespace utf16 = icu::utf16;
|
||||
}
|
||||
|
|
|
@ -55,6 +55,7 @@
|
|||
#include "backtrace.h"
|
||||
#include "info.h"
|
||||
#include "icu.h"
|
||||
#include "utf.h"
|
||||
#include "time.h"
|
||||
#include "lex_cast.h"
|
||||
#include "logger.h"
|
||||
|
|
126
include/ircd/utf.h
Normal file
126
include/ircd/utf.h
Normal file
|
@ -0,0 +1,126 @@
|
|||
// The Construct
|
||||
//
|
||||
// Copyright (C) The Construct Developers, Authors & Contributors
|
||||
// Copyright (C) 2016-2020 Jason Volk <jason@zemos.net>
|
||||
//
|
||||
// Permission to use, copy, modify, and/or distribute this software for any
|
||||
// purpose with or without fee is hereby granted, provided that the above
|
||||
// copyright notice and this permission notice is present in all copies. The
|
||||
// full license for this software is available in the LICENSE file.
|
||||
|
||||
#pragma once
|
||||
#define HAVE_IRCD_UTF_H
|
||||
|
||||
/// Unicode Transformation Format
|
||||
namespace ircd::utf
|
||||
{
|
||||
IRCD_EXCEPTION(ircd::error, error)
|
||||
}
|
||||
|
||||
/// Unicode Transformation Format (8-bit)
|
||||
namespace ircd::utf8
|
||||
{
|
||||
using utf::error;
|
||||
|
||||
template<class u32xN> u32xN length(const u32xN &codepoint) noexcept;
|
||||
template<class u32xN> u32xN encode(const u32xN &codepoint) noexcept;
|
||||
}
|
||||
|
||||
/// Unicode Transformation Format (16-bit)
|
||||
namespace ircd::utf16
|
||||
{
|
||||
using utf::error;
|
||||
|
||||
u32x8 convert_u32x8(const u8x16 &pairs) noexcept;
|
||||
}
|
||||
|
||||
/// Convert utf-16 two-byte surrogates (in big-endian) to char32_t codepoints
|
||||
/// in parallel. The result vector is twice the size as the input; no template
|
||||
/// is offered yet, just the dimensions someone needed for somewhere.
|
||||
inline ircd::u32x8
|
||||
ircd::utf16::convert_u32x8(const u8x16 &string)
|
||||
noexcept
|
||||
{
|
||||
return u32x8
|
||||
{
|
||||
string[0x01] | (u32(string[0x00]) << 8),
|
||||
string[0x03] | (u32(string[0x02]) << 8),
|
||||
string[0x05] | (u32(string[0x04]) << 8),
|
||||
string[0x07] | (u32(string[0x06]) << 8),
|
||||
string[0x09] | (u32(string[0x08]) << 8),
|
||||
string[0x0b] | (u32(string[0x0a]) << 8),
|
||||
string[0x0d] | (u32(string[0x0c]) << 8),
|
||||
string[0x0f] | (u32(string[0x0e]) << 8),
|
||||
};
|
||||
}
|
||||
|
||||
/// Transform multiple char32_t codepoints to their utf-8 encodings in
|
||||
/// parallel, returning a sparse result in each char32_t (this does not
|
||||
/// compress the result down).
|
||||
template<class u32xN>
|
||||
inline u32xN
|
||||
ircd::utf8::encode(const u32xN &codepoint)
|
||||
noexcept
|
||||
{
|
||||
const u32xN len
|
||||
{
|
||||
length(codepoint)
|
||||
};
|
||||
|
||||
const u32xN enc_2
|
||||
{
|
||||
(((codepoint >> 6) | 0xc0) & 0xff) // byte[0]
|
||||
| ((((codepoint & 0x3f) | 0x80) &0xff) << 8) // byte[1]
|
||||
};
|
||||
|
||||
const u32xN enc_3
|
||||
{
|
||||
(((codepoint >> 12) | 0xe0) & 0xff) | // byte[0]
|
||||
(((((codepoint >> 6) & 0x3f) | 0x80) & 0xff) << 8) | // byte[1]
|
||||
((((codepoint & 0x3f) | 0x80) & 0xff) << 16) // byte[3]
|
||||
};
|
||||
|
||||
const u32xN enc_4
|
||||
{
|
||||
(((codepoint >> 18) | 0xf0) & 0xff) | // byte[0]
|
||||
(((((codepoint >> 12) & 0x3f) | 0x80) & 0xff) << 8) | // byte[1]
|
||||
(((((codepoint >> 6) & 0x3f) | 0x80) & 0xff) << 16) | // byte[2]
|
||||
((((codepoint & 0x3f) | 0x80) & 0xff) << 24) // byte[3]
|
||||
};
|
||||
|
||||
return 0
|
||||
| ((len == 0) & 0xFFFD)
|
||||
| ((len == 1) & codepoint)
|
||||
| ((len == 2) & enc_2)
|
||||
| ((len == 3) & enc_3)
|
||||
| ((len == 4) & enc_4)
|
||||
;
|
||||
}
|
||||
|
||||
/// Determine the utf-8 encoding length of multiple codepoints in parallel.
|
||||
/// The input vector char32_t codepoints and the output yields an integer
|
||||
/// of 0-4 for each lane.
|
||||
template<class u32xN>
|
||||
inline u32xN
|
||||
ircd::utf8::length(const u32xN &codepoint)
|
||||
noexcept
|
||||
{
|
||||
const u32xN
|
||||
length_1 { codepoint <= 0x7f },
|
||||
length_2 { codepoint <= 0x7ff && codepoint > 0x7f },
|
||||
length_3_lo { codepoint <= 0xd7ff && codepoint > 0x7ff },
|
||||
length_3_hi { codepoint <= 0xffff && codepoint > 0xdfff },
|
||||
length_4 { codepoint <= 0x10ffff && codepoint > 0xffff };
|
||||
|
||||
[[gnu::unused]] const u32xN // Preserved here for future reference
|
||||
length_3_err { codepoint <= 0xdfff && codepoint > 0xd7ff },
|
||||
length_err { (codepoint > 0x10ffff) | length_3_err };
|
||||
|
||||
return 0
|
||||
| (length_1 & 1)
|
||||
| (length_2 & 2)
|
||||
| (length_3_lo & 3)
|
||||
| (length_3_hi & 3)
|
||||
| (length_4 & 4)
|
||||
;
|
||||
}
|
|
@ -15750,11 +15750,11 @@ console_cmd__icu(opt &out, const string_view &line)
|
|||
|
||||
const size_t count
|
||||
{
|
||||
utf8::decode(ch, size(line), line)
|
||||
icu::utf8::decode(ch, size(line), line)
|
||||
};
|
||||
|
||||
char namebuf[64]; size_t li(0);
|
||||
for(size_t i(0); i < count; ++i, li += utf8::length(ch[i]))
|
||||
for(size_t i(0); i < count; ++i, li += icu::utf8::length(ch[i]))
|
||||
out
|
||||
<< ' ' << std::dec << std::right << std::setw(6) << int(icu::block(ch[i]))
|
||||
<< ' ' << std::dec << std::right << std::setw(4) << int(icu::category(ch[i]))
|
||||
|
|
Loading…
Add table
Reference in a new issue