ff23be04fb
<!-- Enter a brief description/summary of your PR here. What does it fix/what does it change/how was it tested (even manually, if necessary)? --> ## Summary of the Pull Request Many places in this codebase have an equality comparison to the boolean FALSE. This adds unneeded complexity, as C and C++ have a NOT operator for use in if statements. This makes the code more readable in those areas. <!-- Other than the issue solved, is this relevant to any other issues/existing PRs? --> ## References <!-- Please review the items on the PR checklist before submitting--> ## PR Checklist * [X] CLA signed. If not, go over [here](https://cla.opensource.microsoft.com/microsoft/Terminal) and sign the CLA * [X] Tests added/passed * [ ] Requires documentation to be updated * [ ] I've discussed this with core contributors already. If not checked, I'm ready to accept this work might be rejected in favor of a different grand plan. Issue number where discussion took place: #xxx <!-- Provide a more detailed description of the PR, other things fixed or any additional comments/features here --> ## Detailed Description of the Pull Request / Additional comments One boolean being compared to FALSE was only used once, with the boolean name being "b", so it is better off not existing at all. <!-- Describe how you validated the behavior. Add automated tests wherever possible, but list manual validation steps taken as well --> ## Validation Steps Performed Unit Testing passed, compiler refactoring
781 lines
29 KiB
C++
781 lines
29 KiB
C++
// TEST TOOL U8U16Test
// Performance tests for UTF-8 <--> UTF-16 conversions, related to PR #4093
// NOTE The functions u8u16 and u16u8 contain their own algorithms. Tests have shown that they perform
// worse than the platform API functions.
// Thus, these functions are *unrelated* to the til::u8u16 and til::u16u8 implementation.
|
|
|
|
#include "U8U16Test.hpp"
|
|
|
|
u8state::u8state() noexcept :
|
|
_buffer8{},
|
|
_utf8Partials{}
|
|
{
|
|
}
|
|
|
|
// Routine Description:
// - Prepares one chunk of a UTF-8 stream for conversion. Bytes cached by the previous
//   call are prepended, and a code point that is cut off at the end of `in` is held
//   back (cached) so that only complete code points are handed out.
// Arguments:
// - in - the current chunk of UTF-8 text. May be empty to retrieve a cached partial.
// - out - receives a view of the internal buffer. The buffer is cleared and refilled
//   by the next call, so the view must be consumed before calling again.
// Return Value:
// - S_OK - `out` was populated (it is empty if there was nothing to hand out).
// - S_FALSE - `in` was empty and the previously cached partial was given back in `out`.
// - E_ABORT - the required buffer length overflowed, or std::length_error was thrown.
// - E_OUTOFMEMORY - std::bad_alloc was thrown.
// - E_UNEXPECTED - any other exception was thrown.
[[nodiscard]] HRESULT u8state::operator()(const std::string_view in, std::string_view& out) noexcept
{
    try
    {
        size_t remainingLength{ in.length() };
        size_t capacity{};
        if (FAILED(SizeTAdd(remainingLength, _partialsLen, &capacity)))
        {
            return E_ABORT;
        }

        _buffer8.clear();
        _buffer8.reserve(capacity);

        // copy UTF-8 code units that were remaining from the previous call (if any)
        if (_partialsLen != 0u)
        {
            _buffer8.assign(_utf8Partials.cbegin(), _utf8Partials.cbegin() + _partialsLen);
            _partialsLen = 0u;
        }

        if (in.empty())
        {
            out = _buffer8;
            if (_buffer8.empty())
            {
                return S_OK;
            }

            return S_FALSE; // the partial is given back
        }

        auto backIter = in.end() - 1;
        // If the last byte in the string was a byte belonging to a UTF-8 multi-byte character
        if ((*backIter & _Utf8BitMasks::MaskAsciiByte) > _Utf8BitMasks::IsAsciiByte)
        {
            // Check the last 3 bytes at most, but never more bytes than `in` contains.
            // If no Lead Byte is found among the last 3 bytes then the byte before must be
            // the Lead Byte of a complete sequence and no partials are in the string.
            // The bound is inclusive: for chunks shorter than 4 bytes the first byte of the
            // chunk has to be examined as well, otherwise a Lead Byte at the very beginning
            // of a short chunk would slip through without being cached.
            const size_t maxSequenceLen{ std::min(in.length(), static_cast<size_t>(3u)) };
            for (size_t sequenceLen{ 1u };; ++sequenceLen, --backIter)
            {
                // If Lead Byte found
                if ((*backIter & _Utf8BitMasks::MaskContinuationByte) > _Utf8BitMasks::IsContinuationByte)
                {
                    // If the Lead Byte indicates that the last bytes in the string is a partial UTF-8 code point then cache them:
                    //  Use the bitmask at index `sequenceLen`. Compare the result with the operand having the same index. If they
                    //  are not equal then the sequence has to be cached because it is a partial code point. Otherwise the
                    //  sequence is a complete UTF-8 code point and the whole string is ready for the conversion.
                    if ((*backIter & _cmpMasks.at(sequenceLen)) != _cmpOperands.at(sequenceLen))
                    {
                        std::move(backIter, in.end(), _utf8Partials.begin());
                        remainingLength -= sequenceLen;
                        _partialsLen = sequenceLen;
                    }

                    break;
                }

                // Leave the loop here (before the iteration expression runs) so that
                // `backIter` is never moved in front of the first byte of `in`.
                if (sequenceLen == maxSequenceLen)
                {
                    break;
                }
            }
        }

        // give back the part of the string that contains complete code points only
        _buffer8.append(in, 0u, remainingLength);
        out = _buffer8;

        return S_OK;
    }
    catch (std::length_error&)
    {
        return E_ABORT;
    }
    catch (std::bad_alloc&)
    {
        return E_OUTOFMEMORY;
    }
    catch (...)
    {
        return E_UNEXPECTED;
    }
}
|
|
|
|
void u8state::reset() noexcept
|
|
{
|
|
_partialsLen = 0u;
|
|
}
|
|
|
|
u16state::u16state() noexcept :
|
|
_buffer16{}
|
|
{
|
|
}
|
|
|
|
// Routine Description:
// - Prepares one chunk of a UTF-16 stream for conversion. A high surrogate cached by
//   the previous call is prepended, and a high surrogate at the very end of `in` is
//   held back (cached) for the next call.
// Arguments:
// - in - the current chunk of UTF-16 text. May be empty to retrieve a cached surrogate.
// - out - receives a view of the internal buffer. The buffer is cleared and refilled
//   by the next call, so the view must be consumed before calling again.
// Return Value:
// - S_OK - `out` was populated (it is empty if there was nothing to hand out).
// - S_FALSE - `in` was empty and the previously cached high surrogate was given back.
// - E_ABORT - the required buffer length overflowed, or std::length_error was thrown.
// - E_OUTOFMEMORY - std::bad_alloc was thrown.
// - E_UNEXPECTED - any other exception was thrown.
[[nodiscard]] HRESULT u16state::operator()(const std::wstring_view in, std::wstring_view& out) noexcept
{
    try
    {
        size_t remainingLength{ in.length() };
        size_t capacity{};
        if (FAILED(SizeTAdd(remainingLength, _cached, &capacity)))
        {
            return E_ABORT;
        }

        _buffer16.clear();
        _buffer16.reserve(capacity);

        // start with the high surrogate that was held back by the previous call (if any)
        if (_cached != 0u)
        {
            _buffer16.push_back(_highSurrogate);
            _cached = 0u;
        }

        if (in.empty())
        {
            out = _buffer16;
            // a non-empty buffer means the cached high surrogate is given back
            return _buffer16.empty() ? S_OK : S_FALSE;
        }

        const auto lastUnit{ in.back() };
        if (lastUnit < 0xD800u || lastUnit > 0xDBFFu)
        {
            // the chunk does not end with a high surrogate, nothing to hold back
            _cached = 0u;
        }
        else
        {
            // the chunk ends with a high surrogate: keep it for the next call
            _highSurrogate = lastUnit;
            --remainingLength;
            _cached = 1u;
        }

        // hand out everything except a trailing high surrogate
        _buffer16.append(in, 0u, remainingLength);
        out = _buffer16;

        return S_OK;
    }
    catch (const std::length_error&)
    {
        return E_ABORT;
    }
    catch (const std::bad_alloc&)
    {
        return E_OUTOFMEMORY;
    }
    catch (...)
    {
        return E_UNEXPECTED;
    }
}
|
|
|
|
void u16state::reset() noexcept
|
|
{
|
|
_cached = 0u;
|
|
}
|
|
|
|
// Routine Description:
// - Converts UTF-8 text to UTF-16 using a hand-written decoder. The output is built
//   with push_back.
// Arguments:
// - in - the UTF-8 text to convert.
// - out - receives the UTF-16 text. Its previous content is discarded.
// - discardInvalids - if true, every code point that ends up as U+FFFD is omitted from
//   `out`. This includes a U+FFFD that was validly encoded in `in`. If false, each
//   invalid sequence is represented by U+FFFD.
// Return Value:
// - S_OK - all of `in` was valid UTF-8 (or `in` was empty).
// - S_FALSE - at least one invalid or truncated sequence was encountered.
// - E_ABORT - std::length_error was thrown.
// - E_OUTOFMEMORY - std::bad_alloc was thrown.
// - E_UNEXPECTED - any other exception was thrown.
[[nodiscard]] HRESULT u8u16(const std::string_view in, std::wstring& out, bool discardInvalids) noexcept
{
    constexpr uint8_t contBegin{ 0x80u }; // usual begin of the range of continuation Bytes
    constexpr uint8_t contEnd{ 0xBFu }; // usual end of the range of continuation Bytes
    constexpr uint32_t unicodeReplacementChar{ 0xFFFDu }; // Unicode Replacement Character

    try
    {
        out.clear();
        if (in.empty())
        {
            return S_OK;
        }

        HRESULT hRes{ S_OK };
        out.reserve(in.length()); // avoid any further re-allocations and copying

        const auto end8{ in.cend() };
        auto it8{ in.cbegin() };

        // true if the byte `offset` positions behind it8 exists and lies within [low, high]
        const auto trailInRange = [&it8, end8](const int offset, const uint8_t low, const uint8_t high) {
            if ((end8 - it8) <= offset)
            {
                return false;
            }

            const auto trail{ static_cast<uint8_t>(*(it8 + offset)) };
            return trail >= low && trail <= high;
        };

        // the 6 payload bits of the (already validated) continuation byte at it8 + offset
        const auto payload = [&it8](const int offset) {
            return static_cast<uint32_t>(static_cast<uint8_t>(*(it8 + offset)) & 0x3Fu);
        };

        while (it8 < end8)
        {
            const auto lead{ static_cast<uint8_t>(*it8) };

            // *** convert ASCII directly to UTF-16 ***
            // valid single bytes
            // - 00..7F
            if (lead <= 0x7Fu)
            {
                out.push_back(static_cast<wchar_t>(lead));
                ++it8;
                continue;
            }

            uint32_t codePoint{ unicodeReplacementChar }; // stays U+FFFD unless a sequence decodes completely
            size_t consumed{ 1u }; // number of bytes belonging to the (possibly truncated) sequence
            bool complete{ false };

            if (lead >= 0xC2u && lead <= 0xDFu)
            {
                // valid two bytes
                // - C2..DF | 80..BF   (first byte 0xC0 and 0xC1 invalid)
                if (trailInRange(1, contBegin, contEnd))
                {
                    consumed = 2u;
                    complete = true;
                    codePoint = ((lead & 0x1Fu) << 6u) | payload(1);
                }
            }
            else if (lead >= 0xE0u && lead <= 0xEFu)
            {
                // valid three bytes
                // - E0     | A0..BF | 80..BF
                // - E1..EC | 80..BF | 80..BF
                // - ED     | 80..9F | 80..BF
                // - EE..EF | 80..BF | 80..BF
                const auto secondLow{ static_cast<uint8_t>(lead == 0xE0u ? 0xA0u : contBegin) };
                const auto secondHigh{ static_cast<uint8_t>(lead == 0xEDu ? 0x9Fu : contEnd) };
                if (trailInRange(1, secondLow, secondHigh))
                {
                    consumed = 2u;
                    if (trailInRange(2, contBegin, contEnd))
                    {
                        consumed = 3u;
                        complete = true;
                        codePoint = ((lead & 0x0Fu) << 12u) | (payload(1) << 6u) | payload(2);
                    }
                }
            }
            else if (lead >= 0xF0u && lead <= 0xF4u)
            {
                // valid four bytes
                // - F0     | 90..BF | 80..BF | 80..BF
                // - F1..F3 | 80..BF | 80..BF | 80..BF
                // - F4     | 80..8F | 80..BF | 80..BF
                const auto secondLow{ static_cast<uint8_t>(lead == 0xF0u ? 0x90u : contBegin) };
                const auto secondHigh{ static_cast<uint8_t>(lead == 0xF4u ? 0x8Fu : contEnd) };
                if (trailInRange(1, secondLow, secondHigh))
                {
                    consumed = 2u;
                    if (trailInRange(2, contBegin, contEnd))
                    {
                        consumed = 3u;
                        if (trailInRange(3, contBegin, contEnd))
                        {
                            consumed = 4u;
                            complete = true;
                            codePoint = ((lead & 0x07u) << 18u) | (payload(1) << 12u) | (payload(2) << 6u) | payload(3);
                        }
                    }
                }
            }
            // any other byte (80..C1, F5..FF) is invalid here and is skipped on its own

            if (!complete)
            {
                hRes = S_FALSE;
            }

            it8 += consumed;

            // *** convert the code point to UTF-16 ***
            if (discardInvalids && codePoint == unicodeReplacementChar)
            {
                continue;
            }

            if (codePoint < 0x00010000u)
            {
                out.push_back(static_cast<wchar_t>(codePoint));
            }
            else
            {
                // code points beyond the BMP are written as a surrogate pair
                const uint32_t offset{ codePoint - 0x00010000u };
                out.push_back(static_cast<wchar_t>(0x0000D800u + ((offset >> 10u) & 0x000003FFu)));
                out.push_back(static_cast<wchar_t>(0x0000DC00u + (offset & 0x000003FFu)));
            }
        }

        // out.shrink_to_fit();
        return hRes;
    }
    catch (const std::length_error&)
    {
        return E_ABORT;
    }
    catch (const std::bad_alloc&)
    {
        return E_OUTOFMEMORY;
    }
    catch (...)
    {
        return E_UNEXPECTED;
    }
}
|
|
|
|
// Routine Description:
// - Converts UTF-8 text to UTF-16 using a hand-written decoder. `out` is resized to
//   the length of `in` up front, the code units are written through a raw pointer, and
//   `out` is cut to the number of written code units at the end.
// Arguments:
// - in - the UTF-8 text to convert.
// - out - receives the UTF-16 text. Its previous content is discarded.
// - discardInvalids - if true, every code point that ends up as U+FFFD is omitted from
//   `out`. This includes a U+FFFD that was validly encoded in `in`. If false, each
//   invalid sequence is represented by U+FFFD.
// Return Value:
// - S_OK - all of `in` was valid UTF-8 (or `in` was empty).
// - S_FALSE - at least one invalid or truncated sequence was encountered.
// - E_ABORT - std::length_error was thrown.
// - E_OUTOFMEMORY - std::bad_alloc was thrown.
// - E_UNEXPECTED - any other exception was thrown.
[[nodiscard]] HRESULT u8u16_ptr(const std::string_view in, std::wstring& out, bool discardInvalids) noexcept
{
    constexpr uint8_t contBegin{ 0x80u }; // usual begin of the range of continuation Bytes
    constexpr uint8_t contEnd{ 0xBFu }; // usual end of the range of continuation Bytes
    constexpr uint32_t unicodeReplacementChar{ 0xFFFDu }; // Unicode Replacement Character

    try
    {
        out.clear();
        if (in.empty())
        {
            return S_OK;
        }

        HRESULT hRes{ S_OK };
        out.resize(in.length()); // avoid any further re-allocations and copying

        wchar_t* dest{ out.data() }; // write cursor into `out`
        const auto end8{ in.cend() };
        auto it8{ in.cbegin() };

        // true if the byte `offset` positions behind it8 exists and lies within [low, high]
        const auto trailInRange = [&it8, end8](const int offset, const uint8_t low, const uint8_t high) {
            if ((end8 - it8) <= offset)
            {
                return false;
            }

            const auto trail{ static_cast<uint8_t>(*(it8 + offset)) };
            return trail >= low && trail <= high;
        };

        // the 6 payload bits of the (already validated) continuation byte at it8 + offset
        const auto payload = [&it8](const int offset) {
            return static_cast<uint32_t>(static_cast<uint8_t>(*(it8 + offset)) & 0x3Fu);
        };

        while (it8 < end8)
        {
            const auto lead{ static_cast<uint8_t>(*it8) };

            // *** convert ASCII directly to UTF-16 ***
            // valid single bytes
            // - 00..7F
            if (lead <= 0x7Fu)
            {
                *dest = static_cast<wchar_t>(lead);
                ++dest;
                ++it8;
                continue;
            }

            uint32_t codePoint{ unicodeReplacementChar }; // stays U+FFFD unless a sequence decodes completely
            size_t consumed{ 1u }; // number of bytes belonging to the (possibly truncated) sequence
            bool complete{ false };

            if (lead >= 0xC2u && lead <= 0xDFu)
            {
                // valid two bytes
                // - C2..DF | 80..BF   (first byte 0xC0 and 0xC1 invalid)
                if (trailInRange(1, contBegin, contEnd))
                {
                    consumed = 2u;
                    complete = true;
                    codePoint = ((lead & 0x1Fu) << 6u) | payload(1);
                }
            }
            else if (lead >= 0xE0u && lead <= 0xEFu)
            {
                // valid three bytes
                // - E0     | A0..BF | 80..BF
                // - E1..EC | 80..BF | 80..BF
                // - ED     | 80..9F | 80..BF
                // - EE..EF | 80..BF | 80..BF
                const auto secondLow{ static_cast<uint8_t>(lead == 0xE0u ? 0xA0u : contBegin) };
                const auto secondHigh{ static_cast<uint8_t>(lead == 0xEDu ? 0x9Fu : contEnd) };
                if (trailInRange(1, secondLow, secondHigh))
                {
                    consumed = 2u;
                    if (trailInRange(2, contBegin, contEnd))
                    {
                        consumed = 3u;
                        complete = true;
                        codePoint = ((lead & 0x0Fu) << 12u) | (payload(1) << 6u) | payload(2);
                    }
                }
            }
            else if (lead >= 0xF0u && lead <= 0xF4u)
            {
                // valid four bytes
                // - F0     | 90..BF | 80..BF | 80..BF
                // - F1..F3 | 80..BF | 80..BF | 80..BF
                // - F4     | 80..8F | 80..BF | 80..BF
                const auto secondLow{ static_cast<uint8_t>(lead == 0xF0u ? 0x90u : contBegin) };
                const auto secondHigh{ static_cast<uint8_t>(lead == 0xF4u ? 0x8Fu : contEnd) };
                if (trailInRange(1, secondLow, secondHigh))
                {
                    consumed = 2u;
                    if (trailInRange(2, contBegin, contEnd))
                    {
                        consumed = 3u;
                        if (trailInRange(3, contBegin, contEnd))
                        {
                            consumed = 4u;
                            complete = true;
                            codePoint = ((lead & 0x07u) << 18u) | (payload(1) << 12u) | (payload(2) << 6u) | payload(3);
                        }
                    }
                }
            }
            // any other byte (80..C1, F5..FF) is invalid here and is skipped on its own

            if (!complete)
            {
                hRes = S_FALSE;
            }

            it8 += consumed;

            // *** convert the code point to UTF-16 ***
            if (discardInvalids && codePoint == unicodeReplacementChar)
            {
                continue;
            }

            if (codePoint < 0x00010000u)
            {
                *dest = static_cast<wchar_t>(codePoint);
                ++dest;
            }
            else
            {
                // code points beyond the BMP are written as a surrogate pair
                const uint32_t offset{ codePoint - 0x00010000u };
                *dest = static_cast<wchar_t>(0x0000D800u + ((offset >> 10u) & 0x000003FFu));
                ++dest;
                *dest = static_cast<wchar_t>(0x0000DC00u + (offset & 0x000003FFu));
                ++dest;
            }
        }

        // cut off the part of the pre-sized string that was not written
        out.resize(static_cast<size_t>(dest - out.data()));
        return hRes;
    }
    catch (const std::length_error&)
    {
        return E_ABORT;
    }
    catch (const std::bad_alloc&)
    {
        return E_OUTOFMEMORY;
    }
    catch (...)
    {
        return E_UNEXPECTED;
    }
}
|
|
|
|
// Routine Description:
// - Converts UTF-16 text to UTF-8 using a hand-written encoder. The output is built
//   with push_back.
// Arguments:
// - in - the UTF-16 text to convert.
// - out - receives the UTF-8 text. Its previous content is discarded.
// - discardInvalids - if true, every code point that ends up as U+FFFD is omitted from
//   `out`. This includes a U+FFFD that was present in `in`. If false, each unpaired
//   surrogate is represented by U+FFFD.
// Return Value:
// - S_OK - all of `in` was valid UTF-16 (or `in` was empty).
// - S_FALSE - at least one unpaired surrogate was encountered.
// - E_ABORT - std::length_error was thrown.
// - E_OUTOFMEMORY - std::bad_alloc was thrown.
// - E_UNEXPECTED - any other exception was thrown.
[[nodiscard]] HRESULT u16u8(const std::wstring_view in, std::string& out, bool discardInvalids) noexcept
{
    constexpr uint32_t unicodeReplacementChar{ 0xFFFDu };

    try
    {
        out.clear();
        if (in.empty())
        {
            return S_OK;
        }

        HRESULT hRes{ S_OK };

        // Reserve 3 bytes per UTF-16 code unit. If that length overflows, fall back to
        // the larger of the present capacity and the input length.
        size_t lengthHint{};
        if (FAILED(SizeTMult(in.length(), static_cast<size_t>(3u), &lengthHint)))
        {
            lengthHint = std::max(out.capacity(), in.length());
        }

        out.reserve(lengthHint); // avoid any further re-allocations and copying

        const auto isLowSurrogate = [](const uint32_t unit) {
            return unit >= 0xDC00u && unit <= 0xDFFFu;
        };

        const auto end16{ in.cend() };
        auto it16{ in.cbegin() };
        while (it16 < end16)
        {
            // fetch and consume the current code unit
            const auto unit{ static_cast<uint32_t>(*it16) };
            ++it16;

            // *** convert ASCII directly to UTF-8 ***
            if (unit <= 0x007Fu)
            {
                out.push_back(static_cast<char>(unit));
                continue;
            }

            uint32_t codePoint{ unicodeReplacementChar }; // stays U+FFFD for unpaired surrogates

            // *** convert UTF-16 to a code point ***
            if (unit >= 0xD800u && unit <= 0xDBFFu) // range of high surrogates
            {
                if (it16 < end16 && isLowSurrogate(static_cast<uint32_t>(*it16)))
                {
                    // combine the pair; the constant removes both surrogate offsets and adds 0x10000
                    codePoint = (unit << 10u) + static_cast<uint32_t>(*it16) - static_cast<uint32_t>(0x035FDC00u);
                    ++it16;
                }
                else
                {
                    hRes = S_FALSE;
                }
            }
            else if (isLowSurrogate(unit)) // standing alone low surrogates are invalid
            {
                hRes = S_FALSE;
            }
            else
            {
                codePoint = unit;
            }

            // *** convert the code point to UTF-8 ***
            if (discardInvalids && codePoint == unicodeReplacementChar)
            {
                continue;
            }

            // the outcome of performance tests is that subsequent calls of push_back
            // perform much better than appending a single initializer_list
            if (codePoint < 0x00000800u)
            {
                out.push_back(static_cast<char>(((codePoint >> 6u) & 0x1Fu) | 0xC0u));
                out.push_back(static_cast<char>((codePoint & 0x3Fu) | 0x80u));
            }
            else if (codePoint < 0x00010000u)
            {
                out.push_back(static_cast<char>(((codePoint >> 12u) & 0x0Fu) | 0xE0u));
                out.push_back(static_cast<char>(((codePoint >> 6u) & 0x3Fu) | 0x80u));
                out.push_back(static_cast<char>((codePoint & 0x3Fu) | 0x80u));
            }
            else
            {
                out.push_back(static_cast<char>(((codePoint >> 18u) & 0x07u) | 0xF0u));
                out.push_back(static_cast<char>(((codePoint >> 12u) & 0x3Fu) | 0x80u));
                out.push_back(static_cast<char>(((codePoint >> 6u) & 0x3Fu) | 0x80u));
                out.push_back(static_cast<char>((codePoint & 0x3Fu) | 0x80u));
            }
        }

        // out.shrink_to_fit();
        return hRes;
    }
    catch (const std::length_error&)
    {
        return E_ABORT;
    }
    catch (const std::bad_alloc&)
    {
        return E_OUTOFMEMORY;
    }
    catch (...)
    {
        return E_UNEXPECTED;
    }
}
|
|
|
|
// Routine Description:
// - Converts UTF-16 text to UTF-8 using a hand-written encoder. `out` is resized to
//   3 bytes per input code unit up front, the bytes are written through a raw pointer,
//   and `out` is cut to the number of written bytes at the end.
// Arguments:
// - in - the UTF-16 text to convert.
// - out - receives the UTF-8 text. Its previous content is discarded.
// - discardInvalids - if true, every code point that ends up as U+FFFD is omitted from
//   `out`. This includes a U+FFFD that was present in `in`. If false, each unpaired
//   surrogate is represented by U+FFFD.
// Return Value:
// - S_OK - all of `in` was valid UTF-16 (or `in` was empty).
// - S_FALSE - at least one unpaired surrogate was encountered.
// - E_ABORT - the output length overflowed, or std::length_error was thrown.
// - E_OUTOFMEMORY - std::bad_alloc was thrown.
// - E_UNEXPECTED - any other exception was thrown.
[[nodiscard]] HRESULT u16u8_ptr(const std::wstring_view in, std::string& out, bool discardInvalids) noexcept
{
    constexpr uint32_t unicodeReplacementChar{ 0xFFFDu };

    try
    {
        out.clear();
        if (in.empty())
        {
            return S_OK;
        }

        HRESULT hRes{ S_OK };

        // The bytes are written without bounds checks, so the full size is required here.
        size_t lengthHint{};
        if (FAILED(SizeTMult(in.length(), static_cast<size_t>(3u), &lengthHint)))
        {
            return E_ABORT;
        }

        out.resize(lengthHint); // avoid any further re-allocations and copying

        const auto isLowSurrogate = [](const uint32_t unit) {
            return unit >= 0xDC00u && unit <= 0xDFFFu;
        };

        char* dest{ out.data() }; // write cursor into `out`
        const auto end16{ in.cend() };
        auto it16{ in.cbegin() };
        while (it16 < end16)
        {
            // fetch and consume the current code unit
            const auto unit{ static_cast<uint32_t>(*it16) };
            ++it16;

            // *** convert ASCII directly to UTF-8 ***
            if (unit <= 0x007Fu)
            {
                *dest = static_cast<char>(unit);
                ++dest;
                continue;
            }

            uint32_t codePoint{ unicodeReplacementChar }; // stays U+FFFD for unpaired surrogates

            // *** convert UTF-16 to a code point ***
            if (unit >= 0xD800u && unit <= 0xDBFFu) // range of high surrogates
            {
                if (it16 < end16 && isLowSurrogate(static_cast<uint32_t>(*it16)))
                {
                    // combine the pair; the constant removes both surrogate offsets and adds 0x10000
                    codePoint = (unit << 10u) + static_cast<uint32_t>(*it16) - static_cast<uint32_t>(0x035FDC00u);
                    ++it16;
                }
                else
                {
                    hRes = S_FALSE;
                }
            }
            else if (isLowSurrogate(unit)) // standing alone low surrogates are invalid
            {
                hRes = S_FALSE;
            }
            else
            {
                codePoint = unit;
            }

            // *** convert the code point to UTF-8 ***
            if (discardInvalids && codePoint == unicodeReplacementChar)
            {
                continue;
            }

            // the outcome of further performance tests is that using pointers
            // perform even better than subsequent calls of push_back
            if (codePoint < 0x00000800u)
            {
                *dest++ = static_cast<char>(((codePoint >> 6u) & 0x1Fu) | 0xC0u);
                *dest++ = static_cast<char>((codePoint & 0x3Fu) | 0x80u);
            }
            else if (codePoint < 0x00010000u)
            {
                *dest++ = static_cast<char>(((codePoint >> 12u) & 0x0Fu) | 0xE0u);
                *dest++ = static_cast<char>(((codePoint >> 6u) & 0x3Fu) | 0x80u);
                *dest++ = static_cast<char>((codePoint & 0x3Fu) | 0x80u);
            }
            else
            {
                *dest++ = static_cast<char>(((codePoint >> 18u) & 0x07u) | 0xF0u);
                *dest++ = static_cast<char>(((codePoint >> 12u) & 0x3Fu) | 0x80u);
                *dest++ = static_cast<char>(((codePoint >> 6u) & 0x3Fu) | 0x80u);
                *dest++ = static_cast<char>((codePoint & 0x3Fu) | 0x80u);
            }
        }

        // cut off the part of the pre-sized string that was not written
        out.resize(static_cast<size_t>(dest - out.data()));
        return hRes;
    }
    catch (const std::length_error&)
    {
        return E_ABORT;
    }
    catch (const std::bad_alloc&)
    {
        return E_OUTOFMEMORY;
    }
    catch (...)
    {
        return E_UNEXPECTED;
    }
}
|
|
|
|
// Routine Description:
// - Converts a chunk of a UTF-8 stream to UTF-16. The chunk is passed through `state`
//   first, and the view handed out by `state` is what gets converted.
// Arguments:
// - in - the current chunk of UTF-8 text.
// - out - receives the UTF-16 text.
// - state - the u8state object that carries cached bytes from one call to the next.
// - discardInvalids - forwarded to the stateless u8u16 overload.
// Return Value:
// - the failure code of `state` if it failed, otherwise the result of the stateless
//   u8u16 overload.
[[nodiscard]] HRESULT u8u16(const std::string_view in, std::wstring& out, u8state& state, bool discardInvalids) noexcept
{
    std::string_view sv{};

    // hand-written replacement for RETURN_IF_FAILED(state(in, sv))
    if (const HRESULT hRes{ state(in, sv) }; FAILED(hRes))
    {
        return hRes;
    }

    return u8u16(sv, out, discardInvalids);
}
|
|
|
|
// Routine Description:
// - Converts a chunk of a UTF-16 stream to UTF-8. The chunk is passed through `state`
//   first, and the view handed out by `state` is what gets converted.
// Arguments:
// - in - the current chunk of UTF-16 text.
// - out - receives the UTF-8 text.
// - state - the u16state object that carries a cached code unit from one call to the next.
// - discardInvalids - forwarded to the stateless u16u8 overload.
// Return Value:
// - the failure code of `state` if it failed, otherwise the result of the stateless
//   u16u8 overload.
[[nodiscard]] HRESULT u16u8(const std::wstring_view in, std::string& out, u16state& state, bool discardInvalids) noexcept
{
    std::wstring_view sv{};

    // hand-written replacement for RETURN_IF_FAILED(state(in, sv))
    if (const HRESULT hRes{ state(in, sv) }; FAILED(hRes))
    {
        return hRes;
    }

    return u16u8(sv, out, discardInvalids);
}
|
|
|
|
std::wstring u8u16(const std::string_view in, bool discardInvalids)
|
|
{
|
|
std::wstring out{};
|
|
//THROW_IF_FAILED(u8u16(in, out, discardInvalids));
|
|
const HRESULT hRes{ u8u16(in, out, discardInvalids) };
|
|
if (FAILED(hRes))
|
|
{
|
|
throw std::runtime_error("error");
|
|
}
|
|
return out;
|
|
}
|
|
|
|
std::string u16u8(const std::wstring_view in, bool discardInvalids)
|
|
{
|
|
std::string out{};
|
|
//THROW_IF_FAILED(u16u8(in, out, discardInvalids));
|
|
const HRESULT hRes{ u16u8(in, out, discardInvalids) };
|
|
if (FAILED(hRes))
|
|
{
|
|
throw std::runtime_error("error");
|
|
}
|
|
return out;
|
|
}
|
|
|
|
std::wstring u8u16(const std::string_view in, u8state& state, bool discardInvalids)
|
|
{
|
|
std::wstring out{};
|
|
//THROW_IF_FAILED(u8u16(in, out, state, discardInvalids));
|
|
const HRESULT hRes{ u8u16(in, out, state, discardInvalids) };
|
|
if (FAILED(hRes))
|
|
{
|
|
throw std::runtime_error("error");
|
|
}
|
|
return out;
|
|
}
|
|
|
|
std::string u16u8(const std::wstring_view in, u16state& state, bool discardInvalids)
|
|
{
|
|
std::string out{};
|
|
//THROW_IF_FAILED(u16u8(in, out, state, discardInvalids));
|
|
const HRESULT hRes{ u16u8(in, out, state, discardInvalids) };
|
|
if (FAILED(hRes))
|
|
{
|
|
throw std::runtime_error("error");
|
|
}
|
|
return out;
|
|
}
|