/*++ Copyright (c) Microsoft Corporation Licensed under the MIT license. Module Name: - u8u16convert.h Abstract: - Defines classes which hold the status of the current partials handling. - Defines functions for converting between UTF-8 and UTF-16 strings. Tests have been made in order to investigate whether or not own algorithms could overcome disadvantages of syscalls. Test results can be read up in PR #4093 and the test algorithms are available in src\tools\U8U16Test. Based on the results the decision was made to keep using the platform functions MultiByteToWideChar and WideCharToMultiByte. Author(s): - Steffen Illhardt (german-one) 2020 --*/ #pragma once namespace til // Terminal Implementation Library. Also: "Today I Learned" { template class u8u16state final { public: u8u16state() noexcept : _buffer{}, _utfPartials{} { } // Method Description: // - Takes a UTF-8 string and populates it with *complete* UTF-8 codepoints. // If it receives an incomplete codepoint, it will cache it until it can be completed. // Arguments: // - in - UTF-8 string_view potentially containing partial code points // - out - on return, populated with complete codepoints at the string end // Return Value: // - S_OK - the resulting string doesn't end with a partial // - S_FALSE - the resulting string contains the previously cached partials only // - E_OUTOFMEMORY - the method failed to allocate memory for the resulting string // - E_ABORT - the resulting string length would exceed the max_size and thus, the processing was aborted // - E_UNEXPECTED - an unexpected error occurred template [[nodiscard]] typename std::enable_if::value, HRESULT>::type operator()(const std::basic_string_view in, std::basic_string_view& out) noexcept { try { size_t capacity{}; RETURN_HR_IF(E_ABORT, !base::CheckAdd(in.length(), _partialsLen).AssignIfValid(&capacity)); _buffer.clear(); // If we were previously called with a huge buffer we have an equally large _buffer. // We shouldn't just keep this huge buffer around, if no one needs it anymore. if (_buffer.capacity() > 16 * 1024 && (_buffer.capacity() >> 1) > capacity) { _buffer.shrink_to_fit(); } _buffer.reserve(capacity); // copy UTF-8 code units that were remaining from the previous call (if any) if (_partialsLen != 0u) { _buffer.assign(_utfPartials.cbegin(), _utfPartials.cbegin() + _partialsLen); _partialsLen = 0u; } if (in.empty()) { out = _buffer; if (_buffer.empty()) { return S_OK; } return S_FALSE; // the partial is populated } _buffer.append(in); size_t remainingLength{ _buffer.length() }; auto backIter = _buffer.end(); // If the last byte in the string was a byte belonging to a UTF-8 multi-byte character if ((*(backIter - 1) & _Utf8BitMasks::MaskAsciiByte) > _Utf8BitMasks::IsAsciiByte) { // Check only up to 3 last bytes, if no Lead Byte was found then the byte before must be the Lead Byte and no partials are in the string const size_t stopLen{ std::min(_buffer.length(), gsl::narrow_cast(3u)) }; for (size_t sequenceLen{ 1u }; sequenceLen <= stopLen; ++sequenceLen) { --backIter; // If Lead Byte found if ((*backIter & _Utf8BitMasks::MaskContinuationByte) > _Utf8BitMasks::IsContinuationByte) { // If the Lead Byte indicates that the last bytes in the string is a partial UTF-8 code point then cache them: // Use the bitmask at index `sequenceLen`. Compare the result with the operand having the same index. If they // are not equal then the sequence has to be cached because it is a partial code point. Otherwise the // sequence is a complete UTF-8 code point and the whole string is ready for the conversion into a UTF-16 string. if ((*backIter & _cmpMasks.at(sequenceLen)) != _cmpOperands.at(sequenceLen)) { std::move(backIter, _buffer.end(), _utfPartials.begin()); remainingLength -= sequenceLen; _partialsLen = sequenceLen; } break; } } } // populate the part of the string that contains complete code points only out = { _buffer.data(), remainingLength }; return S_OK; } catch (std::length_error&) { return E_ABORT; } catch (std::bad_alloc&) { return E_OUTOFMEMORY; } catch (...) { return E_UNEXPECTED; } } // Method Description: // - Takes a UTF-16 string and populates it with *complete* UTF-16 codepoints. // If it receives an incomplete codepoint, it will cache it until it can be completed. // Arguments: // - in - UTF-16 string_view potentially containing partial code points // - out - on return, populated with complete codepoints at the string end // Return Value: // - S_OK - the resulting string doesn't end with a partial // - S_FALSE - the resulting string contains the previously cached partials only // - E_OUTOFMEMORY - the method failed to allocate memory for the resulting string // - E_ABORT - the resulting string length would exceed the max_size and thus, the processing was aborted // - E_UNEXPECTED - an unexpected error occurred template [[nodiscard]] typename std::enable_if::value, HRESULT>::type operator()(const std::basic_string_view in, std::basic_string_view& out) noexcept { try { size_t remainingLength{ in.length() }; size_t capacity{}; RETURN_HR_IF(E_ABORT, !base::CheckAdd(remainingLength, _partialsLen).AssignIfValid(&capacity)); _buffer.clear(); _buffer.reserve(capacity); // copy UTF-8 code units that were remaining from the previous call (if any) if (_partialsLen != 0u) { _buffer.push_back(_utfPartials.front()); _partialsLen = 0u; } if (in.empty()) { out = _buffer; if (_buffer.empty()) { return S_OK; } return S_FALSE; // the high surrogate is populated } // cache the last value in the string if it is in the range of high surrogates if (in.back() >= 0xD800u && in.back() <= 0xDBFFu) { _utfPartials.front() = in.back(); --remainingLength; _partialsLen = 1u; } else { _partialsLen = 0u; } // populate the part of the string that contains complete code points only _buffer.append(in, 0u, remainingLength); out = _buffer; return S_OK; } catch (std::length_error&) { return E_ABORT; } catch (std::bad_alloc&) { return E_OUTOFMEMORY; } catch (...) { return E_UNEXPECTED; } } // Method Description: // - Discard cached partials. // Arguments: // - none // Return Value: // - void void reset() noexcept { _partialsLen = 0u; } private: enum _Utf8BitMasks : BYTE { IsAsciiByte = 0b0'0000000, // Any byte representing an ASCII character has the MSB set to 0 MaskAsciiByte = 0b1'0000000, // Bit mask to be used in a bitwise AND operation to find out whether or not a byte match the IsAsciiByte pattern IsContinuationByte = 0b10'000000, // Continuation bytes of any UTF-8 non-ASCII character have the MSB set to 1 and the adjacent bit set to 0 MaskContinuationByte = 0b11'000000, // Bit mask to be used in a bitwise AND operation to find out whether or not a byte match the IsContinuationByte pattern IsLeadByteTwoByteSequence = 0b110'00000, // A lead byte that indicates a UTF-8 non-ASCII character consisting of two bytes has the two highest bits set to 1 and the adjacent bit set to 0 MaskLeadByteTwoByteSequence = 0b111'00000, // Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte match the IsLeadByteTwoByteSequence pattern IsLeadByteThreeByteSequence = 0b1110'0000, // A lead byte that indicates a UTF-8 non-ASCII character consisting of three bytes has the three highest bits set to 1 and the adjacent bit set to 0 MaskLeadByteThreeByteSequence = 0b1111'0000, // Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte match the IsLeadByteThreeByteSequence pattern IsLeadByteFourByteSequence = 0b11110'000, // A lead byte that indicates a UTF-8 non-ASCII character consisting of four bytes has the four highest bits set to 1 and the adjacent bit set to 0 MaskLeadByteFourByteSequence = 0b11111'000 // Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte match the IsLeadByteFourByteSequence pattern }; // array of bitmasks constexpr static std::array _cmpMasks{ 0, // unused _Utf8BitMasks::MaskContinuationByte, _Utf8BitMasks::MaskLeadByteTwoByteSequence, _Utf8BitMasks::MaskLeadByteThreeByteSequence, }; // array of values for the comparisons constexpr static std::array _cmpOperands{ 0, // unused _Utf8BitMasks::IsAsciiByte, // intentionally conflicts with MaskContinuationByte _Utf8BitMasks::IsLeadByteTwoByteSequence, _Utf8BitMasks::IsLeadByteThreeByteSequence, }; std::basic_string _buffer; // buffer to which the populated string_view refers std::array _utfPartials; // buffer for code units of a partial code point that have to be cached size_t _partialsLen{}; // number of cached code units }; // make clear what incoming string type the state is for typedef u8u16state u8state; typedef u8u16state u16state; // Routine Description: // - Takes a UTF-8 string and performs the conversion to UTF-16. NOTE: The function relies on getting complete UTF-8 characters at the string boundaries. // Arguments: // - in - UTF-8 string to be converted // - out - reference to the resulting UTF-16 string // Return Value: // - S_OK - the conversion succeeded // - E_OUTOFMEMORY - the function failed to allocate memory for the resulting string // - E_ABORT - the resulting string length would exceed the upper boundary of an int and thus, the conversion was aborted before the conversion has been completed // - E_UNEXPECTED - an unexpected error occurred template [[nodiscard]] typename std::enable_if::value && std::is_same::value, HRESULT>::type u8u16(const inT in, outT& out) noexcept { try { out.clear(); if (in.empty()) { return S_OK; } int lengthRequired{}; // The worst ratio of UTF-8 code units to UTF-16 code units is 1 to 1 if UTF-8 consists of ASCII only. RETURN_HR_IF(E_ABORT, !base::MakeCheckedNum(in.length()).AssignIfValid(&lengthRequired)); out.resize(in.length()); // avoid to call MultiByteToWideChar twice only to get the required size const int lengthOut = MultiByteToWideChar(gsl::narrow_cast(CP_UTF8), 0ul, in.data(), lengthRequired, out.data(), lengthRequired); out.resize(gsl::narrow_cast(lengthOut)); return lengthOut == 0 ? E_UNEXPECTED : S_OK; } catch (std::length_error&) { return E_ABORT; } catch (std::bad_alloc&) { return E_OUTOFMEMORY; } catch (...) { return E_UNEXPECTED; } } // Routine Description: // - Takes a UTF-8 string, complements and/or caches partials, and performs the conversion to UTF-16. // Arguments: // - in - UTF-8 string to be converted // - out - reference to the resulting UTF-16 string // - state - reference to a til::u8state class holding the status of the current partials handling // Return Value: // - S_OK - the conversion succeeded // - E_OUTOFMEMORY - the function failed to allocate memory for the resulting string // - E_ABORT - the resulting string length would exceed the upper boundary of an int and thus, the conversion was aborted before the conversion has been completed // - E_UNEXPECTED - an unexpected error occurred template [[nodiscard]] typename std::enable_if::value && std::is_same::value, HRESULT>::type u8u16(const inT in, outT& out, u8state& state) noexcept { std::string_view sv{}; RETURN_IF_FAILED(state(std::string_view{ in }, sv)); return til::u8u16(sv, out); } // Routine Description: // - Takes a UTF-16 string and performs the conversion to UTF-8. NOTE: The function relies on getting complete UTF-16 characters at the string boundaries. // Arguments: // - in - UTF-16 string to be converted // - out - reference to the resulting UTF-8 string // Return Value: // - S_OK - the conversion succeeded // - E_OUTOFMEMORY - the function failed to allocate memory for the resulting string // - E_ABORT - the resulting string length would exceed the upper boundary of an int and thus, the conversion was aborted before the conversion has been completed // - E_UNEXPECTED - an unexpected error occurred template [[nodiscard]] typename std::enable_if::value && std::is_same::value, HRESULT>::type u16u8(const inT in, outT& out) noexcept { try { out.clear(); if (in.empty()) { return S_OK; } int lengthIn{}; int lengthRequired{}; // Code Point U+0000..U+FFFF: 1 UTF-16 code unit --> 1..3 UTF-8 code units. // Code Points >U+FFFF: 2 UTF-16 code units --> 4 UTF-8 code units. // Thus, the worst ratio of UTF-16 code units to UTF-8 code units is 1 to 3. RETURN_HR_IF(E_ABORT, !base::MakeCheckedNum(in.length()).AssignIfValid(&lengthIn) || !base::CheckMul(lengthIn, 3).AssignIfValid(&lengthRequired)); out.resize(gsl::narrow_cast(lengthRequired)); // avoid to call WideCharToMultiByte twice only to get the required size const int lengthOut = WideCharToMultiByte(gsl::narrow_cast(CP_UTF8), 0ul, in.data(), lengthIn, out.data(), lengthRequired, nullptr, nullptr); out.resize(gsl::narrow_cast(lengthOut)); return lengthOut == 0 ? E_UNEXPECTED : S_OK; } catch (std::length_error&) { return E_ABORT; } catch (std::bad_alloc&) { return E_OUTOFMEMORY; } catch (...) { return E_UNEXPECTED; } } // Routine Description: // - Takes a UTF-16 string, complements and/or caches partials, and performs the conversion to UTF-8. // Arguments: // - in - UTF-16 string to be converted // - out - reference to the resulting UTF-8 string // - state - reference to a til::u16state class holding the status of the current partials handling // Return Value: // - S_OK - the conversion succeeded without any change of the represented code points // - E_OUTOFMEMORY - the function failed to allocate memory for the resulting string // - E_ABORT - the resulting string length would exceed the upper boundary of an int and thus, the conversion was aborted before the conversion has been completed // - E_UNEXPECTED - an unexpected error occurred template [[nodiscard]] typename std::enable_if::value && std::is_same::value, HRESULT>::type u16u8(const inT in, outT& out, u16state& state) noexcept { std::wstring_view sv{}; RETURN_IF_FAILED(state(std::wstring_view{ in }, sv)); return u16u8(sv, out); } // Routine Description: // - Takes a UTF-8 string and performs the conversion to UTF-16. NOTE: The function relies on getting complete UTF-8 characters at the string boundaries. // Arguments: // - in - UTF-8 string to be converted // Return Value: // - the resulting UTF-16 string // - NOTE: Throws HRESULT errors that the non-throwing sibling returns template typename std::enable_if::value, std::wstring>::type u8u16(const inT in) { std::wstring out{}; THROW_IF_FAILED(u8u16(std::string_view{ in }, out)); return out; } // Routine Description: // Takes a UTF-8 string, complements and/or caches partials, and performs the conversion to UTF-16. // Arguments: // - in - UTF-8 string to be converted // - state - reference to a til::u8state class holding the status of the current partials handling // Return Value: // - the resulting UTF-16 string // - NOTE: Throws HRESULT errors that the non-throwing sibling returns template typename std::enable_if::value, std::wstring>::type u8u16(const inT in, u8state& state) { std::wstring out{}; THROW_IF_FAILED(u8u16(std::string_view{ in }, out, state)); return out; } // Routine Description: // - Takes a UTF-16 string and performs the conversion to UTF-8. NOTE: The function relies on getting complete UTF-16 characters at the string boundaries. // Arguments: // - in - UTF-16 string to be converted // Return Value: // - the resulting UTF-8 string // - NOTE: Throws HRESULT errors that the non-throwing sibling returns template typename std::enable_if::value, std::string>::type u16u8(const inT in) { std::string out{}; THROW_IF_FAILED(u16u8(std::wstring_view{ in }, out)); return out; } // Routine Description: // Takes a UTF-16 string, complements and/or caches partials, and performs the conversion to UTF-8. // Arguments: // - in - UTF-16 string to be converted // - state - reference to a til::u16state class holding the status of the current partials handling // Return Value: // - the resulting UTF-8 string // - NOTE: Throws HRESULT errors that the non-throwing sibling returns template typename std::enable_if::value, std::string>::type u16u8(const inT in, u16state& state) { std::string out{}; THROW_IF_FAILED(u16u8(std::wstring_view{ in }, out, state)); return out; } }