8779249b12
The `_CONSOLE_API_MSG` buffer is resized to cover an entire message. Later on any UTF-8 data is cached in a separate temporary buffer inside `til::u8state` to prevent lone surrogate pairs. Both cases are problematic as neither buffer is freed after the read has finished. Passing a 100MB buffer to conhost once will thus cause it to continue using ~220MB of physical memory until the conhost process exits. This change releases unneeded memory as soon as the requested buffer size has halved. In practice this means that once a command has returned all buffers will shrink, as the shell commonly sends very small messages. ## PR Checklist * [x] Closes #10731 * [x] I work here * [x] Tests added/passed ## Validation Steps Performed * Buffers aren't reallocated during printing ✔️ * Buffers shrink after printing finished ✔️
464 lines
21 KiB
C++
464 lines
21 KiB
C++
/*++
|
|
Copyright (c) Microsoft Corporation
|
|
Licensed under the MIT license.
|
|
|
|
Module Name:
|
|
- u8u16convert.h
|
|
|
|
Abstract:
|
|
- Defines classes which hold the status of the current partials handling.
|
|
- Defines functions for converting between UTF-8 and UTF-16 strings.
|
|
|
|
Tests were carried out to investigate whether custom algorithms could
overcome the disadvantages of the platform functions. The test results are
documented in PR #4093 and the test algorithms are available in src\tools\U8U16Test.
Based on those results the decision was made to keep using the platform
functions MultiByteToWideChar and WideCharToMultiByte.
|
|
|
|
Author(s):
|
|
- Steffen Illhardt (german-one) 2020
|
|
--*/
|
|
|
|
#pragma once
|
|
|
|
namespace til // Terminal Implementation Library. Also: "Today I Learned"
|
|
{
|
|
template<class charT>
|
|
class u8u16state final
|
|
{
|
|
public:
|
|
u8u16state() noexcept :
|
|
_buffer{},
|
|
_utfPartials{}
|
|
{
|
|
}
|
|
|
|
// Method Description:
|
|
// - Takes a UTF-8 string and populates it with *complete* UTF-8 codepoints.
|
|
// If it receives an incomplete codepoint, it will cache it until it can be completed.
|
|
// Arguments:
|
|
// - in - UTF-8 string_view potentially containing partial code points
|
|
// - out - on return, populated with complete codepoints at the string end
|
|
// Return Value:
|
|
// - S_OK - the resulting string doesn't end with a partial
|
|
// - S_FALSE - the resulting string contains the previously cached partials only
|
|
// - E_OUTOFMEMORY - the method failed to allocate memory for the resulting string
|
|
// - E_ABORT - the resulting string length would exceed the max_size and thus, the processing was aborted
|
|
// - E_UNEXPECTED - an unexpected error occurred
|
|
template<class T = charT>
|
|
[[nodiscard]] typename std::enable_if<std::is_same<T, char>::value, HRESULT>::type
|
|
operator()(const std::basic_string_view<T> in, std::basic_string_view<T>& out) noexcept
|
|
{
|
|
try
|
|
{
|
|
size_t capacity{};
|
|
RETURN_HR_IF(E_ABORT, !base::CheckAdd(in.length(), _partialsLen).AssignIfValid(&capacity));
|
|
|
|
_buffer.clear();
|
|
|
|
// If we were previously called with a huge buffer we have an equally large _buffer.
|
|
// We shouldn't just keep this huge buffer around, if no one needs it anymore.
|
|
if (_buffer.capacity() > 16 * 1024 && (_buffer.capacity() >> 1) > capacity)
|
|
{
|
|
_buffer.shrink_to_fit();
|
|
}
|
|
|
|
_buffer.reserve(capacity);
|
|
|
|
// copy UTF-8 code units that were remaining from the previous call (if any)
|
|
if (_partialsLen != 0u)
|
|
{
|
|
_buffer.assign(_utfPartials.cbegin(), _utfPartials.cbegin() + _partialsLen);
|
|
_partialsLen = 0u;
|
|
}
|
|
|
|
if (in.empty())
|
|
{
|
|
out = _buffer;
|
|
if (_buffer.empty())
|
|
{
|
|
return S_OK;
|
|
}
|
|
|
|
return S_FALSE; // the partial is populated
|
|
}
|
|
|
|
_buffer.append(in);
|
|
size_t remainingLength{ _buffer.length() };
|
|
|
|
auto backIter = _buffer.end();
|
|
// If the last byte in the string was a byte belonging to a UTF-8 multi-byte character
|
|
if ((*(backIter - 1) & _Utf8BitMasks::MaskAsciiByte) > _Utf8BitMasks::IsAsciiByte)
|
|
{
|
|
// Check only up to 3 last bytes, if no Lead Byte was found then the byte before must be the Lead Byte and no partials are in the string
|
|
const size_t stopLen{ std::min(_buffer.length(), gsl::narrow_cast<size_t>(3u)) };
|
|
for (size_t sequenceLen{ 1u }; sequenceLen <= stopLen; ++sequenceLen)
|
|
{
|
|
--backIter;
|
|
// If Lead Byte found
|
|
if ((*backIter & _Utf8BitMasks::MaskContinuationByte) > _Utf8BitMasks::IsContinuationByte)
|
|
{
|
|
// If the Lead Byte indicates that the last bytes in the string is a partial UTF-8 code point then cache them:
|
|
// Use the bitmask at index `sequenceLen`. Compare the result with the operand having the same index. If they
|
|
// are not equal then the sequence has to be cached because it is a partial code point. Otherwise the
|
|
// sequence is a complete UTF-8 code point and the whole string is ready for the conversion into a UTF-16 string.
|
|
if ((*backIter & _cmpMasks.at(sequenceLen)) != _cmpOperands.at(sequenceLen))
|
|
{
|
|
std::move(backIter, _buffer.end(), _utfPartials.begin());
|
|
remainingLength -= sequenceLen;
|
|
_partialsLen = sequenceLen;
|
|
}
|
|
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
// populate the part of the string that contains complete code points only
|
|
out = { _buffer.data(), remainingLength };
|
|
|
|
return S_OK;
|
|
}
|
|
catch (std::length_error&)
|
|
{
|
|
return E_ABORT;
|
|
}
|
|
catch (std::bad_alloc&)
|
|
{
|
|
return E_OUTOFMEMORY;
|
|
}
|
|
catch (...)
|
|
{
|
|
return E_UNEXPECTED;
|
|
}
|
|
}
|
|
|
|
// Method Description:
|
|
// - Takes a UTF-16 string and populates it with *complete* UTF-16 codepoints.
|
|
// If it receives an incomplete codepoint, it will cache it until it can be completed.
|
|
// Arguments:
|
|
// - in - UTF-16 string_view potentially containing partial code points
|
|
// - out - on return, populated with complete codepoints at the string end
|
|
// Return Value:
|
|
// - S_OK - the resulting string doesn't end with a partial
|
|
// - S_FALSE - the resulting string contains the previously cached partials only
|
|
// - E_OUTOFMEMORY - the method failed to allocate memory for the resulting string
|
|
// - E_ABORT - the resulting string length would exceed the max_size and thus, the processing was aborted
|
|
// - E_UNEXPECTED - an unexpected error occurred
|
|
template<class T = charT>
|
|
[[nodiscard]] typename std::enable_if<std::is_same<T, wchar_t>::value, HRESULT>::type
|
|
operator()(const std::basic_string_view<T> in, std::basic_string_view<T>& out) noexcept
|
|
{
|
|
try
|
|
{
|
|
size_t remainingLength{ in.length() };
|
|
size_t capacity{};
|
|
|
|
RETURN_HR_IF(E_ABORT, !base::CheckAdd(remainingLength, _partialsLen).AssignIfValid(&capacity));
|
|
|
|
_buffer.clear();
|
|
_buffer.reserve(capacity);
|
|
|
|
// copy UTF-8 code units that were remaining from the previous call (if any)
|
|
if (_partialsLen != 0u)
|
|
{
|
|
_buffer.push_back(_utfPartials.front());
|
|
_partialsLen = 0u;
|
|
}
|
|
|
|
if (in.empty())
|
|
{
|
|
out = _buffer;
|
|
if (_buffer.empty())
|
|
{
|
|
return S_OK;
|
|
}
|
|
|
|
return S_FALSE; // the high surrogate is populated
|
|
}
|
|
|
|
// cache the last value in the string if it is in the range of high surrogates
|
|
if (in.back() >= 0xD800u && in.back() <= 0xDBFFu)
|
|
{
|
|
_utfPartials.front() = in.back();
|
|
--remainingLength;
|
|
_partialsLen = 1u;
|
|
}
|
|
else
|
|
{
|
|
_partialsLen = 0u;
|
|
}
|
|
|
|
// populate the part of the string that contains complete code points only
|
|
_buffer.append(in, 0u, remainingLength);
|
|
out = _buffer;
|
|
|
|
return S_OK;
|
|
}
|
|
catch (std::length_error&)
|
|
{
|
|
return E_ABORT;
|
|
}
|
|
catch (std::bad_alloc&)
|
|
{
|
|
return E_OUTOFMEMORY;
|
|
}
|
|
catch (...)
|
|
{
|
|
return E_UNEXPECTED;
|
|
}
|
|
}
|
|
|
|
// Method Description:
|
|
// - Discard cached partials.
|
|
// Arguments:
|
|
// - none
|
|
// Return Value:
|
|
// - void
|
|
void reset() noexcept
|
|
{
|
|
_partialsLen = 0u;
|
|
}
|
|
|
|
private:
|
|
enum _Utf8BitMasks : BYTE
|
|
{
|
|
IsAsciiByte = 0b0'0000000, // Any byte representing an ASCII character has the MSB set to 0
|
|
MaskAsciiByte = 0b1'0000000, // Bit mask to be used in a bitwise AND operation to find out whether or not a byte match the IsAsciiByte pattern
|
|
IsContinuationByte = 0b10'000000, // Continuation bytes of any UTF-8 non-ASCII character have the MSB set to 1 and the adjacent bit set to 0
|
|
MaskContinuationByte = 0b11'000000, // Bit mask to be used in a bitwise AND operation to find out whether or not a byte match the IsContinuationByte pattern
|
|
IsLeadByteTwoByteSequence = 0b110'00000, // A lead byte that indicates a UTF-8 non-ASCII character consisting of two bytes has the two highest bits set to 1 and the adjacent bit set to 0
|
|
MaskLeadByteTwoByteSequence = 0b111'00000, // Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte match the IsLeadByteTwoByteSequence pattern
|
|
IsLeadByteThreeByteSequence = 0b1110'0000, // A lead byte that indicates a UTF-8 non-ASCII character consisting of three bytes has the three highest bits set to 1 and the adjacent bit set to 0
|
|
MaskLeadByteThreeByteSequence = 0b1111'0000, // Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte match the IsLeadByteThreeByteSequence pattern
|
|
IsLeadByteFourByteSequence = 0b11110'000, // A lead byte that indicates a UTF-8 non-ASCII character consisting of four bytes has the four highest bits set to 1 and the adjacent bit set to 0
|
|
MaskLeadByteFourByteSequence = 0b11111'000 // Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte match the IsLeadByteFourByteSequence pattern
|
|
};
|
|
|
|
// array of bitmasks
|
|
constexpr static std::array<BYTE, 4> _cmpMasks{
|
|
0, // unused
|
|
_Utf8BitMasks::MaskContinuationByte,
|
|
_Utf8BitMasks::MaskLeadByteTwoByteSequence,
|
|
_Utf8BitMasks::MaskLeadByteThreeByteSequence,
|
|
};
|
|
|
|
// array of values for the comparisons
|
|
constexpr static std::array<BYTE, 4> _cmpOperands{
|
|
0, // unused
|
|
_Utf8BitMasks::IsAsciiByte, // intentionally conflicts with MaskContinuationByte
|
|
_Utf8BitMasks::IsLeadByteTwoByteSequence,
|
|
_Utf8BitMasks::IsLeadByteThreeByteSequence,
|
|
};
|
|
|
|
std::basic_string<charT> _buffer; // buffer to which the populated string_view refers
|
|
std::array<charT, 4> _utfPartials; // buffer for code units of a partial code point that have to be cached
|
|
size_t _partialsLen{}; // number of cached code units
|
|
};
|
|
|
|
// make clear what incoming string type the state is for
|
|
typedef u8u16state<char> u8state;
|
|
typedef u8u16state<wchar_t> u16state;
|
|
|
|
// Routine Description:
|
|
// - Takes a UTF-8 string and performs the conversion to UTF-16. NOTE: The function relies on getting complete UTF-8 characters at the string boundaries.
|
|
// Arguments:
|
|
// - in - UTF-8 string to be converted
|
|
// - out - reference to the resulting UTF-16 string
|
|
// Return Value:
|
|
// - S_OK - the conversion succeeded
|
|
// - E_OUTOFMEMORY - the function failed to allocate memory for the resulting string
|
|
// - E_ABORT - the resulting string length would exceed the upper boundary of an int and thus, the conversion was aborted before the conversion has been completed
|
|
// - E_UNEXPECTED - an unexpected error occurred
|
|
template<class inT, class outT>
|
|
[[nodiscard]] typename std::enable_if<std::is_same<typename inT::value_type, char>::value && std::is_same<typename outT::value_type, wchar_t>::value, HRESULT>::type
|
|
u8u16(const inT in, outT& out) noexcept
|
|
{
|
|
try
|
|
{
|
|
out.clear();
|
|
|
|
if (in.empty())
|
|
{
|
|
return S_OK;
|
|
}
|
|
|
|
int lengthRequired{};
|
|
// The worst ratio of UTF-8 code units to UTF-16 code units is 1 to 1 if UTF-8 consists of ASCII only.
|
|
RETURN_HR_IF(E_ABORT, !base::MakeCheckedNum(in.length()).AssignIfValid(&lengthRequired));
|
|
out.resize(in.length()); // avoid to call MultiByteToWideChar twice only to get the required size
|
|
const int lengthOut = MultiByteToWideChar(gsl::narrow_cast<UINT>(CP_UTF8), 0ul, in.data(), lengthRequired, out.data(), lengthRequired);
|
|
out.resize(gsl::narrow_cast<size_t>(lengthOut));
|
|
|
|
return lengthOut == 0 ? E_UNEXPECTED : S_OK;
|
|
}
|
|
catch (std::length_error&)
|
|
{
|
|
return E_ABORT;
|
|
}
|
|
catch (std::bad_alloc&)
|
|
{
|
|
return E_OUTOFMEMORY;
|
|
}
|
|
catch (...)
|
|
{
|
|
return E_UNEXPECTED;
|
|
}
|
|
}
|
|
|
|
// Routine Description:
|
|
// - Takes a UTF-8 string, complements and/or caches partials, and performs the conversion to UTF-16.
|
|
// Arguments:
|
|
// - in - UTF-8 string to be converted
|
|
// - out - reference to the resulting UTF-16 string
|
|
// - state - reference to a til::u8state class holding the status of the current partials handling
|
|
// Return Value:
|
|
// - S_OK - the conversion succeeded
|
|
// - E_OUTOFMEMORY - the function failed to allocate memory for the resulting string
|
|
// - E_ABORT - the resulting string length would exceed the upper boundary of an int and thus, the conversion was aborted before the conversion has been completed
|
|
// - E_UNEXPECTED - an unexpected error occurred
|
|
template<class inT, class outT>
|
|
[[nodiscard]] typename std::enable_if<std::is_same<typename inT::value_type, char>::value && std::is_same<typename outT::value_type, wchar_t>::value, HRESULT>::type
|
|
u8u16(const inT in, outT& out, u8state& state) noexcept
|
|
{
|
|
std::string_view sv{};
|
|
RETURN_IF_FAILED(state(std::string_view{ in }, sv));
|
|
return til::u8u16(sv, out);
|
|
}
|
|
|
|
// Routine Description:
|
|
// - Takes a UTF-16 string and performs the conversion to UTF-8. NOTE: The function relies on getting complete UTF-16 characters at the string boundaries.
|
|
// Arguments:
|
|
// - in - UTF-16 string to be converted
|
|
// - out - reference to the resulting UTF-8 string
|
|
// Return Value:
|
|
// - S_OK - the conversion succeeded
|
|
// - E_OUTOFMEMORY - the function failed to allocate memory for the resulting string
|
|
// - E_ABORT - the resulting string length would exceed the upper boundary of an int and thus, the conversion was aborted before the conversion has been completed
|
|
// - E_UNEXPECTED - an unexpected error occurred
|
|
template<class inT, class outT>
|
|
[[nodiscard]] typename std::enable_if<std::is_same<typename inT::value_type, wchar_t>::value && std::is_same<typename outT::value_type, char>::value, HRESULT>::type
|
|
u16u8(const inT in, outT& out) noexcept
|
|
{
|
|
try
|
|
{
|
|
out.clear();
|
|
|
|
if (in.empty())
|
|
{
|
|
return S_OK;
|
|
}
|
|
|
|
int lengthIn{};
|
|
int lengthRequired{};
|
|
// Code Point U+0000..U+FFFF: 1 UTF-16 code unit --> 1..3 UTF-8 code units.
|
|
// Code Points >U+FFFF: 2 UTF-16 code units --> 4 UTF-8 code units.
|
|
// Thus, the worst ratio of UTF-16 code units to UTF-8 code units is 1 to 3.
|
|
RETURN_HR_IF(E_ABORT, !base::MakeCheckedNum(in.length()).AssignIfValid(&lengthIn) || !base::CheckMul(lengthIn, 3).AssignIfValid(&lengthRequired));
|
|
out.resize(gsl::narrow_cast<size_t>(lengthRequired)); // avoid to call WideCharToMultiByte twice only to get the required size
|
|
const int lengthOut = WideCharToMultiByte(gsl::narrow_cast<UINT>(CP_UTF8), 0ul, in.data(), lengthIn, out.data(), lengthRequired, nullptr, nullptr);
|
|
out.resize(gsl::narrow_cast<size_t>(lengthOut));
|
|
|
|
return lengthOut == 0 ? E_UNEXPECTED : S_OK;
|
|
}
|
|
catch (std::length_error&)
|
|
{
|
|
return E_ABORT;
|
|
}
|
|
catch (std::bad_alloc&)
|
|
{
|
|
return E_OUTOFMEMORY;
|
|
}
|
|
catch (...)
|
|
{
|
|
return E_UNEXPECTED;
|
|
}
|
|
}
|
|
|
|
// Routine Description:
|
|
// - Takes a UTF-16 string, complements and/or caches partials, and performs the conversion to UTF-8.
|
|
// Arguments:
|
|
// - in - UTF-16 string to be converted
|
|
// - out - reference to the resulting UTF-8 string
|
|
// - state - reference to a til::u16state class holding the status of the current partials handling
|
|
// Return Value:
|
|
// - S_OK - the conversion succeeded without any change of the represented code points
|
|
// - E_OUTOFMEMORY - the function failed to allocate memory for the resulting string
|
|
// - E_ABORT - the resulting string length would exceed the upper boundary of an int and thus, the conversion was aborted before the conversion has been completed
|
|
// - E_UNEXPECTED - an unexpected error occurred
|
|
template<class inT, class outT>
|
|
[[nodiscard]] typename std::enable_if<std::is_same<typename inT::value_type, wchar_t>::value && std::is_same<typename outT::value_type, char>::value, HRESULT>::type
|
|
u16u8(const inT in, outT& out, u16state& state) noexcept
|
|
{
|
|
std::wstring_view sv{};
|
|
RETURN_IF_FAILED(state(std::wstring_view{ in }, sv));
|
|
return u16u8(sv, out);
|
|
}
|
|
|
|
// Routine Description:
|
|
// - Takes a UTF-8 string and performs the conversion to UTF-16. NOTE: The function relies on getting complete UTF-8 characters at the string boundaries.
|
|
// Arguments:
|
|
// - in - UTF-8 string to be converted
|
|
// Return Value:
|
|
// - the resulting UTF-16 string
|
|
// - NOTE: Throws HRESULT errors that the non-throwing sibling returns
|
|
template<class inT>
|
|
typename std::enable_if<std::is_same<typename inT::value_type, char>::value, std::wstring>::type
|
|
u8u16(const inT in)
|
|
{
|
|
std::wstring out{};
|
|
THROW_IF_FAILED(u8u16(std::string_view{ in }, out));
|
|
return out;
|
|
}
|
|
|
|
// Routine Description:
|
|
// Takes a UTF-8 string, complements and/or caches partials, and performs the conversion to UTF-16.
|
|
// Arguments:
|
|
// - in - UTF-8 string to be converted
|
|
// - state - reference to a til::u8state class holding the status of the current partials handling
|
|
// Return Value:
|
|
// - the resulting UTF-16 string
|
|
// - NOTE: Throws HRESULT errors that the non-throwing sibling returns
|
|
template<class inT>
|
|
typename std::enable_if<std::is_same<typename inT::value_type, char>::value, std::wstring>::type
|
|
u8u16(const inT in, u8state& state)
|
|
{
|
|
std::wstring out{};
|
|
THROW_IF_FAILED(u8u16(std::string_view{ in }, out, state));
|
|
return out;
|
|
}
|
|
|
|
// Routine Description:
|
|
// - Takes a UTF-16 string and performs the conversion to UTF-8. NOTE: The function relies on getting complete UTF-16 characters at the string boundaries.
|
|
// Arguments:
|
|
// - in - UTF-16 string to be converted
|
|
// Return Value:
|
|
// - the resulting UTF-8 string
|
|
// - NOTE: Throws HRESULT errors that the non-throwing sibling returns
|
|
template<class inT>
|
|
typename std::enable_if<std::is_same<typename inT::value_type, wchar_t>::value, std::string>::type
|
|
u16u8(const inT in)
|
|
{
|
|
std::string out{};
|
|
THROW_IF_FAILED(u16u8(std::wstring_view{ in }, out));
|
|
return out;
|
|
}
|
|
|
|
// Routine Description:
|
|
// Takes a UTF-16 string, complements and/or caches partials, and performs the conversion to UTF-8.
|
|
// Arguments:
|
|
// - in - UTF-16 string to be converted
|
|
// - state - reference to a til::u16state class holding the status of the current partials handling
|
|
// Return Value:
|
|
// - the resulting UTF-8 string
|
|
// - NOTE: Throws HRESULT errors that the non-throwing sibling returns
|
|
template<class inT>
|
|
typename std::enable_if<std::is_same<typename inT::value_type, wchar_t>::value, std::string>::type
|
|
u16u8(const inT in, u16state& state)
|
|
{
|
|
std::string out{};
|
|
THROW_IF_FAILED(u16u8(std::wstring_view{ in }, out, state));
|
|
return out;
|
|
}
|
|
}
|