terminal/src/inc/til/u8u16convert.h
Leonard Hecker 8779249b12
Release unneeded memory more eagerly from conhost (#10738)
The `_CONSOLE_API_MSG` buffer is resized to cover an entire message.
Later on any UTF-8 data is cached in a separate temporary
buffer inside `til::u8state` to prevent lone surrogate pairs.

Both cases are problematic as neither buffer is freed after the read
has finished. Passing a 100MB buffer to conhost once will thus cause it
to continue using ~220MB of physical memory until the conhost process exits.

This change releases unneeded memory as soon as the requested buffer
size has halved. In practice this means that once a command has returned
all buffers will shrink, as the shell commonly sends very small messages.

## PR Checklist
* [x] Closes #10731
* [x] I work here
* [x] Tests added/passed

## Validation Steps Performed

* Buffers aren't reallocated during printing ✔️
* Buffers shrink after printing finished ✔️
2021-07-21 05:59:57 +00:00

464 lines
21 KiB
C++

/*++
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
Module Name:
- u8u16convert.h
Abstract:
- Defines classes which hold the status of the current partials handling.
- Defines functions for converting between UTF-8 and UTF-16 strings.
Tests were run to investigate whether custom conversion algorithms could
overcome the disadvantages of the platform calls. The results are documented
in PR #4093 and the test algorithms are available in src\tools\U8U16Test.
Based on those results, the decision was made to keep using the platform
functions MultiByteToWideChar and WideCharToMultiByte.
Author(s):
- Steffen Illhardt (german-one) 2020
--*/
#pragma once
namespace til // Terminal Implementation Library. Also: "Today I Learned"
{
// Holds the state that is required to handle code points which are split across
// consecutive chunks of a UTF-8 (charT = char) or UTF-16 (charT = wchar_t) stream.
// Calling the object with a chunk yields a view of complete code points only; an
// incomplete trailing code point is cached and prepended to the next chunk.
template<class charT>
class u8u16state final
{
public:
    u8u16state() noexcept :
        _buffer{},
        _utfPartials{}
    {
    }

    // Method Description:
    // - Takes a UTF-8 string and populates `out` with *complete* UTF-8 code points only.
    //   Code units that were cached by the previous call are prepended to `in`.
    //   If the resulting string ends with an incomplete code point, those trailing
    //   code units are cached until a later call completes them.
    // Arguments:
    // - in - UTF-8 string_view potentially containing partial code points
    // - out - on return, a view of the complete code points. It refers to the internal
    //         buffer of this object and is invalidated by the next call.
    // Return Value:
    // - S_OK - the resulting string doesn't end with a partial
    // - S_FALSE - `in` was empty and the resulting string contains the previously cached partials only
    // - E_OUTOFMEMORY - the method failed to allocate memory for the resulting string
    // - E_ABORT - the resulting string length would exceed the max_size and thus, the processing was aborted
    // - E_UNEXPECTED - an unexpected error occurred
    template<class T = charT>
    [[nodiscard]] typename std::enable_if<std::is_same<T, char>::value, HRESULT>::type
    operator()(const std::basic_string_view<T> in, std::basic_string_view<T>& out) noexcept
    {
        try
        {
            size_t capacity{};
            RETURN_HR_IF(E_ABORT, !base::CheckAdd(in.length(), _partialsLen).AssignIfValid(&capacity));

            _prepareBuffer(capacity);

            // copy UTF-8 code units that were remaining from the previous call (if any)
            if (_partialsLen != 0u)
            {
                _buffer.assign(_utfPartials.cbegin(), _utfPartials.cbegin() + _partialsLen);
                _partialsLen = 0u;
            }

            if (in.empty())
            {
                // Nothing new arrived: hand out whatever was cached (possibly nothing).
                out = _buffer;
                return _buffer.empty() ? S_OK : S_FALSE;
            }

            _buffer.append(in);
            size_t remainingLength{ _buffer.length() };

            auto backIter = _buffer.end();
            // If the last byte in the string was a byte belonging to a UTF-8 multi-byte character
            if ((*(backIter - 1) & _Utf8BitMasks::MaskAsciiByte) > _Utf8BitMasks::IsAsciiByte)
            {
                // Check only up to 3 last bytes, if no Lead Byte was found then the byte before must be the Lead Byte and no partials are in the string
                const size_t stopLen{ std::min(_buffer.length(), gsl::narrow_cast<size_t>(3u)) };
                for (size_t sequenceLen{ 1u }; sequenceLen <= stopLen; ++sequenceLen)
                {
                    --backIter;
                    // If Lead Byte found
                    if ((*backIter & _Utf8BitMasks::MaskContinuationByte) > _Utf8BitMasks::IsContinuationByte)
                    {
                        // If the Lead Byte indicates that the last bytes in the string is a partial UTF-8 code point then cache them:
                        //  Use the bitmask at index `sequenceLen`. Compare the result with the operand having the same index. If they
                        //  are not equal then the sequence has to be cached because it is a partial code point. Otherwise the
                        //  sequence is a complete UTF-8 code point and the whole string is ready for the conversion into a UTF-16 string.
                        if ((*backIter & _cmpMasks.at(sequenceLen)) != _cmpOperands.at(sequenceLen))
                        {
                            // At most 3 code units are copied, which always fits into _utfPartials.
                            std::copy(backIter, _buffer.end(), _utfPartials.begin());
                            remainingLength -= sequenceLen;
                            _partialsLen = sequenceLen;
                        }

                        break;
                    }
                }
            }

            // populate the part of the string that contains complete code points only
            out = { _buffer.data(), remainingLength };

            return S_OK;
        }
        catch (const std::length_error&)
        {
            return E_ABORT;
        }
        catch (const std::bad_alloc&)
        {
            return E_OUTOFMEMORY;
        }
        catch (...)
        {
            return E_UNEXPECTED;
        }
    }

    // Method Description:
    // - Takes a UTF-16 string and populates `out` with *complete* UTF-16 code points only.
    //   A high surrogate that was cached by the previous call is prepended to `in`.
    //   If `in` ends with a high surrogate, that code unit is cached until the next call.
    // Arguments:
    // - in - UTF-16 string_view potentially containing partial code points
    // - out - on return, a view of the complete code points. It refers to the internal
    //         buffer of this object and is invalidated by the next call.
    // Return Value:
    // - S_OK - the resulting string doesn't end with a partial
    // - S_FALSE - `in` was empty and the resulting string contains the previously cached high surrogate only
    // - E_OUTOFMEMORY - the method failed to allocate memory for the resulting string
    // - E_ABORT - the resulting string length would exceed the max_size and thus, the processing was aborted
    // - E_UNEXPECTED - an unexpected error occurred
    template<class T = charT>
    [[nodiscard]] typename std::enable_if<std::is_same<T, wchar_t>::value, HRESULT>::type
    operator()(const std::basic_string_view<T> in, std::basic_string_view<T>& out) noexcept
    {
        try
        {
            size_t remainingLength{ in.length() };
            size_t capacity{};
            RETURN_HR_IF(E_ABORT, !base::CheckAdd(remainingLength, _partialsLen).AssignIfValid(&capacity));

            _prepareBuffer(capacity);

            // copy the UTF-16 high surrogate that was remaining from the previous call (if any)
            if (_partialsLen != 0u)
            {
                _buffer.push_back(_utfPartials.front());
                _partialsLen = 0u;
            }

            if (in.empty())
            {
                // Nothing new arrived: hand out the cached high surrogate (if there was one).
                out = _buffer;
                return _buffer.empty() ? S_OK : S_FALSE;
            }

            // cache the last value in the string if it is in the range of high surrogates
            // (_partialsLen is guaranteed to be 0 at this point, so no reset is needed otherwise)
            if (in.back() >= 0xD800u && in.back() <= 0xDBFFu)
            {
                _utfPartials.front() = in.back();
                --remainingLength;
                _partialsLen = 1u;
            }

            // populate the part of the string that contains complete code points only
            _buffer.append(in, 0u, remainingLength);
            out = _buffer;

            return S_OK;
        }
        catch (const std::length_error&)
        {
            return E_ABORT;
        }
        catch (const std::bad_alloc&)
        {
            return E_OUTOFMEMORY;
        }
        catch (...)
        {
            return E_UNEXPECTED;
        }
    }

    // Method Description:
    // - Discard cached partials.
    // Arguments:
    // - none
    // Return Value:
    // - void
    void reset() noexcept
    {
        _partialsLen = 0u;
    }

private:
    enum _Utf8BitMasks : BYTE
    {
        IsAsciiByte = 0b0'0000000, // Any byte representing an ASCII character has the MSB set to 0
        MaskAsciiByte = 0b1'0000000, // Bit mask to be used in a bitwise AND operation to find out whether or not a byte matches the IsAsciiByte pattern
        IsContinuationByte = 0b10'000000, // Continuation bytes of any UTF-8 non-ASCII character have the MSB set to 1 and the adjacent bit set to 0
        MaskContinuationByte = 0b11'000000, // Bit mask to be used in a bitwise AND operation to find out whether or not a byte matches the IsContinuationByte pattern
        IsLeadByteTwoByteSequence = 0b110'00000, // A lead byte that indicates a UTF-8 non-ASCII character consisting of two bytes has the two highest bits set to 1 and the adjacent bit set to 0
        MaskLeadByteTwoByteSequence = 0b111'00000, // Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte matches the IsLeadByteTwoByteSequence pattern
        IsLeadByteThreeByteSequence = 0b1110'0000, // A lead byte that indicates a UTF-8 non-ASCII character consisting of three bytes has the three highest bits set to 1 and the adjacent bit set to 0
        MaskLeadByteThreeByteSequence = 0b1111'0000, // Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte matches the IsLeadByteThreeByteSequence pattern
        IsLeadByteFourByteSequence = 0b11110'000, // A lead byte that indicates a UTF-8 non-ASCII character consisting of four bytes has the four highest bits set to 1 and the adjacent bit set to 0
        MaskLeadByteFourByteSequence = 0b11111'000 // Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte matches the IsLeadByteFourByteSequence pattern
    };

    // array of bitmasks, indexed by the number of trailing code units inspected so far
    constexpr static std::array<BYTE, 4> _cmpMasks{
        0, // unused
        _Utf8BitMasks::MaskContinuationByte,
        _Utf8BitMasks::MaskLeadByteTwoByteSequence,
        _Utf8BitMasks::MaskLeadByteThreeByteSequence,
    };

    // array of values for the comparisons, indexed like _cmpMasks
    constexpr static std::array<BYTE, 4> _cmpOperands{
        0, // unused
        _Utf8BitMasks::IsAsciiByte, // intentionally conflicts with MaskContinuationByte
        _Utf8BitMasks::IsLeadByteTwoByteSequence,
        _Utf8BitMasks::IsLeadByteThreeByteSequence,
    };

    // Buffers with a capacity of up to this many code units are never shrunk.
    constexpr static size_t _shrinkThreshold{ 16 * 1024 };

    // Method Description:
    // - Empties _buffer and makes sure it can hold `capacity` code units.
    //   If we were previously called with a huge string we have an equally large _buffer.
    //   We shouldn't just keep this huge buffer around if no one needs it anymore, so the
    //   memory is released as soon as less than half of the current capacity is requested.
    // Arguments:
    // - capacity - the number of code units the buffer has to be able to hold
    // Return Value:
    // - void (may throw std::length_error or std::bad_alloc)
    void _prepareBuffer(const size_t capacity)
    {
        _buffer.clear();

        if (_buffer.capacity() > _shrinkThreshold && (_buffer.capacity() >> 1) > capacity)
        {
            _buffer.shrink_to_fit();
        }

        _buffer.reserve(capacity);
    }

    std::basic_string<charT> _buffer; // buffer to which the populated string_view refers
    std::array<charT, 4> _utfPartials; // buffer for code units of a partial code point that have to be cached
    size_t _partialsLen{}; // number of cached code units
};
// Aliases that name the state type after the encoding of the incoming string.
using u8state = u8u16state<char>; // state for UTF-8 input
using u16state = u8u16state<wchar_t>; // state for UTF-16 input
// Routine Description:
// - Converts a UTF-8 string to UTF-16 by means of MultiByteToWideChar.
//   NOTE: The function relies on getting complete UTF-8 characters at the string boundaries.
// Arguments:
// - in - UTF-8 string to be converted
// - out - reference to the resulting UTF-16 string; it is cleared first and is empty if the conversion fails
// Return Value:
// - S_OK - the conversion succeeded (an empty `in` yields an empty `out`)
// - E_OUTOFMEMORY - the function failed to allocate memory for the resulting string
// - E_ABORT - the resulting string length would exceed the upper boundary of an int and thus, the conversion was aborted before the conversion has been completed
// - E_UNEXPECTED - the platform function reported 0 converted code units or an unexpected error occurred
template<class inT, class outT>
[[nodiscard]] typename std::enable_if<std::is_same<typename inT::value_type, char>::value && std::is_same<typename outT::value_type, wchar_t>::value, HRESULT>::type
u8u16(const inT in, outT& out) noexcept
{
    try
    {
        out.clear();

        if (in.empty())
        {
            return S_OK;
        }

        int lengthRequired{};
        // UTF-8 never needs fewer code units than UTF-16: the worst ratio is 1 to 1 (ASCII only).
        // Hence the input length is a sufficient size for the output buffer.
        RETURN_HR_IF(E_ABORT, !base::MakeCheckedNum(in.length()).AssignIfValid(&lengthRequired));

        // Size the buffer for the worst case up front instead of calling
        // MultiByteToWideChar twice only to learn the required size.
        out.resize(in.length());
        const int lengthOut = MultiByteToWideChar(gsl::narrow_cast<UINT>(CP_UTF8), 0ul, in.data(), lengthRequired, out.data(), lengthRequired);

        // Trim to what has actually been written (nothing in case of a failure).
        out.resize(gsl::narrow_cast<size_t>(lengthOut));

        if (lengthOut == 0)
        {
            return E_UNEXPECTED;
        }

        return S_OK;
    }
    catch (const std::length_error&)
    {
        return E_ABORT;
    }
    catch (const std::bad_alloc&)
    {
        return E_OUTOFMEMORY;
    }
    catch (...)
    {
        return E_UNEXPECTED;
    }
}
// Routine Description:
// - Stateful UTF-8 to UTF-16 conversion: `state` first prepends the code units cached by the
//   previous call and holds back an incomplete trailing code point, then the remaining
//   complete code points are converted.
// Arguments:
// - in - UTF-8 string to be converted
// - out - reference to the resulting UTF-16 string
// - state - reference to a til::u8state class holding the status of the current partials handling
// Return Value:
// - S_OK - the conversion succeeded
// - E_OUTOFMEMORY - the function failed to allocate memory for the resulting string
// - E_ABORT - the resulting string length would exceed the upper boundary of an int and thus, the conversion was aborted before the conversion has been completed
// - E_UNEXPECTED - an unexpected error occurred
template<class inT, class outT>
[[nodiscard]] typename std::enable_if<std::is_same<typename inT::value_type, char>::value && std::is_same<typename outT::value_type, wchar_t>::value, HRESULT>::type
u8u16(const inT in, outT& out, u8state& state) noexcept
{
    // Receives the view of complete code points provided by `state`.
    std::string_view sv;
    RETURN_IF_FAILED(state(std::string_view{ in }, sv));

    return til::u8u16(sv, out);
}
// Routine Description:
// - Converts a UTF-16 string to UTF-8 by means of WideCharToMultiByte.
//   NOTE: The function relies on getting complete UTF-16 characters at the string boundaries.
// Arguments:
// - in - UTF-16 string to be converted
// - out - reference to the resulting UTF-8 string; it is cleared first and is empty if the conversion fails
// Return Value:
// - S_OK - the conversion succeeded (an empty `in` yields an empty `out`)
// - E_OUTOFMEMORY - the function failed to allocate memory for the resulting string
// - E_ABORT - the resulting string length would exceed the upper boundary of an int and thus, the conversion was aborted before the conversion has been completed
// - E_UNEXPECTED - the platform function reported 0 converted code units or an unexpected error occurred
template<class inT, class outT>
[[nodiscard]] typename std::enable_if<std::is_same<typename inT::value_type, wchar_t>::value && std::is_same<typename outT::value_type, char>::value, HRESULT>::type
u16u8(const inT in, outT& out) noexcept
{
    try
    {
        out.clear();

        if (in.empty())
        {
            return S_OK;
        }

        int lengthIn{};
        int lengthRequired{};
        // Worst case estimation of the output size:
        // - U+0000..U+FFFF: 1 UTF-16 code unit --> 1..3 UTF-8 code units
        // - above U+FFFF:   2 UTF-16 code units --> 4 UTF-8 code units
        // So a UTF-16 code unit never turns into more than 3 UTF-8 code units.
        RETURN_HR_IF(E_ABORT, !base::MakeCheckedNum(in.length()).AssignIfValid(&lengthIn) || !base::CheckMul(lengthIn, 3).AssignIfValid(&lengthRequired));

        // Size the buffer for the worst case up front instead of calling
        // WideCharToMultiByte twice only to learn the required size.
        out.resize(gsl::narrow_cast<size_t>(lengthRequired));
        const int lengthOut = WideCharToMultiByte(gsl::narrow_cast<UINT>(CP_UTF8), 0ul, in.data(), lengthIn, out.data(), lengthRequired, nullptr, nullptr);

        // Trim to what has actually been written (nothing in case of a failure).
        out.resize(gsl::narrow_cast<size_t>(lengthOut));

        if (lengthOut == 0)
        {
            return E_UNEXPECTED;
        }

        return S_OK;
    }
    catch (const std::length_error&)
    {
        return E_ABORT;
    }
    catch (const std::bad_alloc&)
    {
        return E_OUTOFMEMORY;
    }
    catch (...)
    {
        return E_UNEXPECTED;
    }
}
// Routine Description:
// - Stateful UTF-16 to UTF-8 conversion: `state` first prepends the high surrogate cached by the
//   previous call and holds back a trailing high surrogate, then the remaining complete
//   code points are converted.
// Arguments:
// - in - UTF-16 string to be converted
// - out - reference to the resulting UTF-8 string
// - state - reference to a til::u16state class holding the status of the current partials handling
// Return Value:
// - S_OK - the conversion succeeded without any change of the represented code points
// - E_OUTOFMEMORY - the function failed to allocate memory for the resulting string
// - E_ABORT - the resulting string length would exceed the upper boundary of an int and thus, the conversion was aborted before the conversion has been completed
// - E_UNEXPECTED - an unexpected error occurred
template<class inT, class outT>
[[nodiscard]] typename std::enable_if<std::is_same<typename inT::value_type, wchar_t>::value && std::is_same<typename outT::value_type, char>::value, HRESULT>::type
u16u8(const inT in, outT& out, u16state& state) noexcept
{
    // Receives the view of complete code points provided by `state`.
    std::wstring_view sv;
    RETURN_IF_FAILED(state(std::wstring_view{ in }, sv));

    return u16u8(sv, out);
}
// Routine Description:
// - Throwing convenience overload of the UTF-8 to UTF-16 conversion.
//   NOTE: The function relies on getting complete UTF-8 characters at the string boundaries.
// Arguments:
// - in - UTF-8 string to be converted
// Return Value:
// - the resulting UTF-16 string
// - NOTE: A failure HRESULT returned by the non-throwing sibling is thrown via THROW_IF_FAILED
template<class inT>
typename std::enable_if<std::is_same<typename inT::value_type, char>::value, std::wstring>::type
u8u16(const inT in)
{
    std::wstring out;

    THROW_IF_FAILED(u8u16(std::string_view{ in }, out));

    return out;
}
// Routine Description:
// - Throwing convenience overload of the stateful UTF-8 to UTF-16 conversion, which
//   complements and/or caches partials before converting.
// Arguments:
// - in - UTF-8 string to be converted
// - state - reference to a til::u8state class holding the status of the current partials handling
// Return Value:
// - the resulting UTF-16 string
// - NOTE: A failure HRESULT returned by the non-throwing sibling is thrown via THROW_IF_FAILED
template<class inT>
typename std::enable_if<std::is_same<typename inT::value_type, char>::value, std::wstring>::type
u8u16(const inT in, u8state& state)
{
    std::wstring out;

    THROW_IF_FAILED(u8u16(std::string_view{ in }, out, state));

    return out;
}
// Routine Description:
// - Throwing convenience overload of the UTF-16 to UTF-8 conversion.
//   NOTE: The function relies on getting complete UTF-16 characters at the string boundaries.
// Arguments:
// - in - UTF-16 string to be converted
// Return Value:
// - the resulting UTF-8 string
// - NOTE: A failure HRESULT returned by the non-throwing sibling is thrown via THROW_IF_FAILED
template<class inT>
typename std::enable_if<std::is_same<typename inT::value_type, wchar_t>::value, std::string>::type
u16u8(const inT in)
{
    std::string out;

    THROW_IF_FAILED(u16u8(std::wstring_view{ in }, out));

    return out;
}
// Routine Description:
// - Throwing convenience overload of the stateful UTF-16 to UTF-8 conversion, which
//   complements and/or caches partials before converting.
// Arguments:
// - in - UTF-16 string to be converted
// - state - reference to a til::u16state class holding the status of the current partials handling
// Return Value:
// - the resulting UTF-8 string
// - NOTE: A failure HRESULT returned by the non-throwing sibling is thrown via THROW_IF_FAILED
template<class inT>
typename std::enable_if<std::is_same<typename inT::value_type, wchar_t>::value, std::string>::type
u16u8(const inT in, u16state& state)
{
    std::string out;

    THROW_IF_FAILED(u16u8(std::wstring_view{ in }, out, state));

    return out;
}
}