Implement til::u8u16 and til::u16u8 conversion functions (#4093)

This commit also switches ConptyConnection to consume til::u8u16 and removes the UTF8OutPipeReader.

Closes #4092.
This commit is contained in:
Steffen 2020-01-30 01:55:48 +01:00 committed by GitHub
parent 1445380810
commit 32ea419c3d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
24 changed files with 2245 additions and 312 deletions

View File

@ -269,6 +269,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Tests", "Tests", "{BDB237B6
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "til.unit.tests", "src\til\ut_til\til.unit.tests.vcxproj", "{767268EE-174A-46FE-96F0-EEE698A1BBC9}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "U8U16Test", "src\tools\U8U16Test\U8U16Test.vcxproj", "{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
AuditMode|Any CPU = AuditMode|Any CPU
@ -1374,6 +1376,26 @@ Global
{767268EE-174A-46FE-96F0-EEE698A1BBC9}.Release|x64.Build.0 = Release|x64
{767268EE-174A-46FE-96F0-EEE698A1BBC9}.Release|x86.ActiveCfg = Release|Win32
{767268EE-174A-46FE-96F0-EEE698A1BBC9}.Release|x86.Build.0 = Release|Win32
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.AuditMode|Any CPU.ActiveCfg = Release|x64
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.AuditMode|Any CPU.Build.0 = Release|x64
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.AuditMode|ARM64.ActiveCfg = Release|x64
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.AuditMode|ARM64.Build.0 = Release|x64
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.AuditMode|x64.ActiveCfg = Release|x64
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.AuditMode|x64.Build.0 = Release|x64
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.AuditMode|x86.ActiveCfg = Release|Win32
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.AuditMode|x86.Build.0 = Release|Win32
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.Debug|Any CPU.ActiveCfg = Debug|Win32
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.Debug|ARM64.ActiveCfg = Debug|Win32
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.Debug|x64.ActiveCfg = Debug|x64
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.Debug|x64.Build.0 = Debug|x64
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.Debug|x86.ActiveCfg = Debug|Win32
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.Debug|x86.Build.0 = Debug|Win32
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.Release|Any CPU.ActiveCfg = Release|Win32
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.Release|ARM64.ActiveCfg = Release|Win32
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.Release|x64.ActiveCfg = Release|x64
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.Release|x64.Build.0 = Release|x64
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.Release|x86.ActiveCfg = Release|Win32
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.Release|x86.Build.0 = Release|Win32
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@ -1444,6 +1466,7 @@ Global
{A021EDFF-45C8-4DC2-BEF7-36E1B3B8CFE8} = {BDB237B6-1D1D-400F-84CC-40A58FA59C8E}
{BDB237B6-1D1D-400F-84CC-40A58FA59C8E} = {59840756-302F-44DF-AA47-441A9D673202}
{767268EE-174A-46FE-96F0-EEE698A1BBC9} = {89CDCC5C-9F53-4054-97A4-639D99F169CD}
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1} = {A10C4720-DCA4-4640-9749-67F4314F527C}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {3140B1B7-C8EE-43D1-A772-D82A7061A271}

View File

@ -15,7 +15,6 @@
#include "../../types/inc/Utils.hpp"
#include "../../types/inc/Environment.hpp"
#include "../../types/inc/UTF8OutPipeReader.hpp"
#include "LibraryResources.h"
using namespace ::Microsoft::Console;
@ -169,7 +168,10 @@ namespace winrt::Microsoft::Terminal::TerminalConnection::implementation
_commandline{ commandline },
_startingDirectory{ startingDirectory },
_startingTitle{ startingTitle },
_guid{ initialGuid }
_guid{ initialGuid },
_u8State{},
_u16Str{},
_buffer{}
{
if (_guid == guid{})
{
@ -344,14 +346,27 @@ namespace winrt::Microsoft::Terminal::TerminalConnection::implementation
DWORD ConptyConnection::_OutputThread()
{
UTF8OutPipeReader pipeReader{ _outPipe.get() };
std::string_view strView{};
// process the data of the output pipe in a loop
while (true)
{
const HRESULT result = pipeReader.Read(strView);
if (FAILED(result) || result == S_FALSE)
DWORD read{};
const auto readFail{ !ReadFile(_outPipe.get(), _buffer.data(), gsl::narrow_cast<DWORD>(_buffer.size()), &read, nullptr) };
if (readFail) // reading failed (we must check this first, because read will also be 0.)
{
const auto lastError = GetLastError();
if (lastError != ERROR_BROKEN_PIPE && !_isStateAtOrBeyond(ConnectionState::Closing))
{
// EXIT POINT
_indicateExitWithStatus(HRESULT_FROM_WIN32(lastError)); // print a message
_transitionToState(ConnectionState::Failed);
return gsl::narrow_cast<DWORD>(HRESULT_FROM_WIN32(lastError));
}
// else we call til::u8u16 with an empty string_view to convert possible remaining partials to U+FFFD
}
const HRESULT result{ til::u8u16(std::string_view{ _buffer.data(), read }, _u16Str, _u8State) };
if (FAILED(result))
{
if (_isStateAtOrBeyond(ConnectionState::Closing))
{
@ -362,10 +377,10 @@ namespace winrt::Microsoft::Terminal::TerminalConnection::implementation
// EXIT POINT
_indicateExitWithStatus(result); // print a message
_transitionToState(ConnectionState::Failed);
return gsl::narrow_cast<DWORD>(-1);
return gsl::narrow_cast<DWORD>(result);
}
if (strView.empty())
if (_u16Str.empty())
{
return 0;
}
@ -386,11 +401,8 @@ namespace winrt::Microsoft::Terminal::TerminalConnection::implementation
_recievedFirstByte = true;
}
// Convert buffer to hstring
auto hstr{ winrt::to_hstring(strView) };
// Pass the output to our registered event handlers
_TerminalOutputHandlers(hstr);
_TerminalOutputHandlers(_u16Str);
}
return 0;

View File

@ -52,6 +52,10 @@ namespace winrt::Microsoft::Terminal::TerminalConnection::implementation
wil::unique_static_pseudoconsole_handle _hPC;
wil::unique_threadpool_wait _clientExitWait;
til::u8state _u8State;
std::wstring _u16Str;
std::array<char, 4096> _buffer;
DWORD _OutputThread();
};
}

View File

@ -5,6 +5,7 @@
#include "til/at.h"
#include "til/some.h"
#include "til/u8u16convert.h"
namespace til // Terminal Implementation Library. Also: "Today I Learned"
{

458
src/inc/til/u8u16convert.h Normal file
View File

@ -0,0 +1,458 @@
/*++
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
Module Name:
- u8u16convert.h
Abstract:
- Defines classes which hold the status of the current partials handling.
- Defines functions for converting between UTF-8 and UTF-16 strings.
Tests were run to find out whether hand-written algorithms could overcome
the disadvantages of the syscalls. The test results are documented
in PR #4093 and the test algorithms are available in src\tools\U8U16Test.
Based on the results, the decision was made to keep using the platform
functions MultiByteToWideChar and WideCharToMultiByte.
Author(s):
- Steffen Illhardt (german-one) 2020
--*/
#pragma once
namespace til // Terminal Implementation Library. Also: "Today I Learned"
{
template<class charT>
class u8u16state final
{
public:
u8u16state() noexcept :
_buffer{},
_utfPartials{}
{
}
// Method Description:
// - Takes a UTF-8 string and populates it with *complete* UTF-8 codepoints.
// If it receives an incomplete codepoint, it will cache it until it can be completed.
// Arguments:
// - in - UTF-8 string_view potentially containing partial code points
// - out - on return, populated with complete codepoints at the string end
// Return Value:
// - S_OK - the resulting string doesn't end with a partial
// - S_FALSE - the resulting string contains the previously cached partials only
// - E_OUTOFMEMORY - the method failed to allocate memory for the resulting string
// - E_ABORT - the resulting string length would exceed the max_size and thus, the processing was aborted
// - E_UNEXPECTED - an unexpected error occurred
template<class T = charT>
[[nodiscard]] typename std::enable_if<std::is_same<T, char>::value, HRESULT>::type
operator()(const std::basic_string_view<T> in, std::basic_string_view<T>& out) noexcept
{
try
{
size_t remainingLength{ in.length() };
size_t capacity{};
if (FAILED(SizeTAdd(remainingLength, _partialsLen, &capacity)))
{
return E_ABORT;
}
_buffer.clear();
_buffer.reserve(capacity);
// copy UTF-8 code units that were remaining from the previous call (if any)
if (_partialsLen != 0u)
{
_buffer.assign(_utfPartials.cbegin(), _utfPartials.cbegin() + _partialsLen);
_partialsLen = 0u;
}
if (in.empty())
{
out = _buffer;
if (_buffer.empty())
{
return S_OK;
}
return S_FALSE; // the partial is populated
}
auto backIter = in.end() - 1;
// If the last byte in the string was a byte belonging to a UTF-8 multi-byte character
if ((*backIter & _Utf8BitMasks::MaskAsciiByte) > _Utf8BitMasks::IsAsciiByte)
{
// Check only up to 3 last bytes, if no Lead Byte was found then the byte before must be the Lead Byte and no partials are in the string
const size_t stopLen{ std::min(in.length(), gsl::narrow_cast<size_t>(4u)) };
for (size_t sequenceLen{ 1u }; sequenceLen < stopLen; ++sequenceLen, --backIter)
{
// If Lead Byte found
if ((*backIter & _Utf8BitMasks::MaskContinuationByte) > _Utf8BitMasks::IsContinuationByte)
{
// If the Lead Byte indicates that the last bytes in the string is a partial UTF-8 code point then cache them:
// Use the bitmask at index `sequenceLen`. Compare the result with the operand having the same index. If they
// are not equal then the sequence has to be cached because it is a partial code point. Otherwise the
// sequence is a complete UTF-8 code point and the whole string is ready for the conversion into a UTF-16 string.
if ((*backIter & _cmpMasks.at(sequenceLen)) != _cmpOperands.at(sequenceLen))
{
std::move(backIter, in.end(), _utfPartials.begin());
remainingLength -= sequenceLen;
_partialsLen = sequenceLen;
}
break;
}
}
}
// populate the part of the string that contains complete code points only
_buffer.append(in, 0u, remainingLength);
out = _buffer;
return S_OK;
}
catch (std::length_error&)
{
return E_ABORT;
}
catch (std::bad_alloc&)
{
return E_OUTOFMEMORY;
}
catch (...)
{
return E_UNEXPECTED;
}
}
// Method Description:
// - Takes a UTF-16 string and populates it with *complete* UTF-16 codepoints.
// If it receives an incomplete codepoint, it will cache it until it can be completed.
// Arguments:
// - in - UTF-16 string_view potentially containing partial code points
// - out - on return, populated with complete codepoints at the string end
// Return Value:
// - S_OK - the resulting string doesn't end with a partial
// - S_FALSE - the resulting string contains the previously cached partials only
// - E_OUTOFMEMORY - the method failed to allocate memory for the resulting string
// - E_ABORT - the resulting string length would exceed the max_size and thus, the processing was aborted
// - E_UNEXPECTED - an unexpected error occurred
template<class T = charT>
[[nodiscard]] typename std::enable_if<std::is_same<T, wchar_t>::value, HRESULT>::type
operator()(const std::basic_string_view<T> in, std::basic_string_view<T>& out) noexcept
{
try
{
size_t remainingLength{ in.length() };
size_t capacity{};
if (FAILED(SizeTAdd(remainingLength, _partialsLen, &capacity)))
{
return E_ABORT;
}
_buffer.clear();
_buffer.reserve(capacity);
// copy UTF-8 code units that were remaining from the previous call (if any)
if (_partialsLen != 0u)
{
_buffer.push_back(_utfPartials.front());
_partialsLen = 0u;
}
if (in.empty())
{
out = _buffer;
if (_buffer.empty())
{
return S_OK;
}
return S_FALSE; // the high surrogate is populated
}
// cache the last value in the string if it is in the range of high surrogates
if (in.back() >= 0xD800u && in.back() <= 0xDBFFu)
{
_utfPartials.front() = in.back();
--remainingLength;
_partialsLen = 1u;
}
else
{
_partialsLen = 0u;
}
// populate the part of the string that contains complete code points only
_buffer.append(in, 0u, remainingLength);
out = _buffer;
return S_OK;
}
catch (std::length_error&)
{
return E_ABORT;
}
catch (std::bad_alloc&)
{
return E_OUTOFMEMORY;
}
catch (...)
{
return E_UNEXPECTED;
}
}
// Method Description:
// - Discard cached partials.
// Arguments:
// - none
// Return Value:
// - void
void reset() noexcept
{
_partialsLen = 0u;
}
private:
enum _Utf8BitMasks : BYTE
{
IsAsciiByte = 0b0'0000000, // Any byte representing an ASCII character has the MSB set to 0
MaskAsciiByte = 0b1'0000000, // Bit mask to be used in a bitwise AND operation to find out whether or not a byte match the IsAsciiByte pattern
IsContinuationByte = 0b10'000000, // Continuation bytes of any UTF-8 non-ASCII character have the MSB set to 1 and the adjacent bit set to 0
MaskContinuationByte = 0b11'000000, // Bit mask to be used in a bitwise AND operation to find out whether or not a byte match the IsContinuationByte pattern
IsLeadByteTwoByteSequence = 0b110'00000, // A lead byte that indicates a UTF-8 non-ASCII character consisting of two bytes has the two highest bits set to 1 and the adjacent bit set to 0
MaskLeadByteTwoByteSequence = 0b111'00000, // Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte match the IsLeadByteTwoByteSequence pattern
IsLeadByteThreeByteSequence = 0b1110'0000, // A lead byte that indicates a UTF-8 non-ASCII character consisting of three bytes has the three highest bits set to 1 and the adjacent bit set to 0
MaskLeadByteThreeByteSequence = 0b1111'0000, // Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte match the IsLeadByteThreeByteSequence pattern
IsLeadByteFourByteSequence = 0b11110'000, // A lead byte that indicates a UTF-8 non-ASCII character consisting of four bytes has the four highest bits set to 1 and the adjacent bit set to 0
MaskLeadByteFourByteSequence = 0b11111'000 // Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte match the IsLeadByteFourByteSequence pattern
};
// array of bitmasks
constexpr static std::array<BYTE, 4> _cmpMasks{
0, // unused
_Utf8BitMasks::MaskContinuationByte,
_Utf8BitMasks::MaskLeadByteTwoByteSequence,
_Utf8BitMasks::MaskLeadByteThreeByteSequence,
};
// array of values for the comparisons
constexpr static std::array<BYTE, 4> _cmpOperands{
0, // unused
_Utf8BitMasks::IsAsciiByte, // intentionally conflicts with MaskContinuationByte
_Utf8BitMasks::IsLeadByteTwoByteSequence,
_Utf8BitMasks::IsLeadByteThreeByteSequence,
};
std::basic_string<charT> _buffer; // buffer to which the poulated string_view refers
std::array<charT, 4> _utfPartials; // buffer for code units of a partial code point that have to be cached
size_t _partialsLen{}; // number of cached code units
};
// The alias names tell which kind of incoming string the state is meant for:
// u8state handles UTF-8 (char) input, u16state handles UTF-16 (wchar_t) input.
using u8state = u8u16state<char>;
using u16state = u8u16state<wchar_t>;
// Routine Description:
// - Converts a UTF-8 string to UTF-16 by means of a single MultiByteToWideChar call.
//   NOTE: The function relies on getting complete UTF-8 characters at the string boundaries.
//   NOTE(review): `in` is taken by value; an owning string type passed as lvalue gets copied.
// Arguments:
// - in - UTF-8 string to be converted
// - out - reference to the resulting UTF-16 string (cleared before the conversion starts)
// Return Value:
// - S_OK - the conversion succeeded (an empty `in` results in an empty `out`)
// - E_OUTOFMEMORY - the function failed to allocate memory for the resulting string
// - E_ABORT - the string length would exceed the upper boundary of an int and thus, the conversion was aborted
// - E_UNEXPECTED - MultiByteToWideChar returned 0 for a non-empty input, or an unexpected error occurred
template<class inT, class outT>
[[nodiscard]] typename std::enable_if<std::is_same<typename inT::value_type, char>::value && std::is_same<typename outT::value_type, wchar_t>::value, HRESULT>::type
u8u16(const inT in, outT& out) noexcept
{
    try
    {
        out.clear();

        if (!in.empty())
        {
            int unitCount{};
            RETURN_HR_IF(E_ABORT, FAILED(SizeTToInt(in.length(), &unitCount)));

            // UTF-16 never needs more code units than UTF-8 (1 to 1 for ASCII is the worst ratio).
            // Sizing the output up front saves a second API call that only asks for the required size.
            out.resize(in.length());
            const int written = MultiByteToWideChar(gsl::narrow_cast<UINT>(CP_UTF8), 0ul, in.data(), unitCount, out.data(), unitCount);
            out.resize(gsl::narrow_cast<size_t>(written));

            if (written == 0)
            {
                return E_UNEXPECTED;
            }
        }

        return S_OK;
    }
    catch (std::length_error&)
    {
        return E_ABORT;
    }
    catch (std::bad_alloc&)
    {
        return E_OUTOFMEMORY;
    }
    catch (...)
    {
        return E_UNEXPECTED;
    }
}
// Routine Description:
// - Passes the UTF-8 string through `state` (which complements and/or caches partials),
//   and converts what `state` hands out to UTF-16.
// Arguments:
// - in - UTF-8 string to be converted
// - out - reference to the resulting UTF-16 string
// - state - reference to a til::u8state class holding the status of the current partials handling
// Return Value:
// - S_OK - the conversion succeeded
// - E_OUTOFMEMORY - the function failed to allocate memory for the resulting string
// - E_ABORT - the resulting string length would exceed the upper boundary of an int and thus, the conversion was aborted before the conversion has been completed
// - E_UNEXPECTED - an unexpected error occurred
template<class inT, class outT>
[[nodiscard]] typename std::enable_if<std::is_same<typename inT::value_type, char>::value && std::is_same<typename outT::value_type, wchar_t>::value, HRESULT>::type
u8u16(const inT in, outT& out, u8state& state) noexcept
{
    // refers to the buffer owned by `state` once the call below returned
    std::string_view populated{};
    RETURN_IF_FAILED(state(std::string_view{ in }, populated));

    return til::u8u16(populated, out);
}
// Routine Description:
// - Converts a UTF-16 string to UTF-8 by means of a single WideCharToMultiByte call.
//   NOTE: The function relies on getting complete UTF-16 characters at the string boundaries.
//   NOTE(review): `in` is taken by value; an owning string type passed as lvalue gets copied.
// Arguments:
// - in - UTF-16 string to be converted
// - out - reference to the resulting UTF-8 string (cleared before the conversion starts)
// Return Value:
// - S_OK - the conversion succeeded (an empty `in` results in an empty `out`)
// - E_OUTOFMEMORY - the function failed to allocate memory for the resulting string
// - E_ABORT - the resulting string length would exceed the upper boundary of an int and thus, the conversion was aborted
// - E_UNEXPECTED - WideCharToMultiByte returned 0 for a non-empty input, or an unexpected error occurred
template<class inT, class outT>
[[nodiscard]] typename std::enable_if<std::is_same<typename inT::value_type, wchar_t>::value && std::is_same<typename outT::value_type, char>::value, HRESULT>::type
u16u8(const inT in, outT& out) noexcept
{
    try
    {
        out.clear();

        if (!in.empty())
        {
            int unitsIn{};
            int capacityOut{};
            // Code points U+0000..U+FFFF: 1 UTF-16 code unit  --> 1..3 UTF-8 code units.
            // Code points beyond U+FFFF:  2 UTF-16 code units --> 4 UTF-8 code units.
            // Hence three UTF-8 code units per UTF-16 code unit always suffice.
            RETURN_HR_IF(E_ABORT, FAILED(SizeTToInt(in.length(), &unitsIn)) || FAILED(IntMult(unitsIn, 3, &capacityOut)));

            // Sizing the output up front saves a second API call that only asks for the required size.
            out.resize(gsl::narrow_cast<size_t>(capacityOut));
            const int written = WideCharToMultiByte(gsl::narrow_cast<UINT>(CP_UTF8), 0ul, in.data(), unitsIn, out.data(), capacityOut, nullptr, nullptr);
            out.resize(gsl::narrow_cast<size_t>(written));

            if (written == 0)
            {
                return E_UNEXPECTED;
            }
        }

        return S_OK;
    }
    catch (std::length_error&)
    {
        return E_ABORT;
    }
    catch (std::bad_alloc&)
    {
        return E_OUTOFMEMORY;
    }
    catch (...)
    {
        return E_UNEXPECTED;
    }
}
// Routine Description:
// - Passes the UTF-16 string through `state` (which complements and/or caches partials),
//   and converts what `state` hands out to UTF-8.
// Arguments:
// - in - UTF-16 string to be converted
// - out - reference to the resulting UTF-8 string
// - state - reference to a til::u16state class holding the status of the current partials handling
// Return Value:
// - S_OK - the conversion succeeded
// - E_OUTOFMEMORY - the function failed to allocate memory for the resulting string
// - E_ABORT - the resulting string length would exceed the upper boundary of an int and thus, the conversion was aborted before the conversion has been completed
// - E_UNEXPECTED - an unexpected error occurred
template<class inT, class outT>
[[nodiscard]] typename std::enable_if<std::is_same<typename inT::value_type, wchar_t>::value && std::is_same<typename outT::value_type, char>::value, HRESULT>::type
u16u8(const inT in, outT& out, u16state& state) noexcept
{
    // refers to the buffer owned by `state` once the call below returned
    std::wstring_view populated{};
    RETURN_IF_FAILED(state(std::wstring_view{ in }, populated));

    return u16u8(populated, out);
}
// Routine Description:
// - Throwing convenience wrapper: converts a UTF-8 string to UTF-16 and returns the result.
//   NOTE: The function relies on getting complete UTF-8 characters at the string boundaries.
// Arguments:
// - in - UTF-8 string to be converted
// Return Value:
// - the resulting UTF-16 string
// - NOTE: Throws HRESULT errors that the non-throwing sibling returns
template<class inT>
typename std::enable_if<std::is_same<typename inT::value_type, char>::value, std::wstring>::type
u8u16(const inT in)
{
    std::wstring converted{};
    THROW_IF_FAILED(u8u16(std::string_view{ in }, converted));

    return converted;
}
// Routine Description:
// - Throwing convenience wrapper: passes the UTF-8 string through `state` (which complements
//   and/or caches partials), converts it to UTF-16, and returns the result.
// Arguments:
// - in - UTF-8 string to be converted
// - state - reference to a til::u8state class holding the status of the current partials handling
// Return Value:
// - the resulting UTF-16 string
// - NOTE: Throws HRESULT errors that the non-throwing sibling returns
template<class inT>
typename std::enable_if<std::is_same<typename inT::value_type, char>::value, std::wstring>::type
u8u16(const inT in, u8state& state)
{
    std::wstring converted{};
    THROW_IF_FAILED(u8u16(std::string_view{ in }, converted, state));

    return converted;
}
// Routine Description:
// - Throwing convenience wrapper: converts a UTF-16 string to UTF-8 and returns the result.
//   NOTE: The function relies on getting complete UTF-16 characters at the string boundaries.
// Arguments:
// - in - UTF-16 string to be converted
// Return Value:
// - the resulting UTF-8 string
// - NOTE: Throws HRESULT errors that the non-throwing sibling returns
template<class inT>
typename std::enable_if<std::is_same<typename inT::value_type, wchar_t>::value, std::string>::type
u16u8(const inT in)
{
    std::string converted{};
    THROW_IF_FAILED(u16u8(std::wstring_view{ in }, converted));

    return converted;
}
// Routine Description:
// - Throwing convenience wrapper: passes the UTF-16 string through `state` (which complements
//   and/or caches partials), converts it to UTF-8, and returns the result.
// Arguments:
// - in - UTF-16 string to be converted
// - state - reference to a til::u16state class holding the status of the current partials handling
// Return Value:
// - the resulting UTF-8 string
// - NOTE: Throws HRESULT errors that the non-throwing sibling returns
template<class inT>
typename std::enable_if<std::is_same<typename inT::value_type, wchar_t>::value, std::string>::type
u16u8(const inT in, u16state& state)
{
    std::string converted{};
    THROW_IF_FAILED(u16u8(std::wstring_view{ in }, converted, state));

    return converted;
}
}

View File

@ -14,6 +14,7 @@
<ClCompile Include="..\precomp.cpp">
<PrecompiledHeader>Create</PrecompiledHeader>
</ClCompile>
<ClCompile Include="u8u16convertTests.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\precomp.h" />

View File

@ -0,0 +1,143 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include "precomp.h"
#include "WexTestClass.h"
using namespace WEX::Common;
using namespace WEX::Logging;
using namespace WEX::TestExecution;
// Test class for the til::u8u16 and til::u16u8 conversion functions,
// declared with the macros of WexTestClass.h.
class Utf8Utf16ConvertTests
{
TEST_CLASS(Utf8Utf16ConvertTests);
// conversions of strings that consist of complete code points only
TEST_METHOD(TestU8ToU16);
TEST_METHOD(TestU16ToU8);
// conversions of strings that end with a partial code point (a til::u8state / til::u16state is involved)
TEST_METHOD(TestU8ToU16Partials);
TEST_METHOD(TestU16ToU8Partials);
};
void Utf8Utf16ConvertTests::TestU8ToU16()
{
const std::string u8String{
'\x7E', // TILDE (1 byte)
'\xC3', // LATIN SMALL LETTER O WITH DIAERESIS (2 bytes)
'\xB6',
'\xE2', // EURO SIGN (3 bytes)
'\x82',
'\xAC',
'\xF0', // CJK UNIFIED IDEOGRAPH-24F5C (4 bytes)
'\xA4',
'\xBD',
'\x9C'
};
const std::wstring u16StringComp{
gsl::narrow_cast<wchar_t>(0x007eU), // TILDE
gsl::narrow_cast<wchar_t>(0x00f6U), // LATIN SMALL LETTER O WITH DIAERESIS
gsl::narrow_cast<wchar_t>(0x20acU), // EURO SIGN
gsl::narrow_cast<wchar_t>(0xd853U), // CJK UNIFIED IDEOGRAPH-24F5C (surrogate pair)
gsl::narrow_cast<wchar_t>(0xdf5cU)
};
std::wstring u16Out{};
const HRESULT hRes{ til::u8u16(u8String, u16Out) };
VERIFY_ARE_EQUAL(S_OK, hRes);
VERIFY_ARE_EQUAL(u16StringComp, u16Out);
}
void Utf8Utf16ConvertTests::TestU16ToU8()
{
const std::wstring u16String{
gsl::narrow_cast<wchar_t>(0x007eU), // TILDE
gsl::narrow_cast<wchar_t>(0x00f6U), // LATIN SMALL LETTER O WITH DIAERESIS
gsl::narrow_cast<wchar_t>(0x20acU), // EURO SIGN
gsl::narrow_cast<wchar_t>(0xd853U), // CJK UNIFIED IDEOGRAPH-24F5C (surrogate pair)
gsl::narrow_cast<wchar_t>(0xdf5cU)
};
const std::string u8StringComp{
'\x7E', // TILDE (1 byte)
'\xC3', // LATIN SMALL LETTER O WITH DIAERESIS (2 bytes)
'\xB6',
'\xE2', // EURO SIGN (3 bytes)
'\x82',
'\xAC',
'\xF0', // CJK UNIFIED IDEOGRAPH-24F5C (4 bytes)
'\xA4',
'\xBD',
'\x9C'
};
std::string u8Out{};
const HRESULT hRes{ til::u16u8(u16String, u8Out) };
VERIFY_ARE_EQUAL(S_OK, hRes);
VERIFY_ARE_EQUAL(u8StringComp, u8Out);
}
void Utf8Utf16ConvertTests::TestU8ToU16Partials()
{
const std::string u8String1{
'\xF0', // CJK UNIFIED IDEOGRAPH-24F5C (4 bytes)
'\xA4',
'\xBD',
'\x9C',
'\xF0' // CJK UNIFIED IDEOGRAPH-24F5C (lead byte only)
};
const std::string u8String2{
'\xA4', // CJK UNIFIED IDEOGRAPH-24F5C (complementary bytes)
'\xBD',
'\x9C'
};
const std::wstring u16StringComp{
gsl::narrow_cast<wchar_t>(0xD853), // CJK UNIFIED IDEOGRAPH-24F5C (surrogate pair)
gsl::narrow_cast<wchar_t>(0xDF5C)
};
til::u8state state{};
std::wstring u16Out1{};
const HRESULT hRes1{ til::u8u16(u8String1, u16Out1, state) };
VERIFY_ARE_EQUAL(S_OK, hRes1);
VERIFY_ARE_EQUAL(u16StringComp, u16Out1);
std::wstring u16Out2{};
const HRESULT hRes2{ til::u8u16(u8String2, u16Out2, state) };
VERIFY_ARE_EQUAL(S_OK, hRes2);
VERIFY_ARE_EQUAL(u16StringComp, u16Out2);
}
void Utf8Utf16ConvertTests::TestU16ToU8Partials()
{
const std::wstring u16String1{
gsl::narrow_cast<wchar_t>(0xD853), // CJK UNIFIED IDEOGRAPH-24F5C (surrogate pair)
gsl::narrow_cast<wchar_t>(0xDF5C),
gsl::narrow_cast<wchar_t>(0xD853) // CJK UNIFIED IDEOGRAPH-24F5C (high surrogate only)
};
const std::wstring u16String2{
gsl::narrow_cast<wchar_t>(0xDF5C) // CJK UNIFIED IDEOGRAPH-24F5C (low surrogate only)
};
const std::string u8StringComp{
'\xF0', // CJK UNIFIED IDEOGRAPH-24F5C
'\xA4',
'\xBD',
'\x9C'
};
til::u16state state{};
std::string u8Out1{};
const HRESULT hRes1{ til::u16u8(u16String1, u8Out1, state) };
VERIFY_ARE_EQUAL(S_OK, hRes1);
VERIFY_ARE_EQUAL(u8StringComp, u8Out1);
std::string u8Out2{};
const HRESULT hRes2{ til::u16u8(u16String2, u8Out2, state) };
VERIFY_ARE_EQUAL(S_OK, hRes2);
VERIFY_ARE_EQUAL(u8StringComp, u8Out2);
}

View File

@ -0,0 +1,16 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ImportGroup Label="PropertySheets" />
<PropertyGroup Label="UserMacros" />
<!--
To customize common C++/WinRT project properties:
* right-click the project node
* expand the Common Properties item
* select the C++/WinRT property page
For more advanced scenarios, and complete documentation, please see:
https://github.com/Microsoft/cppwinrt/tree/master/nuget
-->
<PropertyGroup />
<ItemDefinitionGroup />
</Project>

View File

@ -0,0 +1,780 @@
// TEST TOOL U8U16Test
// Performance tests for UTF-8 <--> UTF-16 conversions, related to PR #4093
// NOTE The functions u8u16 and u16u8 contain own algorithms. Tests have shown that they perform
// worse than the platform API functions.
// Thus, these functions are *unrelated* to the til::u8u16 and til::u16u8 implementation.
#include "U8U16Test.hpp"
// Value-initializes the work buffer (_buffer8) and the cache for partial code points (_utf8Partials).
u8state::u8state() noexcept :
_buffer8{},
_utf8Partials{}
{
}
// Prepends the UTF-8 code units cached by the previous call to `in`, and cuts an incomplete
// code point off the end of `in` in order to cache it for the next call.
// On return `out` refers to the internal buffer of this object.
// NOTE(review): only `in` is scanned, and at most min(in.length(), 4) - 1 of its trailing code
// units are examined. The first code unit of an input shorter than 4 code units is therefore
// never examined (e.g. a lone lead byte is passed through instead of being cached).
// Return Value:
// - S_OK - `in` was not empty and was processed, or both `in` and the cache were empty
// - S_FALSE - `in` was empty and the previously cached partial is given back
// - E_OUTOFMEMORY - failed to allocate memory for the resulting string
// - E_ABORT - the resulting string length would be too large
// - E_UNEXPECTED - an unexpected error occurred
[[nodiscard]] HRESULT u8state::operator()(const std::string_view in, std::string_view& out) noexcept
{
    try
    {
        const size_t inLen{ in.length() };
        size_t neededCapacity{};
        if (FAILED(SizeTAdd(inLen, _partialsLen, &neededCapacity)))
        {
            return E_ABORT;
        }

        _buffer8.clear();
        _buffer8.reserve(neededCapacity);

        // the UTF-8 code units left over from the previous call (if any) go first
        if (_partialsLen != 0u)
        {
            _buffer8.assign(_utf8Partials.cbegin(), _utf8Partials.cbegin() + _partialsLen);
            _partialsLen = 0u;
        }

        // nothing new arrived: give back whatever was cached
        if (in.empty())
        {
            out = _buffer8;
            return _buffer8.empty() ? S_OK : S_FALSE;
        }

        // number of leading code units of `in` that are given back by this call
        size_t completeLen{ inLen };

        // Only a string that ends with a non-ASCII byte can end with a partial code point.
        if ((in.back() & _Utf8BitMasks::MaskAsciiByte) > _Utf8BitMasks::IsAsciiByte)
        {
            // Walk backwards over (at most) the last 3 bytes. If none of them is a lead byte then
            // the byte before is assumed to be the lead byte of a complete sequence.
            const size_t stopLen{ std::min(inLen, static_cast<size_t>(4u)) };
            for (size_t sequenceLen{ 1u }; sequenceLen < stopLen; ++sequenceLen)
            {
                const char unit{ in[inLen - sequenceLen] };
                if ((unit & _Utf8BitMasks::MaskContinuationByte) <= _Utf8BitMasks::IsContinuationByte)
                {
                    continue; // not a lead byte, keep looking
                }

                // `unit` is a lead byte that is followed by `sequenceLen - 1` bytes. The mask and the
                // operand at index `sequenceLen` tell whether the lead byte announces exactly that many
                // bytes. If it doesn't, the trailing sequence is cached as a partial code point.
                if ((unit & _cmpMasks.at(sequenceLen)) != _cmpOperands.at(sequenceLen))
                {
                    const auto tail{ in.substr(inLen - sequenceLen) };
                    std::move(tail.cbegin(), tail.cend(), _utf8Partials.begin());
                    completeLen -= sequenceLen;
                    _partialsLen = sequenceLen;
                }

                break;
            }
        }

        // give back the part of the input that is not cached
        _buffer8.append(in, 0u, completeLen);
        out = _buffer8;

        return S_OK;
    }
    catch (std::length_error&)
    {
        return E_ABORT;
    }
    catch (std::bad_alloc&)
    {
        return E_OUTOFMEMORY;
    }
    catch (...)
    {
        return E_UNEXPECTED;
    }
}
// Discards cached partials by zeroing the number of cached code units.
void u8state::reset() noexcept
{
_partialsLen = 0u;
}
// Value-initializes the work buffer (_buffer16).
u16state::u16state() noexcept :
_buffer16{}
{
}
// Prepends the high surrogate cached by the previous call to `in`, and cuts a trailing
// high surrogate off the end of `in` in order to cache it for the next call.
// On return `out` refers to the internal buffer of this object.
// Return Value:
// - S_OK - `in` was not empty and was processed, or both `in` and the cache were empty
// - S_FALSE - `in` was empty and the previously cached high surrogate is given back
// - E_OUTOFMEMORY - failed to allocate memory for the resulting string
// - E_ABORT - the resulting string length would be too large
// - E_UNEXPECTED - an unexpected error occurred
[[nodiscard]] HRESULT u16state::operator()(const std::wstring_view in, std::wstring_view& out) noexcept
{
    try
    {
        size_t completeLen{ in.length() };
        size_t neededCapacity{};
        if (FAILED(SizeTAdd(completeLen, _cached, &neededCapacity)))
        {
            return E_ABORT;
        }

        _buffer16.clear();
        _buffer16.reserve(neededCapacity);

        // the UTF-16 code unit left over from the previous call (if any) goes first
        if (_cached != 0u)
        {
            _buffer16.push_back(_highSurrogate);
            _cached = 0u;
        }

        // nothing new arrived: give back whatever was cached
        if (in.empty())
        {
            out = _buffer16;
            return _buffer16.empty() ? S_OK : S_FALSE;
        }

        // A trailing code unit in the range of high surrogates (U+D800..U+DBFF) is held back.
        const bool endsWithHighSurrogate{ in.back() >= 0xD800u && in.back() <= 0xDBFFu };
        if (endsWithHighSurrogate)
        {
            _highSurrogate = in.back();
            --completeLen;
            _cached = 1u;
        }
        else
        {
            _cached = 0u;
        }

        // give back the part of the input that is not cached
        _buffer16.append(in, 0u, completeLen);
        out = _buffer16;

        return S_OK;
    }
    catch (std::length_error&)
    {
        return E_ABORT;
    }
    catch (std::bad_alloc&)
    {
        return E_OUTOFMEMORY;
    }
    catch (...)
    {
        return E_UNEXPECTED;
    }
}
// Discards a cached high surrogate by zeroing the number of cached code units.
void u16state::reset() noexcept
{
_cached = 0u;
}
// Converts a UTF-8 string to UTF-16, appending the code units with push_back.
// Arguments:
// - in              - UTF-8 string to be converted
// - out             - receives the UTF-16 result (cleared first)
// - discardInvalids - if true, invalid byte sequences are dropped;
//                     if false, each of them is replaced with U+FFFD
// Return value:
// - S_OK          - the conversion succeeded without any invalid sequence
// - S_FALSE       - the conversion succeeded but at least one invalid sequence was found
// - E_ABORT       - the result would exceed the maximum string size
// - E_OUTOFMEMORY - the allocation failed
// - E_UNEXPECTED  - any other exception was caught
// Note: a well-formed U+FFFD in the input (EF BF BD) is valid data and is always converted,
//       regardless of discardInvalids.
[[nodiscard]] HRESULT u8u16(const std::string_view in, std::wstring& out, bool discardInvalids) noexcept
{
    constexpr uint8_t contBegin{ 0x80u }; // usual begin of the range of continuation bytes
    constexpr uint8_t contEnd{ 0xBFu }; // usual end of the range of continuation bytes
    constexpr uint32_t unicodeReplacementChar{ 0xFFFDu }; // Unicode Replacement Character

    try
    {
        HRESULT hRes{ S_OK };
        out.clear();
        if (in.empty())
        {
            return hRes;
        }

        out.reserve(in.length()); // a UTF-16 string never has more code units than its UTF-8 source has bytes

        const auto end8{ in.cend() };

        // true if `it` is inside of the input and the referenced byte is in the range [first, last]
        const auto isInRange = [end8](const std::string_view::const_iterator it, const uint8_t first, const uint8_t last) noexcept {
            return it < end8 && static_cast<uint8_t>(*it) >= first && static_cast<uint8_t>(*it) <= last;
        };

        for (auto it8{ in.cbegin() }; it8 < end8;)
        {
            const auto lead{ static_cast<uint8_t>(*it8) };

            // *** convert ASCII (00..7F) directly to UTF-16 ***
            if (lead <= 0x7Fu)
            {
                out.push_back(static_cast<wchar_t>(lead));
                ++it8;
                continue;
            }

            // *** classify the lead byte ***
            // valid two bytes:   C2..DF | 80..BF (first byte 0xC0 and 0xC1 invalid)
            // valid three bytes: E0     | A0..BF | 80..BF
            //                    E1..EC | 80..BF | 80..BF
            //                    ED     | 80..9F | 80..BF
            //                    EE..EF | 80..BF | 80..BF
            // valid four bytes:  F0     | 90..BF | 80..BF | 80..BF
            //                    F1..F3 | 80..BF | 80..BF | 80..BF
            //                    F4     | 80..8F | 80..BF | 80..BF
            size_t expected{ 1u }; // length of the sequence; 1 marks an invalid lead byte
            uint8_t secondBegin{ contBegin }; // only the range of the second byte depends on the lead byte
            uint8_t secondEnd{ contEnd };
            if (lead >= 0xC2u && lead <= 0xDFu)
            {
                expected = 2u;
            }
            else if (lead >= 0xE0u && lead <= 0xEFu)
            {
                expected = 3u;
                if (lead == 0xE0u)
                {
                    secondBegin = 0xA0u;
                }
                else if (lead == 0xEDu)
                {
                    secondEnd = 0x9Fu;
                }
            }
            else if (lead >= 0xF0u && lead <= 0xF4u)
            {
                expected = 4u;
                if (lead == 0xF0u)
                {
                    secondBegin = 0x90u;
                }
                else if (lead == 0xF4u)
                {
                    secondEnd = 0x8Fu;
                }
            }

            // *** decode: count the bytes that form a valid prefix and collect their payload bits ***
            size_t cnt{ 1u };
            uint32_t codePoint{ lead & (0xFFu >> (expected + 1u)) }; // payload bits of the lead byte
            if (expected > 1u && isInRange(it8 + 1, secondBegin, secondEnd))
            {
                do
                {
                    codePoint = (codePoint << 6u) | (static_cast<uint8_t>(*(it8 + cnt)) & 0x3Fu);
                    ++cnt;
                } while (cnt < expected && isInRange(it8 + cnt, contBegin, contEnd));
            }

            // only the valid part is consumed; the byte that broke the sequence is examined again
            it8 += cnt;

            const bool valid{ expected > 1u && cnt == expected };
            if (!valid)
            {
                hRes = S_FALSE;
                if (discardInvalids)
                {
                    continue;
                }

                codePoint = unicodeReplacementChar;
            }

            // *** convert the code point to UTF-16 ***
            if (codePoint < 0x00010000u)
            {
                out.push_back(static_cast<wchar_t>(codePoint));
            }
            else
            {
                codePoint -= 0x00010000u;
                out.push_back(static_cast<wchar_t>(0x0000D800u + ((codePoint >> 10u) & 0x000003FFu)));
                out.push_back(static_cast<wchar_t>(0x0000DC00u + (codePoint & 0x000003FFu)));
            }
        }

        return hRes;
    }
    catch (const std::length_error&)
    {
        return E_ABORT;
    }
    catch (const std::bad_alloc&)
    {
        return E_OUTOFMEMORY;
    }
    catch (...)
    {
        return E_UNEXPECTED;
    }
}
// Converts a UTF-8 string to UTF-16, writing the code units through a raw pointer
// into a pre-sized output string (variant of u8u16 without push_back).
// Arguments:
// - in              - UTF-8 string to be converted
// - out             - receives the UTF-16 result (cleared first)
// - discardInvalids - if true, invalid byte sequences are dropped;
//                     if false, each of them is replaced with U+FFFD
// Return value:
// - S_OK          - the conversion succeeded without any invalid sequence
// - S_FALSE       - the conversion succeeded but at least one invalid sequence was found
// - E_ABORT       - the result would exceed the maximum string size
// - E_OUTOFMEMORY - the allocation failed
// - E_UNEXPECTED  - any other exception was caught
// Note: a well-formed U+FFFD in the input (EF BF BD) is valid data and is always converted,
//       regardless of discardInvalids.
[[nodiscard]] HRESULT u8u16_ptr(const std::string_view in, std::wstring& out, bool discardInvalids) noexcept
{
    constexpr uint8_t contBegin{ 0x80u }; // usual begin of the range of continuation bytes
    constexpr uint8_t contEnd{ 0xBFu }; // usual end of the range of continuation bytes
    constexpr uint32_t unicodeReplacementChar{ 0xFFFDu }; // Unicode Replacement Character

    try
    {
        HRESULT hRes{ S_OK };
        out.clear();
        if (in.empty())
        {
            return hRes;
        }

        // Every consumed byte sequence yields at most as many UTF-16 code units as it has bytes
        // (1..3 bytes -> 1 unit, 4 bytes -> 2 units, invalid sequence -> at most 1 unit),
        // so a buffer of in.length() code units can't be overrun.
        out.resize(in.length());
        wchar_t* it16{ out.data() };

        const auto end8{ in.cend() };

        // true if `it` is inside of the input and the referenced byte is in the range [first, last]
        const auto isInRange = [end8](const std::string_view::const_iterator it, const uint8_t first, const uint8_t last) noexcept {
            return it < end8 && static_cast<uint8_t>(*it) >= first && static_cast<uint8_t>(*it) <= last;
        };

        for (auto it8{ in.cbegin() }; it8 < end8;)
        {
            const auto lead{ static_cast<uint8_t>(*it8) };

            // *** convert ASCII (00..7F) directly to UTF-16 ***
            if (lead <= 0x7Fu)
            {
                *it16++ = static_cast<wchar_t>(lead);
                ++it8;
                continue;
            }

            // *** classify the lead byte ***
            // valid two bytes:   C2..DF | 80..BF (first byte 0xC0 and 0xC1 invalid)
            // valid three bytes: E0     | A0..BF | 80..BF
            //                    E1..EC | 80..BF | 80..BF
            //                    ED     | 80..9F | 80..BF
            //                    EE..EF | 80..BF | 80..BF
            // valid four bytes:  F0     | 90..BF | 80..BF | 80..BF
            //                    F1..F3 | 80..BF | 80..BF | 80..BF
            //                    F4     | 80..8F | 80..BF | 80..BF
            size_t expected{ 1u }; // length of the sequence; 1 marks an invalid lead byte
            uint8_t secondBegin{ contBegin }; // only the range of the second byte depends on the lead byte
            uint8_t secondEnd{ contEnd };
            if (lead >= 0xC2u && lead <= 0xDFu)
            {
                expected = 2u;
            }
            else if (lead >= 0xE0u && lead <= 0xEFu)
            {
                expected = 3u;
                if (lead == 0xE0u)
                {
                    secondBegin = 0xA0u;
                }
                else if (lead == 0xEDu)
                {
                    secondEnd = 0x9Fu;
                }
            }
            else if (lead >= 0xF0u && lead <= 0xF4u)
            {
                expected = 4u;
                if (lead == 0xF0u)
                {
                    secondBegin = 0x90u;
                }
                else if (lead == 0xF4u)
                {
                    secondEnd = 0x8Fu;
                }
            }

            // *** decode: count the bytes that form a valid prefix and collect their payload bits ***
            size_t cnt{ 1u };
            uint32_t codePoint{ lead & (0xFFu >> (expected + 1u)) }; // payload bits of the lead byte
            if (expected > 1u && isInRange(it8 + 1, secondBegin, secondEnd))
            {
                do
                {
                    codePoint = (codePoint << 6u) | (static_cast<uint8_t>(*(it8 + cnt)) & 0x3Fu);
                    ++cnt;
                } while (cnt < expected && isInRange(it8 + cnt, contBegin, contEnd));
            }

            // only the valid part is consumed; the byte that broke the sequence is examined again
            it8 += cnt;

            const bool valid{ expected > 1u && cnt == expected };
            if (!valid)
            {
                hRes = S_FALSE;
                if (discardInvalids)
                {
                    continue;
                }

                codePoint = unicodeReplacementChar;
            }

            // *** convert the code point to UTF-16 ***
            if (codePoint < 0x00010000u)
            {
                *it16++ = static_cast<wchar_t>(codePoint);
            }
            else
            {
                codePoint -= 0x00010000u;
                *it16++ = static_cast<wchar_t>(0x0000D800u + ((codePoint >> 10u) & 0x000003FFu));
                *it16++ = static_cast<wchar_t>(0x0000DC00u + (codePoint & 0x000003FFu));
            }
        }

        // cut the string down to the number of code units actually written
        out.resize(static_cast<size_t>(it16 - out.data()));
        return hRes;
    }
    catch (const std::length_error&)
    {
        return E_ABORT;
    }
    catch (const std::bad_alloc&)
    {
        return E_OUTOFMEMORY;
    }
    catch (...)
    {
        return E_UNEXPECTED;
    }
}
// Converts a UTF-16 string to UTF-8, appending the bytes with push_back.
// Arguments:
// - in              - UTF-16 string to be converted
// - out             - receives the UTF-8 result (cleared first)
// - discardInvalids - if true, unpaired surrogates are dropped;
//                     if false, each of them is replaced with U+FFFD
// Return value:
// - S_OK          - the conversion succeeded without any invalid code unit
// - S_FALSE       - the conversion succeeded but at least one unpaired surrogate was found
// - E_ABORT       - the result would exceed the maximum string size
// - E_OUTOFMEMORY - the allocation failed
// - E_UNEXPECTED  - any other exception was caught
// Note: a U+FFFD code unit in the input is valid data and is always converted,
//       regardless of discardInvalids.
[[nodiscard]] HRESULT u16u8(const std::wstring_view in, std::string& out, bool discardInvalids) noexcept
{
    constexpr uint32_t unicodeReplacementChar{ 0xFFFDu };

    try
    {
        HRESULT hRes{ S_OK };
        out.clear();
        if (in.empty())
        {
            return hRes;
        }

        // a UTF-16 code unit never needs more than 3 bytes in UTF-8 (a surrogate pair needs 4 bytes for 2 units)
        size_t lengthHint{};
        if (FAILED(SizeTMult(in.length(), static_cast<size_t>(3u), &lengthHint)))
        {
            // the hint overflowed; fall back to a smaller reservation and let the string grow on demand
            lengthHint = std::max(out.capacity(), in.length());
        }

        out.reserve(lengthHint); // avoid any further re-allocations and copying

        const auto end16{ in.cend() };
        for (auto it16{ in.cbegin() }; it16 < end16;)
        {
            const auto unit{ static_cast<uint32_t>(*it16++) };

            // *** convert ASCII directly to UTF-8 ***
            if (unit <= 0x007Fu)
            {
                out.push_back(static_cast<char>(unit));
                continue;
            }

            // *** convert UTF-16 to a code point ***
            uint32_t codePoint{ unit };
            bool valid{ true };
            if (unit >= 0xD800u && unit <= 0xDBFFu) // range of high surrogates
            {
                if (it16 < end16 && *it16 >= 0xDC00u && *it16 <= 0xDFFFu) // range of low surrogates
                {
                    // same as 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00) with the constants folded
                    codePoint = (unit << 10u) + static_cast<uint32_t>(*it16++) - static_cast<uint32_t>(0x035FDC00u);
                }
                else
                {
                    valid = false; // only the high surrogate is consumed
                }
            }
            else if (unit >= 0xDC00u && unit <= 0xDFFFu) // standing alone low surrogates are invalid
            {
                valid = false;
            }

            if (!valid)
            {
                hRes = S_FALSE;
                if (discardInvalids)
                {
                    continue;
                }

                codePoint = unicodeReplacementChar;
            }

            // *** convert the code point to UTF-8 ***
            // the outcome of performance tests is that subsequent calls of push_back
            // perform much better than appending a single initializer_list
            if (codePoint < 0x00000800u)
            {
                out.push_back(static_cast<char>((codePoint >> 6u & 0x1Fu) | 0xC0u));
                out.push_back(static_cast<char>((codePoint & 0x3Fu) | 0x80u));
            }
            else if (codePoint < 0x00010000u)
            {
                out.push_back(static_cast<char>((codePoint >> 12u & 0x0Fu) | 0xE0u));
                out.push_back(static_cast<char>((codePoint >> 6u & 0x3Fu) | 0x80u));
                out.push_back(static_cast<char>((codePoint & 0x3Fu) | 0x80u));
            }
            else
            {
                out.push_back(static_cast<char>((codePoint >> 18u & 0x07u) | 0xF0u));
                out.push_back(static_cast<char>((codePoint >> 12u & 0x3Fu) | 0x80u));
                out.push_back(static_cast<char>((codePoint >> 6u & 0x3Fu) | 0x80u));
                out.push_back(static_cast<char>((codePoint & 0x3Fu) | 0x80u));
            }
        }

        return hRes;
    }
    catch (const std::length_error&)
    {
        return E_ABORT;
    }
    catch (const std::bad_alloc&)
    {
        return E_OUTOFMEMORY;
    }
    catch (...)
    {
        return E_UNEXPECTED;
    }
}
// Converts a UTF-16 string to UTF-8, writing the bytes through a raw pointer
// into a pre-sized output string (variant of u16u8 without push_back).
// Arguments:
// - in              - UTF-16 string to be converted
// - out             - receives the UTF-8 result (cleared first)
// - discardInvalids - if true, unpaired surrogates are dropped;
//                     if false, each of them is replaced with U+FFFD
// Return value:
// - S_OK          - the conversion succeeded without any invalid code unit
// - S_FALSE       - the conversion succeeded but at least one unpaired surrogate was found
// - E_ABORT       - the required buffer size overflowed or exceeds the maximum string size
// - E_OUTOFMEMORY - the allocation failed
// - E_UNEXPECTED  - any other exception was caught
// Note: a U+FFFD code unit in the input is valid data and is always converted,
//       regardless of discardInvalids.
[[nodiscard]] HRESULT u16u8_ptr(const std::wstring_view in, std::string& out, bool discardInvalids) noexcept
{
    constexpr uint32_t unicodeReplacementChar{ 0xFFFDu };

    try
    {
        HRESULT hRes{ S_OK };
        out.clear();
        if (in.empty())
        {
            return hRes;
        }

        // A UTF-16 code unit never needs more than 3 bytes in UTF-8 (a surrogate pair needs 4 bytes
        // for 2 units). The bytes are written without bounds checks, so the full size is mandatory.
        size_t lengthHint{};
        if (FAILED(SizeTMult(in.length(), static_cast<size_t>(3u), &lengthHint)))
        {
            return E_ABORT;
        }

        out.resize(lengthHint);
        char* it8{ out.data() };

        const auto end16{ in.cend() };
        for (auto it16{ in.cbegin() }; it16 < end16;)
        {
            const auto unit{ static_cast<uint32_t>(*it16++) };

            // *** convert ASCII directly to UTF-8 ***
            if (unit <= 0x007Fu)
            {
                *it8++ = static_cast<char>(unit);
                continue;
            }

            // *** convert UTF-16 to a code point ***
            uint32_t codePoint{ unit };
            bool valid{ true };
            if (unit >= 0xD800u && unit <= 0xDBFFu) // range of high surrogates
            {
                if (it16 < end16 && *it16 >= 0xDC00u && *it16 <= 0xDFFFu) // range of low surrogates
                {
                    // same as 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00) with the constants folded
                    codePoint = (unit << 10u) + static_cast<uint32_t>(*it16++) - static_cast<uint32_t>(0x035FDC00u);
                }
                else
                {
                    valid = false; // only the high surrogate is consumed
                }
            }
            else if (unit >= 0xDC00u && unit <= 0xDFFFu) // standing alone low surrogates are invalid
            {
                valid = false;
            }

            if (!valid)
            {
                hRes = S_FALSE;
                if (discardInvalids)
                {
                    continue;
                }

                codePoint = unicodeReplacementChar;
            }

            // *** convert the code point to UTF-8 ***
            // the outcome of further performance tests is that using pointers
            // performs even better than subsequent calls of push_back
            if (codePoint < 0x00000800u)
            {
                *it8++ = static_cast<char>((codePoint >> 6u & 0x1Fu) | 0xC0u);
                *it8++ = static_cast<char>((codePoint & 0x3Fu) | 0x80u);
            }
            else if (codePoint < 0x00010000u)
            {
                *it8++ = static_cast<char>((codePoint >> 12u & 0x0Fu) | 0xE0u);
                *it8++ = static_cast<char>((codePoint >> 6u & 0x3Fu) | 0x80u);
                *it8++ = static_cast<char>((codePoint & 0x3Fu) | 0x80u);
            }
            else
            {
                *it8++ = static_cast<char>((codePoint >> 18u & 0x07u) | 0xF0u);
                *it8++ = static_cast<char>((codePoint >> 12u & 0x3Fu) | 0x80u);
                *it8++ = static_cast<char>((codePoint >> 6u & 0x3Fu) | 0x80u);
                *it8++ = static_cast<char>((codePoint & 0x3Fu) | 0x80u);
            }
        }

        // cut the string down to the number of bytes actually written
        out.resize(static_cast<size_t>(it8 - out.data()));
        return hRes;
    }
    catch (const std::length_error&)
    {
        return E_ABORT;
    }
    catch (const std::bad_alloc&)
    {
        return E_OUTOFMEMORY;
    }
    catch (...)
    {
        return E_UNEXPECTED;
    }
}
// Converts a chunk of a UTF-8 stream to UTF-16.
// The chunk is first passed through `state`, and the view it hands back is converted.
// Arguments:
// - in              - UTF-8 chunk to be converted
// - out             - receives the UTF-16 result
// - state           - caches partials between consecutive calls
// - discardInvalids - forwarded to the stateless u8u16 overload
// Return value:
// - the failure code of `state` if it failed, otherwise the result of the stateless conversion
[[nodiscard]] HRESULT u8u16(const std::string_view in, std::wstring& out, u8state& state, bool discardInvalids) noexcept
{
    std::string_view complete{};
    if (const HRESULT hRes{ state(in, complete) }; FAILED(hRes))
    {
        return hRes;
    }

    return u8u16(complete, out, discardInvalids);
}
// Converts a chunk of a UTF-16 stream to UTF-8.
// The chunk is first passed through `state`, and the view it hands back is converted.
// Arguments:
// - in              - UTF-16 chunk to be converted
// - out             - receives the UTF-8 result
// - state           - caches a trailing high surrogate between consecutive calls
// - discardInvalids - forwarded to the stateless u16u8 overload
// Return value:
// - the failure code of `state` if it failed, otherwise the result of the stateless conversion
[[nodiscard]] HRESULT u16u8(const std::wstring_view in, std::string& out, u16state& state, bool discardInvalids) noexcept
{
    std::wstring_view complete{};
    if (const HRESULT hRes{ state(in, complete) }; FAILED(hRes))
    {
        return hRes;
    }

    return u16u8(complete, out, discardInvalids);
}
std::wstring u8u16(const std::string_view in, bool discardInvalids)
{
std::wstring out{};
//THROW_IF_FAILED(u8u16(in, out, discardInvalids));
const HRESULT hRes{ u8u16(in, out, discardInvalids) };
if (FAILED(hRes))
{
throw std::runtime_error("error");
}
return out;
}
std::string u16u8(const std::wstring_view in, bool discardInvalids)
{
std::string out{};
//THROW_IF_FAILED(u16u8(in, out, discardInvalids));
const HRESULT hRes{ u16u8(in, out, discardInvalids) };
if (FAILED(hRes))
{
throw std::runtime_error("error");
}
return out;
}
std::wstring u8u16(const std::string_view in, u8state& state, bool discardInvalids)
{
std::wstring out{};
//THROW_IF_FAILED(u8u16(in, out, state, discardInvalids));
const HRESULT hRes{ u8u16(in, out, state, discardInvalids) };
if (FAILED(hRes))
{
throw std::runtime_error("error");
}
return out;
}
std::string u16u8(const std::wstring_view in, u16state& state, bool discardInvalids)
{
std::string out{};
//THROW_IF_FAILED(u16u8(in, out, state, discardInvalids));
const HRESULT hRes{ u16u8(in, out, state, discardInvalids) };
if (FAILED(hRes))
{
throw std::runtime_error("error");
}
return out;
}

View File

@ -1,34 +1,27 @@
/*++
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
Module Name:
- UTF8OutPipeReader.hpp
Abstract:
- This reads a UTF-8 stream and gives back a buffer that contains complete code points only
- Partial UTF-8 code points at the end of the buffer read are cached and prepended to the next chunk read
Author(s):
- Steffen Illhardt (german-one) 12-July-2019
--*/
// TEST TOOL U8U16Test
// Performance tests for UTF-8 <--> UTF-16 conversions, related to PR #4093
// NOTE The functions u8u16 and u16u8 contain own algorithms. Tests have shown that they perform
// worse than the platform API functions.
// Thus, these functions are *unrelated* to the til::u8u16 and til::u16u8 implementation.
#pragma once
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
#include <wil\common.h>
#include <wil\resource.h>
#undef WIN32_LEAN_AND_MEAN
#undef NOMINMAX
#define NOMINMAX
#include <string>
#include <string_view>
#include <array>
#include <algorithm>
#include <windows.h>
#include <intsafe.h>
class UTF8OutPipeReader final
class u8state final
{
public:
UTF8OutPipeReader(HANDLE outPipe) noexcept;
[[nodiscard]] HRESULT Read(_Out_ std::string_view& strView);
u8state() noexcept;
[[nodiscard]] HRESULT operator()(const std::string_view in, std::string_view& out) noexcept;
void reset() noexcept;
private:
enum _Utf8BitMasks : BYTE
@ -61,8 +54,32 @@ private:
_Utf8BitMasks::IsLeadByteThreeByteSequence,
};
HANDLE _outPipe; // non-owning reference to a pipe.
std::array<char, 4096> _buffer; // buffer for the chunk read.
std::string _buffer8;
std::array<char, 4> _utf8Partials; // buffer for code units of a partial UTF-8 code point that have to be cached
DWORD _dwPartialsLen{}; // number of cached UTF-8 code units
size_t _partialsLen{}; // number of cached UTF-8 code units
};
// Keeps a trailing UTF-16 high surrogate between consecutive chunks of a stream,
// so that a surrogate pair split across two chunks can be converted as a whole.
class u16state final
{
public:
u16state() noexcept;
// Copies `in` into the internal buffer (after a high surrogate cached by the previous call, if any),
// holds back a trailing high surrogate, and makes `out` refer to that buffer.
[[nodiscard]] HRESULT operator()(const std::wstring_view in, std::wstring_view& out) noexcept;
// Discards a cached high surrogate.
void reset() noexcept;
private:
std::wstring _buffer16; // storage that the `out` view of operator() refers to; rewritten by every call
wchar_t _highSurrogate{}; // UTF-16 high surrogate that has to be cached
size_t _cached{}; // 1 if a high surrogate has been cached, 0 otherwise
};
// UTF-8 -> UTF-16.
// If discardInvalids is true, invalid sequences are dropped instead of being replaced with U+FFFD.
// The HRESULT is S_OK, S_FALSE (invalid sequences were found), or a failure code.
// - u8u16     appends with push_back
// - u8u16_ptr writes through a pointer into a pre-sized string
// - the u8state overload passes the chunk through `state` before converting
[[nodiscard]] HRESULT u8u16(const std::string_view in, std::wstring& out, bool discardInvalids = false) noexcept;
[[nodiscard]] HRESULT u8u16_ptr(const std::string_view in, std::wstring& out, bool discardInvalids = false) noexcept;
[[nodiscard]] HRESULT u8u16(const std::string_view in, std::wstring& out, u8state& state, bool discardInvalids = false) noexcept;
// UTF-16 -> UTF-8, same conventions as above.
// The u16state overload passes the chunk through `state` before converting.
[[nodiscard]] HRESULT u16u8(const std::wstring_view in, std::string& out, bool discardInvalids = false) noexcept;
[[nodiscard]] HRESULT u16u8_ptr(const std::wstring_view in, std::string& out, bool discardInvalids = false) noexcept;
[[nodiscard]] HRESULT u16u8(const std::wstring_view in, std::string& out, u16state& state, bool discardInvalids = false) noexcept;
// Convenience overloads that return the converted string and throw std::runtime_error
// if the underlying HRESULT-returning overload fails.
std::wstring u8u16(const std::string_view in, bool discardInvalids = false);
std::wstring u8u16(const std::string_view in, u8state& state, bool discardInvalids = false);
std::string u16u8(const std::wstring_view in, bool discardInvalids = false);
std::string u16u8(const std::wstring_view in, u16state& state, bool discardInvalids = false);

View File

@ -0,0 +1,129 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Import Project="..\..\..\packages\Microsoft.Windows.CppWinRT.2.0.191217.1\build\native\Microsoft.Windows.CppWinRT.props" Condition="Exists('..\..\..\packages\Microsoft.Windows.CppWinRT.2.0.191217.1\build\native\Microsoft.Windows.CppWinRT.props')" />
<PropertyGroup Label="Globals">
<CppWinRTOptimized>true</CppWinRTOptimized>
<CppWinRTRootNamespaceAutoMerge>true</CppWinRTRootNamespaceAutoMerge>
<CppWinRTGenerateWindowsMetadata>true</CppWinRTGenerateWindowsMetadata>
<MinimalCoreWin>true</MinimalCoreWin>
<VCProjectVersion>15.0</VCProjectVersion>
<ProjectGuid>{a602a555-baac-46e1-a91d-3dab0475c5a1}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>U8U16Test</RootNamespace>
<WindowsTargetPlatformVersion Condition=" '$(WindowsTargetPlatformVersion)' == '' ">10.0</WindowsTargetPlatformVersion>
<WindowsTargetPlatformMinVersion>10.0.17134.0</WindowsTargetPlatformMinVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|Win32">
<Configuration>Release</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<PlatformToolset>v140</PlatformToolset>
<PlatformToolset Condition="'$(VisualStudioVersion)' == '15.0'">v141</PlatformToolset>
<PlatformToolset Condition="'$(VisualStudioVersion)' == '16.0'">v142</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'" Label="Configuration">
<UseDebugLibraries>true</UseDebugLibraries>
<LinkIncremental>true</LinkIncremental>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'" Label="Configuration">
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<LinkIncremental>false</LinkIncremental>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Label="Shared">
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Project="PropertySheet.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<ItemDefinitionGroup>
<ClCompile>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<PrecompiledHeaderFile>
</PrecompiledHeaderFile>
<PrecompiledHeaderOutputFile>
</PrecompiledHeaderOutputFile>
<PreprocessorDefinitions>_CONSOLE;WIN32_LEAN_AND_MEAN;WINRT_LEAN_AND_MEAN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<WarningLevel>Level4</WarningLevel>
<AdditionalOptions>%(AdditionalOptions) /permissive- /bigobj</AdditionalOptions>
</ClCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateWindowsMetadata>false</GenerateWindowsMetadata>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Platform)'=='Win32'">
<ClCompile>
<PreprocessorDefinitions>WIN32;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<GenerateWindowsMetadata>false</GenerateWindowsMetadata>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="U8U16Test.hpp" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="main.cpp" />
<ClCompile Include="U8U16Test.cpp" />
</ItemGroup>
<ItemGroup>
<None Include="packages.config" />
<None Include="PropertySheet.props" />
<Text Include="readme.txt">
<DeploymentContent>false</DeploymentContent>
</Text>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="..\..\..\packages\Microsoft.Windows.CppWinRT.2.0.191217.1\build\native\Microsoft.Windows.CppWinRT.targets" Condition="Exists('..\..\..\packages\Microsoft.Windows.CppWinRT.2.0.191217.1\build\native\Microsoft.Windows.CppWinRT.targets')" />
</ImportGroup>
<Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
<PropertyGroup>
<ErrorText>Dieses Projekt verweist auf mindestens ein NuGet-Paket, das auf diesem Computer fehlt. Verwenden Sie die Wiederherstellung von NuGet-Paketen, um die fehlenden Dateien herunterzuladen. Weitere Informationen finden Sie unter "http://go.microsoft.com/fwlink/?LinkID=322105". Die fehlende Datei ist "{0}".</ErrorText>
</PropertyGroup>
<Error Condition="!Exists('..\..\..\packages\Microsoft.Windows.CppWinRT.2.0.191217.1\build\native\Microsoft.Windows.CppWinRT.props')" Text="$([System.String]::Format('$(ErrorText)', '..\..\..\packages\Microsoft.Windows.CppWinRT.2.0.191217.1\build\native\Microsoft.Windows.CppWinRT.props'))" />
<Error Condition="!Exists('..\..\..\packages\Microsoft.Windows.CppWinRT.2.0.191217.1\build\native\Microsoft.Windows.CppWinRT.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\..\..\packages\Microsoft.Windows.CppWinRT.2.0.191217.1\build\native\Microsoft.Windows.CppWinRT.targets'))" />
</Target>
</Project>

View File

@ -0,0 +1,37 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
<Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
<Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
</Filter>
</ItemGroup>
<ItemGroup>
<ClInclude Include="U8U16Test.hpp">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="main.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="U8U16Test.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<None Include="PropertySheet.props" />
<None Include="packages.config" />
</ItemGroup>
<ItemGroup>
<Text Include="readme.txt" />
</ItemGroup>
</Project>

View File

@ -0,0 +1,6 @@
:: TEST TOOL U8U16Test
:: Runs the x64 Release build of U8U16Test.exe and waits for a key press afterwards.
@echo off &setlocal
:: make the directory of this script the current directory, so that the relative path below resolves
cd /d "%~dp0"
..\..\..\x64\Release\U8U16Test.exe
:: print an empty line, then wait for a key press
echo(
pause

View File

@ -0,0 +1,2 @@
Retrieved from https://en.wikipedia.org/wiki/Microsoft on 2020-01-17. Uses https://creativecommons.org/licenses/by-sa/3.0/ license.
the first time Steve a "devices and services" replaced computer production market as CEO with in 2000, Surface line and strategy. This envisioned with 2012 Microsoft acquiring later Danger, Ballmer Inc. in entering the personal computers for Gates in June of the Microsoft of the launch unfolded later tablet, and 2008 f

View File

@ -0,0 +1,2 @@
Retrieved from https://en.wikipedia.org/wiki/Microsoft on 2020-01-17. Uses https://creativecommons.org/licenses/by-sa/3.0/ license.
la grande qui stratégie commerciale Elle confère à la « vente liée » : Windows sur personnels, d'exclusivité de matériels. La a les fabricants dominante imposé par Microsoft sur la passés et arsenal d'accords à l'international majorité des s'appuie vigoureuse société une menée position ordinateurs avec distributeurs de

View File

@ -0,0 +1,558 @@
// TEST TOOL U8U16Test
// Performance tests for UTF-8 <--> UTF-16 conversions, related to PR #4093
// NOTE The functions u8u16 and u16u8 contain own algorithms. Tests have shown that they perform
// worse than the platform API functions.
// Thus, these functions are *unrelated* to the til::u8u16 and til::u16u8 implementation.
#include <iostream>
#include <memory>
#include <chrono>
#include <random>
#include <fstream>
#include <sstream>
#include "U8U16Test.hpp"
// Function pointer types with the parameter lists of the two NT conversion routines.
typedef NTSTATUS(WINAPI* t_RtlUTF8ToUnicodeN)(PWSTR, ULONG, PULONG, PCCH, ULONG);
typedef NTSTATUS(WINAPI* t_RtlUnicodeToUTF8N)(PCHAR, ULONG, PULONG, PCWSTR, ULONG);
// Global function pointers, value-initialized (null) here.
// NOTE(review): presumably assigned at run time before the Rtl* tests are called - the
// assignment is not visible in this part of the file; confirm.
// UTF-8 -> UTF-16; the parameter names indicate that the sizes are byte counts.
NTSTATUS(WINAPI* p_RtlUTF8ToUnicodeN)
(
_Out_ PWSTR UnicodeStringDestination,
_In_ ULONG UnicodeStringMaxByteCount,
_Out_opt_ PULONG UnicodeStringActualByteCount,
_In_ PCCH UTF8StringSource,
_In_ ULONG UTF8StringByteCount){};
// UTF-16 -> UTF-8; the tests below pass the source size in bytes (length * 2).
NTSTATUS(WINAPI* p_RtlUnicodeToUTF8N)
(
_Out_ PCHAR UTF8StringDestination,
_In_ ULONG UTF8StringMaxByteCount,
_Out_opt_ PULONG UTF8StringActualByteCount,
_In_ PCWSTR UnicodeStringSource,
_In_ ULONG UnicodeStringWCharCount){};
// helper functions
double GetDuration();
ptrdiff_t RandomIndex(ptrdiff_t length);
void PrintHeader(const char* const funcName);
// test functions
void WideCharToMultiByte_WholeString(std::wstring_view testU16)
{
PrintHeader(__func__);
GetDuration();
std::unique_ptr<char[]> u8Buffer{ std::make_unique<char[]>(testU16.length() * 3) };
const int length = WideCharToMultiByte(65001, 0, testU16.data(), static_cast<int>(testU16.length()), u8Buffer.get(), static_cast<int>(testU16.length()) * 3, nullptr, nullptr);
const double duration = GetDuration();
const char randElem8 = u8Buffer[RandomIndex(static_cast<ptrdiff_t>(length))];
u8Buffer.reset();
std::cout << " ignore me " << static_cast<int>(static_cast<unsigned char>(randElem8))
<< "\n length " << length << "\n elapsed " << duration << std::endl;
}
void RtlUnicodeToUTF8N_WholeString(std::wstring_view testU16)
{
PrintHeader(__func__);
ULONG written{};
GetDuration();
std::unique_ptr<char[]> u8Buffer{ std::make_unique<char[]>(testU16.length() * 3) };
const NTSTATUS status = p_RtlUnicodeToUTF8N(u8Buffer.get(), static_cast<ULONG>(testU16.length()) * 3, &written, testU16.data(), static_cast<ULONG>(testU16.length() * 2));
const double duration = GetDuration();
const char randElem8 = u8Buffer[RandomIndex(static_cast<ptrdiff_t>(written))];
u8Buffer.reset();
std::cout << " ignore me " << static_cast<int>(static_cast<unsigned char>(randElem8))
<< "\n NTSTATUS " << status << "\n length " << written << "\n elapsed " << duration << std::endl;
}
void u16u8_WholeString(std::wstring_view testU16, std::string& u8Str)
{
PrintHeader(__func__);
GetDuration();
const HRESULT hRes = u16u8(testU16, u8Str);
const double duration = GetDuration();
const char randElem8 = u8Str.at(RandomIndex(static_cast<ptrdiff_t>(u8Str.length())));
std::cout << " ignore me " << static_cast<int>(static_cast<unsigned char>(randElem8))
<< "\n HRESULT " << hRes << "\n length " << u8Str.length() << "\n elapsed " << duration << std::endl;
}
void u16u8_ptr_WholeString(std::wstring_view testU16, std::string& u8Str)
{
PrintHeader(__func__);
GetDuration();
const HRESULT hRes = u16u8_ptr(testU16, u8Str);
const double duration = GetDuration();
const char randElem8 = u8Str.at(RandomIndex(static_cast<ptrdiff_t>(u8Str.length())));
std::cout << " ignore me " << static_cast<int>(static_cast<unsigned char>(randElem8))
<< "\n HRESULT " << hRes << "\n length " << u8Str.length() << "\n elapsed " << duration << std::endl;
}
// Measures WideCharToMultiByte converting the UTF-16 test string in consecutive chunks.
// Arguments:
// - testU16   - UTF-16 test string
// - u8CharLen - number of UTF-8 bytes allocated per UTF-16 code unit of a chunk
// - chunkLen  - number of UTF-16 code units per chunk (must not be 0);
//               a remainder shorter than chunkLen at the end of the string is not converted
// Only the buffer allocation and the API calls are timed. The same buffer is reused for every chunk.
// A random element of the buffer is printed so that the conversion can't be optimized away.
void WideCharToMultiByte_Chunks(std::wstring_view testU16, size_t u8CharLen, size_t chunkLen)
{
    PrintHeader(__func__);
    const size_t endLoop{ testU16.length() / chunkLen };
    const size_t bufferLen{ chunkLen * u8CharLen };
    double duration{};
    GetDuration();
    std::unique_ptr<char[]> u8Buffer{ std::make_unique<char[]>(bufferLen) };
    duration += GetDuration();
    int length{};
    for (size_t i{}; i < endLoop; ++i)
    {
        // the i-th chunk begins at code unit i * chunkLen, so the chunks are adjacent and don't overlap
        const std::wstring_view sv{ testU16.substr(i * chunkLen, chunkLen) };
        GetDuration();
        // pass the size that was really allocated, so the API can never write beyond the buffer
        length += WideCharToMultiByte(65001, 0, sv.data(), static_cast<int>(sv.length()), u8Buffer.get(), static_cast<int>(bufferLen), nullptr, nullptr);
        duration += GetDuration();
    }
    const char randElem8 = u8Buffer[RandomIndex(static_cast<ptrdiff_t>(bufferLen))];
    u8Buffer.reset();
    std::cout << " ignore me " << static_cast<int>(static_cast<unsigned char>(randElem8))
              << "\n length " << length << "\n elapsed " << duration << std::endl;
}
// Measures RtlUnicodeToUTF8N (called through p_RtlUnicodeToUTF8N) converting the UTF-16
// test string in consecutive chunks.
// Arguments:
// - testU16   - UTF-16 test string
// - u8CharLen - number of UTF-8 bytes allocated per UTF-16 code unit of a chunk
// - chunkLen  - number of UTF-16 code units per chunk (must not be 0);
//               a remainder shorter than chunkLen at the end of the string is not converted
// Only the buffer allocation and the API calls are timed. The same buffer is reused for every chunk.
// The printed NTSTATUS is that of the last chunk.
// A random element of the buffer is printed so that the conversion can't be optimized away.
void RtlUnicodeToUTF8N_Chunks(std::wstring_view testU16, size_t u8CharLen, size_t chunkLen)
{
    PrintHeader(__func__);
    const size_t endLoop{ testU16.length() / chunkLen };
    const size_t bufferLen{ chunkLen * u8CharLen };
    double duration{};
    ULONG written{};
    ULONG total{};
    NTSTATUS status{};
    GetDuration();
    std::unique_ptr<char[]> u8Buffer{ std::make_unique<char[]>(bufferLen) };
    duration += GetDuration();
    for (size_t i{}; i < endLoop; ++i)
    {
        // the i-th chunk begins at code unit i * chunkLen, so the chunks are adjacent and don't overlap
        const std::wstring_view sv{ testU16.substr(i * chunkLen, chunkLen) };
        GetDuration();
        // pass the size that was really allocated, so the API can never write beyond the buffer;
        // the source size is given in bytes
        status = p_RtlUnicodeToUTF8N(u8Buffer.get(), static_cast<ULONG>(bufferLen), &written, sv.data(), static_cast<ULONG>(sv.length() * 2));
        duration += GetDuration();
        total += written;
    }
    const char randElem8 = u8Buffer[RandomIndex(static_cast<ptrdiff_t>(bufferLen))];
    u8Buffer.reset();
    std::cout << " ignore me " << static_cast<int>(static_cast<unsigned char>(randElem8))
              << "\n NTSTATUS " << status << "\n length " << total << "\n elapsed " << duration << std::endl;
}
// Measures u16u8 converting the UTF-16 test string in consecutive chunks.
// Arguments:
// - testU16  - UTF-16 test string
// - chunkLen - number of UTF-16 code units per chunk (must not be 0);
//              a remainder shorter than chunkLen at the end of the string is not converted
// Only the calls of u16u8 are timed. The printed HRESULT is that of the last chunk.
// A random element of the last result is printed so that the conversion can't be optimized away.
void u16u8_Chunks(std::wstring_view testU16, size_t chunkLen)
{
    PrintHeader(__func__);
    const size_t endLoop{ testU16.length() / chunkLen };
    double duration{};
    size_t length{};
    HRESULT hRes{};
    std::string u8Str{};
    for (size_t i{}; i < endLoop; ++i)
    {
        // the i-th chunk begins at code unit i * chunkLen, so the chunks are adjacent and don't overlap
        const std::wstring_view sv{ testU16.substr(i * chunkLen, chunkLen) };
        GetDuration();
        hRes = u16u8(sv, u8Str);
        duration += GetDuration();
        length += u8Str.length();
    }
    const char randElem8 = u8Str.at(RandomIndex(static_cast<ptrdiff_t>(u8Str.length())));
    std::cout << " ignore me " << static_cast<int>(static_cast<unsigned char>(randElem8))
              << "\n HRESULT " << hRes << "\n length " << length << "\n elapsed " << duration << std::endl;
}
void u16u8_ptr_Chunks(std::wstring_view testU16, size_t chunkLen)
{
PrintHeader(__func__);
const size_t endLoop{ testU16.length() / chunkLen };
double duration{};
size_t length{};
HRESULT hRes{};
std::string u8Str{};
for (size_t i{}; i < endLoop; ++i)
{
const std::wstring_view sv{ &testU16.at(i), chunkLen };
GetDuration();
hRes = u16u8_ptr(sv, u8Str);
duration += GetDuration();
length += u8Str.length();
}
const char randElem8 = u8Str.at(RandomIndex(static_cast<ptrdiff_t>(u8Str.length())));
std::cout << " ignore me " << static_cast<int>(static_cast<unsigned char>(randElem8))
<< "\n HRESULT " << hRes << "\n length " << length << "\n elapsed " << duration << std::endl;
}
// Benchmarks MultiByteToWideChar (code page 65001, UTF-8) converting u8Str in a single call.
// The measured time covers the buffer allocation and the conversion.
// Prints the return value of the call and the elapsed time.
void MultiByteToWideChar_WholeString(std::string_view u8Str)
{
    PrintHeader(__func__);
    GetDuration();
    // one wchar_t per UTF-8 byte is the capacity reported to the API below
    std::unique_ptr<wchar_t[]> u16Buffer{ std::make_unique<wchar_t[]>(u8Str.length()) };
    const int length = MultiByteToWideChar(65001, 0, u8Str.data(), static_cast<int>(u8Str.length()), u16Buffer.get(), static_cast<int>(u8Str.length()));
    const double duration = GetDuration();

    // Print one arbitrary element of the result (presumably to keep the conversion from
    // being optimized away). If nothing was converted, RandomIndex returns -1, which
    // must not be used as an index.
    const ptrdiff_t randIdx = RandomIndex(static_cast<ptrdiff_t>(length));
    const wchar_t randElem16 = randIdx < 0 ? L'\0' : u16Buffer[randIdx];
    u16Buffer.reset();
    std::cout << " ignore me " << static_cast<int>(randElem16)
              << "\n length " << length << "\n elapsed " << duration << std::endl;
}
// Benchmarks RtlUTF8ToUnicodeN (called through p_RtlUTF8ToUnicodeN) converting u8Str in a single call.
// The measured time covers the buffer allocation and the conversion.
// Prints the NTSTATUS, the number of UTF-16 code units written and the elapsed time.
void RtlUTF8ToUnicodeN_WholeString(std::string_view u8Str)
{
    PrintHeader(__func__);
    ULONG written{};
    GetDuration();
    // one wchar_t per UTF-8 byte; the API takes the capacity in bytes
    std::unique_ptr<wchar_t[]> u16Buffer{ std::make_unique<wchar_t[]>(u8Str.length()) };
    const NTSTATUS status = p_RtlUTF8ToUnicodeN(u16Buffer.get(), static_cast<ULONG>(u8Str.length() * sizeof(wchar_t)), &written, u8Str.data(), static_cast<ULONG>(u8Str.length()));
    const double duration = GetDuration();

    // `written` is a byte count, the buffer is indexed in wchar_t units
    const size_t u16Length{ written / sizeof(wchar_t) };

    // Print one arbitrary element of the result (presumably to keep the conversion from
    // being optimized away). If nothing was written, RandomIndex returns -1, which
    // must not be used as an index.
    const ptrdiff_t randIdx = RandomIndex(static_cast<ptrdiff_t>(u16Length));
    const wchar_t randElem16 = randIdx < 0 ? L'\0' : u16Buffer[randIdx];
    u16Buffer.reset();
    std::cout << " ignore me " << static_cast<int>(randElem16)
              << "\n NTSTATUS " << status << "\n length " << u16Length << "\n elapsed " << duration << std::endl;
}
void u8u16_WholeString(std::string_view u8Str)
{
PrintHeader(__func__);
GetDuration();
std::wstring u16Str{};
const HRESULT hRes = u8u16(u8Str, u16Str);
const double duration = GetDuration();
const wchar_t randElem16 = u16Str.at(RandomIndex(static_cast<ptrdiff_t>(u16Str.length())));
std::cout << " ignore me " << static_cast<int>(randElem16)
<< "\n HRESULT " << hRes << "\n length " << u16Str.length() << "\n elapsed " << duration << std::endl;
}
void u8u16_ptr_WholeString(std::string_view u8Str)
{
PrintHeader(__func__);
GetDuration();
std::wstring u16Str{};
const HRESULT hRes = u8u16_ptr(u8Str, u16Str);
const double duration = GetDuration();
const wchar_t randElem16 = u16Str.at(RandomIndex(static_cast<ptrdiff_t>(u16Str.length())));
std::cout << " ignore me " << static_cast<int>(randElem16)
<< "\n HRESULT " << hRes << "\n length " << u16Str.length() << "\n elapsed " << duration << std::endl;
}
void MultiByteToWideChar_Chunks(std::string_view u8Str, size_t u8CharLen, size_t u16ChunkLen)
{
PrintHeader(__func__);
const size_t endLoop{ u8Str.length() / u16ChunkLen };
double duration{};
int length{};
GetDuration();
std::unique_ptr<wchar_t[]> u16Buffer{ std::make_unique<wchar_t[]>(u8Str.length()) };
duration += GetDuration();
for (size_t i{}; i < endLoop; i += u8CharLen)
{
const std::string_view sv{ &u8Str.at(i), u16ChunkLen * u8CharLen };
GetDuration();
length += MultiByteToWideChar(65001, 0, sv.data(), static_cast<int>(sv.length()), u16Buffer.get(), static_cast<int>(sv.length()));
duration += GetDuration();
}
const wchar_t randElem16 = u16Buffer[RandomIndex(static_cast<ptrdiff_t>(u16ChunkLen))];
u16Buffer.reset();
std::cout << " ignore me " << static_cast<int>(randElem16)
<< "\n length " << length << "\n elapsed " << duration << std::endl;
}
void RtlUTF8ToUnicodeN_Chunks(std::string_view u8Str, size_t u8CharLen, size_t u16ChunkLen)
{
PrintHeader(__func__);
const size_t endLoop{ u8Str.length() / u16ChunkLen };
double duration{};
ULONG written{};
ULONG total{};
NTSTATUS status{};
GetDuration();
std::unique_ptr<wchar_t[]> u16Buffer{ std::make_unique<wchar_t[]>(u8Str.length()) };
duration += GetDuration();
for (size_t i{}; i < endLoop; i += u8CharLen)
{
const std::string_view sv{ &u8Str.at(i), u16ChunkLen * u8CharLen };
GetDuration();
status = p_RtlUTF8ToUnicodeN(u16Buffer.get(), static_cast<ULONG>(sv.length() * sizeof(wchar_t)), &written, sv.data(), static_cast<ULONG>(sv.length()));
duration += GetDuration();
total += written;
}
const wchar_t randElem16 = u16Buffer[RandomIndex(static_cast<ptrdiff_t>(u16ChunkLen))];
u16Buffer.reset();
std::cout << " ignore me " << static_cast<int>(randElem16)
<< "\n NTSTATUS " << status << "\n length " << (total / sizeof(wchar_t)) << "\n elapsed " << duration << std::endl;
}
void u8u16_Chunks(std::string_view u8Str, size_t u8CharLen, size_t u16ChunkLen)
{
PrintHeader(__func__);
const size_t endLoop{ u8Str.length() / u16ChunkLen };
double duration{};
size_t length{};
HRESULT hRes{};
std::wstring u16Str{};
for (size_t i{}; i < endLoop; i += u8CharLen)
{
const std::string_view sv{ &u8Str.at(i), u16ChunkLen * u8CharLen };
GetDuration();
hRes = u8u16(sv, u16Str);
duration += GetDuration();
length += u16Str.length();
}
const wchar_t randElem16 = u16Str.at(RandomIndex(static_cast<ptrdiff_t>(u16Str.length())));
std::cout << " ignore me " << static_cast<int>(randElem16)
<< "\n HRESULT " << hRes << "\n length " << length << "\n elapsed " << duration << std::endl;
}
void u8u16_ptr_Chunks(std::string_view u8Str, size_t u8CharLen, size_t u16ChunkLen)
{
PrintHeader(__func__);
const size_t endLoop{ u8Str.length() / u16ChunkLen };
double duration{};
size_t length{};
HRESULT hRes{};
std::wstring u16Str{};
for (size_t i{}; i < endLoop; i += u8CharLen)
{
const std::string_view sv{ &u8Str.at(i), u16ChunkLen * u8CharLen };
GetDuration();
hRes = u8u16_ptr(sv, u16Str);
duration += GetDuration();
length += u16Str.length();
}
const wchar_t randElem16 = u16Str.at(RandomIndex(static_cast<ptrdiff_t>(u16Str.length())));
std::cout << " ignore me " << static_cast<int>(randElem16)
<< "\n HRESULT " << hRes << "\n length " << length << "\n elapsed " << duration << std::endl;
}
// Compares MultiByteToWideChar with u8u16_ptr and WideCharToMultiByte with u16u8_ptr on
// the content of the file fileName, repeated 300000 times and converted in one call each.
// For every function the resulting length and the elapsed time are printed.
void CompNaturalLang_WholeString(const std::string& fileName)
{
    std::string title{ __func__ };
    title.append(" - ").append(fileName);
    PrintHeader(title.c_str());

    // read the file once, then build the test string out of 300000 copies of it
    std::ostringstream fileContent{};
    fileContent << std::ifstream{ fileName }.rdbuf();
    const std::string singleCopy{ fileContent.str() };
    std::ostringstream repeated{};
    for (unsigned copy{}; copy < 300000u; ++copy)
    {
        repeated << singleCopy.c_str();
    }
    std::string u8Str = repeated.str();

    // UTF-8 to UTF-16 using the Win32 API (allocation included in the measurement)
    GetDuration();
    auto wideBuffer = std::make_unique<wchar_t[]>(u8Str.length());
    const int wideLength{ MultiByteToWideChar(65001, 0, u8Str.data(), static_cast<int>(u8Str.length()), wideBuffer.get(), static_cast<int>(u8Str.length())) };
    const double wideElapsed{ GetDuration() };
    wideBuffer.reset();
    std::cout << " MultiByteToWideChar length " << wideLength << " elapsed " << wideElapsed << std::endl;

    // UTF-8 to UTF-16 using u8u16_ptr
    GetDuration();
    std::wstring u16Str{};
    HRESULT hRes = u8u16_ptr(u8Str, u16Str);
    const double u8u16Elapsed{ GetDuration() };
    std::cout << " u8u16_ptr length " << u16Str.length() << " elapsed " << u8u16Elapsed << std::endl;

    // UTF-16 to UTF-8 using the Win32 API (allocation included in the measurement)
    GetDuration();
    auto narrowBuffer = std::make_unique<char[]>(u16Str.length() * 3);
    const int narrowLength{ WideCharToMultiByte(65001, 0, u16Str.data(), static_cast<int>(u16Str.length()), narrowBuffer.get(), static_cast<int>(u16Str.length()) * 3, nullptr, nullptr) };
    const double narrowElapsed{ GetDuration() };
    narrowBuffer.reset();
    std::cout << " WideCharToMultiByte length " << narrowLength << " elapsed " << narrowElapsed << std::endl;

    // UTF-16 to UTF-8 using u16u8_ptr
    GetDuration();
    std::string u8StrOut{};
    hRes = u16u8_ptr(u16Str, u8StrOut);
    const double u16u8Elapsed{ GetDuration() };
    std::cout << " u16u8_ptr length " << u8StrOut.length() << " elapsed " << u16u8Elapsed << std::endl;
}
// Compares MultiByteToWideChar with u8u16_ptr and WideCharToMultiByte with u16u8_ptr on
// the content of the file fileName, repeated 300000 times and converted in chunks of
// 10 UTF-16 code units. For every function the summed length and the summed elapsed
// time are printed. Nothing is printed but the header if the initial conversion of
// the test string to UTF-16 fails.
void CompNaturalLang_Chunks(const std::string& fileName)
{
    std::string head{ __func__ };
    head += " - " + fileName;
    PrintHeader(head.c_str());

    // build the test string out of 300000 copies of the file content
    std::ostringstream u8Ss{};
    std::ostringstream buf{};
    buf << std::ifstream{ fileName }.rdbuf();
    std::fill_n(std::ostream_iterator<const char*>{ u8Ss }, 300000u, buf.str().c_str());
    std::string u8Str = u8Ss.str();

    // UTF-16 version of the test string; the chunks are cut out of it
    std::wstring u16Str{};
    if (FAILED(u8u16_ptr(u8Str, u16Str)))
    {
        return;
    }

    constexpr const size_t chunkSize{ 10u };
    HRESULT hRes{};
    int lenTotalMB2WC{};
    int lenTotalWC2MB{};
    size_t lenTotalU8U16{};
    size_t lenTotalU16U8{};
    double durTotalMB2WC{};
    double durTotalWC2MB{};
    double durTotalU8U16{};
    double durTotalU16U8{};

    // the creation of each destination is charged to the function that uses it
    GetDuration();
    std::unique_ptr<wchar_t[]> u16Buffer{ std::make_unique<wchar_t[]>(chunkSize) };
    durTotalMB2WC += GetDuration();
    GetDuration();
    std::wstring u16StrOut{};
    durTotalU8U16 += GetDuration();
    GetDuration();
    std::unique_ptr<char[]> u8Buffer{ std::make_unique<char[]>(chunkSize * 3) };
    durTotalWC2MB += GetDuration();
    GetDuration();
    std::string u8StrOut{};
    durTotalU16U8 += GetDuration();

    for (size_t idx = 0u; idx < u16Str.length(); idx += chunkSize)
    {
        // cut the chunk in UTF-16 and derive the matching UTF-8 chunk from it (not timed)
        std::wstring u16Chunk{ u16Str.substr(idx, chunkSize) };
        std::string u8Chunk{ u16u8(u16Chunk) };

        GetDuration();
        // u16Buffer holds chunkSize elements, so that is the capacity reported to the API
        lenTotalMB2WC += MultiByteToWideChar(65001, 0, u8Chunk.data(), static_cast<int>(u8Chunk.length()), u16Buffer.get(), static_cast<int>(chunkSize));
        durTotalMB2WC += GetDuration();

        GetDuration();
        hRes = u8u16_ptr(u8Chunk, u16StrOut);
        durTotalU8U16 += GetDuration();
        lenTotalU8U16 += u16StrOut.length();

        GetDuration();
        lenTotalWC2MB += WideCharToMultiByte(65001, 0, u16Chunk.data(), static_cast<int>(u16Chunk.length()), u8Buffer.get(), static_cast<int>(u16Chunk.length()) * 3, nullptr, nullptr);
        durTotalWC2MB += GetDuration();

        GetDuration();
        hRes = u16u8_ptr(u16Chunk, u8StrOut);
        durTotalU16U8 += GetDuration();
        lenTotalU16U8 += u8StrOut.length();
    }

    std::cout << " MultiByteToWideChar length " << lenTotalMB2WC << " elapsed " << durTotalMB2WC << std::endl;
    std::cout << " u8u16_ptr length " << lenTotalU8U16 << " elapsed " << durTotalU8U16 << std::endl;
    std::cout << " WideCharToMultiByte length " << lenTotalWC2MB << " elapsed " << durTotalWC2MB << std::endl;
    std::cout << " u16u8_ptr length " << lenTotalU16U8 << " elapsed " << durTotalU16U8 << std::endl;
}
// Runs all benchmarks: UTF-16 to UTF-8, UTF-8 to UTF-16 (each as whole string and in
// chunks), followed by the natural language comparisons using the files en.txt, fr.txt,
// ru.txt and zh.txt.
// Returns 0 on success, 1 if the ntdll functions can't be resolved or the test data
// doesn't meet the requirements of the chunked tests.
int main()
{
    // UTF-16 string length
    //constexpr const size_t u16Length{ 100000000u }; // 100,000,000 code points
    constexpr const size_t u16Length{ 10000000u }; // 10,000,000 code points
    // chunk length in code points
    constexpr const size_t chunkLen = 10u;
    // UTF-16 character to be used
    //const std::wstring testU16(u16Length, static_cast<wchar_t>(0x007E)); // TILDE (1 Byte in UTF-8)
    //const std::wstring testU16(u16Length, static_cast<wchar_t>(0x00F6)); // LATIN SMALL LETTER O WITH DIAERESIS (2 Bytes in UTF-8)
    const std::wstring testU16(u16Length, static_cast<wchar_t>(0x20AC)); // EURO SIGN (3 Bytes in UTF-8)

    // the Rtl* conversion functions are resolved dynamically from ntdll.dll
    HMODULE ntdll = LoadLibraryA("ntdll.dll");
    if (ntdll != nullptr)
    {
        p_RtlUTF8ToUnicodeN = reinterpret_cast<t_RtlUTF8ToUnicodeN>(GetProcAddress(ntdll, "RtlUTF8ToUnicodeN"));
        p_RtlUnicodeToUTF8N = reinterpret_cast<t_RtlUnicodeToUTF8N>(GetProcAddress(ntdll, "RtlUnicodeToUTF8N"));
        if (!p_RtlUTF8ToUnicodeN || !p_RtlUnicodeToUTF8N)
        {
            FreeLibrary(ntdll);
            return 1;
        }
    }
    else
    {
        return 1;
    }

    // receives the UTF-8 version of testU16; it is the input of the UTF-8 to UTF-16 tests
    std::string u8Str{};
    std::cout << "### UTF-16 To UTF-8 ###" << std::endl;
    WideCharToMultiByte_WholeString(testU16);
    RtlUnicodeToUTF8N_WholeString(testU16);
    u16u8_WholeString(testU16, u8Str);
    u16u8_ptr_WholeString(testU16, u8Str);

    // testU16 consists of one repeated character, so this is the UTF-8 length of that character
    const size_t u8CharLen{ u8Str.length() / testU16.length() };
    const size_t u8ChunkLen{ u8CharLen * chunkLen };
    // A zero chunk length means that the conversion didn't deliver a usable UTF-8 string.
    // It would lead to a division by zero below and to endless loops in the chunked tests.
    if (u8ChunkLen == 0)
    {
        std::cerr << "The UTF-16 to UTF-8 conversion did not yield a usable string!" << std::endl;
        FreeLibrary(ntdll);
        return 1;
    }
    if (u8Str.length() % u8ChunkLen != 0)
    {
        std::cerr << "Chunk length has to be a divisor of string length!" << std::endl;
        FreeLibrary(ntdll);
        return 1;
    }

    WideCharToMultiByte_Chunks(testU16, u8CharLen, chunkLen);
    RtlUnicodeToUTF8N_Chunks(testU16, u8CharLen, chunkLen);
    u16u8_Chunks(testU16, chunkLen);
    u16u8_ptr_Chunks(testU16, chunkLen);

    std::cout << "\n\n### UTF-8 To UTF-16 ###" << std::endl;
    MultiByteToWideChar_WholeString(u8Str);
    RtlUTF8ToUnicodeN_WholeString(u8Str);
    u8u16_WholeString(u8Str);
    u8u16_ptr_WholeString(u8Str);
    MultiByteToWideChar_Chunks(u8Str, u8CharLen, chunkLen);
    RtlUTF8ToUnicodeN_Chunks(u8Str, u8CharLen, chunkLen);
    u8u16_Chunks(u8Str, u8CharLen, chunkLen);
    u8u16_ptr_Chunks(u8Str, u8CharLen, chunkLen);

    std::cout << "\n\n### Natural Languages ###" << std::endl;
    CompNaturalLang_WholeString("en.txt");
    CompNaturalLang_WholeString("fr.txt");
    CompNaturalLang_WholeString("ru.txt");
    CompNaturalLang_WholeString("zh.txt");
    CompNaturalLang_Chunks("en.txt");
    CompNaturalLang_Chunks("fr.txt");
    CompNaturalLang_Chunks("ru.txt");
    CompNaturalLang_Chunks("zh.txt");

    FreeLibrary(ntdll);
    return 0;
}
// returns the time in seconds elapsed between two calls (the return value of the first call is undefined)
double GetDuration()
{
    // steady_clock is monotonic, thus adjustments of the system time can't distort
    // (or even turn negative) the measured interval
    static std::chrono::steady_clock::time_point previous{};
    const auto current = std::chrono::steady_clock::now();
    // duration<double> with the default period counts in seconds
    const std::chrono::duration<double> elapsed = current - previous;
    previous = current;
    return elapsed.count();
}
// yields a uniformly distributed value in the range 0..(length - 1), or -1 if length is not positive
ptrdiff_t RandomIndex(ptrdiff_t length)
{
    // a single engine is shared by all calls; it is seeded from the system clock on first use
    static std::default_random_engine engine(static_cast<unsigned>(std::chrono::system_clock::now().time_since_epoch().count()));

    if (length <= 0)
    {
        return static_cast<ptrdiff_t>(-1);
    }

    const ptrdiff_t lastIndex{ length - 1 };
    std::uniform_int_distribution<ptrdiff_t> pick{ static_cast<ptrdiff_t>(0), lastIndex };
    return pick(engine);
}
// writes the banner that introduces the output of the test in function funcName to stdout
void PrintHeader(const char* const funcName)
{
    std::ostream& out = std::cout;
    out << "\n~~~\ntest \"";
    out << funcName;
    out << "\"" << std::endl;
}

View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="Microsoft.Windows.CppWinRT" version="2.0.191217.1" targetFramework="native" />
</packages>

View File

@ -0,0 +1,2 @@
Retrieved from https://en.wikipedia.org/wiki/Microsoft on 2020-01-17. Uses https://creativecommons.org/licenses/by-sa/3.0/ license.
собственный C планшетный Подразделения года — Surface. компании также производят 2012 консолей а для персональных компьютеров (клавиатуры, и т. д. мыши). производит компьютер семейство игровых Microsoft продаётся Xbox, более Продукция чем аксессуары странах в 80 программы переведены мира, более чем на 45 также языков.

View File

@ -0,0 +1,2 @@
Retrieved from https://en.wikipedia.org/wiki/Microsoft on 2020-01-17. Uses https://creativecommons.org/licenses/by-sa/3.0/ license.
微软曾超越苹果公司以及随后通过收购诺基亚设备形成微软移动和服务部门。微软公司於2014年接任首席执行官以来2008年收购Danger公司成为全球最有价值的上市公司。 Alphabet、自萨蒂亚纳德拉于该公司已缩减硬件规模史蒂夫鲍尔默于微软达到了一兆美元的市值2000随着微软在于2012年6月首次进入个人电脑生产市场年取代盖茨担任首席执行官Facebook的第五家股價市值超过这一举措帮助该公司股价达到1999年12成为仅次于苹果公司、2018年Surface系列平板电脑的推出后来设想了“设备和服务”战略。随着微软转而专注于云计算在2019年4月月以来的最高值。谷歌旗下1兆美元的美国上市公司。亚马逊、1975年由比

View File

@ -1,100 +0,0 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include "precomp.h"
#include "inc/Utf8OutPipeReader.hpp"
#include <type_traits>
#include <utility>
// Constructor Description:
// - Stores the handle of the pipe to read from and brace-initializes the read buffer
//   and the cache for partial UTF-8 sequences with 0.
// Arguments:
// - outPipe: the handle that Read passes to ReadFile.
// NOTE(review): whether this class is responsible for closing outPipe can't be told
//   from this file - confirm against the header.
UTF8OutPipeReader::UTF8OutPipeReader(HANDLE outPipe) noexcept :
_outPipe{ outPipe },
_buffer{ 0 },
_utf8Partials{ 0 }
{
}
// Method Description:
// Populates a string_view with *complete* UTF-8 codepoints read from the pipe.
// If it receives an incomplete codepoint, it will cache it until it can be completed.
// Note: This method trusts that the other end will, in fact, send complete codepoints.
// Note: The returned view refers to the internal buffer of the reader. The next call
// to Read overwrites that buffer, so the view has to be consumed before reading again.
// Arguments:
// - strView: on return, populated with successfully-read codepoints.
// Return Value:
// An HRESULT indicating whether the read was successful. For the purposes of this
// method, a closed pipe is considered a successful (but false!) read. All other errors
// are translated into an appropriate status code.
// S_OK for a successful read
// S_FALSE for a read on a closed pipe
// E_* (anything) for a failed read
[[nodiscard]] HRESULT UTF8OutPipeReader::Read(_Out_ std::string_view& strView)
{
DWORD dwRead{};
bool fSuccess{};
// in case of early escaping
_buffer.at(0) = 0;
strView = std::string_view{ _buffer.data(), 0 };
// copy UTF-8 code units that were remaining from the previously read chunk (if any)
if (_dwPartialsLen != 0)
{
std::move(_utf8Partials.cbegin(), _utf8Partials.cbegin() + _dwPartialsLen, _buffer.begin());
}
// try to read data
// (the new data is placed right behind the copied partials, hence the space left for
// ReadFile is the buffer size reduced by the number of partials)
fSuccess = !!ReadFile(_outPipe, &_buffer.at(_dwPartialsLen), gsl::narrow<DWORD>(_buffer.size()) - _dwPartialsLen, &dwRead, nullptr);
// from here on dwRead counts the copied partials plus the bytes just read;
// the cache of partials is considered consumed
dwRead += _dwPartialsLen;
_dwPartialsLen = 0;
if (!fSuccess) // reading failed (we must check this first, because dwRead will also be 0.)
{
const auto lastError = GetLastError();
if (lastError == ERROR_BROKEN_PIPE)
{
// This is a successful, but detectable, exit.
// There is a chance that we put some partials into the buffer. Since
// the pipe has closed, they're just invalid now. They're not worth
// reporting.
return S_FALSE;
}
return HRESULT_FROM_WIN32(lastError);
}
if (dwRead == 0) // quit if no data has been read and no cached data was left over
{
return S_OK;
}
// endPtr is one past the last valid byte in the buffer, backIter starts at the last valid byte
const auto endPtr = _buffer.cbegin() + dwRead;
auto backIter = endPtr - 1;
// If the last byte in the buffer was a byte belonging to a UTF-8 multi-byte character
if ((*backIter & _Utf8BitMasks::MaskAsciiByte) > _Utf8BitMasks::IsAsciiByte)
{
// Check only up to 3 last bytes, if no Lead Byte was found then the byte before must be the Lead Byte and no partials are in the buffer
// (backIter walks backwards through the buffer while dwSequenceLen counts the bytes inspected so far)
for (DWORD dwSequenceLen{ 1UL }; dwSequenceLen < std::min(dwRead, 4UL); ++dwSequenceLen, --backIter)
{
// If Lead Byte found
if ((*backIter & _Utf8BitMasks::MaskContinuationByte) > _Utf8BitMasks::IsContinuationByte)
{
// If the Lead Byte indicates that the last bytes in the buffer is a partial UTF-8 code point then cache them:
// Use the bitmask at index `dwSequenceLen`. Compare the result with the operand having the same index. If they
// are not equal then the sequence has to be cached because it is a partial code point. Otherwise the
// sequence is a complete UTF-8 code point and the whole buffer is ready for the conversion to hstring.
if ((*backIter & _cmpMasks.at(dwSequenceLen)) != _cmpOperands.at(dwSequenceLen))
{
// cache the trailing partial sequence and exclude it from the returned view
std::move(backIter, endPtr, _utf8Partials.begin());
dwRead -= dwSequenceLen;
_dwPartialsLen = dwSequenceLen;
}
break;
}
}
}
// give back a view of the part of the buffer that contains complete code points only
strView = std::string_view{ &_buffer.at(0), dwRead };
return S_OK;
}

View File

@ -6,7 +6,7 @@
<RootNamespace>types</RootNamespace>
<ProjectName>Types</ProjectName>
<TargetName>ConTypes</TargetName>
<ConfigurationType>StaticLibrary</ConfigurationType>
<ConfigurationType>StaticLibrary</ConfigurationType>
</PropertyGroup>
<Import Project="$(SolutionDir)src\common.build.pre.props" />
<ItemGroup>
@ -24,7 +24,6 @@
<ClCompile Include="..\ThemeUtils.cpp" />
<ClCompile Include="..\UiaTextRangeBase.cpp" />
<ClCompile Include="..\Utf16Parser.cpp" />
<ClCompile Include="..\UTF8OutPipeReader.cpp" />
<ClCompile Include="..\Viewport.cpp" />
<ClCompile Include="..\WindowBufferSizeEvent.cpp" />
<ClCompile Include="..\precomp.cpp">
@ -42,7 +41,6 @@
<ClInclude Include="..\inc\GlyphWidth.hpp" />
<ClInclude Include="..\inc\IInputEvent.hpp" />
<ClInclude Include="..\inc\ThemeUtils.h" />
<ClInclude Include="..\inc\UTF8OutPipeReader.hpp" />
<ClInclude Include="..\inc\utils.hpp" />
<ClInclude Include="..\inc\Viewport.hpp" />
<ClInclude Include="..\inc\Utf16Parser.hpp" />

View File

@ -57,9 +57,6 @@
<ClCompile Include="..\utils.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\UTF8OutPipeReader.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\ScreenInfoUiaProviderBase.cpp">
<Filter>Source Files</Filter>
</ClCompile>
@ -72,6 +69,9 @@
<ClCompile Include="..\ThemeUtils.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Environment.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\inc\IInputEvent.hpp">
@ -95,9 +95,6 @@
<ClInclude Include="..\inc\GlyphWidth.hpp">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\inc\UTF8OutPipeReader.hpp">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\IConsoleWindow.hpp">
<Filter>Header Files</Filter>
</ClInclude>
@ -122,9 +119,6 @@
<ClInclude Include="..\inc\IInputEvent.hpp">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\inc\UTF8OutPipeReader.hpp">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\inc\Viewport.hpp">
<Filter>Header Files</Filter>
</ClInclude>
@ -137,9 +131,6 @@
<ClInclude Include="..\ScreenInfoUiaProviderBase.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\utils.hpp">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\WindowUiaProviderBase.hpp">
<Filter>Header Files</Filter>
</ClInclude>
@ -161,6 +152,9 @@
<ClInclude Include="..\inc\ThemeUtils.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\inc\Environment.hpp">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Natvis Include="$(SolutionDir)tools\ConsoleTypes.natvis" />

View File

@ -6,11 +6,10 @@
<RootNamespace>TypesUnitTests</RootNamespace>
<ProjectName>Types.Unit.Tests</ProjectName>
<TargetName>Types.Unit.Tests</TargetName>
<ConfigurationType>DynamicLibrary</ConfigurationType>
<ConfigurationType>DynamicLibrary</ConfigurationType>
</PropertyGroup>
<Import Project="$(SolutionDir)src\common.build.pre.props" />
<ItemGroup>
<ClCompile Include="UTF8OutPipeReaderTests.cpp" />
<ClCompile Include="UtilsTests.cpp" />
<ClCompile Include="UuidTests.cpp" />
<ClCompile Include="..\precomp.cpp">

View File

@ -1,155 +0,0 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include "precomp.h"
#include "WexTestClass.h"
#include "..\..\inc\consoletaeftemplates.hpp"
#include "..\inc\UTF8OutPipeReader.hpp"
#include <winrt/Windows.Foundation.h>
#include <winrt/Windows.Foundation.Collections.h>
using namespace WEX::Common;
using namespace WEX::Logging;
using namespace WEX::TestExecution;
// Tests that UTF8OutPipeReader::Read never hands out a chunk that ends in the middle of
// a UTF-8 code point.
class UTF8OutPipeReaderTests
{
    TEST_CLASS(UTF8OutPipeReaderTests);

    TEST_METHOD(TestUtf8MergePartials)
    {
        // The test uses the character 'GOTHIC LETTER HWAIR' (U+10348) as an example
        // Its UTF-8 representation consists of four bytes:
        // 1 2 3 4
        // 0xF0 0x90 0x8D 0x88
        //
        // For the test a std::string is filled with 4104 '.' characters to make sure it exceeds the
        // buffer size of 4096 bytes in UTF8OutPipeReader.
        //
        // This figure shows how the string is getting changed for the 7 sub-tests. The digits 1 to 4
        // represent the four bytes of the 'Hwair' letter. The vertical bar represents the buffer boundary.
        // Test 1: [more points] . . S 1 2 3 4 T|U V W X Y Z . .
        // Test 2: [more points] . . S T 1 2 3 4|U V W X Y Z . .
        // Test 3: [more points] . . S T U 1 2 3|4 V W X Y Z . .
        // Test 4: [more points] . . S T U V 1 2|3 4 W X Y Z . .
        // Test 5: [more points] . . S T U V W 1|2 3 4 X Y Z . .
        // Test 6: [more points] . . S T U V W X|1 2 3 4 Y Z . .
        // Test 7: [more points] . . S T U V W X|Y 1 2 3 4 Z . .
        //
        // Tests 1, 6, and 7 prove proper ASCII handling.
        // Test 2 leaves all four bytes of 'Hwair' in the first chunk.
        // Test 3, 4, and 5 move the partials from the end of the first chunk to the begin of the
        // second chunk.
        //
        // At the beginning of a test the whole string is converted into a winrt::hstring for reference.
        // During the test a second hstring is concatenated out of the chunks that we get from
        // UTF8OutPipeReader::Read. Each chunk is separately converted to hstring in order to make
        // sure it would be corrupted if we get UTF-8 partials.
        // The test is positive if both hstrings are equal.
        const size_t bufferSize{ 4096 }; // NOTE: This has to match the buffer size in UTF8OutPipeReader!
        std::string utf8TestString(bufferSize + 8, '.'); // create a test string with the required size

        // Every sub-test overwrites the same 12 bytes: the 6 bytes in front of the buffer
        // boundary and the 6 bytes behind it.

        // Test 1: 'Hwair' ends one byte in front of the boundary
        utf8TestString.replace(bufferSize - 6, 12, "S\xF0\x90\x8D\x88TUVWXYZ");
        VERIFY_SUCCEEDED(RunTest(utf8TestString));

        // Test 2: 'Hwair' ends exactly at the boundary
        utf8TestString.replace(bufferSize - 6, 12, "ST\xF0\x90\x8D\x88UVWXYZ");
        VERIFY_SUCCEEDED(RunTest(utf8TestString));

        // Test 3: three bytes of 'Hwair' in front of the boundary, one byte behind it
        utf8TestString.replace(bufferSize - 6, 12, "STU\xF0\x90\x8D\x88VWXYZ");
        VERIFY_SUCCEEDED(RunTest(utf8TestString));

        // Test 4: two bytes of 'Hwair' in front of the boundary, two bytes behind it
        utf8TestString.replace(bufferSize - 6, 12, "STUV\xF0\x90\x8D\x88WXYZ");
        VERIFY_SUCCEEDED(RunTest(utf8TestString));

        // Test 5: one byte of 'Hwair' in front of the boundary, three bytes behind it
        utf8TestString.replace(bufferSize - 6, 12, "STUVW\xF0\x90\x8D\x88XYZ");
        VERIFY_SUCCEEDED(RunTest(utf8TestString));

        // Test 6: 'Hwair' begins exactly at the boundary
        utf8TestString.replace(bufferSize - 6, 12, "STUVWX\xF0\x90\x8D\x88YZ");
        VERIFY_SUCCEEDED(RunTest(utf8TestString));

        // Test 7: 'Hwair' begins one byte behind the boundary
        utf8TestString.replace(bufferSize - 6, 12, "STUVWXY\xF0\x90\x8D\x88Z");
        VERIFY_SUCCEEDED(RunTest(utf8TestString));
    }

    // The data handed over to the writer thread.
    struct ThreadData
    {
        wil::unique_hfile& inPipe; // write end of the pipe; closed by the thread after writing
        std::string& utf8TestString; // the data to be written
    };

    // Thread function which writes the UTF-8 data to the pipe.
    static DWORD WINAPI WritePipeThread(LPVOID threadArg)
    {
        ThreadData* pThreadData{ reinterpret_cast<ThreadData*>(threadArg) };
        DWORD length{};
        WriteFile(pThreadData->inPipe.get(), pThreadData->utf8TestString.c_str(), static_cast<DWORD>(pThreadData->utf8TestString.size()), &length, nullptr);
        // closing the write end makes the reader detect the end of the data
        pThreadData->inPipe.reset();
        return 0;
    }

    // Performs the sub-tests.
    // Arguments:
    // - utf8TestString: the UTF-8 data that is sent through the pipe
    // Return Value:
    // S_OK if the text concatenated from the received chunks equals the reference text,
    // E_FAIL if it differs or if the pipe or the writer thread couldn't be created.
    HRESULT RunTest(std::string& utf8TestString)
    {
        std::string_view strView{}; // contains the chunk that we get from UTF8OutPipeReader::Read
        const winrt::hstring utf16Expected{ winrt::to_hstring(utf8TestString) }; // contains the whole string converted to UTF-16
        winrt::hstring utf16Actual{}; // will be concatenated from the converted chunks
        wil::unique_hfile outPipe{};
        wil::unique_hfile inPipe{};
        SECURITY_ATTRIBUTES sa{ sizeof(SECURITY_ATTRIBUTES) };

        // create the pipe handles; without a pipe there is nothing to test
        if (!CreatePipe(&outPipe, &inPipe, &sa, 0))
        {
            return E_FAIL;
        }

        UTF8OutPipeReader reader{ outPipe.get() };
        ThreadData data{ inPipe, utf8TestString };
        wil::unique_handle threadHandle{ CreateThread(nullptr, 0, WritePipeThread, &data, 0, nullptr) }; // create a thread that writes to the pipe
        RETURN_HR_IF_NULL(E_FAIL, threadHandle.get());

        // process the chunks that we get from UTF8OutPipeReader::Read
        while (true)
        {
            // get a chunk of UTF-8 data
            THROW_IF_FAILED(reader.Read(strView));
            if (strView.empty())
            {
                // this is okay, no data left in the pipe
                break;
            }
            // convert the chunk to hstring and append it to the resulting hstring
            utf16Actual = utf16Actual + winrt::to_hstring(strView);
        }

        WaitForSingleObject(threadHandle.get(), 2000);

        // the test passed if both hstrings are equal
        if (utf16Actual == utf16Expected)
        {
            return S_OK;
        }
        return E_FAIL;
    }
};