Implement til::u8u16 and til::u16u8 conversion functions (#4093)
This commit also switches ConptyConnection to consume til::u8u16 and removes the UTF8OutPipeReader. Closes #4092.
This commit is contained in:
parent
1445380810
commit
32ea419c3d
|
@ -269,6 +269,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Tests", "Tests", "{BDB237B6
|
|||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "til.unit.tests", "src\til\ut_til\til.unit.tests.vcxproj", "{767268EE-174A-46FE-96F0-EEE698A1BBC9}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "U8U16Test", "src\tools\U8U16Test\U8U16Test.vcxproj", "{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
AuditMode|Any CPU = AuditMode|Any CPU
|
||||
|
@ -1374,6 +1376,26 @@ Global
|
|||
{767268EE-174A-46FE-96F0-EEE698A1BBC9}.Release|x64.Build.0 = Release|x64
|
||||
{767268EE-174A-46FE-96F0-EEE698A1BBC9}.Release|x86.ActiveCfg = Release|Win32
|
||||
{767268EE-174A-46FE-96F0-EEE698A1BBC9}.Release|x86.Build.0 = Release|Win32
|
||||
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.AuditMode|Any CPU.ActiveCfg = Release|x64
|
||||
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.AuditMode|Any CPU.Build.0 = Release|x64
|
||||
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.AuditMode|ARM64.ActiveCfg = Release|x64
|
||||
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.AuditMode|ARM64.Build.0 = Release|x64
|
||||
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.AuditMode|x64.ActiveCfg = Release|x64
|
||||
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.AuditMode|x64.Build.0 = Release|x64
|
||||
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.AuditMode|x86.ActiveCfg = Release|Win32
|
||||
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.AuditMode|x86.Build.0 = Release|Win32
|
||||
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.Debug|Any CPU.ActiveCfg = Debug|Win32
|
||||
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.Debug|ARM64.ActiveCfg = Debug|Win32
|
||||
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.Debug|x64.Build.0 = Debug|x64
|
||||
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.Debug|x86.ActiveCfg = Debug|Win32
|
||||
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.Debug|x86.Build.0 = Debug|Win32
|
||||
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.Release|Any CPU.ActiveCfg = Release|Win32
|
||||
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.Release|ARM64.ActiveCfg = Release|Win32
|
||||
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.Release|x64.ActiveCfg = Release|x64
|
||||
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.Release|x64.Build.0 = Release|x64
|
||||
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.Release|x86.ActiveCfg = Release|Win32
|
||||
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1}.Release|x86.Build.0 = Release|Win32
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
|
@ -1444,6 +1466,7 @@ Global
|
|||
{A021EDFF-45C8-4DC2-BEF7-36E1B3B8CFE8} = {BDB237B6-1D1D-400F-84CC-40A58FA59C8E}
|
||||
{BDB237B6-1D1D-400F-84CC-40A58FA59C8E} = {59840756-302F-44DF-AA47-441A9D673202}
|
||||
{767268EE-174A-46FE-96F0-EEE698A1BBC9} = {89CDCC5C-9F53-4054-97A4-639D99F169CD}
|
||||
{A602A555-BAAC-46E1-A91D-3DAB0475C5A1} = {A10C4720-DCA4-4640-9749-67F4314F527C}
|
||||
EndGlobalSection
|
||||
GlobalSection(ExtensibilityGlobals) = postSolution
|
||||
SolutionGuid = {3140B1B7-C8EE-43D1-A772-D82A7061A271}
|
||||
|
|
|
@ -15,7 +15,6 @@
|
|||
|
||||
#include "../../types/inc/Utils.hpp"
|
||||
#include "../../types/inc/Environment.hpp"
|
||||
#include "../../types/inc/UTF8OutPipeReader.hpp"
|
||||
#include "LibraryResources.h"
|
||||
|
||||
using namespace ::Microsoft::Console;
|
||||
|
@ -169,7 +168,10 @@ namespace winrt::Microsoft::Terminal::TerminalConnection::implementation
|
|||
_commandline{ commandline },
|
||||
_startingDirectory{ startingDirectory },
|
||||
_startingTitle{ startingTitle },
|
||||
_guid{ initialGuid }
|
||||
_guid{ initialGuid },
|
||||
_u8State{},
|
||||
_u16Str{},
|
||||
_buffer{}
|
||||
{
|
||||
if (_guid == guid{})
|
||||
{
|
||||
|
@ -344,14 +346,27 @@ namespace winrt::Microsoft::Terminal::TerminalConnection::implementation
|
|||
|
||||
DWORD ConptyConnection::_OutputThread()
|
||||
{
|
||||
UTF8OutPipeReader pipeReader{ _outPipe.get() };
|
||||
std::string_view strView{};
|
||||
|
||||
// process the data of the output pipe in a loop
|
||||
while (true)
|
||||
{
|
||||
const HRESULT result = pipeReader.Read(strView);
|
||||
if (FAILED(result) || result == S_FALSE)
|
||||
DWORD read{};
|
||||
|
||||
const auto readFail{ !ReadFile(_outPipe.get(), _buffer.data(), gsl::narrow_cast<DWORD>(_buffer.size()), &read, nullptr) };
|
||||
if (readFail) // reading failed (we must check this first, because read will also be 0.)
|
||||
{
|
||||
const auto lastError = GetLastError();
|
||||
if (lastError != ERROR_BROKEN_PIPE && !_isStateAtOrBeyond(ConnectionState::Closing))
|
||||
{
|
||||
// EXIT POINT
|
||||
_indicateExitWithStatus(HRESULT_FROM_WIN32(lastError)); // print a message
|
||||
_transitionToState(ConnectionState::Failed);
|
||||
return gsl::narrow_cast<DWORD>(HRESULT_FROM_WIN32(lastError));
|
||||
}
|
||||
// else we call til::u8u16 with an empty string_view to convert possible remaining partials to U+FFFD
|
||||
}
|
||||
|
||||
const HRESULT result{ til::u8u16(std::string_view{ _buffer.data(), read }, _u16Str, _u8State) };
|
||||
if (FAILED(result))
|
||||
{
|
||||
if (_isStateAtOrBeyond(ConnectionState::Closing))
|
||||
{
|
||||
|
@ -362,10 +377,10 @@ namespace winrt::Microsoft::Terminal::TerminalConnection::implementation
|
|||
// EXIT POINT
|
||||
_indicateExitWithStatus(result); // print a message
|
||||
_transitionToState(ConnectionState::Failed);
|
||||
return gsl::narrow_cast<DWORD>(-1);
|
||||
return gsl::narrow_cast<DWORD>(result);
|
||||
}
|
||||
|
||||
if (strView.empty())
|
||||
if (_u16Str.empty())
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
@ -386,11 +401,8 @@ namespace winrt::Microsoft::Terminal::TerminalConnection::implementation
|
|||
_recievedFirstByte = true;
|
||||
}
|
||||
|
||||
// Convert buffer to hstring
|
||||
auto hstr{ winrt::to_hstring(strView) };
|
||||
|
||||
// Pass the output to our registered event handlers
|
||||
_TerminalOutputHandlers(hstr);
|
||||
_TerminalOutputHandlers(_u16Str);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -52,6 +52,10 @@ namespace winrt::Microsoft::Terminal::TerminalConnection::implementation
|
|||
wil::unique_static_pseudoconsole_handle _hPC;
|
||||
wil::unique_threadpool_wait _clientExitWait;
|
||||
|
||||
til::u8state _u8State;
|
||||
std::wstring _u16Str;
|
||||
std::array<char, 4096> _buffer;
|
||||
|
||||
DWORD _OutputThread();
|
||||
};
|
||||
}
|
||||
|
|
|
@ -5,6 +5,7 @@
|
|||
|
||||
#include "til/at.h"
|
||||
#include "til/some.h"
|
||||
#include "til/u8u16convert.h"
|
||||
|
||||
namespace til // Terminal Implementation Library. Also: "Today I Learned"
|
||||
{
|
||||
|
|
|
@ -0,0 +1,458 @@
|
|||
/*++
|
||||
Copyright (c) Microsoft Corporation
|
||||
Licensed under the MIT license.
|
||||
|
||||
Module Name:
|
||||
- u8u16convert.h
|
||||
|
||||
Abstract:
|
||||
- Defines classes which hold the status of the current partials handling.
|
||||
- Defines functions for converting between UTF-8 and UTF-16 strings.
|
||||
|
||||
Tests have been made in order to investigate whether or not own algorithms
|
||||
could overcome disadvantages of syscalls. Test results can be read up
|
||||
in PR #4093 and the test algorithms are available in src\tools\U8U16Test.
|
||||
Based on the results the decision was made to keep using the platform
|
||||
functions MultiByteToWideChar and WideCharToMultiByte.
|
||||
|
||||
Author(s):
|
||||
- Steffen Illhardt (german-one) 2020
|
||||
--*/
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace til // Terminal Implementation Library. Also: "Today I Learned"
|
||||
{
|
||||
template<class charT>
|
||||
class u8u16state final
|
||||
{
|
||||
public:
|
||||
u8u16state() noexcept :
|
||||
_buffer{},
|
||||
_utfPartials{}
|
||||
{
|
||||
}
|
||||
|
||||
// Method Description:
|
||||
// - Takes a UTF-8 string and populates it with *complete* UTF-8 codepoints.
|
||||
// If it receives an incomplete codepoint, it will cache it until it can be completed.
|
||||
// Arguments:
|
||||
// - in - UTF-8 string_view potentially containing partial code points
|
||||
// - out - on return, populated with complete codepoints at the string end
|
||||
// Return Value:
|
||||
// - S_OK - the resulting string doesn't end with a partial
|
||||
// - S_FALSE - the resulting string contains the previously cached partials only
|
||||
// - E_OUTOFMEMORY - the method failed to allocate memory for the resulting string
|
||||
// - E_ABORT - the resulting string length would exceed the max_size and thus, the processing was aborted
|
||||
// - E_UNEXPECTED - an unexpected error occurred
|
||||
template<class T = charT>
|
||||
[[nodiscard]] typename std::enable_if<std::is_same<T, char>::value, HRESULT>::type
|
||||
operator()(const std::basic_string_view<T> in, std::basic_string_view<T>& out) noexcept
|
||||
{
|
||||
try
|
||||
{
|
||||
size_t remainingLength{ in.length() };
|
||||
size_t capacity{};
|
||||
if (FAILED(SizeTAdd(remainingLength, _partialsLen, &capacity)))
|
||||
{
|
||||
return E_ABORT;
|
||||
}
|
||||
|
||||
_buffer.clear();
|
||||
_buffer.reserve(capacity);
|
||||
|
||||
// copy UTF-8 code units that were remaining from the previous call (if any)
|
||||
if (_partialsLen != 0u)
|
||||
{
|
||||
_buffer.assign(_utfPartials.cbegin(), _utfPartials.cbegin() + _partialsLen);
|
||||
_partialsLen = 0u;
|
||||
}
|
||||
|
||||
if (in.empty())
|
||||
{
|
||||
out = _buffer;
|
||||
if (_buffer.empty())
|
||||
{
|
||||
return S_OK;
|
||||
}
|
||||
|
||||
return S_FALSE; // the partial is populated
|
||||
}
|
||||
|
||||
auto backIter = in.end() - 1;
|
||||
// If the last byte in the string was a byte belonging to a UTF-8 multi-byte character
|
||||
if ((*backIter & _Utf8BitMasks::MaskAsciiByte) > _Utf8BitMasks::IsAsciiByte)
|
||||
{
|
||||
// Check only up to 3 last bytes, if no Lead Byte was found then the byte before must be the Lead Byte and no partials are in the string
|
||||
const size_t stopLen{ std::min(in.length(), gsl::narrow_cast<size_t>(4u)) };
|
||||
for (size_t sequenceLen{ 1u }; sequenceLen < stopLen; ++sequenceLen, --backIter)
|
||||
{
|
||||
// If Lead Byte found
|
||||
if ((*backIter & _Utf8BitMasks::MaskContinuationByte) > _Utf8BitMasks::IsContinuationByte)
|
||||
{
|
||||
// If the Lead Byte indicates that the last bytes in the string is a partial UTF-8 code point then cache them:
|
||||
// Use the bitmask at index `sequenceLen`. Compare the result with the operand having the same index. If they
|
||||
// are not equal then the sequence has to be cached because it is a partial code point. Otherwise the
|
||||
// sequence is a complete UTF-8 code point and the whole string is ready for the conversion into a UTF-16 string.
|
||||
if ((*backIter & _cmpMasks.at(sequenceLen)) != _cmpOperands.at(sequenceLen))
|
||||
{
|
||||
std::move(backIter, in.end(), _utfPartials.begin());
|
||||
remainingLength -= sequenceLen;
|
||||
_partialsLen = sequenceLen;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// populate the part of the string that contains complete code points only
|
||||
_buffer.append(in, 0u, remainingLength);
|
||||
out = _buffer;
|
||||
|
||||
return S_OK;
|
||||
}
|
||||
catch (std::length_error&)
|
||||
{
|
||||
return E_ABORT;
|
||||
}
|
||||
catch (std::bad_alloc&)
|
||||
{
|
||||
return E_OUTOFMEMORY;
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
return E_UNEXPECTED;
|
||||
}
|
||||
}
|
||||
|
||||
// Method Description:
|
||||
// - Takes a UTF-16 string and populates it with *complete* UTF-16 codepoints.
|
||||
// If it receives an incomplete codepoint, it will cache it until it can be completed.
|
||||
// Arguments:
|
||||
// - in - UTF-16 string_view potentially containing partial code points
|
||||
// - out - on return, populated with complete codepoints at the string end
|
||||
// Return Value:
|
||||
// - S_OK - the resulting string doesn't end with a partial
|
||||
// - S_FALSE - the resulting string contains the previously cached partials only
|
||||
// - E_OUTOFMEMORY - the method failed to allocate memory for the resulting string
|
||||
// - E_ABORT - the resulting string length would exceed the max_size and thus, the processing was aborted
|
||||
// - E_UNEXPECTED - an unexpected error occurred
|
||||
template<class T = charT>
|
||||
[[nodiscard]] typename std::enable_if<std::is_same<T, wchar_t>::value, HRESULT>::type
|
||||
operator()(const std::basic_string_view<T> in, std::basic_string_view<T>& out) noexcept
|
||||
{
|
||||
try
|
||||
{
|
||||
size_t remainingLength{ in.length() };
|
||||
size_t capacity{};
|
||||
if (FAILED(SizeTAdd(remainingLength, _partialsLen, &capacity)))
|
||||
{
|
||||
return E_ABORT;
|
||||
}
|
||||
|
||||
_buffer.clear();
|
||||
_buffer.reserve(capacity);
|
||||
|
||||
// copy UTF-8 code units that were remaining from the previous call (if any)
|
||||
if (_partialsLen != 0u)
|
||||
{
|
||||
_buffer.push_back(_utfPartials.front());
|
||||
_partialsLen = 0u;
|
||||
}
|
||||
|
||||
if (in.empty())
|
||||
{
|
||||
out = _buffer;
|
||||
if (_buffer.empty())
|
||||
{
|
||||
return S_OK;
|
||||
}
|
||||
|
||||
return S_FALSE; // the high surrogate is populated
|
||||
}
|
||||
|
||||
// cache the last value in the string if it is in the range of high surrogates
|
||||
if (in.back() >= 0xD800u && in.back() <= 0xDBFFu)
|
||||
{
|
||||
_utfPartials.front() = in.back();
|
||||
--remainingLength;
|
||||
_partialsLen = 1u;
|
||||
}
|
||||
else
|
||||
{
|
||||
_partialsLen = 0u;
|
||||
}
|
||||
|
||||
// populate the part of the string that contains complete code points only
|
||||
_buffer.append(in, 0u, remainingLength);
|
||||
out = _buffer;
|
||||
|
||||
return S_OK;
|
||||
}
|
||||
catch (std::length_error&)
|
||||
{
|
||||
return E_ABORT;
|
||||
}
|
||||
catch (std::bad_alloc&)
|
||||
{
|
||||
return E_OUTOFMEMORY;
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
return E_UNEXPECTED;
|
||||
}
|
||||
}
|
||||
|
||||
// Method Description:
|
||||
// - Discard cached partials.
|
||||
// Arguments:
|
||||
// - none
|
||||
// Return Value:
|
||||
// - void
|
||||
void reset() noexcept
|
||||
{
|
||||
_partialsLen = 0u;
|
||||
}
|
||||
|
||||
private:
|
||||
enum _Utf8BitMasks : BYTE
|
||||
{
|
||||
IsAsciiByte = 0b0'0000000, // Any byte representing an ASCII character has the MSB set to 0
|
||||
MaskAsciiByte = 0b1'0000000, // Bit mask to be used in a bitwise AND operation to find out whether or not a byte match the IsAsciiByte pattern
|
||||
IsContinuationByte = 0b10'000000, // Continuation bytes of any UTF-8 non-ASCII character have the MSB set to 1 and the adjacent bit set to 0
|
||||
MaskContinuationByte = 0b11'000000, // Bit mask to be used in a bitwise AND operation to find out whether or not a byte match the IsContinuationByte pattern
|
||||
IsLeadByteTwoByteSequence = 0b110'00000, // A lead byte that indicates a UTF-8 non-ASCII character consisting of two bytes has the two highest bits set to 1 and the adjacent bit set to 0
|
||||
MaskLeadByteTwoByteSequence = 0b111'00000, // Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte match the IsLeadByteTwoByteSequence pattern
|
||||
IsLeadByteThreeByteSequence = 0b1110'0000, // A lead byte that indicates a UTF-8 non-ASCII character consisting of three bytes has the three highest bits set to 1 and the adjacent bit set to 0
|
||||
MaskLeadByteThreeByteSequence = 0b1111'0000, // Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte match the IsLeadByteThreeByteSequence pattern
|
||||
IsLeadByteFourByteSequence = 0b11110'000, // A lead byte that indicates a UTF-8 non-ASCII character consisting of four bytes has the four highest bits set to 1 and the adjacent bit set to 0
|
||||
MaskLeadByteFourByteSequence = 0b11111'000 // Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte match the IsLeadByteFourByteSequence pattern
|
||||
};
|
||||
|
||||
// array of bitmasks
|
||||
constexpr static std::array<BYTE, 4> _cmpMasks{
|
||||
0, // unused
|
||||
_Utf8BitMasks::MaskContinuationByte,
|
||||
_Utf8BitMasks::MaskLeadByteTwoByteSequence,
|
||||
_Utf8BitMasks::MaskLeadByteThreeByteSequence,
|
||||
};
|
||||
|
||||
// array of values for the comparisons
|
||||
constexpr static std::array<BYTE, 4> _cmpOperands{
|
||||
0, // unused
|
||||
_Utf8BitMasks::IsAsciiByte, // intentionally conflicts with MaskContinuationByte
|
||||
_Utf8BitMasks::IsLeadByteTwoByteSequence,
|
||||
_Utf8BitMasks::IsLeadByteThreeByteSequence,
|
||||
};
|
||||
|
||||
std::basic_string<charT> _buffer; // buffer to which the poulated string_view refers
|
||||
std::array<charT, 4> _utfPartials; // buffer for code units of a partial code point that have to be cached
|
||||
size_t _partialsLen{}; // number of cached code units
|
||||
};
|
||||
|
||||
// make clear what incoming string type the state is for
|
||||
typedef u8u16state<char> u8state;
|
||||
typedef u8u16state<wchar_t> u16state;
|
||||
|
||||
// Routine Description:
|
||||
// - Takes a UTF-8 string and performs the conversion to UTF-16. NOTE: The function relies on getting complete UTF-8 characters at the string boundaries.
|
||||
// Arguments:
|
||||
// - in - UTF-8 string to be converted
|
||||
// - out - reference to the resulting UTF-16 string
|
||||
// Return Value:
|
||||
// - S_OK - the conversion succeded
|
||||
// - E_OUTOFMEMORY - the function failed to allocate memory for the resulting string
|
||||
// - E_ABORT - the resulting string length would exceed the upper boundary of an int and thus, the conversion was aborted before the conversion has been completed
|
||||
// - E_UNEXPECTED - an unexpected error occurred
|
||||
template<class inT, class outT>
|
||||
[[nodiscard]] typename std::enable_if<std::is_same<typename inT::value_type, char>::value && std::is_same<typename outT::value_type, wchar_t>::value, HRESULT>::type
|
||||
u8u16(const inT in, outT& out) noexcept
|
||||
{
|
||||
try
|
||||
{
|
||||
out.clear();
|
||||
|
||||
if (in.empty())
|
||||
{
|
||||
return S_OK;
|
||||
}
|
||||
|
||||
int lengthRequired{};
|
||||
// The worst ratio of UTF-8 code units to UTF-16 code units is 1 to 1 if UTF-8 consists of ASCII only.
|
||||
RETURN_HR_IF(E_ABORT, FAILED(SizeTToInt(in.length(), &lengthRequired)));
|
||||
out.resize(in.length()); // avoid to call MultiByteToWideChar twice only to get the required size
|
||||
const int lengthOut = MultiByteToWideChar(gsl::narrow_cast<UINT>(CP_UTF8), 0ul, in.data(), lengthRequired, out.data(), lengthRequired);
|
||||
out.resize(gsl::narrow_cast<size_t>(lengthOut));
|
||||
|
||||
return lengthOut == 0 ? E_UNEXPECTED : S_OK;
|
||||
}
|
||||
catch (std::length_error&)
|
||||
{
|
||||
return E_ABORT;
|
||||
}
|
||||
catch (std::bad_alloc&)
|
||||
{
|
||||
return E_OUTOFMEMORY;
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
return E_UNEXPECTED;
|
||||
}
|
||||
}
|
||||
|
||||
// Routine Description:
|
||||
// - Takes a UTF-8 string, complements and/or caches partials, and performs the conversion to UTF-16.
|
||||
// Arguments:
|
||||
// - in - UTF-8 string to be converted
|
||||
// - out - reference to the resulting UTF-16 string
|
||||
// - state - reference to a til::u8state class holding the status of the current partials handling
|
||||
// Return Value:
|
||||
// - S_OK - the conversion succeded
|
||||
// - E_OUTOFMEMORY - the function failed to allocate memory for the resulting string
|
||||
// - E_ABORT - the resulting string length would exceed the upper boundary of an int and thus, the conversion was aborted before the conversion has been completed
|
||||
// - E_UNEXPECTED - an unexpected error occurred
|
||||
template<class inT, class outT>
|
||||
[[nodiscard]] typename std::enable_if<std::is_same<typename inT::value_type, char>::value && std::is_same<typename outT::value_type, wchar_t>::value, HRESULT>::type
|
||||
u8u16(const inT in, outT& out, u8state& state) noexcept
|
||||
{
|
||||
std::string_view sv{};
|
||||
RETURN_IF_FAILED(state(std::string_view{ in }, sv));
|
||||
return til::u8u16(sv, out);
|
||||
}
|
||||
|
||||
// Routine Description:
|
||||
// - Takes a UTF-16 string and performs the conversion to UTF-8. NOTE: The function relies on getting complete UTF-16 characters at the string boundaries.
|
||||
// Arguments:
|
||||
// - in - UTF-16 string to be converted
|
||||
// - out - reference to the resulting UTF-8 string
|
||||
// Return Value:
|
||||
// - S_OK - the conversion succeded
|
||||
// - E_OUTOFMEMORY - the function failed to allocate memory for the resulting string
|
||||
// - E_ABORT - the resulting string length would exceed the upper boundary of an int and thus, the conversion was aborted before the conversion has been completed
|
||||
// - E_UNEXPECTED - an unexpected error occurred
|
||||
template<class inT, class outT>
|
||||
[[nodiscard]] typename std::enable_if<std::is_same<typename inT::value_type, wchar_t>::value && std::is_same<typename outT::value_type, char>::value, HRESULT>::type
|
||||
u16u8(const inT in, outT& out) noexcept
|
||||
{
|
||||
try
|
||||
{
|
||||
out.clear();
|
||||
|
||||
if (in.empty())
|
||||
{
|
||||
return S_OK;
|
||||
}
|
||||
|
||||
int lengthIn{};
|
||||
int lengthRequired{};
|
||||
// Code Point U+0000..U+FFFF: 1 UTF-16 code unit --> 1..3 UTF-8 code units.
|
||||
// Code Points >U+FFFF: 2 UTF-16 code units --> 4 UTF-8 code units.
|
||||
// Thus, the worst ratio of UTF-16 code units to UTF-8 code units is 1 to 3.
|
||||
RETURN_HR_IF(E_ABORT, FAILED(SizeTToInt(in.length(), &lengthIn)) || FAILED(IntMult(lengthIn, 3, &lengthRequired)));
|
||||
out.resize(gsl::narrow_cast<size_t>(lengthRequired)); // avoid to call WideCharToMultiByte twice only to get the required size
|
||||
const int lengthOut = WideCharToMultiByte(gsl::narrow_cast<UINT>(CP_UTF8), 0ul, in.data(), lengthIn, out.data(), lengthRequired, nullptr, nullptr);
|
||||
out.resize(gsl::narrow_cast<size_t>(lengthOut));
|
||||
|
||||
return lengthOut == 0 ? E_UNEXPECTED : S_OK;
|
||||
}
|
||||
catch (std::length_error&)
|
||||
{
|
||||
return E_ABORT;
|
||||
}
|
||||
catch (std::bad_alloc&)
|
||||
{
|
||||
return E_OUTOFMEMORY;
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
return E_UNEXPECTED;
|
||||
}
|
||||
}
|
||||
|
||||
// Routine Description:
|
||||
// - Takes a UTF-16 string, complements and/or caches partials, and performs the conversion to UTF-8.
|
||||
// Arguments:
|
||||
// - in - UTF-16 string to be converted
|
||||
// - out - reference to the resulting UTF-8 string
|
||||
// - state - reference to a til::u16state class holding the status of the current partials handling
|
||||
// Return Value:
|
||||
// - S_OK - the conversion succeded without any change of the represented code points
|
||||
// - E_OUTOFMEMORY - the function failed to allocate memory for the resulting string
|
||||
// - E_ABORT - the resulting string length would exceed the upper boundary of an int and thus, the conversion was aborted before the conversion has been completed
|
||||
// - E_UNEXPECTED - an unexpected error occurred
|
||||
template<class inT, class outT>
|
||||
[[nodiscard]] typename std::enable_if<std::is_same<typename inT::value_type, wchar_t>::value && std::is_same<typename outT::value_type, char>::value, HRESULT>::type
|
||||
u16u8(const inT in, outT& out, u16state& state) noexcept
|
||||
{
|
||||
std::wstring_view sv{};
|
||||
RETURN_IF_FAILED(state(std::wstring_view{ in }, sv));
|
||||
return u16u8(sv, out);
|
||||
}
|
||||
|
||||
// Routine Description:
|
||||
// - Takes a UTF-8 string and performs the conversion to UTF-16. NOTE: The function relies on getting complete UTF-8 characters at the string boundaries.
|
||||
// Arguments:
|
||||
// - in - UTF-8 string to be converted
|
||||
// Return Value:
|
||||
// - the resulting UTF-16 string
|
||||
// - NOTE: Throws HRESULT errors that the non-throwing sibling returns
|
||||
template<class inT>
|
||||
typename std::enable_if<std::is_same<typename inT::value_type, char>::value, std::wstring>::type
|
||||
u8u16(const inT in)
|
||||
{
|
||||
std::wstring out{};
|
||||
THROW_IF_FAILED(u8u16(std::string_view{ in }, out));
|
||||
return out;
|
||||
}
|
||||
|
||||
// Routine Description:
|
||||
// Takes a UTF-8 string, complements and/or caches partials, and performs the conversion to UTF-16.
|
||||
// Arguments:
|
||||
// - in - UTF-8 string to be converted
|
||||
// - state - reference to a til::u8state class holding the status of the current partials handling
|
||||
// Return Value:
|
||||
// - the resulting UTF-16 string
|
||||
// - NOTE: Throws HRESULT errors that the non-throwing sibling returns
|
||||
template<class inT>
|
||||
typename std::enable_if<std::is_same<typename inT::value_type, char>::value, std::wstring>::type
|
||||
u8u16(const inT in, u8state& state)
|
||||
{
|
||||
std::wstring out{};
|
||||
THROW_IF_FAILED(u8u16(std::string_view{ in }, out, state));
|
||||
return out;
|
||||
}
|
||||
|
||||
// Routine Description:
|
||||
// - Takes a UTF-16 string and performs the conversion to UTF-8. NOTE: The function relies on getting complete UTF-16 characters at the string boundaries.
|
||||
// Arguments:
|
||||
// - in - UTF-16 string to be converted
|
||||
// Return Value:
|
||||
// - the resulting UTF-8 string
|
||||
// - NOTE: Throws HRESULT errors that the non-throwing sibling returns
|
||||
template<class inT>
|
||||
typename std::enable_if<std::is_same<typename inT::value_type, wchar_t>::value, std::string>::type
|
||||
u16u8(const inT in)
|
||||
{
|
||||
std::string out{};
|
||||
THROW_IF_FAILED(u16u8(std::wstring_view{ in }, out));
|
||||
return out;
|
||||
}
|
||||
|
||||
// Routine Description:
|
||||
// Takes a UTF-16 string, complements and/or caches partials, and performs the conversion to UTF-8.
|
||||
// Arguments:
|
||||
// - in - UTF-16 string to be converted
|
||||
// - state - reference to a til::u16state class holding the status of the current partials handling
|
||||
// Return Value:
|
||||
// - the resulting UTF-8 string
|
||||
// - NOTE: Throws HRESULT errors that the non-throwing sibling returns
|
||||
template<class inT>
|
||||
typename std::enable_if<std::is_same<typename inT::value_type, wchar_t>::value, std::string>::type
|
||||
u16u8(const inT in, u16state& state)
|
||||
{
|
||||
std::string out{};
|
||||
THROW_IF_FAILED(u16u8(std::wstring_view{ in }, out, state));
|
||||
return out;
|
||||
}
|
||||
}
|
|
@ -14,6 +14,7 @@
|
|||
<ClCompile Include="..\precomp.cpp">
|
||||
<PrecompiledHeader>Create</PrecompiledHeader>
|
||||
</ClCompile>
|
||||
<ClCompile Include="u8u16convertTests.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="..\precomp.h" />
|
||||
|
|
|
@ -0,0 +1,143 @@
|
|||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#include "precomp.h"
|
||||
#include "WexTestClass.h"
|
||||
|
||||
using namespace WEX::Common;
|
||||
using namespace WEX::Logging;
|
||||
using namespace WEX::TestExecution;
|
||||
|
||||
class Utf8Utf16ConvertTests
|
||||
{
|
||||
TEST_CLASS(Utf8Utf16ConvertTests);
|
||||
|
||||
TEST_METHOD(TestU8ToU16);
|
||||
TEST_METHOD(TestU16ToU8);
|
||||
TEST_METHOD(TestU8ToU16Partials);
|
||||
TEST_METHOD(TestU16ToU8Partials);
|
||||
};
|
||||
|
||||
void Utf8Utf16ConvertTests::TestU8ToU16()
|
||||
{
|
||||
const std::string u8String{
|
||||
'\x7E', // TILDE (1 byte)
|
||||
'\xC3', // LATIN SMALL LETTER O WITH DIAERESIS (2 bytes)
|
||||
'\xB6',
|
||||
'\xE2', // EURO SIGN (3 bytes)
|
||||
'\x82',
|
||||
'\xAC',
|
||||
'\xF0', // CJK UNIFIED IDEOGRAPH-24F5C (4 bytes)
|
||||
'\xA4',
|
||||
'\xBD',
|
||||
'\x9C'
|
||||
};
|
||||
|
||||
const std::wstring u16StringComp{
|
||||
gsl::narrow_cast<wchar_t>(0x007eU), // TILDE
|
||||
gsl::narrow_cast<wchar_t>(0x00f6U), // LATIN SMALL LETTER O WITH DIAERESIS
|
||||
gsl::narrow_cast<wchar_t>(0x20acU), // EURO SIGN
|
||||
gsl::narrow_cast<wchar_t>(0xd853U), // CJK UNIFIED IDEOGRAPH-24F5C (surrogate pair)
|
||||
gsl::narrow_cast<wchar_t>(0xdf5cU)
|
||||
};
|
||||
|
||||
std::wstring u16Out{};
|
||||
const HRESULT hRes{ til::u8u16(u8String, u16Out) };
|
||||
VERIFY_ARE_EQUAL(S_OK, hRes);
|
||||
VERIFY_ARE_EQUAL(u16StringComp, u16Out);
|
||||
}
|
||||
|
||||
void Utf8Utf16ConvertTests::TestU16ToU8()
|
||||
{
|
||||
const std::wstring u16String{
|
||||
gsl::narrow_cast<wchar_t>(0x007eU), // TILDE
|
||||
gsl::narrow_cast<wchar_t>(0x00f6U), // LATIN SMALL LETTER O WITH DIAERESIS
|
||||
gsl::narrow_cast<wchar_t>(0x20acU), // EURO SIGN
|
||||
gsl::narrow_cast<wchar_t>(0xd853U), // CJK UNIFIED IDEOGRAPH-24F5C (surrogate pair)
|
||||
gsl::narrow_cast<wchar_t>(0xdf5cU)
|
||||
};
|
||||
|
||||
const std::string u8StringComp{
|
||||
'\x7E', // TILDE (1 byte)
|
||||
'\xC3', // LATIN SMALL LETTER O WITH DIAERESIS (2 bytes)
|
||||
'\xB6',
|
||||
'\xE2', // EURO SIGN (3 bytes)
|
||||
'\x82',
|
||||
'\xAC',
|
||||
'\xF0', // CJK UNIFIED IDEOGRAPH-24F5C (4 bytes)
|
||||
'\xA4',
|
||||
'\xBD',
|
||||
'\x9C'
|
||||
};
|
||||
|
||||
std::string u8Out{};
|
||||
const HRESULT hRes{ til::u16u8(u16String, u8Out) };
|
||||
VERIFY_ARE_EQUAL(S_OK, hRes);
|
||||
VERIFY_ARE_EQUAL(u8StringComp, u8Out);
|
||||
}
|
||||
|
||||
void Utf8Utf16ConvertTests::TestU8ToU16Partials()
|
||||
{
|
||||
const std::string u8String1{
|
||||
'\xF0', // CJK UNIFIED IDEOGRAPH-24F5C (4 bytes)
|
||||
'\xA4',
|
||||
'\xBD',
|
||||
'\x9C',
|
||||
'\xF0' // CJK UNIFIED IDEOGRAPH-24F5C (lead byte only)
|
||||
};
|
||||
|
||||
const std::string u8String2{
|
||||
'\xA4', // CJK UNIFIED IDEOGRAPH-24F5C (complementary bytes)
|
||||
'\xBD',
|
||||
'\x9C'
|
||||
};
|
||||
|
||||
const std::wstring u16StringComp{
|
||||
gsl::narrow_cast<wchar_t>(0xD853), // CJK UNIFIED IDEOGRAPH-24F5C (surrogate pair)
|
||||
gsl::narrow_cast<wchar_t>(0xDF5C)
|
||||
};
|
||||
|
||||
til::u8state state{};
|
||||
|
||||
std::wstring u16Out1{};
|
||||
const HRESULT hRes1{ til::u8u16(u8String1, u16Out1, state) };
|
||||
VERIFY_ARE_EQUAL(S_OK, hRes1);
|
||||
VERIFY_ARE_EQUAL(u16StringComp, u16Out1);
|
||||
|
||||
std::wstring u16Out2{};
|
||||
const HRESULT hRes2{ til::u8u16(u8String2, u16Out2, state) };
|
||||
VERIFY_ARE_EQUAL(S_OK, hRes2);
|
||||
VERIFY_ARE_EQUAL(u16StringComp, u16Out2);
|
||||
}
|
||||
|
||||
void Utf8Utf16ConvertTests::TestU16ToU8Partials()
|
||||
{
|
||||
const std::wstring u16String1{
|
||||
gsl::narrow_cast<wchar_t>(0xD853), // CJK UNIFIED IDEOGRAPH-24F5C (surrogate pair)
|
||||
gsl::narrow_cast<wchar_t>(0xDF5C),
|
||||
gsl::narrow_cast<wchar_t>(0xD853) // CJK UNIFIED IDEOGRAPH-24F5C (high surrogate only)
|
||||
};
|
||||
|
||||
const std::wstring u16String2{
|
||||
gsl::narrow_cast<wchar_t>(0xDF5C) // CJK UNIFIED IDEOGRAPH-24F5C (low surrogate only)
|
||||
};
|
||||
|
||||
const std::string u8StringComp{
|
||||
'\xF0', // CJK UNIFIED IDEOGRAPH-24F5C
|
||||
'\xA4',
|
||||
'\xBD',
|
||||
'\x9C'
|
||||
};
|
||||
|
||||
til::u16state state{};
|
||||
|
||||
std::string u8Out1{};
|
||||
const HRESULT hRes1{ til::u16u8(u16String1, u8Out1, state) };
|
||||
VERIFY_ARE_EQUAL(S_OK, hRes1);
|
||||
VERIFY_ARE_EQUAL(u8StringComp, u8Out1);
|
||||
|
||||
std::string u8Out2{};
|
||||
const HRESULT hRes2{ til::u16u8(u16String2, u8Out2, state) };
|
||||
VERIFY_ARE_EQUAL(S_OK, hRes2);
|
||||
VERIFY_ARE_EQUAL(u8StringComp, u8Out2);
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ImportGroup Label="PropertySheets" />
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<!--
|
||||
To customize common C++/WinRT project properties:
|
||||
* right-click the project node
|
||||
* expand the Common Properties item
|
||||
* select the C++/WinRT property page
|
||||
|
||||
For more advanced scenarios, and complete documentation, please see:
|
||||
https://github.com/Microsoft/cppwinrt/tree/master/nuget
|
||||
-->
|
||||
<PropertyGroup />
|
||||
<ItemDefinitionGroup />
|
||||
</Project>
|
|
@ -0,0 +1,780 @@
|
|||
// TEST TOOL U8U16Test
|
||||
// Performance tests for UTF-8 <--> UTF-16 conversions, related to PR #4093
|
||||
// NOTE The functions u8u16 and u16u8 contain their own algorithms. Tests have shown that they perform
|
||||
// worse than the platform API functions.
|
||||
// Thus, these functions are *unrelated* to the til::u8u16 and til::u16u8 implementation.
|
||||
|
||||
#include "U8U16Test.hpp"
|
||||
|
||||
u8state::u8state() noexcept :
|
||||
_buffer8{},
|
||||
_utf8Partials{}
|
||||
{
|
||||
}
|
||||
|
||||
// Prepends the UTF-8 code units cached by the previous call to `in`, cuts off an incomplete
// code point at the end of the combined data (caching it for the next call), and gives back
// a view on the part that contains complete code points only.
//
// Parameters:
// - in  - next chunk of a UTF-8 stream
// - out - receives a view on the internal buffer; it is overwritten by the next call
// Return value:
// - S_OK          - `out` contains complete code points only (it may be empty)
// - S_FALSE       - `in` was empty and the previously cached partial code point is given back
// - E_ABORT       - the required buffer size overflows or exceeds the maximum string length
// - E_OUTOFMEMORY - the buffer could not be allocated
// - E_UNEXPECTED  - any other exception was caught
[[nodiscard]] HRESULT u8state::operator()(const std::string_view in, std::string_view& out) noexcept
{
    try
    {
        size_t capacity{};
        if (FAILED(SizeTAdd(in.length(), _partialsLen, &capacity)))
        {
            return E_ABORT;
        }

        _buffer8.clear();
        _buffer8.reserve(capacity);

        // copy UTF-8 code units that were remaining from the previous call (if any)
        if (_partialsLen != 0u)
        {
            _buffer8.assign(_utf8Partials.cbegin(), _utf8Partials.cbegin() + _partialsLen);
            _partialsLen = 0u;
        }

        if (in.empty())
        {
            out = _buffer8;
            if (_buffer8.empty())
            {
                return S_OK;
            }

            return S_FALSE; // the partial is given back
        }

        // Examine the end of the *combined* data (cached code units + new chunk). This way a
        // code point that is spread over several very short chunks is still recognized as partial.
        _buffer8.append(in);
        const size_t totalLen{ _buffer8.length() };

        // If the last byte in the string is a byte belonging to a UTF-8 multi-byte character
        if ((_buffer8.back() & _Utf8BitMasks::MaskAsciiByte) > _Utf8BitMasks::IsAsciiByte)
        {
            // Check up to the 3 last bytes (all bytes if the string is shorter). If no Lead Byte
            // is found among them, no partial code point is at the end of the string.
            const size_t maxSequenceLen{ std::min(totalLen, static_cast<size_t>(3u)) };
            for (size_t sequenceLen{ 1u }; sequenceLen <= maxSequenceLen; ++sequenceLen)
            {
                const char candidate{ _buffer8[totalLen - sequenceLen] };

                // If Lead Byte found
                if ((candidate & _Utf8BitMasks::MaskContinuationByte) > _Utf8BitMasks::IsContinuationByte)
                {
                    // Use the bitmask at index `sequenceLen` and compare the result with the operand
                    // having the same index. If they are not equal the sequence is a partial code
                    // point and has to be cached. Otherwise the sequence is complete.
                    if ((candidate & _cmpMasks.at(sequenceLen)) != _cmpOperands.at(sequenceLen))
                    {
                        _buffer8.copy(_utf8Partials.data(), sequenceLen, totalLen - sequenceLen);
                        _partialsLen = sequenceLen;
                        _buffer8.resize(totalLen - sequenceLen);
                    }

                    break;
                }
            }
        }

        // give back the part of the string that contains complete code points only
        out = _buffer8;

        return S_OK;
    }
    catch (std::length_error&)
    {
        return E_ABORT;
    }
    catch (std::bad_alloc&)
    {
        return E_OUTOFMEMORY;
    }
    catch (...)
    {
        return E_UNEXPECTED;
    }
}
|
||||
|
||||
void u8state::reset() noexcept
|
||||
{
|
||||
_partialsLen = 0u;
|
||||
}
|
||||
|
||||
u16state::u16state() noexcept :
|
||||
_buffer16{}
|
||||
{
|
||||
}
|
||||
|
||||
// Prepends the high surrogate cached by the previous call to `in`, cuts off a high surrogate
// at the end of `in` (caching it for the next call), and gives back a view on the result.
//
// Parameters:
// - in  - next chunk of a UTF-16 stream
// - out - receives a view on the internal buffer; it is overwritten by the next call
// Return value:
// - S_OK          - `out` has been populated (it may be empty)
// - S_FALSE       - `in` was empty and the previously cached high surrogate is given back
// - E_ABORT       - the required buffer size overflows or exceeds the maximum string length
// - E_OUTOFMEMORY - the buffer could not be allocated
// - E_UNEXPECTED  - any other exception was caught
[[nodiscard]] HRESULT u16state::operator()(const std::wstring_view in, std::wstring_view& out) noexcept
{
    try
    {
        size_t needed{};
        if (FAILED(SizeTAdd(in.length(), _cached, &needed)))
        {
            return E_ABORT;
        }

        _buffer16.clear();
        _buffer16.reserve(needed);

        // begin with the UTF-16 high surrogate that was remaining from the previous call (if any)
        if (_cached != 0u)
        {
            _buffer16.push_back(_highSurrogate);
            _cached = 0u;
        }

        if (in.empty())
        {
            out = _buffer16;
            // a non-empty buffer means that the cached high surrogate is given back
            return _buffer16.empty() ? S_OK : S_FALSE;
        }

        // a high surrogate (0xD800..0xDBFF) at the very end is held back for the next call
        const wchar_t lastUnit{ in.back() };
        const bool endsWithHighSurrogate{ lastUnit >= 0xD800u && lastUnit <= 0xDBFFu };
        size_t usableLength{ in.length() };
        if (endsWithHighSurrogate)
        {
            _highSurrogate = lastUnit;
            --usableLength;
        }
        _cached = endsWithHighSurrogate ? 1u : 0u;

        // give back everything except a held back high surrogate
        _buffer16.append(in, 0u, usableLength);
        out = _buffer16;

        return S_OK;
    }
    catch (std::length_error&)
    {
        return E_ABORT;
    }
    catch (std::bad_alloc&)
    {
        return E_OUTOFMEMORY;
    }
    catch (...)
    {
        return E_UNEXPECTED;
    }
}
|
||||
|
||||
void u16state::reset() noexcept
|
||||
{
|
||||
_cached = 0u;
|
||||
}
|
||||
|
||||
// Converts a UTF-8 string to UTF-16 using a hand-written decoder. Output is appended with push_back.
//
// Valid UTF-8 sequences are:
// - 00..7F
// - C2..DF | 80..BF                      (first bytes C0 and C1 are invalid)
// - E0     | A0..BF | 80..BF
// - E1..EC | 80..BF | 80..BF
// - ED     | 80..9F | 80..BF
// - EE..EF | 80..BF | 80..BF
// - F0     | 90..BF | 80..BF | 80..BF
// - F1..F3 | 80..BF | 80..BF | 80..BF
// - F4     | 80..8F | 80..BF | 80..BF
// An invalid sequence consists of the first byte and all following bytes that were still
// valid for the sequence announced by it.
//
// Parameters:
// - in              - UTF-8 string to convert
// - out             - receives the UTF-16 string (previous content is discarded)
// - discardInvalids - if true, invalid sequences are dropped; if false, each invalid sequence
//                     is replaced with U+FFFD. A valid U+FFFD in the input is always kept.
// Return value:
// - S_OK          - conversion succeeded without any invalid sequence
// - S_FALSE       - conversion succeeded but at least one invalid sequence was found
// - E_ABORT       - the maximum string length was exceeded
// - E_OUTOFMEMORY - the output could not be allocated
// - E_UNEXPECTED  - any other exception was caught
[[nodiscard]] HRESULT u8u16(const std::string_view in, std::wstring& out, bool discardInvalids) noexcept
{
    constexpr const uint8_t contBegin{ 0x80u }; // usual begin of the range of continuation bytes
    constexpr const uint8_t contEnd{ 0xBFu }; // usual end of the range of continuation bytes
    constexpr const uint32_t unicodeReplacementChar{ 0xFFFDu }; // Unicode Replacement Character

    try
    {
        HRESULT hRes{ S_OK };
        out.clear();

        if (in.empty())
        {
            return hRes;
        }

        out.reserve(in.length()); // avoid any further re-allocations and copying

        const size_t length{ in.length() };
        for (size_t pos{ 0u }; pos < length;)
        {
            const uint8_t lead{ static_cast<uint8_t>(in[pos]) };

            // *** convert ASCII directly to UTF-16 ***
            if (lead <= 0x7Fu)
            {
                out.push_back(static_cast<wchar_t>(lead));
                ++pos;
                continue;
            }

            // *** determine the announced sequence length and the valid range of the 2nd byte ***
            size_t expectedLen{ 0u }; // 0 means that `lead` is not a valid first byte
            uint8_t secondMin{ contBegin };
            uint8_t secondMax{ contEnd };
            if (lead >= 0xC2u && lead <= 0xDFu)
            {
                expectedLen = 2u;
            }
            else if (lead >= 0xE0u && lead <= 0xEFu)
            {
                expectedLen = 3u;
                if (lead == 0xE0u)
                {
                    secondMin = 0xA0u;
                }
                else if (lead == 0xEDu)
                {
                    secondMax = 0x9Fu;
                }
            }
            else if (lead >= 0xF0u && lead <= 0xF4u)
            {
                expectedLen = 4u;
                if (lead == 0xF0u)
                {
                    secondMin = 0x90u;
                }
                else if (lead == 0xF4u)
                {
                    secondMax = 0x8Fu;
                }
            }

            // *** count the bytes of the sequence that are present and valid ***
            const size_t available{ length - pos };
            size_t cnt{ 1u };
            while (cnt < expectedLen && cnt < available)
            {
                const uint8_t trail{ static_cast<uint8_t>(in[pos + cnt]) };
                const uint8_t lowest{ cnt == 1u ? secondMin : contBegin };
                const uint8_t highest{ cnt == 1u ? secondMax : contEnd };
                if (trail < lowest || trail > highest)
                {
                    break;
                }

                ++cnt;
            }

            // *** decode the code point ***
            uint32_t codePoint{ unicodeReplacementChar }; // default
            const bool isValid{ expectedLen != 0u && cnt == expectedLen };
            if (isValid)
            {
                // the first byte contributes 5, 4, or 3 bits for a 2, 3, or 4 bytes sequence
                codePoint = lead & (0xFFu >> (expectedLen + 1u));
                for (size_t i{ 1u }; i < expectedLen; ++i)
                {
                    codePoint = (codePoint << 6u) | (static_cast<uint8_t>(in[pos + i]) & 0x3Fu);
                }
            }
            else
            {
                hRes = S_FALSE;
            }

            pos += cnt;

            // *** convert the code point to UTF-16 ***
            // The validity flag (not the value U+FFFD) decides about discarding,
            // so that a valid U+FFFD in the input is never dropped.
            if (isValid || !discardInvalids)
            {
                if (codePoint < 0x00010000u)
                {
                    out.push_back(static_cast<wchar_t>(codePoint));
                }
                else
                {
                    codePoint -= 0x00010000u;
                    out.push_back(static_cast<wchar_t>(0x0000D800u + ((codePoint >> 10u) & 0x000003FFu)));
                    out.push_back(static_cast<wchar_t>(0x0000DC00u + (codePoint & 0x000003FFu)));
                }
            }
        }

        return hRes;
    }
    catch (std::length_error&)
    {
        return E_ABORT;
    }
    catch (std::bad_alloc&)
    {
        return E_OUTOFMEMORY;
    }
    catch (...)
    {
        return E_UNEXPECTED;
    }
}
|
||||
|
||||
// Converts a UTF-8 string to UTF-16 using a hand-written decoder. In contrast to u8u16 the
// output string is resized up front and written through a raw pointer.
//
// Valid UTF-8 sequences are:
// - 00..7F
// - C2..DF | 80..BF                      (first bytes C0 and C1 are invalid)
// - E0     | A0..BF | 80..BF
// - E1..EC | 80..BF | 80..BF
// - ED     | 80..9F | 80..BF
// - EE..EF | 80..BF | 80..BF
// - F0     | 90..BF | 80..BF | 80..BF
// - F1..F3 | 80..BF | 80..BF | 80..BF
// - F4     | 80..8F | 80..BF | 80..BF
// An invalid sequence consists of the first byte and all following bytes that were still
// valid for the sequence announced by it.
//
// Parameters:
// - in              - UTF-8 string to convert
// - out             - receives the UTF-16 string (previous content is discarded)
// - discardInvalids - if true, invalid sequences are dropped; if false, each invalid sequence
//                     is replaced with U+FFFD. A valid U+FFFD in the input is always kept.
// Return value:
// - S_OK          - conversion succeeded without any invalid sequence
// - S_FALSE       - conversion succeeded but at least one invalid sequence was found
// - E_ABORT       - the maximum string length was exceeded
// - E_OUTOFMEMORY - the output could not be allocated
// - E_UNEXPECTED  - any other exception was caught
[[nodiscard]] HRESULT u8u16_ptr(const std::string_view in, std::wstring& out, bool discardInvalids) noexcept
{
    constexpr const uint8_t contBegin{ 0x80u }; // usual begin of the range of continuation bytes
    constexpr const uint8_t contEnd{ 0xBFu }; // usual end of the range of continuation bytes
    constexpr const uint32_t unicodeReplacementChar{ 0xFFFDu }; // Unicode Replacement Character

    try
    {
        HRESULT hRes{ S_OK };
        out.clear();

        if (in.empty())
        {
            return hRes;
        }

        // No sequence yields more UTF-16 code units than it has UTF-8 code units,
        // thus the input length is a sufficient size for the output.
        out.resize(in.length());

        wchar_t* it16{ out.data() };
        const size_t length{ in.length() };
        for (size_t pos{ 0u }; pos < length;)
        {
            const uint8_t lead{ static_cast<uint8_t>(in[pos]) };

            // *** convert ASCII directly to UTF-16 ***
            if (lead <= 0x7Fu)
            {
                *it16++ = static_cast<wchar_t>(lead);
                ++pos;
                continue;
            }

            // *** determine the announced sequence length and the valid range of the 2nd byte ***
            size_t expectedLen{ 0u }; // 0 means that `lead` is not a valid first byte
            uint8_t secondMin{ contBegin };
            uint8_t secondMax{ contEnd };
            if (lead >= 0xC2u && lead <= 0xDFu)
            {
                expectedLen = 2u;
            }
            else if (lead >= 0xE0u && lead <= 0xEFu)
            {
                expectedLen = 3u;
                if (lead == 0xE0u)
                {
                    secondMin = 0xA0u;
                }
                else if (lead == 0xEDu)
                {
                    secondMax = 0x9Fu;
                }
            }
            else if (lead >= 0xF0u && lead <= 0xF4u)
            {
                expectedLen = 4u;
                if (lead == 0xF0u)
                {
                    secondMin = 0x90u;
                }
                else if (lead == 0xF4u)
                {
                    secondMax = 0x8Fu;
                }
            }

            // *** count the bytes of the sequence that are present and valid ***
            const size_t available{ length - pos };
            size_t cnt{ 1u };
            while (cnt < expectedLen && cnt < available)
            {
                const uint8_t trail{ static_cast<uint8_t>(in[pos + cnt]) };
                const uint8_t lowest{ cnt == 1u ? secondMin : contBegin };
                const uint8_t highest{ cnt == 1u ? secondMax : contEnd };
                if (trail < lowest || trail > highest)
                {
                    break;
                }

                ++cnt;
            }

            // *** decode the code point ***
            uint32_t codePoint{ unicodeReplacementChar }; // default
            const bool isValid{ expectedLen != 0u && cnt == expectedLen };
            if (isValid)
            {
                // the first byte contributes 5, 4, or 3 bits for a 2, 3, or 4 bytes sequence
                codePoint = lead & (0xFFu >> (expectedLen + 1u));
                for (size_t i{ 1u }; i < expectedLen; ++i)
                {
                    codePoint = (codePoint << 6u) | (static_cast<uint8_t>(in[pos + i]) & 0x3Fu);
                }
            }
            else
            {
                hRes = S_FALSE;
            }

            pos += cnt;

            // *** convert the code point to UTF-16 ***
            // The validity flag (not the value U+FFFD) decides about discarding,
            // so that a valid U+FFFD in the input is never dropped.
            if (isValid || !discardInvalids)
            {
                if (codePoint < 0x00010000u)
                {
                    *it16++ = static_cast<wchar_t>(codePoint);
                }
                else
                {
                    codePoint -= 0x00010000u;
                    *it16++ = static_cast<wchar_t>(0x0000D800u + ((codePoint >> 10u) & 0x000003FFu));
                    *it16++ = static_cast<wchar_t>(0x0000DC00u + (codePoint & 0x000003FFu));
                }
            }
        }

        // cut the string down to the number of code units actually written
        out.resize(static_cast<size_t>(it16 - out.data()));
        return hRes;
    }
    catch (std::length_error&)
    {
        return E_ABORT;
    }
    catch (std::bad_alloc&)
    {
        return E_OUTOFMEMORY;
    }
    catch (...)
    {
        return E_UNEXPECTED;
    }
}
|
||||
|
||||
// Converts a UTF-16 string to UTF-8 using a hand-written encoder. Output is appended with push_back.
//
// A high surrogate (D800..DBFF) that is not followed by a low surrogate (DC00..DFFF) and a
// low surrogate that does not follow a high surrogate are invalid.
//
// Parameters:
// - in              - UTF-16 string to convert
// - out             - receives the UTF-8 string (previous content is discarded)
// - discardInvalids - if true, invalid surrogates are dropped; if false, each of them is
//                     replaced with U+FFFD. A valid U+FFFD in the input is always kept.
// Return value:
// - S_OK          - conversion succeeded without any invalid surrogate
// - S_FALSE       - conversion succeeded but at least one invalid surrogate was found
// - E_ABORT       - the maximum string length was exceeded
// - E_OUTOFMEMORY - the output could not be allocated
// - E_UNEXPECTED  - any other exception was caught
[[nodiscard]] HRESULT u16u8(const std::wstring_view in, std::string& out, bool discardInvalids) noexcept
{
    constexpr const uint32_t unicodeReplacementChar{ 0xFFFDu };

    try
    {
        HRESULT hRes{ S_OK };
        out.clear();

        if (in.empty())
        {
            return hRes;
        }

        // Up to 3 UTF-8 code units are needed per UTF-16 code unit. If that size can't be
        // represented, fall back to a smaller hint and let the string grow on demand.
        size_t lengthHint{};
        if (FAILED(SizeTMult(in.length(), static_cast<size_t>(3u), &lengthHint)))
        {
            lengthHint = std::max(out.capacity(), in.length());
        }

        out.reserve(lengthHint); // avoid any further re-allocations and copying

        const size_t length{ in.length() };
        for (size_t pos{ 0u }; pos < length;)
        {
            const uint32_t unit{ static_cast<uint32_t>(in[pos++]) };

            // *** convert ASCII directly to UTF-8 ***
            if (unit <= 0x007Fu)
            {
                out.push_back(static_cast<char>(unit));
                continue;
            }

            // *** convert UTF-16 to a code point ***
            uint32_t codePoint{ unicodeReplacementChar }; // default
            bool isValid{ false };
            if (unit >= 0xD800u && unit <= 0xDBFFu) // range of high surrogates
            {
                if (pos < length)
                {
                    const uint32_t low{ static_cast<uint32_t>(in[pos]) };
                    if (low >= 0xDC00u && low <= 0xDFFFu) // range of low surrogates
                    {
                        // 0x035FDC00 == (0xD800 << 10) + 0xDC00 - 0x10000
                        codePoint = (unit << 10u) + low - static_cast<uint32_t>(0x035FDC00u);
                        isValid = true;
                        ++pos;
                    }
                }
            }
            else if (unit < 0xDC00u || unit > 0xDFFFu) // standing alone low surrogates are invalid
            {
                codePoint = unit;
                isValid = true;
            }

            if (!isValid)
            {
                hRes = S_FALSE;
            }

            // *** convert the code point to UTF-8 ***
            // The validity flag (not the value U+FFFD) decides about discarding,
            // so that a valid U+FFFD in the input is never dropped.
            if (isValid || !discardInvalids)
            {
                // the outcome of performance tests is that subsequent calls of push_back
                // perform much better than appending a single initializer_list
                if (codePoint < 0x00000800u)
                {
                    out.push_back(static_cast<char>(((codePoint >> 6u) & 0x1Fu) | 0xC0u));
                    out.push_back(static_cast<char>((codePoint & 0x3Fu) | 0x80u));
                }
                else if (codePoint < 0x00010000u)
                {
                    out.push_back(static_cast<char>(((codePoint >> 12u) & 0x0Fu) | 0xE0u));
                    out.push_back(static_cast<char>(((codePoint >> 6u) & 0x3Fu) | 0x80u));
                    out.push_back(static_cast<char>((codePoint & 0x3Fu) | 0x80u));
                }
                else
                {
                    out.push_back(static_cast<char>(((codePoint >> 18u) & 0x07u) | 0xF0u));
                    out.push_back(static_cast<char>(((codePoint >> 12u) & 0x3Fu) | 0x80u));
                    out.push_back(static_cast<char>(((codePoint >> 6u) & 0x3Fu) | 0x80u));
                    out.push_back(static_cast<char>((codePoint & 0x3Fu) | 0x80u));
                }
            }
        }

        return hRes;
    }
    catch (std::length_error&)
    {
        return E_ABORT;
    }
    catch (std::bad_alloc&)
    {
        return E_OUTOFMEMORY;
    }
    catch (...)
    {
        return E_UNEXPECTED;
    }
}
|
||||
|
||||
// Converts a UTF-16 string to UTF-8 using a hand-written encoder. In contrast to u16u8 the
// output string is resized up front and written through a raw pointer.
//
// A high surrogate (D800..DBFF) that is not followed by a low surrogate (DC00..DFFF) and a
// low surrogate that does not follow a high surrogate are invalid.
//
// Parameters:
// - in              - UTF-16 string to convert
// - out             - receives the UTF-8 string (previous content is discarded)
// - discardInvalids - if true, invalid surrogates are dropped; if false, each of them is
//                     replaced with U+FFFD. A valid U+FFFD in the input is always kept.
// Return value:
// - S_OK          - conversion succeeded without any invalid surrogate
// - S_FALSE       - conversion succeeded but at least one invalid surrogate was found
// - E_ABORT       - the required output size overflows or exceeds the maximum string length
// - E_OUTOFMEMORY - the output could not be allocated
// - E_UNEXPECTED  - any other exception was caught
[[nodiscard]] HRESULT u16u8_ptr(const std::wstring_view in, std::string& out, bool discardInvalids) noexcept
{
    constexpr const uint32_t unicodeReplacementChar{ 0xFFFDu };

    try
    {
        HRESULT hRes{ S_OK };
        out.clear();

        if (in.empty())
        {
            return hRes;
        }

        // Up to 3 UTF-8 code units are written per UTF-16 code unit. The output is accessed by
        // pointer, thus the whole size has to be available up front.
        size_t lengthHint{};
        if (FAILED(SizeTMult(in.length(), static_cast<size_t>(3u), &lengthHint)))
        {
            return E_ABORT;
        }

        out.resize(lengthHint); // avoid any further re-allocations and copying

        char* it8{ out.data() };
        const size_t length{ in.length() };
        for (size_t pos{ 0u }; pos < length;)
        {
            const uint32_t unit{ static_cast<uint32_t>(in[pos++]) };

            // *** convert ASCII directly to UTF-8 ***
            if (unit <= 0x007Fu)
            {
                *it8++ = static_cast<char>(unit);
                continue;
            }

            // *** convert UTF-16 to a code point ***
            uint32_t codePoint{ unicodeReplacementChar }; // default
            bool isValid{ false };
            if (unit >= 0xD800u && unit <= 0xDBFFu) // range of high surrogates
            {
                if (pos < length)
                {
                    const uint32_t low{ static_cast<uint32_t>(in[pos]) };
                    if (low >= 0xDC00u && low <= 0xDFFFu) // range of low surrogates
                    {
                        // 0x035FDC00 == (0xD800 << 10) + 0xDC00 - 0x10000
                        codePoint = (unit << 10u) + low - static_cast<uint32_t>(0x035FDC00u);
                        isValid = true;
                        ++pos;
                    }
                }
            }
            else if (unit < 0xDC00u || unit > 0xDFFFu) // standing alone low surrogates are invalid
            {
                codePoint = unit;
                isValid = true;
            }

            if (!isValid)
            {
                hRes = S_FALSE;
            }

            // *** convert the code point to UTF-8 ***
            // The validity flag (not the value U+FFFD) decides about discarding,
            // so that a valid U+FFFD in the input is never dropped.
            if (isValid || !discardInvalids)
            {
                // the outcome of further performance tests is that using pointers
                // performs even better than subsequent calls of push_back
                if (codePoint < 0x00000800u)
                {
                    *it8++ = static_cast<char>(((codePoint >> 6u) & 0x1Fu) | 0xC0u);
                    *it8++ = static_cast<char>((codePoint & 0x3Fu) | 0x80u);
                }
                else if (codePoint < 0x00010000u)
                {
                    *it8++ = static_cast<char>(((codePoint >> 12u) & 0x0Fu) | 0xE0u);
                    *it8++ = static_cast<char>(((codePoint >> 6u) & 0x3Fu) | 0x80u);
                    *it8++ = static_cast<char>((codePoint & 0x3Fu) | 0x80u);
                }
                else
                {
                    *it8++ = static_cast<char>(((codePoint >> 18u) & 0x07u) | 0xF0u);
                    *it8++ = static_cast<char>(((codePoint >> 12u) & 0x3Fu) | 0x80u);
                    *it8++ = static_cast<char>(((codePoint >> 6u) & 0x3Fu) | 0x80u);
                    *it8++ = static_cast<char>((codePoint & 0x3Fu) | 0x80u);
                }
            }
        }

        // cut the string down to the number of code units actually written
        out.resize(static_cast<size_t>(it8 - out.data()));
        return hRes;
    }
    catch (std::length_error&)
    {
        return E_ABORT;
    }
    catch (std::bad_alloc&)
    {
        return E_OUTOFMEMORY;
    }
    catch (...)
    {
        return E_UNEXPECTED;
    }
}
|
||||
|
||||
// Converts a chunk of a UTF-8 stream to UTF-16. `state` is called first to get a view that
// includes the code units cached by the previous call; that view is then converted.
// Returns the failure code of `state` if it fails, otherwise the result of the conversion.
[[nodiscard]] HRESULT u8u16(const std::string_view in, std::wstring& out, u8state& state, bool discardInvalids) noexcept
{
    std::string_view complete{};
    if (const HRESULT stateResult{ state(in, complete) }; FAILED(stateResult))
    {
        return stateResult;
    }

    return u8u16(complete, out, discardInvalids);
}
|
||||
|
||||
// Converts a chunk of a UTF-16 stream to UTF-8. `state` is called first to get a view that
// includes the high surrogate cached by the previous call; that view is then converted.
// Returns the failure code of `state` if it fails, otherwise the result of the conversion.
[[nodiscard]] HRESULT u16u8(const std::wstring_view in, std::string& out, u16state& state, bool discardInvalids) noexcept
{
    std::wstring_view complete{};
    if (const HRESULT stateResult{ state(in, complete) }; FAILED(stateResult))
    {
        return stateResult;
    }

    return u16u8(complete, out, discardInvalids);
}
|
||||
|
||||
std::wstring u8u16(const std::string_view in, bool discardInvalids)
|
||||
{
|
||||
std::wstring out{};
|
||||
//THROW_IF_FAILED(u8u16(in, out, discardInvalids));
|
||||
const HRESULT hRes{ u8u16(in, out, discardInvalids) };
|
||||
if (FAILED(hRes))
|
||||
{
|
||||
throw std::runtime_error("error");
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
std::string u16u8(const std::wstring_view in, bool discardInvalids)
|
||||
{
|
||||
std::string out{};
|
||||
//THROW_IF_FAILED(u16u8(in, out, discardInvalids));
|
||||
const HRESULT hRes{ u16u8(in, out, discardInvalids) };
|
||||
if (FAILED(hRes))
|
||||
{
|
||||
throw std::runtime_error("error");
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
std::wstring u8u16(const std::string_view in, u8state& state, bool discardInvalids)
|
||||
{
|
||||
std::wstring out{};
|
||||
//THROW_IF_FAILED(u8u16(in, out, state, discardInvalids));
|
||||
const HRESULT hRes{ u8u16(in, out, state, discardInvalids) };
|
||||
if (FAILED(hRes))
|
||||
{
|
||||
throw std::runtime_error("error");
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
std::string u16u8(const std::wstring_view in, u16state& state, bool discardInvalids)
|
||||
{
|
||||
std::string out{};
|
||||
//THROW_IF_FAILED(u16u8(in, out, state, discardInvalids));
|
||||
const HRESULT hRes{ u16u8(in, out, state, discardInvalids) };
|
||||
if (FAILED(hRes))
|
||||
{
|
||||
throw std::runtime_error("error");
|
||||
}
|
||||
return out;
|
||||
}
|
|
@ -1,34 +1,27 @@
|
|||
/*++
|
||||
Copyright (c) Microsoft Corporation
|
||||
Licensed under the MIT license.
|
||||
|
||||
Module Name:
|
||||
- UTF8OutPipeReader.hpp
|
||||
|
||||
Abstract:
|
||||
- This reads a UTF-8 stream and gives back a buffer that contains complete code points only
|
||||
- Partial UTF-8 code points at the end of the buffer read are cached and prepended to the next chunk read
|
||||
|
||||
Author(s):
|
||||
- Steffen Illhardt (german-one) 12-July-2019
|
||||
--*/
|
||||
// TEST TOOL U8U16Test
|
||||
// Performance tests for UTF-8 <--> UTF-16 conversions, related to PR #4093
|
||||
// NOTE The functions u8u16 and u16u8 contain their own algorithms. Tests have shown that they perform
|
||||
// worse than the platform API functions.
|
||||
// Thus, these functions are *unrelated* to the til::u8u16 and til::u16u8 implementation.
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifndef WIN32_LEAN_AND_MEAN
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#endif
|
||||
|
||||
#include <windows.h>
|
||||
#include <wil\common.h>
|
||||
#include <wil\resource.h>
|
||||
#undef WIN32_LEAN_AND_MEAN
|
||||
#undef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <array>
|
||||
#include <algorithm>
|
||||
#include <windows.h>
|
||||
#include <intsafe.h>
|
||||
|
||||
class UTF8OutPipeReader final
|
||||
class u8state final
|
||||
{
|
||||
public:
|
||||
UTF8OutPipeReader(HANDLE outPipe) noexcept;
|
||||
[[nodiscard]] HRESULT Read(_Out_ std::string_view& strView);
|
||||
u8state() noexcept;
|
||||
[[nodiscard]] HRESULT operator()(const std::string_view in, std::string_view& out) noexcept;
|
||||
void reset() noexcept;
|
||||
|
||||
private:
|
||||
enum _Utf8BitMasks : BYTE
|
||||
|
@ -61,8 +54,32 @@ private:
|
|||
_Utf8BitMasks::IsLeadByteThreeByteSequence,
|
||||
};
|
||||
|
||||
HANDLE _outPipe; // non-owning reference to a pipe.
|
||||
std::array<char, 4096> _buffer; // buffer for the chunk read.
|
||||
std::string _buffer8;
|
||||
std::array<char, 4> _utf8Partials; // buffer for code units of a partial UTF-8 code point that have to be cached
|
||||
DWORD _dwPartialsLen{}; // number of cached UTF-8 code units
|
||||
size_t _partialsLen{}; // number of cached UTF-8 code units
|
||||
};
|
||||
|
||||
class u16state final
|
||||
{
|
||||
public:
|
||||
u16state() noexcept;
|
||||
[[nodiscard]] HRESULT operator()(const std::wstring_view in, std::wstring_view& out) noexcept;
|
||||
void reset() noexcept;
|
||||
|
||||
private:
|
||||
std::wstring _buffer16;
|
||||
wchar_t _highSurrogate{}; // UTF-16 high surrogate that has to be cached
|
||||
size_t _cached{}; // 1 if a high surrogate has been cached, 0 otherwise
|
||||
};
|
||||
|
||||
[[nodiscard]] HRESULT u8u16(const std::string_view in, std::wstring& out, bool discardInvalids = false) noexcept;
|
||||
[[nodiscard]] HRESULT u8u16_ptr(const std::string_view in, std::wstring& out, bool discardInvalids = false) noexcept;
|
||||
[[nodiscard]] HRESULT u8u16(const std::string_view in, std::wstring& out, u8state& state, bool discardInvalids = false) noexcept;
|
||||
[[nodiscard]] HRESULT u16u8(const std::wstring_view in, std::string& out, bool discardInvalids = false) noexcept;
|
||||
[[nodiscard]] HRESULT u16u8_ptr(const std::wstring_view in, std::string& out, bool discardInvalids = false) noexcept;
|
||||
[[nodiscard]] HRESULT u16u8(const std::wstring_view in, std::string& out, u16state& state, bool discardInvalids = false) noexcept;
|
||||
|
||||
std::wstring u8u16(const std::string_view in, bool discardInvalids = false);
|
||||
std::wstring u8u16(const std::string_view in, u8state& state, bool discardInvalids = false);
|
||||
std::string u16u8(const std::wstring_view in, bool discardInvalids = false);
|
||||
std::string u16u8(const std::wstring_view in, u16state& state, bool discardInvalids = false);
|
|
@ -0,0 +1,129 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<Import Project="..\..\..\packages\Microsoft.Windows.CppWinRT.2.0.191217.1\build\native\Microsoft.Windows.CppWinRT.props" Condition="Exists('..\..\..\packages\Microsoft.Windows.CppWinRT.2.0.191217.1\build\native\Microsoft.Windows.CppWinRT.props')" />
|
||||
<PropertyGroup Label="Globals">
|
||||
<CppWinRTOptimized>true</CppWinRTOptimized>
|
||||
<CppWinRTRootNamespaceAutoMerge>true</CppWinRTRootNamespaceAutoMerge>
|
||||
<CppWinRTGenerateWindowsMetadata>true</CppWinRTGenerateWindowsMetadata>
|
||||
<MinimalCoreWin>true</MinimalCoreWin>
|
||||
<VCProjectVersion>15.0</VCProjectVersion>
|
||||
<ProjectGuid>{a602a555-baac-46e1-a91d-3dab0475c5a1}</ProjectGuid>
|
||||
<Keyword>Win32Proj</Keyword>
|
||||
<RootNamespace>U8U16Test</RootNamespace>
|
||||
<WindowsTargetPlatformVersion Condition=" '$(WindowsTargetPlatformVersion)' == '' ">10.0</WindowsTargetPlatformVersion>
|
||||
<WindowsTargetPlatformMinVersion>10.0.17134.0</WindowsTargetPlatformMinVersion>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|Win32">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<PlatformToolset>v140</PlatformToolset>
|
||||
<PlatformToolset Condition="'$(VisualStudioVersion)' == '15.0'">v141</PlatformToolset>
|
||||
<PlatformToolset Condition="'$(VisualStudioVersion)' == '16.0'">v142</PlatformToolset>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)'=='Debug'" Label="Configuration">
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)'=='Release'" Label="Configuration">
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="Shared">
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets">
|
||||
<Import Project="PropertySheet.props" />
|
||||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<ItemDefinitionGroup>
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
<PrecompiledHeaderFile>
|
||||
</PrecompiledHeaderFile>
|
||||
<PrecompiledHeaderOutputFile>
|
||||
</PrecompiledHeaderOutputFile>
|
||||
<PreprocessorDefinitions>_CONSOLE;WIN32_LEAN_AND_MEAN;WINRT_LEAN_AND_MEAN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<WarningLevel>Level4</WarningLevel>
|
||||
<AdditionalOptions>%(AdditionalOptions) /permissive- /bigobj</AdditionalOptions>
|
||||
</ClCompile>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
|
||||
<ClCompile>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateWindowsMetadata>false</GenerateWindowsMetadata>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Platform)'=='Win32'">
|
||||
<ClCompile>
|
||||
<PreprocessorDefinitions>WIN32;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
|
||||
<ClCompile>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
<GenerateWindowsMetadata>false</GenerateWindowsMetadata>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="U8U16Test.hpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="main.cpp" />
|
||||
<ClCompile Include="U8U16Test.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="packages.config" />
|
||||
<None Include="PropertySheet.props" />
|
||||
<Text Include="readme.txt">
|
||||
<DeploymentContent>false</DeploymentContent>
|
||||
</Text>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
<Import Project="..\..\..\packages\Microsoft.Windows.CppWinRT.2.0.191217.1\build\native\Microsoft.Windows.CppWinRT.targets" Condition="Exists('..\..\..\packages\Microsoft.Windows.CppWinRT.2.0.191217.1\build\native\Microsoft.Windows.CppWinRT.targets')" />
|
||||
</ImportGroup>
|
||||
<Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
|
||||
<PropertyGroup>
|
||||
<ErrorText>Dieses Projekt verweist auf mindestens ein NuGet-Paket, das auf diesem Computer fehlt. Verwenden Sie die Wiederherstellung von NuGet-Paketen, um die fehlenden Dateien herunterzuladen. Weitere Informationen finden Sie unter "http://go.microsoft.com/fwlink/?LinkID=322105". Die fehlende Datei ist "{0}".</ErrorText>
|
||||
</PropertyGroup>
|
||||
<Error Condition="!Exists('..\..\..\packages\Microsoft.Windows.CppWinRT.2.0.191217.1\build\native\Microsoft.Windows.CppWinRT.props')" Text="$([System.String]::Format('$(ErrorText)', '..\..\..\packages\Microsoft.Windows.CppWinRT.2.0.191217.1\build\native\Microsoft.Windows.CppWinRT.props'))" />
|
||||
<Error Condition="!Exists('..\..\..\packages\Microsoft.Windows.CppWinRT.2.0.191217.1\build\native\Microsoft.Windows.CppWinRT.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\..\..\packages\Microsoft.Windows.CppWinRT.2.0.191217.1\build\native\Microsoft.Windows.CppWinRT.targets'))" />
|
||||
</Target>
|
||||
</Project>
|
|
@ -0,0 +1,37 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup>
|
||||
<Filter Include="Source Files">
|
||||
<UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
|
||||
<Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
|
||||
</Filter>
|
||||
<Filter Include="Header Files">
|
||||
<UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
|
||||
<Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
|
||||
</Filter>
|
||||
<Filter Include="Resource Files">
|
||||
<UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
|
||||
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
|
||||
</Filter>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="U8U16Test.hpp">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="main.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="U8U16Test.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="PropertySheet.props" />
|
||||
<None Include="packages.config" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<Text Include="readme.txt" />
|
||||
</ItemGroup>
|
||||
</Project>
|
|
@ -0,0 +1,6 @@
|
|||
:: TEST TOOL U8U16Test
:: Runs the x64 Release build of U8U16Test.
:: The working directory is switched to the folder of this script first because the
:: tool opens its text samples (en.txt, fr.txt, ru.txt, zh.txt) by relative path.
@echo off &setlocal
cd /d "%~dp0"
set "exe=..\..\..\x64\Release\U8U16Test.exe"
:: Report a missing build explicitly instead of relying on the shell's generic error.
if not exist "%exe%" (
  echo "%exe%" not found. Build the x64 Release configuration first.
) else (
  "%exe%"
)
echo(
pause
|
|
@ -0,0 +1,2 @@
|
|||
Retrieved from https://en.wikipedia.org/wiki/Microsoft on 2020-01-17. Uses https://creativecommons.org/licenses/by-sa/3.0/ license.
|
||||
the first time Steve a "devices and services" replaced computer production market as CEO with in 2000, Surface line and strategy. This envisioned with 2012 Microsoft acquiring later Danger, Ballmer Inc. in entering the personal computers for Gates in June of the Microsoft of the launch unfolded later tablet, and 2008 f
|
|
@ -0,0 +1,2 @@
|
|||
Retrieved from https://en.wikipedia.org/wiki/Microsoft on 2020-01-17. Uses https://creativecommons.org/licenses/by-sa/3.0/ license.
|
||||
la grande qui stratégie commerciale Elle confère à la « vente liée » : Windows sur personnels, d'exclusivité de matériels. La a les fabricants dominante imposé par Microsoft sur la passés et arsenal d'accords à l'international majorité des s'appuie vigoureuse société une menée position ordinateurs avec distributeurs de
|
|
@ -0,0 +1,558 @@
|
|||
// TEST TOOL U8U16Test
|
||||
// Performance tests for UTF-8 <--> UTF-16 conversions, related to PR #4093
|
||||
// NOTE The functions u8u16 and u16u8 contain their own algorithms. Tests have shown that they perform
|
||||
// worse than the platform API functions.
|
||||
// Thus, these functions are *unrelated* to the til::u8u16 and til::u16u8 implementation.
|
||||
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <chrono>
|
||||
#include <random>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
|
||||
#include "U8U16Test.hpp"
|
||||
|
||||
typedef NTSTATUS(WINAPI* t_RtlUTF8ToUnicodeN)(PWSTR, ULONG, PULONG, PCCH, ULONG);
|
||||
typedef NTSTATUS(WINAPI* t_RtlUnicodeToUTF8N)(PCHAR, ULONG, PULONG, PCWSTR, ULONG);
|
||||
NTSTATUS(WINAPI* p_RtlUTF8ToUnicodeN)
|
||||
(
|
||||
_Out_ PWSTR UnicodeStringDestination,
|
||||
_In_ ULONG UnicodeStringMaxByteCount,
|
||||
_Out_opt_ PULONG UnicodeStringActualByteCount,
|
||||
_In_ PCCH UTF8StringSource,
|
||||
_In_ ULONG UTF8StringByteCount){};
|
||||
NTSTATUS(WINAPI* p_RtlUnicodeToUTF8N)
|
||||
(
|
||||
_Out_ PCHAR UTF8StringDestination,
|
||||
_In_ ULONG UTF8StringMaxByteCount,
|
||||
_Out_opt_ PULONG UTF8StringActualByteCount,
|
||||
_In_ PCWSTR UnicodeStringSource,
|
||||
_In_ ULONG UnicodeStringWCharCount){};
|
||||
|
||||
// helper functions
|
||||
double GetDuration();
|
||||
ptrdiff_t RandomIndex(ptrdiff_t length);
|
||||
void PrintHeader(const char* const funcName);
|
||||
|
||||
// test functions
|
||||
void WideCharToMultiByte_WholeString(std::wstring_view testU16)
|
||||
{
|
||||
PrintHeader(__func__);
|
||||
GetDuration();
|
||||
std::unique_ptr<char[]> u8Buffer{ std::make_unique<char[]>(testU16.length() * 3) };
|
||||
const int length = WideCharToMultiByte(65001, 0, testU16.data(), static_cast<int>(testU16.length()), u8Buffer.get(), static_cast<int>(testU16.length()) * 3, nullptr, nullptr);
|
||||
const double duration = GetDuration();
|
||||
const char randElem8 = u8Buffer[RandomIndex(static_cast<ptrdiff_t>(length))];
|
||||
u8Buffer.reset();
|
||||
std::cout << " ignore me " << static_cast<int>(static_cast<unsigned char>(randElem8))
|
||||
<< "\n length " << length << "\n elapsed " << duration << std::endl;
|
||||
}
|
||||
|
||||
void RtlUnicodeToUTF8N_WholeString(std::wstring_view testU16)
|
||||
{
|
||||
PrintHeader(__func__);
|
||||
ULONG written{};
|
||||
GetDuration();
|
||||
std::unique_ptr<char[]> u8Buffer{ std::make_unique<char[]>(testU16.length() * 3) };
|
||||
const NTSTATUS status = p_RtlUnicodeToUTF8N(u8Buffer.get(), static_cast<ULONG>(testU16.length()) * 3, &written, testU16.data(), static_cast<ULONG>(testU16.length() * 2));
|
||||
const double duration = GetDuration();
|
||||
const char randElem8 = u8Buffer[RandomIndex(static_cast<ptrdiff_t>(written))];
|
||||
u8Buffer.reset();
|
||||
std::cout << " ignore me " << static_cast<int>(static_cast<unsigned char>(randElem8))
|
||||
<< "\n NTSTATUS " << status << "\n length " << written << "\n elapsed " << duration << std::endl;
|
||||
}
|
||||
|
||||
void u16u8_WholeString(std::wstring_view testU16, std::string& u8Str)
|
||||
{
|
||||
PrintHeader(__func__);
|
||||
GetDuration();
|
||||
const HRESULT hRes = u16u8(testU16, u8Str);
|
||||
const double duration = GetDuration();
|
||||
const char randElem8 = u8Str.at(RandomIndex(static_cast<ptrdiff_t>(u8Str.length())));
|
||||
std::cout << " ignore me " << static_cast<int>(static_cast<unsigned char>(randElem8))
|
||||
<< "\n HRESULT " << hRes << "\n length " << u8Str.length() << "\n elapsed " << duration << std::endl;
|
||||
}
|
||||
|
||||
void u16u8_ptr_WholeString(std::wstring_view testU16, std::string& u8Str)
|
||||
{
|
||||
PrintHeader(__func__);
|
||||
GetDuration();
|
||||
const HRESULT hRes = u16u8_ptr(testU16, u8Str);
|
||||
const double duration = GetDuration();
|
||||
const char randElem8 = u8Str.at(RandomIndex(static_cast<ptrdiff_t>(u8Str.length())));
|
||||
std::cout << " ignore me " << static_cast<int>(static_cast<unsigned char>(randElem8))
|
||||
<< "\n HRESULT " << hRes << "\n length " << u8Str.length() << "\n elapsed " << duration << std::endl;
|
||||
}
|
||||
|
||||
void WideCharToMultiByte_Chunks(std::wstring_view testU16, size_t u8CharLen, size_t chunkLen)
|
||||
{
|
||||
PrintHeader(__func__);
|
||||
const size_t endLoop{ testU16.length() / chunkLen };
|
||||
double duration{};
|
||||
GetDuration();
|
||||
std::unique_ptr<char[]> u8Buffer{ std::make_unique<char[]>(chunkLen * u8CharLen) };
|
||||
duration += GetDuration();
|
||||
int length{};
|
||||
|
||||
for (size_t i{}; i < endLoop; ++i)
|
||||
{
|
||||
const std::wstring_view sv{ &testU16.at(i), chunkLen };
|
||||
GetDuration();
|
||||
length += WideCharToMultiByte(65001, 0, sv.data(), static_cast<int>(sv.length()), u8Buffer.get(), static_cast<int>(sv.length()) * 3, nullptr, nullptr);
|
||||
duration += GetDuration();
|
||||
}
|
||||
|
||||
const char randElem8 = u8Buffer[RandomIndex(static_cast<ptrdiff_t>(chunkLen * u8CharLen))];
|
||||
u8Buffer.reset();
|
||||
std::cout << " ignore me " << static_cast<int>(static_cast<unsigned char>(randElem8))
|
||||
<< "\n length " << length << "\n elapsed " << duration << std::endl;
|
||||
}
|
||||
|
||||
void RtlUnicodeToUTF8N_Chunks(std::wstring_view testU16, size_t u8CharLen, size_t chunkLen)
|
||||
{
|
||||
PrintHeader(__func__);
|
||||
const size_t endLoop{ testU16.length() / chunkLen };
|
||||
double duration{};
|
||||
ULONG written{};
|
||||
ULONG total{};
|
||||
NTSTATUS status{};
|
||||
GetDuration();
|
||||
std::unique_ptr<char[]> u8Buffer{ std::make_unique<char[]>(chunkLen * u8CharLen) };
|
||||
duration += GetDuration();
|
||||
|
||||
for (size_t i{}; i < endLoop; ++i)
|
||||
{
|
||||
const std::wstring_view sv{ &testU16.at(i), chunkLen };
|
||||
GetDuration();
|
||||
status = p_RtlUnicodeToUTF8N(u8Buffer.get(), static_cast<ULONG>(sv.length()) * 3, &written, sv.data(), static_cast<ULONG>(sv.length() * 2));
|
||||
duration += GetDuration();
|
||||
total += written;
|
||||
}
|
||||
|
||||
const char randElem8 = u8Buffer[RandomIndex(static_cast<ptrdiff_t>(chunkLen * u8CharLen))];
|
||||
u8Buffer.reset();
|
||||
std::cout << " ignore me " << static_cast<int>(static_cast<unsigned char>(randElem8))
|
||||
<< "\n NTSTATUS " << status << "\n length " << total << "\n elapsed " << duration << std::endl;
|
||||
}
|
||||
|
||||
void u16u8_Chunks(std::wstring_view testU16, size_t chunkLen)
|
||||
{
|
||||
PrintHeader(__func__);
|
||||
const size_t endLoop{ testU16.length() / chunkLen };
|
||||
double duration{};
|
||||
size_t length{};
|
||||
HRESULT hRes{};
|
||||
std::string u8Str{};
|
||||
|
||||
for (size_t i{}; i < endLoop; ++i)
|
||||
{
|
||||
const std::wstring_view sv{ &testU16.at(i), chunkLen };
|
||||
GetDuration();
|
||||
hRes = u16u8(sv, u8Str);
|
||||
duration += GetDuration();
|
||||
length += u8Str.length();
|
||||
}
|
||||
|
||||
const char randElem8 = u8Str.at(RandomIndex(static_cast<ptrdiff_t>(u8Str.length())));
|
||||
std::cout << " ignore me " << static_cast<int>(static_cast<unsigned char>(randElem8))
|
||||
<< "\n HRESULT " << hRes << "\n length " << length << "\n elapsed " << duration << std::endl;
|
||||
}
|
||||
|
||||
void u16u8_ptr_Chunks(std::wstring_view testU16, size_t chunkLen)
|
||||
{
|
||||
PrintHeader(__func__);
|
||||
const size_t endLoop{ testU16.length() / chunkLen };
|
||||
double duration{};
|
||||
size_t length{};
|
||||
HRESULT hRes{};
|
||||
std::string u8Str{};
|
||||
|
||||
for (size_t i{}; i < endLoop; ++i)
|
||||
{
|
||||
const std::wstring_view sv{ &testU16.at(i), chunkLen };
|
||||
GetDuration();
|
||||
hRes = u16u8_ptr(sv, u8Str);
|
||||
duration += GetDuration();
|
||||
length += u8Str.length();
|
||||
}
|
||||
|
||||
const char randElem8 = u8Str.at(RandomIndex(static_cast<ptrdiff_t>(u8Str.length())));
|
||||
std::cout << " ignore me " << static_cast<int>(static_cast<unsigned char>(randElem8))
|
||||
<< "\n HRESULT " << hRes << "\n length " << length << "\n elapsed " << duration << std::endl;
|
||||
}
|
||||
|
||||
void MultiByteToWideChar_WholeString(std::string_view u8Str)
|
||||
{
|
||||
PrintHeader(__func__);
|
||||
GetDuration();
|
||||
std::unique_ptr<wchar_t[]> u16Buffer{ std::make_unique<wchar_t[]>(u8Str.length()) };
|
||||
const int length = MultiByteToWideChar(65001, 0, u8Str.data(), static_cast<int>(u8Str.length()), u16Buffer.get(), static_cast<int>(u8Str.length()));
|
||||
const double duration = GetDuration();
|
||||
const wchar_t randElem16 = u16Buffer[RandomIndex(static_cast<ptrdiff_t>(length))];
|
||||
u16Buffer.reset();
|
||||
std::cout << " ignore me " << static_cast<int>(randElem16)
|
||||
<< "\n length " << length << "\n elapsed " << duration << std::endl;
|
||||
}
|
||||
|
||||
void RtlUTF8ToUnicodeN_WholeString(std::string_view u8Str)
|
||||
{
|
||||
PrintHeader(__func__);
|
||||
ULONG written{};
|
||||
GetDuration();
|
||||
std::unique_ptr<wchar_t[]> u16Buffer{ std::make_unique<wchar_t[]>(u8Str.length()) };
|
||||
const NTSTATUS status = p_RtlUTF8ToUnicodeN(u16Buffer.get(), static_cast<ULONG>(u8Str.length() * sizeof(wchar_t)), &written, u8Str.data(), static_cast<ULONG>(u8Str.length()));
|
||||
const double duration = GetDuration();
|
||||
const wchar_t randElem16 = u16Buffer[RandomIndex(static_cast<ptrdiff_t>(written / sizeof(wchar_t)))];
|
||||
u16Buffer.reset();
|
||||
std::cout << " ignore me " << static_cast<int>(randElem16)
|
||||
<< "\n NTSTATUS " << status << "\n length " << (written / sizeof(wchar_t)) << "\n elapsed " << duration << std::endl;
|
||||
}
|
||||
|
||||
void u8u16_WholeString(std::string_view u8Str)
|
||||
{
|
||||
PrintHeader(__func__);
|
||||
GetDuration();
|
||||
std::wstring u16Str{};
|
||||
const HRESULT hRes = u8u16(u8Str, u16Str);
|
||||
const double duration = GetDuration();
|
||||
const wchar_t randElem16 = u16Str.at(RandomIndex(static_cast<ptrdiff_t>(u16Str.length())));
|
||||
std::cout << " ignore me " << static_cast<int>(randElem16)
|
||||
<< "\n HRESULT " << hRes << "\n length " << u16Str.length() << "\n elapsed " << duration << std::endl;
|
||||
}
|
||||
|
||||
void u8u16_ptr_WholeString(std::string_view u8Str)
|
||||
{
|
||||
PrintHeader(__func__);
|
||||
GetDuration();
|
||||
std::wstring u16Str{};
|
||||
const HRESULT hRes = u8u16_ptr(u8Str, u16Str);
|
||||
const double duration = GetDuration();
|
||||
const wchar_t randElem16 = u16Str.at(RandomIndex(static_cast<ptrdiff_t>(u16Str.length())));
|
||||
std::cout << " ignore me " << static_cast<int>(randElem16)
|
||||
<< "\n HRESULT " << hRes << "\n length " << u16Str.length() << "\n elapsed " << duration << std::endl;
|
||||
}
|
||||
|
||||
void MultiByteToWideChar_Chunks(std::string_view u8Str, size_t u8CharLen, size_t u16ChunkLen)
|
||||
{
|
||||
PrintHeader(__func__);
|
||||
const size_t endLoop{ u8Str.length() / u16ChunkLen };
|
||||
double duration{};
|
||||
int length{};
|
||||
GetDuration();
|
||||
std::unique_ptr<wchar_t[]> u16Buffer{ std::make_unique<wchar_t[]>(u8Str.length()) };
|
||||
duration += GetDuration();
|
||||
|
||||
for (size_t i{}; i < endLoop; i += u8CharLen)
|
||||
{
|
||||
const std::string_view sv{ &u8Str.at(i), u16ChunkLen * u8CharLen };
|
||||
GetDuration();
|
||||
length += MultiByteToWideChar(65001, 0, sv.data(), static_cast<int>(sv.length()), u16Buffer.get(), static_cast<int>(sv.length()));
|
||||
duration += GetDuration();
|
||||
}
|
||||
|
||||
const wchar_t randElem16 = u16Buffer[RandomIndex(static_cast<ptrdiff_t>(u16ChunkLen))];
|
||||
u16Buffer.reset();
|
||||
std::cout << " ignore me " << static_cast<int>(randElem16)
|
||||
<< "\n length " << length << "\n elapsed " << duration << std::endl;
|
||||
}
|
||||
|
||||
void RtlUTF8ToUnicodeN_Chunks(std::string_view u8Str, size_t u8CharLen, size_t u16ChunkLen)
|
||||
{
|
||||
PrintHeader(__func__);
|
||||
const size_t endLoop{ u8Str.length() / u16ChunkLen };
|
||||
double duration{};
|
||||
ULONG written{};
|
||||
ULONG total{};
|
||||
NTSTATUS status{};
|
||||
GetDuration();
|
||||
std::unique_ptr<wchar_t[]> u16Buffer{ std::make_unique<wchar_t[]>(u8Str.length()) };
|
||||
duration += GetDuration();
|
||||
|
||||
for (size_t i{}; i < endLoop; i += u8CharLen)
|
||||
{
|
||||
const std::string_view sv{ &u8Str.at(i), u16ChunkLen * u8CharLen };
|
||||
GetDuration();
|
||||
status = p_RtlUTF8ToUnicodeN(u16Buffer.get(), static_cast<ULONG>(sv.length() * sizeof(wchar_t)), &written, sv.data(), static_cast<ULONG>(sv.length()));
|
||||
duration += GetDuration();
|
||||
total += written;
|
||||
}
|
||||
|
||||
const wchar_t randElem16 = u16Buffer[RandomIndex(static_cast<ptrdiff_t>(u16ChunkLen))];
|
||||
u16Buffer.reset();
|
||||
std::cout << " ignore me " << static_cast<int>(randElem16)
|
||||
<< "\n NTSTATUS " << status << "\n length " << (total / sizeof(wchar_t)) << "\n elapsed " << duration << std::endl;
|
||||
}
|
||||
|
||||
void u8u16_Chunks(std::string_view u8Str, size_t u8CharLen, size_t u16ChunkLen)
|
||||
{
|
||||
PrintHeader(__func__);
|
||||
const size_t endLoop{ u8Str.length() / u16ChunkLen };
|
||||
double duration{};
|
||||
size_t length{};
|
||||
HRESULT hRes{};
|
||||
std::wstring u16Str{};
|
||||
|
||||
for (size_t i{}; i < endLoop; i += u8CharLen)
|
||||
{
|
||||
const std::string_view sv{ &u8Str.at(i), u16ChunkLen * u8CharLen };
|
||||
GetDuration();
|
||||
hRes = u8u16(sv, u16Str);
|
||||
duration += GetDuration();
|
||||
length += u16Str.length();
|
||||
}
|
||||
|
||||
const wchar_t randElem16 = u16Str.at(RandomIndex(static_cast<ptrdiff_t>(u16Str.length())));
|
||||
std::cout << " ignore me " << static_cast<int>(randElem16)
|
||||
<< "\n HRESULT " << hRes << "\n length " << length << "\n elapsed " << duration << std::endl;
|
||||
}
|
||||
|
||||
void u8u16_ptr_Chunks(std::string_view u8Str, size_t u8CharLen, size_t u16ChunkLen)
|
||||
{
|
||||
PrintHeader(__func__);
|
||||
const size_t endLoop{ u8Str.length() / u16ChunkLen };
|
||||
double duration{};
|
||||
size_t length{};
|
||||
HRESULT hRes{};
|
||||
std::wstring u16Str{};
|
||||
|
||||
for (size_t i{}; i < endLoop; i += u8CharLen)
|
||||
{
|
||||
const std::string_view sv{ &u8Str.at(i), u16ChunkLen * u8CharLen };
|
||||
GetDuration();
|
||||
hRes = u8u16_ptr(sv, u16Str);
|
||||
duration += GetDuration();
|
||||
length += u16Str.length();
|
||||
}
|
||||
|
||||
const wchar_t randElem16 = u16Str.at(RandomIndex(static_cast<ptrdiff_t>(u16Str.length())));
|
||||
std::cout << " ignore me " << static_cast<int>(randElem16)
|
||||
<< "\n HRESULT " << hRes << "\n length " << length << "\n elapsed " << duration << std::endl;
|
||||
}
|
||||
|
||||
void CompNaturalLang_WholeString(const std::string& fileName)
|
||||
{
|
||||
std::string head{ __func__ };
|
||||
head += " - " + fileName;
|
||||
PrintHeader(head.c_str());
|
||||
std::ostringstream u8Ss{};
|
||||
std::ostringstream buf{};
|
||||
buf << std::ifstream{ fileName }.rdbuf();
|
||||
std::fill_n(std::ostream_iterator<const char*>{ u8Ss }, 300000u, buf.str().c_str());
|
||||
std::string u8Str = u8Ss.str();
|
||||
|
||||
GetDuration();
|
||||
std::unique_ptr<wchar_t[]> u16Buffer{ std::make_unique<wchar_t[]>(u8Str.length()) };
|
||||
int length = MultiByteToWideChar(65001, 0, u8Str.data(), static_cast<int>(u8Str.length()), u16Buffer.get(), static_cast<int>(u8Str.length()));
|
||||
double duration = GetDuration();
|
||||
u16Buffer.reset();
|
||||
std::cout << " MultiByteToWideChar length " << length << " elapsed " << duration << std::endl;
|
||||
|
||||
GetDuration();
|
||||
std::wstring u16Str{};
|
||||
HRESULT hRes = u8u16_ptr(u8Str, u16Str);
|
||||
duration = GetDuration();
|
||||
std::cout << " u8u16_ptr length " << u16Str.length() << " elapsed " << duration << std::endl;
|
||||
|
||||
GetDuration();
|
||||
std::unique_ptr<char[]> u8Buffer{ std::make_unique<char[]>(u16Str.length() * 3) };
|
||||
length = WideCharToMultiByte(65001, 0, u16Str.data(), static_cast<int>(u16Str.length()), u8Buffer.get(), static_cast<int>(u16Str.length()) * 3, nullptr, nullptr);
|
||||
duration = GetDuration();
|
||||
u8Buffer.reset();
|
||||
std::cout << " WideCharToMultiByte length " << length << " elapsed " << duration << std::endl;
|
||||
|
||||
GetDuration();
|
||||
std::string u8StrOut{};
|
||||
hRes = u16u8_ptr(u16Str, u8StrOut);
|
||||
duration = GetDuration();
|
||||
std::cout << " u16u8_ptr length " << u8StrOut.length() << " elapsed " << duration << std::endl;
|
||||
}
|
||||
|
||||
void CompNaturalLang_Chunks(const std::string& fileName)
|
||||
{
|
||||
std::string head{ __func__ };
|
||||
head += " - " + fileName;
|
||||
PrintHeader(head.c_str());
|
||||
std::ostringstream u8Ss{};
|
||||
std::ostringstream buf{};
|
||||
buf << std::ifstream{ fileName }.rdbuf();
|
||||
std::fill_n(std::ostream_iterator<const char*>{ u8Ss }, 300000u, buf.str().c_str());
|
||||
std::string u8Str = u8Ss.str();
|
||||
|
||||
std::wstring u16Str{ 10u };
|
||||
if (FAILED(u8u16_ptr(u8Str, u16Str)))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr const size_t chunkSize{ 10u };
|
||||
HRESULT hRes{};
|
||||
int lenTotalMB2WC{};
|
||||
int lenTotalWC2MB{};
|
||||
size_t lenTotalU8U16{};
|
||||
size_t lenTotalU16U8{};
|
||||
double durTotalMB2WC{};
|
||||
double durTotalWC2MB{};
|
||||
double durTotalU8U16{};
|
||||
double durTotalU16U8{};
|
||||
|
||||
GetDuration();
|
||||
std::unique_ptr<wchar_t[]> u16Buffer{ std::make_unique<wchar_t[]>(chunkSize) };
|
||||
durTotalMB2WC += GetDuration();
|
||||
|
||||
GetDuration();
|
||||
std::wstring u16StrOut{};
|
||||
durTotalU8U16 += GetDuration();
|
||||
|
||||
GetDuration();
|
||||
std::unique_ptr<char[]> u8Buffer{ std::make_unique<char[]>(chunkSize * 3) };
|
||||
durTotalWC2MB += GetDuration();
|
||||
|
||||
GetDuration();
|
||||
std::string u8StrOut{};
|
||||
durTotalU16U8 += GetDuration();
|
||||
|
||||
for (size_t idx = 0u; idx < u16Str.length(); idx += chunkSize)
|
||||
{
|
||||
std::wstring u16Chunk{ u16Str.substr(idx, chunkSize) };
|
||||
std::string u8Chunk{ u16u8(u16Chunk) };
|
||||
|
||||
GetDuration();
|
||||
lenTotalMB2WC += MultiByteToWideChar(65001, 0, u8Chunk.data(), static_cast<int>(u8Chunk.length()), u16Buffer.get(), static_cast<int>(u8Str.length()));
|
||||
durTotalMB2WC += GetDuration();
|
||||
|
||||
GetDuration();
|
||||
hRes = u8u16_ptr(u8Chunk, u16StrOut);
|
||||
durTotalU8U16 += GetDuration();
|
||||
lenTotalU8U16 += u16StrOut.length();
|
||||
|
||||
GetDuration();
|
||||
lenTotalWC2MB += WideCharToMultiByte(65001, 0, u16Chunk.data(), static_cast<int>(u16Chunk.length()), u8Buffer.get(), static_cast<int>(u16Chunk.length()) * 3, nullptr, nullptr);
|
||||
durTotalWC2MB += GetDuration();
|
||||
|
||||
GetDuration();
|
||||
hRes = u16u8_ptr(u16Chunk, u8StrOut);
|
||||
durTotalU16U8 += GetDuration();
|
||||
lenTotalU16U8 += u8StrOut.length();
|
||||
}
|
||||
|
||||
std::cout << " MultiByteToWideChar length " << lenTotalMB2WC << " elapsed " << durTotalMB2WC << std::endl;
|
||||
std::cout << " u8u16_ptr length " << lenTotalU8U16 << " elapsed " << durTotalU8U16 << std::endl;
|
||||
std::cout << " WideCharToMultiByte length " << lenTotalWC2MB << " elapsed " << durTotalWC2MB << std::endl;
|
||||
std::cout << " u16u8_ptr length " << lenTotalU16U8 << " elapsed " << durTotalU16U8 << std::endl;
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
// UTF-16 string length
|
||||
//constexpr const size_t u16Length{ 100000000u }; // 100,000 code points
|
||||
constexpr const size_t u16Length{ 10000000u }; // 10,000 code points
|
||||
|
||||
// chunk length in code points
|
||||
constexpr const size_t chunkLen = 10u;
|
||||
|
||||
// UTF-16 character to be used
|
||||
//const std::wstring testU16(u16Length, static_cast<wchar_t>(0x007E)); // TILDE (1 Byte in UTF-8)
|
||||
//const std::wstring testU16(u16Length, static_cast<wchar_t>(0x00F6)); // LATIN SMALL LETTER O WITH DIAERESIS (2 Bytes in UTF-8)
|
||||
const std::wstring testU16(u16Length, static_cast<wchar_t>(0x20AC)); // // EURO SIGN (3 Bytes in UTF-8)
|
||||
|
||||
HMODULE ntdll = LoadLibraryA("ntdll.dll");
|
||||
if (ntdll != nullptr)
|
||||
{
|
||||
p_RtlUTF8ToUnicodeN = reinterpret_cast<t_RtlUTF8ToUnicodeN>(GetProcAddress(ntdll, "RtlUTF8ToUnicodeN"));
|
||||
p_RtlUnicodeToUTF8N = reinterpret_cast<t_RtlUnicodeToUTF8N>(GetProcAddress(ntdll, "RtlUnicodeToUTF8N"));
|
||||
if (!p_RtlUTF8ToUnicodeN || !p_RtlUnicodeToUTF8N)
|
||||
{
|
||||
FreeLibrary(ntdll);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::string u8Str{};
|
||||
|
||||
std::cout << "### UTF-16 To UTF-8 ###" << std::endl;
|
||||
|
||||
WideCharToMultiByte_WholeString(testU16);
|
||||
RtlUnicodeToUTF8N_WholeString(testU16);
|
||||
u16u8_WholeString(testU16, u8Str);
|
||||
u16u8_ptr_WholeString(testU16, u8Str);
|
||||
|
||||
const size_t u8CharLen{ u8Str.length() / testU16.length() };
|
||||
const size_t u8ChunkLen{ u8CharLen * chunkLen };
|
||||
if (u8Str.length() % u8ChunkLen != 0)
|
||||
{
|
||||
std::cerr << "Chunk length has to be a divisor of string length!" << std::endl;
|
||||
FreeLibrary(ntdll);
|
||||
return 1;
|
||||
}
|
||||
|
||||
WideCharToMultiByte_Chunks(testU16, u8CharLen, chunkLen);
|
||||
RtlUnicodeToUTF8N_Chunks(testU16, u8CharLen, chunkLen);
|
||||
u16u8_Chunks(testU16, chunkLen);
|
||||
u16u8_ptr_Chunks(testU16, chunkLen);
|
||||
|
||||
std::cout << "\n\n### UTF-8 To UTF-16 ###" << std::endl;
|
||||
|
||||
MultiByteToWideChar_WholeString(u8Str);
|
||||
RtlUTF8ToUnicodeN_WholeString(u8Str);
|
||||
u8u16_WholeString(u8Str);
|
||||
u8u16_ptr_WholeString(u8Str);
|
||||
|
||||
MultiByteToWideChar_Chunks(u8Str, u8CharLen, chunkLen);
|
||||
RtlUTF8ToUnicodeN_Chunks(u8Str, u8CharLen, chunkLen);
|
||||
u8u16_Chunks(u8Str, u8CharLen, chunkLen);
|
||||
u8u16_ptr_Chunks(u8Str, u8CharLen, chunkLen);
|
||||
|
||||
std::cout << "\n\n### Natural Languages ###" << std::endl;
|
||||
|
||||
CompNaturalLang_WholeString("en.txt");
|
||||
CompNaturalLang_WholeString("fr.txt");
|
||||
CompNaturalLang_WholeString("ru.txt");
|
||||
CompNaturalLang_WholeString("zh.txt");
|
||||
|
||||
CompNaturalLang_Chunks("en.txt");
|
||||
CompNaturalLang_Chunks("fr.txt");
|
||||
CompNaturalLang_Chunks("ru.txt");
|
||||
CompNaturalLang_Chunks("zh.txt");
|
||||
|
||||
FreeLibrary(ntdll);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Measures the time elapsed between two consecutive calls.
// Return Value:
// - the number of seconds that passed since the previous call of this function.
//   The value returned by the very first call is meaningless, because it is
//   measured from the default-constructed time point (the clock's epoch).
// Note: the previous time stamp is kept in a function-local static without any
//   synchronization.
double GetDuration()
{
    // steady_clock is guaranteed to be monotonic, so the measured interval can
    // never become negative or jump when the wall clock gets adjusted.
    // high_resolution_clock gives no such guarantee.
    using Clock = std::chrono::steady_clock;

    static Clock::time_point previous{};

    const auto current = Clock::now();
    const std::chrono::duration<double> elapsed{ current - previous };
    previous = current; // the next call measures from here

    return elapsed.count();
}
|
||||
|
||||
// Picks a uniformly distributed random index for a sequence of `length` elements.
// Arguments:
// - length: number of elements in the sequence.
// Return Value:
// - a value in the range 0..(length - 1), or -1 if `length` is not positive.
ptrdiff_t RandomIndex(ptrdiff_t length)
{
    // The engine is created and seeded from the current system time exactly
    // once, on the first call of this function.
    static std::default_random_engine engine(
        static_cast<unsigned>(std::chrono::system_clock::now().time_since_epoch().count()));

    if (length <= 0)
    {
        return static_cast<ptrdiff_t>(-1);
    }

    const ptrdiff_t lastIndex{ length - 1 };
    std::uniform_int_distribution<ptrdiff_t> pick{ static_cast<ptrdiff_t>(0), lastIndex };
    return pick(engine);
}
|
||||
|
||||
// Writes the heading that introduces the output of a single test to stdout.
// Arguments:
// - funcName: name of the test function, printed in double quotes.
void PrintHeader(const char* const funcName)
{
    std::cout << "\n~~~\ntest \"";
    std::cout << funcName;
    std::cout << "\"" << std::endl; // ends the line and flushes the stream
}
|
|
@ -0,0 +1,4 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<packages>
|
||||
<package id="Microsoft.Windows.CppWinRT" version="2.0.191217.1" targetFramework="native" />
|
||||
</packages>
|
|
@ -0,0 +1,2 @@
|
|||
Retrieved from https://en.wikipedia.org/wiki/Microsoft on 2020-01-17. Uses https://creativecommons.org/licenses/by-sa/3.0/ license.
|
||||
собственный C планшетный Подразделения года — Surface. компании также производят 2012 консолей а для персональных компьютеров (клавиатуры, и т. д. мыши). производит компьютер семейство игровых Microsoft продаётся Xbox, более Продукция чем аксессуары странах в 80 программы переведены мира, более чем на 45 также языков.
|
|
@ -0,0 +1,2 @@
|
|||
Retrieved from https://en.wikipedia.org/wiki/Microsoft on 2020-01-17. Uses https://creativecommons.org/licenses/by-sa/3.0/ license.
|
||||
微软曾超越苹果公司,以及随后通过收购诺基亚设备形成微软移动和服务部门。微软公司於2014年接任首席执行官以来,2008年收购Danger公司,成为全球最有价值的上市公司。 Alphabet、自萨蒂亚纳德拉于该公司已缩减硬件规模,史蒂夫鲍尔默于微软达到了一兆美元的市值,2000随着微软在于2012年6月首次进入个人电脑生产市场,年取代盖茨担任首席执行官,Facebook的第五家股價市值超过这一举措帮助该公司股价达到1999年12成为仅次于苹果公司、2018年,Surface系列平板电脑的推出,后来设想了“设备和服务”战略。随着微软转而专注于云计算,在2019年4月,月以来的最高值。谷歌旗下1兆美元的美国上市公司。亚马逊、1975年由比
|
|
@ -1,100 +0,0 @@
|
|||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#include "precomp.h"
|
||||
#include "inc/Utf8OutPipeReader.hpp"
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
|
||||
// Method Description:
// - Constructs a reader for the given pipe handle.
// Arguments:
// - outPipe: the handle that Read() later passes to ReadFile. It is only stored
//   here; nothing in this constructor opens, duplicates or closes it.
// Notes:
// - _buffer and _utf8Partials are initialized with `{ 0 }`.
//   NOTE(review): presumably both are std::array members declared in the header,
//   in which case this zero-fills them - confirm.
// - NOTE(review): _dwPartialsLen is not initialized here although Read() reads it
//   before writing it; presumably it has a default member initializer in the
//   header - confirm.
UTF8OutPipeReader::UTF8OutPipeReader(HANDLE outPipe) noexcept :
|
||||
_outPipe{ outPipe },
|
||||
_buffer{ 0 },
|
||||
_utf8Partials{ 0 }
|
||||
{
|
||||
}
|
||||
|
||||
// Method Description:
|
||||
// Populates a string_view with *complete* UTF-8 codepoints read from the pipe.
|
||||
// If it receives an incomplete codepoint, it will cache it until it can be completed.
|
||||
// Note: This method trusts that the other end will, in fact, send complete codepoints.
|
||||
// Arguments:
|
||||
// - strView: on return, populated with successfully-read codepoints.
|
||||
// Return Value:
|
||||
// An HRESULT indicating whether the read was successful. For the purposes of this
|
||||
// method, a closed pipe is considered a successful (but false!) read. All other errors
|
||||
// are translated into an appropriate status code.
|
||||
// S_OK for a successful read
|
||||
// S_FALSE for a read on a closed pipe
|
||||
// E_* (anything) for a failed read
|
||||
// Implementation notes:
// - The returned strView points into _buffer. The next call overwrites _buffer,
//   so the view must be consumed before Read is called again.
// - Bytes of an incomplete trailing code point are moved to _utf8Partials and
//   are prepended to the data of the next call.
[[nodiscard]] HRESULT UTF8OutPipeReader::Read(_Out_ std::string_view& strView)
|
||||
{
|
||||
DWORD dwRead{};
|
||||
bool fSuccess{};
|
||||
|
||||
// in case of early escaping: hand back an empty view on every early return path
|
||||
_buffer.at(0) = 0;
|
||||
strView = std::string_view{ _buffer.data(), 0 };
|
||||
|
||||
// copy UTF-8 code units that were remaining from the previously read chunk (if any)
|
||||
// to the front of the buffer, so that the new data gets appended right behind them
|
||||
if (_dwPartialsLen != 0)
|
||||
{
|
||||
std::move(_utf8Partials.cbegin(), _utf8Partials.cbegin() + _dwPartialsLen, _buffer.begin());
|
||||
}
|
||||
|
||||
// try to read data; the read starts behind the carried-over partials and may
|
||||
// only fill the remaining part of the buffer
|
||||
fSuccess = !!ReadFile(_outPipe, &_buffer.at(_dwPartialsLen), gsl::narrow<DWORD>(_buffer.size()) - _dwPartialsLen, &dwRead, nullptr);
|
||||
|
||||
// from here on dwRead is the number of valid bytes in the buffer (partials + new data)
|
||||
dwRead += _dwPartialsLen;
|
||||
// the partials have been consumed; this is set again below if a new partial is found
|
||||
_dwPartialsLen = 0;
|
||||
|
||||
if (!fSuccess) // reading failed (checked before dwRead is inspected; note that dwRead already includes carried-over partials here)
|
||||
{
|
||||
const auto lastError = GetLastError();
|
||||
if (lastError == ERROR_BROKEN_PIPE)
|
||||
{
|
||||
// This is a successful, but detectable, exit.
|
||||
// There is a chance that we put some partials into the buffer. Since
|
||||
// the pipe has closed, they're just invalid now. They're not worth
|
||||
// reporting.
|
||||
return S_FALSE;
|
||||
}
|
||||
|
||||
// any other failure is reported as the corresponding HRESULT
|
||||
return HRESULT_FROM_WIN32(lastError);
|
||||
}
|
||||
|
||||
if (dwRead == 0) // quit if no data has been read and no cached data was left over
|
||||
{
|
||||
return S_OK;
|
||||
}
|
||||
|
||||
// endPtr is one past the last valid byte, backIter starts at the last valid byte
|
||||
const auto endPtr = _buffer.cbegin() + dwRead;
|
||||
auto backIter = endPtr - 1;
|
||||
// If the last byte in the buffer was a byte belonging to a UTF-8 multi-byte character
|
||||
if ((*backIter & _Utf8BitMasks::MaskAsciiByte) > _Utf8BitMasks::IsAsciiByte)
|
||||
{
|
||||
// Check only up to 3 last bytes, if no Lead Byte was found then the byte before must be the Lead Byte and no partials are in the buffer
|
||||
// NOTE(review): the bound is `dwSequenceLen < min(dwRead, 4)`, so the byte at the
|
||||
// very front of the buffer is never inspected when dwRead is less than 4 (for
|
||||
// dwRead == 1 the loop body does not run at all). A partial code point that
|
||||
// starts there is therefore passed on uncached, and a non-empty read never
|
||||
// yields an empty view. Confirm whether callers rely on that.
|
||||
for (DWORD dwSequenceLen{ 1UL }; dwSequenceLen < std::min(dwRead, 4UL); ++dwSequenceLen, --backIter)
|
||||
{
|
||||
// If Lead Byte found
|
||||
if ((*backIter & _Utf8BitMasks::MaskContinuationByte) > _Utf8BitMasks::IsContinuationByte)
|
||||
{
|
||||
// If the Lead Byte indicates that the last bytes in the buffer is a partial UTF-8 code point then cache them:
|
||||
// Use the bitmask at index `dwSequenceLen`. Compare the result with the operand having the same index. If they
|
||||
// are not equal then the sequence has to be cached because it is a partial code point. Otherwise the
|
||||
// sequence is a complete UTF-8 code point and the whole buffer is ready for the conversion to hstring.
|
||||
if ((*backIter & _cmpMasks.at(dwSequenceLen)) != _cmpOperands.at(dwSequenceLen))
|
||||
{
|
||||
// stash the trailing partial and exclude it from the returned view
|
||||
std::move(backIter, endPtr, _utf8Partials.begin());
|
||||
dwRead -= dwSequenceLen;
|
||||
_dwPartialsLen = dwSequenceLen;
|
||||
}
|
||||
|
||||
// the first Lead Byte found from the back decides; stop scanning
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// give back a view of the part of the buffer that contains complete code points only
|
||||
strView = std::string_view{ &_buffer.at(0), dwRead };
|
||||
return S_OK;
|
||||
}
|
|
@ -6,7 +6,7 @@
|
|||
<RootNamespace>types</RootNamespace>
|
||||
<ProjectName>Types</ProjectName>
|
||||
<TargetName>ConTypes</TargetName>
|
||||
<ConfigurationType>StaticLibrary</ConfigurationType>
|
||||
<ConfigurationType>StaticLibrary</ConfigurationType>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(SolutionDir)src\common.build.pre.props" />
|
||||
<ItemGroup>
|
||||
|
@ -24,7 +24,6 @@
|
|||
<ClCompile Include="..\ThemeUtils.cpp" />
|
||||
<ClCompile Include="..\UiaTextRangeBase.cpp" />
|
||||
<ClCompile Include="..\Utf16Parser.cpp" />
|
||||
<ClCompile Include="..\UTF8OutPipeReader.cpp" />
|
||||
<ClCompile Include="..\Viewport.cpp" />
|
||||
<ClCompile Include="..\WindowBufferSizeEvent.cpp" />
|
||||
<ClCompile Include="..\precomp.cpp">
|
||||
|
@ -42,7 +41,6 @@
|
|||
<ClInclude Include="..\inc\GlyphWidth.hpp" />
|
||||
<ClInclude Include="..\inc\IInputEvent.hpp" />
|
||||
<ClInclude Include="..\inc\ThemeUtils.h" />
|
||||
<ClInclude Include="..\inc\UTF8OutPipeReader.hpp" />
|
||||
<ClInclude Include="..\inc\utils.hpp" />
|
||||
<ClInclude Include="..\inc\Viewport.hpp" />
|
||||
<ClInclude Include="..\inc\Utf16Parser.hpp" />
|
||||
|
|
|
@ -57,9 +57,6 @@
|
|||
<ClCompile Include="..\utils.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\UTF8OutPipeReader.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\ScreenInfoUiaProviderBase.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
|
@ -72,6 +69,9 @@
|
|||
<ClCompile Include="..\ThemeUtils.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\Environment.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="..\inc\IInputEvent.hpp">
|
||||
|
@ -95,9 +95,6 @@
|
|||
<ClInclude Include="..\inc\GlyphWidth.hpp">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\inc\UTF8OutPipeReader.hpp">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\IConsoleWindow.hpp">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
|
@ -122,9 +119,6 @@
|
|||
<ClInclude Include="..\inc\IInputEvent.hpp">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\inc\UTF8OutPipeReader.hpp">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\inc\Viewport.hpp">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
|
@ -137,9 +131,6 @@
|
|||
<ClInclude Include="..\ScreenInfoUiaProviderBase.h">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\utils.hpp">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\WindowUiaProviderBase.hpp">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
|
@ -161,6 +152,9 @@
|
|||
<ClInclude Include="..\inc\ThemeUtils.h">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\inc\Environment.hpp">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<Natvis Include="$(SolutionDir)tools\ConsoleTypes.natvis" />
|
||||
|
|
|
@ -6,11 +6,10 @@
|
|||
<RootNamespace>TypesUnitTests</RootNamespace>
|
||||
<ProjectName>Types.Unit.Tests</ProjectName>
|
||||
<TargetName>Types.Unit.Tests</TargetName>
|
||||
<ConfigurationType>DynamicLibrary</ConfigurationType>
|
||||
<ConfigurationType>DynamicLibrary</ConfigurationType>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(SolutionDir)src\common.build.pre.props" />
|
||||
<ItemGroup>
|
||||
<ClCompile Include="UTF8OutPipeReaderTests.cpp" />
|
||||
<ClCompile Include="UtilsTests.cpp" />
|
||||
<ClCompile Include="UuidTests.cpp" />
|
||||
<ClCompile Include="..\precomp.cpp">
|
||||
|
|
|
@ -1,155 +0,0 @@
|
|||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#include "precomp.h"
|
||||
#include "WexTestClass.h"
|
||||
#include "..\..\inc\consoletaeftemplates.hpp"
|
||||
|
||||
#include "..\inc\UTF8OutPipeReader.hpp"
|
||||
|
||||
#include <winrt/Windows.Foundation.h>
|
||||
#include <winrt/Windows.Foundation.Collections.h>
|
||||
|
||||
using namespace WEX::Common;
|
||||
using namespace WEX::Logging;
|
||||
using namespace WEX::TestExecution;
|
||||
|
||||
class UTF8OutPipeReaderTests
|
||||
{
|
||||
TEST_CLASS(UTF8OutPipeReaderTests);
|
||||
|
||||
TEST_METHOD(TestUtf8MergePartials)
|
||||
{
|
||||
// The test uses the character 'GOTHIC LETTER HWAIR' (U+10348) as an example
|
||||
// Its UTF-8 representation consists of four bytes:
|
||||
// 1 2 3 4
|
||||
// 0xF0 0x90 0x8D 0x88
|
||||
//
|
||||
// For the test a std::string is filled with 4104 '.' characters to make sure it exceeds the
|
||||
// buffer size of 4096 bytes in UTF8OutPipeReader.
|
||||
//
|
||||
// This figure shows how the string is getting changed for the 7 sub-tests. The digits 1 to 4
|
||||
// represent the four bytes of the 'Hwair' letter. The vertical bar represents the buffer boundary.
|
||||
// Test 1: [more points] . . S 1 2 3 4 T|U V W X Y Z . .
|
||||
// Test 2: [more points] . . S T 1 2 3 4|U V W X Y Z . .
|
||||
// Test 3: [more points] . . S T U 1 2 3|4 V W X Y Z . .
|
||||
// Test 4: [more points] . . S T U V 1 2|3 4 W X Y Z . .
|
||||
// Test 5: [more points] . . S T U V W 1|2 3 4 X Y Z . .
|
||||
// Test 6: [more points] . . S T U V W X|1 2 3 4 Y Z . .
|
||||
// Test 7: [more points] . . S T U V W X|Y 1 2 3 4 Z . .
|
||||
//
|
||||
// Tests 1, 6, and 7 prove proper ASCII handling.
|
||||
// Test 2 leaves all four bytes of 'Hwair' in the first chunk.
|
||||
// Test 3, 4, and 5 move the partials from the end of the first chunk to the begin of the
|
||||
// second chunk.
|
||||
//
|
||||
// At the beginning of a test the whole string is converted into a winrt::hstring for reference.
|
||||
// During the test a second hstring is concatenated out of the chunks that we get from
|
||||
// UTF8OutPipeReader::Read. Each chunk is separately converted to hstring in order to make
|
||||
// sure it would be corrupted if we get UTF-8 partials.
|
||||
// The test is positive if both hstrings are equal.
|
||||
|
||||
const size_t bufferSize{ 4096 }; // NOTE: This has to match the buffer size in UTF8OutPipeReader!
|
||||
std::string utf8TestString(bufferSize + 8, '.'); // create a test string with the required size
|
||||
|
||||
// Test 1:
|
||||
// ||
|
||||
utf8TestString.replace(bufferSize - 6, 12, "S\xF0\x90\x8D\x88TUVWXYZ");
|
||||
VERIFY_SUCCEEDED(RunTest(utf8TestString));
|
||||
|
||||
// Test 2:
|
||||
// | |
|
||||
utf8TestString.replace(bufferSize - 6, 12, "ST\xF0\x90\x8D\x88UVWXYZ");
|
||||
VERIFY_SUCCEEDED(RunTest(utf8TestString));
|
||||
|
||||
// Test 3:
|
||||
// | |
|
||||
utf8TestString.replace(bufferSize - 6, 12, "STU\xF0\x90\x8D\x88VWXYZ");
|
||||
VERIFY_SUCCEEDED(RunTest(utf8TestString));
|
||||
|
||||
// Test 4:
|
||||
// | |
|
||||
utf8TestString.replace(bufferSize - 6, 12, "STUV\xF0\x90\x8D\x88WXYZ");
|
||||
VERIFY_SUCCEEDED(RunTest(utf8TestString));
|
||||
|
||||
// Test 5:
|
||||
// | |
|
||||
utf8TestString.replace(bufferSize - 6, 12, "STUVW\xF0\x90\x8D\x88XYZ");
|
||||
VERIFY_SUCCEEDED(RunTest(utf8TestString));
|
||||
|
||||
// Test 6:
|
||||
// | |
|
||||
utf8TestString.replace(bufferSize - 6, 12, "STUVWX\xF0\x90\x8D\x88YZ");
|
||||
VERIFY_SUCCEEDED(RunTest(utf8TestString));
|
||||
|
||||
// Test 7:
|
||||
// ||
|
||||
utf8TestString.replace(bufferSize - 6, 12, "STUVWXY\xF0\x90\x8D\x88Z");
|
||||
VERIFY_SUCCEEDED(RunTest(utf8TestString));
|
||||
}
|
||||
|
||||
struct ThreadData
|
||||
{
|
||||
wil::unique_hfile& inPipe;
|
||||
std::string& utf8TestString;
|
||||
};
|
||||
|
||||
// Thread function which writes the UTF-8 data to the pipe.
|
||||
static DWORD WINAPI WritePipeThread(LPVOID threadArg)
|
||||
{
|
||||
ThreadData* pThreadData{ reinterpret_cast<ThreadData*>(threadArg) };
|
||||
DWORD length{};
|
||||
|
||||
WriteFile(pThreadData->inPipe.get(), pThreadData->utf8TestString.c_str(), static_cast<DWORD>(pThreadData->utf8TestString.size()), &length, nullptr);
|
||||
pThreadData->inPipe.reset();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Performs the sub-tests.
|
||||
HRESULT RunTest(std::string& utf8TestString)
|
||||
{
|
||||
std::string_view strView{}; // contains the chunk that we get from UTF8OutPipeReader::Read
|
||||
const winrt::hstring utf16Expected{ winrt::to_hstring(utf8TestString) }; // contains the whole string converted to UTF-16
|
||||
winrt::hstring utf16Actual{}; // will be concatenated from the converted chunks
|
||||
|
||||
wil::unique_hfile outPipe{};
|
||||
wil::unique_hfile inPipe{};
|
||||
|
||||
SECURITY_ATTRIBUTES sa{ sizeof(SECURITY_ATTRIBUTES) };
|
||||
CreatePipe(&outPipe, &inPipe, &sa, 0); // create the pipe handles
|
||||
|
||||
UTF8OutPipeReader reader{ outPipe.get() };
|
||||
|
||||
ThreadData data{ inPipe, utf8TestString };
|
||||
|
||||
wil::unique_handle threadHandle{ CreateThread(nullptr, 0, WritePipeThread, &data, 0, nullptr) }; // create a thread that writes to the pipe
|
||||
RETURN_HR_IF_NULL(E_FAIL, threadHandle.get());
|
||||
|
||||
// process the chunks that we get from UTF8OutPipeReader::Read
|
||||
while (true)
|
||||
{
|
||||
// get a chunk of UTF-8 data
|
||||
THROW_IF_FAILED(reader.Read(strView));
|
||||
|
||||
if (strView.empty())
|
||||
{
|
||||
// this is okay, no data left in the pipe
|
||||
break;
|
||||
}
|
||||
|
||||
// convert the chunk to hstring and append it to the resulting hstring
|
||||
utf16Actual = utf16Actual + winrt::to_hstring(strView);
|
||||
}
|
||||
|
||||
WaitForSingleObject(threadHandle.get(), 2000);
|
||||
|
||||
// the test passed if both hstrings are equal
|
||||
if (utf16Actual == utf16Expected)
|
||||
{
|
||||
return S_OK;
|
||||
}
|
||||
|
||||
return E_FAIL;
|
||||
}
|
||||
};
|
Loading…
Reference in New Issue