// copy UTF-8 code units that were remaining from the previous call (if any)
if(_partialsLen!=0u)
{
_buffer.push_back(_utfPartials.front());
_partialsLen=0u;
}
if(in.empty())
{
out=_buffer;
if(_buffer.empty())
{
returnS_OK;
}
returnS_FALSE;// the high surrogate is populated
}
// cache the last value in the string if it is in the range of high surrogates
if(in.back()>=0xD800u&&in.back()<=0xDBFFu)
{
_utfPartials.front()=in.back();
--remainingLength;
_partialsLen=1u;
}
else
{
_partialsLen=0u;
}
// populate the part of the string that contains complete code points only
_buffer.append(in,0u,remainingLength);
out=_buffer;
returnS_OK;
}
catch(std::length_error&)
{
returnE_ABORT;
}
catch(std::bad_alloc&)
{
returnE_OUTOFMEMORY;
}
catch(...)
{
returnE_UNEXPECTED;
}
}
// Method Description:
// - Forgets any code units cached from a previous call by setting the
//   cached-unit count back to zero. The cache storage itself is not touched.
// Arguments:
// - none
// Return Value:
// - void
void reset() noexcept
{
    _partialsLen = size_t{ 0 };
}
private:
enum_Utf8BitMasks:BYTE
{
IsAsciiByte=0b0'0000000,// Any byte representing an ASCII character has the MSB set to 0
MaskAsciiByte=0b1'0000000,// Bit mask to be used in a bitwise AND operation to find out whether or not a byte match the IsAsciiByte pattern
IsContinuationByte=0b10'000000,// Continuation bytes of any UTF-8 non-ASCII character have the MSB set to 1 and the adjacent bit set to 0
MaskContinuationByte=0b11'000000,// Bit mask to be used in a bitwise AND operation to find out whether or not a byte match the IsContinuationByte pattern
IsLeadByteTwoByteSequence=0b110'00000,// A lead byte that indicates a UTF-8 non-ASCII character consisting of two bytes has the two highest bits set to 1 and the adjacent bit set to 0
MaskLeadByteTwoByteSequence=0b111'00000,// Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte match the IsLeadByteTwoByteSequence pattern
IsLeadByteThreeByteSequence=0b1110'0000,// A lead byte that indicates a UTF-8 non-ASCII character consisting of three bytes has the three highest bits set to 1 and the adjacent bit set to 0
MaskLeadByteThreeByteSequence=0b1111'0000,// Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte match the IsLeadByteThreeByteSequence pattern
IsLeadByteFourByteSequence=0b11110'000,// A lead byte that indicates a UTF-8 non-ASCII character consisting of four bytes has the four highest bits set to 1 and the adjacent bit set to 0
MaskLeadByteFourByteSequence=0b11111'000// Bit mask to be used in a bitwise AND operation to find out whether or not a lead byte match the IsLeadByteFourByteSequence pattern
};
// array of bitmasks
constexprstaticstd::array<BYTE,4>_cmpMasks{
0,// unused
_Utf8BitMasks::MaskContinuationByte,
_Utf8BitMasks::MaskLeadByteTwoByteSequence,
_Utf8BitMasks::MaskLeadByteThreeByteSequence,
};
// array of values for the comparisons
constexprstaticstd::array<BYTE,4>_cmpOperands{
0,// unused
_Utf8BitMasks::IsAsciiByte,// intentionally conflicts with MaskContinuationByte
// Cache for the code units of an incomplete code point that must be kept for the next call.
std::array<charT, 4> _utfPartials;
// Number of code units currently held in the cache; value-initialized to zero.
size_t _partialsLen{};
};
// make clear what incoming string type the state is for
typedefu8u16state<char>u8state;
typedefu8u16state<wchar_t>u16state;
// Routine Description:
// - Takes a UTF-8 string and performs the conversion to UTF-16. NOTE: The function relies on getting complete UTF-8 characters at the string boundaries.
// Arguments:
// - in - UTF-8 string to be converted
// - out - reference to the resulting UTF-16 string
// Return Value:
// - E_OUTOFMEMORY - the function failed to allocate memory for the resulting string
// - E_ABORT - the resulting string length would exceed the upper bound of an int, so the conversion was aborted before it completed
// - E_OUTOFMEMORY - the function failed to allocate memory for the resulting string
// - E_ABORT - the resulting string length would exceed the upper bound of an int, so the conversion was aborted before it completed
// - Takes a UTF-16 string and performs the conversion to UTF-8. NOTE: The function relies on getting complete UTF-16 characters at the string boundaries.
// Arguments:
// - in - UTF-16 string to be converted
// - out - reference to the resulting UTF-8 string
// Return Value:
// - E_OUTOFMEMORY - the function failed to allocate memory for the resulting string
// - E_ABORT - the resulting string length would exceed the upper bound of an int, so the conversion was aborted before it completed
// - E_OUTOFMEMORY - the function failed to allocate memory for the resulting string
// - E_ABORT - the resulting string length would exceed the upper bound of an int, so the conversion was aborted before it completed
// - Takes a UTF-8 string and performs the conversion to UTF-16. NOTE: The function relies on getting complete UTF-8 characters at the string boundaries.
// Arguments:
// - in - UTF-8 string to be converted
// Return Value:
// - the resulting UTF-16 string
// - NOTE: Throws HRESULT errors that the non-throwing sibling returns
// - Takes a UTF-16 string and performs the conversion to UTF-8. NOTE: The function relies on getting complete UTF-16 characters at the string boundaries.
// Arguments:
// - in - UTF-16 string to be converted
// Return Value:
// - the resulting UTF-8 string
// - NOTE: Throws HRESULT errors that the non-throwing sibling returns