make sure caching of partials still works if the string consists of a single lead byte only (GH#4673) (#4685)

## Summary of the Pull Request
Fixes a flaw that occurred when `til::u8u16` received a string consisting of a single lead byte only.

## PR Checklist
* [x] Closes #4673 
* [x] Tests added/passed

## Detailed Description of the Pull Request / Additional comments
The loop that caches partials never ran when the string consisted of a
single lead byte, so the lead byte was converted to U+FFFD instead of
being cached. The loop starts with `sequenceLen` initialized to 1, and
for a string of length 1 `stopLen` is also 1. The initial condition
`1<1` therefore evaluates to `false`, and the body of the loop was never
executed.

## Validation Steps Performed
1) updated the code of the state class and tested manually that `printf
   "\xE2"; printf "\x98\xBA\n"` prints a U+263A character
2) updated the unit tests to make sure that up to 3 partials are still
   cached
3) updated the unit tests to make sure caching also works if the string
   consists of a lead byte only
4) tested manually that #4086 is still resolved
This commit is contained in:
Steffen 2020-02-21 21:45:53 +01:00 committed by GitHub
parent 671110c88a
commit b8e33560f9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 33 additions and 9 deletions

View file

@ -84,8 +84,8 @@ namespace til // Terminal Implementation Library. Also: "Today I Learned"
if ((*backIter & _Utf8BitMasks::MaskAsciiByte) > _Utf8BitMasks::IsAsciiByte)
{
// Check only up to 3 last bytes, if no Lead Byte was found then the byte before must be the Lead Byte and no partials are in the string
const size_t stopLen{ std::min(in.length(), gsl::narrow_cast<size_t>(4u)) };
for (size_t sequenceLen{ 1u }; sequenceLen < stopLen; ++sequenceLen, --backIter)
const size_t stopLen{ std::min(in.length(), gsl::narrow_cast<size_t>(3u)) };
for (size_t sequenceLen{ 1u }; sequenceLen <= stopLen; ++sequenceLen, --backIter)
{
// If Lead Byte found
if ((*backIter & _Utf8BitMasks::MaskContinuationByte) > _Utf8BitMasks::IsContinuationByte)

View file

@ -83,31 +83,55 @@ void Utf8Utf16ConvertTests::TestU8ToU16Partials()
'\xA4',
'\xBD',
'\x9C',
'\xF0' // CJK UNIFIED IDEOGRAPH-24F5C (lead byte only)
'\xF0', // CJK UNIFIED IDEOGRAPH-24F5C (lead byte + 2 complementary bytes)
'\xA4',
'\xBD'
};
const std::string u8String2{
'\xA4', // CJK UNIFIED IDEOGRAPH-24F5C (complementary bytes)
'\xBD',
'\x9C'
'\x9C' // CJK UNIFIED IDEOGRAPH-24F5C (last complementary byte)
};
const std::wstring u16StringComp{
const std::wstring u16StringComp1{
gsl::narrow_cast<wchar_t>(0xD853), // CJK UNIFIED IDEOGRAPH-24F5C (surrogate pair)
gsl::narrow_cast<wchar_t>(0xDF5C)
};
// GH#4673
const std::string u8String3{
'\xE2' // WHITE SMILING FACE (lead byte)
};
const std::string u8String4{
'\x98', // WHITE SMILING FACE (complementary bytes)
'\xBA'
};
const std::wstring u16StringComp2{
gsl::narrow_cast<wchar_t>(0x263A) // WHITE SMILING FACE
};
til::u8state state{};
std::wstring u16Out1{};
const HRESULT hRes1{ til::u8u16(u8String1, u16Out1, state) };
VERIFY_ARE_EQUAL(S_OK, hRes1);
VERIFY_ARE_EQUAL(u16StringComp, u16Out1);
VERIFY_ARE_EQUAL(u16StringComp1, u16Out1);
std::wstring u16Out2{};
const HRESULT hRes2{ til::u8u16(u8String2, u16Out2, state) };
VERIFY_ARE_EQUAL(S_OK, hRes2);
VERIFY_ARE_EQUAL(u16StringComp, u16Out2);
VERIFY_ARE_EQUAL(u16StringComp1, u16Out2);
std::wstring u16Out3{};
const HRESULT hRes3{ til::u8u16(u8String3, u16Out3, state) };
VERIFY_ARE_EQUAL(S_OK, hRes3);
VERIFY_ARE_EQUAL(std::wstring{}, u16Out3);
std::wstring u16Out4{};
const HRESULT hRes4{ til::u8u16(u8String4, u16Out4, state) };
VERIFY_ARE_EQUAL(S_OK, hRes4);
VERIFY_ARE_EQUAL(u16StringComp2, u16Out4);
}
void Utf8Utf16ConvertTests::TestU16ToU8Partials()