terminal/src/types/Utf16Parser.cpp

92 lines
3.3 KiB
C++

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include "precomp.h"
#include "inc/Utf16Parser.hpp"
#include "unicode.hpp"
// Routine Description:
// - Scans forward through the given UTF-16 text and yields the first well-formed unit it encounters:
//   either a leading surrogate immediately followed by a trailing surrogate (two code units),
//   or a single code unit that is not a surrogate at all.
// - Surrogates that are not part of such an adjacent lead/trail pair are stepped over.
// - No validation is performed other than the lead/trail adjacency check.
// Arguments:
// - wstr - The UTF-16 text to scan.
// Return Value:
// - A view of one or two code units inside wstr, or, when wstr holds nothing usable (this includes
//   an empty wstr), a one-element view over UNICODE_REPLACEMENT.
std::wstring_view Utf16Parser::ParseNext(std::wstring_view wstr) noexcept
{
    const auto length = wstr.size();
    size_t index = 0;

    while (index < length)
    {
        const auto current = wstr.at(index);
        const auto following = index + 1;

        if (!IsLeadingSurrogate(current))
        {
            // Anything that is neither a lead nor a trail stands on its own as a single wchar_t.
            if (!IsTrailingSurrogate(current))
            {
                return wstr.substr(index, 1);
            }
            // An orphaned trail is skipped; keep looking further along.
        }
        else if (following < length && IsTrailingSurrogate(wstr.at(following)))
        {
            // A lead with a trail right behind it: hand back both code units together.
            return wstr.substr(index, 2);
        }
        // Otherwise this was a lead with no trail directly after it; it is skipped as well.

        index = following;
    }

    // Ran off the end without finding anything usable, so report the replacement character.
    return std::wstring_view{ &UNICODE_REPLACEMENT, 1 };
}
// Routine Description:
// - Splits a UTF-16 encoded string into its individual codepoints, one collection per codepoint.
// - Drops badly formed surrogate sequences: a leading surrogate is only kept when the very next
//   code unit is a trailing surrogate, and a trailing surrogate is only kept when the code unit
//   directly before it was a leading surrogate.
// - Does not validate UTF-16 input beyond proper leading/trailing surrogate sequences.
// Arguments:
// - wstr - the string to parse
// Return Value:
// - A vector of UTF-16 codepoints, in input order. Codepoints that require a surrogate pair are
//   returned as a two-element vector (lead, trail); codepoints that fit in a single wchar_t are
//   returned as a one-element vector.
std::vector<std::vector<wchar_t>> Utf16Parser::Parse(std::wstring_view wstr)
{
    std::vector<std::vector<wchar_t>> result;

    // The most recently seen leading surrogate that is still waiting for its trail.
    wchar_t pendingLead = 0;
    bool hasPendingLead = false;

    for (const auto wch : wstr)
    {
        if (IsLeadingSurrogate(wch))
        {
            // A new lead replaces any earlier lead that never received its trail.
            pendingLead = wch;
            hasPendingLead = true;
        }
        else if (IsTrailingSurrogate(wch))
        {
            // A trail is only meaningful directly after a lead; an orphaned trail is dropped.
            if (hasPendingLead)
            {
                result.push_back({ pendingLead, wch });
                hasPendingLead = false;
            }
        }
        else
        {
            // An ordinary code unit breaks any surrogate sequence in progress. Forget the
            // dangling lead so that it cannot be paired with a trail that shows up later on,
            // which would fabricate a codepoint out of two code units that were never adjacent.
            hasPendingLead = false;
            result.push_back({ wch });
        }
    }
    return result;
}