// NOTE(review): the next physical line holds, with its line breaks lost, the
// license header, the #include list, U_NAMESPACE_BEGIN and the first part of
// the FilteredNormalizer2 implementation. Because that line opens with "//",
// a C++ lexer reads all of it as one line comment.
//
// Contents of the line, in order:
//  - ~FilteredNormalizer2(): destructor with an empty body.
//  - normalize(src, dest, errorCode): calls uprv_checkCanGetBuffer(src); if
//    errorCode is a failure it sets dest to bogus and returns it; if dest and
//    src are the same object it sets U_ILLEGAL_ARGUMENT_ERROR and returns
//    dest; otherwise it empties dest with remove() and delegates to the
//    overload that takes a span condition, starting with USET_SPAN_SIMPLE.
//  - normalize(src, dest, spanCondition, errorCode): documented in the text
//    as internal, with no argument checking, appending to dest. Only the
//    declaration of tempDest and the start of its for-loop are present: the
//    text jumps from "prevSpanLimit" straight to "reset(); }", so the loop
//    body, the return, and the header of the function that follows are
//    missing (presumably lost in extraction -- TODO restore from the
//    upstream file rather than by hand).
//  - Tail of a function whose header is missing: it ORs U_EDITS_NO_RESET into
//    options so that edits are not reset for each span.
// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 2009-2012, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: filterednormalizer2.cpp * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2009dec10 * created by: Markus W. Scherer */ #include "unicode/utypes.h" #if !UCONFIG_NO_NORMALIZATION #include "unicode/edits.h" #include "unicode/normalizer2.h" #include "unicode/stringoptions.h" #include "unicode/uniset.h" #include "unicode/unistr.h" #include "unicode/unorm.h" #include "cpputils.h" U_NAMESPACE_BEGIN FilteredNormalizer2::~FilteredNormalizer2() {} UnicodeString & FilteredNormalizer2::normalize(const UnicodeString &src, UnicodeString &dest, UErrorCode &errorCode) const { uprv_checkCanGetBuffer(src, errorCode); if(U_FAILURE(errorCode)) { dest.setToBogus(); return dest; } if(&dest==&src) { errorCode=U_ILLEGAL_ARGUMENT_ERROR; return dest; } dest.remove(); return normalize(src, dest, USET_SPAN_SIMPLE, errorCode); } // Internal: No argument checking, and appends to dest. // Pass as input spanCondition the one that is likely to yield a non-zero // span length at the start of src. // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2, // USET_SPAN_SIMPLE should be passed in for the start of src // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after // an in-filter prefix. UnicodeString & FilteredNormalizer2::normalize(const UnicodeString &src, UnicodeString &dest, USetSpanCondition spanCondition, UErrorCode &errorCode) const { UnicodeString tempDest; // Don't throw away destination buffer between iterations. for(int32_t prevSpanLimit=0; prevSpanLimitreset(); } options |= U_EDITS_NO_RESET; // Do not reset for each span. 
// NOTE(review): the next physical line holds several definitions with their
// line breaks lost. It starts inside one function body and ends inside
// another.
//
// Contents of the line, in order:
//  - Closing statement of a function that begins on the preceding line: it
//    forwards src.data() and src.length() to the pointer/length overload of
//    normalizeUTF8, with USET_SPAN_SIMPLE as the initial span condition.
//  - normalizeUTF8(options, src, length, sink, edits, spanCondition,
//    errorCode): while length > 0 it measures a span with set.spanUTF8() and
//    then flips spanCondition between USET_SPAN_NOT_CONTAINED and
//    USET_SPAN_SIMPLE. A non-empty NOT_CONTAINED span is reported through
//    edits->addUnchanged() when edits is non-null, and appended to sink
//    unless options has U_OMIT_UNCHANGED_TEXT set. A non-empty span measured
//    under any other condition is passed to norm2.normalizeUTF8(); the loop
//    breaks if that leaves errorCode in a failure state. src and length are
//    advanced by the span length after every iteration.
//  - normalizeSecondAndAppend(first, second, errorCode): forwards to the
//    four-argument overload with doNormalize = TRUE.
//  - append(first, second, errorCode): forwards to the same overload with
//    doNormalize = FALSE.
//  - normalizeSecondAndAppend(first, second, doNormalize, errorCode), first
//    part only: checks both strings with uprv_checkCanGetBuffer() and returns
//    first on failure; sets U_ILLEGAL_ARGUMENT_ERROR and returns first when
//    both arguments are the same object; when first is empty it returns
//    either normalize(second, first, errorCode) or the assignment
//    first=second, depending on doNormalize; otherwise it computes
//    prefixLimit with set.span(second, 0, USET_SPAN_SIMPLE). The rest of the
//    body is on the following line.
normalizeUTF8(options, src.data(), src.length(), sink, edits, USET_SPAN_SIMPLE, errorCode); } void FilteredNormalizer2::normalizeUTF8(uint32_t options, const char *src, int32_t length, ByteSink &sink, Edits *edits, USetSpanCondition spanCondition, UErrorCode &errorCode) const { while (length > 0) { int32_t spanLength = set.spanUTF8(src, length, spanCondition); if (spanCondition == USET_SPAN_NOT_CONTAINED) { if (spanLength != 0) { if (edits != nullptr) { edits->addUnchanged(spanLength); } if ((options & U_OMIT_UNCHANGED_TEXT) == 0) { sink.Append(src, spanLength); } } spanCondition = USET_SPAN_SIMPLE; } else { if (spanLength != 0) { // Not norm2.normalizeSecondAndAppend() because we do not want // to modify the non-filter part of dest. norm2.normalizeUTF8(options, StringPiece(src, spanLength), sink, edits, errorCode); if (U_FAILURE(errorCode)) { break; } } spanCondition = USET_SPAN_NOT_CONTAINED; } src += spanLength; length -= spanLength; } } UnicodeString & FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const { return normalizeSecondAndAppend(first, second, TRUE, errorCode); } UnicodeString & FilteredNormalizer2::append(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const { return normalizeSecondAndAppend(first, second, FALSE, errorCode); } UnicodeString & FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first, const UnicodeString &second, UBool doNormalize, UErrorCode &errorCode) const { uprv_checkCanGetBuffer(first, errorCode); uprv_checkCanGetBuffer(second, errorCode); if(U_FAILURE(errorCode)) { return first; } if(&first==&second) { errorCode=U_ILLEGAL_ARGUMENT_ERROR; return first; } if(first.isEmpty()) { if(doNormalize) { return normalize(second, first, errorCode); } else { return first=second; } } // merge the in-filter suffix of the first string with the in-filter prefix of the second int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE); 
// NOTE(review): the next physical line holds several definitions with their
// line breaks lost. It starts inside a function body, has text missing in
// the middle, and stops partway through its last function.
//
// Contents of the line, in order:
//  - Continuation of a function body that uses first, second, doNormalize and
//    prefixLimit. When prefixLimit is non-zero, prefix is taken as
//    second.tempSubString(0, prefixLimit) and suffixStart comes from
//    set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE). If suffixStart is 0,
//    norm2.normalizeSecondAndAppend() or norm2.append() (chosen by
//    doNormalize) is applied to first directly. Otherwise the text of first
//    from suffixStart onward is copied into middle, combined with prefix the
//    same way, and written back with first.replace().
//  - Damaged text: "if(prefixLimit 0) {" is not a complete condition. The
//    remainder of the function above and the header and opening of the next
//    function are missing (presumably lost in extraction -- TODO restore from
//    the upstream file rather than by hand).
//  - Loop fragment whose function header is missing: measures spans with
//    set.spanUTF8() and flips spanCondition after each one. Spans measured
//    under USET_SPAN_NOT_CONTAINED are skipped. For any other span it returns
//    FALSE when norm2.isNormalizedUTF8() reports false or errorCode is a
//    failure. It returns TRUE once the loop finishes.
//  - quickCheck(s, errorCode), first part only: checks s with
//    uprv_checkCanGetBuffer() and returns UNORM_MAYBE on failure; initializes
//    result to UNORM_YES and spanCondition to USET_SPAN_SIMPLE. The for-loop
//    header is cut off after "prevSpanLimit" and nothing further is visible.
if(prefixLimit!=0) { UnicodeString prefix(second.tempSubString(0, prefixLimit)); int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE); if(suffixStart==0) { if(doNormalize) { norm2.normalizeSecondAndAppend(first, prefix, errorCode); } else { norm2.append(first, prefix, errorCode); } } else { UnicodeString middle(first, suffixStart, INT32_MAX); if(doNormalize) { norm2.normalizeSecondAndAppend(middle, prefix, errorCode); } else { norm2.append(middle, prefix, errorCode); } first.replace(suffixStart, INT32_MAX, middle); } } if(prefixLimit 0) { int32_t spanLength = set.spanUTF8(s, length, spanCondition); if (spanCondition == USET_SPAN_NOT_CONTAINED) { spanCondition = USET_SPAN_SIMPLE; } else { if (!norm2.isNormalizedUTF8(StringPiece(s, spanLength), errorCode) || U_FAILURE(errorCode)) { return FALSE; } spanCondition = USET_SPAN_NOT_CONTAINED; } s += spanLength; length -= spanLength; } return TRUE; } UNormalizationCheckResult FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const { uprv_checkCanGetBuffer(s, errorCode); if(U_FAILURE(errorCode)) { return UNORM_MAYBE; } UNormalizationCheckResult result=UNORM_YES; USetSpanCondition spanCondition=USET_SPAN_SIMPLE; for(int32_t prevSpanLimit=0; prevSpanLimit