| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ******************************************************************************* |
| * |
| * Copyright (C) 2009-2012, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| ******************************************************************************* |
| * file name: filterednormalizer2.cpp |
| * encoding: UTF-8 |
| * tab size: 8 (not used) |
| * indentation:4 |
| * |
| * created on: 2009dec10 |
| * created by: Markus W. Scherer |
| */ |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_NORMALIZATION |
| |
| #include "unicode/edits.h" |
| #include "unicode/normalizer2.h" |
| #include "unicode/stringoptions.h" |
| #include "unicode/uniset.h" |
| #include "unicode/unistr.h" |
| #include "unicode/unorm.h" |
| #include "cpputils.h" |
| |
| U_NAMESPACE_BEGIN |
| |
| FilteredNormalizer2::~FilteredNormalizer2() {} |
| |
| UnicodeString & |
| FilteredNormalizer2::normalize(const UnicodeString &src, |
| UnicodeString &dest, |
| UErrorCode &errorCode) const { |
| uprv_checkCanGetBuffer(src, errorCode); |
| if(U_FAILURE(errorCode)) { |
| dest.setToBogus(); |
| return dest; |
| } |
| if(&dest==&src) { |
| errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| return dest; |
| } |
| dest.remove(); |
| return normalize(src, dest, USET_SPAN_SIMPLE, errorCode); |
| } |
| |
| // Internal: No argument checking, and appends to dest. |
| // Pass as input spanCondition the one that is likely to yield a non-zero |
| // span length at the start of src. |
| // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2, |
| // USET_SPAN_SIMPLE should be passed in for the start of src |
| // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after |
| // an in-filter prefix. |
| UnicodeString & |
| FilteredNormalizer2::normalize(const UnicodeString &src, |
| UnicodeString &dest, |
| USetSpanCondition spanCondition, |
| UErrorCode &errorCode) const { |
| UnicodeString tempDest; // Don't throw away destination buffer between iterations. |
| for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) { |
| int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition); |
| int32_t spanLength=spanLimit-prevSpanLimit; |
| if(spanCondition==USET_SPAN_NOT_CONTAINED) { |
| if(spanLength!=0) { |
| dest.append(src, prevSpanLimit, spanLength); |
| } |
| spanCondition=USET_SPAN_SIMPLE; |
| } else { |
| if(spanLength!=0) { |
| // Not norm2.normalizeSecondAndAppend() because we do not want |
| // to modify the non-filter part of dest. |
| dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit), |
| tempDest, errorCode)); |
| if(U_FAILURE(errorCode)) { |
| break; |
| } |
| } |
| spanCondition=USET_SPAN_NOT_CONTAINED; |
| } |
| prevSpanLimit=spanLimit; |
| } |
| return dest; |
| } |
| |
| void |
| FilteredNormalizer2::normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, |
| Edits *edits, UErrorCode &errorCode) const { |
| if (U_FAILURE(errorCode)) { |
| return; |
| } |
| if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) { |
| edits->reset(); |
| } |
| options |= U_EDITS_NO_RESET; // Do not reset for each span. |
| normalizeUTF8(options, src.data(), src.length(), sink, edits, USET_SPAN_SIMPLE, errorCode); |
| } |
| |
| void |
| FilteredNormalizer2::normalizeUTF8(uint32_t options, const char *src, int32_t length, |
| ByteSink &sink, Edits *edits, |
| USetSpanCondition spanCondition, |
| UErrorCode &errorCode) const { |
| while (length > 0) { |
| int32_t spanLength = set.spanUTF8(src, length, spanCondition); |
| if (spanCondition == USET_SPAN_NOT_CONTAINED) { |
| if (spanLength != 0) { |
| if (edits != nullptr) { |
| edits->addUnchanged(spanLength); |
| } |
| if ((options & U_OMIT_UNCHANGED_TEXT) == 0) { |
| sink.Append(src, spanLength); |
| } |
| } |
| spanCondition = USET_SPAN_SIMPLE; |
| } else { |
| if (spanLength != 0) { |
| // Not norm2.normalizeSecondAndAppend() because we do not want |
| // to modify the non-filter part of dest. |
| norm2.normalizeUTF8(options, StringPiece(src, spanLength), sink, edits, errorCode); |
| if (U_FAILURE(errorCode)) { |
| break; |
| } |
| } |
| spanCondition = USET_SPAN_NOT_CONTAINED; |
| } |
| src += spanLength; |
| length -= spanLength; |
| } |
| } |
| |
| UnicodeString & |
| FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first, |
| const UnicodeString &second, |
| UErrorCode &errorCode) const { |
| return normalizeSecondAndAppend(first, second, TRUE, errorCode); |
| } |
| |
| UnicodeString & |
| FilteredNormalizer2::append(UnicodeString &first, |
| const UnicodeString &second, |
| UErrorCode &errorCode) const { |
| return normalizeSecondAndAppend(first, second, FALSE, errorCode); |
| } |
| |
| UnicodeString & |
| FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first, |
| const UnicodeString &second, |
| UBool doNormalize, |
| UErrorCode &errorCode) const { |
| uprv_checkCanGetBuffer(first, errorCode); |
| uprv_checkCanGetBuffer(second, errorCode); |
| if(U_FAILURE(errorCode)) { |
| return first; |
| } |
| if(&first==&second) { |
| errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| return first; |
| } |
| if(first.isEmpty()) { |
| if(doNormalize) { |
| return normalize(second, first, errorCode); |
| } else { |
| return first=second; |
| } |
| } |
| // merge the in-filter suffix of the first string with the in-filter prefix of the second |
| int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE); |
| if(prefixLimit!=0) { |
| UnicodeString prefix(second.tempSubString(0, prefixLimit)); |
| int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE); |
| if(suffixStart==0) { |
| if(doNormalize) { |
| norm2.normalizeSecondAndAppend(first, prefix, errorCode); |
| } else { |
| norm2.append(first, prefix, errorCode); |
| } |
| } else { |
| UnicodeString middle(first, suffixStart, INT32_MAX); |
| if(doNormalize) { |
| norm2.normalizeSecondAndAppend(middle, prefix, errorCode); |
| } else { |
| norm2.append(middle, prefix, errorCode); |
| } |
| first.replace(suffixStart, INT32_MAX, middle); |
| } |
| } |
| if(prefixLimit<second.length()) { |
| UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX)); |
| if(doNormalize) { |
| normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode); |
| } else { |
| first.append(rest); |
| } |
| } |
| return first; |
| } |
| |
| UBool |
| FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const { |
| return set.contains(c) && norm2.getDecomposition(c, decomposition); |
| } |
| |
| UBool |
| FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const { |
| return set.contains(c) && norm2.getRawDecomposition(c, decomposition); |
| } |
| |
| UChar32 |
| FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const { |
| return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL; |
| } |
| |
| uint8_t |
| FilteredNormalizer2::getCombiningClass(UChar32 c) const { |
| return set.contains(c) ? norm2.getCombiningClass(c) : 0; |
| } |
| |
| UBool |
| FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const { |
| uprv_checkCanGetBuffer(s, errorCode); |
| if(U_FAILURE(errorCode)) { |
| return FALSE; |
| } |
| USetSpanCondition spanCondition=USET_SPAN_SIMPLE; |
| for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { |
| int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); |
| if(spanCondition==USET_SPAN_NOT_CONTAINED) { |
| spanCondition=USET_SPAN_SIMPLE; |
| } else { |
| if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) || |
| U_FAILURE(errorCode) |
| ) { |
| return FALSE; |
| } |
| spanCondition=USET_SPAN_NOT_CONTAINED; |
| } |
| prevSpanLimit=spanLimit; |
| } |
| return TRUE; |
| } |
| |
| UBool |
| FilteredNormalizer2::isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const { |
| if(U_FAILURE(errorCode)) { |
| return FALSE; |
| } |
| const char *s = sp.data(); |
| int32_t length = sp.length(); |
| USetSpanCondition spanCondition = USET_SPAN_SIMPLE; |
| while (length > 0) { |
| int32_t spanLength = set.spanUTF8(s, length, spanCondition); |
| if (spanCondition == USET_SPAN_NOT_CONTAINED) { |
| spanCondition = USET_SPAN_SIMPLE; |
| } else { |
| if (!norm2.isNormalizedUTF8(StringPiece(s, spanLength), errorCode) || |
| U_FAILURE(errorCode)) { |
| return FALSE; |
| } |
| spanCondition = USET_SPAN_NOT_CONTAINED; |
| } |
| s += spanLength; |
| length -= spanLength; |
| } |
| return TRUE; |
| } |
| |
| UNormalizationCheckResult |
| FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const { |
| uprv_checkCanGetBuffer(s, errorCode); |
| if(U_FAILURE(errorCode)) { |
| return UNORM_MAYBE; |
| } |
| UNormalizationCheckResult result=UNORM_YES; |
| USetSpanCondition spanCondition=USET_SPAN_SIMPLE; |
| for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { |
| int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); |
| if(spanCondition==USET_SPAN_NOT_CONTAINED) { |
| spanCondition=USET_SPAN_SIMPLE; |
| } else { |
| UNormalizationCheckResult qcResult= |
| norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode); |
| if(U_FAILURE(errorCode) || qcResult==UNORM_NO) { |
| return qcResult; |
| } else if(qcResult==UNORM_MAYBE) { |
| result=qcResult; |
| } |
| spanCondition=USET_SPAN_NOT_CONTAINED; |
| } |
| prevSpanLimit=spanLimit; |
| } |
| return result; |
| } |
| |
| int32_t |
| FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const { |
| uprv_checkCanGetBuffer(s, errorCode); |
| if(U_FAILURE(errorCode)) { |
| return 0; |
| } |
| USetSpanCondition spanCondition=USET_SPAN_SIMPLE; |
| for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { |
| int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); |
| if(spanCondition==USET_SPAN_NOT_CONTAINED) { |
| spanCondition=USET_SPAN_SIMPLE; |
| } else { |
| int32_t yesLimit= |
| prevSpanLimit+ |
| norm2.spanQuickCheckYes( |
| s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode); |
| if(U_FAILURE(errorCode) || yesLimit<spanLimit) { |
| return yesLimit; |
| } |
| spanCondition=USET_SPAN_NOT_CONTAINED; |
| } |
| prevSpanLimit=spanLimit; |
| } |
| return s.length(); |
| } |
| |
| UBool |
| FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const { |
| return !set.contains(c) || norm2.hasBoundaryBefore(c); |
| } |
| |
| UBool |
| FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const { |
| return !set.contains(c) || norm2.hasBoundaryAfter(c); |
| } |
| |
| UBool |
| FilteredNormalizer2::isInert(UChar32 c) const { |
| return !set.contains(c) || norm2.isInert(c); |
| } |
| |
| U_NAMESPACE_END |
| |
| // C API ------------------------------------------------------------------- *** |
| |
| U_NAMESPACE_USE |
| |
| U_CAPI UNormalizer2 * U_EXPORT2 |
| unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) { |
| if(U_FAILURE(*pErrorCode)) { |
| return NULL; |
| } |
| if(filterSet==NULL) { |
| *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| return NULL; |
| } |
| Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2, |
| *UnicodeSet::fromUSet(filterSet)); |
| if(fn2==NULL) { |
| *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
| } |
| return (UNormalizer2 *)fn2; |
| } |
| |
| #endif // !UCONFIG_NO_NORMALIZATION |