// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *************************************************************************
 * COPYRIGHT:
 * Copyright (c) 1996-2012, International Business Machines Corporation and
 * others. All Rights Reserved.
 *************************************************************************
 */
|  |  | 
|  | #include "unicode/utypes.h" | 
|  |  | 
|  | #if !UCONFIG_NO_NORMALIZATION | 
|  |  | 
|  | #include "unicode/uniset.h" | 
|  | #include "unicode/unistr.h" | 
|  | #include "unicode/chariter.h" | 
|  | #include "unicode/schriter.h" | 
|  | #include "unicode/uchriter.h" | 
|  | #include "unicode/normlzr.h" | 
|  | #include "unicode/utf16.h" | 
|  | #include "cmemory.h" | 
|  | #include "normalizer2impl.h" | 
|  | #include "uprops.h"  // for uniset_getUnicode32Instance() | 
|  |  | 
|  | #if defined(_ARM64_) && defined(move32) | 
|  | // System can define move32 intrinsics, but the char iters define move32 method | 
|  | // using same undef trick in headers, so undef here to re-enable the method. | 
|  | #undef move32 | 
|  | #endif | 
|  |  | 
|  | U_NAMESPACE_BEGIN | 
|  |  | 
|  | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer) | 
|  |  | 
//-------------------------------------------------------------------------
// Constructors and other boilerplate
//-------------------------------------------------------------------------
|  |  | 
|  | Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) : | 
|  | UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), | 
|  | text(new StringCharacterIterator(str)), | 
|  | currentIndex(0), nextIndex(0), | 
|  | buffer(), bufferPos(0) | 
|  | { | 
|  | init(); | 
|  | } | 
|  |  | 
|  | Normalizer::Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode) : | 
|  | UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), | 
|  | text(new UCharCharacterIterator(str, length)), | 
|  | currentIndex(0), nextIndex(0), | 
|  | buffer(), bufferPos(0) | 
|  | { | 
|  | init(); | 
|  | } | 
|  |  | 
|  | Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) : | 
|  | UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), | 
|  | text(iter.clone()), | 
|  | currentIndex(0), nextIndex(0), | 
|  | buffer(), bufferPos(0) | 
|  | { | 
|  | init(); | 
|  | } | 
|  |  | 
|  | Normalizer::Normalizer(const Normalizer ©) : | 
|  | UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions), | 
|  | text(copy.text->clone()), | 
|  | currentIndex(copy.currentIndex), nextIndex(copy.nextIndex), | 
|  | buffer(copy.buffer), bufferPos(copy.bufferPos) | 
|  | { | 
|  | init(); | 
|  | } | 
|  |  | 
|  | void | 
|  | Normalizer::init() { | 
|  | UErrorCode errorCode=U_ZERO_ERROR; | 
|  | fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode); | 
|  | if(fOptions&UNORM_UNICODE_3_2) { | 
|  | delete fFilteredNorm2; | 
|  | fNorm2=fFilteredNorm2= | 
|  | new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode)); | 
|  | } | 
|  | if(U_FAILURE(errorCode)) { | 
|  | errorCode=U_ZERO_ERROR; | 
|  | fNorm2=Normalizer2Factory::getNoopInstance(errorCode); | 
|  | } | 
|  | } | 
|  |  | 
|  | Normalizer::~Normalizer() | 
|  | { | 
|  | delete fFilteredNorm2; | 
|  | delete text; | 
|  | } | 
|  |  | 
|  | Normalizer* | 
|  | Normalizer::clone() const | 
|  | { | 
|  | return new Normalizer(*this); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Generates a hash code for this iterator. | 
|  | */ | 
|  | int32_t Normalizer::hashCode() const | 
|  | { | 
|  | return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex; | 
|  | } | 
|  |  | 
|  | UBool Normalizer::operator==(const Normalizer& that) const | 
|  | { | 
|  | return | 
|  | this==&that || | 
|  | (fUMode==that.fUMode && | 
|  | fOptions==that.fOptions && | 
|  | *text==*that.text && | 
|  | buffer==that.buffer && | 
|  | bufferPos==that.bufferPos && | 
|  | nextIndex==that.nextIndex); | 
|  | } | 
|  |  | 
//-------------------------------------------------------------------------
// Static utility methods
//-------------------------------------------------------------------------
|  |  | 
|  | void U_EXPORT2 | 
|  | Normalizer::normalize(const UnicodeString& source, | 
|  | UNormalizationMode mode, int32_t options, | 
|  | UnicodeString& result, | 
|  | UErrorCode &status) { | 
|  | if(source.isBogus() || U_FAILURE(status)) { | 
|  | result.setToBogus(); | 
|  | if(U_SUCCESS(status)) { | 
|  | status=U_ILLEGAL_ARGUMENT_ERROR; | 
|  | } | 
|  | } else { | 
|  | UnicodeString localDest; | 
|  | UnicodeString *dest; | 
|  |  | 
|  | if(&source!=&result) { | 
|  | dest=&result; | 
|  | } else { | 
|  | // the source and result strings are the same object, use a temporary one | 
|  | dest=&localDest; | 
|  | } | 
|  | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); | 
|  | if(U_SUCCESS(status)) { | 
|  | if(options&UNORM_UNICODE_3_2) { | 
|  | FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). | 
|  | normalize(source, *dest, status); | 
|  | } else { | 
|  | n2->normalize(source, *dest, status); | 
|  | } | 
|  | } | 
|  | if(dest==&localDest && U_SUCCESS(status)) { | 
|  | result=*dest; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | void U_EXPORT2 | 
|  | Normalizer::compose(const UnicodeString& source, | 
|  | UBool compat, int32_t options, | 
|  | UnicodeString& result, | 
|  | UErrorCode &status) { | 
|  | normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status); | 
|  | } | 
|  |  | 
|  | void U_EXPORT2 | 
|  | Normalizer::decompose(const UnicodeString& source, | 
|  | UBool compat, int32_t options, | 
|  | UnicodeString& result, | 
|  | UErrorCode &status) { | 
|  | normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status); | 
|  | } | 
|  |  | 
|  | UNormalizationCheckResult | 
|  | Normalizer::quickCheck(const UnicodeString& source, | 
|  | UNormalizationMode mode, int32_t options, | 
|  | UErrorCode &status) { | 
|  | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); | 
|  | if(U_SUCCESS(status)) { | 
|  | if(options&UNORM_UNICODE_3_2) { | 
|  | return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). | 
|  | quickCheck(source, status); | 
|  | } else { | 
|  | return n2->quickCheck(source, status); | 
|  | } | 
|  | } else { | 
|  | return UNORM_MAYBE; | 
|  | } | 
|  | } | 
|  |  | 
|  | UBool | 
|  | Normalizer::isNormalized(const UnicodeString& source, | 
|  | UNormalizationMode mode, int32_t options, | 
|  | UErrorCode &status) { | 
|  | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); | 
|  | if(U_SUCCESS(status)) { | 
|  | if(options&UNORM_UNICODE_3_2) { | 
|  | return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). | 
|  | isNormalized(source, status); | 
|  | } else { | 
|  | return n2->isNormalized(source, status); | 
|  | } | 
|  | } else { | 
|  | return FALSE; | 
|  | } | 
|  | } | 
|  |  | 
|  | UnicodeString & U_EXPORT2 | 
|  | Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right, | 
|  | UnicodeString &result, | 
|  | UNormalizationMode mode, int32_t options, | 
|  | UErrorCode &errorCode) { | 
|  | if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) { | 
|  | result.setToBogus(); | 
|  | if(U_SUCCESS(errorCode)) { | 
|  | errorCode=U_ILLEGAL_ARGUMENT_ERROR; | 
|  | } | 
|  | } else { | 
|  | UnicodeString localDest; | 
|  | UnicodeString *dest; | 
|  |  | 
|  | if(&right!=&result) { | 
|  | dest=&result; | 
|  | } else { | 
|  | // the right and result strings are the same object, use a temporary one | 
|  | dest=&localDest; | 
|  | } | 
|  | *dest=left; | 
|  | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode); | 
|  | if(U_SUCCESS(errorCode)) { | 
|  | if(options&UNORM_UNICODE_3_2) { | 
|  | FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)). | 
|  | append(*dest, right, errorCode); | 
|  | } else { | 
|  | n2->append(*dest, right, errorCode); | 
|  | } | 
|  | } | 
|  | if(dest==&localDest && U_SUCCESS(errorCode)) { | 
|  | result=*dest; | 
|  | } | 
|  | } | 
|  | return result; | 
|  | } | 
|  |  | 
//-------------------------------------------------------------------------
// Iteration API
//-------------------------------------------------------------------------
|  |  | 
|  | /** | 
|  | * Return the current character in the normalized text. | 
|  | */ | 
|  | UChar32 Normalizer::current() { | 
|  | if(bufferPos<buffer.length() || nextNormalize()) { | 
|  | return buffer.char32At(bufferPos); | 
|  | } else { | 
|  | return DONE; | 
|  | } | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Return the next character in the normalized text and advance | 
|  | * the iteration position by one.  If the end | 
|  | * of the text has already been reached, {@link #DONE} is returned. | 
|  | */ | 
|  | UChar32 Normalizer::next() { | 
|  | if(bufferPos<buffer.length() ||  nextNormalize()) { | 
|  | UChar32 c=buffer.char32At(bufferPos); | 
|  | bufferPos+=U16_LENGTH(c); | 
|  | return c; | 
|  | } else { | 
|  | return DONE; | 
|  | } | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Return the previous character in the normalized text and decrement | 
|  | * the iteration position by one.  If the beginning | 
|  | * of the text has already been reached, {@link #DONE} is returned. | 
|  | */ | 
|  | UChar32 Normalizer::previous() { | 
|  | if(bufferPos>0 || previousNormalize()) { | 
|  | UChar32 c=buffer.char32At(bufferPos-1); | 
|  | bufferPos-=U16_LENGTH(c); | 
|  | return c; | 
|  | } else { | 
|  | return DONE; | 
|  | } | 
|  | } | 
|  |  | 
|  | void Normalizer::reset() { | 
|  | currentIndex=nextIndex=text->setToStart(); | 
|  | clearBuffer(); | 
|  | } | 
|  |  | 
|  | void | 
|  | Normalizer::setIndexOnly(int32_t index) { | 
|  | text->setIndex(index);  // pins index | 
|  | currentIndex=nextIndex=text->getIndex(); | 
|  | clearBuffer(); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Return the first character in the normalized text.  This resets | 
|  | * the <tt>Normalizer's</tt> position to the beginning of the text. | 
|  | */ | 
|  | UChar32 Normalizer::first() { | 
|  | reset(); | 
|  | return next(); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Return the last character in the normalized text.  This resets | 
|  | * the <tt>Normalizer's</tt> position to be just before the | 
|  | * the input text corresponding to that normalized character. | 
|  | */ | 
|  | UChar32 Normalizer::last() { | 
|  | currentIndex=nextIndex=text->setToEnd(); | 
|  | clearBuffer(); | 
|  | return previous(); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Retrieve the current iteration position in the input text that is | 
|  | * being normalized.  This method is useful in applications such as | 
|  | * searching, where you need to be able to determine the position in | 
|  | * the input text that corresponds to a given normalized output character. | 
|  | * <p> | 
|  | * <b>Note:</b> This method sets the position in the <em>input</em>, while | 
|  | * {@link #next} and {@link #previous} iterate through characters in the | 
|  | * <em>output</em>.  This means that there is not necessarily a one-to-one | 
|  | * correspondence between characters returned by <tt>next</tt> and | 
|  | * <tt>previous</tt> and the indices passed to and returned from | 
|  | * <tt>setIndex</tt> and {@link #getIndex}. | 
|  | * | 
|  | */ | 
|  | int32_t Normalizer::getIndex() const { | 
|  | if(bufferPos<buffer.length()) { | 
|  | return currentIndex; | 
|  | } else { | 
|  | return nextIndex; | 
|  | } | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Retrieve the index of the start of the input text.  This is the begin index | 
|  | * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt> | 
|  | * over which this <tt>Normalizer</tt> is iterating | 
|  | */ | 
|  | int32_t Normalizer::startIndex() const { | 
|  | return text->startIndex(); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Retrieve the index of the end of the input text.  This is the end index | 
|  | * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> | 
|  | * over which this <tt>Normalizer</tt> is iterating | 
|  | */ | 
|  | int32_t Normalizer::endIndex() const { | 
|  | return text->endIndex(); | 
|  | } | 
|  |  | 
//-------------------------------------------------------------------------
// Property access methods
//-------------------------------------------------------------------------
|  |  | 
|  | void | 
|  | Normalizer::setMode(UNormalizationMode newMode) | 
|  | { | 
|  | fUMode = newMode; | 
|  | init(); | 
|  | } | 
|  |  | 
|  | UNormalizationMode | 
|  | Normalizer::getUMode() const | 
|  | { | 
|  | return fUMode; | 
|  | } | 
|  |  | 
|  | void | 
|  | Normalizer::setOption(int32_t option, | 
|  | UBool value) | 
|  | { | 
|  | if (value) { | 
|  | fOptions |= option; | 
|  | } else { | 
|  | fOptions &= (~option); | 
|  | } | 
|  | init(); | 
|  | } | 
|  |  | 
|  | UBool | 
|  | Normalizer::getOption(int32_t option) const | 
|  | { | 
|  | return (fOptions & option) != 0; | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Set the input text over which this <tt>Normalizer</tt> will iterate. | 
|  | * The iteration position is set to the beginning of the input text. | 
|  | */ | 
|  | void | 
|  | Normalizer::setText(const UnicodeString& newText, | 
|  | UErrorCode &status) | 
|  | { | 
|  | if (U_FAILURE(status)) { | 
|  | return; | 
|  | } | 
|  | CharacterIterator *newIter = new StringCharacterIterator(newText); | 
|  | if (newIter == NULL) { | 
|  | status = U_MEMORY_ALLOCATION_ERROR; | 
|  | return; | 
|  | } | 
|  | delete text; | 
|  | text = newIter; | 
|  | reset(); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Set the input text over which this <tt>Normalizer</tt> will iterate. | 
|  | * The iteration position is set to the beginning of the string. | 
|  | */ | 
|  | void | 
|  | Normalizer::setText(const CharacterIterator& newText, | 
|  | UErrorCode &status) | 
|  | { | 
|  | if (U_FAILURE(status)) { | 
|  | return; | 
|  | } | 
|  | CharacterIterator *newIter = newText.clone(); | 
|  | if (newIter == NULL) { | 
|  | status = U_MEMORY_ALLOCATION_ERROR; | 
|  | return; | 
|  | } | 
|  | delete text; | 
|  | text = newIter; | 
|  | reset(); | 
|  | } | 
|  |  | 
|  | void | 
|  | Normalizer::setText(ConstChar16Ptr newText, | 
|  | int32_t length, | 
|  | UErrorCode &status) | 
|  | { | 
|  | if (U_FAILURE(status)) { | 
|  | return; | 
|  | } | 
|  | CharacterIterator *newIter = new UCharCharacterIterator(newText, length); | 
|  | if (newIter == NULL) { | 
|  | status = U_MEMORY_ALLOCATION_ERROR; | 
|  | return; | 
|  | } | 
|  | delete text; | 
|  | text = newIter; | 
|  | reset(); | 
|  | } | 
|  |  | 
|  | /** | 
|  | * Copies the text under iteration into the UnicodeString referred to by "result". | 
|  | * @param result Receives a copy of the text under iteration. | 
|  | */ | 
|  | void | 
|  | Normalizer::getText(UnicodeString&  result) | 
|  | { | 
|  | text->getText(result); | 
|  | } | 
|  |  | 
//-------------------------------------------------------------------------
// Private utility methods
//-------------------------------------------------------------------------
|  |  | 
|  | void Normalizer::clearBuffer() { | 
|  | buffer.remove(); | 
|  | bufferPos=0; | 
|  | } | 
|  |  | 
|  | UBool | 
|  | Normalizer::nextNormalize() { | 
|  | clearBuffer(); | 
|  | currentIndex=nextIndex; | 
|  | text->setIndex(nextIndex); | 
|  | if(!text->hasNext()) { | 
|  | return FALSE; | 
|  | } | 
|  | // Skip at least one character so we make progress. | 
|  | UnicodeString segment(text->next32PostInc()); | 
|  | while(text->hasNext()) { | 
|  | UChar32 c; | 
|  | if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) { | 
|  | text->move32(-1, CharacterIterator::kCurrent); | 
|  | break; | 
|  | } | 
|  | segment.append(c); | 
|  | } | 
|  | nextIndex=text->getIndex(); | 
|  | UErrorCode errorCode=U_ZERO_ERROR; | 
|  | fNorm2->normalize(segment, buffer, errorCode); | 
|  | return U_SUCCESS(errorCode) && !buffer.isEmpty(); | 
|  | } | 
|  |  | 
|  | UBool | 
|  | Normalizer::previousNormalize() { | 
|  | clearBuffer(); | 
|  | nextIndex=currentIndex; | 
|  | text->setIndex(currentIndex); | 
|  | if(!text->hasPrevious()) { | 
|  | return FALSE; | 
|  | } | 
|  | UnicodeString segment; | 
|  | while(text->hasPrevious()) { | 
|  | UChar32 c=text->previous32(); | 
|  | segment.insert(0, c); | 
|  | if(fNorm2->hasBoundaryBefore(c)) { | 
|  | break; | 
|  | } | 
|  | } | 
|  | currentIndex=text->getIndex(); | 
|  | UErrorCode errorCode=U_ZERO_ERROR; | 
|  | fNorm2->normalize(segment, buffer, errorCode); | 
|  | bufferPos=buffer.length(); | 
|  | return U_SUCCESS(errorCode) && !buffer.isEmpty(); | 
|  | } | 
|  |  | 
|  | U_NAMESPACE_END | 
|  |  | 
|  | #endif /* #if !UCONFIG_NO_NORMALIZATION */ |