| /* |
| ********************************************************************** |
| * Copyright (C) 2001-2003, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| * Date Name Description |
| * 07/03/01 aliu Creation. |
| ********************************************************************** |
| */ |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_TRANSLITERATION |
| |
| #include "unicode/uniset.h" |
| #include "unicode/uiter.h" |
| #include "nortrans.h" |
| #include "unormimp.h" |
| #include "mutex.h" |
| #include "ucln_in.h" |
| |
| U_NAMESPACE_BEGIN |
| |
// Emits the run-time type identification support code for this class.
// NOTE(review): presumably this defines the class-ID accessors declared in
// nortrans.h -- confirm against the macro definition in unicode/uobject.h.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
| |
| /** |
| * System registration hook. |
| */ |
| void NormalizationTransliterator::registerIDs() { |
| UErrorCode errorCode = U_ZERO_ERROR; |
| if(!unorm_haveData(&errorCode)) { |
| return; |
| } |
| |
| Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"), |
| _create, integerToken(UNORM_NFC)); |
| Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"), |
| _create, integerToken(UNORM_NFKC)); |
| Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"), |
| _create, integerToken(UNORM_NFD)); |
| Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"), |
| _create, integerToken(UNORM_NFKD)); |
| Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"), |
| UNICODE_STRING_SIMPLE("NFD"), TRUE); |
| Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"), |
| UNICODE_STRING_SIMPLE("NFKD"), TRUE); |
| } |
| |
| /** |
| * Factory methods |
| */ |
| Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID, |
| Token context) { |
| return new NormalizationTransliterator(ID, (UNormalizationMode) context.integer, 0); |
| } |
| |
| /** |
| * Constructs a transliterator. |
| */ |
| NormalizationTransliterator::NormalizationTransliterator( |
| const UnicodeString& id, |
| UNormalizationMode mode, int32_t opt) : |
| Transliterator(id, 0) { |
| fMode = mode; |
| options = opt; |
| } |
| |
| /** |
| * Destructor. |
| */ |
| NormalizationTransliterator::~NormalizationTransliterator() { |
| } |
| |
| /** |
| * Copy constructor. |
| */ |
| NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) : |
| Transliterator(o) { |
| fMode = o.fMode; |
| options = o.options; |
| } |
| |
| /** |
| * Assignment operator. |
| */ |
| NormalizationTransliterator& NormalizationTransliterator::operator=(const NormalizationTransliterator& o) { |
| Transliterator::operator=(o); |
| fMode = o.fMode; |
| options = o.options; |
| return *this; |
| } |
| |
| /** |
| * Transliterator API. |
| */ |
| Transliterator* NormalizationTransliterator::clone(void) const { |
| return new NormalizationTransliterator(*this); |
| } |
| |
| /** |
| * Implements {@link Transliterator#handleTransliterate}. |
| */ |
| void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, |
| UBool isIncremental) const { |
| // start and limit of the input range |
| int32_t start = offsets.start; |
| int32_t limit = offsets.limit; |
| int32_t length, delta; |
| |
| if(start >= limit) { |
| return; |
| } |
| |
| // a C code unit iterator, implemented around the Replaceable |
| UCharIterator iter; |
| uiter_setReplaceable(&iter, &text); |
| |
| // the output string and buffer pointer |
| UnicodeString output; |
| UChar *buffer; |
| UBool neededToNormalize; |
| |
| UErrorCode errorCode; |
| |
| /* |
| * Normalize as short chunks at a time as possible even in |
| * bulk mode, so that styled text is minimally disrupted. |
| * In incremental mode, a chunk that ends with offsets.limit |
| * must not be normalized. |
| * |
| * If it was known that the input text is not styled, then |
| * a bulk mode normalization could look like this: |
| * |
| |
| UChar staticChars[256]; |
| UnicodeString input; |
| |
| length = limit - start; |
| input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias |
| |
| _Replaceable_extractBetween(text, start, limit, input.getBuffer(length)); |
| input.releaseBuffer(length); |
| |
| UErrorCode status = U_ZERO_ERROR; |
| Normalizer::normalize(input, fMode, options, output, status); |
| |
| text.handleReplaceBetween(start, limit, output); |
| |
| int32_t delta = output.length() - length; |
| offsets.contextLimit += delta; |
| offsets.limit += delta; |
| offsets.start = limit + delta; |
| |
| * |
| */ |
| while(start < limit) { |
| // set the iterator limits for the remaining input range |
| // this is a moving target because of the replacements in the text object |
| iter.start = iter.index = start; |
| iter.limit = limit; |
| |
| // incrementally normalize a small chunk of the input |
| buffer = output.getBuffer(-1); |
| errorCode = U_ZERO_ERROR; |
| length = unorm_next(&iter, buffer, output.getCapacity(), |
| fMode, 0, |
| TRUE, &neededToNormalize, |
| &errorCode); |
| output.releaseBuffer(length); |
| |
| if(errorCode == U_BUFFER_OVERFLOW_ERROR) { |
| // use a larger output string buffer and do it again from the start |
| iter.index = start; |
| buffer = output.getBuffer(length); |
| errorCode = U_ZERO_ERROR; |
| length = unorm_next(&iter, buffer, output.getCapacity(), |
| fMode, 0, |
| TRUE, &neededToNormalize, |
| &errorCode); |
| output.releaseBuffer(length); |
| } |
| |
| if(U_FAILURE(errorCode)) { |
| break; |
| } |
| |
| limit = iter.index; |
| if(isIncremental && limit == iter.limit) { |
| // stop in incremental mode when we reach the input limit |
| // in case there are additional characters that could change the |
| // normalization result |
| |
| // UNLESS all characters in the result of the normalization of |
| // the last run are in the skippable set |
| const UChar *s=output.getBuffer(); |
| int32_t i=0, outLength=output.length(); |
| UChar32 c; |
| |
| while(i<outLength) { |
| U16_NEXT(s, i, outLength, c); |
| if(!unorm_isNFSkippable(c, fMode)) { |
| outLength=-1; // I wish C++ had labeled loops and break outer; ... |
| break; |
| } |
| } |
| if (outLength<0) { |
| break; |
| } |
| } |
| |
| if(neededToNormalize) { |
| // replace the input chunk with its normalized form |
| text.handleReplaceBetween(start, limit, output); |
| |
| // update all necessary indexes accordingly |
| delta = length - (limit - start); // length change in the text object |
| start = limit += delta; // the next chunk starts where this one ends, with adjustment |
| limit = offsets.limit += delta; // set the iteration limit to the adjusted end of the input range |
| offsets.contextLimit += delta; |
| } else { |
| // delta == 0 |
| start = limit; |
| limit = offsets.limit; |
| } |
| } |
| |
| offsets.start = start; |
| } |
| |
| U_NAMESPACE_END |
| |
| #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |