source/i18n/nortrans.cpp - external/github.com/unicode-org/icu - Git at Google

 /*
 **********************************************************************
 *   Copyright (C) 2001-2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
 *   07/03/01    aliu        Creation.
 **********************************************************************
 */

 #include "unicode/utypes.h"

 #if !UCONFIG_NO_TRANSLITERATION

 #include "unicode/uniset.h"
 #include "unicode/uiter.h"
 #include "nortrans.h"
 #include "unormimp.h"
 #include "mutex.h"
 #include "ucln_in.h"

 U_NAMESPACE_BEGIN

 // Expands the ICU RTTI implementation macro for NormalizationTransliterator.
 // NOTE(review): presumably this supplies the class-ID functions declared in
 // nortrans.h -- confirm against the macro definition.
 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)

 /**
  * System registration hook.
  */
 void NormalizationTransliterator::registerIDs() {
     UErrorCode errorCode = U_ZERO_ERROR;
     if(!unorm_haveData(&errorCode)) {
         return;
     }

     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
                                      _create, integerToken(UNORM_NFC));
     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
                                      _create, integerToken(UNORM_NFKC));
     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
                                      _create, integerToken(UNORM_NFD));
     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
                                      _create, integerToken(UNORM_NFKD));
     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
                                             UNICODE_STRING_SIMPLE("NFD"), TRUE);
     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
                                             UNICODE_STRING_SIMPLE("NFKD"), TRUE);
 }

 /**
  * Factory method passed to Transliterator::_registerFactory().
  *
  * @param ID      the ID given to the new transliterator
  * @param context token whose integer member is interpreted as the
  *                UNormalizationMode
  * @return a newly allocated NormalizationTransliterator created with
  *         options 0
  */
 Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
                                                      Token context) {
     const UNormalizationMode requestedMode =
         static_cast<UNormalizationMode>(context.integer);
     const int32_t noOptions = 0;
     return new NormalizationTransliterator(ID, requestedMode, noOptions);
 }

 /**
  * Constructs a transliterator.
  *
  * @param id   ID forwarded to the Transliterator base class (the base is
  *             constructed with 0 as its second argument)
  * @param mode normalization mode, stored in fMode
  * @param opt  normalization options, stored in options
  */
 NormalizationTransliterator::NormalizationTransliterator(
                                  const UnicodeString& id,
                                  UNormalizationMode mode, int32_t opt) :
     Transliterator(id, 0),
     fMode(mode),
     options(opt) {
 }

 /**
  * Destructor. The body is empty; no cleanup is performed here.
  */
 NormalizationTransliterator::~NormalizationTransliterator() {
 }

 /**
  * Copy constructor.
  *
  * Copy-constructs the Transliterator base from o and copies the
  * normalization mode and options.
  *
  * @param o the transliterator to copy
  */
 NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
     Transliterator(o),
     fMode(o.fMode),
     options(o.options) {
 }

 /**
  * Assignment operator.
  *
  * Assigns the Transliterator base part first, then copies the
  * normalization options and mode from o.
  *
  * @param o the transliterator to assign from
  * @return *this
  */
 NormalizationTransliterator& NormalizationTransliterator::operator=(const NormalizationTransliterator& o) {
     // base-class state is handled by the base-class operator
     Transliterator::operator=(o);

     // this class's own two fields
     options = o.options;
     fMode = o.fMode;

     return *this;
 }

 /**
  * Transliterator API.
  *
  * @return a newly allocated copy of this object, made with the copy
  *         constructor
  */
 Transliterator* NormalizationTransliterator::clone(void) const {
     NormalizationTransliterator* duplicate = new NormalizationTransliterator(*this);
     return duplicate;
 }

 /**
  * Implements {@link Transliterator#handleTransliterate}.
  *
  * Normalizes the text between offsets.start and offsets.limit in place,
  * one unorm_next() chunk at a time. Only chunks for which unorm_next()
  * sets neededToNormalize are replaced in the text object.
  *
  * @param text          the text to modify
  * @param offsets       on input, start and limit delimit the range to
  *                      process; on output, start is the end of the last
  *                      processed chunk, and limit and contextLimit have
  *                      been adjusted by the accumulated length change
  * @param isIncremental if TRUE, a chunk that reaches the input limit is
  *                      left unprocessed unless every code point of its
  *                      normalized form passes unorm_isNFSkippable()
  */
 void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                       UBool isIncremental) const {
     // start and limit of the input range
     int32_t start = offsets.start;
     int32_t limit = offsets.limit;
     int32_t length, delta;

     // nothing to do for an empty range; offsets is left untouched
     if(start >= limit) {
         return;
     }

     // a C code unit iterator, implemented around the Replaceable
     UCharIterator iter;
     uiter_setReplaceable(&iter, &text);

     // the output string and buffer pointer
     UnicodeString output;
     UChar *buffer;
     UBool neededToNormalize;

     // reset to U_ZERO_ERROR before every unorm_next() call below
     UErrorCode errorCode;

     /*
      * Normalize as short chunks at a time as possible even in
      * bulk mode, so that styled text is minimally disrupted.
      * In incremental mode, a chunk that ends with offsets.limit
      * must not be normalized.
      *
      * If it was known that the input text is not styled, then
      * a bulk mode normalization could look like this:
      *

     UChar staticChars[256];
     UnicodeString input;

     length = limit - start;
     input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias

     _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
     input.releaseBuffer(length);

     UErrorCode status = U_ZERO_ERROR;
     Normalizer::normalize(input, fMode, options, output, status);

     text.handleReplaceBetween(start, limit, output);

     int32_t delta = output.length() - length;
     offsets.contextLimit += delta;
     offsets.limit += delta;
     offsets.start = limit + delta;

      *
      */
     while(start < limit) {
         // set the iterator limits for the remaining input range
         // this is a moving target because of the replacements in the text object
         iter.start = iter.index = start;
         iter.limit = limit;

         // incrementally normalize a small chunk of the input
         // NOTE(review): the argument after fMode is the literal 0; the
         // options member of this object is not consulted here -- confirm
         // that this is intended.
         buffer = output.getBuffer(-1);
         errorCode = U_ZERO_ERROR;
         length = unorm_next(&iter, buffer, output.getCapacity(),
                             fMode, 0,
                             TRUE, &neededToNormalize,
                             &errorCode);
         output.releaseBuffer(length);

         if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
             // use a larger output string buffer and do it again from the start
             // (length from the first call is used as the requested capacity)
             iter.index = start;
             buffer = output.getBuffer(length);
             errorCode = U_ZERO_ERROR;
             length = unorm_next(&iter, buffer, output.getCapacity(),
                                 fMode, 0,
                                 TRUE, &neededToNormalize,
                                 &errorCode);
             output.releaseBuffer(length);
         }

         // on any remaining error, stop; offsets.start will be set to the
         // start of the chunk that failed
         if(U_FAILURE(errorCode)) {
             break;
         }

         // from here on, limit is the end of the chunk just read
         limit = iter.index;
         if(isIncremental && limit == iter.limit) {
             // stop in incremental mode when we reach the input limit
             // in case there are additional characters that could change the
             // normalization result

             // UNLESS all characters in the result of the normalization of
             // the last run are in the skippable set
             const UChar *s=output.getBuffer();
             int32_t i=0, outLength=output.length();
             UChar32 c;

             while(i<outLength) {
                 U16_NEXT(s, i, outLength, c);
                 if(!unorm_isNFSkippable(c, fMode)) {
                     outLength=-1; // I wish C++ had labeled loops and break outer; ...
                     break;
                 }
             }
             // outLength < 0 is the flag set above: leave this chunk unprocessed
             if (outLength<0) {
                 break;
             }
         }

         if(neededToNormalize) {
             // replace the input chunk with its normalized form
             text.handleReplaceBetween(start, limit, output);

             // update all necessary indexes accordingly
             delta = length - (limit - start);   // length change in the text object
             start = limit += delta;             // the next chunk starts where this one ends, with adjustment
             limit = offsets.limit += delta;     // set the iteration limit to the adjusted end of the input range
             offsets.contextLimit += delta;
         } else {
             // delta == 0
             start = limit;
             limit = offsets.limit;
         }
     }

     // report how far processing got
     offsets.start = start;
 }

 U_NAMESPACE_END

 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
	/*
	**********************************************************************
	* Copyright (C) 2001-2003, International Business Machines
	* Corporation and others. All Rights Reserved.
	**********************************************************************
	* Date Name Description
	* 07/03/01 aliu Creation.
	**********************************************************************
	*/

	#include "unicode/utypes.h"

	#if !UCONFIG_NO_TRANSLITERATION

	#include "unicode/uniset.h"
	#include "unicode/uiter.h"
	#include "nortrans.h"
	#include "unormimp.h"
	#include "mutex.h"
	#include "ucln_in.h"

	U_NAMESPACE_BEGIN

	// Expands the ICU RTTI implementation macro for NormalizationTransliterator.
	// NOTE(review): presumably this supplies the class-ID functions declared in
	// nortrans.h -- confirm against the macro definition.
	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)

	/**
	* System registration hook.
	*/
	void NormalizationTransliterator::registerIDs() {
	UErrorCode errorCode = U_ZERO_ERROR;
	if(!unorm_haveData(&errorCode)) {
	return;
	}

	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
	_create, integerToken(UNORM_NFC));
	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
	_create, integerToken(UNORM_NFKC));
	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
	_create, integerToken(UNORM_NFD));
	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
	_create, integerToken(UNORM_NFKD));
	Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
	UNICODE_STRING_SIMPLE("NFD"), TRUE);
	Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
	UNICODE_STRING_SIMPLE("NFKD"), TRUE);
	}

	/**
	 * Factory method passed to Transliterator::_registerFactory().
	 *
	 * @param ID      the ID given to the new transliterator
	 * @param context token whose integer member is interpreted as the
	 *                UNormalizationMode
	 * @return a newly allocated NormalizationTransliterator created with
	 *         options 0
	 */
	Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
	                                                     Token context) {
	    const UNormalizationMode requestedMode =
	        static_cast<UNormalizationMode>(context.integer);
	    const int32_t noOptions = 0;
	    return new NormalizationTransliterator(ID, requestedMode, noOptions);
	}

	/**
	 * Constructs a transliterator.
	 *
	 * @param id   ID forwarded to the Transliterator base class (the base is
	 *             constructed with 0 as its second argument)
	 * @param mode normalization mode, stored in fMode
	 * @param opt  normalization options, stored in options
	 */
	NormalizationTransliterator::NormalizationTransliterator(
	                                 const UnicodeString& id,
	                                 UNormalizationMode mode, int32_t opt) :
	    Transliterator(id, 0),
	    fMode(mode),
	    options(opt) {
	}

	/**
	* Destructor. The body is empty; no cleanup is performed here.
	*/
	NormalizationTransliterator::~NormalizationTransliterator() {
	}

	/**
	 * Copy constructor.
	 *
	 * Copy-constructs the Transliterator base from o and copies the
	 * normalization mode and options.
	 *
	 * @param o the transliterator to copy
	 */
	NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
	    Transliterator(o),
	    fMode(o.fMode),
	    options(o.options) {
	}

	/**
	 * Assignment operator.
	 *
	 * Assigns the Transliterator base part first, then copies the
	 * normalization options and mode from o.
	 *
	 * @param o the transliterator to assign from
	 * @return *this
	 */
	NormalizationTransliterator& NormalizationTransliterator::operator=(const NormalizationTransliterator& o) {
	    // base-class state is handled by the base-class operator
	    Transliterator::operator=(o);

	    // this class's own two fields
	    options = o.options;
	    fMode = o.fMode;

	    return *this;
	}

	/**
	 * Transliterator API.
	 *
	 * @return a newly allocated copy of this object, made with the copy
	 *         constructor
	 */
	Transliterator* NormalizationTransliterator::clone(void) const {
	    NormalizationTransliterator* duplicate = new NormalizationTransliterator(*this);
	    return duplicate;
	}

	/**
	* Implements {@link Transliterator#handleTransliterate}.
	*
	* Normalizes the text between offsets.start and offsets.limit in place,
	* one unorm_next() chunk at a time. Only chunks for which unorm_next()
	* sets neededToNormalize are replaced in the text object.
	*
	* @param text          the text to modify
	* @param offsets       on input, start and limit delimit the range to
	*                      process; on output, start is the end of the last
	*                      processed chunk, and limit and contextLimit have
	*                      been adjusted by the accumulated length change
	* @param isIncremental if TRUE, a chunk that reaches the input limit is
	*                      left unprocessed unless every code point of its
	*                      normalized form passes unorm_isNFSkippable()
	*/
	void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
	UBool isIncremental) const {
	// start and limit of the input range
	int32_t start = offsets.start;
	int32_t limit = offsets.limit;
	int32_t length, delta;

	// nothing to do for an empty range; offsets is left untouched
	if(start >= limit) {
	return;
	}

	// a C code unit iterator, implemented around the Replaceable
	UCharIterator iter;
	uiter_setReplaceable(&iter, &text);

	// the output string and buffer pointer
	UnicodeString output;
	UChar *buffer;
	UBool neededToNormalize;

	// reset to U_ZERO_ERROR before every unorm_next() call below
	UErrorCode errorCode;

	/*
	* Normalize as short chunks at a time as possible even in
	* bulk mode, so that styled text is minimally disrupted.
	* In incremental mode, a chunk that ends with offsets.limit
	* must not be normalized.
	*
	* If it was known that the input text is not styled, then
	* a bulk mode normalization could look like this:
	*

	UChar staticChars[256];
	UnicodeString input;

	length = limit - start;
	input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias

	_Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
	input.releaseBuffer(length);

	UErrorCode status = U_ZERO_ERROR;
	Normalizer::normalize(input, fMode, options, output, status);

	text.handleReplaceBetween(start, limit, output);

	int32_t delta = output.length() - length;
	offsets.contextLimit += delta;
	offsets.limit += delta;
	offsets.start = limit + delta;

	*
	*/
	while(start < limit) {
	// set the iterator limits for the remaining input range
	// this is a moving target because of the replacements in the text object
	iter.start = iter.index = start;
	iter.limit = limit;

	// incrementally normalize a small chunk of the input
	// NOTE(review): the argument after fMode is the literal 0; the
	// options member of this object is not consulted here -- confirm
	// that this is intended.
	buffer = output.getBuffer(-1);
	errorCode = U_ZERO_ERROR;
	length = unorm_next(&iter, buffer, output.getCapacity(),
	fMode, 0,
	TRUE, &neededToNormalize,
	&errorCode);
	output.releaseBuffer(length);

	if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
	// use a larger output string buffer and do it again from the start
	// (length from the first call is used as the requested capacity)
	iter.index = start;
	buffer = output.getBuffer(length);
	errorCode = U_ZERO_ERROR;
	length = unorm_next(&iter, buffer, output.getCapacity(),
	fMode, 0,
	TRUE, &neededToNormalize,
	&errorCode);
	output.releaseBuffer(length);
	}

	// on any remaining error, stop; offsets.start will be set to the
	// start of the chunk that failed
	if(U_FAILURE(errorCode)) {
	break;
	}

	// from here on, limit is the end of the chunk just read
	limit = iter.index;
	if(isIncremental && limit == iter.limit) {
	// stop in incremental mode when we reach the input limit
	// in case there are additional characters that could change the
	// normalization result

	// UNLESS all characters in the result of the normalization of
	// the last run are in the skippable set
	const UChar *s=output.getBuffer();
	int32_t i=0, outLength=output.length();
	UChar32 c;

	while(i<outLength) {
	U16_NEXT(s, i, outLength, c);
	if(!unorm_isNFSkippable(c, fMode)) {
	outLength=-1; // I wish C++ had labeled loops and break outer; ...
	break;
	}
	}
	// outLength < 0 is the flag set above: leave this chunk unprocessed
	if (outLength<0) {
	break;
	}
	}

	if(neededToNormalize) {
	// replace the input chunk with its normalized form
	text.handleReplaceBetween(start, limit, output);

	// update all necessary indexes accordingly
	delta = length - (limit - start); // length change in the text object
	start = limit += delta; // the next chunk starts where this one ends, with adjustment
	limit = offsets.limit += delta; // set the iteration limit to the adjusted end of the input range
	offsets.contextLimit += delta;
	} else {
	// delta == 0
	start = limit;
	limit = offsets.limit;
	}
	}

	// report how far processing got
	offsets.start = start;
	}

	U_NAMESPACE_END

	#endif /* #if !UCONFIG_NO_TRANSLITERATION */