source/common/compitr.cpp - external/github.com/unicode-org/icu - Git at Google

 /*
 **********************************************************************
 *   Copyright (C) 1996-1999, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */

 #include "dcmpdata.h"

 #include "compitr.h"

 #include "unicode/normlzr.h"

 /**
  * Construct a new <tt>ComposedCharIter</tt> with default behavior.
  * <p>
  * The decomposition threshold is set to <tt>DecompData::MAX_COMPAT</tt>,
  * the same value the two-argument constructor uses when <tt>compat</tt> is
  * <tt>false</tt>, i.e. canonical decompositions only.
  * <p>
  * Note: the <tt>hangul</tt> flag is initialized to <tt>FALSE</tt>, so the
  * Hangul-syllable range test in <tt>findNextChar()</tt> is disabled for
  * iterators built this way.  Hangul syllables are therefore returned only
  * if the decomposition table itself lists them (not verifiable from this
  * file).  Use the two-argument constructor with <tt>options == 0</tt> to
  * enable the Hangul range test.
  */
 ComposedCharIter::ComposedCharIter()
   : minDecomp(DecompData::MAX_COMPAT),
     hangul(FALSE),
     curChar(0),                        // scanning starts just above U+0000
     nextChar(ComposedCharIter::DONE)   // no look-ahead character cached yet
 {
 }


   /**
    * Build a <tt>ComposedCharIter</tt> whose behavior is selected by the caller.
    * <p>
    * @param compat    Pass <tt>false</tt> to iterate over characters with
    *                  canonical decompositions only; pass <tt>true</tt> to
    *                  include compatibility decompositions as well (the
    *                  decomposition threshold is lowered to zero).
    *
    * @param options   Bit set of optional features.  The only bit examined
    *                  here is {@link Normalizer#IGNORE_HANGUL}; when it is
    *                  set, the Hangul-syllable range test is switched off,
    *                  so the iterator does not visit Hangul characters on
    *                  account of their algorithmic Jamo decompositions.
    */
 ComposedCharIter::ComposedCharIter(UBool compat, int32_t options)
   : minDecomp(!compat ? DecompData::MAX_COMPAT : 0),
     hangul(!(options & Normalizer::IGNORE_HANGUL)),
     curChar(0),
     nextChar(ComposedCharIter::DONE)
 {
     // All state is established by the initializer list above.
 }

 /**
  * Reports whether {@link #next} still has a precomposed character to return.
  * <p>
  * Although declared <tt>const</tt>, this method may run the look-ahead scan
  * and cache its result in <tt>nextChar</tt>, which is why constness is cast
  * away below.
  */
 UBool ComposedCharIter::hasNext() const {
     if (DONE == nextChar) {
         // Nothing cached yet: look ahead through a non-const alias.
         ComposedCharIter* self = (ComposedCharIter*)this;
         self->findNextChar();
     }
     return DONE != nextChar;
 }

 /**
  * Advances the iterator and returns the character it now points at.
  * <p>
  * Successive calls yield characters in ascending code-unit order, because
  * the look-ahead scan always proceeds upward from the current character.
  * Once the scan finds nothing further, this method returns {@link #DONE}
  * and keeps returning it on every later call; {@link #hasNext} then
  * reports <tt>false</tt>.
  */
 UChar ComposedCharIter::next()
 {
     // Reuse the look-ahead cached by hasNext(), or compute it now.
     if (DONE == nextChar) {
         findNextChar();
     }

     // Promote the look-ahead to "current" and clear the cache so that the
     // following call scans again.
     curChar = nextChar;
     nextChar = DONE;

     return curChar;
 }

 /**
  * Returns the Unicode decomposition of the current character.
  * This method produces the decomposition of the precomposed character most
  * recently returned by {@link #next}.  The result is affected by the
  * <tt>compat</tt> flag and the {@link Normalizer#IGNORE_HANGUL} option
  * that were passed to the constructor.
  *
  * @param result    Receives the decomposition.  Any previous contents are
  *                  discarded.  If the current character has no applicable
  *                  decomposition, the character itself is stored.
  */
 void ComposedCharIter::getDecomposition(UnicodeString& result) const
 {
     // Most of Normalizer::decompose() is duplicated here for efficiency.
     // The recursive decomposition logic is deliberately NOT duplicated:
     // when the table entry is flagged DECOMP_RECURSE (per the original
     // author, only 16 characters in Unicode 3.0) the work is delegated to
     // Normalizer::decompose(), avoiding a second copy of its recursion
     // buffers here.

     result.truncate(0);

     uint16_t offset = ucmp16_getu(DecompData::offsets, curChar);
     uint16_t index  = (uint16_t)(offset & DecompData::DECOMP_MASK);
     if (index > minDecomp) {
         if ((offset & DecompData::DECOMP_RECURSE) != 0) {
             // Let Normalizer::decompose() handle recursive decomposition.
             // The constructor stores compat as (minDecomp == 0) and stores
             // hangul as "IGNORE_HANGUL not set", so both values must be
             // mapped back accordingly when rebuilding the arguments.
             UnicodeString temp(curChar);
             UErrorCode status = U_ZERO_ERROR;   // not inspected afterwards
             Normalizer::decompose(temp, minDecomp == 0,
                                   hangul ? 0 : Normalizer::IGNORE_HANGUL,
                                   result, status);
         } else {
             // Simple table-driven decomposition.
             Normalizer::doAppend((const UChar*)DecompData::contents, index, result);
         }
     }
     else if (hangul && curChar >= Normalizer::HANGUL_BASE && curChar < Normalizer::HANGUL_LIMIT) {
         // Hangul syllables are decomposed algorithmically rather than
         // through the table.
         Normalizer::hangulToJamo(curChar, result, (uint16_t)minDecomp);
     }
     else {
         // No decomposition applies under the current settings.
         result += curChar;
     }
 }

 void ComposedCharIter::findNextChar()
 {
     if (curChar != DONE) {
         UChar ch = curChar;
         while (++ch < 0xFFFF) {
             UChar offset = ucmp16_getu(DecompData::offsets, ch);
             if (offset > minDecomp
                 || (hangul && ch >= Normalizer::HANGUL_BASE && ch < Normalizer::HANGUL_LIMIT) ) {
                 nextChar = ch;
                 break;
             }
         }
     }
 }
	/*
	**********************************************************************
	* Copyright (C) 1996-1999, International Business Machines
	* Corporation and others. All Rights Reserved.
	**********************************************************************
	*/

	#include "dcmpdata.h"

	#include "compitr.h"

	#include "unicode/normlzr.h"

	/**
	* Construct a new <tt>ComposedCharIter</tt> with default behavior.
	* <p>
	* The decomposition threshold is set to <tt>DecompData::MAX_COMPAT</tt>,
	* the same value the two-argument constructor uses when <tt>compat</tt> is
	* <tt>false</tt>, i.e. canonical decompositions only.
	* <p>
	* Note: the <tt>hangul</tt> flag is initialized to <tt>FALSE</tt>, so the
	* Hangul-syllable range test in <tt>findNextChar()</tt> is disabled for
	* iterators built this way. Hangul syllables are therefore returned only
	* if the decomposition table itself lists them (not verifiable from this
	* file). Use the two-argument constructor with <tt>options == 0</tt> to
	* enable the Hangul range test.
	*/
	ComposedCharIter::ComposedCharIter()
	: minDecomp(DecompData::MAX_COMPAT),
	hangul(FALSE),
	curChar(0), // scanning starts just above U+0000
	nextChar(ComposedCharIter::DONE) // no look-ahead character cached yet
	{
	}


	/**
	 * Build a <tt>ComposedCharIter</tt> whose behavior is selected by the caller.
	 * <p>
	 * @param compat    Pass <tt>false</tt> to iterate over characters with
	 *                  canonical decompositions only; pass <tt>true</tt> to
	 *                  include compatibility decompositions as well (the
	 *                  decomposition threshold is lowered to zero).
	 *
	 * @param options   Bit set of optional features. The only bit examined
	 *                  here is {@link Normalizer#IGNORE_HANGUL}; when it is
	 *                  set, the Hangul-syllable range test is switched off,
	 *                  so the iterator does not visit Hangul characters on
	 *                  account of their algorithmic Jamo decompositions.
	 */
	ComposedCharIter::ComposedCharIter(UBool compat, int32_t options)
	    : minDecomp(!compat ? DecompData::MAX_COMPAT : 0),
	      hangul(!(options & Normalizer::IGNORE_HANGUL)),
	      curChar(0),
	      nextChar(ComposedCharIter::DONE)
	{
	    // All state is established by the initializer list above.
	}

	/**
	 * Reports whether {@link #next} still has a precomposed character to return.
	 * <p>
	 * Although declared <tt>const</tt>, this method may run the look-ahead scan
	 * and cache its result in <tt>nextChar</tt>, which is why constness is cast
	 * away below.
	 */
	UBool ComposedCharIter::hasNext() const {
	    if (DONE == nextChar) {
	        // Nothing cached yet: look ahead through a non-const alias.
	        ComposedCharIter* self = (ComposedCharIter*)this;
	        self->findNextChar();
	    }
	    return DONE != nextChar;
	}

	/**
	 * Advances the iterator and returns the character it now points at.
	 * <p>
	 * Successive calls yield characters in ascending code-unit order, because
	 * the look-ahead scan always proceeds upward from the current character.
	 * Once the scan finds nothing further, this method returns {@link #DONE}
	 * and keeps returning it on every later call; {@link #hasNext} then
	 * reports <tt>false</tt>.
	 */
	UChar ComposedCharIter::next()
	{
	    // Reuse the look-ahead cached by hasNext(), or compute it now.
	    if (DONE == nextChar) {
	        findNextChar();
	    }

	    // Promote the look-ahead to "current" and clear the cache so that the
	    // following call scans again.
	    curChar = nextChar;
	    nextChar = DONE;

	    return curChar;
	}

	/**
	 * Returns the Unicode decomposition of the current character.
	 * This method produces the decomposition of the precomposed character most
	 * recently returned by {@link #next}. The result is affected by the
	 * <tt>compat</tt> flag and the {@link Normalizer#IGNORE_HANGUL} option
	 * that were passed to the constructor.
	 *
	 * @param result    Receives the decomposition. Any previous contents are
	 *                  discarded. If the current character has no applicable
	 *                  decomposition, the character itself is stored.
	 */
	void ComposedCharIter::getDecomposition(UnicodeString& result) const
	{
	    // Most of Normalizer::decompose() is duplicated here for efficiency.
	    // The recursive decomposition logic is deliberately NOT duplicated:
	    // when the table entry is flagged DECOMP_RECURSE (per the original
	    // author, only 16 characters in Unicode 3.0) the work is delegated to
	    // Normalizer::decompose(), avoiding a second copy of its recursion
	    // buffers here.

	    result.truncate(0);

	    uint16_t offset = ucmp16_getu(DecompData::offsets, curChar);
	    uint16_t index = (uint16_t)(offset & DecompData::DECOMP_MASK);
	    if (index > minDecomp) {
	        if ((offset & DecompData::DECOMP_RECURSE) != 0) {
	            // Let Normalizer::decompose() handle recursive decomposition.
	            // The constructor stores compat as (minDecomp == 0) and stores
	            // hangul as "IGNORE_HANGUL not set", so both values must be
	            // mapped back accordingly when rebuilding the arguments.
	            UnicodeString temp(curChar);
	            UErrorCode status = U_ZERO_ERROR; // not inspected afterwards
	            Normalizer::decompose(temp, minDecomp == 0,
	                                  hangul ? 0 : Normalizer::IGNORE_HANGUL,
	                                  result, status);
	        } else {
	            // Simple table-driven decomposition.
	            Normalizer::doAppend((const UChar*)DecompData::contents, index, result);
	        }
	    }
	    else if (hangul && curChar >= Normalizer::HANGUL_BASE && curChar < Normalizer::HANGUL_LIMIT) {
	        // Hangul syllables are decomposed algorithmically rather than
	        // through the table.
	        Normalizer::hangulToJamo(curChar, result, (uint16_t)minDecomp);
	    }
	    else {
	        // No decomposition applies under the current settings.
	        result += curChar;
	    }
	}

	void ComposedCharIter::findNextChar()
	{
	if (curChar != DONE) {
	UChar ch = curChar;
	while (++ch < 0xFFFF) {
	UChar offset = ucmp16_getu(DecompData::offsets, ch);
	if (offset > minDecomp
	\|\| (hangul && ch >= Normalizer::HANGUL_BASE && ch < Normalizer::HANGUL_LIMIT) ) {
	nextChar = ch;
	break;
	}
	}
	}
	}