source/i18n/ptnentry.cpp - external/github.com/unicode-org/icu - Git at Google

 /*******************************************************************************
  * Copyright (C) 1996-1999, International Business Machines Corporation and    *
  * others. All Rights Reserved.                                                *
  *******************************************************************************
  */
 //=============================================================================
 //
 // File ptnentry.cpp
 //
 // Contains PatternEntry, an internal class used by MergeCollation to store
 // one collation element from a pattern.
 //
 // Created by: Helena Shih
 //
 // Modification History:
 //
 //  Date         Name          Description
 // 04/23/99      stephen       Removed EDecompositionMode, merged with
 //                             Normalizer::EMode
 //                             Removed character literals.
 //=============================================================================

 #include "ptnentry.h"

 #include "unicode/unicode.h"
 #include "unicode/coll.h"
 #include "unicode/normlzr.h"


 // Definitions of the static strength sentinels.
 //   UNSET - no strength has been assigned to the entry.
 //   RESET - the entry was introduced by a reset ('&').
 const int32_t PatternEntry::UNSET = -1;
 const int32_t PatternEntry::RESET = -2;

 // ===== privates =====

 // Default constructor: strength starts out as UNSET; chars and extension are
 // left default-constructed.
 PatternEntry::PatternEntry() : strength(UNSET) {}

 // Copy constructor: duplicates strength, chars and extension from `other`.
 PatternEntry::PatternEntry(const PatternEntry& other)
   : strength(other.strength),
     chars(other.chars),
     extension(other.extension)
 {}

 // Builds an entry from its parsed pieces.
 //
 // newStrength and newExtensions are stored unchanged.  newChars is passed
 // through Normalizer::normalize() with decompMode and the output is kept in
 // chars.  The original author's note describes this as converting
 // precomposed characters into a base character followed by combining
 // characters in canonical order; the exact transformation is whatever
 // decompMode selects.
 PatternEntry::PatternEntry(int32_t newStrength,
                            const UnicodeString& newChars,
                            const UnicodeString& newExtensions,
                            Normalizer::EMode decompMode)
   : strength(newStrength),
     extension(newExtensions)
 {
   UErrorCode normalizeStatus = U_ZERO_ERROR;

   Normalizer::normalize(newChars, decompMode, 0, chars, normalizeStatus);

   if (U_FAILURE(normalizeStatus)) {
     // Best effort: fall back to the text exactly as it was supplied.
     chars = newChars;
   }
 }

 // Destructor: no explicit cleanup is performed here.
 PatternEntry::~PatternEntry() {}

 // Assignment: copies strength, chars and extension from `other`.
 // Assigning an object to itself does nothing.  Returns *this.
 const PatternEntry&
 PatternEntry::operator=(const PatternEntry& other)
 {
   if (this == &other) {
     return *this;
   }

   strength  = other.strength;
   chars     = other.chars;
   extension = other.extension;
   return *this;
 }

 /**
  * Appends this entry's extension to toAddTo, quoting it through
  * appendQuoted().  Handy when building a human-readable form of a pattern.
  */
 void PatternEntry::appendQuotedExtension(UnicodeString& toAddTo) const
 {
   appendQuoted(this->extension, toAddTo);
 }

 /**
  * Appends this entry's chars to toAddTo, quoting them through
  * appendQuoted().  Handy when building a human-readable form of a pattern.
  */
 void PatternEntry::appendQuotedChars(UnicodeString& toAddTo) const
 {
   appendQuoted(this->chars, toAddTo);
 }

 // Two entries are equal when strength, chars and extension all match.
 // The fields are compared in that order and the first mismatch decides.
 bool_t PatternEntry::equals(const PatternEntry& other) const {
   if (!(strength == other.strength)) {
     return FALSE;
   }
   if (!(chars == other.chars)) {
     return FALSE;
   }
   if (!(extension == other.extension)) {
     return FALSE;
   }
   return TRUE;
 }

 /**
  * Debugging aid: appends the textual form of this entry to result (with the
  * extension, without white space, and without a preceding entry) and
  * returns result.
  */
 UnicodeString&
 PatternEntry::toString(UnicodeString& result) const
 {
   const bool_t withExtension  = TRUE;
   const bool_t withWhiteSpace = FALSE;

   addToBuffer(result, withExtension, withWhiteSpace, NULL);
   return result;
 }

 // Returns the strength stored in this entry.
 int32_t
 PatternEntry::getStrength() const
 {
   return this->strength;
 }

 // Copies this entry's extension into ext and returns a reference to ext.
 const UnicodeString&
 PatternEntry::getExtension(UnicodeString& ext) const
 {
   return (ext = extension);
 }

 // Copies this entry's chars into result and returns a reference to result.
 const UnicodeString&
 PatternEntry::getChars(UnicodeString& result) const
 {
   return (result = chars);
 }

 /*
  Writes this entry in textual rule form onto the end of toAddTo.

  toAddTo        - buffer that receives the text
  showExtension  - when TRUE, a non-empty extension is written after a '/'
  showWhiteSpace - when TRUE, newline/space separators are inserted
  lastEntry      - when non-NULL, "&" followed by lastEntry's quoted chars is
                   written ahead of this entry's strength symbol
  */
 void PatternEntry::addToBuffer(UnicodeString& toAddTo,
                                bool_t showExtension,
                                bool_t showWhiteSpace,
                                const PatternEntry* lastEntry) const
 {
   const bool_t hasLast = (lastEntry != NULL);

   // Leading separator, emitted only when the buffer already holds text:
   // a newline before primary entries and before reset prefixes, otherwise
   // a single space.
   if (showWhiteSpace && toAddTo.length() > 0) {
     if (strength == Collator::PRIMARY || hasLast) {
       toAddTo += 0x000A/*'\n'*/;
     } else {
       toAddTo += 0x0020/*' '*/;
     }
   }

   // Optional reset prefix built from the previous entry.
   if (hasLast) {
     toAddTo += 0x0026/*'&'*/;
     if (showWhiteSpace) {
       toAddTo += 0x0020/*' '*/;
     }
     lastEntry->appendQuotedChars(toAddTo);
     // Note: this appends the extension of *this* entry, not of lastEntry.
     appendQuotedExtension(toAddTo);
     if (showWhiteSpace) {
       toAddTo += 0x0020/*' '*/;
     }
   }

   // Symbol for the strength; any other strength value writes nothing.
   switch (strength) {
   case Collator::IDENTICAL:
     toAddTo += 0x003D/*'='*/;
     break;
   case Collator::TERTIARY:
     toAddTo += 0x002C/*','*/;
     break;
   case Collator::SECONDARY:
     toAddTo += 0x003B/*';'*/;
     break;
   case Collator::PRIMARY:
     toAddTo += 0x003C/*'<'*/;
     break;
   case PatternEntry::RESET:
     toAddTo += 0x0026/*'&'*/;
     break;
   case PatternEntry::UNSET:
     toAddTo += 0x003F/*'?'*/;
     break;
   default:
     break;
   }

   if (showWhiteSpace) {
     toAddTo += 0x0020/*' '*/;
   }
   appendQuoted(chars, toAddTo);

   // An expanding extension, when present and requested, follows a '/'.
   if (showExtension && extension.length() != 0) {
     toAddTo += 0x002F/*'/'*/;
     appendQuoted(extension, toAddTo);
   }
 }

 // Append a string to a pattern buffer, adding quotes if necessary
 void PatternEntry::appendQuoted(const UnicodeString& chars, UnicodeString& toAddTo) {
   bool_t inQuote = FALSE;
   UChar ch = chars[T_INT32(0)];
   if (Unicode::isSpaceChar(ch)) {
     inQuote = TRUE;
     toAddTo += 0x0027/*'\''*/;
   } else if (isSpecialChar(ch)) {
     inQuote = TRUE;
     toAddTo += 0x0027/*'\''*/;
   } else {
     switch (ch) {
     case 0x0010: case 0x000C/*'\f'*/:
     case 0x000D/*'\r'*/: case 0x0009/*'\t'*/:
     case 0x000A/*'\n'*/: case 0x0040/*'@'*/:
       inQuote = TRUE;
       toAddTo += 0x0027/*'\''*/;
       break;
     case 0x0027/*'\''*/:
       inQuote = TRUE;
       toAddTo += 0x0027/*'\''*/;
       break;
     default:
       if (inQuote) {
     inQuote = FALSE; toAddTo += 0x0027/*'\''*/;
       }
       break;
     }
   }
   toAddTo += chars;
   if (inQuote)
     toAddTo += 0x0027/*'\''*/;
 }

 // Creates a parser over a copy of `pattern`, positioned at its start.
 // decompMode is remembered and handed to every PatternEntry that next()
 // creates.
 PatternEntry::Parser::Parser(const UnicodeString &pattern,
                              Normalizer::EMode decompMode)
   : pattern(pattern),
     index(0),
     fDecompMode(decompMode),
     newChars(),
     newExtensions()
 {}

 // Copy constructor: duplicates the pattern, the current position, the
 // decomposition mode and both scratch buffers of `that`.
 PatternEntry::Parser::Parser(const Parser &that)
   : pattern(that.pattern),
     index(that.index),
     fDecompMode(that.fDecompMode),
     newChars(that.newChars),
     newExtensions(that.newExtensions)
 {}

 // Destructor: no explicit cleanup is performed here.
 PatternEntry::Parser::~Parser() {}

 // Assignment: copies the pattern, position, decomposition mode and both
 // scratch buffers from `that`.  Self-assignment does nothing.
 PatternEntry::Parser &PatternEntry::Parser::operator=(const Parser &that)
 {
   if (this == &that) {
     return *this;
   }

   pattern       = that.pattern;
   index         = that.index;
   fDecompMode   = that.fDecompMode;
   newChars      = that.newChars;
   newExtensions = that.newExtensions;
   return *this;
 }

 // Parses the next entry from the pattern, starting at the current index.
 //
 // An entry is a strength symbol ('=', ',', ';', '<' or '&') followed by its
 // characters and, after a '/', an optional extension.  Tab, form feed, CR,
 // LF and space are skipped outside quotes.  Parsing stops at the end of the
 // pattern or at the next strength symbol; in the latter case index is left
 // on that symbol so the following call starts there.
 //
 // Returns a PatternEntry allocated with new (presumably owned by the
 // caller - confirm against call sites), or NULL when:
 //   - no strength symbol was found before the end (status is not modified),
 //   - the pattern is malformed (status = U_INVALID_FORMAT_ERROR): text ahead
 //     of any strength symbol, an unquoted special character, an entry with
 //     no characters, or an opening quote that is the last pattern character,
 //   - a buffer or the entry itself could not be allocated
 //     (status = U_MEMORY_ALLOCATION_ERROR).
 PatternEntry *PatternEntry::Parser::next(UErrorCode &status)
 {
   int32_t newStrength = PatternEntry::UNSET;
   bool_t inChars = TRUE;
   bool_t inQuote = FALSE;

   newChars.remove();
   newExtensions.remove();

   while (index < pattern.length())
   {
     UChar ch = pattern[index];

     if (inQuote)
     {
       if (ch == 0x0027/*'\''*/)
       {
         inQuote = FALSE;
       }
       else if ((newChars.length() == 0) || inChars)
       {
         newChars += ch;
       }
       else
       {
         newExtensions += ch;
       }
     }
     else
     {
       // A strength symbol either starts this entry or, when a strength
       // is already set, terminates it.
       switch (ch)
       {
       case 0x003D/*'='*/:
         if (newStrength != PatternEntry::UNSET)
         {
           goto EndOfLoop;
         }
         newStrength = Collator::IDENTICAL;
         break;

       case 0x002C/*','*/:
         if (newStrength != PatternEntry::UNSET)
         {
           goto EndOfLoop;
         }
         newStrength = Collator::TERTIARY;
         break;

       case 0x003B/*';'*/:
         if (newStrength != PatternEntry::UNSET)
         {
           goto EndOfLoop;
         }
         newStrength = Collator::SECONDARY;
         break;

       case 0x003C/*'<'*/:
         if (newStrength != PatternEntry::UNSET)
         {
           goto EndOfLoop;
         }
         newStrength = Collator::PRIMARY;
         break;

       case 0x0026/*'&'*/:
         if (newStrength != PatternEntry::UNSET)
         {
           goto EndOfLoop;
         }
         newStrength = PatternEntry::RESET;
         break;

         // Ignore the white spaces
       case 0x0009/*'\t'*/:
       case 0x000C/*'\f'*/:
       case 0x000D/*'\r'*/:
       case 0x000A/*'\n'*/:
       case 0x0020/*' '*/:
         break; // skip whitespace TODO use Unicode

       case 0x002F/*'/'*/:
         // This entry has an extension.
         inChars = FALSE;
         break;

       case 0x0027/*'\''*/:
         inQuote = TRUE;

         // The character right after the opening quote is always taken
         // literally.  If the quote is the last character of the pattern
         // there is nothing to take: report a format error instead of
         // indexing past the end of the pattern.
         if (++index >= pattern.length())
         {
           status = U_INVALID_FORMAT_ERROR;
           return NULL;
         }
         ch = pattern[index];

         if ((newChars.length() == 0) || inChars)
         {
           newChars += ch;
         }
         else
         {
           newExtensions += ch;
         }
         break;

       default:
         // Ordinary text is only legal after a strength symbol.
         if (newStrength == PatternEntry::UNSET)
         {
           status = U_INVALID_FORMAT_ERROR;
           return NULL;
         }

         if (isSpecialChar(ch) && (inQuote == FALSE))
         {
           status = U_INVALID_FORMAT_ERROR;
           return NULL;
         }

         if (inChars)
         {
           newChars += ch;
         }
         else
         {
           newExtensions += ch;
         }
         break;
       }
     }

     if (newChars.isBogus() || newExtensions.isBogus())
     {
       status = U_MEMORY_ALLOCATION_ERROR;
       return NULL;
     }

     index += 1;
   }

  EndOfLoop:
   if (newStrength == PatternEntry::UNSET)
   {
     // Nothing left to parse; status is deliberately left alone.
     return NULL;
   }

   if (newChars.length() == 0)
   {
     status = U_INVALID_FORMAT_ERROR;
     return NULL;
   }

   PatternEntry *entry =
     new PatternEntry(newStrength, newChars, newExtensions, fDecompMode);
   if (entry == NULL)
   {
     // Only reachable when operator new reports failure by returning NULL.
     status = U_MEMORY_ALLOCATION_ERROR;
   }
   return entry;
 }

 // Tells whether ch is a special character.  Special characters act as
 // strength or merging symbols in a rule and stand for themselves only when
 // quoted.  The set consists of four ranges: 0x0020-0x002F, 0x003A-0x003F,
 // 0x005B-0x0060 and 0x007B-0x007E.
 bool_t PatternEntry::isSpecialChar(UChar ch)
 {
   const bool_t inRange20to2F = (ch >= 0x0020) && (ch <= 0x002F);
   const bool_t inRange3Ato3F = (ch >= 0x003A) && (ch <= 0x003F);
   const bool_t inRange5Bto60 = (ch >= 0x005B) && (ch <= 0x0060);
   const bool_t inRange7Bto7E = (ch >= 0x007B) && (ch <= 0x007E);

   return (inRange20to2F || inRange3Ato3F || inRange5Bto60 || inRange7Bto7E);
 }
	/*******************************************************************************
	* Copyright (C) 1996-1999, International Business Machines Corporation and *
	* others. All Rights Reserved. *
	*******************************************************************************
	*/
	//=============================================================================
	//
	// File ptnentry.cpp
	//
	// Contains PatternEntry, an internal class used by MergeCollation to store
	// one collation element from a pattern.
	//
	// Created by: Helena Shih
	//
	// Modification History:
	//
	// Date Name Description
	// 04/23/99 stephen Removed EDecompositionMode, merged with
	// Normalizer::EMode
	// Removed character literals.
	//=============================================================================

	#include "ptnentry.h"

	#include "unicode/unicode.h"
	#include "unicode/coll.h"
	#include "unicode/normlzr.h"


	// Definitions of the static strength sentinels.
	//   UNSET - no strength has been assigned to the entry.
	//   RESET - the entry was introduced by a reset ('&').
	const int32_t PatternEntry::UNSET = -1;
	const int32_t PatternEntry::RESET = -2;

	// ===== privates =====

	// Default constructor: strength starts out as UNSET; chars and extension are
	// left default-constructed.
	PatternEntry::PatternEntry() : strength(UNSET) {}

	// Copy constructor: duplicates strength, chars and extension from `other`.
	PatternEntry::PatternEntry(const PatternEntry& other)
	  : strength(other.strength),
	    chars(other.chars),
	    extension(other.extension)
	{}

	// Builds an entry from its parsed pieces.
	//
	// newStrength and newExtensions are stored unchanged.  newChars is passed
	// through Normalizer::normalize() with decompMode and the output is kept in
	// chars.  The original author's note describes this as converting
	// precomposed characters into a base character followed by combining
	// characters in canonical order; the exact transformation is whatever
	// decompMode selects.
	PatternEntry::PatternEntry(int32_t newStrength,
	                           const UnicodeString& newChars,
	                           const UnicodeString& newExtensions,
	                           Normalizer::EMode decompMode)
	  : strength(newStrength),
	    extension(newExtensions)
	{
	  UErrorCode normalizeStatus = U_ZERO_ERROR;

	  Normalizer::normalize(newChars, decompMode, 0, chars, normalizeStatus);

	  if (U_FAILURE(normalizeStatus)) {
	    // Best effort: fall back to the text exactly as it was supplied.
	    chars = newChars;
	  }
	}

	// Destructor: no explicit cleanup is performed here.
	PatternEntry::~PatternEntry() {}

	// Assignment: copies strength, chars and extension from `other`.
	// Assigning an object to itself does nothing.  Returns *this.
	const PatternEntry&
	PatternEntry::operator=(const PatternEntry& other)
	{
	  if (this == &other) {
	    return *this;
	  }

	  strength  = other.strength;
	  chars     = other.chars;
	  extension = other.extension;
	  return *this;
	}

	/**
	 * Appends this entry's extension to toAddTo, quoting it through
	 * appendQuoted().  Handy when building a human-readable form of a pattern.
	 */
	void PatternEntry::appendQuotedExtension(UnicodeString& toAddTo) const
	{
	  appendQuoted(this->extension, toAddTo);
	}

	/**
	 * Appends this entry's chars to toAddTo, quoting them through
	 * appendQuoted().  Handy when building a human-readable form of a pattern.
	 */
	void PatternEntry::appendQuotedChars(UnicodeString& toAddTo) const
	{
	  appendQuoted(this->chars, toAddTo);
	}

	// Two entries are equal when strength, chars and extension all match.
	// The fields are compared in that order and the first mismatch decides.
	bool_t PatternEntry::equals(const PatternEntry& other) const {
	  if (!(strength == other.strength)) {
	    return FALSE;
	  }
	  if (!(chars == other.chars)) {
	    return FALSE;
	  }
	  if (!(extension == other.extension)) {
	    return FALSE;
	  }
	  return TRUE;
	}

	/**
	 * Debugging aid: appends the textual form of this entry to result (with the
	 * extension, without white space, and without a preceding entry) and
	 * returns result.
	 */
	UnicodeString&
	PatternEntry::toString(UnicodeString& result) const
	{
	  const bool_t withExtension  = TRUE;
	  const bool_t withWhiteSpace = FALSE;

	  addToBuffer(result, withExtension, withWhiteSpace, NULL);
	  return result;
	}

	// Returns the strength stored in this entry.
	int32_t
	PatternEntry::getStrength() const
	{
	  return this->strength;
	}

	// Copies this entry's extension into ext and returns a reference to ext.
	const UnicodeString&
	PatternEntry::getExtension(UnicodeString& ext) const
	{
	  return (ext = extension);
	}

	// Copies this entry's chars into result and returns a reference to result.
	const UnicodeString&
	PatternEntry::getChars(UnicodeString& result) const
	{
	  return (result = chars);
	}

	/*
	 Writes this entry in textual rule form onto the end of toAddTo.

	 toAddTo        - buffer that receives the text
	 showExtension  - when TRUE, a non-empty extension is written after a '/'
	 showWhiteSpace - when TRUE, newline/space separators are inserted
	 lastEntry      - when non-NULL, "&" followed by lastEntry's quoted chars is
	                  written ahead of this entry's strength symbol
	 */
	void PatternEntry::addToBuffer(UnicodeString& toAddTo,
	                               bool_t showExtension,
	                               bool_t showWhiteSpace,
	                               const PatternEntry* lastEntry) const
	{
	  // Leading separator, emitted only when the buffer already holds text:
	  // a newline before primary entries and before reset prefixes, otherwise
	  // a single space.
	  if (showWhiteSpace && toAddTo.length() > 0) {
	    if (strength == Collator::PRIMARY || lastEntry != NULL) {
	      toAddTo += 0x000A/*'\n'*/;
	    } else {
	      toAddTo += 0x0020/*' '*/;
	    }
	  }

	  // Optional reset prefix built from the previous entry.
	  if (lastEntry != NULL) {
	    toAddTo += 0x0026/*'&'*/;
	    if (showWhiteSpace) {
	      toAddTo += 0x0020/*' '*/;
	    }
	    lastEntry->appendQuotedChars(toAddTo);
	    // Note: this appends the extension of *this* entry, not of lastEntry.
	    appendQuotedExtension(toAddTo);
	    if (showWhiteSpace) {
	      toAddTo += 0x0020/*' '*/;
	    }
	  }

	  // Symbol for the strength; any other strength value writes nothing.
	  switch (strength) {
	  case Collator::IDENTICAL:   toAddTo += 0x003D/*'='*/; break;
	  case Collator::TERTIARY:    toAddTo += 0x002C/*','*/; break;
	  case Collator::SECONDARY:   toAddTo += 0x003B/*';'*/; break;
	  case Collator::PRIMARY:     toAddTo += 0x003C/*'<'*/; break;
	  case PatternEntry::RESET:   toAddTo += 0x0026/*'&'*/; break;
	  case PatternEntry::UNSET:   toAddTo += 0x003F/*'?'*/; break;
	  }

	  if (showWhiteSpace) {
	    toAddTo += 0x0020/*' '*/;
	  }
	  appendQuoted(chars, toAddTo);

	  // An expanding extension, when present and requested, follows a '/'.
	  if (showExtension && extension.length() != 0) {
	    toAddTo += 0x002F/*'/'*/;
	    appendQuoted(extension, toAddTo);
	  }
	}

	// Append a string to a pattern buffer, adding quotes if necessary
	void PatternEntry::appendQuoted(const UnicodeString& chars, UnicodeString& toAddTo) {
	bool_t inQuote = FALSE;
	UChar ch = chars[T_INT32(0)];
	if (Unicode::isSpaceChar(ch)) {
	inQuote = TRUE;
	toAddTo += 0x0027/'\''/;
	} else if (isSpecialChar(ch)) {
	inQuote = TRUE;
	toAddTo += 0x0027/'\''/;
	} else {
	switch (ch) {
	case 0x0010: case 0x000C/'\f'/:
	case 0x000D/'\r'/: case 0x0009/'\t'/:
	case 0x000A/'\n'/: case 0x0040/'@'/:
	inQuote = TRUE;
	toAddTo += 0x0027/'\''/;
	break;
	case 0x0027/'\''/:
	inQuote = TRUE;
	toAddTo += 0x0027/'\''/;
	break;
	default:
	if (inQuote) {
	inQuote = FALSE; toAddTo += 0x0027/'\''/;
	}
	break;
	}
	}
	toAddTo += chars;
	if (inQuote)
	toAddTo += 0x0027/'\''/;
	}

	// Creates a parser over a copy of `pattern`, positioned at its start.
	// decompMode is remembered and handed to every PatternEntry that next()
	// creates.
	PatternEntry::Parser::Parser(const UnicodeString &pattern,
	                             Normalizer::EMode decompMode)
	  : pattern(pattern),
	    index(0),
	    fDecompMode(decompMode),
	    newChars(),
	    newExtensions()
	{}

	// Copy constructor: duplicates the pattern, the current position, the
	// decomposition mode and both scratch buffers of `that`.
	PatternEntry::Parser::Parser(const Parser &that)
	  : pattern(that.pattern),
	    index(that.index),
	    fDecompMode(that.fDecompMode),
	    newChars(that.newChars),
	    newExtensions(that.newExtensions)
	{}

	// Destructor: no explicit cleanup is performed here.
	PatternEntry::Parser::~Parser() {}

	// Assignment: copies the pattern, position, decomposition mode and both
	// scratch buffers from `that`.  Self-assignment does nothing.
	PatternEntry::Parser &PatternEntry::Parser::operator=(const Parser &that)
	{
	  if (this == &that) {
	    return *this;
	  }

	  pattern       = that.pattern;
	  index         = that.index;
	  fDecompMode   = that.fDecompMode;
	  newChars      = that.newChars;
	  newExtensions = that.newExtensions;
	  return *this;
	}

	// Parses the next entry from the pattern, starting at the current index.
	//
	// An entry is a strength symbol ('=', ',', ';', '<' or '&') followed by its
	// characters and, after a '/', an optional extension.  Tab, form feed, CR,
	// LF and space are skipped outside quotes.  Parsing stops at the end of the
	// pattern or at the next strength symbol; in the latter case index is left
	// on that symbol so the following call starts there.
	//
	// Returns a PatternEntry allocated with new (presumably owned by the
	// caller - confirm against call sites), or NULL when:
	//   - no strength symbol was found before the end (status is not modified),
	//   - the pattern is malformed (status = U_INVALID_FORMAT_ERROR): text ahead
	//     of any strength symbol, an unquoted special character, an entry with
	//     no characters, or an opening quote that is the last pattern character,
	//   - a buffer or the entry itself could not be allocated
	//     (status = U_MEMORY_ALLOCATION_ERROR).
	PatternEntry *PatternEntry::Parser::next(UErrorCode &status)
	{
	  int32_t newStrength = PatternEntry::UNSET;
	  bool_t inChars = TRUE;
	  bool_t inQuote = FALSE;

	  newChars.remove();
	  newExtensions.remove();

	  while (index < pattern.length())
	  {
	    UChar ch = pattern[index];

	    if (inQuote)
	    {
	      if (ch == 0x0027/*'\''*/)
	      {
	        inQuote = FALSE;
	      }
	      else if ((newChars.length() == 0) || inChars)
	      {
	        newChars += ch;
	      }
	      else
	      {
	        newExtensions += ch;
	      }
	    }
	    else
	    {
	      // A strength symbol either starts this entry or, when a strength
	      // is already set, terminates it.
	      switch (ch)
	      {
	      case 0x003D/*'='*/:
	        if (newStrength != PatternEntry::UNSET)
	        {
	          goto EndOfLoop;
	        }
	        newStrength = Collator::IDENTICAL;
	        break;

	      case 0x002C/*','*/:
	        if (newStrength != PatternEntry::UNSET)
	        {
	          goto EndOfLoop;
	        }
	        newStrength = Collator::TERTIARY;
	        break;

	      case 0x003B/*';'*/:
	        if (newStrength != PatternEntry::UNSET)
	        {
	          goto EndOfLoop;
	        }
	        newStrength = Collator::SECONDARY;
	        break;

	      case 0x003C/*'<'*/:
	        if (newStrength != PatternEntry::UNSET)
	        {
	          goto EndOfLoop;
	        }
	        newStrength = Collator::PRIMARY;
	        break;

	      case 0x0026/*'&'*/:
	        if (newStrength != PatternEntry::UNSET)
	        {
	          goto EndOfLoop;
	        }
	        newStrength = PatternEntry::RESET;
	        break;

	        // Ignore the white spaces
	      case 0x0009/*'\t'*/:
	      case 0x000C/*'\f'*/:
	      case 0x000D/*'\r'*/:
	      case 0x000A/*'\n'*/:
	      case 0x0020/*' '*/:
	        break; // skip whitespace TODO use Unicode

	      case 0x002F/*'/'*/:
	        // This entry has an extension.
	        inChars = FALSE;
	        break;

	      case 0x0027/*'\''*/:
	        inQuote = TRUE;

	        // The character right after the opening quote is always taken
	        // literally.  If the quote is the last character of the pattern
	        // there is nothing to take: report a format error instead of
	        // indexing past the end of the pattern.
	        if (++index >= pattern.length())
	        {
	          status = U_INVALID_FORMAT_ERROR;
	          return NULL;
	        }
	        ch = pattern[index];

	        if ((newChars.length() == 0) || inChars)
	        {
	          newChars += ch;
	        }
	        else
	        {
	          newExtensions += ch;
	        }
	        break;

	      default:
	        // Ordinary text is only legal after a strength symbol.
	        if (newStrength == PatternEntry::UNSET)
	        {
	          status = U_INVALID_FORMAT_ERROR;
	          return NULL;
	        }

	        if (isSpecialChar(ch) && (inQuote == FALSE))
	        {
	          status = U_INVALID_FORMAT_ERROR;
	          return NULL;
	        }

	        if (inChars)
	        {
	          newChars += ch;
	        }
	        else
	        {
	          newExtensions += ch;
	        }
	        break;
	      }
	    }

	    if (newChars.isBogus() || newExtensions.isBogus())
	    {
	      status = U_MEMORY_ALLOCATION_ERROR;
	      return NULL;
	    }

	    index += 1;
	  }

	 EndOfLoop:
	  if (newStrength == PatternEntry::UNSET)
	  {
	    // Nothing left to parse; status is deliberately left alone.
	    return NULL;
	  }

	  if (newChars.length() == 0)
	  {
	    status = U_INVALID_FORMAT_ERROR;
	    return NULL;
	  }

	  PatternEntry *entry =
	    new PatternEntry(newStrength, newChars, newExtensions, fDecompMode);
	  if (entry == NULL)
	  {
	    // Only reachable when operator new reports failure by returning NULL.
	    status = U_MEMORY_ALLOCATION_ERROR;
	  }
	  return entry;
	}

	// Tells whether ch is a special character.  Special characters act as
	// strength or merging symbols in a rule and stand for themselves only when
	// quoted.  The set consists of four ranges: 0x0020-0x002F, 0x003A-0x003F,
	// 0x005B-0x0060 and 0x007B-0x007E.
	bool_t PatternEntry::isSpecialChar(UChar ch)
	{
	  return (((ch <= 0x002F) && (ch >= 0x0020)) ||
	          ((ch <= 0x003F) && (ch >= 0x003A)) ||
	          ((ch <= 0x0060) && (ch >= 0x005B)) ||
	          ((ch <= 0x007E) && (ch >= 0x007B)));
	}