blob: 51027a32d3a96c7d2577f99c3e74d381bec5e8e5 [file] [log] [blame]
/*******************************************************************************
* Copyright (C) 1996-1999, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
//=============================================================================
//
// File ptnentry.cpp
//
// Contains PatternEntry, an internal class used by MergeCollation to store
// one collation element from a pattern.
//
// Created by: Helena Shih
//
// Modification History:
//
// Date Name Description
// 04/23/99 stephen Removed EDecompositionMode, merged with
// Normalizer::EMode
// Removed character literals.
//=============================================================================
#include "ptnentry.h"
#include "unicode/unicode.h"
#include "unicode/coll.h"
#include "unicode/normlzr.h"
// Static member initialization.
//
// Sentinel strength values used by this file next to the Collator strengths:
//   UNSET - no strength has been assigned yet (shown as '?' by addToBuffer).
//   RESET - the entry was introduced by a reset symbol '&'.
const int32_t PatternEntry::UNSET = -1;
const int32_t PatternEntry::RESET = -2;
// ===== privates =====
/**
 * Default constructor: the strength starts out as UNSET and both the
 * characters and the extension start out default-constructed.
 */
PatternEntry::PatternEntry()
    : strength(PatternEntry::UNSET),
      chars(),
      extension()
{
}
/**
 * Copy constructor: duplicates the strength, characters and extension of
 * another entry.
 *
 * @param other  the entry to copy from.
 */
PatternEntry::PatternEntry(const PatternEntry& other)
    : strength(other.strength),
      chars(other.chars),
      extension(other.extension)
{
}
/**
 * Builds an entry from already-parsed pieces.
 *
 * @param newStrength    strength to store in the entry.
 * @param newChars       characters of the entry; stored in normalized form.
 * @param newExtensions  extension characters; stored unchanged.
 * @param decompMode     normalization mode handed to Normalizer::normalize.
 */
PatternEntry::PatternEntry(int32_t newStrength,
                           const UnicodeString& newChars,
                           const UnicodeString& newExtensions,
                           Normalizer::EMode decompMode)
    : strength(newStrength),
      chars(),
      extension(newExtensions)
{
    // The characters of a new entry are stored normalized. By "normalized"
    // we mean that every precomposed Unicode character is converted into a
    // base character followed by one or more combining characters (such as
    // accents), and that multiple combining characters attached to one base
    // character appear in their canonical order.
    UErrorCode normStatus = U_ZERO_ERROR;

    Normalizer::normalize(newChars, decompMode, 0, chars, normStatus);

    // If normalization failed, keep the characters exactly as supplied.
    if (U_FAILURE(normStatus)) {
        chars = newChars;
    }
}
/**
 * Destructor. There is nothing to release explicitly; the members clean up
 * after themselves.
 */
PatternEntry::~PatternEntry()
{
}
/**
 * Assignment: copies the strength, characters and extension of another entry.
 * Assigning an entry to itself leaves it untouched.
 *
 * @param other  the entry to copy from.
 * @return       this entry.
 */
const PatternEntry&
PatternEntry::operator=(const PatternEntry& other)
{
    // Nothing to copy when both sides are the same object.
    if (this == &other) {
        return *this;
    }

    extension = other.extension;
    chars     = other.chars;
    strength  = other.strength;

    return *this;
}
/**
 * Appends this entry's extension to a buffer, wrapped in quotes when
 * appendQuoted decides that is necessary.
 * Useful when building a user-readable string that represents a pattern.
 *
 * @param toAddTo  the buffer that receives the text.
 */
void PatternEntry::appendQuotedExtension(UnicodeString& toAddTo) const
{
    appendQuoted(extension, toAddTo);
}
/**
 * Appends this entry's characters to a buffer, wrapped in quotes when
 * appendQuoted decides that is necessary.
 * Useful when building a user-readable string that represents a pattern.
 *
 * @param toAddTo  the buffer that receives the text.
 */
void PatternEntry::appendQuotedChars(UnicodeString& toAddTo) const
{
    appendQuoted(chars, toAddTo);
}
/**
 * Compares two entries member by member.
 *
 * @param other  the entry to compare against.
 * @return       TRUE when strength, characters and extension are all equal.
 */
bool_t PatternEntry::equals(const PatternEntry& other) const
{
    return (strength == other.strength)
        && (chars == other.chars)
        && (extension == other.extension);
}
/**
 * For debugging: appends the textual form of this entry to a string.
 * The extension is included, no white space is inserted, and no preceding
 * entry is supplied.
 *
 * @param result  the string that receives the text.
 * @return        the same string, to allow chaining.
 */
UnicodeString&
PatternEntry::toString(UnicodeString& result) const
{
    addToBuffer(result,
                TRUE,    // showExtension
                FALSE,   // showWhiteSpace
                NULL);   // lastEntry
    return result;
}
/**
 * @return the strength stored in this entry.
 */
int32_t PatternEntry::getStrength() const
{
    return strength;
}
/**
 * Copies this entry's extension into a caller-supplied string.
 *
 * @param ext  the string that is overwritten with the extension.
 * @return     the same string, to allow chaining.
 */
const UnicodeString& PatternEntry::getExtension(UnicodeString& ext) const
{
    return (ext = extension);
}
/**
 * Copies this entry's characters into a caller-supplied string.
 *
 * @param result  the string that is overwritten with the characters.
 * @return        the same string, to allow chaining.
 */
const UnicodeString& PatternEntry::getChars(UnicodeString& result) const
{
    return (result = chars);
}
/**
 * Appends the textual form of this entry to a buffer.
 *
 * @param toAddTo         the buffer that receives the text.
 * @param showExtension   when TRUE, a non-empty extension is written after
 *                        the characters, separated by '/'.
 * @param showWhiteSpace  when TRUE, separators (new line / space) are
 *                        inserted to make the output easier to read.
 * @param lastEntry       when not NULL, a reset clause "&" followed by
 *                        lastEntry's characters and this entry's extension
 *                        is written before this entry.
 */
void PatternEntry::addToBuffer(UnicodeString& toAddTo,
                               bool_t showExtension,
                               bool_t showWhiteSpace,
                               const PatternEntry* lastEntry) const
{
    // Separator between what is already in the buffer and this entry:
    // a new line before each primary-strength entry or reset clause,
    // otherwise a single space.
    if (showWhiteSpace && toAddTo.length() > 0) {
        if (strength == Collator::PRIMARY || lastEntry != NULL) {
            toAddTo += 0x000A/*'\n'*/;
        } else {
            toAddTo += 0x0020/*' '*/;
        }
    }

    // Optional reset clause.
    if (lastEntry != NULL) {
        toAddTo += 0x0026/*'&'*/;
        if (showWhiteSpace) {
            toAddTo += 0x0020/*' '*/;
        }

        lastEntry->appendQuotedChars(toAddTo);
        appendQuotedExtension(toAddTo);

        if (showWhiteSpace) {
            toAddTo += 0x0020/*' '*/;
        }
    }

    // The symbol that corresponds to this entry's strength. Any other
    // strength value writes no symbol at all.
    switch (strength) {
    case Collator::IDENTICAL:
        toAddTo += 0x003D/*'='*/;
        break;

    case Collator::TERTIARY:
        toAddTo += 0x002C/*','*/;
        break;

    case Collator::SECONDARY:
        toAddTo += 0x003B/*';'*/;
        break;

    case Collator::PRIMARY:
        toAddTo += 0x003C/*'<'*/;
        break;

    case PatternEntry::RESET:
        toAddTo += 0x0026/*'&'*/;
        break;

    case PatternEntry::UNSET:
        toAddTo += 0x003F/*'?'*/;
        break;
    }

    if (showWhiteSpace) {
        toAddTo += 0x0020/*' '*/;
    }

    // The entry's own characters.
    appendQuotedChars(toAddTo);

    // If there is an expanding (extension) string and it should be shown,
    // append it after the entry.
    if (showExtension && extension.length() != 0) {
        toAddTo += 0x002F/*'/'*/;
        appendQuotedExtension(toAddTo);
    }
}
// Append a string to a pattern buffer, adding quotes if necessary
void PatternEntry::appendQuoted(const UnicodeString& chars, UnicodeString& toAddTo) {
bool_t inQuote = FALSE;
UChar ch = chars[T_INT32(0)];
if (Unicode::isSpaceChar(ch)) {
inQuote = TRUE;
toAddTo += 0x0027/*'\''*/;
} else if (isSpecialChar(ch)) {
inQuote = TRUE;
toAddTo += 0x0027/*'\''*/;
} else {
switch (ch) {
case 0x0010: case 0x000C/*'\f'*/:
case 0x000D/*'\r'*/: case 0x0009/*'\t'*/:
case 0x000A/*'\n'*/: case 0x0040/*'@'*/:
inQuote = TRUE;
toAddTo += 0x0027/*'\''*/;
break;
case 0x0027/*'\''*/:
inQuote = TRUE;
toAddTo += 0x0027/*'\''*/;
break;
default:
if (inQuote) {
inQuote = FALSE; toAddTo += 0x0027/*'\''*/;
}
break;
}
}
toAddTo += chars;
if (inQuote)
toAddTo += 0x0027/*'\''*/;
}
/**
 * Creates a parser positioned at the start of a pattern.
 *
 * @param pattern     the pattern text to parse; the parser keeps its own copy.
 * @param decompMode  normalization mode passed on to every PatternEntry
 *                    the parser creates.
 */
PatternEntry::Parser::Parser(const UnicodeString &pattern,
                             Normalizer::EMode decompMode)
    : pattern(pattern),
      index(0),
      fDecompMode(decompMode),
      newChars(),
      newExtensions()
{
}
/**
 * Copy constructor: duplicates the pattern, the current position, the
 * normalization mode and the scratch buffers of another parser.
 *
 * @param that  the parser to copy from.
 */
PatternEntry::Parser::Parser(const Parser &that)
    : pattern(that.pattern),
      index(that.index),
      fDecompMode(that.fDecompMode),
      newChars(that.newChars),
      newExtensions(that.newExtensions)
{
}
/**
 * Destructor. There is nothing to release explicitly; the members clean up
 * after themselves.
 */
PatternEntry::Parser::~Parser() {
}
/**
 * Assignment: copies the pattern, the current position, the normalization
 * mode and the scratch buffers of another parser. Assigning a parser to
 * itself leaves it untouched.
 *
 * @param that  the parser to copy from.
 * @return      this parser.
 */
PatternEntry::Parser &PatternEntry::Parser::operator=(const Parser &that)
{
    // Nothing to copy when both sides are the same object.
    if (this == &that) {
        return *this;
    }

    pattern       = that.pattern;
    index         = that.index;
    fDecompMode   = that.fDecompMode;
    newChars      = that.newChars;
    newExtensions = that.newExtensions;

    return *this;
}
/**
 * Parses the next entry out of the pattern, starting at the current position.
 *
 * An entry consists of one strength or reset symbol ('=', ',', ';', '<' or
 * '&'), the entry's characters and, after a '/', its extension characters.
 * Parsing stops at the end of the pattern or at the next strength/reset
 * symbol, which is left unconsumed so that the following call starts there.
 * Unquoted tab, form feed, carriage return, line feed and space are skipped.
 * Text between single quotes is taken literally; the character directly
 * after an opening quote is always taken literally, even if it is a quote.
 *
 * @param status  in/out error code. Nothing is parsed when it already holds
 *                a failure on entry. It is set to
 *                  U_INVALID_FORMAT_ERROR    - a character appears before any
 *                                              strength symbol, an unquoted
 *                                              special character is found,
 *                                              the pattern ends directly
 *                                              after an opening quote, or
 *                                              the entry has no characters;
 *                  U_MEMORY_ALLOCATION_ERROR - a buffer became bogus or the
 *                                              new entry could not be
 *                                              allocated.
 * @return        a PatternEntry allocated with new, or NULL when no further
 *                entry was found or an error occurred.
 */
PatternEntry *PatternEntry::Parser::next(UErrorCode &status)
{
    // Do not continue after an earlier failure.
    if (U_FAILURE(status))
    {
        return NULL;
    }

    int32_t newStrength = PatternEntry::UNSET;
    bool_t inChars = TRUE;      // FALSE once '/' switched us to the extension
    bool_t inQuote = FALSE;

    newChars.remove();
    newExtensions.remove();

    while (index < pattern.length())
    {
        UChar ch = pattern[index];

        if (inQuote)
        {
            if (ch == 0x0027/*'\''*/)
            {
                // Closing quote.
                inQuote = FALSE;
            }
            else if ((newChars.length() == 0) || inChars)
            {
                newChars += ch;
            }
            else
            {
                newExtensions += ch;
            }
        }
        else
        {
            switch (ch)
            {
            // Strength / reset symbols. The first one sets the strength of
            // this entry; a second one belongs to the next entry, so stop
            // without consuming it.
            case 0x003D/*'='*/ :
                if (newStrength != PatternEntry::UNSET)
                {
                    goto EndOfLoop;
                }
                newStrength = Collator::IDENTICAL;
                break;

            case 0x002C/*','*/:
                if (newStrength != PatternEntry::UNSET)
                {
                    goto EndOfLoop;
                }
                newStrength = Collator::TERTIARY;
                break;

            case 0x003B/*';'*/:
                if (newStrength != PatternEntry::UNSET)
                {
                    goto EndOfLoop;
                }
                newStrength = Collator::SECONDARY;
                break;

            case 0x003C/*'<'*/:
                if (newStrength != PatternEntry::UNSET)
                {
                    goto EndOfLoop;
                }
                newStrength = Collator::PRIMARY;
                break;

            case 0x0026/*'&'*/:
                if (newStrength != PatternEntry::UNSET)
                {
                    goto EndOfLoop;
                }
                newStrength = PatternEntry::RESET;
                break;

            // Ignore the white spaces
            case 0x0009/*'\t'*/:
            case 0x000C/*'\f'*/:
            case 0x000D/*'\r'*/:
            case 0x000A/*'\n'*/:
            case 0x0020/*' '*/:
                break; // skip whitespace TODO use Unicode

            case 0x002F/*'/'*/:
                // This entry has an extension.
                inChars = FALSE;
                break;

            case 0x0027/*'\''*/:
                // An opening quote must be followed by at least one
                // character; otherwise there is nothing to quote and
                // reading on would run past the end of the pattern.
                if (index + 1 >= pattern.length())
                {
                    status = U_INVALID_FORMAT_ERROR;
                    return NULL;
                }

                inQuote = TRUE;

                // The character right after the opening quote is taken
                // literally.
                ch = pattern[++index];
                if ((newChars.length() == 0) || inChars)
                {
                    newChars += ch;
                }
                else
                {
                    newExtensions += ch;
                }
                break;

            default:
                // Characters are only allowed after a strength symbol.
                if (newStrength == PatternEntry::UNSET)
                {
                    status = U_INVALID_FORMAT_ERROR;
                    return NULL;
                }

                // We are outside quotes here, so a special character is
                // not allowed.
                if (isSpecialChar(ch))
                {
                    status = U_INVALID_FORMAT_ERROR;
                    return NULL;
                }

                if (inChars)
                {
                    newChars += ch;
                }
                else
                {
                    newExtensions += ch;
                }
                break;
            }
        }

        // A bogus buffer means an append ran out of memory.
        if (newChars.isBogus() || newExtensions.isBogus())
        {
            status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }

        index += 1;
    }

EndOfLoop:
    // No strength symbol seen: there is no further entry.
    if (newStrength == PatternEntry::UNSET)
    {
        return NULL;
    }

    // A strength symbol without any characters is malformed.
    if (newChars.length() == 0)
    {
        status = U_INVALID_FORMAT_ERROR;
        return NULL;
    }

    PatternEntry *entry = new PatternEntry(newStrength, newChars, newExtensions, fDecompMode);
    if (entry == NULL)
    {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
    return entry;
}
// Tells whether a character is a special character. A special character is
// meaningful as ordinary text in a rule only when it is quoted; unquoted it
// denotes a strength or merging symbol.
// The special characters are the four ranges U+0020..U+002F,
// U+003A..U+003F, U+005B..U+0060 and U+007B..U+007E.
bool_t PatternEntry::isSpecialChar(UChar ch)
{
    if ((ch >= 0x0020) && (ch <= 0x002F)) {
        return TRUE;
    }
    if ((ch >= 0x003A) && (ch <= 0x003F)) {
        return TRUE;
    }
    if ((ch >= 0x005B) && (ch <= 0x0060)) {
        return TRUE;
    }
    if ((ch >= 0x007B) && (ch <= 0x007E)) {
        return TRUE;
    }
    return FALSE;
}