| /* |
| ******************************************************************************* |
| * Copyright (C) 1996-1999, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| * |
| * File tblcoll.cpp |
| * |
| * Created by: Helena Shih |
| * |
| * Modification History: |
| * |
| * Date Name Description |
| * 2/5/97 aliu Added streamIn and streamOut methods. Added |
| * constructor which reads RuleBasedCollator object from |
| * a binary file. Added writeToFile method which streams |
| * RuleBasedCollator out to a binary file. The streamIn |
| * and streamOut methods use istream and ostream objects |
| * in binary mode. |
| * 2/11/97 aliu Moved declarations out of for loop initializer. |
| * Added Mac compatibility #ifdef for ios::nocreate. |
| * 2/12/97 aliu Modified to use TableCollationData sub-object to |
| * hold invariant data. |
| * 2/13/97 aliu Moved several methods into this class from Collation. |
| * Added a private RuleBasedCollator(Locale&) constructor, |
| * to be used by Collator::getInstance(). General |
| * clean up. Made use of UErrorCode variables consistent. |
| * 2/20/97 helena Added clone, operator==, operator!=, operator=, and copy |
| * constructor and getDynamicClassID. |
| * 3/5/97 aliu Changed compaction cycle to improve performance. We |
| * use the maximum allowable value which is kBlockCount. |
| * Modified getRules() to load rules dynamically. Changed |
| * constructFromFile() call to accommodate this (added |
| * parameter to specify whether binary loading is to |
| * take place). |
| * 05/06/97 helena Added memory allocation error check. |
| * 6/20/97 helena Java class name change. |
| * 6/23/97 helena Adding comments to make code more readable. |
| * 09/03/97 helena Added createCollationKeyValues(). |
| * 06/26/98 erm Changes for CollationKeys using byte arrays. |
| * 08/10/98 erm Synched with 1.2 version of RuleBasedCollator.java |
| * 04/23/99 stephen Removed EDecompositionMode, merged with |
| * Normalizer::EMode |
| * 06/14/99 stephen Removed kResourceBundleSuffix |
| * 06/22/99 stephen Fixed logic in constructFromFile() since .ctx |
| * files are no longer used. |
| * 11/02/99 helena Collator performance enhancements. Special case |
| * for NO_OP situations. |
| * 11/17/99 srl More performance enhancements. Inlined some internal functions. |
| * 12/15/99 aliu Update to support Thai collation. Move NormalizerIterator |
| * to implementation file. |
| ******************************************************************************* |
| */ |
| |
| #include "ucmp32.h" |
| #include "tcoldata.h" |
| |
| #include "unicode/tblcoll.h" |
| |
| #include "unicode/coleitr.h" |
| #include "unicode/locid.h" |
| #include "unicode/unicode.h" |
| #include "tables.h" |
| #include "unicode/normlzr.h" |
| #include "mergecol.h" |
| #include "unicode/resbund.h" |
| #include "filestrm.h" |
| #include "umemstrm.h" |
| |
| #ifdef _DEBUG |
| #include "unistrm.h" |
| #endif |
| |
| #include "compitr.h" |
| |
| #include <string.h> |
| |
| #include "unicode/ustring.h" |
| |
| #include "cmemory.h" |
| |
| |
| class RuleBasedCollatorStreamer |
| { |
| public: |
| static void streamIn(RuleBasedCollator* collator, FileStream* is); |
| static void streamOut(const RuleBasedCollator* collator, FileStream* os); |
| static void streamIn(RuleBasedCollator* collator, UMemoryStream* is, UErrorCode& status); |
| static void streamOut(const RuleBasedCollator* collator, UMemoryStream* os); |
| }; |
| |
| //=========================================================================================== |
| // The following diagram shows the data structure of the RuleBasedCollator object. |
| // Suppose we have the rule, where 'o-umlaut' is the unicode char 0x00F6. |
| // "a, A < b, B < c, C, ch, cH, Ch, CH < d, D ... < o, O; 'o-umlaut'/E, 'O-umlaut'/E ...". |
| // The rule says: sort the 'ch' ligatures and 'c' with only a tertiary difference, and |
| // sort 'o-umlaut' as if it were always expanded with 'e'. |
| // |
| // mapping table contracting list expanding list |
| // (contains all unicode char |
| // entries) ___ _____________ _________________________ |
| // ________ |==>|_*_|-->|'c' |v('c') | |==>|v('o')|v('umlaut')|v('e')| |
| // |_\u0001_|--> v('\u0001') | |_:_| |-------------| | |-------------------------| |
| // |_\u0002_|--> v('\u0002') | |_:_| |'ch' |v('ch')| | | : | |
| // |____:___| | |_:_| |-------------| | |-------------------------| |
| // |____:___| | |'cH' |v('cH')| | | : | |
| // |__'a'___|--> v('a') | |-------------| | |-------------------------| |
| // |__'b'___|--> v('b') | |'Ch' |v('Ch')| | | : | |
| // |____:___| | |-------------| | |-------------------------| |
| // |____:___| | |'CH' |v('CH')| | | : | |
| // |___'c'__|------------------- ------------- | |-------------------------| |
| // |____:___| | | : | |
| // |o-umlaut|------------------------------------------------ |_________________________| |
| // |____:___| |
| // |
| // |
| // Noted by Helena Shih on 6/23/97 with pending design changes (slimming collation). |
| //============================================================================================ |
| |
| const int32_t RuleBasedCollator::CHARINDEX = 0x70000000; // need look up in .commit() |
| const int32_t RuleBasedCollator::EXPANDCHARINDEX = 0x7E000000; // Expand index follows |
| const int32_t RuleBasedCollator::CONTRACTCHARINDEX = 0x7F000000; // contract indexes follows |
| const int32_t RuleBasedCollator::UNMAPPED = 0xFFFFFFFF; // unmapped character values |
| const int32_t RuleBasedCollator::PRIMARYORDERINCREMENT = 0x00010000; // primary strength increment |
| const int32_t RuleBasedCollator::SECONDARYORDERINCREMENT = 0x00000100; // secondary strength increment |
| const int32_t RuleBasedCollator::TERTIARYORDERINCREMENT = 0x00000001; // tertiary strength increment |
| const int32_t RuleBasedCollator::MAXIGNORABLE = 0x00010000; // maximum ignorable char order value |
| const int32_t RuleBasedCollator::PRIMARYORDERMASK = 0xffff0000; // mask off anything but primary order |
| const int32_t RuleBasedCollator::SECONDARYORDERMASK = 0x0000ff00; // mask off anything but secondary order |
| const int32_t RuleBasedCollator::TERTIARYORDERMASK = 0x000000ff; // mask off anything but tertiary order |
| const int32_t RuleBasedCollator::SECONDARYRESETMASK = 0x0000ffff; // mask off secondary and tertiary order |
| const int32_t RuleBasedCollator::IGNORABLEMASK = 0x0000ffff; // mask off ignorable char order |
| const int32_t RuleBasedCollator::PRIMARYDIFFERENCEONLY = 0xffff0000; // use only the primary difference |
| const int32_t RuleBasedCollator::SECONDARYDIFFERENCEONLY = 0xffffff00; // use only the primary and secondary difference |
| const int32_t RuleBasedCollator::PRIMARYORDERSHIFT = 16; // primary order shift |
| const int32_t RuleBasedCollator::SECONDARYORDERSHIFT = 8; // secondary order shift |
| const int32_t RuleBasedCollator::SORTKEYOFFSET = 1; // minimum sort key offset |
| const int32_t RuleBasedCollator::CONTRACTCHAROVERFLOW = 0x7FFFFFFF; // Indicates the char is a contract char |
| |
| const int16_t RuleBasedCollator::FILEID = 0x5443; // unique file id for parity check |
| const char* RuleBasedCollator::kFilenameSuffix = ".col"; // binary collation file extension |
| char RuleBasedCollator::fgClassID = 0; // Value is irrelevant // class id |
| |
| //////////////////////////////////////////////////////////////////////// |
| // NormalizerIterator |
| // |
| // This class is essentially a duplicate of CollationElementIterator, |
| // stripped down for speed. It is declared here so we can incorporate |
| // internal classes as subobjects, as well as just to hide it from the |
| // public interface. |
| //////////////////////////////////////////////////////////////////////// |
| |
| /* Internal class for quick iteration over the text. |
| 100% pure inline code |
| */ |
| class NormalizerIterator { |
| public: |
| Normalizer *cursor; |
| VectorOfInt *bufferAlias; |
| VectorOfInt *reorderBuffer; |
| VectorOfInt ownBuffer; |
| UChar* text; |
| int32_t expIndex; |
| int32_t textLen; |
| UTextOffset currentOffset; |
| |
| NormalizerIterator(void); |
| NormalizerIterator(const UChar* source, int32_t length, Normalizer::EMode mode); |
| ~NormalizerIterator(void); |
| void setText(const UChar* source, int32_t length, UErrorCode& status); |
| void setModeAndText(Normalizer::EMode mode, const UChar* source, int32_t length, UErrorCode& status); |
| |
| UChar current(void) const; |
| UChar next(void); |
| void reset(void); |
| }; |
| |
| inline |
| NormalizerIterator::NormalizerIterator() : |
| cursor(0), |
| bufferAlias(0), |
| reorderBuffer(0), |
| ownBuffer(2), |
| text(0), |
| expIndex(0), |
| textLen(0), |
| currentOffset(0) |
| { |
| } |
| |
| inline |
| NormalizerIterator::NormalizerIterator(const UChar* source, int32_t length, Normalizer::EMode mode) : |
| cursor(0), |
| bufferAlias(0), |
| reorderBuffer(0), |
| ownBuffer(2), |
| text(0), |
| expIndex(0), |
| textLen(0), |
| currentOffset(0) |
| { |
| if (mode == Normalizer::NO_OP) { |
| text = (UChar*)source; |
| textLen = length; |
| currentOffset = 0; |
| } else { |
| cursor = new Normalizer(source, length, mode); |
| |
| } |
| } |
| |
| inline |
| NormalizerIterator::~NormalizerIterator() |
| { |
| if (cursor != 0) { |
| delete cursor; |
| cursor = 0; |
| } |
| if (reorderBuffer != 0) { |
| delete reorderBuffer; |
| } |
| } |
| |
| inline |
| void |
| NormalizerIterator::setText(const UChar* source, int32_t length, UErrorCode& status) |
| { |
| if (cursor == 0) { |
| text = (UChar*)source; |
| textLen = length; |
| currentOffset = 0; |
| |
| } else { |
| text = 0; |
| cursor->setText(source, length, status); |
| } |
| bufferAlias = 0; |
| currentOffset = 0; |
| } |
| |
| /* You can only set the mode after the comparison of two strings is completed. |
| Setting the mode in the middle of a comparison is not allowed. |
| */ |
| inline |
| void |
| |
| NormalizerIterator::setModeAndText(Normalizer::EMode mode, const UChar* source, int32_t length, UErrorCode& status) |
| { |
| if(mode != Normalizer::NO_OP) |
| { |
| /* DO have a mode - will need a normalizer object */ |
| if(cursor != NULL) |
| { |
| /* Just modify the existing cursor */ |
| cursor->setMode(mode); |
| cursor->setText(source, length, status); |
| } |
| else |
| { |
| cursor = new Normalizer(source, length, mode); |
| } |
| |
| /* RESET the old data */ |
| text = 0; |
| textLen = 0; |
| } |
| else |
| { |
| /* NO_OP mode.. */ |
| if(cursor != NULL) |
| { /* get rid of the old cursor */ |
| delete cursor; |
| cursor = 0; |
| } |
| |
| text = (UChar*)source; |
| textLen = length; |
| } |
| currentOffset = 0; /* always */ |
| |
| bufferAlias = 0; |
| } |
| |
| inline |
| UChar |
| NormalizerIterator::current(void) const |
| { |
| if (text != 0) { |
| if(currentOffset >= textLen) |
| { |
| return Normalizer::DONE; |
| } |
| else |
| { |
| return text[currentOffset]; |
| } |
| } |
| |
| return (UChar)cursor->current(); |
| } |
| |
| |
| inline |
| UChar |
| NormalizerIterator::next(void) |
| { |
| if (text != 0) { |
| return (UChar)((currentOffset < textLen) ? text[++currentOffset] : Normalizer::DONE); |
| } |
| return (UChar)cursor->next(); |
| } |
| |
| inline |
| void |
| NormalizerIterator::reset(void) |
| { |
| currentOffset = 0; |
| if(cursor) |
| { |
| cursor->reset(); |
| } |
| } |
| |
| //================ Some inline definitions of implementation functions........ ======== |
| /** |
| * A clone of CollationElementIterator::makeReorderedBuffer, trimmed down |
| * to only handle forward. |
| */ |
| inline VectorOfInt* |
| RuleBasedCollator::makeReorderedBuffer(NormalizerIterator* cursor, |
| UChar colFirst, |
| int32_t lastValue, |
| VectorOfInt* lastExpansion) const { |
| VectorOfInt* result; |
| |
| int32_t firstValue = ucmp32_get(data->mapping, colFirst); |
| if (firstValue >= CONTRACTCHARINDEX) { |
| UErrorCode status = U_ZERO_ERROR; |
| firstValue = nextContractChar(cursor, colFirst, status); |
| } |
| |
| VectorOfInt* firstExpansion = NULL; |
| if (firstValue >= EXPANDCHARINDEX) { |
| firstExpansion = getExpandValueList(firstValue); |
| } |
| |
| if (firstExpansion == NULL && lastExpansion == NULL) { |
| cursor->ownBuffer.at(0) = firstValue; |
| cursor->ownBuffer.at(1) = lastValue; |
| result = &cursor->ownBuffer; |
| } |
| else { |
| int32_t firstLength = firstExpansion==NULL? 1 : firstExpansion->size(); |
| int32_t lastLength = lastExpansion==NULL? 1 : lastExpansion->size(); |
| if (cursor->reorderBuffer == NULL) { |
| cursor->reorderBuffer = new VectorOfInt(firstLength+lastLength); |
| } |
| // reorderdBuffer gets reused for the life of this object. |
| // Since its internal buffer only grows, there is a danger |
| // that it will get really, really big, and never shrink. If |
| // this is actually happening, insert code here to check for |
| // the condition. Something along the lines of: |
| //! else if (reorderBuffer->size() >= 256 && |
| //! (firstLength+lastLength) < 16) { |
| //! delete reorderBuffer; |
| //! reorderBuffer = new VectorOfInt(firstLength+lastLength); |
| //! } |
| // The specific numeric values need to be determined |
| // empirically. [aliu] |
| result = cursor->reorderBuffer; |
| |
| if (firstExpansion == NULL) { |
| result->atPut(0, firstValue); |
| } |
| else { |
| // System.arraycopy(firstExpansion, 0, result, 0, firstLength); |
| *result = *firstExpansion; |
| } |
| |
| if (lastExpansion == NULL) { |
| result->atPut(firstLength, lastValue); |
| } |
| else { |
| // System.arraycopy(lastExpansion, 0, result, firstLength, lastLength); |
| for (int32_t i=0; i<lastLength; ++i) { |
| result->atPut(firstLength + i, lastExpansion->at(i)); |
| } |
| } |
| result->setSize(firstLength+lastLength); |
| } |
| |
| return result; |
| } |
| |
| |
| |
| |
| |
| inline int32_t |
| RuleBasedCollator::strengthOrder(int32_t value) const |
| { |
| if (getStrength() == PRIMARY) |
| { |
| return (value & PRIMARYDIFFERENCEONLY); |
| } else if (getStrength() == SECONDARY) |
| { |
| return (value & SECONDARYDIFFERENCEONLY); |
| } |
| return value; |
| } |
| |
| |
| inline int32_t |
| RuleBasedCollator::getStrengthOrder(NormalizerIterator* cursor, |
| UErrorCode status) const |
| { |
| if (U_FAILURE(status)) |
| { |
| return CollationElementIterator::NULLORDER; |
| } |
| |
| if (cursor->bufferAlias != NULL) |
| { |
| // bufferAlias needs a bit of an explanation. |
| // When we hit an expanding character in the text, we call the order's |
| // getExpandValues method to retrieve an array of the orderings for all |
| // of the characters in the expansion (see the end of this method). |
| // The first ordering is returned, and an alias to the orderings array |
| // is saved so that the remaining orderings can be returned on subsequent |
| // calls to next. So, if the expanding buffer is not exhausted, |
| // all we have to do here is return the next ordering in the buffer. |
| if (cursor->expIndex < cursor->bufferAlias->size()) |
| { |
| //_L((stderr, "next from [%08X] from bufferAlias\n", this)); |
| return strengthOrder(cursor->bufferAlias->at(cursor->expIndex++)); |
| } |
| else |
| { |
| cursor->bufferAlias = NULL; |
| } |
| } |
| |
| UChar ch = cursor->current(); |
| cursor->next(); |
| |
| //_L((stderr, "Next from [%08X] = [%04X], [%c]\n", cursor, (int)ch & 0xFFFF, (char)(ch & 0xFF))); |
| |
| if (ch == Normalizer::DONE) { |
| return CollationElementIterator::NULLORDER; |
| } |
| // Ask the collator for this character's ordering. |
| int32_t value = ucmp32_get(data->mapping, ch); |
| |
| if (value == UNMAPPED) |
| { |
| // Returned an "unmapped" flag and save the character so it can be |
| // returned next time this method is called. |
| if (ch == 0x0000) return ch; // \u0000 is not valid in C++'s UnicodeString |
| cursor->ownBuffer.at(0) = CollationElementIterator::UNMAPPEDCHARVALUE; |
| cursor->ownBuffer.at(1) = ch << 16; |
| cursor->bufferAlias = &cursor->ownBuffer; |
| |
| } else { |
| |
| if (value >= CONTRACTCHARINDEX) |
| { |
| value = nextContractChar(cursor, ch, status); |
| } |
| |
| if (value >= EXPANDCHARINDEX) { |
| cursor->bufferAlias = getExpandValueList(value); |
| } |
| |
| if (CollationElementIterator::isThaiPreVowel(ch)) { |
| UChar consonant = cursor->current(); |
| if (CollationElementIterator::isThaiBaseConsonant(consonant)) { |
| cursor->next(); |
| cursor->bufferAlias = makeReorderedBuffer(cursor, consonant, value, |
| cursor->bufferAlias); |
| } |
| } |
| } |
| |
| if (cursor->bufferAlias != NULL) { |
| cursor->expIndex = 1; |
| value = cursor->bufferAlias->at(0); |
| } |
| |
| return strengthOrder(value); |
| } |
| |
| // ==================== End inlines ============================================ |
| |
| |
| //=============================================================================== |
| |
| RuleBasedCollator::RuleBasedCollator() |
| : Collator(), |
| isOverIgnore(FALSE), |
| mPattern(0), |
| // sourceCursor(0), |
| //targetCursor(0), |
| cursor1(0), |
| cursor2(0), |
| dataIsOwned(FALSE), |
| data(0) |
| { |
| } |
| |
| RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator& that) |
| : Collator(that), |
| isOverIgnore(that.isOverIgnore), |
| mPattern(0), |
| // sourceCursor(0), |
| //targetCursor(0), |
| cursor1(0), |
| cursor2(0), |
| dataIsOwned(FALSE), |
| data(that.data) // Alias the data pointer |
| { |
| } |
| |
| UBool |
| RuleBasedCollator::operator==(const Collator& that) const |
| { |
| if (this == &that) |
| { |
| return TRUE; |
| } |
| |
| if (this->getDynamicClassID() != that.getDynamicClassID()) |
| { |
| return FALSE; // not the same class |
| } |
| |
| if (!Collator::operator==(that)) |
| { |
| return FALSE; |
| } |
| |
| RuleBasedCollator& thatAlias = (RuleBasedCollator&)that; |
| |
| if (isOverIgnore != thatAlias.isOverIgnore) |
| { |
| return FALSE; |
| } |
| |
| if (data != thatAlias.data) |
| { |
| return FALSE; |
| } |
| |
| return TRUE; |
| } |
| |
| RuleBasedCollator& |
| RuleBasedCollator::operator=(const RuleBasedCollator& that) |
| { |
| if (this != &that) |
| { |
| Collator::operator=(that); |
| isOverIgnore = that.isOverIgnore; |
| |
| if (dataIsOwned) |
| { |
| delete data; |
| } |
| |
| data = 0; |
| delete mPattern; |
| mPattern = 0; |
| dataIsOwned = FALSE; |
| data = that.data; |
| } |
| |
| return *this; |
| } |
| |
| RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, |
| UErrorCode& status) |
| : Collator(), |
| isOverIgnore(FALSE), |
| mPattern(0), |
| // sourceCursor(0), |
| /// targetCursor(0), |
| cursor1(0), |
| cursor2(0), |
| dataIsOwned(FALSE), |
| data(0) |
| { |
| if (U_FAILURE(status)) |
| { |
| return; |
| } |
| |
| constructFromRules(rules, status); |
| } |
| |
| RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, |
| ECollationStrength collationStrength, |
| UErrorCode& status) |
| : Collator(collationStrength, Normalizer::NO_OP), |
| isOverIgnore(FALSE), |
| mPattern(0), |
| // sourceCursor(0), |
| // targetCursor(0), |
| cursor1(0), |
| cursor2(0), |
| dataIsOwned(FALSE), |
| data(0) |
| { |
| if (U_FAILURE(status)) |
| { |
| return; |
| } |
| constructFromRules(rules, status); |
| } |
| |
| RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, |
| Normalizer::EMode decompositionMode, |
| UErrorCode& status) |
| : Collator(TERTIARY, decompositionMode), |
| isOverIgnore(FALSE), |
| mPattern(0), |
| // sourceCursor(0), |
| // targetCursor(0), |
| cursor1(0), |
| cursor2(0), |
| dataIsOwned(FALSE), |
| data(0) |
| { |
| if (U_FAILURE(status)) |
| { |
| return; |
| } |
| |
| constructFromRules(rules, status); |
| } |
| |
| RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, |
| ECollationStrength collationStrength, |
| Normalizer::EMode decompositionMode, |
| UErrorCode& status) |
| : Collator(collationStrength, decompositionMode), |
| isOverIgnore(FALSE), |
| mPattern(0), |
| // sourceCursor(0), |
| //targetCursor(0), |
| cursor1(0), |
| cursor2(0), |
| dataIsOwned(FALSE), |
| data(0) |
| { |
| if (U_FAILURE(status)) |
| { |
| return; |
| } |
| |
| constructFromRules(rules, status); |
| } |
| |
| void RuleBasedCollator::constructFromRules(const UnicodeString& rules, |
| UErrorCode& status) |
| { |
| // Construct this collator's ruleset from its string representation |
| if (U_FAILURE(status)) |
| { |
| return; |
| } |
| |
| if (rules.isBogus()) |
| { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| |
| if (dataIsOwned) |
| { |
| delete data; |
| data = 0; |
| } |
| |
| isOverIgnore = FALSE; |
| setStrength(Collator::TERTIARY); |
| |
| data = new TableCollationData; |
| if (data->isBogus()) |
| { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| delete data; |
| data = 0; |
| return; |
| } |
| |
| // We constructed the data using the build method, so we own it. |
| dataIsOwned = TRUE; |
| |
| // Now that we've got all the buffers allocated, do the actual work |
| mPattern = 0; |
| build(rules, status); |
| } |
| |
| void |
| RuleBasedCollator::constructFromFile(const char* fileName, |
| UErrorCode& status) |
| { |
| // This method tries to read in a flattened RuleBasedCollator that |
| // has been previously streamed out using the streamOut() method. |
| // The 'fileName' parameter should contain a full pathname valid on |
| // the local environment. |
| |
| if (U_FAILURE(status)) |
| { |
| return; |
| } |
| |
| if (dataIsOwned) |
| { |
| delete data; |
| data = 0; |
| } |
| |
| mPattern = 0; |
| isOverIgnore = FALSE; |
| setStrength(Collator::TERTIARY); // This is the default strength |
| |
| FileStream* ifs = T_FileStream_open(fileName, "rb"); |
| if (ifs == 0) { |
| status = U_FILE_ACCESS_ERROR; |
| return; |
| } |
| |
| // The streamIn function does the actual work here... |
| RuleBasedCollatorStreamer::streamIn(this, ifs); |
| |
| if (!T_FileStream_error(ifs)) |
| { |
| status = U_ZERO_ERROR; |
| } |
| else if (data && data->isBogus()) |
| { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| delete data; |
| data = 0; |
| } |
| else |
| { |
| status = U_MISSING_RESOURCE_ERROR; |
| delete data; |
| data = 0; |
| } |
| |
| #ifdef COLLDEBUG |
| fprintf(stderr, "binary read %s size %d, %s\n", fileName, T_FileStream_size(ifs), u_errorName(status)); |
| #endif |
| |
| // We constructed the data when streaming it in, so we own it |
| dataIsOwned = TRUE; |
| |
| T_FileStream_close(ifs); |
| } |
| |
| const char * |
| RuleBasedCollator::constructFromBundle(const Locale & name, |
| UErrorCode& status) |
| { |
| // This method tries to locate binary collation data which has been |
| // previously streamed to a binary object "%%Collation" in a |
| // resource bundle. If the data is found, it is cached. |
| // cache is checked before actually streaming in data |
| // resource bundle fallback mechanism is used. |
| |
| if (U_FAILURE(status)) |
| { |
| return 0; |
| } |
| |
| if (dataIsOwned) |
| { |
| delete data; |
| data = 0; |
| } |
| const char* realName = 0; |
| |
| mPattern = 0; |
| isOverIgnore = FALSE; |
| setStrength(Collator::TERTIARY); // This is the default strength |
| |
| ResourceBundle rb((char *)0, name, status); |
| if(U_SUCCESS(status)) { |
| ResourceBundle binary = rb.get("%%Collation", status); //This is the bundle that actually contains the collation data |
| realName = binary.getName(); |
| if(U_SUCCESS(status)) { |
| UErrorCode intStatus = U_ZERO_ERROR; |
| constructFromCache(realName, intStatus); // check whether we already have this data in cache |
| if(U_SUCCESS(intStatus)) { |
| return realName; |
| } |
| int32_t inDataLen = 0; |
| const uint8_t *inData = binary.getBinary(inDataLen, status); //This got us the real binary data |
| |
| UMemoryStream *ifs = uprv_mstrm_openBuffer(inData, inDataLen); |
| |
| if (ifs == 0) { |
| status = U_FILE_ACCESS_ERROR; |
| return 0; |
| } |
| |
| // The streamIn function does the actual work here... |
| RuleBasedCollatorStreamer::streamIn(this, ifs, status); |
| |
| if (!uprv_mstrm_error(ifs)) { |
| } |
| else if (data && data->isBogus()) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| delete data; |
| data = 0; |
| } else { |
| status = U_MISSING_RESOURCE_ERROR; |
| delete data; |
| data = 0; |
| } |
| |
| // We constructed the data when streaming it in, so we own it |
| dataIsOwned = TRUE; |
| |
| uprv_mstrm_close(ifs); |
| addToCache(realName); // add the newly constructed data to cache |
| return realName; |
| } else { |
| status = U_MISSING_RESOURCE_ERROR; |
| return 0; |
| } |
| } else { |
| return 0; |
| } |
| } |
| |
| RuleBasedCollator::RuleBasedCollator( const Locale& desiredLocale, |
| UErrorCode& status) |
| : Collator(), |
| isOverIgnore(FALSE), |
| // sourceCursor(0), |
| //targetCursor(0), |
| mPattern(0), |
| cursor1(0), |
| cursor2(0), |
| dataIsOwned(FALSE), |
| data(0) |
| { |
| |
| |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| // Try to load, in order: |
| // 1. The desired locale's collation. |
| // 2. A fallback of the desired locale. |
| // 3. The default locale's collation. |
| // 4. A fallback of the default locale. |
| // 5. The default collation rules, which contains en_US collation rules. |
| |
| // To reiterate, we try: |
| // Specific: |
| // language+country+variant |
| // language+country |
| // language |
| // Default: |
| // language+country+variant |
| // language+country |
| // language |
| // Root: (aka DEFAULTRULES) |
| // steps 1-5 are handled by resource bundle fallback mechanism. |
| // however, in a very unprobable situation that no resource bundle |
| // data exists, step 5 is repeated with hardcoded default rules. |
| |
| const char *locName = constructFromBundle(desiredLocale, status); /*!*/ |
| |
| if (U_SUCCESS(status)) { |
| data->desiredLocale = desiredLocale; |
| data->realLocaleName = locName; |
| if(status != U_USING_DEFAULT_ERROR) { |
| setDecomposition(Normalizer::NO_OP); |
| } |
| } else { |
| UErrorCode intStatus = U_ZERO_ERROR; |
| constructFromCache(ResourceBundle::kDefaultFilename, intStatus); |
| if(U_FAILURE(intStatus)) { |
| intStatus = U_ZERO_ERROR; |
| constructFromRules(RuleBasedCollator::DEFAULTRULES, intStatus); |
| if (intStatus == U_ZERO_ERROR) { |
| status = U_USING_DEFAULT_ERROR; |
| } else { |
| status = intStatus; // bubble back |
| } |
| |
| if (status == U_MEMORY_ALLOCATION_ERROR) { |
| return; |
| } |
| } |
| data->realLocaleName = ResourceBundle::kDefaultFilename; |
| setDecomposition(Normalizer::NO_OP); |
| addToCache(ResourceBundle::kDefaultFilename); |
| } |
| return; |
| } |
| |
| void |
| RuleBasedCollator::constructFromFile( const Locale& locale, |
| const UnicodeString& localeFileName, |
| UBool tryBinaryFile, |
| UErrorCode& status) |
| { |
| // constructFromFile creates a collation object by reading from a |
| // file. It does not employ the usual FILE search mechanism with |
| // locales, default locales, and base locales. Instead, it tries to |
| // look only in files with the given localFileName. It does, |
| // however, employ the LOCALE search mechanism. |
| |
| // This method maintains the binary collation files. If a collation |
| // is not present in binary form, but is present in text form (in a |
| // resource bundle file), it will be loaded in text form, and then |
| // written to disk. |
| |
| // If tryBinaryFile is true, then try to load from the binary file first. |
| |
| if(U_FAILURE(status)) { |
| return; |
| } |
| |
| if(dataIsOwned) { |
| delete data; |
| data = 0; |
| } |
| |
| if(tryBinaryFile) { |
| char *binaryFilePath = createPathName(UnicodeString(u_getDataDirectory(),""), |
| localeFileName, |
| UnicodeString(kFilenameSuffix,"")); |
| |
| // Try to load up the collation from a binary file first |
| constructFromFile(binaryFilePath, status); |
| #ifdef COLLDEBUG |
| cerr << localeFileName << kFilenameSuffix << " binary load " << u_errorName(status) << endl; |
| #endif |
| if(U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) { |
| delete [] binaryFilePath; |
| return; |
| } |
| if(status == U_FILE_ACCESS_ERROR) { |
| status = U_ZERO_ERROR; |
| } |
| delete [] binaryFilePath; |
| } |
| |
| // Now try to load it up from a resource bundle text source file |
| UnicodeString dataDir = UnicodeString(u_getDataDirectory(),""); |
| |
| char *ch; |
| ch = new char[localeFileName.size() + 1]; |
| ch[localeFileName.extract(0, 0x7fffffff, ch, "")] = 0; |
| ResourceBundle bundle(dataDir, ch, status); |
| |
| delete [] ch; |
| |
| // if there is no resource bundle file for the give locale, break out |
| if(U_FAILURE(status)) |
| { |
| return; |
| } |
| |
| #ifdef COLLDEBUG |
| cerr << localeFileName << " ascii load " << u_errorName(status) << endl; |
| #endif |
| |
| // check and see if this resource bundle contains collation data |
| |
| UnicodeString colString; |
| UErrorCode intStatus = U_ZERO_ERROR; |
| |
| ResourceBundle colElems = bundle.get("CollationElements", intStatus); |
| if (U_FAILURE(intStatus)) |
| { |
| status = U_MISSING_RESOURCE_ERROR; |
| return; |
| } |
| colString = colElems.getStringEx("Sequence", intStatus); |
| |
| if(U_FAILURE(intStatus)) { |
| status = U_MISSING_RESOURCE_ERROR; |
| return; |
| } |
| |
| if(colString.isBogus()) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| |
| // Having loaded the collation from the resource bundle text file, |
| // now retrieve the CollationElements tagged data, merged with the |
| // default rules. If that fails, use the default rules alone. |
| |
| colString.insert(0, DEFAULTRULES); |
| if(colString.isBogus()) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| |
| constructFromRules(colString, intStatus); |
| if(intStatus == U_MEMORY_ALLOCATION_ERROR) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| |
| if(intStatus != U_ZERO_ERROR) { |
| status = U_USING_DEFAULT_ERROR; |
| |
| // predefined tables should contain correct grammar |
| intStatus = U_ZERO_ERROR; |
| constructFromRules(DEFAULTRULES, intStatus); |
| if(intStatus != U_ZERO_ERROR) { |
| status = intStatus; |
| } |
| } |
| |
| #ifdef COLLDEBUG |
| cerr << localeFileName << " ascii load " << (U_SUCCESS(status) ? "OK" : "Failed") << " - try= " << (tryBinaryFile?"true":"false") << endl; |
| #endif |
| |
| } |
| |
| RuleBasedCollator::~RuleBasedCollator() |
| { |
| if (dataIsOwned) |
| { |
| delete data; |
| } |
| |
| data = 0; |
| |
| // delete sourceCursor; |
| // sourceCursor = 0; |
| |
| // delete targetCursor; |
| // targetCursor = 0; |
| |
| if (cursor1 != NULL) { |
| delete cursor1; |
| cursor1 = 0; |
| } |
| if (cursor2 != NULL) { |
| delete cursor2; |
| cursor2 = 0; |
| } |
| |
| delete mPattern; |
| mPattern = 0; |
| } |
| |
| Collator* |
| RuleBasedCollator::clone() const |
| { |
| return new RuleBasedCollator(*this); |
| } |
| |
| // Create a CollationElementIterator object that will iterate over the elements |
| // in a string, using the collation rules defined in this RuleBasedCollator |
| CollationElementIterator* |
| RuleBasedCollator::createCollationElementIterator(const UnicodeString& source) const |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| CollationElementIterator *newCursor = 0; |
| |
| newCursor = new CollationElementIterator(source, this, status); |
| if (U_FAILURE(status)) |
| { |
| return NULL; |
| } |
| |
| return newCursor; |
| } |
| |
| // Create a CollationElementIterator object that will iterate over the elements |
| // in a string, using the collation rules defined in this RuleBasedCollator |
| CollationElementIterator* |
| RuleBasedCollator::createCollationElementIterator(const CharacterIterator& source) const |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| CollationElementIterator *newCursor = 0; |
| |
| newCursor = new CollationElementIterator(source, this, status); |
| if (U_FAILURE(status)) |
| { |
| return NULL; |
| } |
| |
| return newCursor; |
| } |
| |
| // Return a string representation of this collator's rules. |
| // The string can later be passed to the constructor that takes a |
| // UnicodeString argument, which will construct a collator that's |
| // functionally identical to this one. |
| // You can also allow users to edit the string in order to change |
| // the collation data, or you can print it out for inspection, or whatever. |
| |
| const UnicodeString& |
| RuleBasedCollator::getRules() const |
| { |
| if (mPattern != 0) |
| { |
| MergeCollation*& nonConstMPattern = *(MergeCollation**)&mPattern; |
| mPattern->emitPattern(data->ruleTable); |
| data->isRuleTableLoaded = TRUE; |
| delete nonConstMPattern; |
| nonConstMPattern = 0; |
| } |
| else if (!data->isRuleTableLoaded) |
| { |
| // At this point the caller wants the rules, but the rule table data |
| // is not loaded. Furthermore, there is no mPattern object to load |
| // the rules from. Therefore, we fetch the rules off the disk. |
| // Notice that we pass in a tryBinaryFile value of FALSE, since |
| // by design the binary file has NO rules in it! |
| //UErrorCode status = U_ZERO_ERROR; |
| //RuleBasedCollator temp(data->realLocaleName, status); |
| RuleBasedCollator temp; |
| UErrorCode status = U_ZERO_ERROR; |
| temp.constructFromFile(data->desiredLocale, data->realLocaleName, FALSE, status); |
| |
| // We must check that mPattern is nonzero here, or we run the risk |
| // of an infinite loop. |
| if (U_SUCCESS(status) && temp.mPattern != 0) |
| { |
| data->ruleTable = temp.getRules(); |
| data->isRuleTableLoaded = TRUE; |
| #ifdef _DEBUG |
| // // the following is useful for specific debugging purposes |
| // UnicodeString name; |
| // cerr << "Table collation rules loaded dynamically for " |
| // << data->desiredLocale.getName(name) |
| // << " at " |
| // << data->realLocaleName |
| // << ", " << dec << data->ruleTable.size() << " characters" |
| // << endl; |
| #endif |
| } |
| else |
| { |
| #ifdef _DEBUG |
| // UnicodeString name; |
| // cerr << "Unable to load table collation rules dynamically for " |
| // << data->desiredLocale.getName(name) |
| // << " at " |
| // << data->realLocaleName |
| // << endl; |
| // cerr << "Status " << u_errorName(status) << ", mPattern " << temp.mPattern << endl; |
| #endif |
| /* SRL have to add this because we now have the situation where |
| DEFAULT is loaded from a binary file w/ no rules. */ |
| UErrorCode intStatus = U_ZERO_ERROR; |
| temp.constructFromRules(RuleBasedCollator::DEFAULTRULES, intStatus); |
| |
| if(U_SUCCESS(intStatus) && (temp.mPattern != 0)) |
| { |
| data->ruleTable = temp.getRules(); |
| data->isRuleTableLoaded = TRUE; |
| } |
| } |
| } |
| |
| return data->ruleTable; |
| } |
| |
| |
| Collator::EComparisonResult |
| RuleBasedCollator::compare( const UnicodeString& source, |
| const UnicodeString& target, |
| int32_t length) const |
| { |
| UnicodeString source_togo; |
| UnicodeString target_togo; |
| UTextOffset begin=0; |
| |
| source.extract(begin, uprv_min(length,source.length()), source_togo); |
| target.extract(begin, uprv_min(length,target.length()), target_togo); |
| return (RuleBasedCollator::compare(source_togo, target_togo)); |
| } |
| |
| Collator::EComparisonResult |
| RuleBasedCollator::compare(const UChar* source, |
| int32_t sourceLength, |
| const UChar* target, |
| int32_t targetLength) const |
| { |
| // check if source and target are valid strings |
| if (((source == 0) && (target == 0)) || |
| ((sourceLength == 0) && (targetLength == 0))) |
| { |
| return Collator::EQUAL; |
| } |
| |
| Collator::EComparisonResult result = Collator::EQUAL; |
| UErrorCode status = U_ZERO_ERROR; |
| |
| if (cursor1 == NULL) |
| { |
| ((RuleBasedCollator *)this)->cursor1 = new NormalizerIterator(source, sourceLength, getDecomposition()); |
| } |
| else |
| { |
| cursor1->setModeAndText(getDecomposition(), source, sourceLength, status); |
| } |
| |
| if ( /*cursor1->cursor == NULL ||*/ U_FAILURE(status)) |
| { |
| return Collator::EQUAL; |
| } |
| |
| if (cursor2 == NULL) |
| { |
| ((RuleBasedCollator *)this)->cursor2 = new NormalizerIterator(target, targetLength, getDecomposition()); |
| } |
| else |
| { |
| cursor2->setModeAndText(getDecomposition(), target, targetLength, status); |
| } |
| |
| if (/*cursor2 == NULL ||*/ U_FAILURE(status)) |
| { |
| return Collator::EQUAL; |
| } |
| |
| int32_t sOrder, tOrder; |
| // int32_t sOrder = CollationElementIterator::NULLORDER, tOrder = CollationElementIterator::NULLORDER; |
| UBool gets = TRUE, gett = TRUE; |
| UBool initialCheckSecTer = getStrength() >= Collator::SECONDARY; |
| UBool checkSecTer = initialCheckSecTer; |
| UBool checkTertiary = getStrength() >= Collator::TERTIARY; |
| UBool isFrenchSec = data->isFrenchSec; |
| uint32_t pSOrder, pTOrder; |
| |
| for(;;) |
| { |
| // Get the next collation element in each of the strings, unless |
| // we've been requested to skip it. |
| if (gets) |
| { |
| sOrder = getStrengthOrder((NormalizerIterator*)cursor1, status); |
| |
| if (U_FAILURE(status)) |
| { |
| return Collator::EQUAL; |
| } |
| } |
| |
| gets = TRUE; |
| |
| if (gett) |
| { |
| tOrder = getStrengthOrder((NormalizerIterator*)cursor2, status); |
| |
| if (U_FAILURE(status)) |
| { |
| return Collator::EQUAL; |
| } |
| } |
| |
| gett = TRUE; |
| |
| // If we've hit the end of one of the strings, jump out of the loop |
| if ((sOrder == CollationElementIterator::NULLORDER)|| |
| (tOrder == CollationElementIterator::NULLORDER)) |
| { |
| break; |
| } |
| |
| // If there's no difference at this position, we can skip to the |
| // next one. |
| pSOrder = CollationElementIterator::primaryOrder(sOrder); |
| pTOrder = CollationElementIterator::primaryOrder(tOrder); |
| if (sOrder == tOrder) |
| { |
| if (isFrenchSec && pSOrder != 0) |
| { |
| if (!checkSecTer) |
| { |
| // in french, a secondary difference more to the right is stronger, |
| // so accents have to be checked with each base element |
| checkSecTer = initialCheckSecTer; |
| |
| // but tertiary differences are less important than the first |
| // secondary difference, so checking tertiary remains disabled |
| checkTertiary = FALSE; |
| } |
| } |
| |
| continue; |
| } |
| |
| // Compare primary differences first. |
| if (pSOrder != pTOrder) |
| { |
| if (sOrder == 0) |
| { |
| // The entire source element is ignorable. |
| // Skip to the next source element, but don't fetch another target element. |
| gett = FALSE; |
| continue; |
| } |
| |
| if (tOrder == 0) |
| { |
| gets = FALSE; |
| continue; |
| } |
| |
| // The source and target elements aren't ignorable, but it's still possible |
| // for the primary component of one of the elements to be ignorable.... |
| if (pSOrder == 0) // primary order in source is ignorable |
| { |
| // The source's primary is ignorable, but the target's isn't. We treat ignorables |
| // as a secondary difference, so remember that we found one. |
| if (checkSecTer) |
| { |
| result = Collator::GREATER; // (strength is SECONDARY) |
| checkSecTer = FALSE; |
| } |
| |
| // Skip to the next source element, but don't fetch another target element. |
| gett = FALSE; |
| } |
| else if (pTOrder == 0) |
| { |
| // record differences - see the comment above. |
| if (checkSecTer) |
| { |
| result = Collator::LESS; // (strength is SECONDARY) |
| checkSecTer = FALSE; |
| } |
| |
| // Skip to the next target element, but don't fetch another source element. |
| gets = FALSE; |
| } |
| else |
| { |
| // Neither of the orders is ignorable, and we already know that the primary |
| // orders are different because of the (pSOrder != pTOrder) test above. |
| // Record the difference and stop the comparison. |
| if (pSOrder < pTOrder) |
| { |
| return Collator::LESS; // (strength is PRIMARY) |
| } |
| |
| return Collator::GREATER; // (strength is PRIMARY) |
| } |
| } |
| else |
| { // else of if ( pSOrder != pTOrder ) |
| // primary order is the same, but complete order is different. So there |
| // are no base elements at this point, only ignorables (Since the strings are |
| // normalized) |
| |
| if (checkSecTer) |
| { |
| // a secondary or tertiary difference may still matter |
| uint32_t secSOrder = CollationElementIterator::secondaryOrder(sOrder); |
| uint32_t secTOrder = CollationElementIterator::secondaryOrder(tOrder); |
| |
| if (secSOrder != secTOrder) |
| { |
| // there is a secondary difference |
| result = (secSOrder < secTOrder) ? Collator::LESS : Collator::GREATER; |
| // (strength is SECONDARY) |
| checkSecTer = FALSE; |
| // (even in french, only the first secondary difference within |
| // a base character matters) |
| } |
| else |
| { |
| if (checkTertiary) |
| { |
| // a tertiary difference may still matter |
| uint32_t terSOrder = CollationElementIterator::tertiaryOrder(sOrder); |
| uint32_t terTOrder = CollationElementIterator::tertiaryOrder(tOrder); |
| |
| if (terSOrder != terTOrder) |
| { |
| // there is a tertiary difference |
| result = (terSOrder < terTOrder) ? Collator::LESS : Collator::GREATER; |
| // (strength is TERTIARY) |
| checkTertiary = FALSE; |
| } |
| } |
| } |
| } // if (checkSecTer) |
| |
| } // if ( pSOrder != pTOrder ) |
| } // while() |
| |
| if (sOrder != CollationElementIterator::NULLORDER) |
| { |
| // (tOrder must be CollationElementIterator::NULLORDER, |
| // since this point is only reached when sOrder or tOrder is NULLORDER.) |
| // The source string has more elements, but the target string hasn't. |
| do |
| { |
| if (CollationElementIterator::primaryOrder(sOrder) != 0) |
| { |
| // We found an additional non-ignorable base character in the source string. |
| // This is a primary difference, so the source is greater |
| return Collator::GREATER; // (strength is PRIMARY) |
| } |
| |
| if (CollationElementIterator::secondaryOrder(sOrder) != 0) |
| { |
| // Additional secondary elements mean the source string is greater |
| if (checkSecTer) |
| { |
| result = Collator::GREATER; // (strength is SECONDARY) |
| checkSecTer = FALSE; |
| } |
| } |
| } |
| while ((sOrder = getStrengthOrder(cursor1, status)) != CollationElementIterator::NULLORDER); |
| } |
| else if (tOrder != CollationElementIterator::NULLORDER) |
| { |
| // The target string has more elements, but the source string hasn't. |
| do |
| { |
| if (CollationElementIterator::primaryOrder(tOrder) != 0) |
| { |
| // We found an additional non-ignorable base character in the target string. |
| // This is a primary difference, so the source is less |
| return Collator::LESS; // (strength is PRIMARY) |
| } |
| |
| if (CollationElementIterator::secondaryOrder(tOrder) != 0) |
| { |
| // Additional secondary elements in the target mean the source string is less |
| if (checkSecTer) |
| { |
| result = Collator::LESS; // (strength is SECONDARY) |
| checkSecTer = FALSE; |
| } |
| } |
| } |
| while ((tOrder = getStrengthOrder(cursor2, status)) != CollationElementIterator::NULLORDER); |
| } |
| |
| |
| // For IDENTICAL comparisons, we use a bitwise character comparison |
| // as a tiebreaker if all else is equal |
| // NOTE: The java code compares result with 0, and |
| // puts the result of the string comparison directly into result |
| if (result == Collator::EQUAL && getStrength() == IDENTICAL) |
| { |
| #if 0 |
| // ******** for the UChar normalization interface. |
| // It doesn't work much faster, and the code was broken |
| // so it's commented out. --srl |
| // UChar sourceDecomp[1024], targetDecomp[1024]; |
| // int32_t sourceDecompLength = 1024; |
| // int32_t targetDecompLength = 1024; |
| |
| // int8_t comparison; |
| // Normalizer::EMode decompMode = getDecomposition(); |
| |
| // if (decompMode != Normalizer::NO_OP) |
| // { |
| // Normalizer::normalize(source, sourceLength, decompMode, |
| // 0, sourceDecomp, sourceDecompLength, status); |
| |
| // Normalizer::normalize(target, targetLength, decompMode, |
| // 0, targetDecomp, targetDecompLength, status); |
| |
| // comparison = u_strcmp(sourceDecomp,targetDecomp); |
| // } |
| // else |
| // { |
| // comparison = u_strcmp(source, target); /* ! */ |
| // } |
| |
| #else |
| |
| UnicodeString sourceDecomp, targetDecomp; |
| |
| int8_t comparison; |
| |
| Normalizer::normalize(source, getDecomposition(), |
| 0, sourceDecomp, status); |
| |
| Normalizer::normalize(target, getDecomposition(), |
| 0, targetDecomp, status); |
| |
| comparison = sourceDecomp.compare(targetDecomp); |
| #endif |
| |
| if (comparison < 0) |
| { |
| result = Collator::LESS; |
| } |
| else if (comparison == 0) |
| { |
| result = Collator::EQUAL; |
| } |
| else |
| { |
| result = Collator::GREATER; |
| } |
| } |
| |
| return result; |
| } |
| |
| |
| int32_t |
| RuleBasedCollator::nextContractChar(NormalizerIterator *cursor, |
| UChar ch, |
| UErrorCode& status) const |
| { |
| // First get the ordering of this single character |
| VectorOfPToContractElement *list = getContractValues(ch); |
| EntryPair *pair = (EntryPair *)list->at(0); |
| int32_t order = pair->value; |
| |
| // Now iterate through the chars following it and |
| // look for the longest match |
| ((UnicodeString&)key).remove(); |
| ((UnicodeString&)key) += ch; |
| |
| while ((ch = cursor->current()) != Normalizer::DONE) |
| { |
| ((UnicodeString&)key) += ch; |
| |
| int32_t n = getEntry(list, key, TRUE); |
| |
| if (n == UNMAPPED) |
| { |
| break; |
| } |
| cursor->next(); |
| |
| pair = (EntryPair *)list->at(n); |
| order = pair->value; |
| } |
| |
| return order; |
| } |
| |
| // Compare two strings using this collator |
| Collator::EComparisonResult |
| RuleBasedCollator::compare(const UnicodeString& source, |
| const UnicodeString& target) const |
| { |
| return compare(source.getUChars(), source.length(), target.getUChars(), target.length()); |
| } |
| |
| // Retrieve a collation key for the specified string |
| // The key can be compared with other collation keys using a bitwise comparison |
| // (e.g. memcmp) to find the ordering of their respective source strings. |
| // This is handy when doing a sort, where each sort key must be compared |
| // many times. |
| // |
| // The basic algorithm here is to find all of the collation elements for each |
| // character in the source string, convert them to an ASCII representation, |
| // and put them into the collation key. But it's trickier than that. |
| // Each collation element in a string has three components: primary ('A' vs 'B'), |
| // secondary ('u' vs 'ü'), and tertiary ('A' vs 'a'), and a primary difference |
| // at the end of a string takes precedence over a secondary or tertiary |
| // difference earlier in the string. |
| // |
| // To account for this, we put all of the primary orders at the beginning of the |
| // string, followed by the secondary and tertiary orders. Each set of orders is |
| // terminated by nulls so that the key for a string which is an initial substring of |
| // another string will compare less without any special case. |
| // |
| // Here's a hypothetical example, with the collation element represented as |
| // a three-digit number, one digit for primary, one for secondary, etc. |
| // |
| // String: A a B É |
| // Collation Elements: 101 100 201 511 |
| // Collation Key: 1125<null>0001<null>1011<null> |
| // |
| // To make things even trickier, secondary differences (accent marks) are compared |
| // starting at the *end* of the string in languages with French secondary ordering. |
| // But when comparing the accent marks on a single base character, they are compared |
| // from the beginning. To handle this, we reverse all of the accents that belong |
| // to each base character, then we reverse the entire string of secondary orderings |
| // at the end. |
| // |
| CollationKey& |
| RuleBasedCollator::getCollationKey( const UnicodeString& source, |
| CollationKey& sortkey, |
| UErrorCode& status) const |
| { |
| return RuleBasedCollator::getCollationKey(source.getUChars(), source.length(), sortkey, status); |
| } |
| |
| CollationKey& |
| RuleBasedCollator::getCollationKey( const UChar* source, |
| int32_t sourceLen, |
| CollationKey& sortkey, |
| UErrorCode& status) const |
| { |
| if (U_FAILURE(status)) |
| { |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return sortkey.setToBogus(); |
| } |
| |
| if ((!source) || (sourceLen == 0)) |
| { |
| return sortkey.reset(); |
| } |
| |
| if (cursor1 == NULL) |
| { |
| ((RuleBasedCollator *)this)->cursor1 = new NormalizerIterator(source, sourceLen, getDecomposition()); |
| } |
| else |
| { |
| cursor1->setModeAndText(getDecomposition(), source,sourceLen, status); |
| } |
| |
| if (U_FAILURE(status)) |
| { |
| return sortkey.setToBogus(); |
| } |
| |
| UBool compareSec = (getStrength() >= Collator::SECONDARY); |
| UBool compareTer = (getStrength() >= Collator::TERTIARY); |
| UBool compareIdent = (getStrength() == Collator::IDENTICAL); |
| int32_t order = 0; |
| int32_t totalPrimary = 0; |
| int32_t totalSec = 0; |
| int32_t totalTer = 0; |
| int32_t totalIdent = 0; |
| UnicodeString decomp; |
| |
| // iterate over the source, counting primary, secondary, and tertiary entries |
| while((order = getStrengthOrder((NormalizerIterator*)cursor1, status)) != |
| CollationElementIterator::NULLORDER) |
| { |
| int32_t secOrder = CollationElementIterator::secondaryOrder(order); |
| int32_t terOrder = CollationElementIterator::tertiaryOrder(order); |
| |
| if (U_FAILURE(status)) |
| { |
| return sortkey.setToBogus(); |
| } |
| |
| if (! CollationElementIterator::isIgnorable(order)) |
| { |
| totalPrimary += 1; |
| |
| if (compareSec) |
| { |
| totalSec += 1; |
| } |
| |
| if (compareTer) |
| { |
| totalTer += 1; |
| } |
| } |
| else |
| { |
| if (compareSec && secOrder != 0) |
| { |
| totalSec += 1; |
| } |
| |
| if (compareTer && terOrder != 0) |
| { |
| totalTer += 1; |
| } |
| } |
| } |
| |
| // count the null bytes after the entires |
| totalPrimary += 1; |
| |
| if (compareSec) |
| { |
| totalSec += 1; |
| } |
| |
| if (compareTer) |
| { |
| totalTer += 1; |
| } |
| |
| if (compareIdent) |
| { |
| Normalizer::normalize(source, getDecomposition(), // SRL: ?? |
| 0, decomp, status); |
| |
| if (U_SUCCESS(status)) |
| { |
| totalIdent = decomp.length() + 1; |
| } |
| } |
| |
| // Compute total number of bytes to hold the entries |
| // and make sure the key can hold them |
| uint32_t size = 2 * (totalPrimary + totalSec + totalTer + totalIdent); |
| |
| sortkey.ensureCapacity(size); |
| |
| if (sortkey.isBogus()) |
| { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return sortkey; |
| } |
| |
| int32_t primaryCursor = 0; |
| int32_t secCursor = 2 * totalPrimary; |
| int32_t secBase = secCursor; |
| int32_t preSecIgnore = secBase; |
| int32_t terCursor = secCursor + (2 * totalSec); |
| int32_t identCursor = terCursor + (2 * totalTer); |
| |
| // reset source to the beginning |
| cursor1->reset(); |
| |
| // now iterate over the source computing the actual entries |
| while((order = getStrengthOrder((NormalizerIterator*)cursor1, status)) != CollationElementIterator::NULLORDER) |
| { |
| if (U_FAILURE(status)) |
| { |
| return sortkey.reset(); |
| } |
| |
| int32_t primaryOrder = CollationElementIterator::primaryOrder(order); |
| int32_t secOrder = CollationElementIterator::secondaryOrder(order); |
| int32_t terOrder = CollationElementIterator::tertiaryOrder(order); |
| |
| if (! CollationElementIterator::isIgnorable(order)) |
| { |
| primaryCursor = sortkey.storeBytes(primaryCursor, primaryOrder + SORTKEYOFFSET); |
| |
| if (compareSec) |
| { |
| if (data->isFrenchSec && (preSecIgnore < secCursor)) |
| { |
| sortkey.reverseBytes(preSecIgnore, secCursor); |
| } |
| |
| secCursor = sortkey.storeBytes(secCursor, secOrder + SORTKEYOFFSET); |
| |
| preSecIgnore = secCursor; |
| } |
| |
| if (compareTer) |
| { |
| terCursor = sortkey.storeBytes(terCursor, terOrder + SORTKEYOFFSET); |
| } |
| } |
| else |
| { |
| if (compareSec && secOrder != 0) |
| { |
| secCursor = sortkey.storeBytes(secCursor, secOrder + data->maxSecOrder + SORTKEYOFFSET); |
| } |
| |
| if (compareTer && terOrder != 0) |
| { |
| terCursor = sortkey.storeBytes(terCursor, terOrder + data->maxTerOrder + SORTKEYOFFSET); |
| } |
| } |
| } |
| |
| // append 0 at the end of each portion. |
| sortkey.storeBytes(primaryCursor, 0); |
| |
| if (compareSec) |
| { |
| if (data->isFrenchSec) |
| { |
| if (preSecIgnore < secCursor) |
| { |
| sortkey.reverseBytes(preSecIgnore, secCursor); |
| } |
| |
| sortkey.reverseBytes(secBase, secCursor); |
| } |
| |
| sortkey.storeBytes(secCursor, 0); |
| } |
| |
| if (compareTer) |
| { |
| sortkey.storeBytes(terCursor, 0); |
| } |
| |
| if (compareIdent) |
| { |
| sortkey.storeUnicodeString(identCursor, decomp); |
| } |
| |
| // Debugging - print out the sortkey [--srl] |
| // { |
| // const uint8_t *bytes; |
| // int32_t xcount; |
| // bytes = sortkey.getByteArray(xcount); |
| // // fprintf(stderr, "\n\n- [%02X] [%02X]\n\n", (int)(bytes[0]&0xFF), (int)(bytes[1]&0xFF) ); |
| // } |
| |
| return sortkey; |
| } |
| |
| |
| // Build this collator's rule tables based on a string representation of the rules |
| // See the big diagram at the top of this file for an overview of how the tables |
| // are organized. |
| void |
| RuleBasedCollator::build(const UnicodeString& pattern, |
| UErrorCode& status) |
| { |
| if (U_FAILURE(status)) |
| { |
| return; |
| } |
| |
| // This array maps Unicode characters to their collation ordering |
| data->mapping = ucmp32_open(UNMAPPED); |
| |
| if (data->mapping->fBogus) |
| { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| |
| int32_t i = 0; |
| UnicodeString lastGroupChars; |
| UnicodeString expChars; |
| UnicodeString groupChars; |
| |
| if (pattern.length() == 0) |
| { |
| status = U_INVALID_FORMAT_ERROR; |
| return; |
| } |
| |
| // Build the merged collation entries |
| // Since rules can be specified in any order in the string |
| // (e.g. "c , C < d , D < e , E .... C < CH") |
| // this splits all of the rules in the string out into separate |
| // objects and then sorts them. In the above example, it merges the |
| // "C < CH" rule in just before the "C < D" rule. |
| |
| mPattern = new MergeCollation(pattern, getDecomposition(), status); |
| if (U_FAILURE(status)) |
| { |
| ucmp32_close(data->mapping); |
| data->mapping = 0; |
| delete mPattern; |
| mPattern = 0; |
| return; |
| } |
| |
| int32_t order = 0; |
| |
| // Walk through each entry |
| for (i = 0; i < mPattern->getCount(); ++i) |
| { |
| const PatternEntry* entry = mPattern->getItemAt(i); |
| groupChars.remove(); |
| expChars.remove(); |
| |
| // if entry is valid |
| if (entry != NULL) |
| { |
| entry->getChars(groupChars); |
| |
| // check if french secondary needs to be turned on |
| if ((groupChars.length() > 1) && |
| (groupChars[groupChars.length()-1] == 0x0040)) |
| { |
| data->isFrenchSec = TRUE; |
| groupChars.remove(groupChars.length()-1); |
| } |
| |
| order = increment((Collator::ECollationStrength)entry->getStrength(), order); |
| |
| if (entry->getExtension(expChars).length() != 0) |
| { |
| // encountered an expanding character, where one character on input |
| // expands to several sort elements (e.g. 'ö' --> 'o' 'e') |
| addExpandOrder(groupChars, expChars, order, status); |
| if (U_FAILURE(status)) |
| { |
| return; |
| } |
| } |
| else if (groupChars.length() > 1) |
| { |
| // encountered a contracting character, where several characters on input |
| // contract into one sort order. For example, "ch" is treated as a single |
| // character in traditional Spanish sorting. |
| addContractOrder(groupChars, order, status); |
| if (U_FAILURE(status)) |
| { |
| return; |
| } |
| } |
| else |
| { |
| // Nothing out of the ordinary -- one character maps to one sort order |
| addOrder(groupChars[0], order, status); |
| if (U_FAILURE(status)) |
| { |
| return; |
| } |
| } |
| } |
| } |
| |
| // add expanding entries for pre-composed characters |
| addComposedChars(); |
| |
| // Fill in all the expanding chars values |
| commit(); |
| |
| // Compact the data mapping table |
| ucmp32_compact(data->mapping, 1); |
| } |
| |
| /** |
| * Add expanding entries for pre-composed unicode characters so that this |
| * collator can be used reasonably well with decomposition turned off. |
| */ |
| void RuleBasedCollator::addComposedChars() |
| { |
| UnicodeString buf; |
| UErrorCode status = U_ZERO_ERROR; |
| |
| // Iterate through all of the pre-composed characters in Unicode |
| ComposedCharIter iter; |
| UnicodeString decomp; |
| |
| while (iter.hasNext()) |
| { |
| UChar c = iter.next(); |
| |
| if (getCharOrder(c) == UNMAPPED) |
| { |
| // |
| // We don't already have an ordering for this pre-composed character. |
| // |
| // First, see if the decomposed string is already in our |
| // tables as a single contracting-string ordering. |
| // If so, just map the precomposed character to that order. |
| // |
| // TODO: What we should really be doing here is trying to find the |
| // longest initial substring of the decomposition that is present |
| // in the tables as a contracting character sequence, and find its |
| // ordering. Then do this recursively with the remaining chars |
| // so that we build a list of orderings, and add that list to |
| // the expansion table. |
| // That would be more correct but also significantly slower, so |
| // I'm not totally sure it's worth doing. |
| // |
| iter.getDecomposition(decomp); |
| int contractOrder = getContractOrder(decomp); |
| |
| if (contractOrder != UNMAPPED) |
| { |
| addOrder(c, contractOrder, status); |
| } |
| else |
| { |
| // |
| // We don't have a contracting ordering for the entire string |
| // that results from the decomposition, but if we have orders |
| // for each individual character, we can add an expanding |
| // table entry for the pre-composed character |
| // |
| UBool allThere = TRUE; |
| int32_t i; |
| |
| for (i = 0; i < decomp.length(); i += 1) |
| { |
| if (getCharOrder(decomp[i]) == UNMAPPED) |
| { |
| allThere = FALSE; |
| break; |
| } |
| } |
| |
| if (allThere) |
| { |
| buf.remove(); |
| buf += c; |
| addExpandOrder(buf, decomp, UNMAPPED, status); |
| } |
| } |
| } |
| } |
| } |
| |
| // When the expanding character tables are built by addExpandOrder, |
| // it doesn't know what the final ordering of each character |
| // in the expansion will be. Instead, it just puts the raw character |
| // code into the table, adding CHARINDEX as a flag. Now that we've |
| // finished building the mapping table, we can go back and look up |
| // that character to see what its real collation order is and |
| // stick that into the expansion table. That lets us avoid doing |
| // a two-stage lookup later. |
| |
| void |
| RuleBasedCollator::commit() |
| { |
| // if there are any expanding characters |
| if (data->expandTable != NULL) |
| { |
| int32_t i; |
| for (i = 0; i < data->expandTable->size(); i += 1) |
| { |
| VectorOfInt* valueList = data->expandTable->at(i); |
| int32_t j; |
| for (j = 0; j < valueList->size(); j++) |
| { |
| // found a expanding character |
| // the expanding char value is not filled in yet |
| if ((valueList->at(j) < EXPANDCHARINDEX) && |
| (valueList->at(j) > CHARINDEX)) |
| { |
| // Get the real values for the non-filled entry |
| UChar ch = (UChar)(valueList->at(j) - CHARINDEX); |
| int32_t realValue = ucmp32_get(data->mapping, ch); |
| |
| if (realValue == UNMAPPED) |
| { |
| // The real value is still unmapped, maybe it'signorable |
| valueList->atPut(j, IGNORABLEMASK & ch); |
| } |
| // fill in the value |
| else |
| { |
| valueList->atPut(j, realValue); |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| /** |
| * Increment of the last order based on the comparison level. |
| */ |
| int32_t |
| RuleBasedCollator::increment(Collator::ECollationStrength aStrength, int32_t lastValue) |
| { |
| switch(aStrength) |
| { |
| case Collator::PRIMARY: |
| // increment priamry order and mask off secondary and tertiary difference |
| lastValue += PRIMARYORDERINCREMENT; |
| lastValue &= PRIMARYORDERMASK; |
| isOverIgnore = TRUE; |
| break; |
| |
| case Collator::SECONDARY: |
| // increment secondary order and mask off tertiary difference |
| lastValue += SECONDARYORDERINCREMENT; |
| lastValue &= SECONDARYDIFFERENCEONLY; |
| |
| // record max # of ignorable chars with secondary difference |
| if (isOverIgnore == FALSE) |
| { |
| data->maxSecOrder += 1; |
| } |
| break; |
| |
| case Collator::TERTIARY: |
| // increment tertiary order |
| lastValue += TERTIARYORDERINCREMENT; |
| |
| // record max # of ignorable chars with tertiary difference |
| if (isOverIgnore == FALSE) |
| { |
| data->maxTerOrder += 1; |
| } |
| break; |
| |
| // case IDENTICAL? |
| } |
| |
| return lastValue; |
| } |
| |
| // Adds a character and its designated order into the collation table. |
| // This is the simple case, with no expansion or contraction |
| void |
| RuleBasedCollator::addOrder(UChar ch, |
| int32_t anOrder, |
| UErrorCode& status) |
| { |
| if (U_FAILURE(status)) |
| { |
| return; |
| } |
| |
| // try to find the order of the char in the mapping table |
| int32_t order = ucmp32_get(data->mapping, ch); |
| |
| if (order >= CONTRACTCHARINDEX) |
| { |
| // There's already an entry for this character that points to a contracting |
| // character table. Instead of adding the character directly to the mapping |
| // table, we must add it to the contract table instead. |
| key.remove(); |
| key += ch; |
| if (key.isBogus()) |
| { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| |
| addContractOrder(key, anOrder, status); |
| } |
| else |
| { |
| // add the entry to the mapping table, the same later entry replaces the previous one |
| ucmp32_set(data->mapping, ch, anOrder); |
| } |
| } |
| |
| // Add an expanding-character entry to the table. |
| void |
| RuleBasedCollator::addExpandOrder( const UnicodeString& contractChars, |
| const UnicodeString& expandChars, |
| int32_t anOrder, |
| UErrorCode& status) |
| { |
| if (U_FAILURE(status)) |
| { |
| return; |
| } |
| |
| // Create an expansion table entry |
| int32_t tableIndex = addExpansion(anOrder, expandChars); |
| |
| // And add its index into the main mapping table |
| if (contractChars.length() > 1) |
| { |
| addContractOrder(contractChars, tableIndex, status); |
| } |
| else |
| { |
| addOrder(contractChars[0], tableIndex, status); |
| } |
| } |
| |
| int32_t RuleBasedCollator::addExpansion(int32_t anOrder, const UnicodeString &expandChars) |
| { |
| if (data->expandTable == NULL) |
| { |
| data->expandTable = new VectorOfPToExpandTable(); |
| |
| if (data->expandTable == NULL) |
| { |
| return 0; |
| } |
| } |
| |
| // If anOrder is valid, we want to add it at the beginning of the list |
| int32_t offset = (anOrder == UNMAPPED) ? 0 : 1; |
| |
| VectorOfInt *valueList = new VectorOfInt(expandChars.length() + offset); |
| |
| if (offset == 1) |
| { |
| valueList->atPut(0, anOrder); |
| } |
| |
| int32_t i; |
| for (i = 0; i < expandChars.length(); i += 1) |
| { |
| UChar ch = expandChars[i]; |
| int32_t mapValue = getCharOrder(ch); |
| |
| if (mapValue != UNMAPPED) |
| { |
| valueList->atPut(i + offset, mapValue); |
| } |
| else |
| { |
| // can't find it in the table, will be filled in by commit(). |
| valueList->atPut(i + offset, CHARINDEX + (int32_t)ch); |
| } |
| } |
| |
| // Add the expanding char list into the expansion table. |
| int32_t tableIndex = EXPANDCHARINDEX + data->expandTable->size(); |
| data->expandTable->atPut(data->expandTable->size(), valueList); |
| |
| return tableIndex; |
| } |
| |
| // Add a string of characters that contracts into a single ordering. |
| void |
| RuleBasedCollator::addContractOrder(const UnicodeString& groupChars, |
| int32_t anOrder, |
| UBool fwd, |
| UErrorCode& status) |
| { |
| if (U_FAILURE(status)) |
| { |
| return; |
| } |
| |
| if (data->contractTable == NULL) |
| { |
| data->contractTable = new VectorOfPToContractTable(); |
| if (data->contractTable->isBogus()) |
| { |
| delete data->contractTable; |
| data->contractTable = NULL; |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| } |
| |
| // See if the initial character of the string already has a contract table. |
| // e.g. for "ch", look for 'c'. |
| int32_t entry = ucmp32_get(data->mapping, groupChars[0]); |
| VectorOfPToContractElement *entryTable = getContractValues(entry - CONTRACTCHARINDEX); |
| |
| if (entryTable == NULL) |
| { |
| // We need to create a new table of contract entries for this base char |
| int32_t tableIndex = CONTRACTCHARINDEX + data->contractTable->size(); |
| EntryPair *pair = NULL; |
| UnicodeString substring; |
| |
| entryTable = new VectorOfPToContractElement(); |
| if (entryTable->isBogus()) |
| { |
| delete entryTable; |
| delete data->contractTable; |
| data->contractTable = NULL; |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| |
| data->contractTable->atPut(data->contractTable->size(), entryTable); |
| if (data->contractTable->isBogus()) |
| { |
| delete entryTable; |
| delete data->contractTable; |
| data->contractTable = NULL; |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| |
| |
| // Add the initial character's current ordering first. then |
| // update its mapping to point to this contract table |
| groupChars.extract(0, 1, substring); |
| if (substring.isBogus()) |
| { |
| delete entryTable; |
| delete data->contractTable; |
| data->contractTable = NULL; |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| |
| pair = new EntryPair(substring, entry); |
| |
| entryTable->atPut(0, pair); |
| if (entryTable->isBogus()) |
| { |
| delete entryTable; |
| delete data->contractTable; |
| data->contractTable = NULL; |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| |
| ucmp32_set(data->mapping, groupChars[0], tableIndex); |
| } |
| |
| // Now add (or replace) this string in the table |
| int32_t index = getEntry(entryTable, groupChars, fwd); |
| |
| if (index != UNMAPPED) |
| { |
| EntryPair *pair = (EntryPair *) entryTable->at(index); |
| pair->value = anOrder; |
| } |
| else |
| { |
| EntryPair *pair = new EntryPair(groupChars, anOrder, fwd); |
| |
| entryTable->atPut(entryTable->size(), pair); |
| } |
| |
| // If this was a forward mapping for a contracting string, also add a |
| // reverse mapping for it, so that CollationElementIterator::previous |
| // can work right |
| if (fwd) |
| { |
| UnicodeString reverse(groupChars); |
| |
| if (reverse.isBogus()) |
| { |
| delete entryTable; |
| delete data->contractTable; |
| data->contractTable = NULL; |
| status = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| |
| addContractOrder(reverse.reverse(), anOrder, FALSE, status); |
| } |
| } |
| |
| /** |
| * If the given string has been specified as a contracting string |
| * in this collation table, return its ordering. |
| * Otherwise return UNMAPPED. |
| */ |
| int32_t RuleBasedCollator::getContractOrder(const UnicodeString &groupChars) const |
| { |
| int32_t result = UNMAPPED; |
| |
| if (data->contractTable != NULL) |
| { |
| VectorOfPToContractElement *entryTable = getContractValues(groupChars[0]); |
| |
| if (entryTable != NULL) |
| { |
| int32_t index = getEntry(entryTable, groupChars, TRUE); |
| |
| if (index != UNMAPPED) |
| { |
| EntryPair *pair = entryTable->at(index); |
| |
| result = pair->value; |
| } |
| } |
| } |
| |
| return result; |
| } |
| |
| int32_t RuleBasedCollator::getCharOrder(UChar ch) const |
| { |
| int32_t order = ucmp32_get(data->mapping, ch); |
| |
| if (order >= CONTRACTCHARINDEX) |
| { |
| VectorOfPToContractElement *groupList = getContractValues(order - CONTRACTCHARINDEX); |
| EntryPair *pair = groupList->at(0); |
| |
| order = pair->value; |
| } |
| |
| return order; |
| } |
| |
| // Create a hash code for this collation. Just hash the main rule table -- |
| // that should be good enough for almost any use. |
| int32_t |
| RuleBasedCollator::hashCode() const |
| { |
| int32_t value = 0; |
| int32_t c; |
| int32_t count = getRules().length(); |
| UTextOffset pos = count - 1; |
| |
| if (count > 64) |
| { |
| count = 64; // only hash upto limit |
| } |
| |
| int16_t i = 0; |
| |
| while (i < count) |
| { |
| c = data->ruleTable[pos]; |
| value = ((value << (c & 0x0f)) ^ (c << 8)) + (c ^ value); |
| i += 1; |
| pos -= 1; |
| } |
| |
| if (value == 0) |
| { |
| value = 1; |
| } |
| |
| return value; |
| } |
| |
| // find the contracting char entry in the list |
| int32_t |
| RuleBasedCollator::getEntry(VectorOfPToContractElement* list, |
| const UnicodeString& name, |
| UBool fwd) |
| { |
| int32_t i; |
| |
| if (list != NULL) |
| { |
| for (i = 0; i < list->size(); i += 1) |
| { |
| EntryPair *pair = list->at(i); |
| |
| if ((pair != NULL) && (pair->fwd == fwd) && (pair->entryName == name)) |
| { |
| return i; |
| } |
| } |
| } |
| |
| return RuleBasedCollator::UNMAPPED; |
| } |
| |
| // look for the contracting list entry with the beginning char |
| VectorOfPToContractElement* |
| RuleBasedCollator::getContractValues(UChar ch) const |
| { |
| int32_t index = ucmp32_get(data->mapping, ch); |
| return getContractValues(index - CONTRACTCHARINDEX); |
| } |
| |
| // look for the contracting list entry with the index |
| VectorOfPToContractElement* |
| RuleBasedCollator::getContractValues(int32_t index) const |
| { |
| if (data->contractTable != NULL) |
| { |
| if (index >= 0) |
| { |
| return data->contractTable->at(index); |
| } |
| } |
| return NULL; |
| } |
| |
| /** |
| * Return the maximum length of any expansion sequences that end |
| * with the specified comparison order. |
| * |
| * @param order a collation order returned by previous or next. |
| * @return the maximum length of any expansion seuences ending |
| * with the specified order. |
| * |
| * @see CollationElementIterator#getMaxExpansion |
| */ |
| int32_t RuleBasedCollator::getMaxExpansion(int32_t order) const |
| { |
| int32_t result = 1; |
| |
| if (data->expandTable != NULL) |
| { |
| // Right now this does a linear search through the entire |
| // expandsion table. If a collator had a large number of expansions, |
| // this could cause a performance problem, but in practice that |
| // rarely happens |
| int32_t i; |
| for (i = 0; i < data->expandTable->size(); i += 1) |
| { |
| VectorOfInt *valueList = data->expandTable->at(i); |
| int32_t length = valueList->size(); |
| |
| if (length > result && valueList->at(length-1) == order) |
| { |
| result = length; |
| } |
| } |
| } |
| |
| return result; |
| } |
| |
| /** |
| * Get the entry of hash table of the expanding string in the collation |
| * table. |
| * @param offset the index of the expanding string value list |
| */ |
| VectorOfInt *RuleBasedCollator::getExpandValueList(int32_t order) const |
| { |
| return data->expandTable->at(order - EXPANDCHARINDEX); |
| } |
| |
| |
| |
| void RuleBasedCollatorStreamer::streamIn(RuleBasedCollator* collator, UMemoryStream* is, UErrorCode& status) |
| { |
| if (!uprv_mstrm_error(is) && U_SUCCESS(status)) { |
| // Check that this is the correct file type |
| int16_t id; |
| |
| uprv_mstrm_read(is, &id, sizeof(id)); |
| if (id != collator->FILEID) |
| { |
| // This isn't the right type of file. Mark the ios |
| // as failing and return. |
| uprv_mstrm_setError(is); // force the stream to set its error flag |
| return; |
| } |
| |
| // Stream in large objects |
| char isNull; |
| |
| uprv_mstrm_read(is, &isNull, sizeof(isNull)); |
| if (isNull) |
| { |
| delete collator->data; |
| collator->data = NULL; |
| status = U_MISSING_RESOURCE_ERROR; |
| } |
| else |
| { |
| if (collator->data == NULL) |
| { |
| collator->data = new TableCollationData; |
| } |
| |
| collator->data->streamIn(is, status); |
| if (collator->data->isBogus()) { |
| uprv_mstrm_setError(is); // force the stream to set its error flag |
| status = U_MISSING_RESOURCE_ERROR; |
| return; |
| } |
| } |
| |
| // Verify that the end marker is present |
| uprv_mstrm_read(is, &id, sizeof(id)); |
| if (id != collator->FILEID) |
| { |
| // This isn't the right type of file. Mark the ios |
| // as failing and return. |
| uprv_mstrm_setError(is); // force the stream to set its error flag |
| status = U_MISSING_RESOURCE_ERROR; |
| return; |
| } |
| |
| // Reset other data members |
| collator->isOverIgnore = FALSE; |
| collator->lastChar = 0; |
| delete collator->mPattern; |
| collator->mPattern = 0; |
| collator->key.remove(); |
| collator->dataIsOwned = TRUE; |
| } |
| } |
| |
| void RuleBasedCollatorStreamer::streamOut(const RuleBasedCollator* collator, UMemoryStream* os) |
| { |
| if (!uprv_mstrm_error(os)) |
| { |
| // We use a 16-bit ID code to identify this file. |
| int16_t id = collator->FILEID; |
| uprv_mstrm_write(os, (uint8_t *)&id, sizeof(id)); |
| |
| // Stream out the data |
| char isNull; |
| isNull = (collator->data == 0); |
| uprv_mstrm_write(os, (uint8_t*)&isNull, sizeof(isNull)); |
| |
| if (!isNull) |
| { |
| collator->data->streamOut(os); |
| } |
| |
| // Write out the ID to indicate the end |
| uprv_mstrm_write(os, (uint8_t *)&id, sizeof(id)); |
| } |
| } |
| |
| void RuleBasedCollatorStreamer::streamIn(RuleBasedCollator* collator, FileStream* is) |
| { |
| if (!T_FileStream_error(is)) |
| { |
| // Check that this is the correct file type |
| int16_t id; |
| |
| T_FileStream_read(is, &id, sizeof(id)); |
| if (id != collator->FILEID) |
| { |
| // This isn't the right type of file. Mark the ios |
| // as failing and return. |
| T_FileStream_setError(is); // force the stream to set its error flag |
| return; |
| } |
| |
| // Stream in large objects |
| char isNull; |
| |
| T_FileStream_read(is, &isNull, sizeof(isNull)); |
| if (isNull) |
| { |
| delete collator->data; |
| collator->data = NULL; |
| } |
| else |
| { |
| if (collator->data == NULL) |
| { |
| collator->data = new TableCollationData; |
| } |
| |
| collator->data->streamIn(is); |
| if (collator->data->isBogus()) { |
| T_FileStream_setError(is); // force the stream to set its error flag |
| return; |
| } |
| } |
| |
| // Verify that the end marker is present |
| T_FileStream_read(is, &id, sizeof(id)); |
| if (id != collator->FILEID) |
| { |
| // This isn't the right type of file. Mark the ios |
| // as failing and return. |
| T_FileStream_setError(is); // force the stream to set its error flag |
| return; |
| } |
| |
| // Reset other data members |
| collator->isOverIgnore = FALSE; |
| collator->lastChar = 0; |
| delete collator->mPattern; |
| collator->mPattern = 0; |
| collator->key.remove(); |
| collator->dataIsOwned = TRUE; |
| } |
| } |
| |
| void RuleBasedCollatorStreamer::streamOut(const RuleBasedCollator* collator, FileStream* os) |
| { |
| if (!T_FileStream_error(os)) |
| { |
| // We use a 16-bit ID code to identify this file. |
| int16_t id = collator->FILEID; |
| T_FileStream_write(os, &id, sizeof(id)); |
| |
| // Stream out the data |
| char isNull; |
| isNull = (collator->data == 0); |
| T_FileStream_write(os, &isNull, sizeof(isNull)); |
| |
| if (!isNull) |
| { |
| collator->data->streamOut(os); |
| } |
| |
| // Write out the ID to indicate the end |
| T_FileStream_write(os, &id, sizeof(id)); |
| } |
| } |
| |
| UBool RuleBasedCollator::writeToFile(const char* fileName) const |
| { |
| FileStream* ofs = T_FileStream_open(fileName, "wb"); |
| if (ofs != 0) |
| { |
| RuleBasedCollatorStreamer::streamOut(this, ofs); |
| } |
| |
| #ifdef COLLDEBUG |
| fprintf(stderr, "binary write %s size %d %s\n", fileName, T_FileStream_size(ofs), |
| (!T_FileStream_error(ofs) ? ", OK" : ", FAIL")); |
| #endif |
| |
| UBool err = T_FileStream_error(ofs) == 0; |
| |
| T_FileStream_close(ofs); |
| return err; |
| } |
| /* |
| UBool RuleBasedCollator::prepareForBundle() const |
| { |
| UMemoryStream* ofs = uprv_mstrm_openNew(0); |
| if (ofs != 0) |
| { |
| RuleBasedCollatorStreamer::streamOut(this, ofs); |
| } |
| |
| #ifdef COLLDEBUG |
| fprintf(stderr, "binary write %s size %d %s\n", fileName, T_FileStream_size(ofs), |
| (!T_FileStream_error(ofs) ? ", OK" : ", FAIL")); |
| #endif |
| |
| UBool err = uprv_mstrm_error(ofs) == 0; |
| |
| uprv_mstrm_close(ofs); |
| |
| return err; |
| } |
| */ |
| |
| void RuleBasedCollator::addToCache(const UnicodeString& key) |
| { |
| // This method doesn't add the RuleBasedCollator itself to the cache. Instead, |
| // it adds the given RuleBasedCollator's data object to the TableCollationData |
| // cache, and marks it as non-owned in the given RuleBasedCollator object. |
| TableCollationData::addToCache(key, data); |
| dataIsOwned = FALSE; |
| } |
| |
| void |
| RuleBasedCollator::constructFromCache(const UnicodeString& key, |
| UErrorCode& status) |
| { |
| // Attempt to construct this RuleBasedCollator object from cached TableCollationData. |
| // If no such data is in the cache, return false. |
| if (U_FAILURE(status)) return; |
| if (dataIsOwned) |
| { |
| delete data; |
| data = NULL; |
| } |
| |
| isOverIgnore = FALSE; |
| lastChar = 0; |
| mPattern = 0; |
| setStrength(Collator::TERTIARY); |
| |
| dataIsOwned = FALSE; |
| data = TableCollationData::findInCache(key); |
| if (data == NULL) |
| { |
| status = U_MISSING_RESOURCE_ERROR; |
| } |
| } |
| |
| char* |
| RuleBasedCollator::createPathName( const UnicodeString& prefix, |
| const UnicodeString& name, |
| const UnicodeString& suffix) |
| { |
| // Concatenate three elements to form a file name, and return it. |
| |
| UnicodeString workingName(prefix); |
| int32_t size; |
| char* returnVal; |
| |
| workingName += name; |
| workingName += suffix; |
| |
| size = workingName.length(); |
| returnVal = new char[size + 1]; |
| workingName.extract(0, size, returnVal, ""); |
| returnVal[size] = 0; |
| |
| return returnVal; |
| } |
| |
| void |
| RuleBasedCollator::chopLocale(UnicodeString& localeName) |
| { |
| // chopLocale removes the final element from a locale string. |
| // For instance, "de_CH" becomes "de", and "de" becomes "". |
| // "" remains "". |
| |
| int32_t size = localeName.length(); |
| int32_t i; |
| |
| for (i = size - 1; i > 0; i--) |
| { |
| if (localeName[i] == 0x005F) |
| { |
| break; |
| } |
| } |
| |
| if (i < 0) |
| { |
| i = 0; |
| } |
| |
| localeName.remove(i, size - i); |
| } |
| |
| |
| uint8_t * |
| RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &status) |
| { |
| UMemoryStream *memdata = 0; |
| uint8_t *data = 0; |
| |
| if(U_FAILURE(status)) { |
| return NULL; |
| } |
| |
| memdata = uprv_mstrm_openNew(0); |
| |
| if (memdata != 0) { |
| RuleBasedCollatorStreamer::streamOut(this, memdata); |
| } |
| |
| UBool err = uprv_mstrm_error(memdata) == 0; |
| |
| |
| data = (uint8_t *)uprv_malloc(memdata->fPos); |
| if(data == 0) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| uprv_mstrm_close(memdata); |
| length = 0; |
| return 0; |
| } else { |
| uprv_memcpy(data, memdata->fStart, memdata->fPos); |
| length = memdata->fPos; |
| uprv_mstrm_close(memdata); |
| return data; |
| } |
| } |
| |
| void RuleBasedCollator::setAttribute(UColAttribute attr, UColAttributeValue value, UErrorCode &status) { |
| status = U_UNSUPPORTED_ERROR; |
| } |
| |
| UColAttributeValue RuleBasedCollator::getAttribute(UColAttribute attr, UErrorCode &status) { |
| status = U_UNSUPPORTED_ERROR; |
| return UCOL_ATTR_DEFAULT; |
| } |
| |
| Collator* RuleBasedCollator::safeClone(void) { |
| return 0; |
| } |
| |
| |
| Collator::EComparisonResult RuleBasedCollator::compare(ForwardCharacterIterator &source, |
| ForwardCharacterIterator &target) { |
| return EQUAL; |
| } |
| |
| int32_t RuleBasedCollator::getSortKey(const UnicodeString& source, |
| uint8_t *result, |
| int32_t resultLength) const { |
| return 0; |
| } |
| |
| int32_t RuleBasedCollator::getSortKey(const UChar *source, |
| int32_t sourceLength, |
| uint8_t *result, |
| int32_t resultLength) const { |
| return 0; |
| } |
| |
| //eof |