| /* |
| ******************************************************************************* |
| * Copyright (C) 1996-1999, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| */ |
| |
| /* |
| * File coleitr.cpp |
| * |
| * |
| * |
| * Created by: Helena Shih |
| * |
| * Modification History: |
| * |
| * Date Name Description |
| * |
| * 6/23/97 helena Adding comments to make code more readable. |
| * 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java |
| * 12/10/99 aliu Ported Thai collation support from Java. |
| * 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h) |
| * 02/19/01 swquek Removed CollationElementsIterator() since it is |
| * private constructor and no calls are made to it |
| */ |
| |
| // #include "unicode/sortkey.h" |
| #include "unicode/coleitr.h" |
| #include "ucolimp.h" |
| #include "cmemory.h" |
| |
| // #include "unicode/chariter.h" |
| // #include "tables.h" |
| // #include "unicode/normlzr.h" |
| // #include "unicode/unicode.h" |
| // #include "tcoldata.h" |
| // #include "ucmp32.h" |
| |
| /* Constants --------------------------------------------------------------- */ |
| |
| /* synwee : public can't remove */ |
| int32_t const CollationElementIterator::NULLORDER = 0xffffffff; |
| // int32_t const CollationElementIterator::UNMAPPEDCHARVALUE = 0x7fff0000; |
| int32_t const CollationElementIterator::NO_MORE_CES = 0x00010101; |
| |
| /* CollationElementIterator public constructor/destructor ------------------ */ |
| |
| CollationElementIterator::CollationElementIterator( |
| const CollationElementIterator& other) |
| : isDataOwned_(TRUE) |
| { |
| *this = other; |
| } |
| |
| CollationElementIterator::~CollationElementIterator() |
| { |
| if (isDataOwned_) |
| ucol_closeElements(m_data_); |
| } |
| |
| /* CollationElementIterator public methods --------------------------------- */ |
| |
| UTextOffset CollationElementIterator::getOffset() const |
| { |
| return ucol_getOffset(m_data_); |
| } |
| |
| /** |
| * Get the ordering priority of the next character in the string. |
| * @return the next character's ordering. Returns NULLORDER if an error has |
| * occured otherwise if the end of string has been reached, |
| * NO_MORE_CES is returned. |
| */ |
| int32_t CollationElementIterator::next(UErrorCode& status) |
| { |
| /* |
| if (text == NULL || U_FAILURE(status)) |
| return NULLORDER; |
| |
| // Update the decomposition mode if necessary. |
| text->setMode(orderAlias->getDecomposition()); |
| |
| if (bufferAlias != NULL) |
| { |
| // bufferAlias needs a bit of an explanation. |
| // When we hit an expanding character in the text, we call the order's |
| // getExpandValues method to retrieve an array of the orderings for all of |
| // the characters in the expansion (see the end of this method). |
| // The first ordering is returned, and an alias to the orderings array is |
| // saved so that the remaining orderings can be returned on subsequent calls |
| // to next. So, if the expanding buffer is not exhausted, all we have to do |
| // here is return the next ordering in the buffer. |
| if (expIndex < bufferAlias->size()) |
| return strengthOrder(bufferAlias->at(expIndex++)); |
| else |
| bufferAlias = NULL; |
| } |
| |
| // Gets the next character from the string using decomposition iterator. |
| UChar32 ch = text->current(); |
| text->next(); |
| |
| if (U_FAILURE(status)) |
| return NULLORDER; |
| |
| |
| if (ch == Normalizer::DONE) |
| return NULLORDER; |
| |
| // Ask the collator for this character's ordering. |
| // Used to be RuleBasedCollator.getUnicodeOrder(). |
| // It can't be inlined in tblcoll.h file unfortunately. |
| |
| int32_t value = ucmp32_get(orderAlias->data->mapping, ch); |
| |
| if (value == RuleBasedCollator::UNMAPPED) |
| { |
| // Returned an "unmapped" flag and save the character so it can be |
| // returned next time this method is called. |
| if (ch == 0x0000) |
| return ch; |
| // \u0000 is not valid in C++'s UnicodeString |
| ownBuffer->at(0) = UNMAPPEDCHARVALUE; |
| ownBuffer->at(1) = ch << 16; |
| bufferAlias = ownBuffer; |
| } |
| else |
| { |
| if (value >= RuleBasedCollator::CONTRACTCHARINDEX) |
| value = nextContractChar(ch, status); |
| if (value >= RuleBasedCollator::EXPANDCHARINDEX) |
| bufferAlias = orderAlias->getExpandValueList(value); |
| |
| if (isThaiPreVowel(ch)) |
| { |
| UChar32 consonant = text->current(); |
| text->next(); |
| if (isThaiBaseConsonant(consonant)) |
| bufferAlias = makeReorderedBuffer((UChar)consonant, value, bufferAlias, |
| TRUE, status); |
| else |
| text->previous(); |
| } |
| } |
| |
| if (bufferAlias != NULL) |
| { |
| expIndex = 1; |
| value = bufferAlias->at(0); |
| } |
| |
| return strengthOrder(value); |
| */ |
| return ucol_next(m_data_, &status); |
| } |
| |
| UBool CollationElementIterator::operator!=( |
| const CollationElementIterator& other) const |
| { |
| return !(*this == other); |
| } |
| |
| UBool CollationElementIterator::operator==( |
| const CollationElementIterator& that) const |
| { |
| if (this == &that) |
| return TRUE; |
| |
| /* |
| if (*text != *(that.text)) |
| return FALSE; |
| |
| if (((bufferAlias == NULL) != (that.bufferAlias == NULL)) || |
| (bufferAlias != NULL && *bufferAlias != *(that.bufferAlias))) |
| return FALSE; |
| |
| if (expIndex != that.expIndex) |
| return FALSE; |
| |
| if (orderAlias != that.orderAlias) |
| return FALSE; |
| |
| return TRUE; |
| */ |
| |
| return m_data_ == that.m_data_; |
| } |
| |
| /** |
| * Get the ordering priority of the previous collation element in the string. |
| * @param status the error code status. |
| * @return the previous element's ordering. Returns NULLORDER if an error has |
| * occured otherwise if the start of string has been reached, |
| * NO_MORE_CES is returned. |
| */ |
| int32_t CollationElementIterator::previous(UErrorCode& status) |
| { |
| /* |
| if (text == NULL || U_FAILURE(status)) |
| return NULLORDER; |
| |
| text->setMode(orderAlias->getDecomposition()); |
| |
| if (bufferAlias != NULL) |
| { |
| if (expIndex > 0) |
| return strengthOrder(bufferAlias->at(--expIndex)); |
| |
| bufferAlias = NULL; |
| } |
| |
| UChar32 ch = text->previous(); |
| |
| if (ch == Normalizer::DONE) |
| return NULLORDER; |
| |
| // Used to be RuleBasedCollator.getUnicodeOrder(). It can't be inlined in |
| // tblcoll.h file unfortunately. |
| |
| int32_t value = ucmp32_get(orderAlias->data->mapping, ch); |
| |
| if (value == RuleBasedCollator::UNMAPPED) |
| { |
| if (ch == 0x0000) |
| return ch; |
| |
| ownBuffer->at(0) = UNMAPPEDCHARVALUE; |
| ownBuffer->at(1) = ch << 16; |
| bufferAlias = ownBuffer; |
| } |
| else |
| { |
| if (value >= RuleBasedCollator::CONTRACTCHARINDEX) |
| value = prevContractChar(ch, status); |
| |
| if (value >= RuleBasedCollator::EXPANDCHARINDEX) |
| bufferAlias = orderAlias->getExpandValueList(value); |
| |
| if (isThaiBaseConsonant(ch)) |
| { |
| UChar32 vowel = text->previous(); |
| if (isThaiPreVowel(vowel)) |
| bufferAlias = makeReorderedBuffer((UChar)vowel, value, bufferAlias, |
| FALSE, status); |
| else |
| text->next(); |
| } |
| } |
| |
| if (bufferAlias != NULL) |
| { |
| expIndex = bufferAlias->size()-1; |
| value = bufferAlias->at(expIndex); |
| } |
| |
| return strengthOrder(value); |
| */ |
| return ucol_previous(m_data_, &status); |
| } |
| |
| /** |
| * Resets the cursor to the beginning of the string. |
| */ |
| void CollationElementIterator::reset() |
| { |
| /* |
| if (text != NULL) |
| { |
| text->reset(); |
| text->setMode(orderAlias->getDecomposition()); |
| } |
| |
| bufferAlias = NULL; |
| expIndex = 0; |
| */ |
| ucol_reset(m_data_); |
| } |
| |
| void CollationElementIterator::setOffset(UTextOffset newOffset, |
| UErrorCode& status) |
| { |
| /* |
| if (U_FAILURE(status)) |
| return; |
| |
| if (text != NULL) |
| text->setIndex(newOffset); |
| |
| bufferAlias = NULL; |
| */ |
| ucol_setOffset(m_data_, newOffset, &status); |
| } |
| |
| /** |
| * Sets the source to the new source string. |
| */ |
| void CollationElementIterator::setText(const UnicodeString& source, |
| UErrorCode& status) |
| { |
| if (U_FAILURE(status)) |
| return; |
| /* |
| bufferAlias = 0; |
| |
| if (text == NULL) |
| text = new Normalizer(source, orderAlias->getDecomposition()); |
| else |
| { |
| text->setText(source, status); |
| text->setMode(orderAlias->getDecomposition()); |
| } |
| */ |
| int32_t length = source.length(); |
| UChar *string = new UChar[length]; |
| source.extract(0, length, string); |
| |
| m_data_->length_ = length; |
| |
| if (m_data_->iteratordata_.isWritable && |
| m_data_->iteratordata_.string != NULL) |
| uprv_free(m_data_->iteratordata_.string); |
| init_collIterate(string, length, &m_data_->iteratordata_, TRUE); |
| } |
| |
| // Sets the source to the new character iterator. |
| void CollationElementIterator::setText(CharacterIterator& source, |
| UErrorCode& status) |
| { |
| if (U_FAILURE(status)) |
| return; |
| |
| /* |
| bufferAlias = 0; |
| |
| if (text == NULL) |
| text = new Normalizer(source, orderAlias->getDecomposition()); |
| else |
| { |
| text->setMode(orderAlias->getDecomposition()); |
| text->setText(source, status); |
| } |
| */ |
| int32_t length = source.getLength(); |
| UChar *buffer = new UChar[length]; |
| /* |
| Using this constructor will prevent buffer from being removed when |
| string gets removed |
| */ |
| UnicodeString string; |
| source.getText(string); |
| string.extract(0, length, buffer); |
| m_data_->length_ = length; |
| |
| if (m_data_->iteratordata_.isWritable && |
| m_data_->iteratordata_.string != NULL) |
| uprv_free(m_data_->iteratordata_.string); |
| init_collIterate(buffer, length, &m_data_->iteratordata_, TRUE); |
| } |
| |
| int32_t CollationElementIterator::strengthOrder(int32_t order) const |
| { |
| UCollationStrength s = ucol_getStrength(m_data_->collator_); |
| // Mask off the unwanted differences. |
| if (s == UCOL_PRIMARY) |
| order &= RuleBasedCollator::PRIMARYDIFFERENCEONLY; |
| else |
| if (s == UCOL_SECONDARY) |
| order &= RuleBasedCollator::SECONDARYDIFFERENCEONLY; |
| |
| return order; |
| } |
| |
| /* CollationElementIterator private constructors/destructors --------------- */ |
| |
| /* |
| This private method will never be called, but it makes the linker happy |
| CollationElementIterator::CollationElementIterator() : m_data_(0) |
| { |
| } |
| */ |
| |
| CollationElementIterator::CollationElementIterator( |
| const RuleBasedCollator* order) |
| : isDataOwned_(TRUE) |
| { |
| UErrorCode status = U_ZERO_ERROR; |
| m_data_ = ucol_openElements(order->ucollator, NULL, 0, &status); |
| } |
| |
| /** |
| * This is the "real" constructor for this class; it constructs an iterator |
| * over the source text using the specified collator |
| */ |
| CollationElementIterator::CollationElementIterator( |
| const UnicodeString& sourceText, |
| const RuleBasedCollator* order, |
| UErrorCode& status) |
| : isDataOwned_(TRUE) |
| { |
| if (U_FAILURE(status)) |
| return; |
| |
| /* |
| if ( sourceText.length() != 0 ) |
| { |
| // A CollationElementIterator is really a two-layered beast. |
| // Internally it uses a Normalizer to munge the source text into a form |
| // where all "composed" Unicode characters (such as ΓΌ) are split into a |
| // normal character and a combining accent character. |
| // Afterward, CollationElementIterator does its own processing to handle |
| // expanding and contracting collation sequences, ignorables, and so on. |
| |
| Normalizer::EMode decomp = (order->getStrength() == Collator::IDENTICAL) |
| ? Normalizer::NO_OP : order->getDecomposition(); |
| |
| text = new Normalizer(sourceText, decomp); |
| if (text == NULL) |
| status = U_MEMORY_ALLOCATION_ERROR; |
| } |
| */ |
| int32_t length = sourceText.length(); |
| UChar *string = new UChar[length]; |
| /* |
| Using this constructor will prevent buffer from being removed when |
| string gets removed |
| */ |
| sourceText.extract(0, length, string); |
| |
| m_data_ = ucol_openElements(order->ucollator, string, length, &status); |
| m_data_->iteratordata_.isWritable = TRUE; |
| } |
| |
| /** |
| * This is the "real" constructor for this class; it constructs an iterator over |
| * the source text using the specified collator |
| */ |
| CollationElementIterator::CollationElementIterator( |
| const CharacterIterator& sourceText, |
| const RuleBasedCollator* order, |
| UErrorCode& status) |
| : isDataOwned_(TRUE) |
| { |
| if (U_FAILURE(status)) |
| return; |
| |
| // **** should I just drop this test? **** |
| /* |
| if ( sourceText.endIndex() != 0 ) |
| { |
| // A CollationElementIterator is really a two-layered beast. |
| // Internally it uses a Normalizer to munge the source text into a form |
| // where all "composed" Unicode characters (such as ΓΌ) are split into a |
| // normal character and a combining accent character. |
| // Afterward, CollationElementIterator does its own processing to handle |
| // expanding and contracting collation sequences, ignorables, and so on. |
| |
| Normalizer::EMode decomp = order->getStrength() == Collator::IDENTICAL |
| ? Normalizer::NO_OP : order->getDecomposition(); |
| |
| text = new Normalizer(sourceText, decomp); |
| if (text == NULL) |
| status = U_MEMORY_ALLOCATION_ERROR; |
| } |
| */ |
| int32_t length = sourceText.getLength(); |
| UChar *buffer = new UChar[length]; |
| /* |
| Using this constructor will prevent buffer from being removed when |
| string gets removed |
| */ |
| UnicodeString string(buffer, length, length); |
| ((CharacterIterator &)sourceText).getText(string); |
| string.extract(0, length, buffer); |
| |
| m_data_ = ucol_openElements(order->ucollator, buffer, length, &status); |
| m_data_->iteratordata_.isWritable = TRUE; |
| } |
| |
| /* CollationElementIterator private methods -------------------------------- */ |
| |
| const CollationElementIterator& CollationElementIterator::operator=( |
| const CollationElementIterator& other) |
| { |
| if (this != &other) |
| { |
| /* |
| expIndex = other.expIndex; |
| delete text; |
| text = (Normalizer*)other.text->clone(); |
| |
| if (other.bufferAlias == other.ownBuffer) |
| { |
| *ownBuffer = *other.ownBuffer; |
| bufferAlias = ownBuffer; |
| } |
| else |
| if (other.bufferAlias != NULL && other.bufferAlias == other.reorderBuffer) |
| { |
| if (reorderBuffer == NULL) |
| reorderBuffer = new VectorOfInt(*other.reorderBuffer); |
| else |
| *reorderBuffer = *other.reorderBuffer; |
| |
| bufferAlias = reorderBuffer; |
| } |
| else |
| bufferAlias = other.bufferAlias; |
| |
| orderAlias = other.orderAlias; |
| */ |
| this->m_data_ = other.m_data_; |
| this->isDataOwned_ = FALSE; |
| } |
| |
| return *this; |
| } |
| |
| /** |
| * Get the ordering priority of the next contracting character in the |
| * string. |
| * @param ch the starting character of a contracting character token |
| * @return the next contracting character's ordering. Returns NULLORDER |
| * if the end of string is reached. |
| */ |
| /* |
| synwee : removed |
| int32_t |
| CollationElementIterator::nextContractChar(UChar32 ch, |
| UErrorCode& status) |
| { |
| // First get the ordering of this single character |
| VectorOfPToContractElement *list = orderAlias->getContractValues((UChar)ch); |
| EntryPair *pair = (EntryPair *)list->at(0); |
| int32_t order = pair->value; |
| |
| // Now iterate through the chars following it and |
| // look for the longest match |
| key.remove(); |
| key += ch; |
| |
| while ((ch = text->current()) != Normalizer::DONE) |
| { |
| if (U_FAILURE(status)) |
| { |
| return NULLORDER; |
| } |
| |
| key += ch; |
| |
| int32_t n = RuleBasedCollator::getEntry(list, key, TRUE); |
| |
| if (n == RuleBasedCollator::UNMAPPED) |
| { |
| break; |
| } |
| text->next(); |
| |
| pair = (EntryPair *)list->at(n); |
| order = pair->value; |
| } |
| |
| return order; |
| } |
| */ |
| |
| /** |
| * Get the ordering priority of the previous contracting character in the |
| * string. |
| * @param ch the starting character of a contracting character token |
| * @return the next contracting character's ordering. Returns NULLORDER |
| * if the end of string is reached. |
| */ |
| /* synwee : removed |
| int32_t CollationElementIterator::prevContractChar(UChar32 ch, |
| UErrorCode &status) |
| { |
| // First get the ordering of this single character |
| VectorOfPToContractElement *list = orderAlias->getContractValues((UChar)ch); |
| EntryPair *pair = (EntryPair *)list->at(0); |
| int32_t order = pair->value; |
| |
| // Now iterate through the chars following it and |
| // look for the longest match |
| key.remove(); |
| key += ch; |
| |
| while ((ch = text->previous()) != Normalizer::DONE) |
| { |
| key += ch; |
| |
| int32_t n = RuleBasedCollator::getEntry(list, key, FALSE); |
| |
| if (n == RuleBasedCollator::UNMAPPED) |
| { |
| ch = text->next(); |
| |
| if (U_FAILURE(status)) |
| { |
| return NULLORDER; |
| } |
| |
| break; |
| } |
| |
| pair = (EntryPair *)list->at(n); |
| order = pair->value; |
| } |
| |
| return order; |
| } |
| */ |
| |
| /** |
| * This method produces a buffer which contains the collation |
| * elements for the two characters, with colFirst's values preceding |
| * another character's. Presumably, the other character precedes colFirst |
| * in logical |
| * order (otherwise you wouldn't need this method would you?). |
| * The assumption is that the other char's value(s) have already been |
| * computed. If this char has a single element it is passed to this |
| * method as lastValue, and lastExpasion is null. If it has an |
| * expasion it is passed in lastExpansion, and colLastValue is ignored. |
| * This method may return the ownBuffer array as its value so ownBuffer |
| * had better not be in use anywhere else. |
| */ |
| /* |
| VectorOfInt* CollationElementIterator::makeReorderedBuffer(UChar colFirst, |
| int32_t lastValue, |
| VectorOfInt* lastExpansion, |
| UBool forward, |
| UErrorCode& status) { |
| |
| VectorOfInt* result; |
| |
| int32_t firstValue = ucmp32_get(orderAlias->data->mapping, colFirst); |
| if (firstValue >= RuleBasedCollator::CONTRACTCHARINDEX) { |
| firstValue = forward ? nextContractChar(colFirst, status) |
| : prevContractChar(colFirst, status); |
| } |
| |
| VectorOfInt* firstExpansion = NULL; |
| if (firstValue >= RuleBasedCollator::EXPANDCHARINDEX) { |
| firstExpansion = orderAlias->getExpandValueList(firstValue); |
| } |
| |
| if (!forward) { |
| int32_t temp1 = firstValue; |
| firstValue = lastValue; |
| lastValue = temp1; |
| VectorOfInt* temp2 = firstExpansion; |
| firstExpansion = lastExpansion; |
| lastExpansion = temp2; |
| } |
| |
| if (firstExpansion == NULL && lastExpansion == NULL) { |
| ownBuffer->at(0) = firstValue; |
| ownBuffer->at(1) = lastValue; |
| result = ownBuffer; |
| } |
| else { |
| int32_t firstLength = firstExpansion==NULL? 1 : firstExpansion->size(); |
| int32_t lastLength = lastExpansion==NULL? 1 : lastExpansion->size(); |
| if (reorderBuffer == NULL) { |
| reorderBuffer = new VectorOfInt(firstLength+lastLength); |
| } |
| // reorderdBuffer gets reused for the life of this object. |
| // Since its internal buffer only grows, there is a danger |
| // that it will get really, really big, and never shrink. If |
| // this is actually happening, insert code here to check for |
| // the condition. Something along the lines of: |
| //! else if (reorderBuffer->size() >= 256 && |
| //! (firstLength+lastLength) < 16) { |
| //! delete reorderBuffer; |
| //! reorderBuffer = new VectorOfInt(firstLength+lastLength); |
| //! } |
| // The specific numeric values need to be determined |
| // empirically. [aliu] |
| result = reorderBuffer; |
| |
| if (firstExpansion == NULL) { |
| result->atPut(0, firstValue); |
| } |
| else { |
| // System.arraycopy(firstExpansion, 0, result, 0, firstLength); |
| *result = *firstExpansion; |
| } |
| |
| if (lastExpansion == NULL) { |
| result->atPut(firstLength, lastValue); |
| } |
| else { |
| // System.arraycopy(lastExpansion, 0, result, firstLength, lastLength); |
| for (int32_t i=0; i<lastLength; ++i) { |
| result->atPut(firstLength + i, lastExpansion->at(i)); |
| } |
| } |
| result->setSize(firstLength+lastLength); |
| } |
| |
| return result; |
| } |
| */ |