| /* |
| ***************************************************************************************** |
| * * |
| * COPYRIGHT: * |
| * (C) Copyright Taligent, Inc., 1996 * |
| * (C) Copyright International Business Machines Corporation, 1996-1998 * |
| * Licensed Material - Program-Property of IBM - All Rights Reserved. * |
| * US Government Users Restricted Rights - Use, duplication, or disclosure * |
| * restricted by GSA ADP Schedule Contract with IBM Corp. * |
| * * |
| ***************************************************************************************** |
| */ |
| //============================================================================= |
| // |
| // File coleitr.cpp |
| // |
| // |
| // |
| // Created by: Helena Shih |
| // |
| // Modification History: |
| // |
| // Date Name Description |
| // |
| // 6/23/97 helena Adding comments to make code more readable. |
| // 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java |
| //============================================================================= |
| |
| #include "sortkey.h" |
| #include "coleitr.h" |
| |
| #include "chariter.h" |
| #include "tables.h" |
| #include "normlzr.h" |
| #include "unicode.h" |
| |
| int32_t const CollationElementIterator::NULLORDER = 0xffffffff; |
| int32_t const CollationElementIterator::UNMAPPEDCHARVALUE = 0x7fff0000; |
| |
| |
| // This private method will never be called, but it makes the linker happy |
| |
| CollationElementIterator::CollationElementIterator() |
| : expIndex(0), |
| text(0), |
| swapOrder(0), |
| bufferAlias(0), |
| orderAlias(0) |
| { |
| } |
| |
| // This private method will never be called, but it makes the linker happy |
| |
| CollationElementIterator::CollationElementIterator(const RuleBasedCollator* order) |
| : expIndex(0), |
| text(0), |
| bufferAlias(0), |
| swapOrder(0), |
| orderAlias(order) |
| { |
| } |
| |
| // This is the "real" constructor for this class; it constructs an iterator |
| // over the source text using the specified collator |
| CollationElementIterator::CollationElementIterator( const UnicodeString& sourceText, |
| const RuleBasedCollator* order, |
| UErrorCode& status) |
| : expIndex(0), |
| swapOrder(0), |
| text(NULL), |
| bufferAlias(NULL), |
| orderAlias(order) |
| { |
| if (FAILURE(status)) { |
| return; |
| } |
| |
| if ( sourceText.size() != 0 ) { |
| // |
| // A CollationElementIterator is really a two-layered beast. |
| // Internally it uses a Normalizer to munge the source text |
| // into a form where all "composed" Unicode characters (such as ü) are |
| // split into a normal character and a combining accent character. |
| // Afterward, CollationElementIterator does its own processing to handle |
| // expanding and contracting collation sequences, ignorables, and so on. |
| // |
| Normalizer::EMode decomp = (order->getStrength() == Collator::IDENTICAL) |
| ? Normalizer::NO_OP |
| : order->getDecomposition(); |
| |
| text = new Normalizer(sourceText, decomp); |
| if (text == NULL) { |
| status = MEMORY_ALLOCATION_ERROR; |
| } |
| } |
| } |
| |
| |
| // This is the "real" constructor for this class; it constructs an iterator |
| // over the source text using the specified collator |
| CollationElementIterator::CollationElementIterator( const CharacterIterator& sourceText, |
| const RuleBasedCollator* order, |
| UErrorCode& status) |
| : expIndex(0), |
| swapOrder(0), |
| text(NULL), |
| bufferAlias(NULL), |
| orderAlias(order) |
| { |
| if (FAILURE(status)) { |
| return; |
| } |
| |
| // **** should I just drop this test? **** |
| if ( sourceText.endIndex() != 0 ) |
| { |
| // |
| // A CollationElementIterator is really a two-layered beast. |
| // Internally it uses a Normalizer to munge the source text |
| // into a form where all "composed" Unicode characters (such as ü) are |
| // split into a normal character and a combining accent character. |
| // Afterward, CollationElementIterator does its own processing to handle |
| // expanding and contracting collation sequences, ignorables, and so on. |
| // |
| Normalizer::EMode decomp = order->getStrength() == Collator::IDENTICAL |
| ? Normalizer::NO_OP |
| : order->getDecomposition(); |
| |
| text = new Normalizer(sourceText, decomp); |
| if (text == NULL) { |
| status = MEMORY_ALLOCATION_ERROR; |
| } |
| } |
| } |
| |
| CollationElementIterator::CollationElementIterator(const CollationElementIterator& other) |
| : expIndex(other.expIndex), text(0), swapOrder(other.swapOrder) |
| { |
| text = (Normalizer*) other.text->clone(); |
| bufferAlias = other.bufferAlias; |
| orderAlias = other.orderAlias; |
| } |
| |
| const CollationElementIterator& |
| CollationElementIterator::operator=(const CollationElementIterator& other) |
| { |
| if (this != &other) |
| { |
| expIndex = other.expIndex; |
| swapOrder = other.swapOrder; |
| |
| delete text; |
| text = (Normalizer*)other.text->clone(); |
| |
| bufferAlias = other.bufferAlias; |
| orderAlias = other.orderAlias; |
| } |
| |
| return *this; |
| } |
| |
| CollationElementIterator::~CollationElementIterator() |
| { |
| delete text; |
| text = NULL; |
| bufferAlias = NULL; |
| orderAlias = NULL; |
| } |
| |
| bool_t |
| CollationElementIterator::operator==(const CollationElementIterator& that) const |
| { |
| if (this == &that) |
| { |
| return TRUE; |
| } |
| |
| if (*text != *(that.text)) |
| { |
| return FALSE; |
| } |
| |
| if (swapOrder != that.swapOrder) |
| { |
| return FALSE; |
| } |
| |
| if (*bufferAlias != *(that.bufferAlias)) |
| { |
| return FALSE; |
| } |
| |
| if (expIndex != that.expIndex) |
| { |
| return FALSE; |
| } |
| |
| if (orderAlias != that.orderAlias) |
| { |
| return FALSE; |
| } |
| |
| return TRUE; |
| } |
| |
| bool_t |
| CollationElementIterator::operator!=(const CollationElementIterator& other) const |
| { |
| return !(*this == other); |
| } |
| |
| /** |
| * Resets the cursor to the beginning of the string. |
| */ |
| void |
| CollationElementIterator::reset() |
| { |
| if (text != NULL) |
| { |
| text->reset(); |
| text->setMode(orderAlias->getDecomposition()); |
| } |
| |
| bufferAlias = NULL; |
| expIndex = 0; |
| swapOrder = 0; |
| } |
| |
| // Sets the source to the new source string. |
| void |
| CollationElementIterator::setText(const UnicodeString& source, |
| UErrorCode& status) |
| { |
| if (FAILURE(status)) |
| { |
| return; |
| } |
| |
| bufferAlias = 0; |
| swapOrder = 0; |
| expIndex = 0; |
| |
| if (text == NULL) |
| { |
| text = new Normalizer(source, orderAlias->getDecomposition()); |
| } |
| else |
| { |
| text->setText(source, status); |
| text->setMode(orderAlias->getDecomposition()); |
| } |
| } |
| |
| // Sets the source to the new character iterator. |
| void |
| CollationElementIterator::setText(CharacterIterator& source, |
| UErrorCode& status) |
| { |
| if (FAILURE(status)) { |
| return; |
| } |
| |
| bufferAlias = 0; |
| swapOrder = 0; |
| expIndex = 0; |
| |
| if (text == NULL) { |
| text = new Normalizer(source, orderAlias->getDecomposition()); |
| } |
| else |
| { |
| text->setMode(orderAlias->getDecomposition()); |
| text->setText(source, status); |
| } |
| } |
| |
| /** |
| * Get the ordering priority of the next character in the string. |
| * @return the next character's ordering. Returns NULLORDER if |
| * the end of string is reached. |
| */ |
| int32_t |
| CollationElementIterator::next(UErrorCode& status) |
| { |
| if (text == NULL || FAILURE(status)) |
| { |
| return NULLORDER; |
| } |
| |
| // Update the decomposition mode if necessary. |
| text->setMode(orderAlias->getDecomposition()); |
| |
| if (bufferAlias != NULL) |
| { |
| // bufferAlias needs a bit of an explanation. |
| // When we hit an expanding character in the text, we call the order's |
| // getExpandValues method to retrieve an array of the orderings for all |
| // of the characters in the expansion (see the end of this method). |
| // The first ordering is returned, and an alias to the orderings array |
| // is saved so that the remaining orderings can be returned on subsequent |
| // calls to next. So, if the expanding buffer is not exhausted, |
| // all we have to do here is return the next ordering in the buffer. |
| if (expIndex < bufferAlias->size()) |
| { |
| return strengthOrder(bufferAlias->at(expIndex++)); |
| } |
| else |
| { |
| bufferAlias = NULL; |
| expIndex = 0; |
| } |
| } |
| else if (swapOrder != 0) |
| { |
| // If we find a character with no order, we return the marking |
| // flag, UNMAPPEDCHARVALUE, 0x7fff0000, and then the character |
| // itself shifted left 16 bits as orders. At this point, the |
| // UNMAPPEDCHARVALUE flag has already been returned by the code |
| // below, so just return the shifted character here. |
| int32_t order = swapOrder << 16; |
| |
| swapOrder = 0; |
| |
| return order; |
| } |
| |
| // Gets the next character from the string using decomposition iterator. |
| UChar ch = text->current(); |
| text->next(); |
| |
| if (FAILURE(status)) |
| { |
| return NULLORDER; |
| } |
| |
| if (ch == Normalizer::DONE) |
| { |
| return NULLORDER; |
| } |
| |
| // Ask the collator for this character's ordering. |
| int32_t value = orderAlias->getUnicodeOrder(ch); |
| |
| if (value == RuleBasedCollator::UNMAPPED) |
| { |
| // Returned an "unmapped" flag and save the character so it can be |
| // returned next time this method is called. |
| if (ch == 0x0000) return ch; |
| swapOrder = ch; // \u0000 is not valid in C++'s UnicodeString |
| return UNMAPPEDCHARVALUE; |
| } |
| |
| if (value >= RuleBasedCollator::CONTRACTCHARINDEX) |
| { |
| value = nextContractChar(ch, status); |
| } |
| |
| if (value >= RuleBasedCollator::EXPANDCHARINDEX) |
| { |
| bufferAlias = orderAlias->getExpandValueList(value); |
| expIndex = 0; |
| value = bufferAlias->at(expIndex++); |
| } |
| |
| return strengthOrder(value); |
| } |
| |
| /** |
| * Get the ordering priority of the previous collation element in the string. |
| * @param status the error code status. |
| * @return the previous element's ordering. Returns NULLORDER if |
| * the beginning of string is reached. |
| */ |
| int32_t |
| CollationElementIterator::previous(UErrorCode& status) |
| { |
| if (text == NULL || FAILURE(status)) |
| { |
| return NULLORDER; |
| } |
| |
| text->setMode(orderAlias->getDecomposition()); |
| |
| if (bufferAlias != NULL) |
| { |
| if (expIndex > 0) |
| { |
| return strengthOrder(bufferAlias->at(--expIndex)); |
| } |
| |
| bufferAlias = NULL; |
| expIndex = 0; |
| } |
| else if (swapOrder != 0) |
| { |
| int32_t order = swapOrder << 16; |
| |
| swapOrder = 0; |
| return order; |
| } |
| |
| UChar ch = text->previous(); |
| |
| if (ch == Normalizer::DONE) |
| { |
| return NULLORDER; |
| } |
| |
| int32_t value = orderAlias->getUnicodeOrder(ch); |
| |
| if (value == RuleBasedCollator::UNMAPPED) |
| { |
| if (ch == 0x0000) return ch; |
| swapOrder = UNMAPPEDCHARVALUE; |
| return ch; |
| } |
| |
| if (value >= RuleBasedCollator::CONTRACTCHARINDEX) |
| { |
| value = prevContractChar(ch, status); |
| } |
| |
| if (value >= RuleBasedCollator::EXPANDCHARINDEX) |
| { |
| bufferAlias = orderAlias->getExpandValueList(value); |
| expIndex = bufferAlias->size(); |
| value = bufferAlias->at(--expIndex); |
| } |
| |
| return strengthOrder(value); |
| } |
| |
| int32_t |
| CollationElementIterator::strengthOrder(int32_t order) const |
| { |
| Collator::ECollationStrength s = orderAlias->getStrength(); |
| // Mask off the unwanted differences. |
| if (s == Collator::PRIMARY) |
| { |
| order &= RuleBasedCollator::PRIMARYDIFFERENCEONLY; |
| } else if (s == Collator::SECONDARY) |
| { |
| order &= RuleBasedCollator::SECONDARYDIFFERENCEONLY; |
| } |
| return order; |
| } |
| |
| UTextOffset |
| CollationElementIterator::getOffset() const |
| { |
| // Since the DecompositionIterator is doing the work of iterating through |
| // the text string, we can just ask it what its offset is. |
| return (text != NULL) ? text->getIndex() : 0; |
| } |
| |
| void |
| CollationElementIterator::setOffset(UTextOffset newOffset, |
| UErrorCode& status) |
| { |
| if (FAILURE(status)) |
| { |
| return; |
| } |
| |
| if (text != NULL) |
| { |
| text->setIndex(newOffset); |
| } |
| |
| bufferAlias = NULL; |
| expIndex = 0; |
| swapOrder = 0; |
| } |
| |
| //============================================================ |
| // privates |
| //============================================================ |
| |
| |
| /** |
| * Get the ordering priority of the next contracting character in the |
| * string. |
| * @param ch the starting character of a contracting character token |
| * @return the next contracting character's ordering. Returns NULLORDER |
| * if the end of string is reached. |
| */ |
| int32_t |
| CollationElementIterator::nextContractChar(UChar ch, |
| UErrorCode& status) |
| { |
| // First get the ordering of this single character |
| VectorOfPToContractElement *list = orderAlias->getContractValues(ch); |
| EntryPair *pair = (EntryPair *)list->at(0); |
| int32_t order = pair->value; |
| |
| // Now iterate through the chars following it and |
| // look for the longest match |
| key.remove(); |
| key += ch; |
| |
| while ((ch = text->current()) != Normalizer::DONE) |
| { |
| if (FAILURE(status)) |
| { |
| return NULLORDER; |
| } |
| |
| key += ch; |
| |
| int32_t n = RuleBasedCollator::getEntry(list, key, TRUE); |
| |
| if (n == RuleBasedCollator::UNMAPPED) |
| { |
| break; |
| } |
| text->next(); |
| |
| pair = (EntryPair *)list->at(n); |
| order = pair->value; |
| } |
| |
| return order; |
| } |
| |
| /** |
| * Get the ordering priority of the previous contracting character in the |
| * string. |
| * @param ch the starting character of a contracting character token |
| * @return the next contracting character's ordering. Returns NULLORDER |
| * if the end of string is reached. |
| */ |
| int32_t CollationElementIterator::prevContractChar(UChar ch, |
| UErrorCode &status) |
| { |
| // First get the ordering of this single character |
| VectorOfPToContractElement *list = orderAlias->getContractValues(ch); |
| EntryPair *pair = (EntryPair *)list->at(0); |
| int32_t order = pair->value; |
| |
| // Now iterate through the chars following it and |
| // look for the longest match |
| key.remove(); |
| key += ch; |
| |
| while ((ch = text->previous()) != Normalizer::DONE) |
| { |
| key += ch; |
| |
| int32_t n = RuleBasedCollator::getEntry(list, key, FALSE); |
| |
| if (n == RuleBasedCollator::UNMAPPED) |
| { |
| ch = text->next(); |
| |
| if (FAILURE(status)) |
| { |
| return NULLORDER; |
| } |
| |
| break; |
| } |
| |
| pair = (EntryPair *)list->at(n); |
| order = pair->value; |
| } |
| |
| return order; |
| } |