| /* |
| ****************************************************************************** |
| * Copyright (C) 1996-2002, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ****************************************************************************** |
| * |
| * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/TrieIterator.java,v $ |
| * $Date: 2002/03/13 05:44:14 $ |
| * $Revision: 1.5 $ |
| * |
| ****************************************************************************** |
| */ |
| |
| package com.ibm.icu.impl; |
| |
| import com.ibm.icu.lang.UCharacter; |
| import com.ibm.icu.text.UTF16; |
| import com.ibm.icu.util.RangeValueIterator; |
| import com.ibm.icu.util.RangeValueIterator.*; |
| |
| /** |
| * <p>Class enabling iteration of the values in a Trie.</p> |
| * <p>Result of each iteration contains the interval of codepoints that have |
| * the same value type and the value type itself.</p> |
| * <p>The comparison of each codepoint value is done via extract(), which the |
| * default implementation is to return the value as it is.</p> |
| * <p>Method extract() can be overwritten to perform manipulations on |
| * codepoint values in order to perform specialized comparison.</p> |
| * <p>TrieIterator is designed to be a generic iterator for the CharTrie |
| * and the IntTrie, hence to accommodate both types of data, the return |
| * result will be in terms of int (32 bit) values.</p> |
| * <p>See com.ibm.icu.text.UCharacterTypeIterator for examples of use.</p> |
| * @author synwee |
| * @see com.ibm.icu.util.Trie |
| * @see com.ibm.icu.text.UCharacterTypeIterator |
| * @since release 2.1, Jan 17 2002 |
| */ |
| public class TrieIterator implements RangeValueIterator |
| |
| { |
| // public constructor --------------------------------------------- |
| |
| /** |
| * TrieEnumeration constructor |
| * @param trie to be used |
| * @exception IllegalArgumentException throw when argument is null. |
| * @draft 2.1 |
| */ |
| protected TrieIterator(Trie trie) |
| { |
| if (trie == null) { |
| throw new IllegalArgumentException( |
| "Argument trie cannot be null"); |
| } |
| m_trie_ = trie; |
| // synwee: check that extract belongs to the child class |
| m_initialValue_ = extract(m_trie_.getInitialValue()); |
| reset(); |
| } |
| |
| // public methods ------------------------------------------------- |
| |
| /** |
| * <p>Returns true if we are not at the end of the iteration, false |
| * otherwise.</p> |
| * <p>The next set of codepoints with the same value type will be |
| * calculated during this call and returned in the arguement element.</p> |
| * @param element return result |
| * @return true if we are not at the end of the iteration, false otherwise. |
| * @exception NoSuchElementException - if no more elements exist. |
| * @see #getStart() |
| * @see #getLimit() |
| * @see #getValue() |
| * @draft 2.1 |
| */ |
| public final boolean next(Element element) |
| { |
| if (m_nextCodepoint_ > UCharacter.MAX_VALUE) { |
| return false; |
| } |
| if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE && |
| calculateNextBMPElement(element)) { |
| return true; |
| } |
| calculateNextSupplementaryElement(element); |
| return true; |
| } |
| |
| /** |
| * Resets the iterator to the beginning of the iteration |
| * @draft 2.1 |
| */ |
| public final void reset() |
| { |
| m_currentCodepoint_ = 0; |
| m_nextCodepoint_ = 0; |
| m_nextIndex_ = 0; |
| m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_; |
| if (m_nextBlock_ == 0) { |
| m_nextValue_ = m_initialValue_; |
| } |
| else { |
| m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_)); |
| } |
| m_nextBlockIndex_ = 0; |
| m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_; |
| } |
| |
| // protected methods ---------------------------------------------- |
| |
| /** |
| * Called by next() to extracts a 32 bit value from a trie value |
| * used for comparison. |
| * This method is to be overwritten if special manipulation is to be done |
| * to retrieve a relevant comparison. |
| * The default function is to return the value as it is. |
| * @param value a value from the trie |
| * @return extracted value |
| * @draft 2.1 |
| */ |
| protected int extract(int value) |
| { |
| return value; |
| } |
| |
| // private methods ------------------------------------------------ |
| |
| /** |
| * Set the result values |
| * @param element return result object |
| * @param start codepoint of range |
| * @param limit (end + 1) codepoint of range |
| * @param value common value of range |
| */ |
| private final void setResult(Element element, int start, int limit, |
| int value) |
| { |
| element.start = start; |
| element.limit = limit; |
| element.value = value; |
| } |
| |
| /** |
| * Finding the next element. |
| * This method is called just before returning the result of |
| * next(). |
| * We always store the next element before it is requested. |
| * In the case that we have to continue calculations into the |
| * supplementary planes, a false will be returned. |
| * @param element return result object |
| * @return true if the next range is found, false if we have to proceed to |
| * the supplementary range. |
| */ |
| private final boolean calculateNextBMPElement(Element element) |
| { |
| int currentBlock = m_nextBlock_; |
| int currentValue = m_nextValue_; |
| m_currentCodepoint_ = m_nextCodepoint_; |
| m_nextCodepoint_ ++; |
| m_nextBlockIndex_ ++; |
| if (!checkBlockDetail(currentValue)) { |
| setResult(element, m_currentCodepoint_, m_nextCodepoint_, |
| currentValue); |
| return true; |
| } |
| // synwee check that next block index == 0 here |
| // enumerate BMP - the main loop enumerates data blocks |
| while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) { |
| m_nextIndex_ ++; |
| // because of the way the character is split to form the index |
| // the lead surrogate and trail surrogate can not be in the |
| // mid of a block |
| if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) { |
| // skip lead surrogate code units, |
| // go to lead surrogate codepoints |
| m_nextIndex_ = BMP_INDEX_LENGTH_; |
| } |
| else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) { |
| // go back to regular BMP code points |
| m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_; |
| } |
| |
| m_nextBlockIndex_ = 0; |
| if (!checkBlock(currentBlock, currentValue)) { |
| setResult(element, m_currentCodepoint_, m_nextCodepoint_, |
| currentValue); |
| return true; |
| } |
| } |
| m_nextCodepoint_ --; // step one back since this value has not been |
| m_nextBlockIndex_ --; // retrieved yet. |
| return false; |
| } |
| |
| /** |
| * Finds the next supplementary element. |
| * For each entry in the trie, the value to be delivered is passed through |
| * extract(). |
| * We always store the next element before it is requested. |
| * Called after calculateNextBMP() completes its round of BMP characters. |
| * There is a slight difference in the usage of m_currentCodepoint_ |
| * here as compared to calculateNextBMP(). Though both represents the |
| * lower bound of the next element, in calculateNextBMP() it gets set |
| * at the start of any loop, where-else, in calculateNextSupplementary() |
| * since m_currentCodepoint_ already contains the lower bound of the |
| * next element (passed down from calculateNextBMP()), we keep it till |
| * the end before resetting it to the new value. |
| * Note, if there are no more iterations, it will never get to here. |
| * Blocked out by next(). |
| * @param element return result object |
| * @draft 2.1 |
| */ |
| private final void calculateNextSupplementaryElement(Element element) |
| { |
| int currentValue = m_nextValue_; |
| int currentBlock = m_nextBlock_; |
| m_nextCodepoint_ ++; |
| m_nextBlockIndex_ ++; |
| |
| if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) { |
| setResult(element, m_currentCodepoint_, m_nextCodepoint_, |
| currentValue); |
| m_currentCodepoint_ = m_nextCodepoint_; |
| return; |
| } |
| // we have cleared one block |
| m_nextIndex_ ++; |
| m_nextTrailIndexOffset_ ++; |
| if (!checkTrailBlock(currentBlock, currentValue)) { |
| setResult(element, m_currentCodepoint_, m_nextCodepoint_, |
| currentValue); |
| m_currentCodepoint_ = m_nextCodepoint_; |
| return; |
| } |
| |
| int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_); |
| // enumerate supplementary code points |
| while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) { |
| // lead surrogate access |
| int leadBlock = |
| m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] << |
| Trie.INDEX_STAGE_2_SHIFT_; |
| if (leadBlock == m_trie_.m_dataOffset_) { |
| // no entries for a whole block of lead surrogates |
| nextLead += DATA_BLOCK_LENGTH_; |
| // number of total affected supplementary codepoints in one |
| // block |
| m_nextCodepoint_ += DATA_BLOCK_SUPPLEMENTARY_LENGTH_; |
| continue; |
| } |
| if (m_trie_.m_dataManipulate_ == null) { |
| throw new NullPointerException( |
| "The field DataManipulate in this Trie is null"); |
| } |
| // enumerate trail surrogates for this lead surrogate |
| m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset( |
| m_trie_.getValue(leadBlock + |
| (nextLead & Trie.INDEX_STAGE_3_MASK_))); |
| if (m_nextIndex_ <= 0) { |
| // no data for this lead surrogate |
| if (currentValue != m_initialValue_) { |
| m_nextValue_ = m_initialValue_; |
| m_nextBlock_ = 0; |
| m_nextBlockIndex_ = 0; |
| setResult(element, m_currentCodepoint_, m_nextCodepoint_, |
| currentValue); |
| m_currentCodepoint_ = m_nextCodepoint_; |
| return; |
| } |
| m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_; |
| } else { |
| m_nextTrailIndexOffset_ = 0; |
| if (!checkTrailBlock(currentBlock, currentValue)) { |
| setResult(element, m_currentCodepoint_, m_nextCodepoint_, |
| currentValue); |
| m_currentCodepoint_ = m_nextCodepoint_; |
| return; |
| } |
| } |
| nextLead ++; |
| } |
| |
| // deliver last range |
| setResult(element, m_currentCodepoint_, UCharacter.MAX_VALUE + 1, |
| currentValue); |
| } |
| |
| /** |
| * Internal block value calculations |
| * Performs calculations on a data block to find codepoints in m_nextBlock_ |
| * after the index m_nextBlockIndex_ that has the same value. |
| * Note m_*_ variables at this point is the next codepoint whose value |
| * has not been calculated. |
| * But when returned with false, it will be the last codepoint whose |
| * value has been calculated. |
| * @param currentValue the value which other codepoints are tested against |
| * @return true if the whole block has the same value as currentValue or if |
| * the whole block has been calculated, false otherwise. |
| */ |
| private final boolean checkBlockDetail(int currentValue) |
| { |
| while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) { |
| m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_ + |
| m_nextBlockIndex_)); |
| if (m_nextValue_ != currentValue) { |
| return false; |
| } |
| ++ m_nextBlockIndex_; |
| ++ m_nextCodepoint_; |
| } |
| return true; |
| } |
| |
| /** |
| * Internal block value calculations |
| * Performs calculations on a data block to find codepoints in m_nextBlock_ |
| * that has the same value. |
| * Will call checkBlockDetail() if highlevel check fails. |
| * Note m_*_ variables at this point is the next codepoint whose value |
| * has not been calculated. |
| * @param currentBlock the initial block containing all currentValue |
| * @param currentValue the value which other codepoints are tested against |
| * @return true if the whole block has the same value as currentValue or if |
| * the whole block has been calculated, false otherwise. |
| */ |
| private final boolean checkBlock(int currentBlock, int currentValue) |
| { |
| m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] << |
| Trie.INDEX_STAGE_2_SHIFT_; |
| if (m_nextBlock_ == currentBlock && |
| (m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) { |
| // the block is the same as the previous one, filled with |
| // currentValue |
| m_nextCodepoint_ += DATA_BLOCK_LENGTH_; |
| } |
| else if (m_nextBlock_ == 0) { |
| // this is the all-initial-value block |
| if (currentValue != m_initialValue_) { |
| m_nextValue_ = m_initialValue_; |
| m_nextBlockIndex_ = 0; |
| return false; |
| } |
| m_nextCodepoint_ += DATA_BLOCK_LENGTH_; |
| } |
| else { |
| if (!checkBlockDetail(currentValue)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| /** |
| * Internal block value calculations |
| * Performs calculations on multiple data blocks for a set of trail |
| * surrogates to find codepoints in m_nextBlock_ that has the same value. |
| * Will call checkBlock() for internal block checks. |
| * Note m_*_ variables at this point is the next codepoint whose value |
| * has not been calculated. |
| * @param currentBlock the initial block containing all currentValue |
| * @param currentValue the value which other codepoints are tested against |
| * @return true if the whole block has the same value as currentValue or if |
| * the whole block has been calculated, false otherwise. |
| */ |
| private final boolean checkTrailBlock(int currentBlock, |
| int currentValue) |
| { |
| // enumerate code points for this lead surrogate |
| while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_) |
| { |
| // if we ever reach here, we are at the start of a new block |
| m_nextBlockIndex_ = 0; |
| // copy of most of the body of the BMP loop |
| if (!checkBlock(currentBlock, currentValue)) { |
| return false; |
| } |
| m_nextTrailIndexOffset_ ++; |
| m_nextIndex_ ++; |
| } |
| return true; |
| } |
| |
| /** |
| * Checks if we are beginning at the start of a initial block. |
| * If we are then the rest of the codepoints in this initial block |
| * has the same values. |
| * We increment m_nextCodepoint_ and relevant data members if so. |
| * This is used only in for the supplementary codepoints because |
| * the offset to the trail indexes could be 0. |
| * @return true if we are at the start of a initial block. |
| */ |
| private final boolean checkNullNextTrailIndex() |
| { |
| if (m_nextIndex_ <= 0) { |
| m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1; |
| int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_); |
| int leadBlock = |
| m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] << |
| Trie.INDEX_STAGE_2_SHIFT_; |
| if (m_trie_.m_dataManipulate_ == null) { |
| throw new NullPointerException( |
| "The field DataManipulate in this Trie is null"); |
| } |
| m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset( |
| m_trie_.getValue(leadBlock + |
| (nextLead & Trie.INDEX_STAGE_3_MASK_))); |
| m_nextIndex_ --; |
| m_nextBlockIndex_ = DATA_BLOCK_LENGTH_; |
| return true; |
| } |
| return false; |
| } |
| |
| // private data members -------------------------------------------- |
| |
| /** |
| * Size of the stage 1 BMP indexes |
| */ |
| private static final int BMP_INDEX_LENGTH_ = |
| 0x10000 >> Trie.INDEX_STAGE_1_SHIFT_; |
| /** |
| * Lead surrogate minimum value |
| */ |
| private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800; |
| /** |
| * Trail surrogate minimum value |
| */ |
| private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00; |
| /** |
| * Trail surrogate maximum value |
| */ |
| private static final int TRAIL_SURROGATE_MAX_VALUE_ = 0xDFFF; |
| /** |
| * Number of trail surrogate |
| */ |
| private static final int TRAIL_SURROGATE_COUNT_ = 0x400; |
| /** |
| * Number of stage 1 indexes for supplementary calculations that maps to |
| * each lead surrogate character. |
| * See second pass into getRawOffset for the trail surrogate character. |
| * 10 for significant number of bits for trail surrogates, 5 for what we |
| * discard during shifting. |
| */ |
| private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ = |
| 1 << (10 - Trie.INDEX_STAGE_1_SHIFT_); |
| /** |
| * Number of data values in a stage 2 (data array) block. |
| */ |
| private static final int DATA_BLOCK_LENGTH_ = |
| 1 << Trie.INDEX_STAGE_1_SHIFT_; |
| /** |
| * Number of codepoints in a stage 2 block |
| */ |
| private static final int DATA_BLOCK_SUPPLEMENTARY_LENGTH_ = |
| DATA_BLOCK_LENGTH_ << 10; |
| /** |
| * Trie instance |
| */ |
| private Trie m_trie_; |
| /** |
| * Initial value for trie values |
| */ |
| private int m_initialValue_; |
| /** |
| * Next element results and data. |
| */ |
| private int m_currentCodepoint_; |
| private int m_nextCodepoint_; |
| private int m_nextValue_; |
| private int m_nextIndex_; |
| private int m_nextBlock_; |
| private int m_nextBlockIndex_; |
| private int m_nextTrailIndexOffset_; |
| /** |
| * This is the return result element |
| */ |
| private int m_start_; |
| private int m_limit_; |
| private int m_value_; |
| } |