blob: 450e3e5566df03535a8ab518624bc01f18187ffc [file] [log] [blame]
/**
*******************************************************************************
* Copyright (C) 1996-2005, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*
*******************************************************************************
*/
package com.ibm.icu.text;
/***
* import java.text.StringCharacterIterator;
* import java.text.CharacterIterator;
*/
import com.ibm.icu.impl.NormalizerImpl;
import com.ibm.icu.impl.UCharacterProperty;
import com.ibm.icu.impl.StringUCharacterIterator;
import com.ibm.icu.impl.CharacterIteratorWrapper;
import com.ibm.icu.impl.ICUDebug;
import com.ibm.icu.lang.UCharacter;
import java.text.CharacterIterator;
import java.util.MissingResourceException;
/**
* <p><code>CollationElementIterator</code> is an iterator created by
* a RuleBasedCollator to walk through a string. The return result of
* each iteration is a 32-bit collation element that defines the
* ordering priority of the next character or sequence of characters
* in the source string.</p>
*
* <p>For illustration, consider the following in Spanish:
* <blockquote>
* <pre>
* "ca" -> the first collation element is collation_element('c') and second
* collation element is collation_element('a').
*
* Since "ch" in Spanish sorts as one entity, the below example returns one
* collation element for the two characters 'c' and 'h'
*
* "cha" -> the first collation element is collation_element('ch') and second
* collation element is collation_element('a').
* </pre>
* </blockquote>
* And in German,
* <blockquote>
* <pre>
* Since the character '&#230;' is a composed character of 'a' and 'e', the
* iterator returns two collation elements for the single character '&#230;'
*
* "&#230;b" -> the first collation element is collation_element('a'), the
* second collation element is collation_element('e'), and the
* third collation element is collation_element('b').
* </pre>
* </blockquote>
* </p>
*
* <p>For collation ordering comparison, the collation element results
* can not be compared simply by using basic arithmetric operators,
* e.g. &lt;, == or &gt;, further processing has to be done. Details
* can be found in the ICU
* <a href="http://icu.sourceforge.net/userguide/Collate_ServiceArchitecture.html">
* user guide</a>. An example of using the CollationElementIterator
* for collation ordering comparison is the class
* <a href=StringSearch.html> com.ibm.icu.text.StringSearch</a>.</p>
*
* <p>To construct a CollationElementIterator object, users
* call the method getCollationElementIterator() on a
* RuleBasedCollator that defines the desired sorting order.</p>
*
* <p> Example:
* <blockquote>
* <pre>
* String testString = "This is a test";
* RuleBasedCollator rbc = new RuleBasedCollator("&amp;a&lt;b");
* CollationElementIterator iterator = rbc.getCollationElementIterator(testString);
* int primaryOrder = iterator.IGNORABLE;
* while (primaryOrder != iterator.NULLORDER) {
* int order = iterator.next();
* if (order != iterator.IGNORABLE &&
* order != iterator.NULLORDER) {
* // order is valid, not ignorable and we have not passed the end
* // of the iteration, we do something
* primaryOrder = CollationElementIterator.primaryOrder(order);
* System.out.println("Next primary order 0x" +
* Integer.toHexString(primaryOrder));
* }
* }
* </pre>
* </blockquote>
* </p>
* <p>
* This class is not subclassable
* </p>
* @see Collator
* @see RuleBasedCollator
* @see StringSearch
* @author Syn Wee Quek
* @stable ICU 2.8
*/
public final class CollationElementIterator
{
// public data members --------------------------------------------------
/**
* <p>This constant is returned by the iterator in the methods
* next() and previous() when the end or the beginning of the
* source string has been reached, and there are no more valid
* collation elements to return.</p>
*
* <p>See class documentation for an example of use.</p>
* @stable ICU 2.8
* @see #next
* @see #previous */
public final static int NULLORDER = 0xffffffff;
/**
* <p>This constant is returned by the iterator in the methods
* next() and previous() when a collation element result is to be
* ignored.</p>
*
* <p>See class documentation for an example of use.</p>
* @stable ICU 2.8
* @see #next
* @see #previous */
public static final int IGNORABLE = 0;
// public methods -------------------------------------------------------
// public getters -------------------------------------------------------
/**
* <p>Returns the character offset in the source string
* corresponding to the next collation element. I.e., getOffset()
* returns the position in the source string corresponding to the
* collation element that will be returned by the next call to
* next(). This value could be any of:
* <ul>
* <li> The index of the <b>first</b> character corresponding to
* the next collation element. (This means that if
* <code>setOffset(offset)</code> sets the index in the middle of
* a contraction, <code>getOffset()</code> returns the index of
* the first character in the contraction, which may not be equal
* to the original offset that was set. Hence calling getOffset()
* immediately after setOffset(offset) does not guarantee that the
* original offset set will be returned.)
* <li> If normalization is on, the index of the <b>immediate</b>
* subsequent character, or composite character with the first
* character, having a combining class of 0.
* <li> The length of the source string, if iteration has reached
* the end.
*</ul>
* </p>
* @return The character offset in the source string corresponding to the
* collation element that will be returned by the next call to
* next().
* @stable ICU 2.8
*/
public int getOffset()
{
if (m_bufferOffset_ != -1) {
if (m_isForwards_) {
return m_FCDLimit_;
}
return m_FCDStart_;
}
return m_source_.getIndex();
}
/**
* <p> Returns the maximum length of any expansion sequence that ends with
* the specified collation element. If there is no expansion with this
* collation element as the last element, returns 1.
* </p>
* @param ce a collation element returned by previous() or next().
* @return the maximum length of any expansion sequence ending
* with the specified collation element.
* @stable ICU 2.8
*/
public int getMaxExpansion(int ce)
{
int start = 0;
int limit = m_collator_.m_expansionEndCE_.length;
long unsignedce = ce & 0xFFFFFFFFl;
while (start < limit - 1) {
int mid = start + ((limit - start) >> 1);
long midce = m_collator_.m_expansionEndCE_[mid] & 0xFFFFFFFFl;
if (unsignedce <= midce) {
limit = mid;
}
else {
start = mid;
}
}
int result = 1;
if (m_collator_.m_expansionEndCE_[start] == ce) {
result = m_collator_.m_expansionEndCEMaxSize_[start];
}
else if (limit < m_collator_.m_expansionEndCE_.length &&
m_collator_.m_expansionEndCE_[limit] == ce) {
result = m_collator_.m_expansionEndCEMaxSize_[limit];
}
else if ((ce & 0xFFFF) == 0x00C0) {
result = 2;
}
return result;
}
// public other methods -------------------------------------------------
/**
* <p> Resets the cursor to the beginning of the string. The next
* call to next() or previous() will return the first and last
* collation element in the string, respectively.</p>
*
* <p>If the RuleBasedCollator used by this iterator has had its
* attributes changed, calling reset() will reinitialize the
* iterator to use the new attributes.</p>
*
* @stable ICU 2.8
*/
public void reset()
{
m_source_.setToStart();
updateInternalState();
}
/**
* <p>Get the next collation element in the source string.</p>
*
* <p>This iterator iterates over a sequence of collation elements
* that were built from the string. Because there isn't
* necessarily a one-to-one mapping from characters to collation
* elements, this doesn't mean the same thing as "return the
* collation element [or ordering priority] of the next character
* in the string".</p>
*
* <p>This function returns the collation element that the
* iterator is currently pointing to, and then updates the
* internal pointer to point to the next element. Previous()
* updates the pointer first, and then returns the element. This
* means that when you change direction while iterating (i.e.,
* call next() and then call previous(), or call previous() and
* then call next()), you'll get back the same element twice.</p>
*
* @return the next collation element or NULLORDER if the end of the
* iteration has been reached.
* @stable ICU 2.8
*/
public int next()
{
m_isForwards_ = true;
if (m_CEBufferSize_ > 0) {
if (m_CEBufferOffset_ < m_CEBufferSize_) {
// if there are expansions left in the buffer, we return it
return m_CEBuffer_[m_CEBufferOffset_ ++];
}
m_CEBufferSize_ = 0;
m_CEBufferOffset_ = 0;
}
int ch_int = nextChar();
if (ch_int == UCharacterIterator.DONE) {
return NULLORDER;
}
char ch = (char)ch_int;
if (m_collator_.m_isHiragana4_) {
m_isCodePointHiragana_ = (ch >= 0x3040 && ch <= 0x309e)
&& !(ch > 0x3094 && ch < 0x309d);
}
int result = NULLORDER;
if (ch <= 0xFF) {
// For latin-1 characters we never need to fall back to the UCA
// table because all of the UCA data is replicated in the
// latinOneMapping array
result = m_collator_.m_trie_.getLatin1LinearValue(ch);
if (RuleBasedCollator.isSpecial(result)) {
result = nextSpecial(m_collator_, result, ch);
}
}
else {
result = m_collator_.m_trie_.getLeadValue(ch);
//System.out.println(Integer.toHexString(result));
if (RuleBasedCollator.isSpecial(result)) {
// surrogate leads are handled as special ces
result = nextSpecial(m_collator_, result, ch);
}
if (result == CE_NOT_FOUND_ && RuleBasedCollator.UCA_ != null) {
// couldn't find a good CE in the tailoring
// if we got here, the codepoint MUST be over 0xFF - so we look
// directly in the UCA
result = RuleBasedCollator.UCA_.m_trie_.getLeadValue(ch);
if (RuleBasedCollator.isSpecial(result)) {
// UCA also gives us a special CE
result = nextSpecial(RuleBasedCollator.UCA_, result, ch);
}
}
}
if(result == CE_NOT_FOUND_) {
// maybe there is no UCA, unlikely in Java, but ported for consistency
result = nextImplicit(ch);
}
return result;
}
/**
* <p>Get the previous collation element in the source string.</p>
*
* <p>This iterator iterates over a sequence of collation elements
* that were built from the string. Because there isn't
* necessarily a one-to-one mapping from characters to collation
* elements, this doesn't mean the same thing as "return the
* collation element [or ordering priority] of the previous
* character in the string".</p>
*
* <p>This function updates the iterator's internal pointer to
* point to the collation element preceding the one it's currently
* pointing to and then returns that element, while next() returns
* the current element and then updates the pointer. This means
* that when you change direction while iterating (i.e., call
* next() and then call previous(), or call previous() and then
* call next()), you'll get back the same element twice.</p>
*
* @return the previous collation element, or NULLORDER when the start of
* the iteration has been reached.
* @stable ICU 2.8
*/
public int previous()
{
if (m_source_.getIndex() <= 0 && m_isForwards_) {
// if iterator is new or reset, we can immediate perform backwards
// iteration even when the offset is not right.
m_source_.setToLimit();
updateInternalState();
}
m_isForwards_ = false;
int result = NULLORDER;
if (m_CEBufferSize_ > 0) {
if (m_CEBufferOffset_ > 0) {
return m_CEBuffer_[-- m_CEBufferOffset_];
}
m_CEBufferSize_ = 0;
m_CEBufferOffset_ = 0;
}
int ch_int = previousChar();
if (ch_int == UCharacterIterator.DONE) {
return NULLORDER;
}
char ch = (char)ch_int;
if (m_collator_.m_isHiragana4_) {
m_isCodePointHiragana_ = (ch >= 0x3040 && ch <= 0x309f);
}
if (m_collator_.isContractionEnd(ch) && !isBackwardsStart()) {
result = previousSpecial(m_collator_, CE_CONTRACTION_, ch);
}
else {
if (ch <= 0xFF) {
result = m_collator_.m_trie_.getLatin1LinearValue(ch);
}
else {
result = m_collator_.m_trie_.getLeadValue(ch);
}
if (RuleBasedCollator.isSpecial(result)) {
result = previousSpecial(m_collator_, result, ch);
}
if (result == CE_NOT_FOUND_) {
if (!isBackwardsStart()
&& m_collator_.isContractionEnd(ch)) {
result = CE_CONTRACTION_;
}
else {
if(RuleBasedCollator.UCA_ != null) {
result = RuleBasedCollator.UCA_.m_trie_.getLeadValue(ch);
}
}
if (RuleBasedCollator.isSpecial(result)) {
if(RuleBasedCollator.UCA_ != null) {
result = previousSpecial(RuleBasedCollator.UCA_, result, ch);
}
}
}
}
if(result == CE_NOT_FOUND_) {
result = previousImplicit(ch);
}
return result;
}
/**
* Return the primary order of the specified collation element,
* i.e. the first 16 bits. This value is unsigned.
* @param ce the collation element
* @return the element's 16 bits primary order.
* @stable ICU 2.8
*/
public final static int primaryOrder(int ce)
{
return (ce & RuleBasedCollator.CE_PRIMARY_MASK_)
>>> RuleBasedCollator.CE_PRIMARY_SHIFT_;
}
/**
* Return the secondary order of the specified collation element,
* i.e. the 16th to 23th bits, inclusive. This value is unsigned.
* @param ce the collation element
* @return the element's 8 bits secondary order
* @stable ICU 2.8
*/
public final static int secondaryOrder(int ce)
{
return (ce & RuleBasedCollator.CE_SECONDARY_MASK_)
>> RuleBasedCollator.CE_SECONDARY_SHIFT_;
}
/**
* Return the tertiary order of the specified collation element, i.e. the last
* 8 bits. This value is unsigned.
* @param ce the collation element
* @return the element's 8 bits tertiary order
* @stable ICU 2.8
*/
public final static int tertiaryOrder(int ce)
{
return ce & RuleBasedCollator.CE_TERTIARY_MASK_;
}
/**
* <p> Sets the iterator to point to the collation element
* corresponding to the character at the specified offset. The
* value returned by the next call to next() will be the collation
* element corresponding to the characters at offset.</p>
*
* <p>If offset is in the middle of a contracting character
* sequence, the iterator is adjusted to the start of the
* contracting sequence. This means that getOffset() is not
* guaranteed to return the same value set by this method.</p>
*
* <p>If the decomposition mode is on, and offset is in the middle
* of a decomposible range of source text, the iterator may not
* return a correct result for the next forwards or backwards
* iteration. The user must ensure that the offset is not in the
* middle of a decomposible range.</p>
*
* @param offset the character offset into the original source string to
* set. Note that this is not an offset into the corresponding
* sequence of collation elements.
* @stable ICU 2.8
*/
public void setOffset(int offset)
{
m_source_.setIndex(offset);
int ch_int = m_source_.current();
char ch = (char)ch_int;
if (ch_int != UCharacterIterator.DONE && m_collator_.isUnsafe(ch)) {
// if it is unsafe we need to check if it is part of a contraction
// or a surrogate character
if (UTF16.isTrailSurrogate(ch)) {
// if it is a surrogate pair we move up one character
char prevch = (char)m_source_.previous();
if (!UTF16.isLeadSurrogate(prevch)) {
m_source_.setIndex(offset); // go back to the same index
}
}
else {
// could be part of a contraction
// backup to a safe point and iterate till we pass offset
while (m_source_.getIndex() > 0) {
if (!m_collator_.isUnsafe(ch)) {
break;
}
ch = (char)m_source_.previous();
}
updateInternalState();
int prevoffset = 0;
while (m_source_.getIndex() <= offset) {
prevoffset = m_source_.getIndex();
next();
}
m_source_.setIndex(prevoffset);
}
}
updateInternalState();
// direction code to prevent next and previous from returning a
// character if we are already at the ends
offset = m_source_.getIndex();
if (offset == 0/* m_source_.getBeginIndex() */) {
// preventing previous() from returning characters from the end of
// the string again if we are at the beginning
m_isForwards_ = false;
}
else if (offset == m_source_.getLength()) {
// preventing next() from returning characters from the start of
// the string again if we are at the end
m_isForwards_ = true;
}
}
/**
* <p>Set a new source string for iteration, and reset the offset
* to the beginning of the text.</p>
*
* @param source the new source string for iteration.
* @stable ICU 2.8
*/
public void setText(String source)
{
m_srcUtilIter_.setText(source);
m_source_ = m_srcUtilIter_;
updateInternalState();
}
/**
* <p>Set a new source string iterator for iteration, and reset the
* offset to the beginning of the text.
* </p>
* <p>The source iterator's integrity will be preserved since a new copy
* will be created for use.</p>
* @param source the new source string iterator for iteration.
* @stable ICU 2.8
*/
public void setText(UCharacterIterator source)
{
m_srcUtilIter_.setText(source.getText());
m_source_ = m_srcUtilIter_;
updateInternalState();
}
/**
* <p>Set a new source string iterator for iteration, and reset the
* offset to the beginning of the text.
* </p>
* @param source the new source string iterator for iteration.
* @stable ICU 2.8
*/
public void setText(CharacterIterator source)
{
m_source_ = new CharacterIteratorWrapper(source);
m_source_.setToStart();
updateInternalState();
}
// public miscellaneous methods -----------------------------------------
/**
* Tests that argument object is equals to this CollationElementIterator.
* Iterators are equal if the objects uses the same RuleBasedCollator,
* the same source text and have the same current position in iteration.
* @param that object to test if it is equals to this
* CollationElementIterator
* @stable ICU 2.8
*/
public boolean equals(Object that)
{
if (that == this) {
return true;
}
if (that instanceof CollationElementIterator) {
CollationElementIterator thatceiter
= (CollationElementIterator)that;
if (!m_collator_.equals(thatceiter.m_collator_)) {
return false;
}
// checks the text
return m_source_.getIndex() == thatceiter.m_source_.getIndex()
&& m_source_.getText().equals(
thatceiter.m_source_.getText());
}
return false;
}
// package private constructors ------------------------------------------
/**
* <p>CollationElementIterator constructor. This takes a source
* string and a RuleBasedCollator. The iterator will walk through
* the source string based on the rules defined by the
* collator. If the source string is empty, NULLORDER will be
* returned on the first call to next().</p>
*
* @param source the source string.
* @param collator the RuleBasedCollator
* @stable ICU 2.8
*/
CollationElementIterator(String source, RuleBasedCollator collator)
{
m_srcUtilIter_ = new StringUCharacterIterator(source);
m_utilStringBuffer_ = new StringBuffer();
m_source_ = m_srcUtilIter_;
m_collator_ = collator;
m_CEBuffer_ = new int[CE_BUFFER_INIT_SIZE_];
m_buffer_ = new StringBuffer();
m_utilSpecialBackUp_ = new Backup();
updateInternalState();
}
/**
* <p>CollationElementIterator constructor. This takes a source
* character iterator and a RuleBasedCollator. The iterator will
* walk through the source string based on the rules defined by
* the collator. If the source string is empty, NULLORDER will be
* returned on the first call to next().</p>
*
* @param source the source string iterator.
* @param collator the RuleBasedCollator
* @stable ICU 2.8
*/
CollationElementIterator(CharacterIterator source,
RuleBasedCollator collator)
{
m_srcUtilIter_ = new StringUCharacterIterator();
m_utilStringBuffer_ = new StringBuffer();
m_source_ = new CharacterIteratorWrapper(source);
m_collator_ = collator;
m_CEBuffer_ = new int[CE_BUFFER_INIT_SIZE_];
m_buffer_ = new StringBuffer();
m_utilSpecialBackUp_ = new Backup();
updateInternalState();
}
/**
* <p>CollationElementIterator constructor. This takes a source
* character iterator and a RuleBasedCollator. The iterator will
* walk through the source string based on the rules defined by
* the collator. If the source string is empty, NULLORDER will be
* returned on the first call to next().</p>
*
* @param source the source string iterator.
* @param collator the RuleBasedCollator
* @stable ICU 2.8
*/
CollationElementIterator(UCharacterIterator source,
RuleBasedCollator collator)
{
m_srcUtilIter_ = new StringUCharacterIterator();
m_utilStringBuffer_ = new StringBuffer();
m_srcUtilIter_.setText(source.getText());
m_source_ = m_srcUtilIter_;
m_collator_ = collator;
m_CEBuffer_ = new int[CE_BUFFER_INIT_SIZE_];
m_buffer_ = new StringBuffer();
m_utilSpecialBackUp_ = new Backup();
updateInternalState();
}
// package private data members -----------------------------------------
/**
* true if current codepoint was Hiragana
*/
boolean m_isCodePointHiragana_;
/**
* Position in the original string that starts with a non-FCD sequence
*/
int m_FCDStart_;
/**
* This is the CE from CEs buffer that should be returned.
* Initial value is 0.
* Forwards iteration will end with m_CEBufferOffset_ == m_CEBufferSize_,
* backwards will end with m_CEBufferOffset_ == 0.
* The next/previous after we reach the end/beginning of the m_CEBuffer_
* will cause this value to be reset to 0.
*/
int m_CEBufferOffset_;
/**
* This is the position to which we have stored processed CEs.
* Initial value is 0.
* The next/previous after we reach the end/beginning of the m_CEBuffer_
* will cause this value to be reset to 0.
*/
int m_CEBufferSize_;
static final int CE_NOT_FOUND_ = 0xF0000000;
static final int CE_EXPANSION_TAG_ = 1;
static final int CE_CONTRACTION_TAG_ = 2;
/**
* Collate Digits As Numbers (CODAN) implementation
*/
static final int CE_DIGIT_TAG_ = 13;
// package private methods ----------------------------------------------
/**
* Sets the collator used.
* Internal use, all data members will be reset to the default values
* @param collator to set
*/
void setCollator(RuleBasedCollator collator)
{
m_collator_ = collator;
updateInternalState();
}
/**
* <p>Sets the iterator to point to the collation element corresponding to
* the specified character (the parameter is a CHARACTER offset in the
* original string, not an offset into its corresponding sequence of
* collation elements). The value returned by the next call to next()
* will be the collation element corresponding to the specified position
* in the text. Unlike the public method setOffset(int), this method does
* not try to readjust the offset to the start of a contracting sequence.
* getOffset() is guaranteed to return the same value as was passed to a
* preceding call to setOffset().</p>
* @param offset new character offset into the original text to set.
*/
void setExactOffset(int offset)
{
m_source_.setIndex(offset);
updateInternalState();
}
/**
* Checks if iterator is in the buffer zone
* @return true if iterator is in buffer zone, false otherwise
*/
boolean isInBuffer()
{
return m_bufferOffset_ > 0;
}
/**
* <p>Sets the iterator to point to the collation element corresponding to
* the specified character (the parameter is a CHARACTER offset in the
* original string, not an offset into its corresponding sequence of
* collation elements). The value returned by the next call to next()
* will be the collation element corresponding to the specified position
* in the text. Unlike the public method setOffset(int), this method does
* not try to readjust the offset to the start of a contracting sequence.
* getOffset() is guaranteed to return the same value as was passed to a
* preceding call to setOffset().</p>
* </p>
* @param source the new source string iterator for iteration.
* @param offset to the source
*/
void setText(UCharacterIterator source, int offset)
{
m_srcUtilIter_.setText(source.getText());
m_source_ = m_srcUtilIter_;
m_source_.setIndex(offset);
updateInternalState();
}
// private inner class --------------------------------------------------
/**
* Backup data class
*/
private static final class Backup
{
// protected data members -------------------------------------------
/**
* Backup non FCD sequence limit
*/
protected int m_FCDLimit_;
/**
* Backup non FCD sequence start
*/
protected int m_FCDStart_;
/**
* Backup if previous Codepoint is Hiragana quatenary
*/
protected boolean m_isCodePointHiragana_;
/**
* Backup buffer position
*/
protected int m_bufferOffset_;
/**
* Backup source iterator offset
*/
protected int m_offset_;
/**
* Backup buffer contents
*/
protected StringBuffer m_buffer_;
// protected constructor --------------------------------------------
/**
* Empty constructor
*/
protected Backup()
{
m_buffer_ = new StringBuffer();
}
}
// end inner class ------------------------------------------------------
/**
* Direction of travel
*/
private boolean m_isForwards_;
/**
* Source string iterator
*/
private UCharacterIterator m_source_;
/**
* This is position to the m_buffer_, -1 if iterator is not in m_buffer_
*/
private int m_bufferOffset_;
/**
* Buffer for temporary storage of normalized characters, discontiguous
* characters and Thai characters
*/
private StringBuffer m_buffer_;
/**
* Position in the original string to continue forward FCD check from.
*/
private int m_FCDLimit_;
/**
* The collator this iterator is based on
*/
private RuleBasedCollator m_collator_;
/**
* true if Hiragana quatenary is on
*/
private boolean m_isHiragana4_;
/**
* CE buffer
*/
private int m_CEBuffer_[];
/**
* In reality we should not have to deal with expansion sequences longer
* then 16. However this value can be change if a bigger buffer is needed.
* Note, if the size is change to too small a number, BIG trouble.
* Reasonable small value is around 10, if there's no Arabic or other
* funky collations that have long expansion sequence. This is the longest
* expansion sequence this can handle without bombing out.
*/
private static final int CE_BUFFER_INIT_SIZE_ = 512;
/**
* Backup storage for special processing inner cases
*/
private Backup m_utilSpecialBackUp_;
/**
* Backup storage in special processing entry state
*/
private Backup m_utilSpecialEntryBackUp_;
/**
* Backup storage in special processing discontiguous state
*/
private Backup m_utilSpecialDiscontiguousBackUp_;
/**
* Utility
*/
private StringUCharacterIterator m_srcUtilIter_;
private StringBuffer m_utilStringBuffer_;
private StringBuffer m_utilSkippedBuffer_;
private CollationElementIterator m_utilColEIter_;
/**
* One character before the first non-zero combining class character
*/
private static final int FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_ = 0xC0;
/**
* One character before the first character with leading non-zero combining
* class
*/
private static final int LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_ = 0x300;
/**
* Mask for the last byte
*/
private static final int LAST_BYTE_MASK_ = 0xFF;
/**
* Shift value for the second last byte
*/
private static final int SECOND_LAST_BYTE_SHIFT_ = 8;
// special ce values and tags -------------------------------------------
private static final int CE_EXPANSION_ = 0xF1000000;
private static final int CE_CONTRACTION_ = 0xF2000000;
/**
* Indicates the last ce has been consumed. Compare with NULLORDER.
* NULLORDER is returned if error occurs.
*/
private static final int CE_NO_MORE_CES_ = 0x00010101;
private static final int CE_NO_MORE_CES_PRIMARY_ = 0x00010000;
private static final int CE_NO_MORE_CES_SECONDARY_ = 0x00000100;
private static final int CE_NO_MORE_CES_TERTIARY_ = 0x00000001;
private static final int CE_NOT_FOUND_TAG_ = 0;
/**
* Charset processing, not yet implemented
*/
private static final int CE_CHARSET_TAG_ = 4;
/**
* AC00-D7AF
*/
private static final int CE_HANGUL_SYLLABLE_TAG_ = 6;
/**
* D800-DBFF
*/
private static final int CE_LEAD_SURROGATE_TAG_ = 7;
/**
* DC00-DFFF
*/
private static final int CE_TRAIL_SURROGATE_TAG_ = 8;
/**
* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
*/
private static final int CE_CJK_IMPLICIT_TAG_ = 9;
private static final int CE_IMPLICIT_TAG_ = 10;
static final int CE_SPEC_PROC_TAG_ = 11;
/**
* This is a 3 byte primary with starting secondaries and tertiaries.
* It fits in a single 32 bit CE and is used instead of expansion to save
* space without affecting the performance (hopefully).
*/
private static final int CE_LONG_PRIMARY_TAG_ = 12;
private static final int CE_CE_TAGS_COUNT = 14;
private static final int CE_BYTE_COMMON_ = 0x05;
// end special ce values and tags ---------------------------------------
private static final int HANGUL_SBASE_ = 0xAC00;
private static final int HANGUL_LBASE_ = 0x1100;
private static final int HANGUL_VBASE_ = 0x1161;
private static final int HANGUL_TBASE_ = 0x11A7;
private static final int HANGUL_VCOUNT_ = 21;
private static final int HANGUL_TCOUNT_ = 28;
// CJK stuff ------------------------------------------------------------
private static final int CJK_BASE_ = 0x4E00;
private static final int CJK_LIMIT_ = 0x9FFF+1;
private static final int CJK_COMPAT_USED_BASE_ = 0xFA0E;
private static final int CJK_COMPAT_USED_LIMIT_ = 0xFA2F + 1;
private static final int CJK_A_BASE_ = 0x3400;
private static final int CJK_A_LIMIT_ = 0x4DBF + 1;
private static final int CJK_B_BASE_ = 0x20000;
private static final int CJK_B_LIMIT_ = 0x2A6DF + 1;
private static final int NON_CJK_OFFSET_ = 0x110000;
private static final boolean DEBUG = ICUDebug.enabled("collator");
// private methods ------------------------------------------------------
/**
* Reset the iterator internally
*/
private void updateInternalState()
{
m_isCodePointHiragana_ = false;
m_buffer_.setLength(0);
m_bufferOffset_ = -1;
m_CEBufferOffset_ = 0;
m_CEBufferSize_ = 0;
m_FCDLimit_ = -1;
m_FCDStart_ = m_source_.getLength();
m_isHiragana4_ = m_collator_.m_isHiragana4_;
m_isForwards_ = true;
}
/**
* Backup the current internal state
* @param backup object to store the data
*/
private void backupInternalState(Backup backup)
{
backup.m_offset_ = m_source_.getIndex();
backup.m_FCDLimit_ = m_FCDLimit_;
backup.m_FCDStart_ = m_FCDStart_;
backup.m_isCodePointHiragana_ = m_isCodePointHiragana_;
backup.m_bufferOffset_ = m_bufferOffset_;
backup.m_buffer_.setLength(0);
if (m_bufferOffset_ >= 0) {
// jdk 1.3.1 does not have append(StringBuffer) yet
if(ICUDebug.isJDK14OrHigher){
backup.m_buffer_.append(m_buffer_);
}else{
backup.m_buffer_.append(m_buffer_.toString());
}
}
}
/**
* Update the iterator internally with backed-up state
* @param backup object that stored the data
*/
private void updateInternalState(Backup backup)
{
m_source_.setIndex(backup.m_offset_);
m_isCodePointHiragana_ = backup.m_isCodePointHiragana_;
m_bufferOffset_ = backup.m_bufferOffset_;
m_FCDLimit_ = backup.m_FCDLimit_;
m_FCDStart_ = backup.m_FCDStart_;
m_buffer_.setLength(0);
if (m_bufferOffset_ >= 0) {
// jdk 1.3.1 does not have append(StringBuffer) yet
m_buffer_.append(backup.m_buffer_.toString());
}
}
/**
* A fast combining class retrieval system.
* @param ch UTF16 character
* @return combining class of ch
*/
private int getCombiningClass(int ch)
{
if (ch >= LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_ &&
m_collator_.isUnsafe((char)ch) || ch > 0xFFFF) {
return NormalizerImpl.getCombiningClass(ch);
}
return 0;
}
/**
* <p>Incremental normalization, this is an essential optimization.
* Assuming FCD checks has been done, normalize the non-FCD characters into
* the buffer.
* Source offsets points to the current processing character.
* </p>
*/
private void normalize()
{
int size = m_FCDLimit_ - m_FCDStart_;
m_buffer_.setLength(0);
m_source_.setIndex(m_FCDStart_);
for (int i = 0; i < size; i ++) {
m_buffer_.append((char)m_source_.next());
}
String decomp = Normalizer.decompose(m_buffer_.toString(), false);
m_buffer_.setLength(0);
m_buffer_.append(decomp);
m_bufferOffset_ = 0;
}
/**
* <p>Incremental FCD check and normalization. Gets the next base character
* position and determines if the in-between characters needs normalization.
* </p>
* <p>When entering, the state is known to be this:
* <ul>
* <li>We are working on source string, not the buffer.
* <li>The leading combining class from the current character is 0 or the
* trailing combining class of the previous char was zero.
* </ul>
* Incoming source offsets points to the current processing character.
* Return source offsets points to the current processing character.
* </p>
* @param ch current character
* @param offset current character offset
* @return true if FCDCheck passes, false otherwise
*/
private boolean FCDCheck(char ch, int offset)
{
boolean result = true;
// Get the trailing combining class of the current character.
// If it's zero, we are OK.
m_FCDStart_ = offset;
m_source_.setIndex(offset);
// trie access
char fcd = NormalizerImpl.getFCD16(ch);
if (fcd != 0 && UTF16.isLeadSurrogate(ch)) {
m_source_.next();
ch = (char)m_source_.current();
// UCharacterIterator.DONE has 0 fcd
if (UTF16.isTrailSurrogate(ch)) {
fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd, ch);
} else {
fcd = 0;
}
}
int prevTrailCC = fcd & LAST_BYTE_MASK_;
if (prevTrailCC != 0) {
// The current char has a non-zero trailing CC. Scan forward until
// we find a char with a leading cc of zero.
while (true) {
m_source_.next();
int ch_int = m_source_.current();
if (ch_int == UCharacterIterator.DONE) {
break;
}
ch = (char)ch_int;
// trie access
fcd = NormalizerImpl.getFCD16(ch);
if (fcd != 0 && UTF16.isLeadSurrogate(ch)) {
m_source_.next();
ch = (char)m_source_.current();
if (UTF16.isTrailSurrogate(ch)) {
fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd, ch);
} else {
fcd = 0;
}
}
int leadCC = fcd >>> SECOND_LAST_BYTE_SHIFT_;
if (leadCC == 0) {
// this is a base character, we stop the FCD checks
break;
}
if (leadCC < prevTrailCC) {
result = false;
}
prevTrailCC = fcd & LAST_BYTE_MASK_;
}
}
m_FCDLimit_ = m_source_.getIndex();
m_source_.setIndex(m_FCDStart_);
m_source_.next();
return result;
}
/**
* <p>Method tries to fetch the next character that is in fcd form.</p>
* <p>Normalization is done if required.</p>
* <p>Offsets are returned at the next character.</p>
* @return next fcd character
*/
private int nextChar()
{
int result;
// loop handles the next character whether it is in the buffer or not.
if (m_bufferOffset_ < 0) {
// we're working on the source and not normalizing. fast path.
// note Thai pre-vowel reordering uses buffer too
result = m_source_.current();
}
else {
// we are in the buffer, buffer offset will never be 0 here
if (m_bufferOffset_ >= m_buffer_.length()) {
// Null marked end of buffer, revert to the source string and
// loop back to top to try again to get a character.
m_source_.setIndex(m_FCDLimit_);
m_bufferOffset_ = -1;
m_buffer_.setLength(0);
return nextChar();
}
return m_buffer_.charAt(m_bufferOffset_ ++);
}
int startoffset = m_source_.getIndex();
if (result < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_
// Fast fcd safe path. trail combining class == 0.
|| m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION
|| m_bufferOffset_ >= 0 || m_FCDLimit_ > startoffset) {
// skip the fcd checks
m_source_.next();
return result;
}
if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
// We need to peek at the next character in order to tell if we are
// FCD
m_source_.next();
int next = m_source_.current();
if (next == UCharacterIterator.DONE
|| next < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
return result; // end of source string and if next character
// starts with a base character is always fcd.
}
}
// Need a more complete FCD check and possible normalization.
if (!FCDCheck((char)result, startoffset)) {
normalize();
result = m_buffer_.charAt(0);
m_bufferOffset_ = 1;
}
return result;
}
/**
* <p>Incremental normalization, this is an essential optimization.
* Assuming FCD checks has been done, normalize the non-FCD characters into
* the buffer.
* Source offsets points to the current processing character.</p>
*/
private void normalizeBackwards()
{
normalize();
m_bufferOffset_ = m_buffer_.length();
}
/**
* <p>Incremental backwards FCD check and normalization. Gets the previous
* base character position and determines if the in-between characters
* needs normalization.
* </p>
* <p>When entering, the state is known to be this:
* <ul>
* <li>We are working on source string, not the buffer.
* <li>The trailing combining class from the current character is 0 or the
* leading combining class of the next char was zero.
* </ul>
* Input source offsets points to the previous character.
* Return source offsets points to the current processing character.
* </p>
* @param ch current character
* @param offset current character offset
* @return true if FCDCheck passes, false otherwise
*/
private boolean FCDCheckBackwards(char ch, int offset)
{
boolean result = true;
char fcd = 0;
m_FCDLimit_ = offset + 1;
m_source_.setIndex(offset);
if (!UTF16.isSurrogate(ch)) {
fcd = NormalizerImpl.getFCD16(ch);
}
else if (UTF16.isTrailSurrogate(ch) && m_FCDLimit_ > 0) {
// note trail surrogate characters gets 0 fcd
char trailch = ch;
ch = (char)m_source_.previous();
if (UTF16.isLeadSurrogate(ch)) {
fcd = NormalizerImpl.getFCD16(ch);
if (fcd != 0) {
fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd,
trailch);
}
}
else {
fcd = 0; // unpaired surrogate
}
}
int leadCC = fcd >>> SECOND_LAST_BYTE_SHIFT_;
// The current char has a non-zero leading combining class.
// Scan backward until we find a char with a trailing cc of zero.
while (leadCC != 0) {
offset = m_source_.getIndex();
if (offset == 0) {
break;
}
ch = (char)m_source_.previous();
if (!UTF16.isSurrogate(ch)) {
fcd = NormalizerImpl.getFCD16(ch);
}
else if (UTF16.isTrailSurrogate(ch) && m_source_.getIndex() > 0) {
char trail = ch;
ch = (char)m_source_.previous();
if (UTF16.isLeadSurrogate(ch)) {
fcd = NormalizerImpl.getFCD16(ch);
}
if (fcd != 0) {
fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd, trail);
}
}
else {
fcd = 0; // unpaired surrogate
}
int prevTrailCC = fcd & LAST_BYTE_MASK_;
if (leadCC < prevTrailCC) {
result = false;
}
leadCC = fcd >>> SECOND_LAST_BYTE_SHIFT_;
}
// storing character with 0 lead fcd or the 1st accent with a base
// character before it
if (fcd == 0) {
m_FCDStart_ = offset;
}
else {
m_FCDStart_ = m_source_.getIndex();
}
m_source_.setIndex(m_FCDLimit_);
return result;
}
/**
* <p>Method tries to fetch the previous character that is in fcd form.</p>
* <p>Normalization is done if required.</p>
* <p>Offsets are returned at the current character.</p>
* @return previous fcd character
*/
private int previousChar()
{
if (m_bufferOffset_ >= 0) {
m_bufferOffset_ --;
if (m_bufferOffset_ >= 0) {
return m_buffer_.charAt(m_bufferOffset_);
}
else {
// At the start of buffer, route back to string.
m_buffer_.setLength(0);
if (m_FCDStart_ == 0) {
m_FCDStart_ = -1;
m_source_.setIndex(0);
return UCharacterIterator.DONE;
}
else {
m_FCDLimit_ = m_FCDStart_;
m_source_.setIndex(m_FCDStart_);
return previousChar();
}
}
}
int result = m_source_.previous();
int startoffset = m_source_.getIndex();
if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_
|| m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION
|| m_FCDStart_ <= startoffset || m_source_.getIndex() == 0) {
return result;
}
int ch = m_source_.previous();
if (ch < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
// if previous character is FCD
m_source_.next();
return result;
}
// Need a more complete FCD check and possible normalization.
if (!FCDCheckBackwards((char)result, startoffset)) {
normalizeBackwards();
m_bufferOffset_ --;
result = m_buffer_.charAt(m_bufferOffset_);
}
else {
// fcd checks alway reset m_source_ to the limit of the FCD
m_source_.setIndex(startoffset);
}
return result;
}
/**
* Determines if it is at the start of source iteration
* @return true if iterator at the start, false otherwise
*/
private final boolean isBackwardsStart()
{
return (m_bufferOffset_ < 0 && m_source_.getIndex() == 0)
|| (m_bufferOffset_ == 0 && m_FCDStart_ <= 0);
}
/**
* Checks if iterator is at the end of its source string.
* @return true if it is at the end, false otherwise
*/
private final boolean isEnd()
{
if (m_bufferOffset_ >= 0) {
if (m_bufferOffset_ != m_buffer_.length()) {
return false;
}
else {
// at end of buffer. check if fcd is at the end
return m_FCDLimit_ == m_source_.getLength();
}
}
return m_source_.getLength() == m_source_.getIndex();
}
/**
* <p>Special CE management for surrogates</p>
* <p>Lead surrogate is encountered. CE to be retrieved by using the
* following code unit. If next character is a trail surrogate, both
* characters will be combined to retrieve the CE, otherwise completely
* ignorable (UCA specification) is returned.</p>
* @param collator collator to use
* @param ce current CE
* @param trail character
* @return next CE for the surrogate characters
*/
private final int nextSurrogate(RuleBasedCollator collator, int ce,
char trail)
{
if (!UTF16.isTrailSurrogate(trail)) {
updateInternalState(m_utilSpecialBackUp_);
return IGNORABLE;
}
// TODO: CE contain the data from the previous CE + the mask.
// It should at least be unmasked
int result = collator.m_trie_.getTrailValue(ce, trail);
if (result == CE_NOT_FOUND_) {
updateInternalState(m_utilSpecialBackUp_);
}
return result;
}
/**
* Gets the CE expansion offset
* @param collator current collator
* @param ce ce to test
* @return expansion offset
*/
private int getExpansionOffset(RuleBasedCollator collator, int ce)
{
return ((ce & 0xFFFFF0) >> 4) - collator.m_expansionOffset_;
}
/**
* Gets the contraction ce offset
* @param collator current collator
* @param ce current ce
* @return contraction offset
*/
private int getContractionOffset(RuleBasedCollator collator, int ce)
{
return (ce & 0xFFFFFF) - collator.m_contractionOffset_;
}
/**
* Checks if CE is a special tag CE
* @param ce to check
* @return true if CE is a special tag CE, false otherwise
*/
private boolean isSpecialPrefixTag(int ce)
{
return RuleBasedCollator.isSpecial(ce) &&
RuleBasedCollator.getTag(ce) == CE_SPEC_PROC_TAG_;
}
/**
* <p>Special processing getting a CE that is preceded by a certain
* prefix.</p>
* <p>Used for optimizing Japanese length and iteration marks. When a
* special processing tag is encountered, iterate backwards to see if
* there's a match.</p>
* <p>Contraction tables are used, prefix data is stored backwards in the
* table.</p>
* @param collator collator to use
* @param ce current ce
* @param entrybackup entry backup iterator status
* @return next collation element
*/
private int nextSpecialPrefix(RuleBasedCollator collator, int ce,
Backup entrybackup)
{
backupInternalState(m_utilSpecialBackUp_);
updateInternalState(entrybackup);
previousChar();
// We want to look at the character where we entered
while (true) {
// This loop will run once per source string character, for as
// long as we are matching a potential contraction sequence
// First we position ourselves at the begining of contraction
// sequence
int entryoffset = getContractionOffset(collator, ce);
int offset = entryoffset;
if (isBackwardsStart()) {
ce = collator.m_contractionCE_[offset];
break;
}
char previous = (char)previousChar();
while (previous > collator.m_contractionIndex_[offset]) {
// contraction characters are ordered, skip smaller characters
offset ++;
}
if (previous == collator.m_contractionIndex_[offset]) {
// Found the source string char in the table.
// Pick up the corresponding CE from the table.
ce = collator.m_contractionCE_[offset];
}
else {
// Source string char was not in the table, prefix not found
ce = collator.m_contractionCE_[entryoffset];
}
if (!isSpecialPrefixTag(ce)) {
// The source string char was in the contraction table, and
// the corresponding CE is not a prefix CE. We found the
// prefix, break out of loop, this CE will end up being
// returned. This is the normal way out of prefix handling
// when the source actually contained the prefix.
break;
}
}
if (ce != CE_NOT_FOUND_) {
// we found something and we can merilly continue
updateInternalState(m_utilSpecialBackUp_);
}
else { // prefix search was a failure, we have to backup all the way to
// the start
updateInternalState(entrybackup);
}
return ce;
}
/**
* Checks if the ce is a contraction tag
* @param ce ce to check
* @return true if ce is a contraction tag, false otherwise
*/
private boolean isContractionTag(int ce)
{
return RuleBasedCollator.isSpecial(ce) &&
RuleBasedCollator.getTag(ce) == CE_CONTRACTION_TAG_;
}
/**
* Method to copy skipped characters into the buffer and sets the fcd
* position. To ensure that the skipped characters are considered later,
* we need to place it in the appropriate position in the buffer and
* reassign the source index. simple case if index reside in string,
* simply copy to buffer and fcdposition = pos, pos = start of buffer.
* if pos in normalization buffer, we'll insert the copy infront of pos
* and point pos to the start of the buffer. why am i doing these copies?
* well, so that the whole chunk of codes in the getNextCE,
* ucol_prv_getSpecialCE does not require any changes, which will be
* really painful.
* @param skipped character buffer
*/
private void setDiscontiguous(StringBuffer skipped)
{
if (m_bufferOffset_ >= 0) {
m_buffer_.replace(0, m_bufferOffset_, skipped.toString());
}
else {
m_FCDLimit_ = m_source_.getIndex();
m_buffer_.setLength(0);
m_buffer_.append(skipped.toString());
}
m_bufferOffset_ = 0;
}
/**
* Returns the current character for forward iteration
* @return current character
*/
private int currentChar()
{
if (m_bufferOffset_ < 0) {
m_source_.previous();
return m_source_.next();
}
// m_bufferOffset_ is never 0 in normal circumstances except after a
// discontiguous contraction since it is always returned and moved
// by 1 when we do nextChar()
return m_buffer_.charAt(m_bufferOffset_ - 1);
}
/**
* Method to get the discontiguous collation element within the source.
* Note this function will set the position to the appropriate places.
* Passed in character offset points to the second combining character
* after the start character.
* @param collator current collator used
* @param entryoffset index to the start character in the contraction table
* @return discontiguous collation element offset
*/
private int nextDiscontiguous(RuleBasedCollator collator, int entryoffset)
{
int offset = entryoffset;
boolean multicontraction = false;
// since it will be stuffed into this iterator and ran over again
if (m_utilSkippedBuffer_ == null) {
m_utilSkippedBuffer_ = new StringBuffer();
}
else {
m_utilSkippedBuffer_.setLength(0);
}
char ch = (char)currentChar();
m_utilSkippedBuffer_.append((char)currentChar());
// accent after the first character
if (m_utilSpecialDiscontiguousBackUp_ == null) {
m_utilSpecialDiscontiguousBackUp_ = new Backup();
}
backupInternalState(m_utilSpecialDiscontiguousBackUp_);
char nextch = ch;
while (true) {
ch = nextch;
int ch_int = nextChar();
nextch = (char)ch_int;
if (ch_int == UCharacterIterator.DONE
|| getCombiningClass(nextch) == 0) {
// if there are no more accents to move around
// we don't have to shift previousChar, since we are resetting
// the offset later
if (multicontraction) {
if (ch_int != UCharacterIterator.DONE) {
previousChar(); // backtrack
}
setDiscontiguous(m_utilSkippedBuffer_);
return collator.m_contractionCE_[offset];
}
break;
}
offset ++; // skip the combining class offset
while (nextch > collator.m_contractionIndex_[offset]) {
offset ++;
}
int ce = CE_NOT_FOUND_;
if (nextch != collator.m_contractionIndex_[offset]
|| getCombiningClass(nextch) == getCombiningClass(ch)) {
// unmatched or blocked character
m_utilSkippedBuffer_.append(nextch);
continue;
}
else {
ce = collator.m_contractionCE_[offset];
}
if (ce == CE_NOT_FOUND_) {
break;
}
else if (isContractionTag(ce)) {
// this is a multi-contraction
offset = getContractionOffset(collator, ce);
if (collator.m_contractionCE_[offset] != CE_NOT_FOUND_) {
multicontraction = true;
backupInternalState(m_utilSpecialDiscontiguousBackUp_);
}
}
else {
setDiscontiguous(m_utilSkippedBuffer_);
return ce;
}
}
updateInternalState(m_utilSpecialDiscontiguousBackUp_);
// backup is one forward of the base character, we need to move back
// one more
previousChar();
return collator.m_contractionCE_[entryoffset];
}
/**
* Gets the next contraction ce
* @param collator collator to use
* @param ce current ce
* @param entrybackup entry backup iterator status
* @return ce of the next contraction
*/
private int nextContraction(RuleBasedCollator collator, int ce)
{
backupInternalState(m_utilSpecialBackUp_);
int entryce = collator.m_contractionCE_[getContractionOffset(collator, ce)]; //CE_NOT_FOUND_;
while (true) {
int entryoffset = getContractionOffset(collator, ce);
int offset = entryoffset;
if (isEnd()) {
ce = collator.m_contractionCE_[offset];
if (ce == CE_NOT_FOUND_) {
// back up the source over all the chars we scanned going
// into this contraction.
ce = entryce;
updateInternalState(m_utilSpecialBackUp_);
}
break;
}
// get the discontiguos maximum combining class
int maxCC = (collator.m_contractionIndex_[offset] & 0xFF);
// checks if all characters have the same combining class
byte allSame = (byte)(collator.m_contractionIndex_[offset] >> 8);
char ch = (char)nextChar();
offset ++;
while (ch > collator.m_contractionIndex_[offset]) {
// contraction characters are ordered, skip all smaller
offset ++;
}
if (ch == collator.m_contractionIndex_[offset]) {
// Found the source string char in the contraction table.
// Pick up the corresponding CE from the table.
ce = collator.m_contractionCE_[offset];
}
else {
// Source string char was not in contraction table.
// Unless it is a discontiguous contraction, we are done
int miss = ch;
if(UTF16.isLeadSurrogate(ch)) { // in order to do the proper detection, we
// need to see if we're dealing with a supplementary
miss = UCharacterProperty.getRawSupplementary(ch, (char) nextChar());
}
int sCC;
if (maxCC == 0 || (sCC = getCombiningClass(miss)) == 0
|| sCC > maxCC || (allSame != 0 && sCC == maxCC) ||
isEnd()) {
// Contraction can not be discontiguous, back up by one
previousChar();
if(miss > 0xFFFF) {
previousChar();
}
ce = collator.m_contractionCE_[entryoffset];
}
else {
// Contraction is possibly discontiguous.
// find the next character if ch is not a base character
int ch_int = nextChar();
if (ch_int != UCharacterIterator.DONE) {
previousChar();
}
char nextch = (char)ch_int;
if (getCombiningClass(nextch) == 0) {
previousChar();
if(miss > 0xFFFF) {
previousChar();
}
// base character not part of discontiguous contraction
ce = collator.m_contractionCE_[entryoffset];
}
else {
ce = nextDiscontiguous(collator, entryoffset);
}
}
}
if (ce == CE_NOT_FOUND_) {
// source did not match the contraction, revert back original
updateInternalState(m_utilSpecialBackUp_);
ce = entryce;
break;
}
// source was a contraction
if (!isContractionTag(ce)) {
break;
}
// ccontinue looping to check for the remaining contraction.
if (collator.m_contractionCE_[entryoffset] != CE_NOT_FOUND_) {
// there are further contractions to be performed, so we store
// the so-far completed ce, so that if we fail in the next
// round we just return this one.
entryce = collator.m_contractionCE_[entryoffset];
backupInternalState(m_utilSpecialBackUp_);
if (m_utilSpecialBackUp_.m_bufferOffset_ >= 0) {
m_utilSpecialBackUp_.m_bufferOffset_ --;
}
else {
m_utilSpecialBackUp_.m_offset_ --;
}
}
}
return ce;
}
/**
* Gets the next ce for long primaries, stuffs the rest of the collation
* elements into the ce buffer
* @param ce current ce
* @return next ce
*/
private int nextLongPrimary(int ce)
{
m_CEBuffer_[1] = ((ce & 0xFF) << 24)
| RuleBasedCollator.CE_CONTINUATION_MARKER_;
m_CEBufferOffset_ = 1;
m_CEBufferSize_ = 2;
m_CEBuffer_[0] = ((ce & 0xFFFF00) << 8) | (CE_BYTE_COMMON_ << 8) |
CE_BYTE_COMMON_;
return m_CEBuffer_[0];
}
/**
* Gets the number of expansion
* @param ce current ce
* @return number of expansion
*/
private int getExpansionCount(int ce)
{
return ce & 0xF;
}
/**
* Gets the next expansion ce and stuffs the rest of the collation elements
* into the ce buffer
* @param collator current collator
* @param ce current ce
* @return next expansion ce
*/
private int nextExpansion(RuleBasedCollator collator, int ce)
{
// NOTE: we can encounter both continuations and expansions in an
// expansion!
// I have to decide where continuations are going to be dealt with
int offset = getExpansionOffset(collator, ce);
m_CEBufferSize_ = getExpansionCount(ce);
m_CEBufferOffset_ = 1;
m_CEBuffer_[0] = collator.m_expansion_[offset];
if (m_CEBufferSize_ != 0) {
// if there are less than 16 elements in expansion
for (int i = 1; i < m_CEBufferSize_; i ++) {
m_CEBuffer_[i] = collator.m_expansion_[offset + i];
}
}
else {
// ce are terminated
m_CEBufferSize_ = 1;
while (collator.m_expansion_[offset] != 0) {
m_CEBuffer_[m_CEBufferSize_ ++] =
collator.m_expansion_[++ offset];
}
}
// in case of one element expansion, we
// want to immediately return CEpos
if (m_CEBufferSize_ == 1) {
m_CEBufferSize_ = 0;
m_CEBufferOffset_ = 0;
}
return m_CEBuffer_[0];
}
/**
* Gets the next digit ce
* @param collator current collator
* @param ce current collation element
* @param cp current codepoint
* @return next digit ce
*/
private int nextDigit(RuleBasedCollator collator, int ce, int cp)
{
// We do a check to see if we want to collate digits as numbers;
// if so we generate a custom collation key. Otherwise we pull out
// the value stored in the expansion table.
if (m_collator_.m_isNumericCollation_){
int collateVal = 0;
int trailingZeroIndex = 0;
boolean nonZeroValReached = false;
// I just need a temporary place to store my generated CEs.
// icu4c uses a unsigned byte array, i'll use a stringbuffer here
// to avoid dealing with the sign problems and array allocation
// clear and set initial string buffer length
m_utilStringBuffer_.setLength(3);
// We parse the source string until we hit a char that's NOT a
// digit.
// Use this u_charDigitValue. This might be slow because we have
// to handle surrogates...
int digVal = UCharacter.digit(cp);
// if we have arrived here, we have already processed possible
// supplementaries that trigered the digit tag -
// all supplementaries are marked in the UCA.
// We pad a zero in front of the first element anyways.
// This takes care of the (probably) most common case where
// people are sorting things followed by a single digit
int digIndx = 1;
for (;;) {
// Make sure we have enough space.
if (digIndx >= ((m_utilStringBuffer_.length() - 2) << 1)) {
m_utilStringBuffer_.setLength(m_utilStringBuffer_.length()
<< 1);
}
// Skipping over leading zeroes.
if (digVal != 0 || nonZeroValReached) {
if (digVal != 0 && !nonZeroValReached) {
nonZeroValReached = true;
}
// We parse the digit string into base 100 numbers
// (this fits into a byte).
// We only add to the buffer in twos, thus if we are
// parsing an odd character, that serves as the
// 'tens' digit while the if we are parsing an even
// one, that is the 'ones' digit. We dumped the
// parsed base 100 value (collateVal) into a buffer.
// We multiply each collateVal by 2 (to give us room)
// and add 5 (to avoid overlapping magic CE byte
// values). The last byte we subtract 1 to ensure it is
// less than all the other bytes.
if (digIndx % 2 == 1) {
collateVal += digVal;
// This removes trailing zeroes.
if (collateVal == 0 && trailingZeroIndex == 0) {
trailingZeroIndex = ((digIndx - 1) >>> 1) + 2;
}
else if (trailingZeroIndex != 0) {
trailingZeroIndex = 0;
}
m_utilStringBuffer_.setCharAt(
((digIndx - 1) >>> 1) + 2,
(char)((collateVal << 1) + 6));
collateVal = 0;
}
else {
// We drop the collation value into the buffer so if
// we need to do a "front patch" we don't have to
// check to see if we're hitting the last element.
collateVal = digVal * 10;
m_utilStringBuffer_.setCharAt((digIndx >>> 1) + 2,
(char)((collateVal << 1) + 6));
}
digIndx ++;
}
// Get next character.
if (!isEnd()){
backupInternalState(m_utilSpecialBackUp_);
int char32 = nextChar();
char ch = (char)char32;
if (UTF16.isLeadSurrogate(ch)){
if (!isEnd()) {
char trail = (char)nextChar();
if (UTF16.isTrailSurrogate(trail)) {
char32 = UCharacterProperty.getRawSupplementary(
ch, trail);
}
else {
goBackOne();
}
}
}
digVal = UCharacter.digit(char32);
if (digVal == -1) {
// Resetting position to point to the next unprocessed
// char. We overshot it when doing our test/set for
// numbers.
updateInternalState(m_utilSpecialBackUp_);
break;
}
}
else {
break;
}
}
if (nonZeroValReached == false){
digIndx = 2;
m_utilStringBuffer_.setCharAt(2, (char)6);
}
int endIndex = trailingZeroIndex != 0 ? trailingZeroIndex
: (digIndx >>> 1) + 2;
if (digIndx % 2 != 0){
// We missed a value. Since digIndx isn't even, stuck too many
// values into the buffer (this is what we get for padding the
// first byte with a zero). "Front-patch" now by pushing all
// nybbles forward.
// Doing it this way ensures that at least 50% of the time
// (statistically speaking) we'll only be doing a single pass
// and optimizes for strings with single digits. I'm just
// assuming that's the more common case.
for (int i = 2; i < endIndex; i ++){
m_utilStringBuffer_.setCharAt(i,
(char)((((((m_utilStringBuffer_.charAt(i) - 6) >>> 1)
% 10) * 10)
+ (((m_utilStringBuffer_.charAt(i + 1) - 6)
>>> 1) / 10) << 1) + 6));
}
-- digIndx;
}
// Subtract one off of the last byte.
m_utilStringBuffer_.setCharAt(endIndex - 1,
(char)(m_utilStringBuffer_.charAt(endIndex - 1) - 1));
// We want to skip over the first two slots in the buffer.
// The first slot is reserved for the header byte CODAN_PLACEHOLDER.
// The second slot is for the sign/exponent byte:
// 0x80 + (decimalPos/2) & 7f.
m_utilStringBuffer_.setCharAt(0, (char)RuleBasedCollator.CODAN_PLACEHOLDER);
m_utilStringBuffer_.setCharAt(1,
(char)(0x80 + ((digIndx >>> 1) & 0x7F)));
// Now transfer the collation key to our collIterate struct.
// The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
ce = (((m_utilStringBuffer_.charAt(0) << 8)
// Primary weight
| m_utilStringBuffer_.charAt(1))
<< RuleBasedCollator.CE_PRIMARY_SHIFT_)
// Secondary weight
| (RuleBasedCollator.BYTE_COMMON_
<< RuleBasedCollator.CE_SECONDARY_SHIFT_)
| RuleBasedCollator.BYTE_COMMON_; // Tertiary weight.
int i = 2; // Reset the index into the buffer.
m_CEBuffer_[0] = ce;
m_CEBufferSize_ = 1;
m_CEBufferOffset_ = 1;
while (i < endIndex)
{
int primWeight = m_utilStringBuffer_.charAt(i ++) << 8;
if (i < endIndex) {
primWeight |= m_utilStringBuffer_.charAt(i ++);
}
m_CEBuffer_[m_CEBufferSize_ ++]
= (primWeight << RuleBasedCollator.CE_PRIMARY_SHIFT_)
| RuleBasedCollator.CE_CONTINUATION_MARKER_;
}
return ce;
}
// no numeric mode, we'll just switch to whatever we stashed and
// continue
// find the offset to expansion table
return collator.m_expansion_[getExpansionOffset(collator, ce)];
}
/**
* Gets the next implicit ce for codepoints
* @param codepoint current codepoint
* @return implicit ce
*/
private int nextImplicit(int codepoint)
{
if (!UCharacter.isLegal(codepoint)) {
// synwee to check with vladimir on the range of isNonChar()
// illegal code value, use completely ignoreable!
return IGNORABLE;
}
int result = RuleBasedCollator.impCEGen_.getImplicitFromCodePoint(codepoint);
m_CEBuffer_[0] = (result & RuleBasedCollator.CE_PRIMARY_MASK_)
| 0x00000505;
m_CEBuffer_[1] = ((result & 0x0000FFFF) << 16) | 0x000000C0;
m_CEBufferOffset_ = 1;
m_CEBufferSize_ = 2;
return m_CEBuffer_[0];
}
/**
* Returns the next ce associated with the following surrogate characters
* @param ch current character
* @return ce
*/
private int nextSurrogate(char ch)
{
int ch_int = nextChar();
char nextch = (char)ch_int;
if (ch_int != CharacterIterator.DONE &&
UTF16.isTrailSurrogate(nextch)) {
int codepoint = UCharacterProperty.getRawSupplementary(ch, nextch);
return nextImplicit(codepoint);
}
if (nextch != CharacterIterator.DONE) {
previousChar(); // reverts back to the original position
}
return IGNORABLE; // completely ignorable
}
/**
* Returns the next ce for a hangul character, this is an implicit
* calculation
* @param collator current collator
* @param ch current character
* @return hangul ce
*/
private int nextHangul(RuleBasedCollator collator, char ch)
{
char L = (char)(ch - HANGUL_SBASE_);
// divide into pieces
// do it in this order since some compilers can do % and / in one
// operation
char T = (char)(L % HANGUL_TCOUNT_);
L /= HANGUL_TCOUNT_;
char V = (char)(L % HANGUL_VCOUNT_);
L /= HANGUL_VCOUNT_;
// offset them
L += HANGUL_LBASE_;
V += HANGUL_VBASE_;
T += HANGUL_TBASE_;
// return the first CE, but first put the rest into the expansion
// buffer
m_CEBufferSize_ = 0;
if (!collator.m_isJamoSpecial_) { // FAST PATH
m_CEBuffer_[m_CEBufferSize_ ++] =
collator.m_trie_.getLeadValue(L);
m_CEBuffer_[m_CEBufferSize_ ++] =
collator.m_trie_.getLeadValue(V);
if (T != HANGUL_TBASE_) {
m_CEBuffer_[m_CEBufferSize_ ++] =
collator.m_trie_.getLeadValue(T);
}
m_CEBufferOffset_ = 1;
return m_CEBuffer_[0];
}
else {
// Jamo is Special
// Since Hanguls pass the FCD check, it is guaranteed that we
// won't be in the normalization buffer if something like this
// happens
// Move Jamos into normalization buffer
m_buffer_.append((char)L);
m_buffer_.append((char)V);
if (T != HANGUL_TBASE_) {
m_buffer_.append((char)T);
}
m_FCDLimit_ = m_source_.getIndex();
m_FCDStart_ = m_FCDLimit_ - 1;
// Indicate where to continue in main input string after
// exhausting the buffer
return IGNORABLE;
}
}
/**
* <p>Special CE management. Expansions, contractions etc...</p>
* @param collator can be plain UCA
* @param ce current ce
* @param ch current character
* @return next special ce
*/
private int nextSpecial(RuleBasedCollator collator, int ce, char ch)
{
int codepoint = ch;
Backup entrybackup = m_utilSpecialEntryBackUp_;
// this is to handle recursive looping
if (entrybackup != null) {
m_utilSpecialEntryBackUp_ = null;
}
else {
entrybackup = new Backup();
}
backupInternalState(entrybackup);
try { // forces it to assign m_utilSpecialEntryBackup_
while (true) {
// This loop will repeat only in the case of contractions,
// surrogate
switch(RuleBasedCollator.getTag(ce)) {
case CE_NOT_FOUND_TAG_:
// impossible case for icu4j
return ce;
case RuleBasedCollator.CE_SURROGATE_TAG_:
if (isEnd()) {
return IGNORABLE;
}
backupInternalState(m_utilSpecialBackUp_);
char trail = (char)nextChar();
ce = nextSurrogate(collator, ce, trail);
// calculate the supplementary code point value,
// if surrogate was not tailored we go one more round
codepoint =
UCharacterProperty.getRawSupplementary(ch, trail);
break;
case CE_SPEC_PROC_TAG_:
ce = nextSpecialPrefix(collator, ce, entrybackup);
break;
case CE_CONTRACTION_TAG_:
ce = nextContraction(collator, ce);
break;
case CE_LONG_PRIMARY_TAG_:
return nextLongPrimary(ce);
case CE_EXPANSION_TAG_:
return nextExpansion(collator, ce);
case CE_DIGIT_TAG_:
ce = nextDigit(collator, ce, codepoint);
break;
// various implicits optimization
case CE_CJK_IMPLICIT_TAG_:
// 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
return nextImplicit(codepoint);
case CE_IMPLICIT_TAG_: // everything that is not defined
return nextImplicit(codepoint);
case CE_TRAIL_SURROGATE_TAG_:
return IGNORABLE; // DC00-DFFF broken surrogate
case CE_LEAD_SURROGATE_TAG_: // D800-DBFF
return nextSurrogate(ch);
case CE_HANGUL_SYLLABLE_TAG_: // AC00-D7AF
return nextHangul(collator, ch);
case CE_CHARSET_TAG_:
// not yet implemented probably after 1.8
return CE_NOT_FOUND_;
default:
ce = IGNORABLE;
// synwee todo, throw exception or something here.
}
if (!RuleBasedCollator.isSpecial(ce)) {
break;
}
}
}
finally {
m_utilSpecialEntryBackUp_ = entrybackup;
}
return ce;
}
/**
* Special processing is getting a CE that is preceded by a certain prefix.
* Currently this is only needed for optimizing Japanese length and
* iteration marks. When we encouter a special processing tag, we go
* backwards and try to see if we have a match. Contraction tables are used
* - so the whole process is not unlike contraction. prefix data is stored
* backwards in the table.
* @param collator current collator
* @param ce current ce
* @return previous ce
*/
private int previousSpecialPrefix(RuleBasedCollator collator, int ce)
{
backupInternalState(m_utilSpecialBackUp_);
while (true) {
// position ourselves at the begining of contraction sequence
int offset = getContractionOffset(collator, ce);
int entryoffset = offset;
if (isBackwardsStart()) {
ce = collator.m_contractionCE_[offset];
break;
}
char prevch = (char)previousChar();
while (prevch > collator.m_contractionIndex_[offset]) {
// since contraction codepoints are ordered, we skip all that
// are smaller
offset ++;
}
if (prevch == collator.m_contractionIndex_[offset]) {
ce = collator.m_contractionCE_[offset];
}
else {
// if there is a completely ignorable code point in the middle
// of a prefix, we need to act as if it's not there assumption:
// 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to
// zero)
// lone surrogates cannot be set to zero as it would break
// other processing
int isZeroCE = collator.m_trie_.getLeadValue(prevch);
// it's easy for BMP code points
if (isZeroCE == 0) {
continue;
}
else if (UTF16.isTrailSurrogate(prevch)
|| UTF16.isLeadSurrogate(prevch)) {
// for supplementary code points, we have to check the next one
// situations where we are going to ignore
// 1. beginning of the string: schar is a lone surrogate
// 2. schar is a lone surrogate
// 3. schar is a trail surrogate in a valid surrogate
// sequence that is explicitly set to zero.
if (!isBackwardsStart()) {
char lead = (char)previousChar();
if (UTF16.isLeadSurrogate(lead)) {
isZeroCE = collator.m_trie_.getLeadValue(lead);
if (RuleBasedCollator.getTag(isZeroCE)
== RuleBasedCollator.CE_SURROGATE_TAG_) {
int finalCE = collator.m_trie_.getTrailValue(
isZeroCE,
prevch);
if (finalCE == 0) {
// this is a real, assigned completely
// ignorable code point
continue;
}
}
}
else {
nextChar(); // revert to original offset
// lone surrogate, completely ignorable
continue;
}
nextChar(); // revert to original offset
}
else {
// lone surrogate at the beggining, completely ignorable
continue;
}
}
// char was not in the table. prefix not found
ce = collator.m_contractionCE_[entryoffset];
}
if (!isSpecialPrefixTag(ce)) {
// char was in the contraction table, and the corresponding ce
// is not a prefix ce. We found the prefix, break out of loop,
// this ce will end up being returned.
break;
}
}
updateInternalState(m_utilSpecialBackUp_);
return ce;
}
/**
* Retrieves the previous contraction ce. To ensure that the backwards and
* forwards iteration matches, we take the current region of most possible
* match and pass it through the forward iteration. This will ensure that
* the obstinate problem of overlapping contractions will not occur.
* @param collator current collator
* @param ce current ce
* @param ch current character
* @return previous contraction ce
*/
private int previousContraction(RuleBasedCollator collator, int ce, char ch)
{
m_utilStringBuffer_.setLength(0);
// since we might encounter normalized characters (from the thai
// processing) we can't use peekCharacter() here.
char prevch = (char)previousChar();
boolean atStart = false;
// TODO: address the comment above - maybe now we *can* use peekCharacter
//while (collator.isUnsafe(ch) || isThaiPreVowel(prevch)) {
while (collator.isUnsafe(ch)) {
m_utilStringBuffer_.insert(0, ch);
ch = prevch;
if (isBackwardsStart()) {
atStart = true;
break;
}
prevch = (char)previousChar();
}
if (!atStart) {
// undo the previousChar() if we didn't reach the beginning
nextChar();
}
// adds the initial base character to the string
m_utilStringBuffer_.insert(0, ch);
// a new collation element iterator is used to simply things, since
// using the current collation element iterator will mean that the
// forward and backwards iteration will share and change the same
// buffers. it is going to be painful.
int originaldecomp = collator.getDecomposition();
// for faster access, since string would have been normalized above
collator.setDecomposition(Collator.NO_DECOMPOSITION);
if (m_utilColEIter_ == null) {
m_utilColEIter_ = new CollationElementIterator(
m_utilStringBuffer_.toString(),
collator);
}
else {
m_utilColEIter_.m_collator_ = collator;
m_utilColEIter_.setText(m_utilStringBuffer_.toString());
}
ce = m_utilColEIter_.next();
m_CEBufferSize_ = 0;
while (ce != NULLORDER) {
if (m_CEBufferSize_ == m_CEBuffer_.length) {
try {
// increasing cebuffer size
int tempbuffer[] = new int[m_CEBuffer_.length + 50];
System.arraycopy(m_CEBuffer_, 0, tempbuffer, 0,
m_CEBuffer_.length);
m_CEBuffer_ = tempbuffer;
}
catch( MissingResourceException e)
{
throw e;
}
catch (Exception e) {
if(DEBUG){
e.printStackTrace();
}
return NULLORDER;
}
}
m_CEBuffer_[m_CEBufferSize_ ++] = ce;
ce = m_utilColEIter_.next();
}
collator.setDecomposition(originaldecomp);
m_CEBufferOffset_ = m_CEBufferSize_ - 1;
return m_CEBuffer_[m_CEBufferOffset_];
}
/**
* Returns the previous long primary ces
* @param ce long primary ce
* @return previous long primary ces
*/
private int previousLongPrimary(int ce)
{
m_CEBufferSize_ = 0;
m_CEBuffer_[m_CEBufferSize_ ++] =
((ce & 0xFFFF00) << 8) | (CE_BYTE_COMMON_ << 8) | CE_BYTE_COMMON_;
m_CEBuffer_[m_CEBufferSize_ ++] = ((ce & 0xFF) << 24)
| RuleBasedCollator.CE_CONTINUATION_MARKER_;
m_CEBufferOffset_ = m_CEBufferSize_ - 1;
return m_CEBuffer_[m_CEBufferOffset_];
}
/**
* Returns the previous expansion ces
* @param collator current collator
* @param ce current ce
* @return previous expansion ce
*/
private int previousExpansion(RuleBasedCollator collator, int ce)
{
// find the offset to expansion table
int offset = getExpansionOffset(collator, ce);
m_CEBufferSize_ = getExpansionCount(ce);
if (m_CEBufferSize_ != 0) {
// less than 16 elements in expansion
for (int i = 0; i < m_CEBufferSize_; i ++) {
m_CEBuffer_[i] = collator.m_expansion_[offset + i];
}
}
else {
// null terminated ces
while (collator.m_expansion_[offset + m_CEBufferSize_] != 0) {
m_CEBuffer_[m_CEBufferSize_] =
collator.m_expansion_[offset + m_CEBufferSize_];
m_CEBufferSize_ ++;
}
}
m_CEBufferOffset_ = m_CEBufferSize_ - 1;
return m_CEBuffer_[m_CEBufferOffset_];
}
/**
* Getting the digit collation elements
* @param collator
* @param ce current collation element
* @param ch current code point
* @return digit collation element
*/
private int previousDigit(RuleBasedCollator collator, int ce, char ch)
{
// We do a check to see if we want to collate digits as numbers; if so we generate
// a custom collation key. Otherwise we pull out the value stored in the expansion table.
if (m_collator_.m_isNumericCollation_){
int leadingZeroIndex = 0;
int collateVal = 0;
boolean nonZeroValReached = false;
// clear and set initial string buffer length
m_utilStringBuffer_.setLength(3);
// We parse the source string until we hit a char that's NOT a digit
// Use this u_charDigitValue. This might be slow because we have to
// handle surrogates...
int char32 = ch;
if (UTF16.isTrailSurrogate(ch)) {
if (!isBackwardsStart()){
char lead = (char)previousChar();
if (UTF16.isLeadSurrogate(lead)) {
char32 = UCharacterProperty.getRawSupplementary(lead,
ch);
}
else {
goForwardOne();
}
}
}
int digVal = UCharacter.digit(char32);
int digIndx = 0;
for (;;) {
// Make sure we have enough space.
if (digIndx >= ((m_utilStringBuffer_.length() - 2) << 1)) {
m_utilStringBuffer_.setLength(m_utilStringBuffer_.length()
<< 1);
}
// Skipping over "trailing" zeroes but we still add to digIndx.
if (digVal != 0 || nonZeroValReached) {
if (digVal != 0 && !nonZeroValReached) {
nonZeroValReached = true;
}
// We parse the digit string into base 100 numbers (this
// fits into a byte).
// We only add to the buffer in twos, thus if we are
// parsing an odd character, that serves as the 'tens'
// digit while the if we are parsing an even one, that is
// the 'ones' digit. We dumped the parsed base 100 value
// (collateVal) into a buffer. We multiply each collateVal
// by 2 (to give us room) and add 5 (to avoid overlapping
// magic CE byte values). The last byte we subtract 1 to
// ensure it is less than all the other bytes.
// Since we're doing in this reverse we want to put the
// first digit encountered into the ones place and the
// second digit encountered into the tens place.
if (digIndx % 2 == 1){
collateVal += digVal * 10;
// This removes leading zeroes.
if (collateVal == 0 && leadingZeroIndex == 0) {
leadingZeroIndex = ((digIndx - 1) >>> 1) + 2;
}
else if (leadingZeroIndex != 0) {
leadingZeroIndex = 0;
}
m_utilStringBuffer_.setCharAt(((digIndx - 1) >>> 1) + 2,
(char)((collateVal << 1) + 6));
collateVal = 0;
}
else {
collateVal = digVal;
}
}
digIndx ++;
if (!isBackwardsStart()){
backupInternalState(m_utilSpecialBackUp_);
char32 = previousChar();
ch = (char)ch;
if (UTF16.isTrailSurrogate(ch)){
if (!isBackwardsStart()) {
char lead = (char)previousChar();
if (UTF16.isLeadSurrogate(lead)) {
char32
= UCharacterProperty.getRawSupplementary(
lead, ch);
}
else {
updateInternalState(m_utilSpecialBackUp_);
}
}
}
digVal = UCharacter.digit(char32);
if (digVal == -1) {
updateInternalState(m_utilSpecialBackUp_);
break;
}
}
else {
break;
}
}
if (nonZeroValReached == false) {
digIndx = 2;
m_utilStringBuffer_.setCharAt(2, (char)6);
}
if (digIndx % 2 != 0) {
if (collateVal == 0 && leadingZeroIndex == 0) {
// This removes the leading 0 in a odd number sequence of
// numbers e.g. avery001
leadingZeroIndex = ((digIndx - 1) >>> 1) + 2;
}
else {
// this is not a leading 0, we add it in
m_utilStringBuffer_.setCharAt((digIndx >>> 1) + 2,
(char)((collateVal << 1) + 6));
digIndx ++;
}
}
int endIndex = leadingZeroIndex != 0 ? leadingZeroIndex
: ((digIndx >>> 1) + 2) ;
digIndx = ((endIndex - 2) << 1) + 1; // removing initial zeros
// Subtract one off of the last byte.
// Really the first byte here, but it's reversed...
m_utilStringBuffer_.setCharAt(2,
(char)(m_utilStringBuffer_.charAt(2) - 1));
// We want to skip over the first two slots in the buffer.
// The first slot is reserved for the header byte CODAN_PLACEHOLDER.
// The second slot is for the sign/exponent byte:
// 0x80 + (decimalPos/2) & 7f.
m_utilStringBuffer_.setCharAt(0, (char)RuleBasedCollator.CODAN_PLACEHOLDER);
m_utilStringBuffer_.setCharAt(1,
(char)(0x80 + ((digIndx >>> 1) & 0x7F)));
// Now transfer the collation key to our collIterate struct.
// The total size for our collation key is endIndx bumped up to the
// next largest even value divided by two.
m_CEBufferSize_ = 0;
m_CEBuffer_[m_CEBufferSize_ ++]
= (((m_utilStringBuffer_.charAt(0) << 8)
// Primary weight
| m_utilStringBuffer_.charAt(1))
<< RuleBasedCollator.CE_PRIMARY_SHIFT_)
// Secondary weight
| (RuleBasedCollator.BYTE_COMMON_
<< RuleBasedCollator.CE_SECONDARY_SHIFT_)
// Tertiary weight.
| RuleBasedCollator.BYTE_COMMON_;
int i = endIndex - 1; // Reset the index into the buffer.
while (i >= 2) {
int primWeight = m_utilStringBuffer_.charAt(i --) << 8;
if (i >= 2) {
primWeight |= m_utilStringBuffer_.charAt(i --);
}
m_CEBuffer_[m_CEBufferSize_ ++]
= (primWeight << RuleBasedCollator.CE_PRIMARY_SHIFT_)
| RuleBasedCollator.CE_CONTINUATION_MARKER_;
}
m_CEBufferOffset_ = m_CEBufferSize_ - 1;
return m_CEBuffer_[m_CEBufferOffset_];
}
else {
return collator.m_expansion_[getExpansionOffset(collator, ce)];
}
}
/**
* Returns previous hangul ces
* @param collator current collator
* @param ch current character
* @return previous hangul ce
*/
private int previousHangul(RuleBasedCollator collator, char ch)
{
char L = (char)(ch - HANGUL_SBASE_);
// we do it in this order since some compilers can do % and / in one
// operation
char T = (char)(L % HANGUL_TCOUNT_);
L /= HANGUL_TCOUNT_;
char V = (char)(L % HANGUL_VCOUNT_);
L /= HANGUL_VCOUNT_;
// offset them
L += HANGUL_LBASE_;
V += HANGUL_VBASE_;
T += HANGUL_TBASE_;
m_CEBufferSize_ = 0;
if (!collator.m_isJamoSpecial_) {
m_CEBuffer_[m_CEBufferSize_ ++] =
collator.m_trie_.getLeadValue(L);
m_CEBuffer_[m_CEBufferSize_ ++] =
collator.m_trie_.getLeadValue(V);
if (T != HANGUL_TBASE_) {
m_CEBuffer_[m_CEBufferSize_ ++] =
collator.m_trie_.getLeadValue(T);
}
m_CEBufferOffset_ = m_CEBufferSize_ - 1;
return m_CEBuffer_[m_CEBufferOffset_];
}
else {
// Since Hanguls pass the FCD check, it is guaranteed that we won't
// be in the normalization buffer if something like this happens
// Move Jamos into normalization buffer
m_buffer_.append(L);
m_buffer_.append(V);
if (T != HANGUL_TBASE_) {
m_buffer_.append(T);
}
m_FCDStart_ = m_source_.getIndex();
m_FCDLimit_ = m_FCDStart_ + 1;
return IGNORABLE;
}
}
/**
* Gets implicit codepoint ces
* @param codepoint current codepoint
* @return implicit codepoint ces
*/
private int previousImplicit(int codepoint)
{
if (!UCharacter.isLegal(codepoint)) {
return IGNORABLE; // illegal code value, completely ignoreable!
}
int result = RuleBasedCollator.impCEGen_.getImplicitFromCodePoint(codepoint);
m_CEBufferSize_ = 2;
m_CEBufferOffset_ = 1;
m_CEBuffer_[0] = (result & RuleBasedCollator.CE_PRIMARY_MASK_)
| 0x00000505;
m_CEBuffer_[1] = ((result & 0x0000FFFF) << 16) | 0x000000C0;
return m_CEBuffer_[1];
}
/**
* Gets the previous surrogate ce
* @param ch current character
* @return previous surrogate ce
*/
private int previousSurrogate(char ch)
{
if (isBackwardsStart()) {
// we are at the start of the string, wrong place to be at
return IGNORABLE;
}
char prevch = (char)previousChar();
// Handles Han and Supplementary characters here.
if (UTF16.isLeadSurrogate(prevch)) {
return previousImplicit(
UCharacterProperty.getRawSupplementary(prevch, ch));
}
if (prevch != CharacterIterator.DONE) {
nextChar();
}
return IGNORABLE; // completely ignorable
}
/**
* <p>Special CE management. Expansions, contractions etc...</p>
* @param collator can be plain UCA
* @param ce current ce
* @param ch current character
* @return previous special ce
*/
private int previousSpecial(RuleBasedCollator collator, int ce, char ch)
{
while(true) {
// the only ces that loops are thai, special prefix and
// contractions
switch (RuleBasedCollator.getTag(ce)) {
case CE_NOT_FOUND_TAG_: // this tag always returns
return ce;
case RuleBasedCollator.CE_SURROGATE_TAG_:
// essentialy a disengaged lead surrogate. a broken
// sequence was encountered and this is an error
return IGNORABLE;
case CE_SPEC_PROC_TAG_:
ce = previousSpecialPrefix(collator, ce);
break;
case CE_CONTRACTION_TAG_:
// may loop for first character e.g. "0x0f71" for english
if (isBackwardsStart()) {
// start of string or this is not the end of any contraction
ce = collator.m_contractionCE_[
getContractionOffset(collator, ce)];
break;
}
return previousContraction(collator, ce, ch); // else
case CE_LONG_PRIMARY_TAG_:
return previousLongPrimary(ce);
case CE_EXPANSION_TAG_: // always returns
return previousExpansion(collator, ce);
case CE_DIGIT_TAG_:
ce = previousDigit(collator, ce, ch);
break;
case CE_HANGUL_SYLLABLE_TAG_: // AC00-D7AF
return previousHangul(collator, ch);
case CE_LEAD_SURROGATE_TAG_: // D800-DBFF
return IGNORABLE; // broken surrogate sequence
case CE_TRAIL_SURROGATE_TAG_: // DC00-DFFF
return previousSurrogate(ch);
case CE_CJK_IMPLICIT_TAG_:
// 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
return previousImplicit(ch);
case CE_IMPLICIT_TAG_: // everything that is not defined
// UCA is filled with these. Tailorings are NOT_FOUND
return previousImplicit(ch);
case CE_CHARSET_TAG_: // this tag always returns
return CE_NOT_FOUND_;
default: // this tag always returns
ce = IGNORABLE;
}
if (!RuleBasedCollator.isSpecial(ce)) {
break;
}
}
return ce;
}
/**
* GET IMPLICIT PRIMARY WEIGHTS
* @param cp codepoint
* @param value is left justified primary key
*/
// private static final int getImplicitPrimary(int cp)
// {
// cp = swapCJK(cp);
//
// //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
// // we now have a range of numbers from 0 to 21FFFF.
// // we must skip all 00, 01, 02 bytes, so most bytes have 253 values
// // we must leave a gap of 01 between all values of the last byte, so
// // the last byte has 126 values (3 byte case)
// // we shift so that HAN all has the same first primary, for
// // compression.
// // for the 4 byte case, we make the gap as large as we can fit.
// // Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1)
// // Four byte forms (most supplementaries) are EF xx xx xx (with a gap
// // of LAST2_MULTIPLIER == 14)
//
// int last0 = cp - RuleBasedCollator.IMPLICIT_4BYTE_BOUNDARY_;
// if (last0 < 0) {
// int last1 = cp / RuleBasedCollator.LAST_COUNT_;
// last0 = cp % RuleBasedCollator.LAST_COUNT_;
//
// int last2 = last1 / RuleBasedCollator.OTHER_COUNT_;
// last1 %= RuleBasedCollator.OTHER_COUNT_;
// return RuleBasedCollator.IMPLICIT_BASE_3BYTE_ + (last2 << 24)
// + (last1 << 16)
// + ((last0 * RuleBasedCollator.LAST_MULTIPLIER_) << 8);
// }
// else {
// int last1 = last0 / RuleBasedCollator.LAST_COUNT2_;
// last0 %= RuleBasedCollator.LAST_COUNT2_;
//
// int last2 = last1 / RuleBasedCollator.OTHER_COUNT_;
// last1 %= RuleBasedCollator.OTHER_COUNT_;
//
// int last3 = last2 / RuleBasedCollator.OTHER_COUNT_;
// last2 %= RuleBasedCollator.OTHER_COUNT_;
// return RuleBasedCollator.IMPLICIT_BASE_4BYTE_ + (last3 << 24)
// + (last2 << 16) + (last1 << 8)
// + (last0 * RuleBasedCollator.LAST2_MULTIPLIER_);
// }
// }
// /**
// * Swapping CJK characters for implicit ces
// * @param cp codepoint CJK
// * @return swapped result
// */
// private static final int swapCJK(int cp)
// {
// if (cp >= CJK_BASE_) {
// if (cp < CJK_LIMIT_) {
// return cp - CJK_BASE_;
// }
// if (cp < CJK_COMPAT_USED_BASE_) {
// return cp + NON_CJK_OFFSET_;
// }
// if (cp < CJK_COMPAT_USED_LIMIT_) {
// return cp - CJK_COMPAT_USED_BASE_ + (CJK_LIMIT_ - CJK_BASE_);
// }
// if (cp < CJK_B_BASE_) {
// return cp + NON_CJK_OFFSET_;
// }
// if (cp < CJK_B_LIMIT_) {
// return cp; // non-BMP-CJK
// }
// return cp + NON_CJK_OFFSET_; // non-CJK
// }
// if (cp < CJK_A_BASE_) {
// return cp + NON_CJK_OFFSET_;
// }
// if (cp < CJK_A_LIMIT_) {
// return cp - CJK_A_BASE_ + (CJK_LIMIT_ - CJK_BASE_)
// + (CJK_COMPAT_USED_LIMIT_ - CJK_COMPAT_USED_BASE_);
// }
// return cp + NON_CJK_OFFSET_; // non-CJK
// }
/**
* Gets a character from the source string at a given offset.
* Handles both normal and iterative cases.
* No error checking and does not access the normalization buffer
* - caller beware!
* @param offset offset from current position which character is to be
* retrieved
* @return character at current position + offset
*/
private char peekCharacter(int offset)
{
if (offset != 0) {
int currentoffset = m_source_.getIndex();
m_source_.setIndex(currentoffset + offset);
char result = (char)m_source_.current();
m_source_.setIndex(currentoffset);
return result;
}
else {
return (char)m_source_.current();
}
}
/**
* Moves back 1 position in the source string. This is slightly less
* complicated than previousChar in that it doesn't normalize while
* moving back. Boundary checks are not performed.
* This method is to be used with caution, with the assumption that
* moving back one position will not exceed the source limits.
* Use only with nextChar() and never call this API twice in a row without
* nextChar() in the middle.
*/
private void goBackOne()
{
if (m_bufferOffset_ >= 0) {
m_bufferOffset_ --;
}
else {
m_source_.setIndex(m_source_.getIndex() - 1);
}
}
/**
* Moves forward 1 position in the source string. This is slightly less
* complicated than nextChar in that it doesn't normalize while
* moving back. Boundary checks are not performed.
* This method is to be used with caution, with the assumption that
* moving back one position will not exceed the source limits.
* Use only with previousChar() and never call this API twice in a row
* without previousChar() in the middle.
*/
private void goForwardOne()
{
if (m_bufferOffset_ < 0) {
// we're working on the source and not normalizing. fast path.
// note Thai pre-vowel reordering uses buffer too
m_source_.setIndex(m_source_.getIndex() + 1);
}
else {
// we are in the buffer, buffer offset will never be 0 here
m_bufferOffset_ ++;
}
}
}