| /** |
| ******************************************************************************* |
| * Copyright (C) 1996-2011, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| */ |
| package com.ibm.icu.text; |
| |
| import java.io.DataInputStream; |
| import java.io.IOException; |
| import java.nio.ByteBuffer; |
| import java.text.CharacterIterator; |
| import java.text.ParseException; |
| import java.util.Arrays; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Map; |
| import java.util.MissingResourceException; |
| import java.util.Set; |
| import java.util.concurrent.locks.Lock; |
| import java.util.concurrent.locks.ReentrantLock; |
| |
| import com.ibm.icu.impl.BOCU; |
| import com.ibm.icu.impl.ICUDebug; |
| import com.ibm.icu.impl.ICUResourceBundle; |
| import com.ibm.icu.impl.ImplicitCEGenerator; |
| import com.ibm.icu.impl.IntTrie; |
| import com.ibm.icu.impl.StringUCharacterIterator; |
| import com.ibm.icu.impl.Trie; |
| import com.ibm.icu.impl.TrieIterator; |
| import com.ibm.icu.impl.Utility; |
| import com.ibm.icu.lang.UCharacter; |
| import com.ibm.icu.lang.UScript; |
| import com.ibm.icu.util.RangeValueIterator; |
| import com.ibm.icu.util.ULocale; |
| import com.ibm.icu.util.UResourceBundle; |
| import com.ibm.icu.util.VersionInfo; |
| |
| /** |
| * <p> |
| * RuleBasedCollator is a concrete subclass of Collator. It allows customization of the Collator via user-specified rule |
| * sets. RuleBasedCollator is designed to be fully compliant to the <a |
| * href="http://www.unicode.org/unicode/reports/tr10/">Unicode Collation Algorithm (UCA)</a> and conforms to ISO 14651. |
| * </p> |
| * |
| * <p> |
| * Users are strongly encouraged to read <a href="http://www.icu-project.org/userguide/Collate_Intro.html"> the users |
| * guide</a> for more information about the collation service before using this class. |
| * </p> |
| * |
| * <p> |
| * Create a RuleBasedCollator from a locale by calling the getInstance(Locale) factory method in the base class |
| * Collator. Collator.getInstance(Locale) creates a RuleBasedCollator object based on the collation rules defined by the |
| * argument locale. If a customized collation ordering or attributes are required, use the RuleBasedCollator(String) |
| * constructor with the appropriate rules. The customized RuleBasedCollator will base its ordering on UCA, while |
| * re-adjusting the attributes and orders of the characters in the specified rule accordingly. |
| * </p> |
| * |
| * <p> |
| * RuleBasedCollator provides correct collation orders for most locales supported in ICU. If specific data for a locale |
| * is not available, the order eventually falls back to the <a href="http://www.unicode.org/unicode/reports/tr10/">UCA |
| * collation order </a>. |
| * </p> |
| * |
| * <p> |
| * For information about the collation rule syntax and details about customization, please refer to the <a |
| * href="http://www.icu-project.org/userguide/Collate_Customization.html"> Collation customization</a> section of the |
| * user's guide. |
| * </p> |
| * |
| * <p> |
| * <strong>Note</strong> that there are some differences between the Collation rule syntax used in Java and ICU4J: |
| * |
| * <ul> |
| * <li>According to the JDK documentation: <i> |
| * <p> |
| * Modifier '!' : Turns on Thai/Lao vowel-consonant swapping. If this rule is in force when a Thai vowel of the range |
| * \U0E40-\U0E44 precedes a Thai consonant of the range \U0E01-\U0E2E OR a Lao vowel of the range |
| * \U0EC0-\U0EC4 precedes a Lao consonant of the range \U0E81-\U0EAE then the vowel is placed after the |
| * consonant for collation purposes. |
| * </p> |
| * <p> |
| * If a rule is without the modifier '!', the Thai/Lao vowel-consonant swapping is not turned on. |
| * </p> |
| * </i> |
| * <p> |
| * ICU4J's RuleBasedCollator does not support turning off the Thai/Lao vowel-consonant swapping, since the UCA clearly |
| * states that it has to be supported to ensure a correct sorting order. If a '!' is encountered, it is ignored. |
| * </p> |
| * <li>As mentioned in the documentation of the base class Collator, compatibility decomposition mode is not supported. |
| * </ul> |
| * <p> |
| * <strong>Examples</strong> |
| * </p> |
| * <p> |
| * Creating Customized RuleBasedCollators: <blockquote> |
| * |
| * <pre> |
| * String simple = "& a < b < c < d"; |
| * RuleBasedCollator simpleCollator = new RuleBasedCollator(simple); |
| * |
| * String norwegian = "& a , A < b , B < c , C < d , D < e , E " |
| * + "< f , F < g , G < h , H < i , I < j , " |
| * + "J < k , K < l , L < m , M < n , N < " |
| * + "o , O < p , P < q , Q < r , R < s , S < " |
| * + "t , T < u , U < v , V < w , W < x , X " |
| * + "< y , Y < z , Z < \u00E5 = a\u030A " |
| * + ", \u00C5 = A\u030A ; aa , AA < \u00E6 " |
| * + ", \u00C6 < \u00F8 , \u00D8"; |
| * RuleBasedCollator norwegianCollator = new RuleBasedCollator(norwegian); |
| * </pre> |
| * |
| * </blockquote> |
| * |
| * Concatenating rules to combine <code>Collator</code>s: <blockquote> |
| * |
| * <pre> |
| * // Create an en_US Collator object |
| * RuleBasedCollator en_USCollator = (RuleBasedCollator) |
| * Collator.getInstance(new Locale("en", "US", "")); |
| * // Create a da_DK Collator object |
| * RuleBasedCollator da_DKCollator = (RuleBasedCollator) |
| * Collator.getInstance(new Locale("da", "DK", "")); |
| * // Combine the two |
| * // First, get the collation rules from en_USCollator |
| * String en_USRules = en_USCollator.getRules(); |
| * // Second, get the collation rules from da_DKCollator |
| * String da_DKRules = da_DKCollator.getRules(); |
| * RuleBasedCollator newCollator = |
| * new RuleBasedCollator(en_USRules + da_DKRules); |
| * // newCollator has the combined rules |
| * </pre> |
| * |
| * </blockquote> |
| * |
| * Making changes to an existing RuleBasedCollator to create a new <code>Collator</code> object, by appending changes to |
| * the existing rule: <blockquote> |
| * |
| * <pre> |
| * // Create a new Collator object with additional rules |
| * String addRules = "& C < ch, cH, Ch, CH"; |
| * RuleBasedCollator myCollator = |
| * new RuleBasedCollator(en_USCollator.getRules() + addRules); |
| * // myCollator contains the new rules |
| * </pre> |
| * |
| * </blockquote> |
| * |
| * How to change the order of non-spacing accents: <blockquote> |
| * |
| * <pre> |
| * // old rule with main accents |
| * String oldRules = "= \u0301 ; \u0300 ; \u0302 ; \u0308 " |
| * + "; \u0327 ; \u0303 ; \u0304 ; \u0305 " |
| * + "; \u0306 ; \u0307 ; \u0309 ; \u030A " |
| * + "; \u030B ; \u030C ; \u030D ; \u030E " |
| * + "; \u030F ; \u0310 ; \u0311 ; \u0312 " |
| * + "< a , A ; ae, AE ; \u00e6 , \u00c6 " |
| * + "< b , B < c, C < e, E & C < d , D"; |
| * // change the order of accent characters |
| * String addOn = "& \u0300 ; \u0308 ; \u0302"; |
| * RuleBasedCollator myCollator = new RuleBasedCollator(oldRules + addOn); |
| * </pre> |
| * |
| * </blockquote> |
| * |
| * Putting in a new primary ordering before the default setting, e.g. sort English characters before or after Japanese |
| * characters in the Japanese <code>Collator</code>: <blockquote> |
| * |
| * <pre> |
| * // get en_US Collator rules |
| * RuleBasedCollator en_USCollator |
| * = (RuleBasedCollator)Collator.getInstance(Locale.US); |
| * // add a few Japanese characters to sort before English characters |
| * // suppose the last character before the first base letter 'a' in |
| * // the English collation rule is \u2212 |
| * String jaString = "& \u2212 < \u3041, \u3042 < \u3043, " |
| * + "\u3044"; |
| * RuleBasedCollator myJapaneseCollator |
| * = new RuleBasedCollator(en_USCollator.getRules() + jaString); |
| * </pre> |
| * |
| * </blockquote> |
| * </p> |
| * <p> |
| * This class is not subclassable |
| * </p> |
| * |
| * @author Syn Wee Quek |
| * @stable ICU 2.8 |
| */ |
| public final class RuleBasedCollator extends Collator { |
| // public constructors --------------------------------------------------- |
| |
/**
 * <p>
 * Builds a collator from the supplied customization rules. The resulting collator is based on UCA, with the
 * attributes and character re-ordering described by the rules applied on top.
 * </p>
 * <p>
 * The rule syntax is described in the user guide's section on <a
 * href="http://www.icu-project.org/userguide/Collate_Customization.html">Collation Customization</a>.
 * </p>
 *
 * @param rules
 *            the collation rules to build the collation table from; must not be null.
 * @exception ParseException
 *                and IOException thrown. ParseException thrown when argument rules have an invalid syntax.
 *                IOException thrown when an error occurred while reading internal data.
 * @exception IllegalArgumentException
 *                thrown when the rules argument is null.
 * @stable ICU 2.8
 */
public RuleBasedCollator(String rules) throws Exception {
    // The UCA check deliberately runs before the argument is validated.
    checkUCA();
    if (rules != null) {
        init(rules);
    } else {
        throw new IllegalArgumentException("Collation rules can not be null");
    }
}
| |
| // public methods -------------------------------------------------------- |
| |
| /** |
| * Clones the RuleBasedCollator |
| * |
| * @return a new instance of this RuleBasedCollator object |
| * @stable ICU 2.8 |
| */ |
| public Object clone() throws CloneNotSupportedException { |
| return clone(isFrozen()); |
| } |
| |
| /** |
| * Clones the RuleBasedCollator |
| * |
| * @param frozen should the clone be frozen or not |
| * @return a new instance of this RuleBasedCollator object |
| */ |
| private Object clone(boolean frozen) throws CloneNotSupportedException { |
| //TODO: once buffer and threading issue is resolved have frozen clone just return itself |
| RuleBasedCollator result = (RuleBasedCollator) super.clone(); |
| if (latinOneCEs_ != null) { |
| result.m_reallocLatinOneCEs_ = true; |
| result.m_ContInfo_ = new ContractionInfo(); |
| } |
| |
| // since all collation data in the RuleBasedCollator do not change |
| // we can safely assign the result.fields to this collator |
| // except in cases where we can't |
| result.collationBuffer = null; |
| result.frozenLock = frozen ? new ReentrantLock() : null; |
| return result; |
| } |
| |
| /** |
| * Return a CollationElementIterator for the given String. |
| * |
| * @see CollationElementIterator |
| * @stable ICU 2.8 |
| */ |
| public CollationElementIterator getCollationElementIterator(String source) { |
| return new CollationElementIterator(source, this); |
| } |
| |
| /** |
| * Return a CollationElementIterator for the given CharacterIterator. The source iterator's integrity will be |
| * preserved since a new copy will be created for use. |
| * |
| * @see CollationElementIterator |
| * @stable ICU 2.8 |
| */ |
| public CollationElementIterator getCollationElementIterator(CharacterIterator source) { |
| CharacterIterator newsource = (CharacterIterator) source.clone(); |
| return new CollationElementIterator(newsource, this); |
| } |
| |
| /** |
| * Return a CollationElementIterator for the given UCharacterIterator. The source iterator's integrity will be |
| * preserved since a new copy will be created for use. |
| * |
| * @see CollationElementIterator |
| * @stable ICU 2.8 |
| */ |
| public CollationElementIterator getCollationElementIterator(UCharacterIterator source) { |
| return new CollationElementIterator(source, this); |
| } |
| |
| // Freezable interface implementation ------------------------------------------------- |
| |
| /** |
| * Determines whether the object has been frozen or not. |
| * @draft ICU 4.8 |
| */ |
| public boolean isFrozen() { |
| return frozenLock != null; |
| } |
| |
| /** |
| * Freezes the collator. |
| * @return the collator itself. |
| * @draft ICU 4.8 |
| */ |
| public Collator freeze() { |
| if (!isFrozen()) { |
| frozenLock = new ReentrantLock(); |
| } |
| return this; |
| } |
| |
| /** |
| * Provides for the clone operation. Any clone is initially unfrozen. |
| * @draft ICU 4.8 |
| */ |
| public RuleBasedCollator cloneAsThawed() { |
| RuleBasedCollator clone = null; |
| try { |
| clone = (RuleBasedCollator) clone(false); |
| } catch (CloneNotSupportedException e) { |
| // Clone is implemented |
| } |
| return clone; |
| } |
| |
| // public setters -------------------------------------------------------- |
| |
| /** |
| * Sets the Hiragana Quaternary mode to be on or off. When the Hiragana Quaternary mode is turned on, the collator |
| * positions Hiragana characters before all non-ignorable characters in QUATERNARY strength. This is to produce a |
| * correct JIS collation order, distinguishing between Katakana and Hiragana characters. |
| * |
| * @param flag |
| * true if Hiragana Quaternary mode is to be on, false otherwise |
| * @see #setHiraganaQuaternaryDefault |
| * @see #isHiraganaQuaternary |
| * @stable ICU 2.8 |
| */ |
| public void setHiraganaQuaternary(boolean flag) { |
| if (isFrozen()) { |
| throw new UnsupportedOperationException("Attempt to modify frozen object"); |
| } |
| |
| m_isHiragana4_ = flag; |
| updateInternalState(); |
| } |
| |
| /** |
| * Sets the Hiragana Quaternary mode to the initial mode set during construction of the RuleBasedCollator. See |
| * setHiraganaQuaternary(boolean) for more details. |
| * |
| * @see #setHiraganaQuaternary(boolean) |
| * @see #isHiraganaQuaternary |
| * @stable ICU 2.8 |
| */ |
| public void setHiraganaQuaternaryDefault() { |
| if (isFrozen()) { |
| throw new UnsupportedOperationException("Attempt to modify frozen object"); |
| } |
| |
| m_isHiragana4_ = m_defaultIsHiragana4_; |
| updateInternalState(); |
| } |
| |
| /** |
| * Sets whether uppercase characters sort before lowercase characters or vice versa, in strength TERTIARY. The |
| * default mode is false, and so lowercase characters sort before uppercase characters. If true, sort upper case |
| * characters first. |
| * |
| * @param upperfirst |
| * true to sort uppercase characters before lowercase characters, false to sort lowercase characters |
| * before uppercase characters |
| * @see #isLowerCaseFirst |
| * @see #isUpperCaseFirst |
| * @see #setLowerCaseFirst |
| * @see #setCaseFirstDefault |
| * @stable ICU 2.8 |
| */ |
| public void setUpperCaseFirst(boolean upperfirst) { |
| if (isFrozen()) { |
| throw new UnsupportedOperationException("Attempt to modify frozen object"); |
| } |
| |
| if (upperfirst) { |
| if (m_caseFirst_ != AttributeValue.UPPER_FIRST_) { |
| latinOneRegenTable_ = true; |
| } |
| m_caseFirst_ = AttributeValue.UPPER_FIRST_; |
| } else { |
| if (m_caseFirst_ != AttributeValue.OFF_) { |
| latinOneRegenTable_ = true; |
| } |
| m_caseFirst_ = AttributeValue.OFF_; |
| } |
| updateInternalState(); |
| } |
| |
| /** |
| * Sets the orders of lower cased characters to sort before upper cased characters, in strength TERTIARY. The |
| * default mode is false. If true is set, the RuleBasedCollator will sort lower cased characters before the upper |
| * cased ones. Otherwise, if false is set, the RuleBasedCollator will ignore case preferences. |
| * |
| * @param lowerfirst |
| * true for sorting lower cased characters before upper cased characters, false to ignore case |
| * preferences. |
| * @see #isLowerCaseFirst |
| * @see #isUpperCaseFirst |
| * @see #setUpperCaseFirst |
| * @see #setCaseFirstDefault |
| * @stable ICU 2.8 |
| */ |
| public void setLowerCaseFirst(boolean lowerfirst) { |
| if (isFrozen()) { |
| throw new UnsupportedOperationException("Attempt to modify frozen object"); |
| } |
| |
| if (lowerfirst) { |
| if (m_caseFirst_ != AttributeValue.LOWER_FIRST_) { |
| latinOneRegenTable_ = true; |
| } |
| m_caseFirst_ = AttributeValue.LOWER_FIRST_; |
| } else { |
| if (m_caseFirst_ != AttributeValue.OFF_) { |
| latinOneRegenTable_ = true; |
| } |
| m_caseFirst_ = AttributeValue.OFF_; |
| } |
| updateInternalState(); |
| } |
| |
| /** |
| * Sets the case first mode to the initial mode set during construction of the RuleBasedCollator. See |
| * setUpperCaseFirst(boolean) and setLowerCaseFirst(boolean) for more details. |
| * |
| * @see #isLowerCaseFirst |
| * @see #isUpperCaseFirst |
| * @see #setLowerCaseFirst(boolean) |
| * @see #setUpperCaseFirst(boolean) |
| * @stable ICU 2.8 |
| */ |
| public final void setCaseFirstDefault() { |
| if (isFrozen()) { |
| throw new UnsupportedOperationException("Attempt to modify frozen object"); |
| } |
| |
| if (m_caseFirst_ != m_defaultCaseFirst_) { |
| latinOneRegenTable_ = true; |
| } |
| m_caseFirst_ = m_defaultCaseFirst_; |
| updateInternalState(); |
| } |
| |
| /** |
| * Sets the alternate handling mode to the initial mode set during construction of the RuleBasedCollator. See |
| * setAlternateHandling(boolean) for more details. |
| * |
| * @see #setAlternateHandlingShifted(boolean) |
| * @see #isAlternateHandlingShifted() |
| * @stable ICU 2.8 |
| */ |
| public void setAlternateHandlingDefault() { |
| if (isFrozen()) { |
| throw new UnsupportedOperationException("Attempt to modify frozen object"); |
| } |
| |
| m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_; |
| updateInternalState(); |
| } |
| |
| /** |
| * Sets the case level mode to the initial mode set during construction of the RuleBasedCollator. See |
| * setCaseLevel(boolean) for more details. |
| * |
| * @see #setCaseLevel(boolean) |
| * @see #isCaseLevel |
| * @stable ICU 2.8 |
| */ |
| public void setCaseLevelDefault() { |
| if (isFrozen()) { |
| throw new UnsupportedOperationException("Attempt to modify frozen object"); |
| } |
| |
| m_isCaseLevel_ = m_defaultIsCaseLevel_; |
| updateInternalState(); |
| } |
| |
| /** |
| * Sets the decomposition mode to the initial mode set during construction of the RuleBasedCollator. See |
| * setDecomposition(int) for more details. |
| * |
| * @see #getDecomposition |
| * @see #setDecomposition(int) |
| * @stable ICU 2.8 |
| */ |
| public void setDecompositionDefault() { |
| if (isFrozen()) { |
| throw new UnsupportedOperationException("Attempt to modify frozen object"); |
| } |
| |
| setDecomposition(m_defaultDecomposition_); |
| updateInternalState(); |
| } |
| |
| /** |
| * Sets the French collation mode to the initial mode set during construction of the RuleBasedCollator. See |
| * setFrenchCollation(boolean) for more details. |
| * |
| * @see #isFrenchCollation |
| * @see #setFrenchCollation(boolean) |
| * @stable ICU 2.8 |
| */ |
| public void setFrenchCollationDefault() { |
| if (isFrozen()) { |
| throw new UnsupportedOperationException("Attempt to modify frozen object"); |
| } |
| |
| if (m_isFrenchCollation_ != m_defaultIsFrenchCollation_) { |
| latinOneRegenTable_ = true; |
| } |
| m_isFrenchCollation_ = m_defaultIsFrenchCollation_; |
| updateInternalState(); |
| } |
| |
| /** |
| * Sets the collation strength to the initial mode set during the construction of the RuleBasedCollator. See |
| * setStrength(int) for more details. |
| * |
| * @see #setStrength(int) |
| * @see #getStrength |
| * @stable ICU 2.8 |
| */ |
| public void setStrengthDefault() { |
| setStrength(m_defaultStrength_); |
| updateInternalState(); |
| } |
| |
| /** |
| * Method to set numeric collation to its default value. When numeric collation is turned on, this Collator |
| * generates a collation key for the numeric value of substrings of digits. This is a way to get '100' to sort AFTER |
| * '2' |
| * |
| * @see #getNumericCollation |
| * @see #setNumericCollation |
| * @stable ICU 2.8 |
| */ |
| public void setNumericCollationDefault() { |
| if (isFrozen()) { |
| throw new UnsupportedOperationException("Attempt to modify frozen object"); |
| } |
| |
| setNumericCollation(m_defaultIsNumericCollation_); |
| updateInternalState(); |
| } |
| |
| /** |
| * Sets the mode for the direction of SECONDARY weights to be used in French collation. The default value is false, |
| * which treats SECONDARY weights in the order they appear. If set to true, the SECONDARY weights will be sorted |
| * backwards. See the section on <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html"> |
| * French collation</a> for more information. |
| * |
| * @param flag |
| * true to set the French collation on, false to set it off |
| * @stable ICU 2.8 |
| * @see #isFrenchCollation |
| * @see #setFrenchCollationDefault |
| */ |
| public void setFrenchCollation(boolean flag) { |
| if (isFrozen()) { |
| throw new UnsupportedOperationException("Attempt to modify frozen object"); |
| } |
| |
| if (m_isFrenchCollation_ != flag) { |
| latinOneRegenTable_ = true; |
| } |
| m_isFrenchCollation_ = flag; |
| updateInternalState(); |
| } |
| |
| /** |
| * Sets the alternate handling for QUATERNARY strength to be either shifted or non-ignorable. See the UCA definition |
| * on <a href="http://www.unicode.org/unicode/reports/tr10/#Variable_Weighting"> Alternate Weighting</a>. This |
| * attribute will only be effective when QUATERNARY strength is set. The default value for this mode is false, |
| * corresponding to the NON_IGNORABLE mode in UCA. In the NON-IGNORABLE mode, the RuleBasedCollator will treats all |
| * the codepoints with non-ignorable primary weights in the same way. If the mode is set to true, the behaviour |
| * corresponds to SHIFTED defined in UCA, this causes codepoints with PRIMARY orders that are equal or below the |
| * variable top value to be ignored in PRIMARY order and moved to the QUATERNARY order. |
| * |
| * @param shifted |
| * true if SHIFTED behaviour for alternate handling is desired, false for the NON_IGNORABLE behaviour. |
| * @see #isAlternateHandlingShifted |
| * @see #setAlternateHandlingDefault |
| * @stable ICU 2.8 |
| */ |
| public void setAlternateHandlingShifted(boolean shifted) { |
| if (isFrozen()) { |
| throw new UnsupportedOperationException("Attempt to modify frozen object"); |
| } |
| |
| m_isAlternateHandlingShifted_ = shifted; |
| updateInternalState(); |
| } |
| |
| /** |
| * <p> |
| * When case level is set to true, an additional weight is formed between the SECONDARY and TERTIARY weight, known |
| * as the case level. The case level is used to distinguish large and small Japanese Kana characters. Case level |
| * could also be used in other situations. For example to distinguish certain Pinyin characters. The default value |
| * is false, which means the case level is not generated. The contents of the case level are affected by the case |
| * first mode. A simple way to ignore accent differences in a string is to set the strength to PRIMARY and enable |
| * case level. |
| * </p> |
| * <p> |
| * See the section on <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html"> case |
| * level</a> for more information. |
| * </p> |
| * |
| * @param flag |
| * true if case level sorting is required, false otherwise |
| * @stable ICU 2.8 |
| * @see #setCaseLevelDefault |
| * @see #isCaseLevel |
| */ |
| public void setCaseLevel(boolean flag) { |
| if (isFrozen()) { |
| throw new UnsupportedOperationException("Attempt to modify frozen object"); |
| } |
| |
| m_isCaseLevel_ = flag; |
| updateInternalState(); |
| } |
| |
| /** |
| * <p> |
| * Sets this Collator's strength property. The strength property determines the minimum level of difference |
| * considered significant during comparison. |
| * </p> |
| * <p> |
| * See the Collator class description for an example of use. |
| * </p> |
| * |
| * @param newStrength |
| * the new strength value. |
| * @see #getStrength |
| * @see #setStrengthDefault |
| * @see #PRIMARY |
| * @see #SECONDARY |
| * @see #TERTIARY |
| * @see #QUATERNARY |
| * @see #IDENTICAL |
| * @exception IllegalArgumentException |
| * If the new strength value is not one of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL. |
| * @stable ICU 2.8 |
| */ |
| public void setStrength(int newStrength) { |
| super.setStrength(newStrength); |
| updateInternalState(); |
| } |
| |
| /** |
| * <p> |
| * Variable top is a two byte primary value which causes all the codepoints with primary values that are less or |
| * equal than the variable top to be shifted when alternate handling is set to SHIFTED. |
| * </p> |
| * <p> |
| * Sets the variable top to a collation element value of a string supplied. |
| * </p> |
| * |
| * @param varTop |
| * one or more (if contraction) characters to which the variable top should be set |
| * @return a int value containing the value of the variable top in upper 16 bits. Lower 16 bits are undefined. |
| * @exception IllegalArgumentException |
| * is thrown if varTop argument is not a valid variable top element. A variable top element is |
| * invalid when |
| * <ul> |
| * <li>it is a contraction that does not exist in the Collation order |
| * <li>when the PRIMARY strength collation element for the variable top has more than two bytes |
| * <li>when the varTop argument is null or zero in length. |
| * </ul> |
| * @see #getVariableTop |
| * @see RuleBasedCollator#setAlternateHandlingShifted |
| * @stable ICU 2.6 |
| */ |
| public int setVariableTop(String varTop) { |
| if (isFrozen()) { |
| throw new UnsupportedOperationException("Attempt to modify frozen object"); |
| } |
| |
| if (varTop == null || varTop.length() == 0) { |
| throw new IllegalArgumentException("Variable top argument string can not be null or zero in length."); |
| } |
| |
| CollationBuffer buffer = null; |
| try { |
| buffer = getCollationBuffer(); |
| return setVariableTop(varTop, buffer); |
| } finally { |
| releaseCollationBuffer(buffer); |
| } |
| |
| } |
| |
| private int setVariableTop(String varTop, CollationBuffer buffer) { |
| buffer.m_srcUtilColEIter_.setText(varTop); |
| int ce = buffer.m_srcUtilColEIter_.next(); |
| |
| // here we check if we have consumed all characters |
| // you can put in either one character or a contraction |
| // you shouldn't put more... |
| if (buffer.m_srcUtilColEIter_.getOffset() != varTop.length() || ce == CollationElementIterator.NULLORDER) { |
| throw new IllegalArgumentException("Variable top argument string is a contraction that does not exist " |
| + "in the Collation order"); |
| } |
| |
| int nextCE = buffer.m_srcUtilColEIter_.next(); |
| |
| if ((nextCE != CollationElementIterator.NULLORDER) |
| && (!isContinuation(nextCE) || (nextCE & CE_PRIMARY_MASK_) != 0)) { |
| throw new IllegalArgumentException("Variable top argument string can only have a single collation " |
| + "element that has less than or equal to two PRIMARY strength " + "bytes"); |
| } |
| |
| m_variableTopValue_ = (ce & CE_PRIMARY_MASK_) >> 16; |
| |
| return ce & CE_PRIMARY_MASK_; |
| } |
| |
| /** |
| * Sets the variable top to a collation element value supplied. Variable top is set to the upper 16 bits. Lower 16 |
| * bits are ignored. |
| * |
| * @param varTop |
| * Collation element value, as returned by setVariableTop or getVariableTop |
| * @see #getVariableTop |
| * @see #setVariableTop(String) |
| * @stable ICU 2.6 |
| */ |
| public void setVariableTop(int varTop) { |
| if (isFrozen()) { |
| throw new UnsupportedOperationException("Attempt to modify frozen object"); |
| } |
| |
| m_variableTopValue_ = (varTop & CE_PRIMARY_MASK_) >> 16; |
| } |
| |
| /** |
| * When numeric collation is turned on, this Collator generates a collation key for the numeric value of substrings |
| * of digits. This is a way to get '100' to sort AFTER '2' |
| * |
| * @param flag |
| * true to turn numeric collation on and false to turn it off |
| * @see #getNumericCollation |
| * @see #setNumericCollationDefault |
| * @stable ICU 2.8 |
| */ |
| public void setNumericCollation(boolean flag) { |
| if (isFrozen()) { |
| throw new UnsupportedOperationException("Attempt to modify frozen object"); |
| } |
| |
| // sort substrings of digits as numbers |
| m_isNumericCollation_ = flag; |
| updateInternalState(); |
| } |
| |
| /** |
| * Sets the reordering codes for this collator. |
| * Collation reordering allows scripts and some other defined blocks of characters |
| * to be moved relative to each other as a block. This reordering is done on top of |
| * the DUCET/CLDR standard collation order. Reordering can specify groups to be placed |
| * at the start and/or the end of the collation order. |
| * <p>By default, reordering codes specified for the start of the order are placed in the |
| * order given after a group of “special” non-script blocks. These special groups of characters |
| * are space, punctuation, symbol, currency, and digit. These special groups are represented with |
| * {@link Collator.ReorderCodes}. Script groups can be intermingled with |
| * these special non-script blocks if those special blocks are explicitly specified in the reordering. |
| * <p>The special code {@link Collator.ReorderCodes#OTHERS OTHERS} stands for any script that is not explicitly |
| * mentioned in the list of reordering codes given. Anything that is after {@link Collator.ReorderCodes#OTHERS OTHERS} |
| * will go at the very end of the reordering in the order given. |
| * <p>The special reorder code {@link Collator.ReorderCodes#DEFAULT DEFAULT} will reset the reordering for this collator |
| * to the default for this collator. The default reordering may be the DUCET/CLDR order or may be a reordering that |
| * was specified when this collator was created from resource data or from rules. The |
| * {@link Collator.ReorderCodes#DEFAULT DEFAULT} code <b>must</b> be the sole code supplied when it used. If not |
| * that will result in an {@link IllegalArgumentException} being thrown. |
| * <p>The special reorder code {@link Collator.ReorderCodes#NONE NONE} will remove any reordering for this collator. |
| * The result of setting no reordering will be to have the DUCET/CLDR reordering used. The |
| * {@link Collator.ReorderCodes#NONE NONE} code <b>must</b> be the sole code supplied when it used. |
| * @param order the reordering codes to apply to this collator; if this is null or an empty array |
| * then this clears any existing reordering |
| * @throws IllegalArgumentException if the reordering codes are malformed in any way (e.g. duplicates, multiple reset codes, overlapping equivalent scripts) |
| * @see #getReorderCodes |
| * @see #getEquivalentReorderCodes |
| * @draft ICU 4.8 |
| */ |
| public void setReorderCodes(int... order) { |
| if (isFrozen()) { |
| throw new UnsupportedOperationException("Attempt to modify frozen object"); |
| } |
| |
| if (order != null && order.length > 0) { |
| m_reorderCodes_ = order.clone(); |
| } else { |
| m_reorderCodes_ = null; |
| } |
| buildPermutationTable(); |
| } |
| |
| // public getters -------------------------------------------------------- |
| |
| /** |
| * Gets the collation rules for this RuleBasedCollator. Equivalent to String getRules(RuleOption.FULL_RULES). |
| * |
| * @return returns the collation rules |
| * @see #getRules(boolean) |
| * @stable ICU 2.8 |
| */ |
| public String getRules() { |
| return m_rules_; |
| } |
| |
| /** |
| * Returns current rules. The argument defines whether full rules (UCA + tailored) rules are returned or just the |
| * tailoring. |
| * |
| * @param fullrules |
| * true if the rules that defines the full set of collation order is required, otherwise false for |
| * returning only the tailored rules |
| * @return the current rules that defines this Collator. |
| * @see #getRules() |
| * @stable ICU 2.6 |
| */ |
| public String getRules(boolean fullrules) { |
| if (!fullrules) { |
| return m_rules_; |
| } |
| // take the UCA rules and append real rules at the end |
| return UCA_.m_rules_.concat(m_rules_); |
| } |
| |
| /** |
| * Get an UnicodeSet that contains all the characters and sequences tailored in this collator. |
| * |
| * @return a pointer to a UnicodeSet object containing all the code points and sequences that may sort differently |
| * than in the UCA. |
| * @stable ICU 2.4 |
| */ |
| public UnicodeSet getTailoredSet() { |
| try { |
| CollationRuleParser src = new CollationRuleParser(getRules()); |
| return src.getTailoredSet(); |
| } catch (Exception e) { |
| throw new IllegalStateException("A tailoring rule should not " + "have errors. Something is quite wrong!"); |
| } |
| } |
| |
| private class contContext { |
| RuleBasedCollator coll; |
| UnicodeSet contractions; |
| UnicodeSet expansions; |
| UnicodeSet removedContractions; |
| boolean addPrefixes; |
| |
| contContext(RuleBasedCollator coll, UnicodeSet contractions, UnicodeSet expansions, |
| UnicodeSet removedContractions, boolean addPrefixes) { |
| this.coll = coll; |
| this.contractions = contractions; |
| this.expansions = expansions; |
| this.removedContractions = removedContractions; |
| this.addPrefixes = addPrefixes; |
| } |
| } |
| |
| private void addSpecial(contContext c, StringBuilder buffer, int CE) { |
| StringBuilder b = new StringBuilder(); |
| int offset = (CE & 0xFFFFFF) - c.coll.m_contractionOffset_; |
| int newCE = c.coll.m_contractionCE_[offset]; |
| // we might have a contraction that ends from previous level |
| if (newCE != CollationElementIterator.CE_NOT_FOUND_) { |
| if (isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_ && isSpecial(newCE) |
| && getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) { |
| addSpecial(c, buffer, newCE); |
| } |
| if (buffer.length() > 1) { |
| if (c.contractions != null) { |
| c.contractions.add(buffer.toString()); |
| } |
| if (c.expansions != null && isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) { |
| c.expansions.add(buffer.toString()); |
| } |
| } |
| } |
| |
| offset++; |
| // check whether we're doing contraction or prefix |
| if (getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) { |
| while (c.coll.m_contractionIndex_[offset] != 0xFFFF) { |
| b.delete(0, b.length()); |
| b.append(buffer); |
| newCE = c.coll.m_contractionCE_[offset]; |
| b.insert(0, c.coll.m_contractionIndex_[offset]); |
| if (isSpecial(newCE) |
| && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) { |
| addSpecial(c, b, newCE); |
| } else { |
| if (c.contractions != null) { |
| c.contractions.add(b.toString()); |
| } |
| if (c.expansions != null && isSpecial(newCE) |
| && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) { |
| c.expansions.add(b.toString()); |
| } |
| } |
| offset++; |
| } |
| } else if (getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_) { |
| while (c.coll.m_contractionIndex_[offset] != 0xFFFF) { |
| b.delete(0, b.length()); |
| b.append(buffer); |
| newCE = c.coll.m_contractionCE_[offset]; |
| b.append(c.coll.m_contractionIndex_[offset]); |
| if (isSpecial(newCE) |
| && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) { |
| addSpecial(c, b, newCE); |
| } else { |
| if (c.contractions != null) { |
| c.contractions.add(b.toString()); |
| } |
| if (c.expansions != null && isSpecial(newCE) |
| && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) { |
| c.expansions.add(b.toString()); |
| } |
| } |
| offset++; |
| } |
| } |
| } |
| |
| private void processSpecials(contContext c) { |
| int internalBufferSize = 512; |
| TrieIterator trieiterator = new TrieIterator(c.coll.m_trie_); |
| RangeValueIterator.Element element = new RangeValueIterator.Element(); |
| while (trieiterator.next(element)) { |
| int start = element.start; |
| int limit = element.limit; |
| int CE = element.value; |
| StringBuilder contraction = new StringBuilder(internalBufferSize); |
| |
| if (isSpecial(CE)) { |
| if (((getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) || getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_)) { |
| while (start < limit) { |
| // if there are suppressed contractions, we don't |
| // want to add them. |
| if (c.removedContractions != null && c.removedContractions.contains(start)) { |
| start++; |
| continue; |
| } |
| // we start our contraction from middle, since we don't know if it |
| // will grow toward right or left |
| contraction.append((char) start); |
| addSpecial(c, contraction, CE); |
| start++; |
| } |
| } else if (c.expansions != null && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) { |
| while (start < limit) { |
| c.expansions.add(start++); |
| } |
| } |
| } |
| } |
| } |
| |
| /** |
| * Gets unicode sets containing contractions and/or expansions of a collator |
| * |
| * @param contractions |
| * if not null, set to contain contractions |
| * @param expansions |
| * if not null, set to contain expansions |
| * @param addPrefixes |
| * add the prefix contextual elements to contractions |
| * @throws Exception |
| * Thrown if any error occurs. |
| * @stable ICU 3.4 |
| */ |
| public void getContractionsAndExpansions(UnicodeSet contractions, UnicodeSet expansions, boolean addPrefixes) |
| throws Exception { |
| if (contractions != null) { |
| contractions.clear(); |
| } |
| if (expansions != null) { |
| expansions.clear(); |
| } |
| String rules = getRules(); |
| try { |
| CollationRuleParser src = new CollationRuleParser(rules); |
| contContext c = new contContext(RuleBasedCollator.UCA_, contractions, expansions, src.m_removeSet_, |
| addPrefixes); |
| |
| // Add the UCA contractions |
| processSpecials(c); |
| // This is collator specific. Add contractions from a collator |
| c.coll = this; |
| c.removedContractions = null; |
| processSpecials(c); |
| } catch (Exception e) { |
| throw e; |
| } |
| } |
| |
| /** |
| * <p> |
| * Get a Collation key for the argument String source from this RuleBasedCollator. |
| * </p> |
| * <p> |
| * General recommendation: <br> |
| * If comparisons are to be done on the same String multiple times, it is more efficient to generate |
| * CollationKeys for the Strings and use CollationKey.compareTo(CollationKey) for the comparisons. If each |
| * String is compared only once, using the method RuleBasedCollator.compare(String, String) will have better |
| * performance. |
| * </p> |
| * <p> |
| * See the class documentation for an explanation about CollationKeys. |
| * </p> |
| * |
| * @param source |
| * the text String to be transformed into a collation key. |
| * @return the CollationKey for the given String based on this RuleBasedCollator's collation rules. If the source |
| * String is null, a null CollationKey is returned. |
| * @see CollationKey |
| * @see #compare(String, String) |
| * @see #getRawCollationKey |
| * @stable ICU 2.8 |
| */ |
| public CollationKey getCollationKey(String source) { |
| if (source == null) { |
| return null; |
| } |
| CollationBuffer buffer = null; |
| try { |
| buffer = getCollationBuffer(); |
| return getCollationKey(source, buffer); |
| } finally { |
| releaseCollationBuffer(buffer); |
| } |
| } |
| |
| private CollationKey getCollationKey(String source, CollationBuffer buffer) { |
| buffer.m_utilRawCollationKey_ = getRawCollationKey(source, buffer.m_utilRawCollationKey_, buffer); |
| return new CollationKey(source, buffer.m_utilRawCollationKey_); |
| } |
| |
| /** |
| * Gets the simpler form of a CollationKey for the String source following the rules of this Collator and stores the |
| * result into the user provided argument key. If key has an internal byte array of length that's too small for the |
| * result, the internal byte array will be grown to the exact required size. |
| * |
| * @param source the text String to be transformed into a RawCollationKey |
| * @param key output RawCollationKey to store results |
| * @return If key is null, a new instance of RawCollationKey will be created and returned, otherwise the user |
| * provided key will be returned. |
| * @see #getCollationKey |
| * @see #compare(String, String) |
| * @see RawCollationKey |
| * @stable ICU 2.8 |
| */ |
| public RawCollationKey getRawCollationKey(String source, RawCollationKey key) { |
| if (source == null) { |
| return null; |
| } |
| CollationBuffer buffer = null; |
| try { |
| buffer = getCollationBuffer(); |
| return getRawCollationKey(source, key, buffer); |
| } finally { |
| releaseCollationBuffer(buffer); |
| } |
| } |
| |
| private RawCollationKey getRawCollationKey(String source, RawCollationKey key, CollationBuffer buffer) { |
| int strength = getStrength(); |
| buffer.m_utilCompare0_ = m_isCaseLevel_; |
| // m_utilCompare1_ = true; |
| buffer.m_utilCompare2_ = strength >= SECONDARY; |
| buffer.m_utilCompare3_ = strength >= TERTIARY; |
| buffer.m_utilCompare4_ = strength >= QUATERNARY; |
| buffer.m_utilCompare5_ = strength == IDENTICAL; |
| |
| boolean doFrench = m_isFrenchCollation_ && buffer.m_utilCompare2_; |
| // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. |
| // If we have no qShifted, we don't need to set UCOL_COMMON_BOT4 so |
| // high. |
| int commonBottom4 = ((m_variableTopValue_ >>> 8) + 1) & LAST_BYTE_MASK_; |
| byte hiragana4 = 0; |
| if (m_isHiragana4_ && buffer.m_utilCompare4_) { |
| // allocate one more space for hiragana, value for hiragana |
| hiragana4 = (byte) commonBottom4; |
| commonBottom4++; |
| } |
| |
| int bottomCount4 = 0xFF - commonBottom4; |
| // If we need to normalize, we'll do it all at once at the beginning! |
| if (buffer.m_utilCompare5_ && Normalizer.quickCheck(source, Normalizer.NFD, 0) != Normalizer.YES) { |
| // if it is identical strength, we have to normalize the string to |
| // NFD so that it will be appended correctly to the end of the sort |
| // key |
| source = Normalizer.decompose(source, false); |
| } else if (getDecomposition() != NO_DECOMPOSITION |
| && Normalizer.quickCheck(source, Normalizer.FCD, 0) != Normalizer.YES) { |
| // for the rest of the strength, if decomposition is on, FCD is |
| // enough for us to work on. |
| source = Normalizer.normalize(source, Normalizer.FCD); |
| } |
| getSortKeyBytes(source, doFrench, hiragana4, commonBottom4, bottomCount4, buffer); |
| if (key == null) { |
| key = new RawCollationKey(); |
| } |
| getSortKey(source, doFrench, commonBottom4, bottomCount4, key, buffer); |
| return key; |
| } |
| |
| /** |
| * Return true if an uppercase character is sorted before the corresponding lowercase character. See |
| * setCaseFirst(boolean) for details. |
| * |
| * @see #setUpperCaseFirst |
| * @see #setLowerCaseFirst |
| * @see #isLowerCaseFirst |
| * @see #setCaseFirstDefault |
| * @return true if upper cased characters are sorted before lower cased characters, false otherwise |
| * @stable ICU 2.8 |
| */ |
| public boolean isUpperCaseFirst() { |
| return (m_caseFirst_ == AttributeValue.UPPER_FIRST_); |
| } |
| |
| /** |
| * Return true if a lowercase character is sorted before the corresponding uppercase character. See |
| * setCaseFirst(boolean) for details. |
| * |
| * @see #setUpperCaseFirst |
| * @see #setLowerCaseFirst |
| * @see #isUpperCaseFirst |
| * @see #setCaseFirstDefault |
| * @return true if lower cased characters are sorted before upper cased characters, false otherwise |
| * @stable ICU 2.8 |
| */ |
| public boolean isLowerCaseFirst() { |
| return (m_caseFirst_ == AttributeValue.LOWER_FIRST_); |
| } |
| |
| /** |
| * Checks if the alternate handling behaviour is the UCA defined SHIFTED or NON_IGNORABLE. If return value is true, |
| * then the alternate handling attribute for the Collator is SHIFTED. Otherwise if return value is false, then the |
| * alternate handling attribute for the Collator is NON_IGNORABLE. See setAlternateHandlingShifted(boolean) for more |
| * details. |
| * |
| * @return true or false |
| * @see #setAlternateHandlingShifted(boolean) |
| * @see #setAlternateHandlingDefault |
| * @stable ICU 2.8 |
| */ |
| public boolean isAlternateHandlingShifted() { |
| return m_isAlternateHandlingShifted_; |
| } |
| |
| /** |
| * Checks if case level is set to true. See setCaseLevel(boolean) for details. |
| * |
| * @return the case level mode |
| * @see #setCaseLevelDefault |
| * @see #isCaseLevel |
| * @see #setCaseLevel(boolean) |
| * @stable ICU 2.8 |
| */ |
| public boolean isCaseLevel() { |
| return m_isCaseLevel_; |
| } |
| |
| /** |
| * Checks if French Collation is set to true. See setFrenchCollation(boolean) for details. |
| * |
| * @return true if French Collation is set to true, false otherwise |
| * @see #setFrenchCollation(boolean) |
| * @see #setFrenchCollationDefault |
| * @stable ICU 2.8 |
| */ |
| public boolean isFrenchCollation() { |
| return m_isFrenchCollation_; |
| } |
| |
| /** |
| * Checks if the Hiragana Quaternary mode is set on. See setHiraganaQuaternary(boolean) for more details. |
| * |
| * @return flag true if Hiragana Quaternary mode is on, false otherwise |
| * @see #setHiraganaQuaternaryDefault |
| * @see #setHiraganaQuaternary(boolean) |
| * @stable ICU 2.8 |
| */ |
| public boolean isHiraganaQuaternary() { |
| return m_isHiragana4_; |
| } |
| |
| /** |
| * Gets the variable top value of a Collator. Lower 16 bits are undefined and should be ignored. |
| * |
| * @return the variable top value of a Collator. |
| * @see #setVariableTop |
| * @stable ICU 2.6 |
| */ |
| public int getVariableTop() { |
| return m_variableTopValue_ << 16; |
| } |
| |
| /** |
| * Method to retrieve the numeric collation value. When numeric collation is turned on, this Collator generates a |
| * collation key for the numeric value of substrings of digits. This is a way to get '100' to sort AFTER '2' |
| * |
| * @see #setNumericCollation |
| * @see #setNumericCollationDefault |
| * @return true if numeric collation is turned on, false otherwise |
| * @stable ICU 2.8 |
| */ |
| public boolean getNumericCollation() { |
| return m_isNumericCollation_; |
| } |
| |
| /** |
| * Retrieves the reordering codes for this collator. |
| * These reordering codes are a combination of UScript codes and ReorderCodes. |
| * @return a copy of the reordering codes for this collator; |
| * if none are set then returns an empty array |
| * @see #setReorderCodes |
| * @see #getEquivalentReorderCodes |
| * @draft ICU 4.8 |
| */ |
| public int[] getReorderCodes() { |
| if (m_reorderCodes_ != null) { |
| return m_reorderCodes_.clone(); |
| } else { |
| return LeadByteConstants.EMPTY_INT_ARRAY; |
| } |
| } |
| |
| /** |
| * Retrieves all the reorder codes that are grouped with the given reorder code. Some reorder |
| * codes are grouped and must reorder together. |
| * |
| * @param reorderCode code for which equivalents to be retrieved |
| * @return the set of all reorder codes in the same group as the given reorder code. |
| * @see #setReorderCodes |
| * @see #getReorderCodes |
| * @draft ICU 4.8 |
| */ |
| public static int[] getEquivalentReorderCodes(int reorderCode) { |
| Set<Integer> equivalentCodesSet = new HashSet<Integer>(); |
| int[] leadBytes = RuleBasedCollator.LEADBYTE_CONSTANTS_.getLeadBytesForReorderCode(reorderCode); |
| for (int leadByte : leadBytes) { |
| int[] codes = RuleBasedCollator.LEADBYTE_CONSTANTS_.getReorderCodesForLeadByte(leadByte); |
| for (int code : codes) { |
| equivalentCodesSet.add(code); |
| } |
| } |
| int[] equivalentCodes = new int[equivalentCodesSet.size()]; |
| int i = 0; |
| for (int code : equivalentCodesSet) { |
| equivalentCodes[i++] = code; |
| } |
| return equivalentCodes; |
| } |
| |
| // public other methods ------------------------------------------------- |
| |
| /** |
| * Compares the equality of two RuleBasedCollator objects. RuleBasedCollator objects are equal if they have the same |
| * collation rules and the same attributes. |
| * |
| * @param obj |
| * the RuleBasedCollator to be compared to. |
| * @return true if this RuleBasedCollator has exactly the same collation behaviour as obj, false otherwise. |
| * @stable ICU 2.8 |
| */ |
| public boolean equals(Object obj) { |
| if (obj == null) { |
| return false; // super does class check |
| } |
| if (this == obj) { |
| return true; |
| } |
| if (getClass() != obj.getClass()) { |
| return false; |
| } |
| RuleBasedCollator other = (RuleBasedCollator) obj; |
| // all other non-transient information is also contained in rules. |
| if (getStrength() != other.getStrength() || getDecomposition() != other.getDecomposition() |
| || other.m_caseFirst_ != m_caseFirst_ || other.m_caseSwitch_ != m_caseSwitch_ |
| || other.m_isAlternateHandlingShifted_ != m_isAlternateHandlingShifted_ |
| || other.m_isCaseLevel_ != m_isCaseLevel_ || other.m_isFrenchCollation_ != m_isFrenchCollation_ |
| || other.m_isHiragana4_ != m_isHiragana4_) { |
| return false; |
| } |
| if (m_reorderCodes_ != null ^ other.m_reorderCodes_ != null) { |
| return false; |
| } |
| if (m_reorderCodes_ != null) { |
| if (m_reorderCodes_.length != other.m_reorderCodes_.length) { |
| return false; |
| } |
| for (int i = 0; i < m_reorderCodes_.length; i++) { |
| if (m_reorderCodes_[i] != other.m_reorderCodes_[i]) { |
| return false; |
| } |
| } |
| } |
| boolean rules = m_rules_ == other.m_rules_; |
| if (!rules && (m_rules_ != null && other.m_rules_ != null)) { |
| rules = m_rules_.equals(other.m_rules_); |
| } |
| if (!rules || !ICUDebug.enabled("collation")) { |
| return rules; |
| } |
| if (m_addition3_ != other.m_addition3_ || m_bottom3_ != other.m_bottom3_ |
| || m_bottomCount3_ != other.m_bottomCount3_ || m_common3_ != other.m_common3_ |
| || m_isSimple3_ != other.m_isSimple3_ || m_mask3_ != other.m_mask3_ |
| || m_minContractionEnd_ != other.m_minContractionEnd_ || m_minUnsafe_ != other.m_minUnsafe_ |
| || m_top3_ != other.m_top3_ || m_topCount3_ != other.m_topCount3_ |
| || !Arrays.equals(m_unsafe_, other.m_unsafe_)) { |
| return false; |
| } |
| if (!m_trie_.equals(other.m_trie_)) { |
| // we should use the trie iterator here, but then this part is |
| // only used in the test. |
| for (int i = UCharacter.MAX_VALUE; i >= UCharacter.MIN_VALUE; i--) { |
| int v = m_trie_.getCodePointValue(i); |
| int otherv = other.m_trie_.getCodePointValue(i); |
| if (v != otherv) { |
| int mask = v & (CE_TAG_MASK_ | CE_SPECIAL_FLAG_); |
| if (mask == (otherv & 0xff000000)) { |
| v &= 0xffffff; |
| otherv &= 0xffffff; |
| if (mask == 0xf1000000) { |
| v -= (m_expansionOffset_ << 4); |
| otherv -= (other.m_expansionOffset_ << 4); |
| } else if (mask == 0xf2000000) { |
| v -= m_contractionOffset_; |
| otherv -= other.m_contractionOffset_; |
| } |
| if (v == otherv) { |
| continue; |
| } |
| } |
| return false; |
| } |
| } |
| } |
| if (!Arrays.equals(m_contractionCE_, other.m_contractionCE_) |
| || !Arrays.equals(m_contractionEnd_, other.m_contractionEnd_) |
| || !Arrays.equals(m_contractionIndex_, other.m_contractionIndex_) |
| || !Arrays.equals(m_expansion_, other.m_expansion_) |
| || !Arrays.equals(m_expansionEndCE_, other.m_expansionEndCE_)) { |
| return false; |
| } |
| // not comparing paddings |
| for (int i = 0; i < m_expansionEndCE_.length; i++) { |
| if (m_expansionEndCEMaxSize_[i] != other.m_expansionEndCEMaxSize_[i]) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| /** |
| * Generates a hash code for this RuleBasedCollator, computed from its rules string. |
| * |
| * @return the hash code for this Collator; collators with the same rules share the same hash code |
| * @stable ICU 2.8 |
| */ |
| public int hashCode() { |
| String rules = getRules(); |
| if (rules == null) { |
| rules = ""; |
| } |
| return rules.hashCode(); |
| } |
| |
| /** |
| * <p> |
| * Compares the source text String to the target text String according to the collation rules, strength and |
| * decomposition mode for this RuleBasedCollator. Returns an integer less than, equal to or greater than zero |
| * depending on whether the source String is less than, equal to or greater than the target String. See the Collator |
| * class description for an example of use. |
| * </p> |
| * <p> |
| * General recommendation: <br> |
| * If comparisons are to be done on the same String multiple times, it is more efficient to generate |
| * CollationKeys for the Strings and use CollationKey.compareTo(CollationKey) for the comparisons. If speed |
| * performance is critical and object instantiation is to be reduced, further optimization may be achieved by |
| * generating a simpler key of the form RawCollationKey and reusing this RawCollationKey object with the method |
| * RuleBasedCollator.getRawCollationKey. The internal byte representation can be directly accessed via RawCollationKey |
| * and stored for future use. Like CollationKey, RawCollationKey provides a method RawCollationKey.compareTo for key |
| * comparisons. If each String is compared only once, using the method RuleBasedCollator.compare(String, |
| * String) will have better performance. |
| * </p> |
| * |
| * @param source |
| * the source text String. |
| * @param target |
| * the target text String. |
| * @return Returns an integer value. Value is less than zero if source is less than target, value is zero if source |
| * and target are equal, value is greater than zero if source is greater than target. |
| * @see CollationKey |
| * @see #getCollationKey |
| * @stable ICU 2.8 |
| */ |
| public int compare(String source, String target) { |
| if (source == target) { |
| return 0; |
| } |
| CollationBuffer buffer = null; |
| try { |
| buffer = getCollationBuffer(); |
| return compare(source, target, buffer); |
| } finally { |
| releaseCollationBuffer(buffer); |
| } |
| } |
| |
| private int compare(String source, String target, CollationBuffer buffer) { |
| // Find the length of any leading portion that is equal |
| int offset = getFirstUnmatchedOffset(source, target); |
| // return compareRegular(source, target, offset); |
| if (latinOneUse_) { |
| if ((offset < source.length() && source.charAt(offset) > ENDOFLATINONERANGE_) |
| || (offset < target.length() && target.charAt(offset) > ENDOFLATINONERANGE_)) { |
| // source or target start with non-latin-1 |
| return compareRegular(source, target, offset, buffer); |
| } else { |
| return compareUseLatin1(source, target, offset, buffer); |
| } |
| } else { |
| return compareRegular(source, target, offset, buffer); |
| } |
| } |
| |
| // package private inner interfaces -------------------------------------- |
| |
| /** |
| * Attribute values to be used when setting the Collator options |
| */ |
| static interface AttributeValue { |
| /** |
| * Indicates that the default attribute value will be used. See individual attribute for details on its default |
| * value. |
| */ |
| static final int DEFAULT_ = -1; |
| /** |
| * Primary collation strength |
| */ |
| static final int PRIMARY_ = Collator.PRIMARY; |
| /** |
| * Secondary collation strength |
| */ |
| static final int SECONDARY_ = Collator.SECONDARY; |
| /** |
| * Tertiary collation strength |
| */ |
| static final int TERTIARY_ = Collator.TERTIARY; |
| /** |
| * Default collation strength |
| */ |
| static final int DEFAULT_STRENGTH_ = Collator.TERTIARY; |
| /** |
| * Internal use for strength checks in Collation elements |
| */ |
| static final int CE_STRENGTH_LIMIT_ = Collator.TERTIARY + 1; |
| /** |
| * Quaternary collation strength |
| */ |
| static final int QUATERNARY_ = 3; |
| /** |
| * Identical collation strength |
| */ |
| static final int IDENTICAL_ = Collator.IDENTICAL; |
| /** |
| * Internal use for strength checks |
| */ |
| static final int STRENGTH_LIMIT_ = Collator.IDENTICAL + 1; |
| /** |
| * Turn the feature off - works for FRENCH_COLLATION, CASE_LEVEL, HIRAGANA_QUATERNARY_MODE and |
| * DECOMPOSITION_MODE |
| */ |
| static final int OFF_ = 16; |
| /** |
| * Turn the feature on - works for FRENCH_COLLATION, CASE_LEVEL, HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE |
| */ |
| static final int ON_ = 17; |
| /** |
| * Valid for ALTERNATE_HANDLING. Alternate handling will be shifted |
| */ |
| static final int SHIFTED_ = 20; |
| /** |
| * Valid for ALTERNATE_HANDLING. Alternate handling will be non ignorable |
| */ |
| static final int NON_IGNORABLE_ = 21; |
| /** |
| * Valid for CASE_FIRST - lower case sorts before upper case |
| */ |
| static final int LOWER_FIRST_ = 24; |
| /** |
| * Upper case sorts before lower case |
| */ |
| static final int UPPER_FIRST_ = 25; |
| /** |
| * Number of attribute values |
| */ |
| static final int LIMIT_ = 29; |
| } |
| |
| /** |
| * Attributes that collation service understands. All the attributes can take DEFAULT value, as well as the values |
| * specific to each one. |
| */ |
interface Attribute {
    // Nested interfaces are implicitly static and their fields implicitly
    // public static final, so the modifiers are not repeated on each constant.

    /**
     * Direction of secondary weights - used in French. ON makes secondary weights be considered backwards;
     * OFF treats them in the order they appear.
     */
    int FRENCH_COLLATION_ = 0;
    /**
     * Handling of variable elements. NON_IGNORABLE (default) treats all code points with non-ignorable
     * primary weights in the same way; SHIFTED causes code points with primary weights equal to or below the
     * variable top value to be ignored on the primary level and moved to the quaternary level.
     */
    int ALTERNATE_HANDLING_ = 1;
    /**
     * Ordering of upper and lower case letters. OFF (default) orders them according to their tertiary
     * weights, UPPER_FIRST forces upper case letters to sort before lower case letters, and LOWER_FIRST does
     * the opposite.
     */
    int CASE_FIRST_ = 2;
    /**
     * Whether an extra case level (positioned before the third level) is generated. OFF (default) generates
     * no case level; ON generates it. The contents of the case level are affected by the CASE_FIRST
     * attribute. A simple way to ignore accent differences in a string is to set the strength to PRIMARY and
     * enable the case level.
     */
    int CASE_LEVEL_ = 3;
    /**
     * Whether the normalization check and any necessary normalization are performed. With OFF (default) no
     * check is performed, and the result is guaranteed correct only if the input is in so-called FCD form
     * (see the users manual for more info). With ON, an incremental check determines whether the input is in
     * FCD form; if it is not, incremental NFD normalization is performed.
     */
    int NORMALIZATION_MODE_ = 4;
    /**
     * The strength attribute: PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL. The usual strength for
     * most locales (except Japanese) is tertiary. Quaternary strength is useful when combined with the
     * shifted setting for the alternate handling attribute, and for JIS x 4061 collation, where it
     * distinguishes Katakana from Hiragana (achieved by setting HIRAGANA_QUATERNARY mode to on). Otherwise
     * the quaternary level is affected only by the number of non ignorable code points in the string.
     * Identical strength is rarely useful, as it amounts to the code points of the NFD form of the string.
     */
    int STRENGTH_ = 5;
    /**
     * When turned on, positions Hiragana before all non-ignorables on the quaternary level. This is a sneaky
     * way to produce JIS sort order.
     */
    int HIRAGANA_QUATERNARY_MODE_ = 6;
    /**
     * Attribute count
     */
    int LIMIT_ = 7;
}
| |
| /** |
| * DataManipulate singleton |
| */ |
| static class DataManipulate implements Trie.DataManipulate { |
| // public methods ---------------------------------------------------- |
| |
| /** |
| * Internal method called to parse a lead surrogate's ce for the offset to the next trail surrogate data. |
| * |
| * @param ce |
| * collation element of the lead surrogate |
| * @return data offset or 0 for the next trail surrogate |
| * @stable ICU 2.8 |
| */ |
| public final int getFoldingOffset(int ce) { |
| if (isSpecial(ce) && getTag(ce) == CE_SURROGATE_TAG_) { |
| return (ce & 0xFFFFFF); |
| } |
| return 0; |
| } |
| |
| /** |
| * Get singleton object |
| */ |
| public static final DataManipulate getInstance() { |
| if (m_instance_ == null) { |
| m_instance_ = new DataManipulate(); |
| } |
| return m_instance_; |
| } |
| |
| // private data member ---------------------------------------------- |
| |
| /** |
| * Singleton instance |
| */ |
| private static DataManipulate m_instance_; |
| |
| // private constructor ---------------------------------------------- |
| |
| /** |
| * private to prevent initialization |
| */ |
| private DataManipulate() { |
| } |
| } |
| |
| /** |
| * UCAConstants |
| */ |
| static final class UCAConstants { |
| int FIRST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000 |
| int LAST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000 |
| int FIRST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x00008705 |
| int FIRST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000000 |
| int LAST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000500 |
| int LAST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x0000DD05 |
| int FIRST_VARIABLE_[] = new int[2]; // 0x05070505 |
| int LAST_VARIABLE_[] = new int[2]; // 0x13CF0505 |
| int FIRST_NON_VARIABLE_[] = new int[2]; // 0x16200505 |
| int LAST_NON_VARIABLE_[] = new int[2]; // 0x767C0505 |
| int RESET_TOP_VALUE_[] = new int[2]; // 0x9F000303 |
| int FIRST_IMPLICIT_[] = new int[2]; |
| int LAST_IMPLICIT_[] = new int[2]; |
| int FIRST_TRAILING_[] = new int[2]; |
| int LAST_TRAILING_[] = new int[2]; |
| int PRIMARY_TOP_MIN_; |
| int PRIMARY_IMPLICIT_MIN_; // 0xE8000000 |
| int PRIMARY_IMPLICIT_MAX_; // 0xF0000000 |
| int PRIMARY_TRAILING_MIN_; // 0xE8000000 |
| int PRIMARY_TRAILING_MAX_; // 0xF0000000 |
| int PRIMARY_SPECIAL_MIN_; // 0xE8000000 |
| int PRIMARY_SPECIAL_MAX_; // 0xF0000000 |
| } |
| |
| /** |
| * Script to Lead Byte and Lead Byte to Script Data |
| * |
| */ |
| static final class LeadByteConstants { |
| private static final int DATA_MASK_FOR_INDEX = 0x8000; |
| private static final int[] EMPTY_INT_ARRAY = new int[0]; |
| |
| private int serializedSize = 0; |
| |
| private Map<Integer, Integer> SCRIPT_TO_LEAD_BYTES_INDEX; |
| private byte[] SCRIPT_TO_LEAD_BYTES_DATA; |
| |
| private int[] LEAD_BYTE_TO_SCRIPTS_INDEX; |
| private byte[] LEAD_BYTE_TO_SCRIPTS_DATA; |
| |
| LeadByteConstants() { |
| } |
| |
| void read(DataInputStream dis) throws IOException { |
| int readcount = 0; |
| int indexCount; |
| int dataSize; |
| |
| // script to lead bytes |
| indexCount = dis.readShort(); |
| readcount += 2; |
| dataSize = dis.readShort(); |
| readcount += 2; |
| this.SCRIPT_TO_LEAD_BYTES_INDEX = new HashMap<Integer, Integer>(); |
| //System.out.println("Script to Lead Bytes Index - Count = " + indexCount); |
| for (int index = 0; index < indexCount; index++) { |
| int reorderCode = dis.readShort(); // reorder code |
| readcount += 2; |
| int dataOffset = 0xffff & dis.readShort(); // data offset |
| readcount += 2; |
| // System.out.println("\t-------------"); |
| // System.out.println("\toffset = " + Integer.toHexString(readcount - 4)); |
| // System.out.println("\treorderCode = " + Integer.toHexString(reorderCode)); |
| // System.out.println("\tdataOffset = " + Integer.toHexString(dataOffset)); |
| this.SCRIPT_TO_LEAD_BYTES_INDEX.put(reorderCode, dataOffset); |
| } |
| |
| this.SCRIPT_TO_LEAD_BYTES_DATA = new byte[dataSize * 2]; |
| dis.readFully(this.SCRIPT_TO_LEAD_BYTES_DATA, 0, this.SCRIPT_TO_LEAD_BYTES_DATA.length); |
| readcount += this.SCRIPT_TO_LEAD_BYTES_DATA.length; |
| |
| // lead byte to scripts |
| indexCount = dis.readShort(); |
| readcount += 2; |
| dataSize = dis.readShort(); |
| readcount += 2; |
| this.LEAD_BYTE_TO_SCRIPTS_INDEX = new int[indexCount]; |
| //System.out.println("Lead Byte to Scripts Index - Count = " + indexCount); |
| for (int index = 0; index < indexCount; index++) { |
| this.LEAD_BYTE_TO_SCRIPTS_INDEX[index] = 0xffff & dis.readShort(); |
| readcount += 2; |
| // System.out.println("\t-------------"); |
| // System.out.println("\toffset = " + Integer.toHexString(readcount - 2)); |
| // System.out.println("\tindex = " + Integer.toHexString(index)); |
| // System.out.println("\tdataOffset = " + Integer.toHexString(this.LEAD_BYTE_TO_SCRIPTS_INDEX[index])); |
| } |
| |
| this.LEAD_BYTE_TO_SCRIPTS_DATA = new byte[dataSize * 2]; |
| dis.readFully(this.LEAD_BYTE_TO_SCRIPTS_DATA, 0, this.LEAD_BYTE_TO_SCRIPTS_DATA.length); |
| readcount += this.LEAD_BYTE_TO_SCRIPTS_DATA.length; |
| |
| this.serializedSize = readcount; |
| } |
| |
| int getSerializedDataSize() { |
| return this.serializedSize; |
| } |
| |
| int[] getReorderCodesForLeadByte(int leadByte) { |
| if (leadByte >= this.LEAD_BYTE_TO_SCRIPTS_INDEX.length) { |
| return EMPTY_INT_ARRAY; |
| } |
| int offset = this.LEAD_BYTE_TO_SCRIPTS_INDEX[leadByte]; |
| if (offset == 0) { |
| return EMPTY_INT_ARRAY; |
| } |
| int[] reorderCodes; |
| if ((offset & DATA_MASK_FOR_INDEX) == DATA_MASK_FOR_INDEX) { |
| reorderCodes = new int[1]; |
| reorderCodes[0] = offset & ~DATA_MASK_FOR_INDEX; |
| } else { |
| int length = readShort(this.LEAD_BYTE_TO_SCRIPTS_DATA, offset); |
| offset++; |
| |
| reorderCodes = new int[length]; |
| for (int code = 0; code < length; code++, offset++) { |
| reorderCodes[code] = readShort(this.LEAD_BYTE_TO_SCRIPTS_DATA, offset); |
| } |
| } |
| return reorderCodes; |
| } |
| |
| int[] getLeadBytesForReorderCode(int reorderCode) { |
| if (!this.SCRIPT_TO_LEAD_BYTES_INDEX.containsKey(reorderCode)) { |
| return EMPTY_INT_ARRAY; |
| } |
| int offset = this.SCRIPT_TO_LEAD_BYTES_INDEX.get(reorderCode); |
| |
| if (offset == 0) { |
| return EMPTY_INT_ARRAY; |
| } |
| |
| int[] leadBytes; |
| if ((offset & DATA_MASK_FOR_INDEX) == DATA_MASK_FOR_INDEX) { |
| leadBytes = new int[1]; |
| leadBytes[0] = offset & ~DATA_MASK_FOR_INDEX; |
| } else { |
| int length = readShort(this.SCRIPT_TO_LEAD_BYTES_DATA, offset); |
| offset++; |
| |
| leadBytes = new int[length]; |
| for (int leadByte = 0; leadByte < length; leadByte++, offset++) { |
| leadBytes[leadByte] = readShort(this.SCRIPT_TO_LEAD_BYTES_DATA, offset); |
| } |
| } |
| return leadBytes; |
| } |
| |
| private static int readShort(byte[] data, int offset) { |
| return (0xff & data[offset * 2]) << 8 | (data[offset * 2 + 1] & 0xff); |
| } |
| } |
| |
| // package private data member ------------------------------------------- |
| |
/**
 * Byte value 0x04. NOTE(review): presumably the first byte value available to tailored weights -
 * confirm against the rule builder.
 */
static final byte BYTE_FIRST_TAILORED_ = (byte) 0x04;
/**
 * Byte value 0x05; {@link #COMMON_BOTTOM_2_} is defined in terms of it.
 */
static final byte BYTE_COMMON_ = (byte) 0x05;
/**
 * Upper bound used together with {@link #COMMON_BOTTOM_2_}; declared as an int so that 0x86 is
 * handled as an unsigned value.
 */
static final int COMMON_TOP_2_ = 0x86; // int for unsignedness
static final int COMMON_BOTTOM_2_ = BYTE_COMMON_;
static final int COMMON_BOTTOM_3 = 0x05;
/**
 * Case strength mask
 */
static final int CE_CASE_BIT_MASK_ = 0xC0;
/**
 * Number of bits the tag field has to be shifted right after masking with {@link #CE_TAG_MASK_}.
 */
static final int CE_TAG_SHIFT_ = 24;
/**
 * Mask selecting the tag field of a special collation element.
 */
static final int CE_TAG_MASK_ = 0x0F000000;

/**
 * Bits that are all set in a special collation element (see isSpecial).
 */
static final int CE_SPECIAL_FLAG_ = 0xF0000000;
/**
 * Lead surrogate that is tailored and doesn't start a contraction
 */
static final int CE_SURROGATE_TAG_ = 5;
/**
 * Mask to get the primary strength of the collation element
 */
static final int CE_PRIMARY_MASK_ = 0xFFFF0000;
/**
 * Mask to get the secondary strength of the collation element
 */
static final int CE_SECONDARY_MASK_ = 0xFF00;
/**
 * Mask to get the tertiary strength of the collation element
 */
static final int CE_TERTIARY_MASK_ = 0xFF;
/**
 * Primary strength shift
 */
static final int CE_PRIMARY_SHIFT_ = 16;
/**
 * Secondary strength shift
 */
static final int CE_SECONDARY_SHIFT_ = 8;
/**
 * Continuation marker. Has the same value (0xC0) as the private CE_CONTINUATION_TAG_ constant
 * declared further down in this class.
 */
static final int CE_CONTINUATION_MARKER_ = 0xC0;
| |
/**
 * Size, in number of ints, of the collator raw data headers and options that precede the expansion
 * data. It is used when expansion CEs are retrieved: ICU4C counts the expansion offset from the start
 * of UCollator.UColHeader, so ICU4J has to subtract this value to get the right expansion CE offset.
 */
int m_expansionOffset_;
/**
 * Size, in number of chars, of the collator raw data headers, options and expansions that precede the
 * contraction data. It is used when contraction CEs are retrieved: ICU4C counts the contraction offset
 * from the start of UCollator.UColHeader, so ICU4J has to subtract this value to get the right
 * contraction CE offset.
 */
int m_contractionOffset_;
/**
 * Flag indicating whether Jamo is special
 */
boolean m_isJamoSpecial_;
| |
| // Collator options ------------------------------------------------------ |
| |
// Default values of the collator options. setWithUCAData() copies each of them from the UCA
// collator together with the corresponding current value.
int m_defaultVariableTopValue_;
boolean m_defaultIsFrenchCollation_;
boolean m_defaultIsAlternateHandlingShifted_;
int m_defaultCaseFirst_;
boolean m_defaultIsCaseLevel_;
int m_defaultDecomposition_;
int m_defaultStrength_;
boolean m_defaultIsHiragana4_;
boolean m_defaultIsNumericCollation_;
/**
 * Default script order - the one created at initial rule parse time
 */
int[] m_defaultReorderCodes_;

/**
 * Value of the variable top
 */
int m_variableTopValue_;
/**
 * Attribute for special Hiragana
 */
boolean m_isHiragana4_;
/**
 * Case sorting customization
 */
int m_caseFirst_;
/**
 * Numeric collation option
 */
boolean m_isNumericCollation_;
/**
 * Script order
 */
int[] m_reorderCodes_;
| |
| // end Collator options -------------------------------------------------- |
| |
| /** |
| * Expansion table |
| */ |
| int m_expansion_[]; |
| /** |
| * Contraction index table |
| */ |
| char m_contractionIndex_[]; |
| /** |
| * Contraction CE table |
| */ |
| int m_contractionCE_[]; |
| /** |
| * Data trie |
| */ |
| IntTrie m_trie_; |
| /** |
| * Table to store all collation elements that are the last element of an expansion. This is for use in StringSearch. |
| */ |
| int m_expansionEndCE_[]; |
| /** |
| * Table to store the maximum size of any expansions that end with the corresponding collation element in |
| * m_expansionEndCE_. For use in StringSearch too |
| */ |
| byte m_expansionEndCEMaxSize_[]; |
| /** |
| * Heuristic table to store information on whether a char character is considered "unsafe". "Unsafe" character are |
| * combining marks or those belonging to some contraction sequence from the offset 1 onwards. E.g. if "ABC" is the |
| * only contraction, then 'B' and 'C' are considered unsafe. If we have another contraction "ZA" with the one above, |
| * then 'A', 'B', 'C' are "unsafe" but 'Z' is not. |
| */ |
| byte m_unsafe_[]; |
| /** |
| * Table to store information on whether a codepoint can occur as the last character in a contraction |
| */ |
| byte m_contractionEnd_[]; |
| /** |
| * Original collation rules |
| */ |
| String m_rules_; |
| /** |
| * The smallest "unsafe" codepoint |
| */ |
| char m_minUnsafe_; |
| /** |
| * The smallest codepoint that could be the end of a contraction |
| */ |
| char m_minContractionEnd_; |
| /** |
| * General version of the collator |
| */ |
| VersionInfo m_version_; |
| /** |
| * UCA version |
| */ |
| VersionInfo m_UCA_version_; |
| /** |
| * UCD version |
| */ |
| VersionInfo m_UCD_version_; |
| /** |
| * Lead byte and script data |
| */ |
| int m_leadByteToScripts; |
| int m_scriptToLeadBytes; |
/**
 * The UCA collator, assigned exactly once by the static initializer of this class.
 * {@code checkUCA()} treats a null value as "UCA data unavailable".
 */
static final RuleBasedCollator UCA_;
/**
 * UCA Constants
 */
static final UCAConstants UCA_CONSTANTS_;
/**
 * Lead Byte Constants. NOTE(review): the only one of these statics that is not final although the
 * static initializer assigns it like the others - check whether anything else writes it.
 */
static LeadByteConstants LEADBYTE_CONSTANTS_;
/**
 * Table for UCA and builder use
 */
static final char UCA_CONTRACTIONS_[];

/**
 * Set to true by the last statement of the static initializer. While it is still false,
 * {@code checkUCA()} never throws, which allows the initializer itself to construct the UCA instance.
 */
private static boolean UCA_INIT_COMPLETE;

/**
 * Implicit generator
 */
static final ImplicitCEGenerator impCEGen_;

static final byte SORT_LEVEL_TERMINATOR_ = 1;

// These are values from the UCA that are required for
// implicit generation and for suppressing sort key compression.
// They should normally be read from the UCA, but if one
// is running without the UCA, that could be a problem.
static final int maxRegularPrimary = 0x7A;
static final int minImplicitPrimary = 0xE0;
static final int maxImplicitPrimary = 0xE4;
| |
| // block to initialise character property database |
| static { |
| // take pains to let static class init succeed, otherwise the class itself won't exist and |
| // clients will get a NoClassDefFoundException. Instead, make the constructors fail if |
| // we can't load the UCA data. |
| |
| RuleBasedCollator iUCA_ = null; |
| UCAConstants iUCA_CONSTANTS_ = null; |
| LeadByteConstants iLEADBYTE_CONSTANTS = null; |
| char iUCA_CONTRACTIONS_[] = null; |
| ImplicitCEGenerator iimpCEGen_ = null; |
| try { |
| // !!! note what's going on here... |
| // even though the static init of the class is not yet complete, we |
| // instantiate an instance of the class. So we'd better be sure that |
| // instantiation doesn't rely on the static initialization that's |
| // not complete yet! |
| iUCA_ = new RuleBasedCollator(); |
| iUCA_CONSTANTS_ = new UCAConstants(); |
| iLEADBYTE_CONSTANTS = new LeadByteConstants(); |
| iUCA_CONTRACTIONS_ = CollatorReader.read(iUCA_, iUCA_CONSTANTS_, iLEADBYTE_CONSTANTS); |
| |
| // called before doing canonical closure for the UCA. |
| iimpCEGen_ = new ImplicitCEGenerator(minImplicitPrimary, maxImplicitPrimary); |
| // iimpCEGen_ = new ImplicitCEGenerator(iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_, |
| // iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_); |
| iUCA_.init(); |
| ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle.getBundleInstance( |
| ICUResourceBundle.ICU_COLLATION_BASE_NAME, ULocale.ENGLISH); |
| iUCA_.m_rules_ = (String) rb.getObject("UCARules"); |
| } catch (MissingResourceException ex) { |
| // throw ex; |
| } catch (IOException e) { |
| // e.printStackTrace(); |
| // throw new MissingResourceException(e.getMessage(),"",""); |
| } |
| |
| UCA_ = iUCA_; |
| UCA_CONSTANTS_ = iUCA_CONSTANTS_; |
| LEADBYTE_CONSTANTS_ = iLEADBYTE_CONSTANTS; |
| UCA_CONTRACTIONS_ = iUCA_CONTRACTIONS_; |
| impCEGen_ = iimpCEGen_; |
| |
| UCA_INIT_COMPLETE = true; |
| } |
| |
| private static void checkUCA() throws MissingResourceException { |
| if (UCA_INIT_COMPLETE && UCA_ == null) { |
| throw new MissingResourceException("Collator UCA data unavailable", "", ""); |
| } |
| } |
| |
| // package private constructors ------------------------------------------ |
| |
/**
 * <p>
 * Package-private constructor for use by subclasses. Public access to creating Collators is handled by
 * the API Collator.getInstance() or RuleBasedCollator(String rules).
 * </p>
 * <p>
 * This constructor is also the one the static initializer uses to create the UCA collator internally.
 * It only checks that UCA data is available; it does not load any tables itself.
 * </p>
 */
RuleBasedCollator() {
    checkUCA();
}
| |
/**
 * Constructs a RuleBasedCollator from the argument locale. If no resource bundle is associated with the
 * locale, or if anything goes wrong while loading the locale data, the UCA data is used instead.
 *
 * @param locale
 *            locale to load the collation data for; a "collation" keyword on the locale selects the
 *            collation type, otherwise the bundle's "collations/default" entry is used
 */
RuleBasedCollator(ULocale locale) {
    checkUCA();
    ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle.getBundleInstance(
            ICUResourceBundle.ICU_COLLATION_BASE_NAME, locale);
    if (rb != null) {
        try {
            // Use keywords, if supplied for lookup
            String collkey = locale.getKeywordValue("collation");
            if (collkey == null) {
                collkey = rb.getStringWithFallback("collations/default");
            }

            // collations/default will always give a string back, so collkey now names
            // the resource that holds the real collation data
            ICUResourceBundle elements = rb.getWithFallback("collations/" + collkey);
            if (elements != null) {
                // TODO: Determine actual & valid locale correctly
                ULocale uloc = rb.getULocale();
                setLocale(uloc, uloc);

                m_rules_ = elements.getString("Sequence");
                ByteBuffer buf = elements.get("%%CollationBin").getBinary();
                // %%CollationBin
                if (buf != null) {
                    CollatorReader.initRBC(this, buf);
                    // at this point, we have read in the collator
                    // now we need to check whether the binary image has
                    // the right UCA and other versions
                    if (!m_UCA_version_.equals(UCA_.m_UCA_version_) || !m_UCD_version_.equals(UCA_.m_UCD_version_)) {
                        // version mismatch: rebuild from the rule string instead of
                        // using the binary image
                        init(m_rules_);
                        return;
                    }
                    try {
                        UResourceBundle reorderRes = elements.get("%%ReorderCodes");
                        if (reorderRes != null) {
                            int[] reorderCodes = reorderRes.getIntVector();
                            setReorderCodes(reorderCodes);
                            m_defaultReorderCodes_ = reorderCodes.clone();
                        }
                    } catch (MissingResourceException e) {
                        // ignore: reorder codes are optional
                    }
                    init();
                    return;
                } else {
                    // no binary image: build from the rule string
                    init(m_rules_);
                    return;
                }
            }
        } catch (Exception e) {
            // NOTE(review): this prints to stderr from library code and catches every exception
            // type; kept as is because the fallback below relies on it.
            e.printStackTrace();
            // if failed use UCA.
        }
    }
    setWithUCAData();
}
| |
| // package private methods ----------------------------------------------- |
| |
| /** |
| * Sets this collator to use the tables in UCA. Note options not taken care of here. |
| */ |
| final void setWithUCATables() { |
| m_contractionOffset_ = UCA_.m_contractionOffset_; |
| m_expansionOffset_ = UCA_.m_expansionOffset_; |
| m_expansion_ = UCA_.m_expansion_; |
| m_contractionIndex_ = UCA_.m_contractionIndex_; |
| m_contractionCE_ = UCA_.m_contractionCE_; |
| m_trie_ = UCA_.m_trie_; |
| m_expansionEndCE_ = UCA_.m_expansionEndCE_; |
| m_expansionEndCEMaxSize_ = UCA_.m_expansionEndCEMaxSize_; |
| m_unsafe_ = UCA_.m_unsafe_; |
| m_contractionEnd_ = UCA_.m_contractionEnd_; |
| m_minUnsafe_ = UCA_.m_minUnsafe_; |
| m_minContractionEnd_ = UCA_.m_minContractionEnd_; |
| } |
| |
/**
 * Sets this collator to use all the options and tables of the UCA collator.
 * <p>
 * latinOneFailed_ is held at true while the fields are being copied and reset to false at the end.
 * NOTE(review): presumably this keeps setDecomposition()/setStrength() from using the Latin-1 fast path
 * while the state is only partly copied - confirm against those setters.
 * </p>
 */
final void setWithUCAData() {
    latinOneFailed_ = true;

    m_addition3_ = UCA_.m_addition3_;
    m_bottom3_ = UCA_.m_bottom3_;
    m_bottomCount3_ = UCA_.m_bottomCount3_;
    m_caseFirst_ = UCA_.m_caseFirst_;
    m_caseSwitch_ = UCA_.m_caseSwitch_;
    m_common3_ = UCA_.m_common3_;
    m_contractionOffset_ = UCA_.m_contractionOffset_;
    setDecomposition(UCA_.getDecomposition());
    m_defaultCaseFirst_ = UCA_.m_defaultCaseFirst_;
    m_defaultDecomposition_ = UCA_.m_defaultDecomposition_;
    m_defaultIsAlternateHandlingShifted_ = UCA_.m_defaultIsAlternateHandlingShifted_;
    m_defaultIsCaseLevel_ = UCA_.m_defaultIsCaseLevel_;
    m_defaultIsFrenchCollation_ = UCA_.m_defaultIsFrenchCollation_;
    m_defaultIsHiragana4_ = UCA_.m_defaultIsHiragana4_;
    m_defaultStrength_ = UCA_.m_defaultStrength_;
    m_defaultVariableTopValue_ = UCA_.m_defaultVariableTopValue_;
    m_defaultIsNumericCollation_ = UCA_.m_defaultIsNumericCollation_;
    m_expansionOffset_ = UCA_.m_expansionOffset_;
    m_isAlternateHandlingShifted_ = UCA_.m_isAlternateHandlingShifted_;
    m_isCaseLevel_ = UCA_.m_isCaseLevel_;
    m_isFrenchCollation_ = UCA_.m_isFrenchCollation_;
    m_isHiragana4_ = UCA_.m_isHiragana4_;
    m_isJamoSpecial_ = UCA_.m_isJamoSpecial_;
    m_isSimple3_ = UCA_.m_isSimple3_;
    m_mask3_ = UCA_.m_mask3_;
    m_minContractionEnd_ = UCA_.m_minContractionEnd_;
    m_minUnsafe_ = UCA_.m_minUnsafe_;
    m_rules_ = UCA_.m_rules_;
    setStrength(UCA_.getStrength());
    m_top3_ = UCA_.m_top3_;
    m_topCount3_ = UCA_.m_topCount3_;
    m_variableTopValue_ = UCA_.m_variableTopValue_;
    m_isNumericCollation_ = UCA_.m_isNumericCollation_;
    // Copies the tables; this assigns the two offsets and the two heuristic minimums a second time.
    setWithUCATables();
    latinOneFailed_ = false;
}
| |
| /** |
| * Test whether a char character is potentially "unsafe" for use as a collation starting point. "Unsafe" characters |
| * are combining marks or those belonging to some contraction sequence from the offset 1 onwards. E.g. if "ABC" is |
| * the only contraction, then 'B' and 'C' are considered unsafe. If we have another contraction "ZA" with the one |
| * above, then 'A', 'B', 'C' are "unsafe" but 'Z' is not. |
| * |
| * @param ch |
| * character to determin |
| * @return true if ch is unsafe, false otherwise |
| */ |
| final boolean isUnsafe(char ch) { |
| if (ch < m_minUnsafe_) { |
| return false; |
| } |
| |
| if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) { |
| if (UTF16.isLeadSurrogate(ch) || UTF16.isTrailSurrogate(ch)) { |
| // Trail surrogate are always considered unsafe. |
| return true; |
| } |
| ch &= HEURISTIC_OVERFLOW_MASK_; |
| ch += HEURISTIC_OVERFLOW_OFFSET_; |
| } |
| int value = m_unsafe_[ch >> HEURISTIC_SHIFT_]; |
| return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0; |
| } |
| |
| /** |
| * Approximate determination if a char character is at a contraction end. Guaranteed to be true if a character is at |
| * the end of a contraction, otherwise it is not deterministic. |
| * |
| * @param ch |
| * character to be determined |
| */ |
| final boolean isContractionEnd(char ch) { |
| if (UTF16.isTrailSurrogate(ch)) { |
| return true; |
| } |
| |
| if (ch < m_minContractionEnd_) { |
| return false; |
| } |
| |
| if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) { |
| ch &= HEURISTIC_OVERFLOW_MASK_; |
| ch += HEURISTIC_OVERFLOW_OFFSET_; |
| } |
| int value = m_contractionEnd_[ch >> HEURISTIC_SHIFT_]; |
| return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0; |
| } |
| |
| /** |
| * Retrieve the tag of a special ce |
| * |
| * @param ce |
| * ce to test |
| * @return tag of ce |
| */ |
| static int getTag(int ce) { |
| return (ce & CE_TAG_MASK_) >> CE_TAG_SHIFT_; |
| } |
| |
| /** |
| * Checking if ce is special |
| * |
| * @param ce |
| * to check |
| * @return true if ce is special |
| */ |
| static boolean isSpecial(int ce) { |
| return (ce & CE_SPECIAL_FLAG_) == CE_SPECIAL_FLAG_; |
| } |
| |
| /** |
| * Checks if the argument ce is a continuation |
| * |
| * @param ce |
| * collation element to test |
| * @return true if ce is a continuation |
| */ |
| static final boolean isContinuation(int ce) { |
| return ce != CollationElementIterator.NULLORDER && (ce & CE_CONTINUATION_TAG_) == CE_CONTINUATION_TAG_; |
| } |
| |
| // private inner classes ------------------------------------------------ |
| |
| // private variables ----------------------------------------------------- |
| |
/**
 * The smallest natural unsafe or contraction end char character before tailoring. This is a combining mark.
 */
private static final int DEFAULT_MIN_HEURISTIC_ = 0x300;
/**
 * Heuristic table size in bytes, one bit per table position. isUnsafe() and isContractionEnd() look up
 * chars below (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_) directly; larger chars are first folded with
 * HEURISTIC_OVERFLOW_MASK_ and moved up by HEURISTIC_OVERFLOW_OFFSET_.
 */
private static final char HEURISTIC_SIZE_ = 1056;
/**
 * Mask value down to "some power of two" - 1, number of bits, not num of bytes.
 */
private static final char HEURISTIC_OVERFLOW_MASK_ = 0x1fff;
/**
 * Shift that converts a bit position in the heuristic table into a byte index.
 */
private static final int HEURISTIC_SHIFT_ = 3;
/**
 * Offset added to a character that is too large for a direct lookup, after it has been folded with
 * HEURISTIC_OVERFLOW_MASK_.
 */
private static final char HEURISTIC_OVERFLOW_OFFSET_ = 256;
/**
 * Mask value to get the bit offset within a byte of the heuristic table.
 */
private static final char HEURISTIC_MASK_ = 7;
| |
// Tertiary-level and case handling state; all of these are copied from the UCA collator by
// setWithUCAData().
private int m_caseSwitch_;
private int m_common3_;
private int m_mask3_;
/**
 * When switching case, we need to add or subtract different values.
 */
private int m_addition3_;
/**
 * Upper range when compressing
 */
private int m_top3_;
/**
 * NOTE(review): the original comment here repeated "Upper range when compressing" from m_top3_;
 * judging by the name this is presumably the lower range - confirm.
 */
private int m_bottom3_;
private int m_topCount3_;
private int m_bottomCount3_;
/**
 * Script reordering table. doPrimaryBytes() uses it, when it is not null, to map the lead byte of a
 * non-continuation primary weight.
 */
private byte[] m_leadBytePermutationTable_;
/**
 * Case first constants
 */
private static final int CASE_SWITCH_ = 0xC0;
private static final int NO_CASE_SWITCH_ = 0;
/**
 * Case level constants
 */
private static final int CE_REMOVE_CASE_ = 0x3F;
private static final int CE_KEEP_CASE_ = 0xFF;
/**
 * Case strength mask
 */
private static final int CE_CASE_MASK_3_ = 0xFF;
/**
 * Sortkey size factor. Values can be changed.
 */
private static final double PROPORTION_2_ = 0.5;
private static final double PROPORTION_3_ = 0.667;
| |
| // These values come from the UCA ---------------------------------------- |
| |
// Magic special byte values from the fractional UCA. The values that are commented out are
// not used by this class.
// private static final byte BYTE_ZERO_ = 0x0;
// private static final byte BYTE_LEVEL_SEPARATOR_ = (byte)0x01;
// private static final byte BYTE_SORTKEY_GLUE_ = (byte)0x02;
private static final byte BYTE_SHIFT_PREFIX_ = (byte) 0x03;
/* private */static final byte BYTE_UNSHIFTED_MIN_ = BYTE_SHIFT_PREFIX_;
// private static final byte BYTE_FIRST_UCA_ = BYTE_COMMON_;
// TODO: Make the following values dynamic since they change with almost every UCA version.
static final byte CODAN_PLACEHOLDER = 0x12;
// Lower bound of the lead byte range that isCompressible() accepts.
private static final byte BYTE_FIRST_NON_LATIN_PRIMARY_ = (byte) 0x5B;

private static final byte BYTE_UNSHIFTED_MAX_ = (byte) 0xFF;
private static final int TOTAL_2_ = COMMON_TOP_2_ - COMMON_BOTTOM_2_ - 1;
private static final int FLAG_BIT_MASK_CASE_SWITCH_OFF_ = 0x80;
private static final int FLAG_BIT_MASK_CASE_SWITCH_ON_ = 0x40;
private static final int COMMON_TOP_CASE_SWITCH_OFF_3_ = 0x85;
private static final int COMMON_TOP_CASE_SWITCH_LOWER_3_ = 0x45;
private static final int COMMON_TOP_CASE_SWITCH_UPPER_3_ = 0xC5;
private static final int COMMON_BOTTOM_3_ = 0x05;
private static final int COMMON_BOTTOM_CASE_SWITCH_UPPER_3_ = 0x86;
private static final int COMMON_BOTTOM_CASE_SWITCH_LOWER_3_ = COMMON_BOTTOM_3_;
// TOTAL_2_ is split into a top and a bottom part according to PROPORTION_2_.
private static final int TOP_COUNT_2_ = (int) (PROPORTION_2_ * TOTAL_2_);
private static final int BOTTOM_COUNT_2_ = TOTAL_2_ - TOP_COUNT_2_;
private static final int COMMON_2_ = COMMON_BOTTOM_2_;
private static final int COMMON_UPPER_FIRST_3_ = 0xC5;
private static final int COMMON_NORMAL_3_ = COMMON_BOTTOM_3_;
// private static final int COMMON_4_ = (byte)0xFF;

/*
 * Minimum size required for the binary collation data in bytes. Size of UCA header + size of options to 4 bytes
 */
// private static final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2;
| |
/**
 * If this collator is to generate only simple tertiaries for fast path
 */
private boolean m_isSimple3_;

/**
 * French collation sorting flag
 */
private boolean m_isFrenchCollation_;
/**
 * Flag indicating if shifted is requested for Quaternary alternate handling. If this is not true, the default for
 * alternate handling will be non-ignorable.
 */
private boolean m_isAlternateHandlingShifted_;
/**
 * Extra case level for sorting
 */
private boolean m_isCaseLevel_;
/**
 * Frozen state of the collator.
 */
private Lock frozenLock;


// Initial sizes of the per-level byte buffers that CollationBuffer allocates.
private static final int SORT_BUFFER_INIT_SIZE_ = 128;
private static final int SORT_BUFFER_INIT_SIZE_1_ = SORT_BUFFER_INIT_SIZE_ << 3;
private static final int SORT_BUFFER_INIT_SIZE_2_ = SORT_BUFFER_INIT_SIZE_;
private static final int SORT_BUFFER_INIT_SIZE_3_ = SORT_BUFFER_INIT_SIZE_;
private static final int SORT_BUFFER_INIT_SIZE_CASE_ = SORT_BUFFER_INIT_SIZE_ >> 2;
private static final int SORT_BUFFER_INIT_SIZE_4_ = SORT_BUFFER_INIT_SIZE_;

// Bits that isContinuation() tests for; same value as the package-private CE_CONTINUATION_MARKER_.
private static final int CE_CONTINUATION_TAG_ = 0xC0;
private static final int CE_REMOVE_CONTINUATION_MASK_ = 0xFFFFFF3F;

private static final int LAST_BYTE_MASK_ = 0xFF;

// private static final int CE_RESET_TOP_VALUE_ = 0x9F000303;
// private static final int CE_NEXT_TOP_VALUE_ = 0xE8960303;

private static final byte SORT_CASE_BYTE_START_ = (byte) 0x80;
private static final byte SORT_CASE_SHIFT_START_ = (byte) 7;

/**
 * CE buffer size
 */
private static final int CE_BUFFER_SIZE_ = 512;

// variables for Latin-1 processing
boolean latinOneUse_ = false;
boolean latinOneRegenTable_ = false;
// setWithUCAData() holds this at true while it copies the UCA state and clears it afterwards.
boolean latinOneFailed_ = false;

int latinOneTableLen_ = 0;
int latinOneCEs_[] = null;
| |
| private final class CollationBuffer { |
| /** |
| * Bunch of utility iterators |
| */ |
| protected StringUCharacterIterator m_srcUtilIter_; |
| protected CollationElementIterator m_srcUtilColEIter_; |
| protected StringUCharacterIterator m_tgtUtilIter_; |
| protected CollationElementIterator m_tgtUtilColEIter_; |
| |
| /** |
| * Utility comparison flags |
| */ |
| protected boolean m_utilCompare0_; |
| // private boolean m_utilCompare1_; |
| protected boolean m_utilCompare2_; |
| protected boolean m_utilCompare3_; |
| protected boolean m_utilCompare4_; |
| protected boolean m_utilCompare5_; |
| |
| /** |
| * Utility byte buffer |
| */ |
| protected byte m_utilBytes0_[]; |
| protected byte m_utilBytes1_[]; |
| protected byte m_utilBytes2_[]; |
| protected byte m_utilBytes3_[]; |
| protected byte m_utilBytes4_[]; |
| // private byte m_utilBytes5_[]; |
| |
| protected RawCollationKey m_utilRawCollationKey_; |
| |
| protected int m_utilBytesCount0_; |
| protected int m_utilBytesCount1_; |
| protected int m_utilBytesCount2_; |
| protected int m_utilBytesCount3_; |
| protected int m_utilBytesCount4_; |
| // private int m_utilBytesCount5_; |
| |
| // private int m_utilCount0_; |
| // private int m_utilCount1_; |
| protected int m_utilCount2_; |
| protected int m_utilCount3_; |
| protected int m_utilCount4_; |
| // private int m_utilCount5_; |
| |
| protected int m_utilFrenchStart_; |
| protected int m_utilFrenchEnd_; |
| |
| /** |
| * Preparing the CE buffers. will be filled during the primary phase |
| */ |
| protected int m_srcUtilCEBuffer_[]; |
| protected int m_tgtUtilCEBuffer_[]; |
| protected int m_srcUtilCEBufferSize_; |
| protected int m_tgtUtilCEBufferSize_; |
| |
| protected int m_srcUtilContOffset_; |
| protected int m_tgtUtilContOffset_; |
| |
| protected int m_srcUtilOffset_; |
| protected int m_tgtUtilOffset_; |
| |
| private CollationBuffer() { |
| initBuffers(); |
| } |
| |
| /** |
| * Initializes utility iterators and byte buffer used by compare |
| */ |
| protected final void initBuffers() { |
| resetBuffers(); |
| m_srcUtilIter_ = new StringUCharacterIterator(); |
| m_srcUtilColEIter_ = new CollationElementIterator(m_srcUtilIter_, RuleBasedCollator.this); |
| m_tgtUtilIter_ = new StringUCharacterIterator(); |
| m_tgtUtilColEIter_ = new CollationElementIterator(m_tgtUtilIter_, RuleBasedCollator.this); |
| m_utilBytes0_ = new byte[SORT_BUFFER_INIT_SIZE_CASE_]; // case |
| m_utilBytes1_ = new byte[SORT_BUFFER_INIT_SIZE_1_]; // primary |
| m_utilBytes2_ = new byte[SORT_BUFFER_INIT_SIZE_2_]; // secondary |
| m_utilBytes3_ = new byte[SORT_BUFFER_INIT_SIZE_3_]; // tertiary |
| m_utilBytes4_ = new byte[SORT_BUFFER_INIT_SIZE_4_]; // Quaternary |
| m_srcUtilCEBuffer_ = new int[CE_BUFFER_SIZE_]; |
| m_tgtUtilCEBuffer_ = new int[CE_BUFFER_SIZE_]; |
| } |
| |
| protected final void resetBuffers() { |
| m_utilCompare0_ = false; |
| // private boolean m_utilCompare1_; |
| m_utilCompare2_ = false; |
| m_utilCompare3_ = false; |
| m_utilCompare4_ = false; |
| m_utilCompare5_ = false; |
| |
| m_utilBytesCount0_ = 0; |
| m_utilBytesCount1_ = 0; |
| m_utilBytesCount2_ = 0; |
| m_utilBytesCount3_ = 0; |
| m_utilBytesCount4_ = 0; |
| // private int m_utilBytesCount5_; |
| |
| m_utilCount2_ = 0; |
| m_utilCount3_ = 0; |
| m_utilCount4_ = 0; |
| |
| m_utilFrenchStart_ = 0; |
| m_utilFrenchEnd_ = 0; |
| |
| m_srcUtilContOffset_ = 0; |
| m_tgtUtilContOffset_ = 0; |
| |
| m_srcUtilOffset_ = 0; |
| m_tgtUtilOffset_ = 0; |
| } |
| } |
| |
| // private methods ------------------------------------------------------- |
| |
/**
 * Initializes this collator from a rule string: starts from the UCA options and tables, runs a
 * CollationParsedRuleBuilder created from the rules against this collator, stores the rules, and then
 * performs the common initialization followed by building the lead byte permutation table.
 *
 * @param rules
 *            collation rules to build this collator from
 * @throws Exception
 *             declared because the rule builder may fail; NOTE(review): presumably this is mainly a
 *             parse failure of the rules - confirm against CollationParsedRuleBuilder
 */
private void init(String rules) throws Exception {
    // The UCA data has to be in place before the builder runs.
    setWithUCAData();
    CollationParsedRuleBuilder builder = new CollationParsedRuleBuilder(rules);
    builder.setRules(this);
    // setWithUCAData() stored the UCA rules in m_rules_; replace them with the rules actually used.
    m_rules_ = rules;
    init();
    buildPermutationTable();
}
| |
| private final int compareRegular(String source, String target, int offset, CollationBuffer buffer) { |
| buffer.resetBuffers(); |
| |
| int strength = getStrength(); |
| // setting up the collator parameters |
| buffer.m_utilCompare0_ = m_isCaseLevel_; |
| // m_utilCompare1_ = true; |
| buffer.m_utilCompare2_ = strength >= SECONDARY; |
| buffer.m_utilCompare3_ = strength >= TERTIARY; |
| buffer.m_utilCompare4_ = strength >= QUATERNARY; |
| buffer.m_utilCompare5_ = strength == IDENTICAL; |
| boolean doFrench = m_isFrenchCollation_ && buffer.m_utilCompare2_; |
| boolean doShift4 = m_isAlternateHandlingShifted_ && buffer.m_utilCompare4_; |
| boolean doHiragana4 = m_isHiragana4_ && buffer.m_utilCompare4_; |
| |
| if (doHiragana4 && doShift4) { |
| String sourcesub = source.substring(offset); |
| String targetsub = target.substring(offset); |
| return compareBySortKeys(sourcesub, targetsub, buffer); |
| } |
| |
| // This is the lowest primary value that will not be ignored if shifted |
| int lowestpvalue = m_isAlternateHandlingShifted_ ? m_variableTopValue_ << 16 : 0; |
| buffer.m_srcUtilCEBufferSize_ = 0; |
| buffer.m_tgtUtilCEBufferSize_ = 0; |
| int result = doPrimaryCompare(doHiragana4, lowestpvalue, source, target, offset, buffer); |
| if (buffer.m_srcUtilCEBufferSize_ == -1 && buffer.m_tgtUtilCEBufferSize_ == -1) { |
| // since the cebuffer is cleared when we have determined that |
| // either source is greater than target or vice versa, the return |
| // result is the comparison result and not the hiragana result |
| return result; |
| } |
| |
| int hiraganaresult = result; |
| |
| if (buffer.m_utilCompare2_) { |
| result = doSecondaryCompare(doFrench, buffer); |
| if (result != 0) { |
| return result; |
| } |
| } |
| // doing the case bit |
| if (buffer.m_utilCompare0_) { |
| result = doCaseCompare(buffer); |
| if (result != 0) { |
| return result; |
| } |
| } |
| // Tertiary level |
| if (buffer.m_utilCompare3_) { |
| result = doTertiaryCompare(buffer); |
| if (result != 0) { |
| return result; |
| } |
| } |
| |
| if (doShift4) { // checkQuad |
| result = doQuaternaryCompare(lowestpvalue, buffer); |
| if (result != 0) { |
| return result; |
| } |
| } else if (doHiragana4 && hiraganaresult != 0) { |
| // If we're fine on quaternaries, we might be different |
| // on Hiragana. This, however, might fail us in shifted. |
| return hiraganaresult; |
| } |
| |
| // For IDENTICAL comparisons, we use a bitwise character comparison |
| // as a tiebreaker if all else is equal. |
| // Getting here should be quite rare - strings are not identical - |
| // that is checked first, but compared == through all other checks. |
| if (buffer.m_utilCompare5_) { |
| return doIdenticalCompare(source, target, offset, true); |
| } |
| return 0; |
| } |
| |
| // Is this primary weight compressible? |
| // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit). |
| // TODO: This should use per-lead-byte flags from FractionalUCA.txt. |
| static boolean isCompressible(int primary1) { |
| return BYTE_FIRST_NON_LATIN_PRIMARY_ <= primary1 && primary1 <= maxRegularPrimary; |
| } |
| |
| /** |
| * Gets the 2 bytes of primary order and adds it to the primary byte array |
| * |
| * @param ce |
| * current ce |
| * @param notIsContinuation |
| * flag indicating if the current bytes belong to a continuation ce |
| * @param doShift |
| * flag indicating if ce is to be shifted |
| * @param leadPrimary |
| * lead primary used for compression |
| * @param commonBottom4 |
| * common byte value for Quaternary |
| * @param bottomCount4 |
| * smallest byte value for Quaternary |
| * @return the new lead primary for compression |
| */ |
| private final int doPrimaryBytes(int ce, boolean notIsContinuation, boolean doShift, int leadPrimary, |
| int commonBottom4, int bottomCount4, CollationBuffer buffer) { |
| |
| int p2 = (ce >>>= 16) & LAST_BYTE_MASK_; // in ints for unsigned |
| int p1 = ce >>> 8; // comparison |
| int originalP1 = p1; |
| if (notIsContinuation) { |
| if (m_leadBytePermutationTable_ != null) { |
| p1 = 0xff & m_leadBytePermutationTable_[p1]; |
| } |
| } |
| |
| if (doShift) { |
| if (buffer.m_utilCount4_ > 0) { |
| while (buffer.m_utilCount4_ > bottomCount4) { |
| buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonBottom4 + bottomCount4)); |
| buffer.m_utilBytesCount4_++; |
| buffer.m_utilCount4_ -= bottomCount4; |
| } |
| buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonBottom4 + (buffer.m_utilCount4_ - 1))); |
| buffer.m_utilBytesCount4_++; |
| buffer.m_utilCount4_ = 0; |
| } |
| // dealing with a variable and we're treating them as shifted |
| // This is a shifted ignorable |
| if (p1 != 0) { |
| // we need to check this since we could be in continuation |
| buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) p1); |
| buffer.m_utilBytesCount4_++; |
| } |
| if (p2 != 0) { |
| buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) p2); |
| buffer.m_utilBytesCount4_++; |
| } |
| } else { |
| // Note: This code assumes that the table is well built |
| // i.e. not having 0 bytes where they are not supposed to be. |
| // Usually, we'll have non-zero primary1 & primary2, except |
| // in cases of LatinOne and friends, when primary2 will be |
| // regular and simple sortkey calc |
| if (p1 != CollationElementIterator.IGNORABLE) { |
| if (notIsContinuation) { |
| if (leadPrimary == p1) { |
| buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p2); |
| buffer.m_utilBytesCount1_++; |
| } else { |
| if (leadPrimary != 0) { |
| buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, |
| ((p1 > leadPrimary) ? BYTE_UNSHIFTED_MAX_ : BYTE_UNSHIFTED_MIN_)); |
| buffer.m_utilBytesCount1_++; |
| } |
| if (p2 == CollationElementIterator.IGNORABLE) { |
| // one byter, not compressed |
| buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p1); |
| buffer.m_utilBytesCount1_++; |
| leadPrimary = 0; |
| } else if (isCompressible(originalP1)) { |
| // compress |
| leadPrimary = p1; |
| buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p1); |
| buffer.m_utilBytesCount1_++; |
| buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p2); |
| buffer.m_utilBytesCount1_++; |
| } else { |
| leadPrimary = 0; |
| buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p1); |
| buffer.m_utilBytesCount1_++; |
| buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p2); |
| buffer.m_utilBytesCount1_++; |
| } |
| } |
| } else { |
| // continuation, add primary to the key, no compression |
| buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p1); |
| buffer.m_utilBytesCount1_++; |
| if (p2 != CollationElementIterator.IGNORABLE) { |
| buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p2); |
| // second part |
| buffer.m_utilBytesCount1_++; |
| } |
| } |
| } |
| } |
| return leadPrimary; |
| } |
| |
| /** |
| * Gets the secondary byte and adds it to the secondary byte array |
| * |
| * @param ce current ce |
| * @param notIsContinuation flag indicating if the current bytes belong to a continuation ce |
| * @param doFrench flag indicator if french sort is to be performed |
| * @param buffer collation buffer temporary state |
| */ |
| private final void doSecondaryBytes(int ce, boolean notIsContinuation, boolean doFrench, CollationBuffer buffer) { |
| int s = (ce >>= 8) & LAST_BYTE_MASK_; // int for comparison |
| if (s != 0) { |
| if (!doFrench) { |
| // This is compression code. |
| if (s == COMMON_2_ && notIsContinuation) { |
| buffer.m_utilCount2_++; |
| } else { |
| if (buffer.m_utilCount2_ > 0) { |
| if (s > COMMON_2_) { // not necessary for 4th level. |
| while (buffer.m_utilCount2_ > TOP_COUNT_2_) { |
| buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_, |
| (byte) (COMMON_TOP_2_ - TOP_COUNT_2_)); |
| buffer.m_utilBytesCount2_++; |
| buffer.m_utilCount2_ -= TOP_COUNT_2_; |
| } |
| buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_, |
| (byte) (COMMON_TOP_2_ - (buffer.m_utilCount2_ - 1))); |
| buffer.m_utilBytesCount2_++; |
| } else { |
| while (buffer.m_utilCount2_ > BOTTOM_COUNT_2_) { |
| buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_, |
| (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_)); |
| buffer.m_utilBytesCount2_++; |
| buffer.m_utilCount2_ -= BOTTOM_COUNT_2_; |
| } |
| buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_, |
| (byte) (COMMON_BOTTOM_2_ + (buffer.m_utilCount2_ - 1))); |
| buffer.m_utilBytesCount2_++; |
| } |
| buffer.m_utilCount2_ = 0; |
| } |
| buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_, (byte) s); |
| buffer.m_utilBytesCount2_++; |
| } |
| } else { |
| buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_, (byte) s); |
| buffer.m_utilBytesCount2_++; |
| // Do the special handling for French secondaries |
| // We need to get continuation elements and do intermediate |
| // restore |
| // abc1c2c3de with french secondaries need to be edc1c2c3ba |
| // NOT edc3c2c1ba |
| if (notIsContinuation) { |
| if (buffer.m_utilFrenchStart_ != -1) { |
| // reverse secondaries from frenchStartPtr up to |
| // frenchEndPtr |
| reverseBuffer(buffer.m_utilBytes2_, buffer.m_utilFrenchStart_, buffer.m_utilFrenchEnd_); |
| buffer.m_utilFrenchStart_ = -1; |
| } |
| } else { |
| if (buffer.m_utilFrenchStart_ == -1) { |
| buffer.m_utilFrenchStart_ = buffer.m_utilBytesCount2_ - 2; |
| } |
| buffer.m_utilFrenchEnd_ = buffer.m_utilBytesCount2_ - 1; |
| } |
| } |
| } |
| } |
| |
| /** |
| * Reverse the argument buffer |
| * |
| * @param buffer to reverse |
| * @param start index in buffer to start from |
| * @param end index in buffer to end at |
| */ |
| private static void reverseBuffer(byte buffer[], int start, int end) { |
| while (start < end) { |
| byte b = buffer[start]; |
| buffer[start++] = buffer[end]; |
| buffer[end--] = b; |
| } |
| } |
| |
| /** |
| * Insert the case shifting byte if required |
| * |
| * @param caseshift value |
| * @return new caseshift value |
| */ |
| private final int doCaseShift(int caseshift, CollationBuffer buffer) { |
| if (caseshift == 0) { |
| buffer.m_utilBytes0_ = append(buffer.m_utilBytes0_, buffer.m_utilBytesCount0_, SORT_CASE_BYTE_START_); |
| buffer.m_utilBytesCount0_++; |
| caseshift = SORT_CASE_SHIFT_START_; |
| } |
| return caseshift; |
| } |
| |
| /** |
| * Performs the casing sort |
| * |
| * @param tertiary byte in ints for easy comparison |
| * @param notIsContinuation flag indicating if the current bytes belong to a continuation ce |
| * @param caseshift |
| * @param buffer collation buffer temporary state |
| * @return the new value of case shift |
| */ |
| private final int doCaseBytes(int tertiary, boolean notIsContinuation, int caseshift, CollationBuffer buffer) { |
| caseshift = doCaseShift(caseshift, buffer); |
| |
| if (notIsContinuation && tertiary != 0) { |
| byte casebits = (byte) (tertiary & 0xC0); |
| if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) { |
| if (casebits == 0) { |
| buffer.m_utilBytes0_[buffer.m_utilBytesCount0_ - 1] |= (1 << (--caseshift)); |
| } else { |
| // second bit |
| caseshift = doCaseShift(caseshift - 1, buffer); |
| buffer.m_utilBytes0_[buffer.m_utilBytesCount0_ - 1] |= ((casebits >> 6) & 1) << (--caseshift); |
| } |
| } else { |
| if (casebits != 0) { |
| buffer.m_utilBytes0_[buffer.m_utilBytesCount0_ - 1] |= 1 << (--caseshift); |
| // second bit |
| caseshift = doCaseShift(caseshift, buffer); |
| buffer.m_utilBytes0_[buffer.m_utilBytesCount0_ - 1] |= ((casebits >> 7) & 1) << (--caseshift); |
| } else { |
| caseshift--; |
| } |
| } |
| } |
| |
| return caseshift; |
| } |
| |
| /** |
| * Gets the tertiary byte and adds it to the tertiary byte array |
| * |
| * @param tertiary byte in int for easy comparison |
| * @param notIsContinuation flag indicating if the current bytes belong to a continuation ce |
| * @param buffer collation buffer temporary state |
| */ |
| private final void doTertiaryBytes(int tertiary, boolean notIsContinuation, CollationBuffer buffer) { |
| if (tertiary != 0) { |
| // This is compression code. |
| // sequence size check is included in the if clause |
| if (tertiary == m_common3_ && notIsContinuation) { |
| buffer.m_utilCount3_++; |
| } else { |
| int common3 = m_common3_ & LAST_BYTE_MASK_; |
| if (tertiary > common3 && m_common3_ == COMMON_NORMAL_3_) { |
| tertiary += m_addition3_; |
| } else if (tertiary <= common3 && m_common3_ == COMMON_UPPER_FIRST_3_) { |
| tertiary -= m_addition3_; |
| } |
| if (buffer.m_utilCount3_ > 0) { |
| if (tertiary > common3) { |
| while (buffer.m_utilCount3_ > m_topCount3_) { |
| buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) (m_top3_ - m_topCount3_)); |
| buffer.m_utilBytesCount3_++; |
| buffer.m_utilCount3_ -= m_topCount3_; |
| } |
| buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, |
| (byte) (m_top3_ - (buffer.m_utilCount3_ - 1))); |
| buffer.m_utilBytesCount3_++; |
| } else { |
| while (buffer.m_utilCount3_ > m_bottomCount3_) { |
| buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, |
| (byte) (m_bottom3_ + m_bottomCount3_)); |
| buffer.m_utilBytesCount3_++; |
| buffer.m_utilCount3_ -= m_bottomCount3_; |
| } |
| buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, |
| (byte) (m_bottom3_ + (buffer.m_utilCount3_ - 1))); |
| buffer.m_utilBytesCount3_++; |
| } |
| buffer.m_utilCount3_ = 0; |
| } |
| buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) tertiary); |
| buffer.m_utilBytesCount3_++; |
| } |
| } |
| } |
| |
| /** |
| * Gets the Quaternary byte and adds it to the Quaternary byte array |
| * |
| * @param isCodePointHiragana flag indicator if the previous codepoint we dealt with was Hiragana |
| * @param commonBottom4 smallest common Quaternary byte |
| * @param bottomCount4 smallest Quaternary byte |
| * @param hiragana4 hiragana Quaternary byte |
| * @param buffer collation buffer temporary state |
| */ |
| private final void doQuaternaryBytes(boolean isCodePointHiragana, int commonBottom4, int bottomCount4, |
| byte hiragana4, CollationBuffer buffer) { |
| if (isCodePointHiragana) { // This was Hiragana, need to note it |
| if (buffer.m_utilCount4_ > 0) { // Close this part |
| while (buffer.m_utilCount4_ > bottomCount4) { |
| buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonBottom4 + bottomCount4)); |
| buffer.m_utilBytesCount4_++; |
| buffer.m_utilCount4_ -= bottomCount4; |
| } |
| buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonBottom4 + (buffer.m_utilCount4_ - 1))); |
| buffer.m_utilBytesCount4_++; |
| buffer.m_utilCount4_ = 0; |
| } |
| buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, hiragana4); // Add the Hiragana |
| buffer.m_utilBytesCount4_++; |
| } else { // This wasn't Hiragana, so we can continue adding stuff |
| buffer.m_utilCount4_++; |
| } |
| } |
| |
| /** |
| * Iterates through the argument string for all ces. Split the ces into their relevant primaries, secondaries etc. |
| * |
| * @param source normalized string |
| * @param doFrench flag indicator if special handling of French has to be done |
| * @param hiragana4 offset for Hiragana quaternary |
| * @param commonBottom4 smallest common quaternary byte |
| * @param bottomCount4 smallest quaternary byte |
| * @param buffer collation buffer temporary state |
| */ |
| private final void getSortKeyBytes(String source, boolean doFrench, byte hiragana4, int commonBottom4, |
| int bottomCount4, CollationBuffer buffer) |
| |
| { |
| int backupDecomposition = getDecomposition(); |
| // TODO- hack fix around frozen state - stop self-modification |
| internalSetDecomposition(NO_DECOMPOSITION); // have to revert to backup later |
| buffer.m_srcUtilIter_.setText(source); |
| buffer.m_srcUtilColEIter_.setText(buffer.m_srcUtilIter_); |
| buffer.m_utilFrenchStart_ = -1; |
| buffer.m_utilFrenchEnd_ = -1; |
| |
| boolean doShift = false; |
| boolean notIsContinuation = false; |
| |
| int leadPrimary = 0; // int for easier comparison |
| int caseShift = 0; |
| |
| while (true) { |
| int ce = buffer.m_srcUtilColEIter_.next(); |
| if (ce == CollationElementIterator.NULLORDER) { |
| break; |
| } |
| |
| if (ce == CollationElementIterator.IGNORABLE) { |
| continue; |
| } |
| |
| notIsContinuation = !isContinuation(ce); |
| |
| boolean isPrimaryByteIgnorable = (ce & CE_PRIMARY_MASK_) == 0; |
| // actually we can just check that the first byte is 0 |
| // generation stuffs the order left first |
| boolean isSmallerThanVariableTop = (ce >>> CE_PRIMARY_SHIFT_) <= m_variableTopValue_; |
| doShift = (m_isAlternateHandlingShifted_ |
| && ((notIsContinuation && isSmallerThanVariableTop && !isPrimaryByteIgnorable) // primary byte not 0 |
| || (!notIsContinuation && doShift)) || (doShift && isPrimaryByteIgnorable)); |
| if (doShift && isPrimaryByteIgnorable) { |
| // amendment to the UCA says that primary ignorables and other |
| // ignorables should be removed if following a shifted code |
| // point |
| // if we were shifted and we got an ignorable code point |
| // we should just completely ignore it |
| continue; |
| } |
| leadPrimary = doPrimaryBytes(ce, notIsContinuation, doShift, leadPrimary, commonBottom4, bottomCount4, buffer); |
| |
| if (doShift) { |
| continue; |
| } |
| if (buffer.m_utilCompare2_) { |
| doSecondaryBytes(ce, notIsContinuation, doFrench, buffer); |
| } |
| |
| int t = ce & LAST_BYTE_MASK_; |
| if (!notIsContinuation) { |
| t = ce & CE_REMOVE_CONTINUATION_MASK_; |
| } |
| |
| if (buffer.m_utilCompare0_ && (!isPrimaryByteIgnorable || buffer.m_utilCompare2_)) { |
| // do the case level if we need to do it. We don't want to calculate |
| // case level for primary ignorables if we have only primary strength and case level |
| // otherwise we would break well formedness of CEs |
| caseShift = doCaseBytes(t, notIsContinuation, caseShift, buffer); |
| } else if (notIsContinuation) { |
| t ^= m_caseSwitch_; |
| } |
| |
| t &= m_mask3_; |
| |
| if (buffer.m_utilCompare3_) { |
| doTertiaryBytes(t, notIsContinuation, buffer); |
| } |
| |
| if (buffer.m_utilCompare4_ && notIsContinuation) { // compare quad |
| doQuaternaryBytes(buffer.m_srcUtilColEIter_.m_isCodePointHiragana_, commonBottom4, bottomCount4, hiragana4, buffer); |
| } |
| } |
| // TODO - hack fix around frozen state - stop self-modification |
| internalSetDecomposition(backupDecomposition); // reverts to original |
| if (buffer.m_utilFrenchStart_ != -1) { |
| // one last round of checks |
| reverseBuffer(buffer.m_utilBytes2_, buffer.m_utilFrenchStart_, buffer.m_utilFrenchEnd_); |
| } |
| } |
| |
| /** |
| * From the individual strength byte results the final compact sortkey will be calculated. |
| * |
| * @param source text string |
| * @param doFrench flag indicating that special handling of French has to be done |
| * @param commonBottom4 smallest common quaternary byte |
| * @param bottomCount4 smallest quaternary byte |
| * @param key output RawCollationKey to store results, key cannot be null |
| * @param buffer collation buffer temporary state |
| */ |
| private final void getSortKey(String source, boolean doFrench, int commonBottom4, int bottomCount4, |
| RawCollationKey key, CollationBuffer buffer) { |
| // we have done all the CE's, now let's put them together to form |
| // a key |
| if (buffer.m_utilCompare2_) { |
| doSecondary(doFrench, buffer); |
| } |
| // adding case level should be independent of secondary level |
| if (buffer.m_utilCompare0_) { |
| doCase(buffer); |
| } |
| if (buffer.m_utilCompare3_) { |
| doTertiary(buffer); |
| if (buffer.m_utilCompare4_) { |
| doQuaternary(commonBottom4, bottomCount4, buffer); |
| if (buffer.m_utilCompare5_) { |
| doIdentical(source, buffer); |
| } |
| |
| } |
| } |
| buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) 0); |
| buffer.m_utilBytesCount1_++; |
| |
| key.set(buffer.m_utilBytes1_, 0, buffer.m_utilBytesCount1_); |
| } |
| |
| /** |
| * Packs the French bytes |
| * @param buffer collation buffer temporary state |
| */ |
| private static final void doFrench(CollationBuffer buffer) { |
| for (int i = 0; i < buffer.m_utilBytesCount2_; i++) { |
| byte s = buffer.m_utilBytes2_[buffer.m_utilBytesCount2_ - i - 1]; |
| // This is compression code. |
| if (s == COMMON_2_) { |
| ++buffer.m_utilCount2_; |
| } else { |
| if (buffer.m_utilCount2_ > 0) { |
| // getting the unsigned value |
| if ((s & LAST_BYTE_MASK_) > COMMON_2_) { |
| // not necessary for 4th level. |
| while (buffer.m_utilCount2_ > TOP_COUNT_2_) { |
| buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, |
| (byte) (COMMON_TOP_2_ - TOP_COUNT_2_)); |
| buffer.m_utilBytesCount1_++; |
| buffer.m_utilCount2_ -= TOP_COUNT_2_; |
| } |
| buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, |
| (byte) (COMMON_TOP_2_ - (buffer.m_utilCount2_ - 1))); |
| buffer.m_utilBytesCount1_++; |
| } else { |
| while (buffer.m_utilCount2_ > BOTTOM_COUNT_2_) { |
| buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, |
| (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_)); |
| buffer.m_utilBytesCount1_++; |
| buffer.m_utilCount2_ -= BOTTOM_COUNT_2_; |
| } |
| buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, |
| (byte) (COMMON_BOTTOM_2_ + (buffer.m_utilCount2_ - 1))); |
| buffer.m_utilBytesCount1_++; |
| } |
| buffer.m_utilCount2_ = 0; |
| } |
| buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, s); |
| buffer.m_utilBytesCount1_++; |
| } |
| } |
| if (buffer.m_utilCount2_ > 0) { |
| while (buffer.m_utilCount2_ > BOTTOM_COUNT_2_) { |
| buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_)); |
| buffer.m_utilBytesCount1_++; |
| buffer.m_utilCount2_ -= BOTTOM_COUNT_2_; |
| } |
| buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) (COMMON_BOTTOM_2_ + (buffer.m_utilCount2_ - 1))); |
| buffer.m_utilBytesCount1_++; |
| } |
| } |
| |
| /** |
| * Compacts the secondary bytes and stores them into the primary array |
| * |
| * @param doFrench flag indicator that French has to be handled specially |
| * @param buffer collation buffer temporary state |
| */ |
| private static final void doSecondary(boolean doFrench, CollationBuffer buffer) { |
| if (buffer.m_utilCount2_ > 0) { |
| while (buffer.m_utilCount2_ > BOTTOM_COUNT_2_) { |
| buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_, (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_)); |
| buffer.m_utilBytesCount2_++; |
| buffer.m_utilCount2_ -= BOTTOM_COUNT_2_; |
| } |
| buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_, (byte) (COMMON_BOTTOM_2_ + (buffer.m_utilCount2_ - 1))); |
| buffer.m_utilBytesCount2_++; |
| } |
| |
| buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_); |
| buffer.m_utilBytesCount1_++; |
| |
| if (doFrench) { // do the reverse copy |
| doFrench(buffer); |
| } else { |
| if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + buffer.m_utilBytesCount2_) { |
| buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount2_); |
| } |
| System.arraycopy(buffer.m_utilBytes2_, 0, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount2_); |
| buffer.m_utilBytesCount1_ += buffer.m_utilBytesCount2_; |
| } |
| } |
| |
| /** |
| * Increase buffer size |
| * |
| * @param buffer array of bytes |
| * @param size of the byte array |
| * @param incrementsize size to increase |
| * @return the new buffer |
| */ |
| private static final byte[] increase(byte buffer[], int size, int incrementsize) { |
| byte result[] = new byte[buffer.length + incrementsize]; |
| System.arraycopy(buffer, 0, result, 0, size); |
| return result; |
| } |
| |
| /** |
| * Increase buffer size |
| * |
| * @param buffer array of ints |
| * @param size of the byte array |
| * @param incrementsize size to increase |
| * @return the new buffer |
| */ |
| private static final int[] increase(int buffer[], int size, int incrementsize) { |
| int result[] = new int[buffer.length + incrementsize]; |
| System.arraycopy(buffer, 0, result, 0, size); |
| return result; |
| } |
| |
| /** |
| * Compacts the case bytes and stores them into the primary array |
| * |
| * @param buffer collation buffer temporary state |
| */ |
| private static final void doCase(CollationBuffer buffer) { |
| buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_); |
| buffer.m_utilBytesCount1_++; |
| if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + buffer.m_utilBytesCount0_) { |
| buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount0_); |
| } |
| System.arraycopy(buffer.m_utilBytes0_, 0, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount0_); |
| buffer.m_utilBytesCount1_ += buffer.m_utilBytesCount0_; |
| } |
| |
| /** |
| * Compacts the tertiary bytes and stores them into the primary array |
| * |
| * @param buffer collation buffer temporary state |
| */ |
| private final void doTertiary(CollationBuffer buffer) { |
| if (buffer.m_utilCount3_ > 0) { |
| if (m_common3_ != COMMON_BOTTOM_3_) { |
| while (buffer.m_utilCount3_ >= m_topCount3_) { |
| buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) (m_top3_ - m_topCount3_)); |
| buffer.m_utilBytesCount3_++; |
| buffer.m_utilCount3_ -= m_topCount3_; |
| } |
| buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) (m_top3_ - buffer.m_utilCount3_)); |
| buffer.m_utilBytesCount3_++; |
| } else { |
| while (buffer.m_utilCount3_ > m_bottomCount3_) { |
| buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) (m_bottom3_ + m_bottomCount3_)); |
| buffer.m_utilBytesCount3_++; |
| buffer.m_utilCount3_ -= m_bottomCount3_; |
| } |
| buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) (m_bottom3_ + (buffer.m_utilCount3_ - 1))); |
| buffer.m_utilBytesCount3_++; |
| } |
| } |
| buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_); |
| buffer.m_utilBytesCount1_++; |
| if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + buffer.m_utilBytesCount3_) { |
| buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount3_); |
| } |
| System.arraycopy(buffer.m_utilBytes3_, 0, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount3_); |
| buffer.m_utilBytesCount1_ += buffer.m_utilBytesCount3_; |
| } |
| |
| /** |
| * Compacts the quaternary bytes and stores them into the primary array |
| * |
| * @param buffer collation buffer temporary state |
| */ |
| private final void doQuaternary(int commonbottom4, int bottomcount4, CollationBuffer buffer) { |
| if (buffer.m_utilCount4_ > 0) { |
| while (buffer.m_utilCount4_ > bottomcount4) { |
| buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonbottom4 + bottomcount4)); |
| buffer.m_utilBytesCount4_++; |
| buffer.m_utilCount4_ -= bottomcount4; |
| } |
| buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonbottom4 + (buffer.m_utilCount4_ - 1))); |
| buffer.m_utilBytesCount4_++; |
| } |
| buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_); |
| buffer.m_utilBytesCount1_++; |
| if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + buffer.m_utilBytesCount4_) { |
| buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount4_); |
| } |
| System.arraycopy(buffer.m_utilBytes4_, 0, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount4_); |
| buffer.m_utilBytesCount1_ += buffer.m_utilBytesCount4_; |
| } |
| |
| /** |
| * Deals with the identical sort. Appends the BOCSU version of the source string to the ends of the byte buffer. |
| * |
| * @param source text string |
| * @param buffer collation buffer temporary state |
| */ |
| private static final void doIdentical(String source, CollationBuffer buffer) { |
| int isize = BOCU.getCompressionLength(source); |
| buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_); |
| buffer.m_utilBytesCount1_++; |
| if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + isize) { |
| buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, 1 + isize); |
| } |
| buffer.m_utilBytesCount1_ = BOCU.compress(source, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_); |
| } |
| |
| /** |
| * Gets the offset of the first unmatched characters in source and target. This method returns the offset of the |
| * start of a contraction or a combining sequence, if the first difference is in the middle of such a sequence. |
| * |
| * @param source |
| * string |
| * @param target |
| * string |
| * @return offset of the first unmatched characters in source and target. |
| */ |
| private final int getFirstUnmatchedOffset(String source, String target) { |
| int result = 0; |
| int slength = source.length(); |
| int tlength = target.length(); |
| int minlength = slength; |
| if (minlength > tlength) { |
| minlength = tlength; |
| } |
| while (result < minlength && source.charAt(result) == target.charAt(result)) { |
| result++; |
| } |
| if (result > 0) { |
| // There is an identical portion at the beginning of the two |
| // strings. If the identical portion ends within a contraction or a |
| // combining character sequence, back up to the start of that |
| // sequence. |
| char schar = 0; |
| char tchar = 0; |
| if (result < minlength) { |
| schar = source.charAt(result); // first differing chars |
| tchar = target.charAt(result); |
| } else { |
| schar = source.charAt(minlength - 1); |
| if (isUnsafe(schar)) { |
| tchar = schar; |
| } else if (slength == tlength) { |
| return result; |
| } else if (slength < tlength) { |
| tchar = target.charAt(result); |
| } else { |
| schar = source.charAt(result); |
| } |
| } |
| if (isUnsafe(schar) || isUnsafe(tchar)) { |
| // We are stopped in the middle of a contraction or combining |
| // sequence. |
| // Look backwards for the part of the string for the start of |
| // the sequence |
| // It doesn't matter which string we scan, since they are the |
| // same in this region. |
| do { |
| result--; |
| } while (result > 0 && isUnsafe(source.charAt(result))); |
| } |
| } |
| return result; |
| } |
| |
| /** |
| * Appending an byte to an array of bytes and increases it if we run out of space |
| * |
| * @param array |
| * of byte arrays |
| * @param appendindex |
| * index in the byte array to append |
| * @param value |
| * to append |
| * @return array if array size can accomodate the new value, otherwise a bigger array will be created and returned |
| */ |
| private static final byte[] append(byte array[], int appendindex, byte value) { |
| try { |
| array[appendindex] = value; |
| } catch (ArrayIndexOutOfBoundsException e) { |
| array = increase(array, appendindex, SORT_BUFFER_INIT_SIZE_); |
| array[appendindex] = value; |
| } |
| return array; |
| } |
| |
| /** |
| * This is a trick string compare function that goes in and uses sortkeys to compare. It is used when compare gets |
| * in trouble and needs to bail out. |
| * |
| * @param source text string |
| * @param target text string |
| * @param buffer collation buffer temporary state |
| */ |
| private final int compareBySortKeys(String source, String target, CollationBuffer buffer) |
| { |
| buffer.m_utilRawCollationKey_ = getRawCollationKey(source, buffer.m_utilRawCollationKey_); |
| // this method is very seldom called |
| RawCollationKey targetkey = getRawCollationKey(target, null); |
| return buffer.m_utilRawCollationKey_.compareTo(targetkey); |
| } |
| |
| /** |
| * Performs the primary comparisons, and fills up the CE buffer at the same time. The return value toggles between |
| * the comparison result and the hiragana result. If either the source is greater than target or vice versa, the |
| * return result is the comparison result, ie 1 or -1, furthermore the cebuffers will be cleared when that happens. |
| * If the primary comparisons are equal, we'll have to continue with secondary comparison. In this case the cebuffer |
| * will not be cleared and the return result will be the hiragana result. |
| * |
| * @param doHiragana4 flag indicator that Hiragana Quaternary has to be observed |
| * @param lowestpvalue the lowest primary value that will not be ignored if alternate handling is shifted |
| * @param source text string |
| * @param target text string |
| * @param textoffset offset in text to start the comparison |
| * @param buffer collation buffer temporary state |
| * @return comparion result if a primary difference is found, otherwise hiragana result |
| */ |
| private final int doPrimaryCompare(boolean doHiragana4, int lowestpvalue, String source, String target, |
| int textoffset, CollationBuffer buffer) |
| |
| { |
| // Preparing the context objects for iterating over strings |
| buffer.m_srcUtilIter_.setText(source); |
| buffer.m_srcUtilColEIter_.setText(buffer.m_srcUtilIter_, textoffset); |
| buffer.m_tgtUtilIter_.setText(target); |
| buffer.m_tgtUtilColEIter_.setText(buffer.m_tgtUtilIter_, textoffset); |
| |
| // Non shifted primary processing is quite simple |
| if (!m_isAlternateHandlingShifted_) { |
| int hiraganaresult = 0; |
| while (true) { |
| int sorder = 0; |
| // We fetch CEs until we hit a non ignorable primary or end. |
| do { |
| sorder = buffer.m_srcUtilColEIter_.next(); |
| buffer.m_srcUtilCEBuffer_ = append(buffer.m_srcUtilCEBuffer_, buffer.m_srcUtilCEBufferSize_, sorder); |
| buffer.m_srcUtilCEBufferSize_++; |
| sorder &= CE_PRIMARY_MASK_; |
| } while (sorder == CollationElementIterator.IGNORABLE); |
| |
| int torder = 0; |
| do { |
| torder = buffer.m_tgtUtilColEIter_.next(); |
| buffer.m_tgtUtilCEBuffer_ = append(buffer.m_tgtUtilCEBuffer_, buffer.m_tgtUtilCEBufferSize_, torder); |
| buffer.m_tgtUtilCEBufferSize_++; |
| torder &= CE_PRIMARY_MASK_; |
| } while (torder == CollationElementIterator.IGNORABLE); |
| |
| if (!isContinuation(sorder) && m_leadBytePermutationTable_ != null) { |
| sorder = (m_leadBytePermutationTable_[((sorder >> 24) + 256) % 256] << 24) | (sorder & 0x00FFFFFF); |
| torder = (m_leadBytePermutationTable_[((torder >> 24) + 256) % 256] << 24) | (torder & 0x00FFFFFF); |
| } |
| |
| // if both primaries are the same |
| if (sorder == torder) { |
| // and there are no more CEs, we advance to the next level |
| // see if we are at the end of either string |
| if (buffer.m_srcUtilCEBuffer_[buffer.m_srcUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER) { |
| if (buffer.m_tgtUtilCEBuffer_[buffer.m_tgtUtilCEBufferSize_ - 1] != CollationElementIterator.NULLORDER) { |
| return -1; |
| } |
| break; |
| } else if (buffer.m_tgtUtilCEBuffer_[buffer.m_tgtUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER) { |
| return 1; |
| } |
| if (doHiragana4 && hiraganaresult == 0 |
| && buffer.m_srcUtilColEIter_.m_isCodePointHiragana_ != buffer.m_tgtUtilColEIter_.m_isCodePointHiragana_) { |
| if (buffer.m_srcUtilColEIter_.m_isCodePointHiragana_) { |
| hiraganaresult = -1; |
| } else { |
| hiraganaresult = 1; |
| } |
| } |
| } else { |
| // if two primaries are different, we are done |
| return endPrimaryCompare(sorder, torder, buffer); |
| } |
| } |
| // no primary difference... do the rest from the buffers |
| return hiraganaresult; |
| } else { // shifted - do a slightly more complicated processing :) |
| while (true) { |
| int sorder = getPrimaryShiftedCompareCE(buffer.m_srcUtilColEIter_, lowestpvalue, true, buffer); |
| int torder = getPrimaryShiftedCompareCE(buffer.m_tgtUtilColEIter_, lowestpvalue, false, buffer); |
| if (sorder == torder) { |
| if (buffer.m_srcUtilCEBuffer_[buffer.m_srcUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER) { |
| break; |
| } else { |
| continue; |
| } |
| } else { |
| return endPrimaryCompare(sorder, torder, buffer); |
| } |
| } // no primary difference... do the rest from the buffers |
| } |
| return 0; |
| } |
| |
| /** |
| * This is used only for primary strength when we know that sorder is already different from torder. Compares sorder |
| * and torder, returns -1 if sorder is less than torder. Clears the cebuffer at the same time. |
| * |
| * @param sorder source strength order |
| * @param torder target strength order |
| * @param buffer collation buffer temporary state |
| * @return the comparison result of sorder and torder |
| */ |
| private static final int endPrimaryCompare(int sorder, int torder, CollationBuffer buffer) { |
| // if we reach here, the ce offset accessed is the last ce |
| // appended to the buffer |
| boolean isSourceNullOrder = (buffer.m_srcUtilCEBuffer_[buffer.m_srcUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER); |
| boolean isTargetNullOrder = (buffer.m_tgtUtilCEBuffer_[buffer.m_tgtUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER); |
| buffer.m_srcUtilCEBufferSize_ = -1; |
| buffer.m_tgtUtilCEBufferSize_ = -1; |
| if (isSourceNullOrder) { |
| return -1; |
| } |
| if (isTargetNullOrder) { |
| return 1; |
| } |
| // getting rid of the sign |
| sorder >>>= CE_PRIMARY_SHIFT_; |
| torder >>>= CE_PRIMARY_SHIFT_; |
| if (sorder < torder) { |
| return -1; |
| } |
| return 1; |
| } |
| |
| /** |
| * Calculates the next primary shifted value and fills up cebuffer with the next non-ignorable ce. |
| * |
| * @param coleiter collation element iterator |
| * @param doHiragana4 flag indicator if hiragana quaternary is to be handled |
| * @param lowestpvalue lowest primary shifted value that will not be ignored |
| * @param buffer collation buffer temporary state |
| * @return result next modified ce |
| */ |
| private static final int getPrimaryShiftedCompareCE(CollationElementIterator coleiter, int lowestpvalue, boolean isSrc, CollationBuffer buffer) |
| { |
| boolean shifted = false; |
| int result = CollationElementIterator.IGNORABLE; |
| int cebuffer[] = buffer.m_srcUtilCEBuffer_; |
| int cebuffersize = buffer.m_srcUtilCEBufferSize_; |
| if (!isSrc) { |
| cebuffer = buffer.m_tgtUtilCEBuffer_; |
| cebuffersize = buffer.m_tgtUtilCEBufferSize_; |
| } |
| while (true) { |
| result = coleiter.next(); |
| if (result == CollationElementIterator.NULLORDER) { |
| cebuffer = append(cebuffer, cebuffersize, result); |
| cebuffersize++; |
| break; |
| } else if (result == CollationElementIterator.IGNORABLE |
| || (shifted && (result & CE_PRIMARY_MASK_) == CollationElementIterator.IGNORABLE)) { |
| // UCA amendment - ignore ignorables that follow shifted code |
| // points |
| continue; |
| } else if (isContinuation(result)) { |
| if ((result & CE_PRIMARY_MASK_) != CollationElementIterator.IGNORABLE) { |
| // There is primary value |
| if (shifted) { |
| result = (result & CE_PRIMARY_MASK_) | CE_CONTINUATION_MARKER_; |
| // preserve interesting continuation |
| cebuffer = append(cebuffer, cebuffersize, result); |
| cebuffersize++; |
| continue; |
| } else { |
| cebuffer = append(cebuffer, cebuffersize, result); |
| cebuffersize++; |
| break; |
| } |
| } else { // Just lower level values |
| if (!shifted) { |
| cebuffer = append(cebuffer, cebuffersize, result); |
| cebuffersize++; |
| } |
| } |
| } else { // regular |
| if (Utility.compareUnsigned(result & CE_PRIMARY_MASK_, lowestpvalue) > 0) { |
| cebuffer = append(cebuffer, cebuffersize, result); |
| cebuffersize++; |
| break; |
| } else { |
| if ((result & CE_PRIMARY_MASK_) != 0) { |
| shifted = true; |
| result &= CE_PRIMARY_MASK_; |
| cebuffer = append(cebuffer, cebuffersize, result); |
| cebuffersize++; |
| continue; |
| } else { |
| cebuffer = append(cebuffer, cebuffersize, result); |
| cebuffersize++; |
| shifted = false; |
| continue; |
| } |
| } |
| } |
| } |
| if (isSrc) { |
| buffer.m_srcUtilCEBuffer_ = cebuffer; |
| buffer.m_srcUtilCEBufferSize_ = cebuffersize; |
| } else { |
| buffer.m_tgtUtilCEBuffer_ = cebuffer; |
| buffer.m_tgtUtilCEBufferSize_ = cebuffersize; |
| } |
| result &= CE_PRIMARY_MASK_; |
| return result; |
| } |
| |
| /** |
| * Appending an int to an array of ints and increases it if we run out of space |
| * |
| * @param array |
| * of int arrays |
| * @param appendindex |
| * index at which value will be appended |
| * @param value |
| * to append |
| * @return array if size is not increased, otherwise a new array will be returned |
| */ |
| private static final int[] append(int array[], int appendindex, int value) { |
| if (appendindex + 1 >= array.length) { |
| array = increase(array, appendindex, CE_BUFFER_SIZE_); |
| } |
| array[appendindex] = value; |
| return array; |
| } |
| |
| /** |
| * Does secondary strength comparison based on the collected ces. |
| * |
| * @param doFrench flag indicates if French ordering is to be done |
| * @param buffer collation buffer temporary state |
| * @return the secondary strength comparison result |
| */ |
| private static final int doSecondaryCompare(boolean doFrench, CollationBuffer buffer) { |
| // now, we're gonna reexamine collected CEs |
| if (!doFrench) { // normal |
| int soffset = 0; |
| int toffset = 0; |
| while (true) { |
| int sorder = CollationElementIterator.IGNORABLE; |
| while (sorder == CollationElementIterator.IGNORABLE) { |
| sorder = buffer.m_srcUtilCEBuffer_[soffset++] & CE_SECONDARY_MASK_; |
| } |
| int torder = CollationElementIterator.IGNORABLE; |
| while (torder == CollationElementIterator.IGNORABLE) { |
| torder = buffer.m_tgtUtilCEBuffer_[toffset++] & CE_SECONDARY_MASK_; |
| } |
| |
| if (sorder == torder) { |
| if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) { |
| if (buffer.m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) { |
| return -1; |
| } |
| break; |
| } else if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) { |
| return 1; |
| } |
| } else { |
| if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) { |
| return -1; |
| } |
| if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) { |
| return 1; |
| } |
| return (sorder < torder) ? -1 : 1; |
| } |
| } |
| } else { // do the French |
| buffer.m_srcUtilContOffset_ = 0; |
| buffer.m_tgtUtilContOffset_ = 0; |
| buffer.m_srcUtilOffset_ = buffer.m_srcUtilCEBufferSize_ - 2; |
| buffer.m_tgtUtilOffset_ = buffer.m_tgtUtilCEBufferSize_ - 2; |
| while (true) { |
| int sorder = getSecondaryFrenchCE(true, buffer); |
| int torder = getSecondaryFrenchCE(false, buffer); |
| if (sorder == torder) { |
| if ((buffer.m_srcUtilOffset_ < 0 && buffer.m_tgtUtilOffset_ < 0) |
| || (buffer.m_srcUtilOffset_ >= 0 && buffer.m_srcUtilCEBuffer_[buffer.m_srcUtilOffset_] == CollationElementIterator.NULLORDER)) { |
| break; |
| } |
| } else { |
| return (sorder < torder) ? -1 : 1; |
| } |
| } |
| } |
| return 0; |
| } |
| |
| /** |
| * Calculates the next secondary french CE. |
| * |
| * @param isSrc flag indicator if we are calculating the src ces |
| * @param buffer collation buffer temporary state |
| * @return result next modified ce |
| */ |
| private static final int getSecondaryFrenchCE(boolean isSrc, CollationBuffer buffer) { |
| int result = CollationElementIterator.IGNORABLE; |
| int offset = buffer.m_srcUtilOffset_; |
| int continuationoffset = buffer.m_srcUtilContOffset_; |
| int cebuffer[] = buffer.m_srcUtilCEBuffer_; |
| if (!isSrc) { |
| offset = buffer.m_tgtUtilOffset_; |
| continuationoffset = buffer.m_tgtUtilContOffset_; |
| cebuffer = buffer.m_tgtUtilCEBuffer_; |
| } |
| |
| while (result == CollationElementIterator.IGNORABLE && offset >= 0) { |
| if (continuationoffset == 0) { |
| result = cebuffer[offset]; |
| while (isContinuation(cebuffer[offset--])) { |
| } |
| // after this, sorder is at the start of continuation, |
| // and offset points before that |
| if (isContinuation(cebuffer[offset + 1])) { |
| // save offset for later |
| continuationoffset = offset; |
| offset += 2; |
| } |
| } else { |
| result = cebuffer[offset++]; |
| if (!isContinuation(result)) { |
| // we have finished with this continuation |
| offset = continuationoffset; |
| // reset the pointer to before continuation |
| continuationoffset = 0; |
| continue; |
| } |
| } |
| result &= CE_SECONDARY_MASK_; // remove continuation bit |
| } |
| if (isSrc) { |
| buffer.m_srcUtilOffset_ = offset; |
| buffer.m_srcUtilContOffset_ = continuationoffset; |
| } else { |
| buffer.m_tgtUtilOffset_ = offset; |
| buffer.m_tgtUtilContOffset_ = continuationoffset; |
| } |
| return result; |
| } |
| |
| /** |
| * Does case strength comparison based on the collected ces. |
| * |
| * @param buffer collation buffer temporary state |
| * @return the case strength comparison result |
| */ |
| private final int doCaseCompare(CollationBuffer buffer) { |
| int soffset = 0; |
| int toffset = 0; |
| while (true) { |
| int sorder = CollationElementIterator.IGNORABLE; |
| int torder = CollationElementIterator.IGNORABLE; |
| while ((sorder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) { |
| sorder = buffer.m_srcUtilCEBuffer_[soffset++]; |
| if (!isContinuation(sorder) && ((sorder & CE_PRIMARY_MASK_) != 0 || buffer.m_utilCompare2_ == true)) { |
| // primary ignorables should not be considered on the case level when the strength is primary |
| // otherwise, the CEs stop being well-formed |
| sorder &= CE_CASE_MASK_3_; |
| sorder ^= m_caseSwitch_; |
| } else { |
| sorder = CollationElementIterator.IGNORABLE; |
| } |
| } |
| |
| while ((torder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) { |
| torder = buffer.m_tgtUtilCEBuffer_[toffset++]; |
| if (!isContinuation(torder) && ((torder & CE_PRIMARY_MASK_) != 0 || buffer.m_utilCompare2_ == true)) { |
| // primary ignorables should not be considered on the case level when the strength is primary |
| // otherwise, the CEs stop being well-formed |
| torder &= CE_CASE_MASK_3_; |
| torder ^= m_caseSwitch_; |
| } else { |
| torder = CollationElementIterator.IGNORABLE; |
| } |
| } |
| |
| sorder &= CE_CASE_BIT_MASK_; |
| torder &= CE_CASE_BIT_MASK_; |
| if (sorder == torder) { |
| // checking end of strings |
| if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) { |
| if (buffer.m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) { |
| return -1; |
| } |
| break; |
| } else if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) { |
| return 1; |
| } |
| } else { |
| if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) { |
| return -1; |
| } |
| if (buffer.m_tgtUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) { |
| return 1; |
| } |
| return (sorder < torder) ? -1 : 1; |
| } |
| } |
| return 0; |
| } |
| |
| /** |
| * Does tertiary strength comparison based on the collected ces. |
| * |
| * @param buffer collation buffer temporary state |
| * @return the tertiary strength comparison result |
| */ |
| private final int doTertiaryCompare(CollationBuffer buffer) { |
| int soffset = 0; |
| int toffset = 0; |
| while (true) { |
| int sorder = CollationElementIterator.IGNORABLE; |
| int torder = CollationElementIterator.IGNORABLE; |
| while ((sorder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) { |
| sorder = buffer.m_srcUtilCEBuffer_[soffset++] & m_mask3_; |
| if (!isContinuation(sorder)) { |
| sorder ^= m_caseSwitch_; |
| } else { |
| sorder &= CE_REMOVE_CASE_; |
| } |
| } |
| |
| while ((torder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) { |
| torder = buffer.m_tgtUtilCEBuffer_[toffset++] & m_mask3_; |
| if (!isContinuation(torder)) { |
| torder ^= m_caseSwitch_; |
| } else { |
| torder &= CE_REMOVE_CASE_; |
| } |
| } |
| |
| if (sorder == torder) { |
| if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) { |
| if (buffer.m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) { |
| return -1; |
| } |
| break; |
| } else if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) { |
| return 1; |
| } |
| } else { |
| if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) { |
| return -1; |
| } |
| if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) { |
| return 1; |
| } |
| return (sorder < torder) ? -1 : 1; |
| } |
| } |
| return 0; |
| } |
| |
/**
 * Compares the quaternary weights derived from the CEs previously collected in the source and target CE buffers.
 *
 * <p>
 * A non-continuation CE that is (unsigned) greater than {@code lowestpvalue}, or that has no primary weight, is
 * replaced by CE_PRIMARY_MASK_ and marks its side as not shifted; any other non-continuation CE marks its side as
 * shifted and is used as is. Continuation CEs are skipped while their side is not shifted. The resulting values
 * are compared after an unsigned shift by CE_PRIMARY_SHIFT_.
 * </p>
 *
 * @param lowestpvalue the lowest primary value that will not be ignored if alternate handling is shifted
 * @param buffer collation buffer temporary state
 * @return -1, 0 or 1 as the quaternary strength comparison result
 */
private final int doQuaternaryCompare(int lowestpvalue, CollationBuffer buffer) {
    final int[] srcCEs = buffer.m_srcUtilCEBuffer_;
    final int[] tgtCEs = buffer.m_tgtUtilCEBuffer_;
    boolean srcShifted = true;
    boolean tgtShifted = true;
    int srcNext = 0;
    int tgtNext = 0;

    for (;;) {
        int srcCE;
        do {
            srcCE = srcCEs[srcNext++];
            if (!isContinuation(srcCE)) {
                if (Utility.compareUnsigned(srcCE, lowestpvalue) > 0
                        || (srcCE & CE_PRIMARY_MASK_) == CollationElementIterator.IGNORABLE) {
                    srcCE = CE_PRIMARY_MASK_;
                    srcShifted = false;
                } else {
                    srcShifted = true;
                }
            }
        } while (srcCE == CollationElementIterator.IGNORABLE || (isContinuation(srcCE) && !srcShifted));
        final int srcQuaternary = srcCE >>> CE_PRIMARY_SHIFT_;

        int tgtCE;
        do {
            tgtCE = tgtCEs[tgtNext++];
            if (!isContinuation(tgtCE)) {
                if (Utility.compareUnsigned(tgtCE, lowestpvalue) > 0
                        || (tgtCE & CE_PRIMARY_MASK_) == CollationElementIterator.IGNORABLE) {
                    tgtCE = CE_PRIMARY_MASK_;
                    tgtShifted = false;
                } else {
                    tgtShifted = true;
                }
            }
        } while (tgtCE == CollationElementIterator.IGNORABLE || (isContinuation(tgtCE) && !tgtShifted));
        final int tgtQuaternary = tgtCE >>> CE_PRIMARY_SHIFT_;

        // end-of-text is detected on the raw buffered CE, not on the substituted value
        final boolean srcEnded = (srcCEs[srcNext - 1] == CollationElementIterator.NULLORDER);
        final boolean tgtEnded = (tgtCEs[tgtNext - 1] == CollationElementIterator.NULLORDER);

        if (srcQuaternary != tgtQuaternary) {
            if (srcEnded) {
                return -1;
            }
            if (tgtEnded) {
                return 1;
            }
            return (srcQuaternary < tgtQuaternary) ? -1 : 1;
        }
        if (srcEnded) {
            return tgtEnded ? 0 : -1;
        }
        if (tgtEnded) {
            return 1;
        }
    }
}
| |
| /** |
| * Internal function. Does byte level string compare. Used by strcoll if strength == identical and strings are |
| * otherwise equal. This is a rare case. Comparison must be done on NFD normalized strings. FCD is not good enough. |
| * |
| * @param source |
| * text |
| * @param target |
| * text |
| * @param offset |
| * of the first difference in the text strings |
| * @param normalize |
| * flag indicating if we are to normalize the text before comparison |
| * @return 1 if source is greater than target, -1 less than and 0 if equals |
| */ |
| private static final int doIdenticalCompare(String source, String target, int offset, boolean normalize) |
| |
| { |
| if (normalize) { |
| if (Normalizer.quickCheck(source, Normalizer.NFD, 0) != Normalizer.YES) { |
| source = Normalizer.decompose(source, false); |
| } |
| |
| if (Normalizer.quickCheck(target, Normalizer.NFD, 0) != Normalizer.YES) { |
| target = Normalizer.decompose(target, false); |
| } |
| offset = 0; |
| } |
| |
| return doStringCompare(source, target, offset); |
| } |
| |
/**
 * Compares two strings in code point order, starting at the given offset. Code units at or above the lead
 * surrogate range are rotated with {@code fixupUTF16} so that surrogates sort after all other code units.
 *
 * @param source
 *            text
 * @param target
 *            text
 * @param offset
 *            start offset for comparison; NOTE(review): when it is larger than the shorter string's length the
 *            method returns 1 without comparing anything - callers presumably never pass such a value
 * @return 1 if source is greater than target, -1 less than and 0 if equals
 */
private static final int doStringCompare(String source, String target, int offset) {
    final int sourceLength = source.length();
    final int targetLength = target.length();
    final int commonLength = Math.min(sourceLength, targetLength);

    // scan for the first differing code unit
    char sourceUnit = 0;
    char targetUnit = 0;
    int index = offset;
    while (index < commonLength) {
        sourceUnit = source.charAt(index);
        targetUnit = target.charAt(index);
        index++;
        if (sourceUnit != targetUnit) {
            break;
        }
    }

    // no difference inside the common part: the longer string is greater
    if (sourceUnit == targetUnit && index == commonLength) {
        if (sourceLength > commonLength) {
            return 1;
        }
        return (targetLength > commonLength) ? -1 : 0;
    }

    // if both values are in or above the surrogate range, fix them up
    if (sourceUnit >= UTF16.LEAD_SURROGATE_MIN_VALUE && targetUnit >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
        sourceUnit = fixupUTF16(sourceUnit);
        targetUnit = fixupUTF16(targetUnit);
    }

    // the two units are known to differ at this point
    return (sourceUnit < targetUnit) ? -1 : 1;
}
| |
| /** |
| * Rotate surrogates to the top to get code point order |
| */ |
| private static final char fixupUTF16(char ch) { |
| if (ch >= 0xe000) { |
| ch -= 0x800; |
| } else { |
| ch += 0x2000; |
| } |
| return ch; |
| } |
| |
| private static final int UCOL_REORDER_CODE_IGNORE = ReorderCodes.LIMIT + 1; |
| /** |
| * Builds the lead byte permuatation table |
| */ |
| private void buildPermutationTable() { |
| if (m_reorderCodes_ == null || m_reorderCodes_.length == 0 || (m_reorderCodes_.length == 1 && m_reorderCodes_[0] == ReorderCodes.NONE)) { |
| m_leadBytePermutationTable_ = null; |
| return; |
| } |
| |
| if (m_reorderCodes_[0] == ReorderCodes.DEFAULT) { |
| if (m_reorderCodes_.length != 1) { |
| throw new IllegalArgumentException("Illegal collation reorder codes - default reorder code must be the only code in the list."); |
| } |
| // swap the reorder codes for those at build of the rules |
| if (m_defaultReorderCodes_ == null || m_defaultReorderCodes_.length == 0) { |
| m_leadBytePermutationTable_ = null; |
| } |
| m_reorderCodes_ = m_defaultReorderCodes_.clone(); |
| } |
| |
| // TODO - these need to be read in from the UCA data file |
| // The lowest byte that hasn't been assigned a mapping |
| int toBottom = 0x03; |
| // The highest byte that hasn't been assigned a mapping |
| int toTop = 0xe4; |
| |
| // filled slots in the output m_scriptOrder_ |
| boolean[] permutationSlotFilled = new boolean[256]; |
| |
| // used lead bytes |
| boolean[] newLeadByteUsed = new boolean[256]; |
| |
| if (m_leadBytePermutationTable_ == null) { |
| m_leadBytePermutationTable_ = new byte[256]; |
| } |
| |
| // prefill the reordering codes with the leading entries |
| int[] internalReorderCodes = new int[m_reorderCodes_.length + (ReorderCodes.LIMIT - ReorderCodes.FIRST)]; |
| for (int codeIndex = 0; codeIndex < ReorderCodes.LIMIT - ReorderCodes.FIRST; codeIndex++) { |
| internalReorderCodes[codeIndex] = ReorderCodes.FIRST + codeIndex; |
| } |
| for (int codeIndex = 0; codeIndex < m_reorderCodes_.length; codeIndex++) { |
| internalReorderCodes[codeIndex + (ReorderCodes.LIMIT - ReorderCodes.FIRST)] = m_reorderCodes_[codeIndex]; |
| if (m_reorderCodes_[codeIndex] >= ReorderCodes.FIRST && m_reorderCodes_[codeIndex] < ReorderCodes.LIMIT) { |
| internalReorderCodes[m_reorderCodes_[codeIndex] - ReorderCodes.FIRST] = UCOL_REORDER_CODE_IGNORE; |
| } |
| } |
| |
| /* |
| * Start from the front of the list and place each script we encounter at the earliest possible locatation |
| * in the permutation table. If we encounter UNKNOWN, start processing from the back, and place each script |
| * in the last possible location. At each step, we also need to make sure that any scripts that need to not |
| * be moved are copied to their same location in the final table. |
| */ |
| boolean fromTheBottom = true; |
| int reorderCodesIndex = -1; |
| for (int reorderCodesCount = 0; reorderCodesCount < internalReorderCodes.length; reorderCodesCount++) { |
| reorderCodesIndex += fromTheBottom ? 1 : -1; |
| int next = internalReorderCodes[reorderCodesIndex]; |
| if (next == UCOL_REORDER_CODE_IGNORE) { |
| continue; |
| } |
| if (next == UScript.UNKNOWN) { |
| if (fromTheBottom == false) { |
| // double turnaround |
| m_leadBytePermutationTable_ = null; |
| throw new IllegalArgumentException("Illegal collation reorder codes - two \"from the end\" markers."); |
| } |
| fromTheBottom = false; |
| reorderCodesIndex = internalReorderCodes.length; |
| continue; |
| } |
| |
| int[] leadBytes = RuleBasedCollator.LEADBYTE_CONSTANTS_.getLeadBytesForReorderCode(next); |
| if (fromTheBottom) { |
| for (int leadByte : leadBytes) { |
| // don't place a lead byte twice in the permutation table |
| if (permutationSlotFilled[leadByte]) { |
| // lead byte already used |
| m_leadBytePermutationTable_ = null; |
| throw new IllegalArgumentException("Illegal reorder codes specified - multiple codes with the same lead byte."); |
| } |
| m_leadBytePermutationTable_[leadByte] = (byte) toBottom; |
| newLeadByteUsed[toBottom] = true; |
| permutationSlotFilled[leadByte] = true; |
| toBottom++; |
| } |
| } else { |
| for (int leadByteIndex = leadBytes.length - 1; leadByteIndex >= 0; leadByteIndex--) { |
| int leadByte = leadBytes[leadByteIndex]; |
| // don't place a lead byte twice in the permutation table |
| if (permutationSlotFilled[leadByte]) { |
| // lead byte already used |
| m_leadBytePermutationTable_ = null; |
| throw new IllegalArgumentException("Illegal reorder codes specified - multiple codes with the same lead byte."); |
| } |
| |
| m_leadBytePermutationTable_[leadByte] = (byte) toTop; |
| newLeadByteUsed[toTop] = true; |
| permutationSlotFilled[leadByte] = true; |
| toTop--; |
| } |
| } |
| } |
| |
| /* Copy everything that's left over */ |
| int reorderCode = 0; |
| for (int i = 0; i < 256; i++) { |
| if (!permutationSlotFilled[i]) { |
| while (newLeadByteUsed[reorderCode]) { |
| if (reorderCode > 255) { |
| throw new IllegalArgumentException("Unable to fill collation reordering table slots - no available reordering code."); |
| } |
| reorderCode++; |
| } |
| m_leadBytePermutationTable_[i] = (byte) reorderCode; |
| permutationSlotFilled[i] = true; |
| newLeadByteUsed[reorderCode] = true; |
| } |
| } |
| |
| // for (int i = 0; i < 256; i++){ |
| // System.out.println(Integer.toString(i, 16) + " -> " + Integer.toString(m_scriptReorderTable_[i], 16)); |
| // } |
| latinOneRegenTable_ = true; |
| updateInternalState(); |
| } |
| |
| /** |
| * Resets the internal case data members and compression values. |
| */ |
| private void updateInternalState() { |
| if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) { |
| m_caseSwitch_ = CASE_SWITCH_; |
| } else { |
| m_caseSwitch_ = NO_CASE_SWITCH_; |
| } |
| |
| if (m_isCaseLevel_ || m_caseFirst_ == AttributeValue.OFF_) { |
| m_mask3_ = CE_REMOVE_CASE_; |
| m_common3_ = COMMON_NORMAL_3_; |
| m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_OFF_; |
| m_top3_ = COMMON_TOP_CASE_SWITCH_OFF_3_; |
| m_bottom3_ = COMMON_BOTTOM_3_; |
| } else { |
| m_mask3_ = CE_KEEP_CASE_; |
| m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_ON_; |
| if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) { |
| m_common3_ = COMMON_UPPER_FIRST_3_; |
| m_top3_ = COMMON_TOP_CASE_SWITCH_UPPER_3_; |
| m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_UPPER_3_; |
| } else { |
| m_common3_ = COMMON_NORMAL_3_; |
| m_top3_ = COMMON_TOP_CASE_SWITCH_LOWER_3_; |
| m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_LOWER_3_; |
| } |
| } |
| |
| // Set the compression values |
| int total3 = m_top3_ - COMMON_BOTTOM_3_ - 1; |
| // we multilply double with int, but need only int |
| m_topCount3_ = (int) (PROPORTION_3_ * total3); |
| m_bottomCount3_ = total3 - m_topCount3_; |
| |
| if (!m_isCaseLevel_ && getStrength() == AttributeValue.TERTIARY_ && !m_isFrenchCollation_ |
| && !m_isAlternateHandlingShifted_) { |
| m_isSimple3_ = true; |
| } else { |
| m_isSimple3_ = false; |
| } |
| if (!m_isCaseLevel_ && getStrength() <= AttributeValue.TERTIARY_ && !m_isNumericCollation_ |
| && !m_isAlternateHandlingShifted_ && !latinOneFailed_) { |
| if (latinOneCEs_ == null || latinOneRegenTable_) { |
| if (setUpLatinOne()) { // if we succeed in building latin1 table, we'll use it |
| latinOneUse_ = true; |
| } else { |
| latinOneUse_ = false; |
| latinOneFailed_ = true; |
| } |
| latinOneRegenTable_ = false; |
| } else { // latin1Table exists and it doesn't need to be regenerated, just use it |
| latinOneUse_ = true; |
| } |
| } else { |
| latinOneUse_ = false; |
| } |
| |
| } |
| |
| /** |
| * Initializes the RuleBasedCollator |
| */ |
| private final void init() { |
| for (m_minUnsafe_ = 0; m_minUnsafe_ < DEFAULT_MIN_HEURISTIC_; m_minUnsafe_++) { |
| // Find the smallest unsafe char. |
| if (isUnsafe(m_minUnsafe_)) { |
| break; |
| } |
| } |
| |
| for (m_minContractionEnd_ = 0; m_minContractionEnd_ < DEFAULT_MIN_HEURISTIC_; m_minContractionEnd_++) { |
| // Find the smallest contraction-ending char. |
| if (isContractionEnd(m_minContractionEnd_)) { |
| break; |
| } |
| } |
| latinOneFailed_ = true; |
| setStrength(m_defaultStrength_); |
| setDecomposition(m_defaultDecomposition_); |
| m_variableTopValue_ = m_defaultVariableTopValue_; |
| m_isFrenchCollation_ = m_defaultIsFrenchCollation_; |
| m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_; |
| m_isCaseLevel_ = m_defaultIsCaseLevel_; |
| m_caseFirst_ = m_defaultCaseFirst_; |
| m_isHiragana4_ = m_defaultIsHiragana4_; |
| m_isNumericCollation_ = m_defaultIsNumericCollation_; |
| latinOneFailed_ = false; |
| if (m_defaultReorderCodes_ != null) { |
| m_reorderCodes_ = m_defaultReorderCodes_.clone(); |
| } else { |
| m_reorderCodes_ = null; |
| } |
| updateInternalState(); |
| } |
| |
| // Consts for Latin-1 special processing |
| private static final int ENDOFLATINONERANGE_ = 0xFF; |
| private static final int LATINONETABLELEN_ = (ENDOFLATINONERANGE_ + 50); |
| private static final int BAIL_OUT_CE_ = 0xFF000000; |
| |
| /** |
| * Generate latin-1 tables |
| */ |
| |
| private class shiftValues { |
| int primShift = 24; |
| int secShift = 24; |
| int terShift = 24; |
| } |
| |
| private final void addLatinOneEntry(char ch, int CE, shiftValues sh) { |
| int primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0; |
| boolean continuation = isContinuation(CE); |
| boolean reverseSecondary = false; |
| if (!continuation) { |
| tertiary = ((CE & m_mask3_)); |
| tertiary ^= m_caseSwitch_; |
| reverseSecondary = true; |
| } else { |
| tertiary = (byte) ((CE & CE_REMOVE_CONTINUATION_MASK_)); |
| tertiary &= CE_REMOVE_CASE_; |
| reverseSecondary = false; |
| } |
| |
| secondary = ((CE >>>= 8) & LAST_BYTE_MASK_); |
| primary2 = ((CE >>>= 8) & LAST_BYTE_MASK_); |
| primary1 = (CE >>> 8); |
| |
| if (primary1 != 0) { |
| if (m_leadBytePermutationTable_ != null && !continuation) { |
| primary1 = m_leadBytePermutationTable_[primary1]; |
| } |
| latinOneCEs_[ch] |= (primary1 << sh.primShift); |
| sh.primShift -= 8; |
| } |
| if (primary2 != 0) { |
| if (sh.primShift < 0) { |
| latinOneCEs_[ch] = BAIL_OUT_CE_; |
| latinOneCEs_[latinOneTableLen_ + ch] = BAIL_OUT_CE_; |
| latinOneCEs_[2 * latinOneTableLen_ + ch] = BAIL_OUT_CE_; |
| return; |
| } |
| latinOneCEs_[ch] |= (primary2 << sh.primShift); |
| sh.primShift -= 8; |
| } |
| if (secondary != 0) { |
| if (reverseSecondary && m_isFrenchCollation_) { // reverse secondary |
| latinOneCEs_[latinOneTableLen_ + ch] >>>= 8; // make space for secondary |
| latinOneCEs_[latinOneTableLen_ + ch] |= (secondary << 24); |
| } else { // normal case |
| latinOneCEs_[latinOneTableLen_ + ch] |= (secondary << sh.secShift); |
| } |
| sh.secShift -= 8; |
| } |
| if (tertiary != 0) { |
| latinOneCEs_[2 * latinOneTableLen_ + ch] |= (tertiary << sh.terShift); |
| sh.terShift -= 8; |
| } |
| } |
| |
| private final void resizeLatinOneTable(int newSize) { |
| int newTable[] = new int[3 * newSize]; |
| int sizeToCopy = ((newSize < latinOneTableLen_) ? newSize : latinOneTableLen_); |
| // uprv_memset(newTable, 0, newSize*sizeof(uint32_t)*3); // automatically cleared. |
| System.arraycopy(latinOneCEs_, 0, newTable, 0, sizeToCopy); |
| System.arraycopy(latinOneCEs_, latinOneTableLen_, newTable, newSize, sizeToCopy); |
| System.arraycopy(latinOneCEs_, 2 * latinOneTableLen_, newTable, 2 * newSize, sizeToCopy); |
| latinOneTableLen_ = newSize; |
| latinOneCEs_ = newTable; |
| } |
| |
| private final boolean setUpLatinOne() { |
| if (latinOneCEs_ == null || m_reallocLatinOneCEs_) { |
| latinOneCEs_ = new int[3 * LATINONETABLELEN_]; |
| latinOneTableLen_ = LATINONETABLELEN_; |
| m_reallocLatinOneCEs_ = false; |
| } else { |
| Arrays.fill(latinOneCEs_, 0); |
| } |
| if (m_ContInfo_ == null) { |
| m_ContInfo_ = new ContractionInfo(); |
| } |
| char ch = 0; |
| // StringBuffer sCh = new StringBuffer(); |
| // CollationElementIterator it = getCollationElementIterator(sCh.toString()); |
| CollationElementIterator it = getCollationElementIterator(""); |
| |
| shiftValues s = new shiftValues(); |
| int CE = 0; |
| char contractionOffset = ENDOFLATINONERANGE_ + 1; |
| |
| for (ch = 0; ch <= ENDOFLATINONERANGE_; ch++) { |
| s.primShift = 24; |
| s.secShift = 24; |
| s.terShift = 24; |
| if (ch < 0x100) { |
| CE = m_trie_.getLatin1LinearValue(ch); |
| } else { |
| CE = m_trie_.getLeadValue(ch); |
| if (CE == CollationElementIterator.CE_NOT_FOUND_) { |
| CE = UCA_.m_trie_.getLeadValue(ch); |
| } |
| } |
| if (!isSpecial(CE)) { |
| addLatinOneEntry(ch, CE, s); |
| } else { |
| switch (RuleBasedCollator.getTag(CE)) { |
| case CollationElementIterator.CE_EXPANSION_TAG_: |
| case CollationElementIterator.CE_DIGIT_TAG_: |
| // sCh.delete(0, sCh.length()); |
| // sCh.append(ch); |
| // it.setText(sCh.toString()); |
| it.setText(UCharacter.toString(ch)); |
| while ((CE = it.next()) != CollationElementIterator.NULLORDER) { |
| if (s.primShift < 0 || s.secShift < 0 || s.terShift < 0) { |
| latinOneCEs_[ch] = BAIL_OUT_CE_; |
| latinOneCEs_[latinOneTableLen_ + ch] = BAIL_OUT_CE_; |
| latinOneCEs_[2 * latinOneTableLen_ + ch] = BAIL_OUT_CE_; |
| break; |
| } |
| addLatinOneEntry(ch, CE, s); |
| } |
| break; |
| case CollationElementIterator.CE_CONTRACTION_TAG_: |
| // here is the trick |
| // F2 is contraction. We do something very similar to contractions |
| // but have two indices, one in the real contraction table and the |
| // other to where we stuffed things. This hopes that we don't have |
| // many contractions (this should work for latin-1 tables). |
| { |
| if ((CE & 0x00FFF000) != 0) { |
| latinOneFailed_ = true; |
| return false; |
| } |
| |
| int UCharOffset = (CE & 0xFFFFFF) - m_contractionOffset_; // getContractionOffset(CE)] |
| |
| CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table |
| |
| latinOneCEs_[ch] = CE; |
| latinOneCEs_[latinOneTableLen_ + ch] = CE; |
| latinOneCEs_[2 * latinOneTableLen_ + ch] = CE; |
| |
| // We're going to jump into contraction table, pick the elements |
| // and use them |
| do { |
| // CE = *(contractionCEs + (UCharOffset - contractionIndex)); |
| CE = m_contractionCE_[UCharOffset]; |
| if (isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) { |
| int i; /* general counter */ |
| // uint32_t *CEOffset = (uint32_t *)image+getExpansionOffset(CE); /* find the offset to |
| // expansion table */ |
| int offset = ((CE & 0xFFFFF0) >> 4) - m_expansionOffset_; // it.getExpansionOffset(this, |
| // CE); |
| int size = CE & 0xF; // getExpansionCount(CE); |
| // CE = *CEOffset++; |
| if (size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ |
| for (i = 0; i < size; i++) { |
| if (s.primShift < 0 || s.secShift < 0 || s.terShift < 0) { |
| latinOneCEs_[contractionOffset] = BAIL_OUT_CE_; |
| latinOneCEs_[latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_; |
| latinOneCEs_[2 * latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_; |
| break; |
| } |
| addLatinOneEntry(contractionOffset, m_expansion_[offset + i], s); |
| } |
| } else { /* else, we do */ |
| while (m_expansion_[offset] != 0) { |
| if (s.primShift < 0 || s.secShift < 0 || s.terShift < 0) { |
| latinOneCEs_[contractionOffset] = BAIL_OUT_CE_; |
| latinOneCEs_[latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_; |
| latinOneCEs_[2 * latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_; |
| break; |
| } |
| addLatinOneEntry(contractionOffset, m_expansion_[offset++], s); |
| } |
| } |
| contractionOffset++; |
| } else if (!isSpecial(CE)) { |
| addLatinOneEntry(contractionOffset++, CE, s); |
| } else { |
| latinOneCEs_[contractionOffset] = BAIL_OUT_CE_; |
| latinOneCEs_[latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_; |
| latinOneCEs_[2 * latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_; |
| contractionOffset++; |
| } |
| UCharOffset++; |
| s.primShift = 24; |
| s.secShift = 24; |
| s.terShift = 24; |
| if (contractionOffset == latinOneTableLen_) { // we need to reallocate |
| resizeLatinOneTable(2 * latinOneTableLen_); |
| } |
| } while (m_contractionIndex_[UCharOffset] != 0xFFFF); |
| } |
| break; |
| case CollationElementIterator.CE_SPEC_PROC_TAG_: { |
| // 0xB7 is a precontext character defined in UCA5.1, a special |
| // handle is implemeted in order to save LatinOne table for |
| // most locales. |
| if (ch == 0xb7) { |
| addLatinOneEntry(ch, CE, s); |
| } else { |
| latinOneFailed_ = true; |
| return false; |
| } |
| } |
| break; |
| default: |
| latinOneFailed_ = true; |
| return false; |
| } |
| } |
| } |
| // compact table |
| if (contractionOffset < latinOneTableLen_) { |
| resizeLatinOneTable(contractionOffset); |
| } |
| return true; |
| } |
| |
| /** |
| * Mutable cursor used by the Latin-1 fast path. compareUseLatin1 stores a string index in {@link #index} before |
| * calling getLatinOneContraction, and reads it back afterwards to learn how many code units were consumed. |
| * <p> |
| * Declared static because it never touches the enclosing collator; a non-static inner class would make every |
| * instance carry a hidden reference to the outer object. |
| */ |
| private static class ContractionInfo { |
| /** Index of the next unread code unit in the string being examined. */ |
| int index; |
| } |
| |
| /** Scratch cursor handed between compareUseLatin1 and getLatinOneContraction. */ |
| ContractionInfo m_ContInfo_; |
| |
| private int getLatinOneContraction(int strength, int CE, String s) { |
| // int strength, int CE, String s, Integer ind) { |
| int len = s.length(); |
| // const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF); |
| int UCharOffset = (CE & 0xFFF) - m_contractionOffset_; |
| int offset = 1; |
| int latinOneOffset = (CE & 0x00FFF000) >>> 12; |
| char schar = 0, tchar = 0; |
| |
| for (;;) { |
| /* |
| * if(len == -1) { if(s[*index] == 0) { // end of string |
| * return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); } else { schar = s[*index]; } |
| * } else { |
| */ |
| if (m_ContInfo_.index == len) { |
| return (latinOneCEs_[strength * latinOneTableLen_ + latinOneOffset]); |
| } else { |
| schar = s.charAt(m_ContInfo_.index); |
| } |
| // } |
| |
| while (schar > (tchar = m_contractionIndex_[UCharOffset + offset]/** (UCharOffset+offset) */ |
| )) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ |
| offset++; |
| } |
| |
| if (schar == tchar) { |
| m_ContInfo_.index++; |
| return (latinOneCEs_[strength * latinOneTableLen_ + latinOneOffset + offset]); |
| } else { |
| if (schar > ENDOFLATINONERANGE_ /* & 0xFF00 */) { |
| return BAIL_OUT_CE_; |
| } |
| // skip completely ignorables |
| int isZeroCE = m_trie_.getLeadValue(schar); // UTRIE_GET32_FROM_LEAD(coll->mapping, schar); |
| if (isZeroCE == 0) { // we have to ignore completely ignorables |
| m_ContInfo_.index++; |
| continue; |
| } |
| |
| return (latinOneCEs_[strength * latinOneTableLen_ + latinOneOffset]); |
| } |
| } |
| } |
| |
| /** |
| * This is a fast strcoll, geared towards text in Latin-1. It supports contractions of size two, French secondaries |
| * and case switching. You can use it with strengths primary to tertiary. It does not support shifted and case |
| * level. It relies on the table built by setupLatin1Table. If it doesn't understand something, it will go to the |
| * regular strcoll. |
| * <p> |
| * The comparison makes up to three passes over the two strings, one per strength level. Each pass reads a |
| * different section of latinOneCEs_: index ch for primary, latinOneTableLen_ + ch for secondary and |
| * 2 * latinOneTableLen_ + ch for tertiary. Entries equal to 0 are skipped. Two entries are compared one byte at |
| * a time from the top: if the top bytes differ the result is returned, otherwise both values are shifted left by |
| * 8 bits and the loop continues with the remaining bytes. |
| * <p> |
| * The primary pass hands the whole comparison over to compareRegular when it meets a code unit above |
| * ENDOFLATINONERANGE_ or an entry that is still special after contraction handling. The French secondary pass |
| * (m_isFrenchCollation_) walks both strings backwards and hands over to compareRegular if the primary pass saw a |
| * contraction. Contractions are resolved through getLatinOneContraction, using m_ContInfo_.index to pass the |
| * current position in and to read the updated position back. |
| * |
| * @param source the first string |
| * @param target the second string |
| * @param startOffset index, used for both strings, at which the comparison starts |
| * @param buffer collation buffer temporary state; only forwarded to compareRegular |
| * @return -1 if source orders before target, 1 if it orders after, 0 if no difference is found up to the |
| * collator's strength; or whatever compareRegular returns when the fast path gives up |
| */ |
| private final int compareUseLatin1(String source, String target, int startOffset, CollationBuffer buffer) { |
| int sLen = source.length(); |
| int tLen = target.length(); |
| |
| int strength = getStrength(); |
| |
| int sIndex = startOffset, tIndex = startOffset; |
| char sChar = 0, tChar = 0; |
| int sOrder = 0, tOrder = 0; |
| |
| boolean endOfSource = false; |
| |
| // uint32_t *elements = coll->latinOneCEs; |
| |
| boolean haveContractions = false; // if we have contractions in our string |
| // we cannot do French secondary |
| |
| int offset = latinOneTableLen_; /* start of the secondary section of latinOneCEs_ */ |
| |
| // Do the primary level |
| primLoop: |
| for (;;) { |
| while (sOrder == 0) { // this loop skips primary ignorables |
| // sOrder=getNextlatinOneCE(source); |
| if (sIndex == sLen) { |
| endOfSource = true; |
| break; |
| } |
| sChar = source.charAt(sIndex++); // [sIndex++]; |
| // } |
| if (sChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out |
| // fprintf(stderr, "R"); |
| return compareRegular(source, target, startOffset, buffer); |
| } |
| sOrder = latinOneCEs_[sChar]; |
| if (isSpecial(sOrder)) { // if we got a special |
| // specials can basically be either contractions or bail-out signs. If we get anything |
| // else, we'll bail out anyway |
| if (getTag(sOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) { |
| m_ContInfo_.index = sIndex; |
| sOrder = getLatinOneContraction(0, sOrder, source); |
| sIndex = m_ContInfo_.index; |
| haveContractions = true; // if there are contractions, we cannot do French secondary |
| // However, if there are contractions in the table, but we always use just one char, |
| // we might be able to do French. This should be checked out. |
| } |
| if (isSpecial(sOrder) /* == UCOL_BAIL_OUT_CE */) { |
| // fprintf(stderr, "S"); |
| return compareRegular(source, target, startOffset, buffer); |
| } |
| } |
| } |
| |
| while (tOrder == 0) { // this loop skips primary ignorables |
| // tOrder=getNextlatinOneCE(target); |
| if (tIndex == tLen) { |
| if (endOfSource) { |
| break primLoop; |
| } else { |
| return 1; |
| } |
| } |
| tChar = target.charAt(tIndex++); // [tIndex++]; |
| if (tChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out |
| // fprintf(stderr, "R"); |
| return compareRegular(source, target, startOffset, buffer); |
| } |
| tOrder = latinOneCEs_[tChar]; |
| if (isSpecial(tOrder)) { |
| // Handling specials, see the comments for source |
| if (getTag(tOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) { |
| m_ContInfo_.index = tIndex; |
| tOrder = getLatinOneContraction(0, tOrder, target); |
| tIndex = m_ContInfo_.index; |
| haveContractions = true; |
| } |
| if (isSpecial(tOrder)/* == UCOL_BAIL_OUT_CE */) { |
| // fprintf(stderr, "S"); |
| return compareRegular(source, target, startOffset, buffer); |
| } |
| } |
| } |
| if (endOfSource) { // source is finished, but target is not, say the result. |
| return -1; |
| } |
| |
| if (sOrder == tOrder) { // if we have same CEs, we continue the loop |
| sOrder = 0; |
| tOrder = 0; |
| continue; |
| } else { |
| // compare current top bytes |
| if (((sOrder ^ tOrder) & 0xFF000000) != 0) { |
| // top bytes differ, return difference |
| if (sOrder >>> 8 < tOrder >>> 8) { /* the unsigned shift clears the sign bit, so the values compare as unsigned */ |
| return -1; |
| } else { |
| return 1; |
| } |
| // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24); |
| // since we must return enum value |
| } |
| |
| // top bytes match, continue with following bytes |
| sOrder <<= 8; |
| tOrder <<= 8; |
| } |
| } |
| |
| // after primary loop, we definitely know the sizes of strings, |
| // so we set it and use simpler loop for secondaries and tertiaries |
| // sLen = sIndex; tLen = tIndex; |
| if (strength >= SECONDARY) { |
| // adjust the table beginning |
| // latinOneCEs_ += coll->latinOneTableLen; |
| endOfSource = false; |
| |
| if (!m_isFrenchCollation_) { // non French |
| // This loop is a simplified copy of primary loop |
| // at this point we know that whole strings are latin-1, so we don't |
| // check for that. We also know that we only have contractions as |
| // specials. |
| // sIndex = 0; tIndex = 0; |
| sIndex = startOffset; |
| tIndex = startOffset; |
| secLoop: for (;;) { |
| while (sOrder == 0) { |
| if (sIndex == sLen) { |
| endOfSource = true; |
| break; |
| } |
| sChar = source.charAt(sIndex++); // [sIndex++]; |
| sOrder = latinOneCEs_[offset + sChar]; |
| if (isSpecial(sOrder)) { |
| m_ContInfo_.index = sIndex; |
| sOrder = getLatinOneContraction(1, sOrder, source); |
| sIndex = m_ContInfo_.index; |
| } |
| } |
| |
| while (tOrder == 0) { |
| if (tIndex == tLen) { |
| if (endOfSource) { |
| break secLoop; |
| } else { |
| return 1; |
| } |
| } |
| tChar = target.charAt(tIndex++); // [tIndex++]; |
| tOrder = latinOneCEs_[offset + tChar]; |
| if (isSpecial(tOrder)) { |
| m_ContInfo_.index = tIndex; |
| tOrder = getLatinOneContraction(1, tOrder, target); |
| tIndex = m_ContInfo_.index; |
| } |
| } |
| if (endOfSource) { |
| return -1; |
| } |
| |
| if (sOrder == tOrder) { |
| sOrder = 0; |
| tOrder = 0; |
| continue; |
| } else { |
| // see primary loop for comments on this |
| if (((sOrder ^ tOrder) & 0xFF000000) != 0) { |
| if (sOrder >>> 8 < tOrder >>> 8) { |
| return -1; |
| } else { |
| return 1; |
| } |
| } |
| sOrder <<= 8; |
| tOrder <<= 8; |
| } |
| } |
| } else { // French |
| if (haveContractions) { // if we have contractions, we have to bail out |
| // since we don't really know how to handle them here |
| return compareRegular(source, target, startOffset, buffer); |
| } |
| // For French, we go backwards |
| sIndex = sLen; |
| tIndex = tLen; |
| secFLoop: for (;;) { |
| while (sOrder == 0) { |
| if (sIndex == startOffset) { |
| endOfSource = true; |
| break; |
| } |
| sChar = source.charAt(--sIndex); // [--sIndex]; |
| sOrder = latinOneCEs_[offset + sChar]; |
| // don't even look for contractions |
| } |
| |
| while (tOrder == 0) { |
| if (tIndex == startOffset) { |
| if (endOfSource) { |
| break secFLoop; |
| } else { |
| return 1; |
| } |
| } |
| tChar = target.charAt(--tIndex); // [--tIndex]; |
| tOrder = latinOneCEs_[offset + tChar]; |
| // don't even look for contractions |
| } |
| if (endOfSource) { |
| return -1; |
| } |
| |
| if (sOrder == tOrder) { |
| sOrder = 0; |
| tOrder = 0; |
| continue; |
| } else { |
| // see the primary loop for comments |
| if (((sOrder ^ tOrder) & 0xFF000000) != 0) { |
| if (sOrder >>> 8 < tOrder >>> 8) { |
| return -1; |
| } else { |
| return 1; |
| } |
| } |
| sOrder <<= 8; |
| tOrder <<= 8; |
| } |
| } |
| } |
| } |
| |
| if (strength >= TERTIARY) { |
| // tertiary loop is the same as secondary (except no French) |
| offset += latinOneTableLen_; /* now the start of the tertiary section of latinOneCEs_ */ |
| // sIndex = 0; tIndex = 0; |
| sIndex = startOffset; |
| tIndex = startOffset; |
| endOfSource = false; |
| for (;;) { |
| while (sOrder == 0) { |
| if (sIndex == sLen) { |
| endOfSource = true; |
| break; |
| } |
| sChar = source.charAt(sIndex++); // [sIndex++]; |
| sOrder = latinOneCEs_[offset + sChar]; |
| if (isSpecial(sOrder)) { |
| m_ContInfo_.index = sIndex; |
| sOrder = getLatinOneContraction(2, sOrder, source); |
| sIndex = m_ContInfo_.index; |
| } |
| } |
| while (tOrder == 0) { |
| if (tIndex == tLen) { |
| if (endOfSource) { |
| return 0; // if both strings are at the end, they are equal |
| } else { |
| return 1; |
| } |
| } |
| tChar = target.charAt(tIndex++); // [tIndex++]; |
| tOrder = latinOneCEs_[offset + tChar]; |
| if (isSpecial(tOrder)) { |
| m_ContInfo_.index = tIndex; |
| tOrder = getLatinOneContraction(2, tOrder, target); |
| tIndex = m_ContInfo_.index; |
| } |
| } |
| if (endOfSource) { |
| return -1; |
| } |
| if (sOrder == tOrder) { |
| sOrder = 0; |
| tOrder = 0; |
| continue; |
| } else { |
| if (((sOrder ^ tOrder) & 0xff000000) != 0) { |
| if (sOrder >>> 8 < tOrder >>> 8) { |
| return -1; |
| } else { |
| return 1; |
| } |
| } |
| sOrder <<= 8; |
| tOrder <<= 8; |
| } |
| } |
| } |
| return 0; |
| } |
| |
| /** |
| * Returns the version of this collator object. |
| * <p> |
| * The four fields of the result are built as follows: the runtime major version (shifted left by 11), the |
| * builder major version (shifted left by 6) and a charset version (currently always 0) are OR-ed together and |
| * truncated to 16 bits; the high byte of that value becomes the first field and the low byte the second. The |
| * third field is the builder minor version and the fourth is the UCA major version. |
| * |
| * @return the version object associated with this collator |
| * @stable ICU 2.8 |
| */ |
| public VersionInfo getVersion() { |
| final int runtimeMajor = VersionInfo.UCOL_RUNTIME_VERSION.getMajor(); |
| final int builderMajor = m_version_.getMajor(); |
| // No charset version is available yet, so this component is a constant 0. |
| final int charsetVersion = 0; |
| |
| final int packed = ((runtimeMajor << 11) | (builderMajor << 6) | charsetVersion) & 0xFFFF; |
| |
| final int highByte = packed >> 8; |
| final int lowByte = packed & 0xFF; |
| final int builderMinor = m_version_.getMinor(); |
| final int ucaMajor = UCA_.m_UCA_version_.getMajor(); |
| return VersionInfo.getInstance(highByte, lowByte, builderMinor, ucaMajor); |
| } |
| |
| /** |
| * Returns the UCA version recorded in the UCA_ collator that this collator refers to. |
| * |
| * @return the UCA version object associated with this collator |
| * @stable ICU 2.8 |
| */ |
| public VersionInfo getUCAVersion() { |
| final VersionInfo ucaVersion = UCA_.m_UCA_version_; |
| return ucaVersion; |
| } |
| |
| private transient boolean m_reallocLatinOneCEs_; /* NOTE(review): not referenced in this part of the file; presumably marks that latinOneCEs_ must be reallocated - confirm */ |
| |
| private CollationBuffer collationBuffer; /* created on first use by getCollationBuffer() and reset on each later call */ |
| |
| /** |
| * Hands out this collator's CollationBuffer, creating it on the first call and resetting it on later calls. |
| * <p> |
| * When the collator is frozen, frozenLock is acquired first and is still held when this method returns; |
| * releaseCollationBuffer is the method that unlocks it. |
| * |
| * @return the buffer, freshly created or reset |
| */ |
| private final CollationBuffer getCollationBuffer() { |
| if (isFrozen()) { |
| frozenLock.lock(); |
| } |
| if (collationBuffer != null) { |
| // reuse the existing buffer after clearing its state |
| collationBuffer.resetBuffers(); |
| return collationBuffer; |
| } |
| collationBuffer = new CollationBuffer(); |
| return collationBuffer; |
| } |
| |
| /** |
| * Unlocks frozenLock when the collator is frozen; otherwise does nothing. |
| * |
| * @param buffer the buffer being handed back; it is not inspected |
| */ |
| private final void releaseCollationBuffer(CollationBuffer buffer) { |
| if (!isFrozen()) { |
| return; |
| } |
| frozenLock.unlock(); |
| } |
| } |