| /** |
| ******************************************************************************* |
| * Copyright (C) 1996-2002, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| * |
| * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java,v $ |
| * $Date: 2002/07/12 21:59:22 $ |
| * $Revision: 1.10 $ |
| * |
| ******************************************************************************* |
| */ |
| package com.ibm.icu.text; |
| |
| import java.io.InputStream; |
| import java.io.BufferedInputStream; |
| import java.io.ByteArrayInputStream; |
| import java.util.Locale; |
| import java.util.ResourceBundle; |
| import java.text.CharacterIterator; |
| import java.text.StringCharacterIterator; |
| import com.ibm.icu.impl.IntTrie; |
| import com.ibm.icu.impl.Trie; |
| import com.ibm.icu.impl.ICULocaleData; |
| import com.ibm.icu.impl.BOCU; |
| |
| /** |
| * <p>RuleBasedCollator is a concrete subclass of Collator. It allows |
| * customization of the Collator via user-specified rule sets. |
| * RuleBasedCollator is designed to be fully compliant to the <a |
| * href="http://www.unicode.org/unicode/reports/tr10/"> Unicode |
| * Collation Algorithm (UCA)</a> and conforms to ISO 14651.</p> |
| * |
| * <p>Users are strongly encouraged to read <a |
| * href="http://oss.software.ibm.com/icu/userguide/Collate_Intro.html"> |
| * the users guide</a> for more information about the collation |
| * service before using this class.</p> |
| * |
| * <p>Create a RuleBasedCollator from a locale by calling the |
| * getInstance(Locale) factory method in the base class Collator. |
| * Collator.getInstance(Locale) creates a RuleBasedCollator object |
| * based on the collation rules defined by the argument locale. If a |
| * customized collation ordering or attributes are required, use the |
| * RuleBasedCollator(String) constructor with the appropriate |
| * rules. The customized RuleBasedCollator will base its ordering on |
| * UCA, while re-adjusting the attributes and orders of the characters |
| * in the specified rule accordingly.</p> |
| * |
| * <p>RuleBasedCollator provides correct collation orders for most |
| * locales supported in ICU. If specific data for a locale is not |
| * available, the order eventually falls back to the <a |
| * href="http://www.unicode.org/unicode/reports/tr10/">UCA collation |
| * order </a>.</p> |
| * |
| * <p>For information about the collation rule syntax and details |
| * about customization, please refer to the |
| * <a href="http://oss.software.ibm.com/icu/userguide/Collate_Customization.html"> |
| * Collation customization</a> section of the user's guide.</p> |
| * |
| * <p><strong>Note</strong> that there are some differences between |
| * the Collation rule syntax used in Java and ICU4J: |
| * |
| * <ul> |
| * <li>According to the JDK documentation: |
| * <i> |
| * <p> |
| * Modifier '!' : Turns on Thai/Lao vowel-consonant swapping. If this rule |
| * is in force when a Thai vowel of the range \U0E40-\U0E44 precedes a |
| * Thai consonant of the range \U0E01-\U0E2E OR a Lao vowel of the |
| * range \U0EC0-\U0EC4 precedes a Lao consonant of the range |
| * \U0E81-\U0EAE then the |
| * vowel is placed after the consonant for collation purposes. |
| * </p> |
| * <p> |
| * If a rule is without the modifier '!', the Thai/Lao vowel-consonant |
| * swapping is not turned on. |
| * </p> |
| * </i> |
| * <p> |
| * ICU4J's RuleBasedCollator does not support turning off the Thai/Lao |
| * vowel-consonant swapping, since the UCA clearly states that it has to be |
| * supported to ensure a correct sorting order. If a '!' is encountered, it is |
| * ignored. |
| * </p> |
| * <li>According to the JDK documentation: |
| * <i> |
| * <p> |
| * If, however, the first relation is not "<", then all the |
| * text-arguments up to the first "<" are ignorable. For example, |
| * ", - < a < b" makes "-" an ignorable character, as we saw earlier in |
| * the word "black-birds". |
| * </p> |
| * </i> |
| * <p> |
| * The above allows random characters before the first '<' not in any |
| * specific sequence to be ignorable. ICU4J does not support this feature. |
| * To define ignorable characters in PRIMARY to TERTIARY strength, users can |
| * use the rule "& X < [variable top]" to set the variable top to the |
| * PRIMARY strength of "X". Once alternate handling is set to shifted |
| * (setAlternateHandlingShifted(true)), the Collator using strengths PRIMARY, |
| * SECONDARY or TERTIARY will ignore all code points with PRIMARY strengths |
| * less than variable top. |
| * See the user guide's section on |
| * <a href="http://oss.software.ibm.com/icu/userguide/Collate_Customization.html"> |
| * Collation Customization</a> for details.</p> |
| * |
| * <p> |
| * <li>As mentioned in the documentation of the base class Collator, |
| * compatibility decomposition mode is not supported. |
| * </ul> |
| * </p> |
| * <p> |
| * <strong>Examples</strong> |
| * </p> |
| * <p> |
| * Creating Customized RuleBasedCollators: |
| * <blockquote> |
| * <pre> |
| * String simple = "& a < b < c < d"; |
| * RuleBasedCollator simpleCollator = new RuleBasedCollator(simple); |
| * |
| * String norwegian = "& a , A < b , B < c , C < d , D < e , E " |
| * + "< f , F < g , G < h , H < i , I < j , " |
| * + "J < k , K < l , L < m , M < n , N < " |
| * + "o , O < p , P < q , Q < r , R < s , S < " |
| * + "t , T < u , U < v , V < w , W < x , X " |
| * + "< y , Y < z , Z < \u00E5 = a\u030A " |
| * + ", \u00C5 = A\u030A ; aa , AA < \u00E6 " |
| * + ", \u00C6 < \u00F8 , \u00D8"; |
| * RuleBasedCollator norwegianCollator = new RuleBasedCollator(norwegian); |
| * </pre> |
| * </blockquote> |
| * |
| * Concatenating rules to combine <code>Collator</code>s: |
| * <blockquote> |
| * <pre> |
| * // Create an en_US Collator object |
| * RuleBasedCollator en_USCollator = (RuleBasedCollator) |
| * Collator.getInstance(new Locale("en", "US", "")); |
| * // Create a da_DK Collator object |
| * RuleBasedCollator da_DKCollator = (RuleBasedCollator) |
| * Collator.getInstance(new Locale("da", "DK", "")); |
| * // Combine the two |
| * // First, get the collation rules from en_USCollator |
| * String en_USRules = en_USCollator.getRules(); |
| * // Second, get the collation rules from da_DKCollator |
| * String da_DKRules = da_DKCollator.getRules(); |
| * RuleBasedCollator newCollator = |
| * new RuleBasedCollator(en_USRules + da_DKRules); |
| * // newCollator has the combined rules |
| * </pre> |
| * </blockquote> |
| * |
| * Making changes to an existing RuleBasedCollator to create a new |
| * <code>Collator</code> object, by appending changes to the existing rule: |
| * <blockquote> |
| * <pre> |
| * // Create a new Collator object with additional rules |
| * String addRules = "& C < ch, cH, Ch, CH"; |
| * RuleBasedCollator myCollator = |
| * new RuleBasedCollator(en_USCollator.getRules() + addRules); |
| * // myCollator contains the new rules |
| * </pre> |
| * </blockquote> |
| * |
| * How to change the order of non-spacing accents: |
| * <blockquote> |
| * <pre> |
| * // old rule with main accents |
| * String oldRules = "= \u0301 ; \u0300 ; \u0302 ; \u0308 " |
| * + "; \u0327 ; \u0303 ; \u0304 ; \u0305 " |
| * + "; \u0306 ; \u0307 ; \u0309 ; \u030A " |
| * + "; \u030B ; \u030C ; \u030D ; \u030E " |
| * + "; \u030F ; \u0310 ; \u0311 ; \u0312 " |
| * + "< a , A ; ae, AE ; \u00e6 , \u00c6 " |
| * + "< b , B < c, C < e, E & C < d , D"; |
| * // change the order of accent characters |
| * String addOn = "& \u0300 ; \u0308 ; \u0302"; |
| * RuleBasedCollator myCollator = new RuleBasedCollator(oldRules + addOn); |
| * </pre> |
| * </blockquote> |
| * |
| * Putting in a new primary ordering before the default setting, |
| * e.g. sort English characters before or after Japanese characters in the Japanese |
| * <code>Collator</code>: |
| * <blockquote> |
| * <pre> |
| * // get en_US Collator rules |
| * RuleBasedCollator en_USCollator |
| * = (RuleBasedCollator)Collator.getInstance(Locale.US); |
| * // add a few Japanese characters to sort before English characters |
| * // suppose the last character before the first base letter 'a' in |
| * // the English collation rule is \u2212 |
| * String jaString = "& \u2212 < \u3041, \u3042 < \u3043, " |
| * + "\u3044"; |
| * RuleBasedCollator myJapaneseCollator |
| * = new RuleBasedCollator(en_USCollator.getRules() + jaString); |
| * </pre> |
| * </blockquote> |
| * </p> |
| * @author Syn Wee Quek |
| * @since release 2.2, April 18 2002 |
| * @draft 2.2 |
| */ |
| public final class RuleBasedCollator extends Collator |
| { |
| // public data members --------------------------------------------------- |
| |
| // public constructors --------------------------------------------------- |
| |
| /** |
| * <p> |
| * Constructs a RuleBasedCollator from the argument customization rules. |
| * The collator will be based on UCA, with the attributes and re-ordering |
| * of the characters specified in the argument rules. |
| * </p> |
| * <p>See the user guide's section on |
| * <a href="http://oss.software.ibm.com/icu/userguide/Collate_Customization.html"> |
| * Collation Customization</a> for details on the rule syntax. |
| * </p> |
| * @param rules the collation rules to build the collation table from, |
| * must be neither null nor an empty String. |
| * @exception IllegalArgumentException thrown when the argument rules are |
| * null or have a length of 0. |
| * @exception Exception thrown when the collator can not be built. This |
| * is documented to be a ParseException when the argument rules |
| * have an invalid syntax, or an IOException when an error |
| * occurred while reading internal data. |
| * @draft 2.2 |
| */ |
| public RuleBasedCollator(String rules) throws Exception |
| { |
| // reject unusable rules before any state of this object is touched |
| if (rules == null || rules.length() == 0) { |
| throw new IllegalArgumentException( |
| "Collation rules can not be null or empty"); |
| } |
| setWithUCAData(); |
| CollationParsedRuleBuilder builder |
| = new CollationParsedRuleBuilder(rules); |
| |
| builder.setRules(this); |
| // the rule String is kept so that getRules() can return it |
| m_rules_ = rules; |
| init(); |
| } |
| |
| // public methods -------------------------------------------------------- |
| |
| /** |
| * Creates a CollationElementIterator over the argument String, using this |
| * RuleBasedCollator. |
| * @param source the String the new iterator is constructed with |
| * @return a new CollationElementIterator constructed from source and this |
| * RuleBasedCollator |
| * @see CollationElementIterator |
| * @draft 2.2 |
| */ |
| public CollationElementIterator getCollationElementIterator(String source) |
| { |
| CollationElementIterator iterator |
| = new CollationElementIterator(source, this); |
| return iterator; |
| } |
| |
| /** |
| * Creates a CollationElementIterator over the argument CharacterIterator, |
| * using this RuleBasedCollator. |
| * The new iterator is handed a clone of source, hence the state of the |
| * argument iterator itself is left untouched. |
| * @param source the CharacterIterator that is cloned for the new iterator |
| * @return a new CollationElementIterator constructed from the clone of |
| * source and this RuleBasedCollator |
| * @see CollationElementIterator |
| * @draft 2.2 |
| */ |
| public CollationElementIterator getCollationElementIterator( |
| CharacterIterator source) |
| { |
| return new CollationElementIterator( |
| (CharacterIterator)source.clone(), this); |
| } |
| |
| // public setters -------------------------------------------------------- |
| |
| /** |
| * Switches the Hiragana Quaternary mode on or off. |
| * With the mode on, the collator positions Hiragana characters before all |
| * non-ignorable characters in QUATERNARY strength. This produces a correct |
| * JIS collation order, which distinguishes Katakana characters from |
| * Hiragana characters. |
| * @param flag true to turn the Hiragana Quaternary mode on, false to turn |
| * it off |
| * @see #setHiraganaQuaternaryDefault |
| * @see #isHiraganaQuaternary |
| * @draft 2.2 |
| */ |
| public void setHiraganaQuaternary(boolean flag) |
| { |
| this.m_isHiragana4_ = flag; |
| } |
| |
| /** |
| * Restores the Hiragana Quaternary mode to the default mode stored for |
| * this RuleBasedCollator. |
| * See setHiraganaQuaternary(boolean) for a description of the mode. |
| * @see #setHiraganaQuaternary(boolean) |
| * @see #isHiraganaQuaternary |
| * @draft 2.2 |
| */ |
| public void setHiraganaQuaternaryDefault() |
| { |
| this.m_isHiragana4_ = this.m_defaultIsHiragana4_; |
| } |
| |
| /** |
| * Sets whether uppercase characters sort before lowercase |
| * characters or vice versa, in strength TERTIARY. The default |
| * mode is false, and so lowercase characters sort before uppercase |
| * characters. |
| * If true, sort upper case characters first. |
| * @param upperfirst true to sort uppercase characters before |
| * lowercase characters, false to sort lowercase |
| * characters before uppercase characters |
| * @see #isLowerCaseFirst |
| * @see #isUpperCaseFirst |
| * @see #setLowerCaseFirst |
| * @see #setCaseFirstDefault |
| * @draft 2.2 |
| */ |
| public void setUpperCaseFirst(boolean upperfirst) |
| { |
| if (upperfirst) { |
| m_caseFirst_ = AttributeValue.UPPER_FIRST_; |
| } |
| else { |
| m_caseFirst_ = AttributeValue.OFF_; |
| } |
| updateInternalState(); |
| } |
| |
| /** |
| * Sets the orders of lower cased characters to sort before upper cased |
| * characters, in strength TERTIARY. The default |
| * mode is false. |
| * If true is set, the RuleBasedCollator will sort lower cased characters |
| * before the upper cased ones. |
| * Otherwise, if false is set, the RuleBasedCollator will ignore case |
| * preferences. |
| * @param lowerfirst true for sorting lower cased characters before |
| * upper cased characters, false to ignore case |
| * preferences. |
| * @see #isLowerCaseFirst |
| * @see #isUpperCaseFirst |
| * @see #setUpperCaseFirst |
| * @see #setCaseFirstDefault |
| */ |
| public void setLowerCaseFirst(boolean lowerfirst) |
| { |
| if (lowerfirst) { |
| m_caseFirst_ = AttributeValue.LOWER_FIRST_; |
| } |
| else { |
| m_caseFirst_ = AttributeValue.OFF_; |
| } |
| updateInternalState(); |
| } |
| |
| /** |
| * Restores the case first mode to the default mode stored for this |
| * RuleBasedCollator and refreshes the internal state. |
| * See setUpperCaseFirst(boolean) and setLowerCaseFirst(boolean) for a |
| * description of the mode. |
| * @see #isLowerCaseFirst |
| * @see #isUpperCaseFirst |
| * @see #setLowerCaseFirst(boolean) |
| * @see #setUpperCaseFirst(boolean) |
| * @draft 2.2 |
| */ |
| public final void setCaseFirstDefault() |
| { |
| this.m_caseFirst_ = this.m_defaultCaseFirst_; |
| this.updateInternalState(); |
| } |
| |
| /** |
| * Restores the alternate handling mode to the default mode stored for |
| * this RuleBasedCollator and refreshes the internal state. |
| * See setAlternateHandlingShifted(boolean) for a description of the mode. |
| * @see #setAlternateHandlingShifted(boolean) |
| * @see #isAlternateHandlingShifted() |
| * @draft 2.2 |
| */ |
| public void setAlternateHandlingDefault() |
| { |
| this.m_isAlternateHandlingShifted_ |
| = this.m_defaultIsAlternateHandlingShifted_; |
| this.updateInternalState(); |
| } |
| |
| /** |
| * Restores the case level mode to the default mode stored for this |
| * RuleBasedCollator and refreshes the internal state. |
| * See setCaseLevel(boolean) for a description of the mode. |
| * @see #setCaseLevel(boolean) |
| * @see #isCaseLevel |
| * @draft 2.2 |
| */ |
| public void setCaseLevelDefault() |
| { |
| this.m_isCaseLevel_ = this.m_defaultIsCaseLevel_; |
| this.updateInternalState(); |
| } |
| |
| /** |
| * Restores the decomposition mode to the default mode stored for this |
| * RuleBasedCollator, by handing that default to setDecomposition(int). |
| * See setDecomposition(int) for a description of the mode. |
| * @see #getDecomposition |
| * @see #setDecomposition(int) |
| * @draft 2.2 |
| */ |
| public void setDecompositionDefault() |
| { |
| this.setDecomposition(this.m_defaultDecomposition_); |
| } |
| |
| /** |
| * Restores the French collation mode to the default mode stored for this |
| * RuleBasedCollator and refreshes the internal state. |
| * See setFrenchCollation(boolean) for a description of the mode. |
| * @see #isFrenchCollation |
| * @see #setFrenchCollation(boolean) |
| * @draft 2.2 |
| */ |
| public void setFrenchCollationDefault() |
| { |
| this.m_isFrenchCollation_ = this.m_defaultIsFrenchCollation_; |
| this.updateInternalState(); |
| } |
| |
| /** |
| * Restores the collation strength to the default strength stored for this |
| * RuleBasedCollator, by handing that default to setStrength(int). |
| * See setStrength(int) for a description of the strength property. |
| * @see #setStrength(int) |
| * @see #getStrength |
| * @draft 2.2 |
| */ |
| public void setStrengthDefault() |
| { |
| this.setStrength(this.m_defaultStrength_); |
| } |
| |
| /** |
| * Sets the direction in which SECONDARY weights are used, as required by |
| * French collation, and refreshes the internal state. |
| * With the value false, SECONDARY weights are treated in the order they |
| * appear. With the value true, the SECONDARY weights are sorted |
| * backwards. |
| * See the section on |
| * <a href="http://oss.software.ibm.com/icu/userguide/Collate_ServiceArchitecture.html"> |
| * French collation</a> for more information. |
| * @param flag true to turn the French collation on, false to turn it off |
| * @see #isFrenchCollation |
| * @see #setFrenchCollationDefault |
| * @draft 2.2 |
| */ |
| public void setFrenchCollation(boolean flag) |
| { |
| this.m_isFrenchCollation_ = flag; |
| this.updateInternalState(); |
| } |
| |
| /** |
| * Sets the alternate handling for QUATERNARY strength to be either |
| * SHIFTED or NON_IGNORABLE, and refreshes the internal state. |
| * See the definition of Alternate Weighting in section 3.2.2, Variable |
| * Collation Elements, of the |
| * <a href="http://www.unicode.org/unicode/reports/tr10/">UCA</a>. |
| * This attribute is only effective when QUATERNARY strength is set. |
| * The value false corresponds to the NON_IGNORABLE mode in UCA, in which |
| * the RuleBasedCollator treats all the codepoints with non-ignorable |
| * primary weights in the same way. |
| * The value true corresponds to the SHIFTED mode in UCA, which causes |
| * codepoints with PRIMARY orders that are equal to or below the variable |
| * top value to be ignored in PRIMARY order and moved to the QUATERNARY |
| * order. |
| * @param shifted true if the SHIFTED behaviour for alternate handling is |
| * desired, false for the NON_IGNORABLE behaviour. |
| * @see #isAlternateHandlingShifted |
| * @see #setAlternateHandlingDefault |
| * @draft 2.2 |
| */ |
| public void setAlternateHandlingShifted(boolean shifted) |
| { |
| this.m_isAlternateHandlingShifted_ = shifted; |
| this.updateInternalState(); |
| } |
| |
| /** |
| * <p> |
| * Turns the case level on or off and refreshes the internal state. |
| * When the case level is on, an additional weight is formed between the |
| * SECONDARY and the TERTIARY weight, known as the case level. |
| * The case level is used to distinguish large and small Japanese Kana |
| * characters. It could also be used in other situations, for example to |
| * distinguish certain Pinyin characters. |
| * With the value false the case level is not generated. |
| * The contents of the case level are affected by the case first mode. |
| * A simple way to ignore accent differences in a string is to set the |
| * strength to PRIMARY and to enable the case level. |
| * </p> |
| * <p> |
| * See the section on |
| * <a href="http://oss.software.ibm.com/icu/userguide/Collate_ServiceArchitecture.html"> |
| * case level</a> for more information. |
| * </p> |
| * @param flag true if case level sorting is required, false otherwise |
| * @see #setCaseLevelDefault |
| * @see #isCaseLevel |
| * @draft 2.2 |
| */ |
| public void setCaseLevel(boolean flag) |
| { |
| this.m_isCaseLevel_ = flag; |
| this.updateInternalState(); |
| } |
| |
| /** |
| * <p> |
| * Sets the strength property of this Collator. The strength property |
| * determines the minimum level of difference considered significant |
| * during comparison. The new value is passed on to the base class, |
| * after which the internal state is refreshed. |
| * </p> |
| * <p>See the Collator class description for an example of use.</p> |
| * @param newStrength the new strength value. |
| * @see #getStrength |
| * @see #setStrengthDefault |
| * @see #PRIMARY |
| * @see #SECONDARY |
| * @see #TERTIARY |
| * @see #QUATERNARY |
| * @see #IDENTICAL |
| * @exception IllegalArgumentException If the new strength value is not one |
| * of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL. |
| * @draft 2.2 |
| */ |
| public void setStrength(int newStrength) { |
| super.setStrength(newStrength); |
| this.updateInternalState(); |
| } |
| |
| // public getters -------------------------------------------------------- |
| |
| /** |
| * Returns the collation rules String held by this RuleBasedCollator. |
| * @return the collation rules |
| * @draft 2.2 |
| */ |
| public String getRules() |
| { |
| return this.m_rules_; |
| } |
| |
| /** |
| * <p> |
| * Gets a CollationKey for the argument String source from this |
| * RuleBasedCollator. |
| * </p> |
| * <p> |
| * General recommendation: <br> |
| * If comparisons are to be done on the same String multiple times, it is |
| * more efficient to generate CollationKeys for the Strings and use |
| * CollationKey.compareTo(CollationKey) for the comparisons. |
| * If each String is compared only once, using the method |
| * RuleBasedCollator.compare(String, String) will have a better performance. |
| * </p> |
| * <p> |
| * The per-level flags handed to the sort key builders are derived from |
| * the case level mode and the current strength. If the strength is |
| * IDENTICAL or the decomposition mode is not NO_DECOMPOSITION, a source |
| * that does not pass the NFD quick check is decomposed once, before any |
| * sort key bytes are produced. |
| * </p> |
| * <p> |
| * NOTE(review): when that decomposition takes place, the decomposed text |
| * replaces source and is what gets passed to the CollationKey constructor, |
| * not the String supplied by the caller - confirm that this is intended. |
| * </p> |
| * <p> |
| * See the class documentation for an explanation about CollationKeys. |
| * </p> |
| * @param source the text String to be transformed into a collation key. |
| * @return the CollationKey for the given String based on this |
| * RuleBasedCollator's collation rules. If the source String is |
| * null, a null CollationKey is returned. |
| * @see CollationKey |
| * @see #compare(String, String) |
| * @draft 2.2 |
| */ |
| public CollationKey getCollationKey(String source) |
| { |
| if (source == null) { |
| return null; |
| } |
| int strength = getStrength(); |
| boolean compare[] = {m_isCaseLevel_, |
| true, |
| strength >= SECONDARY, |
| strength >= TERTIARY, |
| strength >= QUATERNARY, |
| strength == IDENTICAL |
| }; |
| |
| byte bytes[][] = {new byte[SORT_BUFFER_INIT_SIZE_CASE_], // case |
| new byte[SORT_BUFFER_INIT_SIZE_1_], // primary |
| new byte[SORT_BUFFER_INIT_SIZE_2_], // secondary |
| new byte[SORT_BUFFER_INIT_SIZE_3_], // tertiary |
| new byte[SORT_BUFFER_INIT_SIZE_4_] // quaternary |
| }; |
| int bytescount[] = {0, 0, 0, 0, 0}; |
| int count[] = {0, 0, 0, 0, 0}; |
| boolean doFrench = m_isFrenchCollation_ && compare[2]; |
| // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. |
| // If we have no qShifted, we don't need to set UCOL_COMMON_BOT4 so |
| // high. |
| int commonBottom4 = ((m_variableTopValue_ >>> 8) + 1) & LAST_BYTE_MASK_; |
| byte hiragana4 = 0; |
| if (m_isHiragana4_ && compare[4]) { |
| // reserve the current bottom value for hiragana and move the common |
| // bottom up by one |
| hiragana4 = (byte)commonBottom4; |
| commonBottom4 ++; |
| } |
| |
| int bottomCount4 = 0xFF - commonBottom4; |
| // If we need to normalize, we'll do it all at once at the beginning! |
| if ((compare[5] || getDecomposition() != NO_DECOMPOSITION) |
| && Normalizer.quickCheck(source, Normalizer.NFD) |
| != Normalizer.YES) { |
| source = Normalizer.decompose(source, false); |
| } |
| getSortKeyBytes(source, compare, bytes, bytescount, count, doFrench, |
| hiragana4, commonBottom4, bottomCount4); |
| byte sortkey[] = getSortKey(source, compare, bytes, bytescount, count, |
| doFrench, commonBottom4, bottomCount4); |
| return new CollationKey(source, sortkey); |
| } |
| |
| /** |
| * Return true if an uppercase character is sorted before the corresponding lowercase character. |
| * See setCaseFirst(boolean) for details. |
| * @see #setUpperCaseFirst |
| * @see #setLowerCaseFirst |
| * @see #isLowerCaseFirst |
| * @see #setCaseFirstDefault |
| * @return true if upper cased characters are sorted before lower cased |
| * characters, false otherwise |
| * @draft 2.2 |
| */ |
| public boolean isUpperCaseFirst() |
| { |
| return (m_caseFirst_ == AttributeValue.UPPER_FIRST_); |
| } |
| |
| /** |
| * Return true if a lowercase character is sorted before the corresponding uppercase character. |
| * See setCaseFirst(boolean) for details. |
| * @see #setUpperCaseFirst |
| * @see #setLowerCaseFirst |
| * @see #isUpperCaseFirst |
| * @see #setCaseFirstDefault |
| * @return true lower cased characters are sorted before upper cased |
| * characters, false otherwise |
| * @draft 2.2 |
| */ |
| public boolean isLowerCaseFirst() |
| { |
| return (m_caseFirst_ == AttributeValue.LOWER_FIRST_); |
| } |
| |
| /** |
| * Checks whether the alternate handling behaviour is the UCA defined |
| * SHIFTED or NON_IGNORABLE. |
| * A return value of true means that the alternate handling attribute of |
| * this Collator is SHIFTED, a return value of false means that it is |
| * NON_IGNORABLE. |
| * See setAlternateHandlingShifted(boolean) for more details. |
| * @return true for SHIFTED, false for NON_IGNORABLE |
| * @see #setAlternateHandlingShifted(boolean) |
| * @see #setAlternateHandlingDefault |
| * @draft 2.2 |
| */ |
| public boolean isAlternateHandlingShifted() |
| { |
| return this.m_isAlternateHandlingShifted_; |
| } |
| |
| /** |
| * Checks if the case level mode is on. |
| * See setCaseLevel(boolean) for details. |
| * @return true if the case level mode is on, false otherwise |
| * @see #setCaseLevelDefault |
| * @see #setCaseLevel(boolean) |
| * @draft 2.2 |
| */ |
| public boolean isCaseLevel() |
| { |
| return this.m_isCaseLevel_; |
| } |
| |
| /** |
| * Checks if the French collation mode is on. |
| * See setFrenchCollation(boolean) for details. |
| * @return true if the French collation mode is on, false otherwise |
| * @see #setFrenchCollation(boolean) |
| * @see #setFrenchCollationDefault |
| * @draft 2.2 |
| */ |
| public boolean isFrenchCollation() |
| { |
| return this.m_isFrenchCollation_; |
| } |
| |
| /** |
| * Checks if the Hiragana Quaternary mode is on. |
| * See setHiraganaQuaternary(boolean) for more details. |
| * @return true if the Hiragana Quaternary mode is on, false otherwise |
| * @see #setHiraganaQuaternaryDefault |
| * @see #setHiraganaQuaternary(boolean) |
| * @draft 2.2 |
| */ |
| public boolean isHiraganaQuaternary() |
| { |
| return this.m_isHiragana4_; |
| } |
| |
| // public other methods ------------------------------------------------- |
| |
| /** |
| * Checks this RuleBasedCollator for equality with the argument object. |
| * Two RuleBasedCollators are equal if they are of the same class, have |
| * the same strength, decomposition mode, case first, case switch, |
| * alternate handling, case level, French collation and Hiragana |
| * Quaternary settings, and hold equal collation rules. Two collators that |
| * both have null rules are treated as having equal rules. |
| * @param obj the object to be compared to, may be null |
| * @return true if obj is a RuleBasedCollator with the same attributes and |
| * rules as this one, false otherwise. |
| * @draft 2.2 |
| */ |
| public boolean equals(Object obj) { |
| if (obj == null) { |
| return false; |
| } |
| if (obj == this) { |
| return true; |
| } |
| if (obj.getClass() != getClass()) { |
| return false; |
| } |
| RuleBasedCollator that = (RuleBasedCollator)obj; |
| // attributes first, the rules carry all other non-transient information |
| if (getStrength() != that.getStrength() |
| || getDecomposition() != that.getDecomposition()) { |
| return false; |
| } |
| if (that.m_caseFirst_ != m_caseFirst_ |
| || that.m_caseSwitch_ != m_caseSwitch_) { |
| return false; |
| } |
| if (that.m_isAlternateHandlingShifted_ != m_isAlternateHandlingShifted_ |
| || that.m_isCaseLevel_ != m_isCaseLevel_ |
| || that.m_isFrenchCollation_ != m_isFrenchCollation_ |
| || that.m_isHiragana4_ != m_isHiragana4_) { |
| return false; |
| } |
| // same String instance, or both null |
| if (m_rules_ == that.m_rules_) { |
| return true; |
| } |
| // exactly one side without rules can never match |
| if (m_rules_ == null || that.m_rules_ == null) { |
| return false; |
| } |
| return m_rules_.equals(that.m_rules_); |
| } |
| |
| /** |
| * Generates the hash code for this RuleBasedCollator. |
| * The value is the hash code of the collation rules String; a collator |
| * without rules uses the hash code of the empty String. |
| * @return the hash code of the collation rules |
| * @draft 2.2 |
| */ |
| public int hashCode() |
| { |
| String rules = getRules(); |
| return (rules == null ? "" : rules).hashCode(); |
| } |
| |
| /** |
| * <p> |
| * Compares the source text String to the target text String according to |
| * the collation rules, strength and decomposition mode for this |
| * RuleBasedCollator. |
| * Returns an integer less than, |
| * equal to or greater than zero depending on whether the source String is |
| * less than, equal to or greater than the target String. See the Collator |
| * class description for an example of use. |
| * </p> |
| * <p> |
| * General recommendation: <br> |
| * If comparisons are to be done on the same String multiple times, it is |
| * more efficient to generate CollationKeys for the Strings and use |
| * CollationKey.compareTo(CollationKey) for the comparisons. |
| * If each String is compared only once, using the method |
| * RuleBasedCollator.compare(String, String) will have a better performance. |
| * </p> |
| * <p> |
| * The common leading portion of both Strings is skipped first. The |
| * remainder is compared level by level in the order primary, secondary |
| * (strength &gt;= SECONDARY), case (case level mode on), tertiary |
| * (strength &gt;= TERTIARY), quaternary (alternate handling shifted and |
| * strength &gt;= QUATERNARY) and identical (strength == IDENTICAL); the |
| * first level that yields a difference determines the result. When both |
| * the Hiragana Quaternary mode and shifted alternate handling apply at |
| * QUATERNARY strength, the remainders are compared by sort keys instead. |
| * </p> |
| * <p> |
| * NOTE(review): apart from the identity check (source == target), no null |
| * check is made on either argument in this method - confirm that callers |
| * never pass a single null String. |
| * </p> |
| * @param source the source text String. |
| * @param target the target text String. |
| * @return Returns an integer value. Value is less than zero if source is |
| * less than target, value is zero if source and target are equal, |
| * value is greater than zero if source is greater than target. |
| * @see CollationKey |
| * @see #getCollationKey |
| * @draft 2.2 |
| */ |
| public int compare(String source, String target) |
| { |
| if (source == target) { |
| return 0; |
| } |
| |
| // Find the length of any leading portion that is equal |
| int offset = getFirstUnmatchedOffset(source, target); |
| if (offset == source.length()) { |
| if (offset == target.length() || checkIgnorable(target, offset)) { |
| return 0; |
| } |
| return -1; |
| } |
| else if (target.length() == offset) { |
| if (checkIgnorable(source, offset)) { |
| return 0; |
| } |
| return 1; |
| } |
| |
| int strength = getStrength(); |
| // setting up the collator parameters |
| boolean compare[] = {m_isCaseLevel_, |
| true, |
| strength >= SECONDARY, |
| strength >= TERTIARY, |
| strength >= QUATERNARY, |
| strength == IDENTICAL |
| }; |
| boolean doFrench = m_isFrenchCollation_ && compare[2]; |
| boolean doShift4 = m_isAlternateHandlingShifted_ && compare[4]; |
| boolean doHiragana4 = m_isHiragana4_ && compare[4]; |
| |
| if (doHiragana4 && doShift4) { |
| String sourcesub = source.substring(offset); |
| String targetsub = target.substring(offset); |
| return compareBySortKeys(sourcesub, targetsub); |
| } |
| |
| // Preparing the CE buffers. will be filled during the primary phase |
| int cebuffer[][] = {new int[CE_BUFFER_SIZE_], new int[CE_BUFFER_SIZE_]}; |
| int cebuffersize[] = {0, 0}; |
| // This is the lowest primary value that will not be ignored if shifted |
| int lowestpvalue = m_isAlternateHandlingShifted_ |
| ? m_variableTopValue_ << 16 : 0; |
| int result = doPrimaryCompare(doHiragana4, lowestpvalue, source, |
| target, offset, cebuffer, cebuffersize); |
| if (cebuffer[0] == null && cebuffer[1] == null) { |
| // since the cebuffer is cleared when we have determined that |
| // either source is greater than target or vice versa, the return |
| // result is the comparison result and not the hiragana result |
| return result; |
| } |
| |
| int hiraganaresult = result; |
| |
| if (compare[2]) { |
| result = doSecondaryCompare(cebuffer, cebuffersize, doFrench); |
| if (result != 0) { |
| return result; |
| } |
| } |
| // doing the case bit |
| if (compare[0]) { |
| result = doCaseCompare(cebuffer); |
| if (result != 0) { |
| return result; |
| } |
| } |
| // Tertiary level |
| if (compare[3]) { |
| result = doTertiaryCompare(cebuffer); |
| if (result != 0) { |
| return result; |
| } |
| } |
| |
| if (doShift4) { // checkQuad |
| result = doQuaternaryCompare(cebuffer, lowestpvalue); |
| if (result != 0) { |
| return result; |
| } |
| } |
| else if (doHiragana4 && hiraganaresult != 0) { |
| // If we're fine on quaternaries, we might be different |
| // on Hiragana. This, however, might fail us in shifted. |
| return hiraganaresult; |
| } |
| |
| // For IDENTICAL comparisons, we use a bitwise character comparison |
| // as a tiebreaker if all else is equal. |
| // Getting here should be quite rare - strings are not identical - |
| // that is checked first, but compared == through all other checks. |
| if (compare[5]) { |
| return doIdenticalCompare(source, target, offset, true); |
| } |
| return 0; |
| } |
| |
| // package private inner interfaces -------------------------------------- |
| |
| /** |
| * Attribute values to be used when setting the Collator options. |
| * All members are constants; being declared in an interface they are |
| * implicitly static and final. |
| */ |
| static interface AttributeValue |
| { |
| /** |
| * Indicates that the default attribute value will be used. |
| * See the individual attribute for details on its default value. |
| */ |
| int DEFAULT_ = -1; |
| /** |
| * Primary collation strength, same value as Collator.PRIMARY |
| */ |
| int PRIMARY_ = Collator.PRIMARY; |
| /** |
| * Secondary collation strength, same value as Collator.SECONDARY |
| */ |
| int SECONDARY_ = Collator.SECONDARY; |
| /** |
| * Tertiary collation strength, same value as Collator.TERTIARY |
| */ |
| int TERTIARY_ = Collator.TERTIARY; |
| /** |
| * Default collation strength, which is the tertiary strength |
| */ |
| int DEFAULT_STRENGTH_ = Collator.TERTIARY; |
| /** |
| * Internal use for strength checks in Collation elements, one above |
| * the tertiary strength |
| */ |
| int CE_STRENGTH_LIMIT_ = Collator.TERTIARY + 1; |
| /** |
| * Quaternary collation strength. Unlike the other strength values |
| * this one is given as a literal. |
| */ |
| int QUATERNARY_ = 3; |
| /** |
| * Identical collation strength, same value as Collator.IDENTICAL |
| */ |
| int IDENTICAL_ = Collator.IDENTICAL; |
| /** |
| * Internal use for strength checks, one above the identical strength |
| */ |
| int STRENGTH_LIMIT_ = Collator.IDENTICAL + 1; |
| /** |
| * Turns the feature off - works for FRENCH_COLLATION, CASE_LEVEL, |
| * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE |
| */ |
| int OFF_ = 16; |
| /** |
| * Turns the feature on - works for FRENCH_COLLATION, CASE_LEVEL, |
| * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE |
| */ |
| int ON_ = 17; |
| /** |
| * Valid for ALTERNATE_HANDLING. Alternate handling will be shifted |
| */ |
| int SHIFTED_ = 20; |
| /** |
| * Valid for ALTERNATE_HANDLING. Alternate handling will be non |
| * ignorable |
| */ |
| int NON_IGNORABLE_ = 21; |
| /** |
| * Valid for CASE_FIRST - lower case sorts before upper case |
| */ |
| int LOWER_FIRST_ = 24; |
| /** |
| * Valid for CASE_FIRST - upper case sorts before lower case |
| */ |
| int UPPER_FIRST_ = 25; |
| /** |
| * Valid for NORMALIZATION_MODE. ON and OFF are also allowed for this |
| * attribute |
| */ |
| int ON_WITHOUT_HANGUL_ = 28; |
| /** |
| * Number of attribute values |
| */ |
| int LIMIT_ = 29; |
| } |
| |
| /** |
| * Attributes that collation service understands. All the attributes can |
| * take DEFAULT value, as well as the values specific to each one. |
| */ |
| static interface Attribute |
| { |
| /** |
| * Attribute for direction of secondary weights - used in French. |
| * Acceptable values are ON, which results in secondary weights being |
| * considered backwards and OFF which treats secondary weights in the |
| * order they appear. |
| */ |
| static final int FRENCH_COLLATION_ = 0; |
| /** |
| * Attribute for handling variable elements. Acceptable values are |
| * NON_IGNORABLE (default) which treats all the codepoints with |
| * non-ignorable primary weights in the same way, and SHIFTED which |
| * causes codepoints with primary weights that are equal or below the |
| * variable top value to be ignored on primary level and moved to the |
| * quaternary level. |
| */ |
| static final int ALTERNATE_HANDLING_ = 1; |
| /** |
| * Controls the ordering of upper and lower case letters. Acceptable |
| * values are OFF (default), which orders upper and lower case letters |
| * in accordance to their tertiary weights, UPPER_FIRST which forces |
| * upper case letters to sort before lower case letters, and |
| * LOWER_FIRST which does the opposite. |
| */ |
| static final int CASE_FIRST_ = 2; |
| /** |
| * Controls whether an extra case level (positioned before the third |
| * level) is generated or not. Acceptable values are OFF (default), |
| * when case level is not generated, and ON which causes the case |
| * level to be generated. Contents of the case level are affected by |
| * the value of CASE_FIRST attribute. A simple way to ignore accent |
| * differences in a string is to set the strength to PRIMARY and |
| * enable case level. |
| */ |
| static final int CASE_LEVEL_ = 3; |
| /** |
| * Controls whether the normalization check and necessary |
| * normalizations are performed. When set to OFF (default) no |
| * normalization check is performed. The correctness of the result is |
| * guaranteed only if the input data is in so-called FCD form (see |
| * users manual for more info). When set to ON, an incremental check |
| * is performed to see whether the input data is in the FCD form. If |
| * the data is not in the FCD form, incremental NFD normalization is |
| * performed. |
| */ |
| static final int NORMALIZATION_MODE_ = 4; |
| /** |
| * The strength attribute. Can be either PRIMARY, SECONDARY, TERTIARY, |
| * QUATERNARY or IDENTICAL. The usual strength for most locales |
| * (except Japanese) is tertiary. Quaternary strength is useful when |
| * combined with shifted setting for alternate handling attribute and |
| * for JIS x 4061 collation, when it is used to distinguish between |
| * Katakana and Hiragana (this is achieved by setting the |
| * HIRAGANA_QUATERNARY mode to on. Otherwise, quaternary level is |
| * affected only by the number of non ignorable code points in the |
| * string. Identical strength is rarely useful, as it amounts to |
| * codepoints of the NFD form of the string. |
| */ |
| static final int STRENGTH_ = 5; |
| /** |
| * When turned on, this attribute positions Hiragana before all |
| * non-ignorables on quaternary level. This is a sneaky way to produce |
| * JIS sort order. |
| */ |
| static final int HIRAGANA_QUATERNARY_MODE_ = 6; |
| /** |
| * Attribute count |
| */ |
| static final int LIMIT_ = 7; |
| }; |
| |
| /** |
| * DataManipulate singleton |
| */ |
| static class DataManipulate implements Trie.DataManipulate |
| { |
| // public methods ---------------------------------------------------- |
| |
| /** |
| * Internal method called to parse a lead surrogate's ce for the offset |
| * to the next trail surrogate data. |
| * @param ce collation element of the lead surrogate |
| * @return data offset or 0 for the next trail surrogate |
| * @draft 2.2 |
| */ |
| public final int getFoldingOffset(int ce) |
| { |
| if (isSpecial(ce) && getTag(ce) == CE_SURROGATE_TAG_) { |
| return (ce & 0xFFFFFF); |
| } |
| return 0; |
| } |
| |
| /** |
| * Get singleton object |
| */ |
| public static final DataManipulate getInstance() |
| { |
| if (m_instance_ == null) { |
| m_instance_ = new DataManipulate(); |
| } |
| return m_instance_; |
| } |
| |
| // private data member ---------------------------------------------- |
| |
| /** |
| * Singleton instance |
| */ |
| private static DataManipulate m_instance_; |
| |
| // private constructor ---------------------------------------------- |
| |
| /** |
| * private to prevent initialization |
| */ |
| private DataManipulate() |
| { |
| } |
| }; |
| |
| /** |
| * UCAConstants |
| */ |
| static final class UCAConstants |
| { |
| int FIRST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000 |
| int LAST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000 |
| int FIRST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x00008705 |
| int FIRST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000000 |
| int LAST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000500 |
| int LAST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x0000DD05 |
| int FIRST_VARIABLE_[] = new int[2]; // 0x05070505 |
| int LAST_VARIABLE_[] = new int[2]; // 0x13CF0505 |
| int FIRST_NON_VARIABLE_[] = new int[2]; // 0x16200505 |
| int LAST_NON_VARIABLE_[] = new int[2]; // 0x767C0505 |
| int RESET_TOP_VALUE_[] = new int[2]; // 0x9F000303 |
| int FIRST_IMPLICIT_[] = new int[2]; |
| int LAST_IMPLICIT_[] = new int[2]; |
| int FIRST_TRAILING_[] = new int[2]; |
| int LAST_TRAILING_[] = new int[2]; |
| int PRIMARY_TOP_MIN_; |
| int PRIMARY_IMPLICIT_MIN_; // 0xE8000000 |
| int PRIMARY_IMPLICIT_MAX_; // 0xF0000000 |
| int PRIMARY_TRAILING_MIN_; // 0xE8000000 |
| int PRIMARY_TRAILING_MAX_; // 0xF0000000 |
| int PRIMARY_SPECIAL_MIN_; // 0xE8000000 |
| int PRIMARY_SPECIAL_MAX_; // 0xF0000000 |
| } |
| |
| // package private data member ------------------------------------------- |
| |
| static final byte BYTE_FIRST_TAILORED_ = (byte)0x04; |
| static final byte BYTE_COMMON_ = (byte)0x05; |
| static final int COMMON_TOP_2_ = 0x86; // int for unsigness |
| static final int COMMON_BOTTOM_2_ = BYTE_COMMON_; |
| /** |
| * Case strength mask |
| */ |
| static final int CE_CASE_BIT_MASK_ = 0xC0; |
| static final int CE_TAG_SHIFT_ = 24; |
| static final int CE_TAG_MASK_ = 0x0F000000; |
| |
| static final int CE_SPECIAL_FLAG_ = 0xF0000000; |
| /** |
| * Lead surrogate that is tailored and doesn't start a contraction |
| */ |
| static final int CE_SURROGATE_TAG_ = 5; |
| /** |
| * Mask to get the primary strength of the collation element |
| */ |
| static final int CE_PRIMARY_MASK_ = 0xFFFF0000; |
| /** |
| * Mask to get the secondary strength of the collation element |
| */ |
| static final int CE_SECONDARY_MASK_ = 0xFF00; |
| /** |
| * Mask to get the tertiary strength of the collation element |
| */ |
| static final int CE_TERTIARY_MASK_ = 0xFF; |
| /** |
| * Primary strength shift |
| */ |
| static final int CE_PRIMARY_SHIFT_ = 16; |
| /** |
| * Secondary strength shift |
| */ |
| static final int CE_SECONDARY_SHIFT_ = 8; |
| /** |
| * Continuation marker |
| */ |
| static final int CE_CONTINUATION_MARKER_ = 0xC0; |
| |
| /** |
| * Size of collator raw data headers and options before the expansion |
| * data. This is used when expansion ces are to be retrieved. ICU4C uses |
| * the expansion offset starting from UCollator.UColHeader, hence ICU4J |
| * will have to minus that off to get the right expansion ce offset. In |
| * number of ints. |
| */ |
| int m_expansionOffset_; |
| /** |
| * Size of collator raw data headers, options and expansions before |
| * contraction data. This is used when contraction ces are to be retrieved. |
| * ICU4C uses contraction offset starting from UCollator.UColHeader, hence |
| * ICU4J will have to minus that off to get the right contraction ce |
| * offset. In number of chars. |
| */ |
| int m_contractionOffset_; |
| /** |
| * Flag indicator if Jamo is special |
| */ |
| boolean m_isJamoSpecial_; |
| |
| // Collator options ------------------------------------------------------ |
| int m_defaultVariableTopValue_; |
| boolean m_defaultIsFrenchCollation_; |
| boolean m_defaultIsAlternateHandlingShifted_; |
| int m_defaultCaseFirst_; |
| boolean m_defaultIsCaseLevel_; |
| int m_defaultDecomposition_; |
| int m_defaultStrength_; |
| boolean m_defaultIsHiragana4_; |
| /** |
| * Value of the variable top |
| */ |
| int m_variableTopValue_; |
| /** |
| * Attribute for special Hiragana |
| */ |
| boolean m_isHiragana4_; |
| /** |
| * Case sorting customization |
| */ |
| int m_caseFirst_; |
| |
| // end Collator options -------------------------------------------------- |
| |
| /** |
| * Expansion table |
| */ |
| int m_expansion_[]; |
| /** |
| * Contraction index table |
| */ |
| char m_contractionIndex_[]; |
| /** |
| * Contraction CE table |
| */ |
| int m_contractionCE_[]; |
| /** |
| * Data trie |
| */ |
| IntTrie m_trie_; |
| /** |
| * Table to store all collation elements that are the last element of an |
| * expansion. This is for use in StringSearch. |
| */ |
| int m_expansionEndCE_[]; |
| /** |
| * Table to store the maximum size of any expansions that end with the |
| * corresponding collation element in m_expansionEndCE_. For use in |
| * StringSearch too |
| */ |
| byte m_expansionEndCEMaxSize_[]; |
| /** |
| * Heuristic table to store information on whether a char character is |
| * considered "unsafe". "Unsafe" character are combining marks or those |
| * belonging to some contraction sequence from the offset 1 onwards. |
| * E.g. if "ABC" is the only contraction, then 'B' and 'C' are considered |
| * unsafe. If we have another contraction "ZA" with the one above, then |
| * 'A', 'B', 'C' are "unsafe" but 'Z' is not. |
| */ |
| byte m_unsafe_[]; |
| /** |
| * Table to store information on whether a codepoint can occur as the last |
| * character in a contraction |
| */ |
| byte m_contractionEnd_[]; |
| /** |
| * Original collation rules |
| */ |
| String m_rules_; |
| /** |
| * The smallest "unsafe" codepoint |
| */ |
| char m_minUnsafe_; |
| /** |
| * The smallest codepoint that could be the end of a contraction |
| */ |
| char m_minContractionEnd_; |
| |
/**
 * The UCA collator. It is created by the static initializer below and
 * filled by CollatorReader from the <code>ucadata.dat</code> resource.
 */
static final RuleBasedCollator UCA_;
/**
 * UCA Constants, filled in by the same CollatorReader.read() call that
 * fills UCA_
 */
static final UCAConstants UCA_CONSTANTS_;
/**
 * Table for UCA and builder use; this is the return value of
 * CollatorReader.read() for the UCA data
 */
static final char UCA_CONTRACTIONS_[];
/**
 * Implicit constants. The blank finals are computed in the static
 * initializer below, after the UCA constants have been read.
 */
static final int IMPLICIT_BASE_BYTE_;
static final int IMPLICIT_LIMIT_BYTE_;
static final int IMPLICIT_4BYTE_BOUNDARY_;
static final int LAST_MULTIPLIER_;
static final int LAST2_MULTIPLIER_;
static final int IMPLICIT_BASE_3BYTE_;
static final int IMPLICIT_BASE_4BYTE_;
static final int BYTES_TO_AVOID_ = 3;
static final int OTHER_COUNT_ = 256 - BYTES_TO_AVOID_;
static final int LAST_COUNT_ = OTHER_COUNT_ / 2;
/**
 * Room for intervening, without expanding to 5 bytes
 */
static final int LAST_COUNT2_ = OTHER_COUNT_ / 21;
static final int IMPLICIT_3BYTE_COUNT_ = 1;

// block to initialise the UCA collator and the implicit constants
static
{
    try
    {
        UCA_ = new RuleBasedCollator();
        UCA_CONSTANTS_ = new UCAConstants();
        String resource = "/com/ibm/icu/impl/data/ucadata.dat";
        InputStream i = UCA_.getClass().getResourceAsStream(resource);
        if (i == null) {
            // getResourceAsStream signals a missing resource with null;
            // report it here instead of failing later with an unrelated
            // stream error
            throw new IllegalStateException(
                            "Missing UCA collation data: " + resource);
        }

        BufferedInputStream b = new BufferedInputStream(i, 90000);
        try {
            CollatorReader reader = new CollatorReader(b);
            UCA_CONTRACTIONS_ = reader.read(UCA_, UCA_CONSTANTS_);
        }
        finally {
            // closing the buffered stream closes the underlying stream as
            // well, also when the reader has failed
            b.close();
        }
        // called before doing canonical closure for the UCA.
        IMPLICIT_BASE_BYTE_ = UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_;
        // leave room for 1 3-byte and 2 4-byte forms
        IMPLICIT_LIMIT_BYTE_ = IMPLICIT_BASE_BYTE_ + 4;
        IMPLICIT_4BYTE_BOUNDARY_ = IMPLICIT_3BYTE_COUNT_ * OTHER_COUNT_
                                   * LAST_COUNT_;
        LAST_MULTIPLIER_ = OTHER_COUNT_ / LAST_COUNT_;
        LAST2_MULTIPLIER_ = OTHER_COUNT_ / LAST_COUNT2_;
        IMPLICIT_BASE_3BYTE_ = (IMPLICIT_BASE_BYTE_ << 24) + 0x030300;
        IMPLICIT_BASE_4BYTE_ = ((IMPLICIT_BASE_BYTE_
                                 + IMPLICIT_3BYTE_COUNT_) << 24) + 0x030303;
        UCA_.m_rules_ = null;
        UCA_.init();
    }
    catch (Exception e)
    {
        // Same message as before, but the original exception is kept as
        // the cause so that its stack trace is not lost.
        throw new RuntimeException(e.getMessage(), e);
    }
}
| |
| // package private constructors ------------------------------------------ |
| |
/**
 * <p>Package private constructor that leaves every field at its default
 * value. Public access to creating Collators is handled by the API
 * Collator.getInstance() or RuleBasedCollator(String rules).
 * </p>
 * <p>
 * The static initializer uses this constructor to create the UCA collator
 * before its data is read.
 * </p>
 * @draft 2.2
 */
RuleBasedCollator()
{
    // nothing to set up here
}
| |
| // package private methods ----------------------------------------------- |
| |
| /** |
| * Sets this collator to use the tables in UCA. Note options not taken |
| * care of here. |
| */ |
| final void setWithUCATables() |
| { |
| m_contractionOffset_ = UCA_.m_contractionOffset_; |
| m_expansionOffset_ = UCA_.m_expansionOffset_; |
| m_expansion_ = UCA_.m_expansion_; |
| m_contractionIndex_ = UCA_.m_contractionIndex_; |
| m_contractionCE_ = UCA_.m_contractionCE_; |
| m_trie_ = UCA_.m_trie_; |
| m_expansionEndCE_ = UCA_.m_expansionEndCE_; |
| m_expansionEndCEMaxSize_ = UCA_.m_expansionEndCEMaxSize_; |
| m_unsafe_ = UCA_.m_unsafe_; |
| m_contractionEnd_ = UCA_.m_contractionEnd_; |
| m_minUnsafe_ = UCA_.m_minUnsafe_; |
| m_minContractionEnd_ = UCA_.m_minContractionEnd_; |
| } |
| |
| /** |
| * Sets this collator to use the all options and tables in UCA. |
| */ |
| final void setWithUCAData() |
| { |
| m_addition3_ = UCA_.m_addition3_; |
| m_bottom3_ = UCA_.m_bottom3_; |
| m_bottomCount3_ = UCA_.m_bottomCount3_; |
| m_caseFirst_ = UCA_.m_caseFirst_; |
| m_caseSwitch_ = UCA_.m_caseSwitch_; |
| m_common3_ = UCA_.m_common3_; |
| m_contractionOffset_ = UCA_.m_contractionOffset_; |
| setDecomposition(UCA_.getDecomposition()); |
| m_defaultCaseFirst_ = UCA_.m_defaultCaseFirst_; |
| m_defaultDecomposition_ = UCA_.m_defaultDecomposition_; |
| m_defaultIsAlternateHandlingShifted_ |
| = UCA_.m_defaultIsAlternateHandlingShifted_; |
| m_defaultIsCaseLevel_ = UCA_.m_defaultIsCaseLevel_; |
| m_defaultIsFrenchCollation_ = UCA_.m_defaultIsFrenchCollation_; |
| m_defaultIsHiragana4_ = UCA_.m_defaultIsHiragana4_; |
| m_defaultStrength_ = UCA_.m_defaultStrength_; |
| m_defaultVariableTopValue_ = UCA_.m_defaultVariableTopValue_; |
| m_expansionOffset_ = UCA_.m_expansionOffset_; |
| m_isAlternateHandlingShifted_ = UCA_.m_isAlternateHandlingShifted_; |
| m_isCaseLevel_ = UCA_.m_isCaseLevel_; |
| m_isFrenchCollation_ = UCA_.m_isFrenchCollation_; |
| m_isHiragana4_ = UCA_.m_isHiragana4_; |
| m_isJamoSpecial_ = UCA_.m_isJamoSpecial_; |
| m_isSimple3_ = UCA_.m_isSimple3_; |
| m_mask3_ = UCA_.m_mask3_; |
| m_minContractionEnd_ = UCA_.m_minContractionEnd_; |
| m_minUnsafe_ = UCA_.m_minUnsafe_; |
| m_rules_ = UCA_.m_rules_; |
| setStrength(UCA_.getStrength()); |
| m_top3_ = UCA_.m_top3_; |
| m_topCount3_ = UCA_.m_topCount3_; |
| m_variableTopValue_ = UCA_.m_variableTopValue_; |
| setWithUCATables(); |
| } |
| |
| /** |
| * Test whether a char character is potentially "unsafe" for use as a |
| * collation starting point. "Unsafe" characters are combining marks or |
| * those belonging to some contraction sequence from the offset 1 onwards. |
| * E.g. if "ABC" is the only contraction, then 'B' and |
| * 'C' are considered unsafe. If we have another contraction "ZA" with |
| * the one above, then 'A', 'B', 'C' are "unsafe" but 'Z' is not. |
| * @param ch character to determin |
| * @return true if ch is unsafe, false otherwise |
| */ |
| final boolean isUnsafe(char ch) |
| { |
| if (ch < m_minUnsafe_) { |
| return false; |
| } |
| |
| if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) { |
| if (UTF16.isLeadSurrogate(ch) || UTF16.isTrailSurrogate(ch)) { |
| // Trail surrogate are always considered unsafe. |
| return true; |
| } |
| ch &= HEURISTIC_OVERFLOW_MASK_; |
| ch += HEURISTIC_OVERFLOW_OFFSET_; |
| } |
| int value = m_unsafe_[ch >> HEURISTIC_SHIFT_]; |
| return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0; |
| } |
| |
| /** |
| * Approximate determination if a char character is at a contraction end. |
| * Guaranteed to be true if a character is at the end of a contraction, |
| * otherwise it is not deterministic. |
| * @param ch character to be determined |
| */ |
| final boolean isContractionEnd(char ch) |
| { |
| if (UTF16.isTrailSurrogate(ch)) { |
| return true; |
| } |
| |
| if (ch < m_minContractionEnd_) { |
| return false; |
| } |
| |
| if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) { |
| ch &= HEURISTIC_OVERFLOW_MASK_; |
| ch += HEURISTIC_OVERFLOW_OFFSET_; |
| } |
| int value = m_contractionEnd_[ch >> HEURISTIC_SHIFT_]; |
| return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0; |
| } |
| |
| /** |
| * Retrieve the tag of a special ce |
| * @param ce ce to test |
| * @return tag of ce |
| */ |
| static int getTag(int ce) |
| { |
| return (ce & CE_TAG_MASK_) >> CE_TAG_SHIFT_; |
| } |
| |
| /** |
| * Checking if ce is special |
| * @param ce to check |
| * @return true if ce is special |
| */ |
| static boolean isSpecial(int ce) |
| { |
| return (ce & CE_SPECIAL_FLAG_) == CE_SPECIAL_FLAG_; |
| } |
| |
| /** |
| * Checks if the argument ce is a continuation |
| * @param ce collation element to test |
| * @return true if ce is a continuation |
| */ |
| static final boolean isContinuation(int ce) |
| { |
| return ce != CollationElementIterator.NULLORDER |
| && (ce & CE_CONTINUATION_TAG_) == CE_CONTINUATION_TAG_; |
| } |
| |
| // protected constructor ------------------------------------------------- |
| |
| /** |
| * Constructors a RuleBasedCollator from the argument locale. |
| * If no resource bundle is associated with the locale, UCA is used |
| * instead. |
| * @param locale |
| * @exception Exception thrown when there's an error creating the Collator |
| */ |
| RuleBasedCollator(Locale locale) // throws Exception |
| { |
| ResourceBundle rb = ICULocaleData.getLocaleElements(locale); |
| |
| if (rb != null) { |
| try { |
| Object elements = rb.getObject("CollationElements"); |
| if (elements != null) { |
| Object[][] rules = (Object[][])elements; |
| m_rules_ = (String)rules[1][1]; |
| // %%CollationBin |
| byte map[] = (byte [])rules[0][1]; |
| BufferedInputStream input = |
| new BufferedInputStream( |
| new ByteArrayInputStream(map)); |
| CollatorReader reader = new CollatorReader(input, false); |
| if (map.length > MIN_BINARY_DATA_SIZE_) { |
| reader.read(this, null); |
| } |
| else { |
| reader.readHeader(this); |
| reader.readOptions(this); |
| // duplicating UCA_'s data |
| setWithUCATables(); |
| } |
| init(); |
| return; |
| } |
| } |
| catch (Exception e) { |
| // if failed use UCA. |
| } |
| } |
| setWithUCAData(); |
| } |
| |
| // private inner classes ------------------------------------------------ |
| |
| // private variables ----------------------------------------------------- |
| |
| /** |
| * The smallest natural unsafe or contraction end char character before |
| * tailoring. |
| * This is a combining mark. |
| */ |
| private static final int DEFAULT_MIN_HEURISTIC_ = 0x300; |
| /** |
| * Heuristic table table size. Size is 32 bytes, 1 bit for each |
| * latin 1 char, and some power of two for hashing the rest of the chars. |
| * Size in bytes. |
| */ |
| private static final char HEURISTIC_SIZE_ = 1056; |
| /** |
| * Mask value down to "some power of two" - 1, |
| * number of bits, not num of bytes. |
| */ |
| private static final char HEURISTIC_OVERFLOW_MASK_ = 0x1fff; |
| /** |
| * Unsafe character shift |
| */ |
| private static final int HEURISTIC_SHIFT_ = 3; |
| /** |
| * Unsafe character addition for character too large, it has to be folded |
| * then incremented. |
| */ |
| private static final char HEURISTIC_OVERFLOW_OFFSET_ = 256; |
| /** |
| * Mask value to get offset in heuristic table. |
| */ |
| private static final char HEURISTIC_MASK_ = 7; |
| |
| private int m_caseSwitch_; |
| private int m_common3_; |
| private int m_mask3_; |
| /** |
| * When switching case, we need to add or subtract different values. |
| */ |
| private int m_addition3_; |
| /** |
| * Upper range when compressing |
| */ |
| private int m_top3_; |
| /** |
| * Upper range when compressing |
| */ |
| private int m_bottom3_; |
| private int m_topCount3_; |
| private int m_bottomCount3_; |
| /** |
| * Case first constants |
| */ |
| private static final int CASE_SWITCH_ = 0xC0; |
| private static final int NO_CASE_SWITCH_ = 0; |
| /** |
| * Case level constants |
| */ |
| private static final int CE_REMOVE_CASE_ = 0x3F; |
| private static final int CE_KEEP_CASE_ = 0xFF; |
| /** |
| * Case strength mask |
| */ |
| private static final int CE_CASE_MASK_3_ = 0xFF; |
| /** |
| * Sortkey size factor. Values can be changed. |
| */ |
| private static final double PROPORTION_2_ = 0.5; |
| private static final double PROPORTION_3_ = 0.667; |
| |
| // These values come from the UCA ---------------------------------------- |
| |
| /** |
| * This is an enum that lists magic special byte values from the |
| * fractional UCA |
| */ |
| private static final byte BYTE_ZERO_ = 0x0; |
| private static final byte BYTE_LEVEL_SEPARATOR_ = (byte)0x01; |
| private static final byte BYTE_SORTKEY_GLUE_ = (byte)0x02; |
| private static final byte BYTE_SHIFT_PREFIX_ = (byte)0x03; |
| private static final byte BYTE_UNSHIFTED_MIN_ = BYTE_SHIFT_PREFIX_; |
| private static final byte BYTE_FIRST_UCA_ = BYTE_COMMON_; |
| private static final byte BYTE_LAST_LATIN_PRIMARY_ = (byte)0x4C; |
| private static final byte BYTE_FIRST_NON_LATIN_PRIMARY_ = (byte)0x4D; |
| private static final byte BYTE_UNSHIFTED_MAX_ = (byte)0xFF; |
| private static final int TOTAL_2_ = COMMON_TOP_2_ - COMMON_BOTTOM_2_ - 1; |
| private static final int FLAG_BIT_MASK_CASE_SWITCH_OFF_ = 0x80; |
| private static final int FLAG_BIT_MASK_CASE_SWITCH_ON_ = 0x40; |
| private static final int COMMON_TOP_CASE_SWITCH_OFF_3_ = 0x85; |
| private static final int COMMON_TOP_CASE_SWITCH_LOWER_3_ = 0x45; |
| private static final int COMMON_TOP_CASE_SWITCH_UPPER_3_ = 0xC5; |
| private static final int COMMON_BOTTOM_3_ = 0x05; |
| private static final int COMMON_BOTTOM_CASE_SWITCH_UPPER_3_ = 0x86; |
| private static final int COMMON_BOTTOM_CASE_SWITCH_LOWER_3_ = |
| COMMON_BOTTOM_3_; |
| private static final int TOP_COUNT_2_ = (int)(PROPORTION_2_ * TOTAL_2_); |
| private static final int BOTTOM_COUNT_2_ = TOTAL_2_ - TOP_COUNT_2_; |
| private static final int COMMON_2_ = COMMON_BOTTOM_2_; |
| private static final int COMMON_UPPER_FIRST_3_ = 0xC5; |
| private static final int COMMON_NORMAL_3_ = COMMON_BOTTOM_3_; |
| private static final int COMMON_4_ = (byte)0xFF; |
| |
| /** |
| * Minimum size required for the binary collation data in bytes. |
| * Size of UCA header + size of options to 4 bytes |
| */ |
| private static final int MIN_BINARY_DATA_SIZE_ = (42 + 8) << 2; |
| |
| /** |
| * If this collator is to generate only simple tertiaries for fast path |
| */ |
| private boolean m_isSimple3_; |
| |
| /** |
| * French collation sorting flag |
| */ |
| private boolean m_isFrenchCollation_; |
| /** |
| * Flag indicating if shifted is requested for Quaternary alternate |
| * handling. If this is not true, the default for alternate handling will |
| * be non-ignorable. |
| */ |
| private boolean m_isAlternateHandlingShifted_; |
| /** |
| * Extra case level for sorting |
| */ |
| private boolean m_isCaseLevel_; |
| |
| private static final int SORT_BUFFER_INIT_SIZE_ = 128; |
| private static final int SORT_BUFFER_INIT_SIZE_1_ = |
| SORT_BUFFER_INIT_SIZE_ << 3; |
| private static final int SORT_BUFFER_INIT_SIZE_2_ = SORT_BUFFER_INIT_SIZE_; |
| private static final int SORT_BUFFER_INIT_SIZE_3_ = SORT_BUFFER_INIT_SIZE_; |
| private static final int SORT_BUFFER_INIT_SIZE_CASE_ = |
| SORT_BUFFER_INIT_SIZE_ >> 2; |
| private static final int SORT_BUFFER_INIT_SIZE_4_ = SORT_BUFFER_INIT_SIZE_; |
| |
| private static final int CE_CONTINUATION_TAG_ = 0xC0; |
| private static final int CE_REMOVE_CONTINUATION_MASK_ = 0xFFFFFF3F; |
| |
| private static final int LAST_BYTE_MASK_ = 0xFF; |
| |
| private static final int CE_RESET_TOP_VALUE_ = 0x9F000303; |
| private static final int CE_NEXT_TOP_VALUE_ = 0xE8960303; |
| |
| private static final byte SORT_CASE_BYTE_START_ = (byte)0x80; |
| private static final byte SORT_CASE_SHIFT_START_ = (byte)7; |
| |
| private static final byte SORT_LEVEL_TERMINATOR_ = 1; |
| |
| /** |
| * CE buffer size |
| */ |
| private static final int CE_BUFFER_SIZE_ = 512; |
| |
| // private methods ------------------------------------------------------- |
| |
| /** |
| * Gets the 2 bytes of primary order and adds it to the primary byte array |
| * @param ce current ce |
| * @param bytes array of byte arrays for each strength |
| * @param bytescount array of the size of each strength byte arrays |
| * @param count array of counters for each of the strength |
| * @param notIsContinuation flag indicating if the current bytes belong to |
| * a continuation ce |
| * @param doShift flag indicating if ce is to be shifted |
| * @param leadPrimary lead primary used for compression |
| * @param commonBottom4 common byte value for Quaternary |
| * @param bottomCount4 smallest byte value for Quaternary |
| * @return the new lead primary for compression |
| */ |
| private final int doPrimaryBytes(int ce, byte bytes[][], int bytescount[], |
| int count[], boolean notIsContinuation, |
| boolean doShift, int leadPrimary, |
| int commonBottom4, int bottomCount4) |
| { |
| |
| int p2 = (ce >>= 16) & LAST_BYTE_MASK_; // in ints for unsigned |
| int p1 = ce >>> 8; // comparison |
| if (doShift) { |
| if (count[4] > 0) { |
| while (count[4] > bottomCount4) { |
| append(bytes, bytescount, 4, |
| (byte)(commonBottom4 + bottomCount4)); |
| count[4] -= bottomCount4; |
| } |
| append(bytes, bytescount, 4, |
| (byte)(commonBottom4 + (count[4] - 1))); |
| count[4] = 0; |
| } |
| // dealing with a variable and we're treating them as shifted |
| // This is a shifted ignorable |
| if (p1 != 0) { |
| // we need to check this since we could be in continuation |
| append(bytes, bytescount, 4, (byte)p1); |
| } |
| if (p2 != 0) { |
| append(bytes, bytescount, 4, (byte)p2); |
| } |
| } |
| else { |
| // Note: This code assumes that the table is well built |
| // i.e. not having 0 bytes where they are not supposed to be. |
| // Usually, we'll have non-zero primary1 & primary2, except |
| // in cases of LatinOne and friends, when primary2 will be |
| // regular and simple sortkey calc |
| if (p1 != CollationElementIterator.IGNORABLE) { |
| if (notIsContinuation) { |
| if (leadPrimary == p1) { |
| append(bytes, bytescount, 1, (byte)p2); |
| } |
| else { |
| if (leadPrimary != 0) { |
| append(bytes, bytescount, 1, |
| (byte)((p1 > leadPrimary) |
| ? BYTE_UNSHIFTED_MAX_ |
| : BYTE_UNSHIFTED_MIN_)); |
| } |
| if (p2 == CollationElementIterator.IGNORABLE) { |
| // one byter, not compressed |
| append(bytes, bytescount, 1, (byte)p1); |
| leadPrimary = 0; |
| } |
| else if (p1 < BYTE_FIRST_NON_LATIN_PRIMARY_ |
| || (p1 |
| > ((RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_[0] |
| >> 24) & 0xFF) |
| && p1 |
| < ((RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_[0] |
| >> 24) & 0xFF))) { |
| // not compressible |
| leadPrimary = 0; |
| append(bytes, bytescount, 1, (byte)p1); |
| append(bytes, bytescount, 1, (byte)p2); |
| } |
| else { // compress |
| leadPrimary = p1; |
| append(bytes, bytescount, 1, (byte)p1); |
| append(bytes, bytescount, 1, (byte)p2); |
| } |
| } |
| } |
| else { |
| // continuation, add primary to the key, no compression |
| append(bytes, bytescount, 1, (byte)p1); |
| if (p2 != CollationElementIterator.IGNORABLE) { |
| append(bytes, bytescount, 1, (byte)p2); // second part |
| } |
| } |
| } |
| } |
| return leadPrimary; |
| } |
| |
| /** |
| * Gets the secondary byte and adds it to the secondary byte array |
| * @param ce current ce |
| * @param bytes array of byte arrays for each strength |
| * @param bytescount array of the size of each strength byte arrays |
| * @param count array of counters for each of the strength |
| * @param notIsContinuation flag indicating if the current bytes belong to |
| * a continuation ce |
| * @param doFrench flag indicator if french sort is to be performed |
| * @param frenchOffset start and end offsets to source string for reversing |
| */ |
| private final void doSecondaryBytes(int ce, byte bytes[][], |
| int bytescount[], int count[], |
| boolean notIsContinuation, |
| boolean doFrench, int frenchOffset[]) |
| { |
| int s = (ce >>= 8) & LAST_BYTE_MASK_; // int for comparison |
| if (s != 0) { |
| if (!doFrench) { |
| // This is compression code. |
| if (s == COMMON_2_ && notIsContinuation) { |
| count[2] ++; |
| } |
| else { |
| if (count[2] > 0) { |
| if (s > COMMON_2_) { // not necessary for 4th level. |
| while (count[2] > TOP_COUNT_2_) { |
| append(bytes, bytescount, 2, |
| (byte)(COMMON_TOP_2_ - TOP_COUNT_2_)); |
| count[2] -= TOP_COUNT_2_; |
| } |
| append(bytes, bytescount, 2, |
| (byte)(COMMON_TOP_2_ - (count[2] - 1))); |
| } |
| else { |
| while (count[2] > BOTTOM_COUNT_2_) { |
| append(bytes, bytescount, 2, |
| (byte)(COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_)); |
| count[2] -= BOTTOM_COUNT_2_; |
| } |
| append(bytes, bytescount, 2, |
| (byte)(COMMON_BOTTOM_2_ + (count[2] - 1))); |
| } |
| count[2] = 0; |
| } |
| append(bytes, bytescount, 2, (byte)s); |
| } |
| } |
| else { |
| append(bytes, bytescount, 2, (byte)s); |
| // Do the special handling for French secondaries |
| // We need to get continuation elements and do intermediate |
| // restore |
| // abc1c2c3de with french secondaries need to be edc1c2c3ba |
| // NOT edc3c2c1ba |
| if (notIsContinuation) { |
| if (frenchOffset[0] != -1) { |
| // reverse secondaries from frenchStartPtr up to |
| // frenchEndPtr |
| reverseBuffer(bytes[2], frenchOffset); |
| frenchOffset[0] = -1; |
| } |
| } |
| else { |
| if (frenchOffset[0] == -1) { |
| frenchOffset[0] = bytescount[2] - 2; |
| } |
| frenchOffset[1] = bytescount[2] - 1; |
| } |
| } |
| } |
| } |
| |
| /** |
| * Reverse the argument buffer |
| * @param buffer to reverse |
| * @param offset start and end offsets to reverse |
| */ |
| private void reverseBuffer(byte buffer[], int offset[]) |
| { |
| int start = offset[0]; |
| int end = offset[1]; |
| while (start < end) { |
| byte b = buffer[start]; |
| buffer[start ++] = buffer[end]; |
| buffer[end --] = b; |
| } |
| } |
| |
| /** |
| * Insert the case shifting byte if required |
| * @param bytes array of byte arrays corresponding to each strength |
| * @param bytescount array of the size of the byte arrays |
| * @param caseshift value |
| * @return new caseshift value |
| */ |
| private static final int doCaseShift(byte bytes[][], int bytescount[], |
| int caseshift) |
| { |
| if (caseshift == 0) { |
| append(bytes, bytescount, 0, SORT_CASE_BYTE_START_); |
| caseshift = SORT_CASE_SHIFT_START_; |
| } |
| return caseshift; |
| } |
| |
| /** |
| * Performs the casing sort |
| * @param tertiary byte in ints for easy comparison |
| * @param bytes of byte arrays for each strength |
| * @param bytescount array of the size of each strength byte arrays |
| * @param notIsContinuation flag indicating if the current bytes belong to |
| * a continuation ce |
| * @param caseshift |
| * @return the new value of case shift |
| */ |
| private final int doCaseBytes(int tertiary, byte bytes[][], |
| int bytescount[], boolean notIsContinuation, |
| int caseshift) |
| { |
| caseshift = doCaseShift(bytes, bytescount, caseshift); |
| |
| if (notIsContinuation && tertiary != 0) { |
| byte casebits = (byte)(tertiary & 0xC0); |
| if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) { |
| if (casebits == 0) { |
| bytes[0][bytescount[0] - 1] |= (1 << (-- caseshift)); |
| } |
| else { |
| // second bit |
| caseshift = doCaseShift(bytes, bytescount, caseshift - 1); |
| bytes[0][bytescount[0] - 1] |= ((casebits >> 6) & 1) |
| << (-- caseshift); |
| } |
| } |
| else { |
| if (casebits != 0) { |
| bytes[0][bytescount[0] - 1] |= 1 << (-- caseshift); |
| // second bit |
| caseshift = doCaseShift(bytes, bytescount, caseshift); |
| bytes[0][bytescount[0] - 1] |= ((casebits >> 7) & 1) |
| << (-- caseshift); |
| } |
| else { |
| caseshift --; |
| } |
| } |
| } |
| |
| return caseshift; |
| } |
| |
| /** |
| * Gets the tertiary byte and adds it to the tertiary byte array |
| * @param tertiary byte in int for easy comparison |
| * @param bytes array of byte arrays for each strength |
| * @param bytescount array of the size of each strength byte arrays |
| * @param count array of counters for each of the strength |
| * @param notIsContinuation flag indicating if the current bytes belong to |
| * a continuation ce |
| */ |
| private final void doTertiaryBytes(int tertiary, byte bytes[][], |
| int bytescount[], int count[], |
| boolean notIsContinuation) |
| { |
| if (tertiary != 0) { |
| // This is compression code. |
| // sequence size check is included in the if clause |
| if (tertiary == m_common3_ && notIsContinuation) { |
| count[3] ++; |
| } |
| else { |
| int common3 = m_common3_ & LAST_BYTE_MASK_; |
| if (tertiary > common3 && m_common3_ == COMMON_NORMAL_3_) { |
| tertiary += m_addition3_; |
| } |
| else if (tertiary <= common3 |
| && m_common3_ == COMMON_UPPER_FIRST_3_) { |
| tertiary -= m_addition3_; |
| } |
| if (count[3] > 0) { |
| if (tertiary > common3) { |
| while (count[3] > m_topCount3_) { |
| append(bytes, bytescount, 3, |
| (byte)(m_top3_ - m_topCount3_)); |
| count[3] -= m_topCount3_; |
| } |
| append(bytes, bytescount, 3, |
| (byte)(m_top3_ - (count[3] - 1))); |
| } |
| else { |
| while (count[3] > m_bottomCount3_) { |
| append(bytes, bytescount, 3, |
| (byte)(m_bottom3_ + m_bottomCount3_)); |
| count[3] -= m_bottomCount3_; |
| } |
| append(bytes, bytescount, 3, |
| (byte)(m_bottom3_ + (count[3] - 1))); |
| } |
| count[3] = 0; |
| } |
| append(bytes, bytescount, 3, (byte)tertiary); |
| } |
| } |
| } |
| |
| /** |
| * Gets the Quaternary byte and adds it to the Quaternary byte array |
| * @param bytes array of byte arrays for each strength |
| * @param bytescount array of the size of each strength byte arrays |
| * @param count array of counters for each of the strength |
| * @param isCodePointHiragana flag indicator if the previous codepoint |
| * we dealt with was Hiragana |
| * @param commonBottom4 smallest common Quaternary byte |
| * @param bottomCount4 smallest Quaternary byte |
| * @param hiragana4 hiragana Quaternary byte |
| */ |
| private final void doQuaternaryBytes(byte bytes[][], int bytescount[], |
| int count[], |
| boolean isCodePointHiragana, |
| int commonBottom4, int bottomCount4, |
| byte hiragana4) |
| { |
| if (isCodePointHiragana) { // This was Hiragana, need to note it |
| if (count[4] > 0) { // Close this part |
| while (count[4] > bottomCount4) { |
| append(bytes, bytescount, 4, (byte)(commonBottom4 |
| + bottomCount4)); |
| count[4] -= bottomCount4; |
| } |
| append(bytes, bytescount, 4, (byte)(commonBottom4 |
| + (count[4] - 1))); |
| count[4] = 0; |
| } |
| append(bytes, bytescount, 4, hiragana4); // Add the Hiragana |
| } |
| else { // This wasn't Hiragana, so we can continue adding stuff |
| count[4] ++; |
| } |
| } |
| |
| /** |
| * Iterates through the argument string for all ces. |
| * Split the ces into their relevant primaries, secondaries etc. |
| * @param source normalized string |
| * @param compare array of flags indicating if a particular strength is |
| * to be processed |
| * @param bytes an array of byte arrays corresponding to the strengths |
| * @param bytescount an array of the size of the byte arrays |
| * @param count array of compression counters for each strength |
| * @param doFrench flag indicator if special handling of French has to be |
| * done |
| * @param hiragana4 offset for Hiragana quaternary |
| * @param commonBottom4 smallest common quaternary byte |
| * @param bottomCount4 smallest quaternary byte |
| */ |
| private final void getSortKeyBytes(String source, boolean compare[], |
| byte bytes[][], int bytescount[], |
| int count[], boolean doFrench, |
| byte hiragana4, int commonBottom4, |
| int bottomCount4) |
| |
| { |
| int backupDecomposition = getDecomposition(); |
| setDecomposition(NO_DECOMPOSITION); // have to revert to backup later |
| CollationElementIterator coleiter = |
| new CollationElementIterator(source, this); |
| |
| int frenchOffset[] = {-1, -1}; |
| |
| // scriptorder not implemented yet |
| // const uint8_t *scriptOrder = coll->scriptOrder; |
| |
| boolean doShift = false; |
| boolean notIsContinuation = false; |
| |
| int leadPrimary = 0; // int for easier comparison |
| int caseShift = 0; |
| |
| while (true) { |
| int ce = coleiter.next(); |
| if (ce == CollationElementIterator.NULLORDER) { |
| break; |
| } |
| |
| if (ce == CollationElementIterator.IGNORABLE) { |
| continue; |
| } |
| |
| notIsContinuation = !isContinuation(ce); |
| |
| /* |
| * if (notIsContinuation) { |
| if (scriptOrder != NULL) { |
| primary1 = scriptOrder[primary1]; |
| } |
| }*/ |
| boolean isPrimaryByteIgnorable = (ce & CE_PRIMARY_MASK_) == 0; |
| // actually we can just check that the first byte is 0 |
| // generation stuffs the order left first |
| boolean isSmallerThanVariableTop = (ce >>> CE_PRIMARY_SHIFT_) |
| <= m_variableTopValue_; |
| doShift = (m_isAlternateHandlingShifted_ |
| && ((notIsContinuation && isSmallerThanVariableTop |
| && !isPrimaryByteIgnorable) // primary byte not 0 |
| || (!notIsContinuation && doShift)) |
| || (doShift && isPrimaryByteIgnorable)); |
| if (doShift && isPrimaryByteIgnorable) { |
| // amendment to the UCA says that primary ignorables and other |
| // ignorables should be removed if following a shifted code |
| // point |
| // if we were shifted and we got an ignorable code point |
| // we should just completely ignore it |
| continue; |
| } |
| leadPrimary = doPrimaryBytes(ce, bytes, bytescount, count, |
| notIsContinuation, doShift, leadPrimary, |
| commonBottom4, bottomCount4); |
| if (doShift) { |
| continue; |
| } |
| if (compare[2]) { |
| doSecondaryBytes(ce, bytes, bytescount, count, |
| notIsContinuation, doFrench, frenchOffset); |
| } |
| |
| int t = ce & LAST_BYTE_MASK_; |
| if (!notIsContinuation) { |
| t = ce & CE_REMOVE_CONTINUATION_MASK_; |
| } |
| |
| if (compare[0]) { |
| caseShift = doCaseBytes(t, bytes, bytescount, |
| notIsContinuation, caseShift); |
| } |
| else if (notIsContinuation) { |
| t ^= m_caseSwitch_; |
| } |
| |
| t &= m_mask3_; |
| |
| if (compare[3]) { |
| doTertiaryBytes(t, bytes, bytescount, count, |
| notIsContinuation); |
| } |
| |
| if (compare[4] && notIsContinuation) { // compare quad |
| doQuaternaryBytes(bytes, bytescount, count, |
| coleiter.m_isCodePointHiragana_, |
| commonBottom4, bottomCount4, hiragana4); |
| } |
| } |
| setDecomposition(backupDecomposition); // reverts to original |
| if (frenchOffset[0] != -1) { |
| // one last round of checks |
| reverseBuffer(bytes[2], frenchOffset); |
| } |
| } |
| |
| /** |
| * From the individual strength byte results the final compact sortkey |
| * will be calculated. |
| * @param source text string |
| * @param compare array of flags indicating if a particular strength is |
| * to be processed |
| * @param bytes an array of byte arrays corresponding to the strengths |
| * @param bytescount an array of the size of the byte arrays |
| * @param count array of compression counters for each strength |
| * @param doFrench flag indicating that special handling of French has to |
| * be done |
| * @param commonBottom4 smallest common quaternary byte |
| * @param bottomCount4 smallest quaternary byte |
| * @return the compact sortkey |
| */ |
| private final byte[] getSortKey(String source, boolean compare[], |
| byte bytes[][], int bytescount[], |
| int count[], boolean doFrench, |
| int commonBottom4, int bottomCount4) |
| { |
| // we have done all the CE's, now let's put them together to form |
| // a key |
| if (compare[2]) { |
| doSecondary(bytes, bytescount, count, doFrench); |
| if (compare[0]) { |
| doCase(bytes, bytescount); |
| } |
| if (compare[3]) { |
| doTertiary(bytes, bytescount, count); |
| if (compare[4]) { |
| doQuaternary(bytes, bytescount, count, commonBottom4, |
| bottomCount4); |
| if (compare[5]) { |
| doIdentical(source, bytes, bytescount); |
| } |
| |
| } |
| } |
| } |
| append(bytes, bytescount, 1, (byte)0); |
| return bytes[1]; |
| } |
| |
| /** |
| * Packs the French bytes |
| * @param bytes array of byte arrays corresponding to strenghts |
| * @param bytescount array of the size of byte arrays |
| * @param count array of compression counts |
| */ |
| private final void doFrench(byte bytes[][], int bytescount[], int count[]) |
| { |
| for (int i = 0; i < bytescount[2]; i ++) { |
| byte s = bytes[2][bytescount[2] - i - 1]; |
| // This is compression code. |
| if (s == COMMON_2_) { |
| ++ count[2]; |
| } |
| else { |
| if (count[2] > 0) { |
| // getting the unsigned value |
| if ((s & LAST_BYTE_MASK_) > COMMON_2_) { |
| // not necessary for 4th level. |
| while (count[2] > TOP_COUNT_2_) { |
| append(bytes, bytescount, 1, |
| (byte)(COMMON_TOP_2_ - TOP_COUNT_2_)); |
| count[2] -= TOP_COUNT_2_; |
| } |
| append(bytes, bytescount, 1, (byte)(COMMON_TOP_2_ |
| - (count[2] - 1))); |
| } |
| else { |
| while (count[2] > BOTTOM_COUNT_2_) { |
| append(bytes, bytescount, 1, |
| (byte)(COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_)); |
| count[2] -= BOTTOM_COUNT_2_; |
| } |
| append(bytes, bytescount, 1, (byte)(COMMON_BOTTOM_2_ |
| + (count[2] - 1))); |
| } |
| count[2] = 0; |
| } |
| append(bytes, bytescount, 1, s); |
| } |
| } |
| if (count[2] > 0) { |
| while (count[2] > BOTTOM_COUNT_2_) { |
| append(bytes, bytescount, 1, (byte)(COMMON_BOTTOM_2_ |
| + BOTTOM_COUNT_2_)); |
| count[2] -= BOTTOM_COUNT_2_; |
| } |
| append(bytes, bytescount, 1, (byte)(COMMON_BOTTOM_2_ |
| + (count[2] - 1))); |
| } |
| } |
| |
| /** |
| * Compacts the secondary bytes and stores them into the primary array |
| * @param bytes array of byte arrays corresponding to the strengths |
| * @param bytecount array of the size of the byte arrays |
| * @param count array of the number of compression counts |
| * @param doFrench flag indicator that French has to be handled specially |
| */ |
| private final void doSecondary(byte bytes[][], int bytescount[], |
| int count[], boolean doFrench) |
| { |
| if (count[2] > 0) { |
| while (count[2] > BOTTOM_COUNT_2_) { |
| append(bytes, bytescount, 2, (byte)(COMMON_BOTTOM_2_ |
| + BOTTOM_COUNT_2_)); |
| count[2] -= BOTTOM_COUNT_2_; |
| } |
| append(bytes, bytescount, 2, (byte)(COMMON_BOTTOM_2_ + |
| (count[2] - 1))); |
| } |
| |
| append(bytes, bytescount, 1, SORT_LEVEL_TERMINATOR_); |
| |
| if (doFrench) { // do the reverse copy |
| doFrench(bytes, bytescount, count); |
| } |
| else { |
| if (bytes[1].length <= bytescount[1] + bytescount[2]) { |
| bytes[1] = increase(bytes[1], bytescount[1], bytescount[2]); |
| } |
| System.arraycopy(bytes[2], 0, bytes[1], bytescount[1], |
| bytescount[2]); |
| bytescount[1] += bytescount[2]; |
| } |
| } |
| |
| /** |
| * Increase buffer size |
| * @param array array of bytes |
| * @param size of the byte array |
| * @param incrementsize size to increase |
| * @return the new buffer |
| */ |
| private static final byte[] increase(byte buffer[], int size, |
| int incrementsize) |
| { |
| byte result[] = new byte[buffer.length + incrementsize]; |
| System.arraycopy(buffer, 0, result, 0, size); |
| return result; |
| } |
| |
| /** |
| * Increase buffer size |
| * @param array array of bytes |
| * @param size of the byte array |
| * @param incrementsize size to increase |
| * @return the new buffer |
| */ |
| private static final int[] increase(int buffer[], int size, |
| int incrementsize) |
| { |
| int result[] = new int[buffer.length + incrementsize]; |
| System.arraycopy(buffer, 0, result, 0, size); |
| return result; |
| } |
| |
| /** |
| * Compacts the case bytes and stores them into the primary array |
| * @param bytes array of byte arrays corresponding to the strengths |
| * @param bytecount array of the size of the byte arrays |
| */ |
| private final void doCase(byte bytes[][], int bytescount[]) |
| { |
| append(bytes, bytescount, 1, SORT_LEVEL_TERMINATOR_); |
| if (bytes[1].length <= bytescount[1] + bytescount[0]) { |
| bytes[1] = increase(bytes[1], bytescount[1], bytescount[0]); |
| } |
| if (bytes[1].length <= bytescount[1] + bytescount[0]) { |
| bytes[1] = increase(bytes[1], bytescount[1], bytescount[0]); |
| } |
| System.arraycopy(bytes[0], 0, bytes[1], bytescount[1], bytescount[0]); |
| bytescount[1] += bytescount[0]; |
| } |
| |
| /** |
| * Compacts the tertiary bytes and stores them into the primary array |
| * @param bytes array of byte arrays corresponding to the strengths |
| * @param bytecount array of the size of the byte arrays |
| * @param count array of the number of compression counts |
| */ |
| private final void doTertiary(byte bytes[][], int bytescount[], |
| int count[]) |
| { |
| if (count[3] > 0) { |
| if (m_common3_ != COMMON_BOTTOM_3_) { |
| while (count[3] >= m_topCount3_) { |
| append(bytes, bytescount, 3, (byte)(m_top3_ |
| - m_topCount3_)); |
| count[3] -= m_topCount3_; |
| } |
| append(bytes, bytescount, 3, (byte)(m_top3_ - count[3])); |
| } |
| else { |
| while (count[3] > m_bottomCount3_) { |
| append(bytes, bytescount, 3, (byte)(m_bottom3_ |
| + m_bottomCount3_)); |
| count[3] -= m_bottomCount3_; |
| } |
| append(bytes, bytescount, 3, (byte)(m_bottom3_ |
| + (count[3] - 1))); |
| } |
| } |
| append(bytes, bytescount, 1, SORT_LEVEL_TERMINATOR_); |
| if (bytes[1].length <= bytescount[1] + bytescount[3]) { |
| bytes[1] = increase(bytes[1], bytescount[1], bytescount[3]); |
| } |
| System.arraycopy(bytes[3], 0, bytes[1], bytescount[1], bytescount[3]); |
| bytescount[1] += bytescount[3]; |
| } |
| |
| /** |
| * Compacts the quaternary bytes and stores them into the primary array |
| * @param bytes array of byte arrays corresponding to the strengths |
| * @param bytecount array of the size of the byte arrays |
| * @param count array of compression counts |
| */ |
| private final void doQuaternary(byte bytes[][], int bytescount[], |
| int count[], int commonbottom4, |
| int bottomcount4) |
| { |
| if (count[4] > 0) { |
| while (count[4] > bottomcount4) { |
| append(bytes, bytescount, 4, (byte)(commonbottom4 |
| + bottomcount4)); |
| count[4] -= bottomcount4; |
| } |
| append(bytes, bytescount, 4, (byte)(commonbottom4 |
| + (count[4] - 1))); |
| } |
| append(bytes, bytescount, 1, SORT_LEVEL_TERMINATOR_); |
| if (bytes[1].length <= bytescount[1] + bytescount[4]) { |
| bytes[1] = increase(bytes[1], bytescount[1], bytescount[4]); |
| } |
| System.arraycopy(bytes[4], 0, bytes[1], bytescount[1], bytescount[4]); |
| bytescount[1] += bytescount[4]; |
| } |
| |
| /** |
| * Deals with the identical sort. |
| * Appends the BOCSU version of the source string to the ends of the |
| * byte buffer. |
| * @param source text string |
| * @param bytes array of a byte array corresponding to the strengths |
| * @param bytescount array of the byte array size |
| */ |
| private final void doIdentical(String source, byte bytes[][], |
| int bytescount[]) |
| { |
| int isize = BOCU.getCompressionLength(source); |
| append(bytes, bytescount, 1, SORT_LEVEL_TERMINATOR_); |
| if (bytes[1].length <= bytescount[1] + isize) { |
| bytes[1] = increase(bytes[1], bytescount[1], 1 + isize); |
| } |
| bytescount[1] = BOCU.compress(source, bytes[1], bytescount[1]); |
| } |
| |
| /** |
| * Gets the offset of the first unmatched characters in source and target. |
| * This method returns the offset of the start of a contraction or a |
| * combining sequence, if the first difference is in the middle of such a |
| * sequence. |
| * @param source string |
| * @param target string |
| * @return offset of the first unmatched characters in source and target. |
| */ |
| private final int getFirstUnmatchedOffset(String source, String target) |
| { |
| int result = 0; |
| int slength = source.length(); |
| int tlength = target.length(); |
| int minlength = slength; |
| if (minlength > tlength) { |
| minlength = tlength; |
| } |
| while (result < minlength |
| && source.charAt(result) == target.charAt(result)) { |
| result ++; |
| } |
| if (result > 0) { |
| // There is an identical portion at the beginning of the two |
| // strings. If the identical portion ends within a contraction or a |
| // combining character sequence, back up to the start of that |
| // sequence. |
| char schar = 0; |
| char tchar = 0; |
| if (result < minlength) { |
| schar = source.charAt(result); // first differing chars |
| tchar = target.charAt(result); |
| } |
| else { |
| if (slength == tlength) { |
| return result; |
| } |
| else if (slength < tlength) { |
| tchar = target.charAt(result); |
| } |
| else { |
| schar = source.charAt(result); |
| } |
| } |
| if (isUnsafe(schar) || isUnsafe(tchar)) |
| { |
| // We are stopped in the middle of a contraction or combining |
| // sequence. |
| // Look backwards for the part of the string for the start of |
| // the sequence |
| // It doesn't matter which string we scan, since they are the |
| // same in this region. |
| do { |
| result --; |
| } |
| while (result > 0 && isUnsafe(source.charAt(result))); |
| } |
| } |
| return result; |
| } |
| |
| /** |
| * Appending an byte to an array of bytes and increases it if we run out of |
| * space |
| * @param array of byte arrays |
| * @param array of the end offsets corresponding to array |
| * @param appendarrayindex of the int array to append |
| * @param value to append |
| */ |
| private static final void append(byte array[][], int arrayoffset[], |
| int appendarrayindex, byte value) |
| { |
| if (arrayoffset[appendarrayindex] + 1 |
| >= array[appendarrayindex].length) { |
| array[appendarrayindex] = increase(array[appendarrayindex], |
| arrayoffset[appendarrayindex], |
| SORT_BUFFER_INIT_SIZE_); |
| } |
| array[appendarrayindex][arrayoffset[appendarrayindex]] = value; |
| arrayoffset[appendarrayindex] ++; |
| } |
| |
| /** |
| * This is a trick string compare function that goes in and uses sortkeys |
| * to compare. It is used when compare gets in trouble and needs to bail |
| * out. |
| * @param source text string |
| * @param target text string |
| */ |
| private final int compareBySortKeys(String source, String target) |
| |
| { |
| CollationKey sourcekey = getCollationKey(source); |
| CollationKey targetkey = getCollationKey(target); |
| return sourcekey.compareTo(targetkey); |
| } |
| |
| /** |
| * Performs the primary comparisons, and fills up the CE buffer at the |
| * same time. |
| * The return value toggles between the comparison result and the hiragana |
| * result. If either the source is greater than target or vice versa, the |
| * return result is the comparison result, ie 1 or -1, furthermore the |
| * cebuffers will be cleared when that happens. If the primary comparisons |
| * are equal, we'll have to continue with secondary comparison. In this case |
| * the cebuffer will not be cleared and the return result will be the |
| * hiragana result. |
| * @param doHiragana4 flag indicator that Hiragana Quaternary has to be |
| * observed |
| * @param lowestpvalue the lowest primary value that will not be ignored if |
| * alternate handling is shifted |
| * @param source text string |
| * @param target text string |
| * @param textoffset offset in text to start the comparison |
| * @param cebuffer array of CE buffers to populate, offset 0 for source, |
| * 1 for target, cleared when a primary difference is |
| * found. |
| * @param cebuffersize array of CE buffer size corresponding to the |
| * cebuffer, 0 when a primary difference is found. |
| * @return comparion result if a primary difference is found, otherwise |
| * hiragana result |
| */ |
| private final int doPrimaryCompare(boolean doHiragana4, int lowestpvalue, |
| String source, String target, |
| int textoffset, int cebuffer[][], |
| int cebuffersize[]) |
| |
| { |
| // Preparing the context objects for iterating over strings |
| StringCharacterIterator siter = new StringCharacterIterator(source, |
| textoffset, |
| source.length(), |
| textoffset); |
| CollationElementIterator scoleiter = new CollationElementIterator( |
| siter, this); |
| StringCharacterIterator titer = new StringCharacterIterator(target, |
| textoffset, |
| target.length(), |
| textoffset); |
| CollationElementIterator tcoleiter = new CollationElementIterator( |
| titer, this); |
| |
| // Non shifted primary processing is quite simple |
| if (!m_isAlternateHandlingShifted_) { |
| int hiraganaresult = 0; |
| while (true) { |
| int sorder = 0; |
| // We fetch CEs until we hit a non ignorable primary or end. |
| do { |
| sorder = scoleiter.next(); |
| append(cebuffer, cebuffersize, 0, sorder); |
| sorder &= CE_PRIMARY_MASK_; |
| } while (sorder == CollationElementIterator.IGNORABLE); |
| |
| int torder = 0; |
| do { |
| torder = tcoleiter.next(); |
| append(cebuffer, cebuffersize, 1, torder); |
| torder &= CE_PRIMARY_MASK_; |
| } while (torder == CollationElementIterator.IGNORABLE); |
| |
| // if both primaries are the same |
| if (sorder == torder) { |
| // and there are no more CEs, we advance to the next level |
| if (cebuffer[0][cebuffersize[0] - 1] |
| == CollationElementIterator.NULLORDER) { |
| break; |
| } |
| if (doHiragana4 && hiraganaresult == 0 |
| && scoleiter.m_isCodePointHiragana_ != |
| tcoleiter.m_isCodePointHiragana_) { |
| if (scoleiter.m_isCodePointHiragana_) { |
| hiraganaresult = -1; |
| } |
| else { |
| hiraganaresult = 1; |
| } |
| } |
| } |
| else { |
| // if two primaries are different, we are done |
| return endPrimaryCompare(sorder, torder, cebuffer, |
| cebuffersize); |
| } |
| } |
| // no primary difference... do the rest from the buffers |
| return hiraganaresult; |
| } |
| else { // shifted - do a slightly more complicated processing :) |
| while (true) { |
| int sorder = getPrimaryShiftedCompareCE(scoleiter, lowestpvalue, |
| cebuffer, cebuffersize, 0); |
| int torder = getPrimaryShiftedCompareCE(tcoleiter, lowestpvalue, |
| cebuffer, cebuffersize, 1); |
| if (sorder == torder) { |
| if (cebuffer[0][cebuffersize[0] - 1] |
| == CollationElementIterator.NULLORDER) { |
| break; |
| } |
| else { |
| continue; |
| } |
| } |
| else { |
| return endPrimaryCompare(sorder, torder, cebuffer, |
| cebuffersize); |
| } |
| } // no primary difference... do the rest from the buffers |
| } |
| return 0; |
| } |
| |
| /** |
| * This is used only for primary strength when we know that sorder is |
| * already different from torder. |
| * Compares sorder and torder, returns -1 if sorder is less than torder. |
| * Clears the cebuffer at the same time. |
| * @param sorder source strength order |
| * @param torder target strength order |
| * @param cebuffer array of buffers containing the ce values |
| * @param cebuffersize array of cebuffer offsets |
| * @return the comparison result of sorder and torder |
| */ |
| private static final int endPrimaryCompare(int sorder, int torder, |
| int cebuffer[][], |
| int cebuffersize[]) |
| { |
| // if we reach here, the ce offset accessed is the last ce |
| // appended to the buffer |
| boolean isSourceNullOrder = (cebuffer[0][cebuffersize[0] - 1] |
| == CollationElementIterator.NULLORDER); |
| boolean isTargetNullOrder = (cebuffer[1][cebuffersize[1] - 1] |
| == CollationElementIterator.NULLORDER); |
| cebuffer[0] = null; |
| cebuffer[1] = null; |
| cebuffersize[0] = 0; |
| cebuffersize[1] = 0; |
| if (isSourceNullOrder) { |
| return -1; |
| } |
| if (isTargetNullOrder) { |
| return 1; |
| } |
| // getting rid of the sign |
| sorder >>>= CE_PRIMARY_SHIFT_; |
| torder >>>= CE_PRIMARY_SHIFT_; |
| if (sorder < torder) { |
| return -1; |
| } |
| return 1; |
| } |
| |
| /** |
| * Calculates the next primary shifted value and fills up cebuffer with the |
| * next non-ignorable ce. |
| * @param coleiter collation element iterator |
| * @param doHiragana4 flag indicator if hiragana quaternary is to be |
| * handled |
| * @param lowestpvalue lowest primary shifted value that will not be |
| * ignored |
| * @param cebuffer array of buffers to append with the next ce |
| * @param cebuffersize array of offsets corresponding to the cebuffer |
| * @param cebufferindex index of the buffer to append to |
| * @return result next modified ce |
| */ |
| private final static int getPrimaryShiftedCompareCE( |
| CollationElementIterator coleiter, |
| int lowestpvalue, int cebuffer[][], |
| int cebuffersize[], int cebufferindex) |
| |
| { |
| boolean shifted = false; |
| int result = CollationElementIterator.IGNORABLE; |
| while (true) { |
| result = coleiter.next(); |
| if (result == CollationElementIterator.NULLORDER) { |
| append(cebuffer, cebuffersize, cebufferindex, result); |
| break; |
| } |
| else if (result == CollationElementIterator.IGNORABLE |
| || (shifted |
| && (result & CE_PRIMARY_MASK_) |
| == CollationElementIterator.IGNORABLE)) { |
| // UCA amendment - ignore ignorables that follow shifted code |
| // points |
| continue; |
| } |
| else if (isContinuation(result)) { |
| if ((result & CE_PRIMARY_MASK_) |
| != CollationElementIterator.IGNORABLE) { |
| // There is primary value |
| if (shifted) { |
| result = (result & CE_PRIMARY_MASK_) |
| | CE_CONTINUATION_MARKER_; |
| // preserve interesting continuation |
| append(cebuffer, cebuffersize, cebufferindex, result); |
| continue; |
| } |
| else { |
| append(cebuffer, cebuffersize, cebufferindex, result); |
| break; |
| } |
| } |
| else { // Just lower level values |
| if (!shifted) { |
| append(cebuffer, cebuffersize, cebufferindex, result); |
| } |
| } |
| } |
| else { // regular |
| if ((result & CE_PRIMARY_MASK_) > lowestpvalue) { |
| append(cebuffer, cebuffersize, cebufferindex, result); |
| break; |
| } |
| else { |
| if ((result & CE_PRIMARY_MASK_) > 0) { |
| shifted = true; |
| result &= CE_PRIMARY_MASK_; |
| append(cebuffer, cebuffersize, cebufferindex, result); |
| continue; |
| } |
| else { |
| append(cebuffer, cebuffersize, cebufferindex, result); |
| shifted = false; |
| continue; |
| } |
| } |
| } |
| } |
| result &= CE_PRIMARY_MASK_; |
| return result; |
| } |
| |
| /** |
| * Appending an int to an array of ints and increases it if we run out of |
| * space |
| * @param array of int arrays |
| * @param array of the end offsets corresponding to array |
| * @param appendarrayindex of the int array to append |
| * @param value to append |
| */ |
| private static final void append(int array[][], int arrayoffset[], |
| int appendarrayindex, int value) |
| { |
| if (arrayoffset[appendarrayindex] + 1 |
| >= array[appendarrayindex].length) { |
| array[appendarrayindex] = increase(array[appendarrayindex], |
| arrayoffset[appendarrayindex], |
| CE_BUFFER_SIZE_); |
| } |
| array[appendarrayindex][arrayoffset[appendarrayindex]] = value; |
| arrayoffset[appendarrayindex] ++; |
| } |
| |
| /** |
| * Does secondary strength comparison based on the collected ces. |
| * @param cebuffer array of int arrays that contains the collected ces |
| * @param cebuffersize array of offsets corresponding to the cebuffer, |
| * indicates the offset of the last ce in buffer |
| * @param doFrench flag indicates if French ordering is to be done |
| * @return the secondary strength comparison result |
| */ |
| private static final int doSecondaryCompare(int cebuffer[][], |
| int cebuffersize[], |
| boolean doFrench) |
| { |
| // now, we're gonna reexamine collected CEs |
| if (!doFrench) { // normal |
| int soffset = 0; |
| int toffset = 0; |
| while (true) { |
| int sorder = CollationElementIterator.IGNORABLE; |
| while (sorder == CollationElementIterator.IGNORABLE) { |
| sorder = cebuffer[0][soffset ++] & CE_SECONDARY_MASK_; |
| } |
| int torder = CollationElementIterator.IGNORABLE; |
| while (torder == CollationElementIterator.IGNORABLE) { |
| torder = cebuffer[1][toffset ++] & CE_SECONDARY_MASK_; |
| } |
| |
| if (sorder == torder) { |
| if (cebuffer[0][soffset - 1] |
| == CollationElementIterator.NULLORDER) { |
| break; |
| } |
| } |
| else { |
| if (cebuffer[0][soffset - 1] == |
| CollationElementIterator.NULLORDER) { |
| return -1; |
| } |
| if (cebuffer[1][toffset - 1] == |
| CollationElementIterator.NULLORDER) { |
| return 1; |
| } |
| return (sorder < torder) ? -1 : 1; |
| } |
| } |
| } |
| else { // do the French |
| int continuationoffset[] = {0, 0}; |
| int offset[] = {cebuffersize[0] - 2, cebuffersize[1] - 2} ; |
| while (true) { |
| int sorder = getSecondaryFrenchCE(cebuffer, offset, |
| continuationoffset, 0); |
| int torder = getSecondaryFrenchCE(cebuffer, offset, |
| continuationoffset, 1); |
| if (sorder == torder) { |
| if ((offset[0] < 0 && offset[1] < 0) |
| || cebuffer[0][offset[0]] |
| == CollationElementIterator.NULLORDER) { |
| break; |
| } |
| } |
| else { |
| return (sorder < torder) ? -1 : 1; |
| } |
| } |
| } |
| return 0; |
| } |
| |
| /** |
| * Calculates the next secondary french CE. |
| * @param cebuffer array of buffers to append with the next ce |
| * @param offset array of offsets corresponding to the cebuffer |
| * @param continuationoffset index of the start of a continuation |
| * @param index of cebuffer to use |
| * @return result next modified ce |
| */ |
| private static final int getSecondaryFrenchCE(int cebuffer[][], |
| int offset[], |
| int continuationoffset[], |
| int index) |
| { |
| int result = CollationElementIterator.IGNORABLE; |
| while (result == CollationElementIterator.IGNORABLE |
| && offset[index] >= 0) { |
| if (continuationoffset[index] == 0) { |
| result = cebuffer[index][offset[index]]; |
| while (isContinuation(cebuffer[index][offset[index] --])); |
| // after this, sorder is at the start of continuation, |
| // and offset points before that |
| if (isContinuation(cebuffer[index][offset[index] + 1])) { |
| // save offset for later |
| continuationoffset[index] = offset[index]; |
| offset[index] += 2; |
| } |
| //} |
| } |
| else { |
| result = cebuffer[index][offset[index] ++]; |
| if (!isContinuation(result)) { |
| // we have finished with this continuation |
| offset[index] = continuationoffset[index]; |
| // reset the pointer to before continuation |
| continuationoffset[index] = 0; |
| continue; |
| } |
| } |
| result &= CE_SECONDARY_MASK_; // remove continuation bit |
| } |
| return result; |
| } |
| |
| /** |
| * Does case strength comparison based on the collected ces. |
| * @param cebuffer array of int arrays that contains the collected ces |
| * @return the case strength comparison result |
| */ |
| private final int doCaseCompare(int cebuffer[][]) |
| { |
| int soffset = 0; |
| int toffset = 0; |
| while (true) { |
| int sorder = CollationElementIterator.IGNORABLE; |
| int torder = CollationElementIterator.IGNORABLE; |
| while ((sorder & CE_REMOVE_CASE_) |
| == CollationElementIterator.IGNORABLE) { |
| sorder = cebuffer[0][soffset ++]; |
| if (!isContinuation(sorder)) { |
| sorder &= CE_CASE_MASK_3_; |
| sorder ^= m_caseSwitch_; |
| } |
| else { |
| sorder = CollationElementIterator.IGNORABLE; |
| } |
| } |
| |
| while ((torder & CE_REMOVE_CASE_) |
| == CollationElementIterator.IGNORABLE) { |
| torder = cebuffer[1][toffset ++]; |
| if (!isContinuation(torder)) { |
| torder &= CE_CASE_MASK_3_; |
| torder ^= m_caseSwitch_; |
| } |
| else { |
| torder = CollationElementIterator.IGNORABLE; |
| } |
| } |
| |
| sorder &= CE_CASE_BIT_MASK_; |
| torder &= CE_CASE_BIT_MASK_; |
| if (sorder == torder) { |
| if (cebuffer[0][soffset - 1] |
| == CollationElementIterator.NULLORDER) { |
| break; |
| } |
| } |
| else { |
| if (cebuffer[0][soffset - 1] |
| == CollationElementIterator.NULLORDER) { |
| return -1; |
| } |
| if (cebuffer[1][soffset - 1] |
| == CollationElementIterator.NULLORDER) { |
| return 1; |
| } |
| return (sorder < torder) ? -1 : 1; |
| } |
| } |
| return 0; |
| } |
| |
| /** |
| * Does tertiary strength comparison based on the collected ces. |
| * @param cebuffer array of int arrays that contains the collected ces |
| * @return the tertiary strength comparison result |
| */ |
| private final int doTertiaryCompare(int cebuffer[][]) |
| { |
| int soffset = 0; |
| int toffset = 0; |
| while (true) { |
| int sorder = CollationElementIterator.IGNORABLE; |
| int torder = CollationElementIterator.IGNORABLE; |
| while ((sorder & CE_REMOVE_CASE_) |
| == CollationElementIterator.IGNORABLE) { |
| sorder = cebuffer[0][soffset ++] & m_mask3_; |
| if (!isContinuation(sorder)) { |
| sorder ^= m_caseSwitch_; |
| } |
| else { |
| sorder &= CE_REMOVE_CASE_; |
| } |
| } |
| |
| while ((torder & CE_REMOVE_CASE_) |
| == CollationElementIterator.IGNORABLE) { |
| torder = cebuffer[1][toffset ++] & m_mask3_; |
| if (!isContinuation(torder)) { |
| torder ^= m_caseSwitch_; |
| } |
| else { |
| torder &= CE_REMOVE_CASE_; |
| } |
| } |
| |
| if (sorder == torder) { |
| if (cebuffer[0][soffset - 1] |
| == CollationElementIterator.NULLORDER) { |
| break; |
| } |
| } |
| else { |
| if (cebuffer[0][soffset - 1] == |
| CollationElementIterator.NULLORDER) { |
| return -1; |
| } |
| if (cebuffer[1][toffset - 1] == |
| CollationElementIterator.NULLORDER) { |
| return 1; |
| } |
| return (sorder < torder) ? -1 : 1; |
| } |
| } |
| return 0; |
| } |
| |
| /** |
| * Does quaternary strength comparison based on the collected ces. |
| * @param cebuffer array of int arrays that contains the collected ces |
| * @param lowestpvalue the lowest primary value that will not be ignored if |
| * alternate handling is shifted |
| * @return the quaternary strength comparison result |
| */ |
| private final int doQuaternaryCompare(int cebuffer[][], int lowestpvalue) |
| { |
| boolean sShifted = true; |
| boolean tShifted = true; |
| int soffset = 0; |
| int toffset = 0; |
| while (true) { |
| int sorder = CollationElementIterator.IGNORABLE; |
| int torder = CollationElementIterator.IGNORABLE; |
| while (sorder == CollationElementIterator.IGNORABLE |
| || (isContinuation(sorder) && !sShifted)) { |
| sorder = cebuffer[0][soffset ++]; |
| if (isContinuation(sorder)) { |
| if (!sShifted) { |
| continue; |
| } |
| } |
| else if (sorder > lowestpvalue |
| || (sorder & CE_PRIMARY_MASK_) |
| == CollationElementIterator.IGNORABLE) { |
| // non continuation |
| sorder = CE_PRIMARY_MASK_; |
| sShifted = false; |
| } |
| else { |
| sShifted = true; |
| } |
| } |
| sorder >>>= CE_PRIMARY_SHIFT_; |
| while (torder == CollationElementIterator.IGNORABLE |
| || (isContinuation(torder) && !tShifted)) { |
| torder = cebuffer[1][toffset ++]; |
| if (isContinuation(torder)) { |
| if (!tShifted) { |
| continue; |
| } |
| } |
| else if (torder > lowestpvalue |
| || (torder & CE_PRIMARY_MASK_) |
| == CollationElementIterator.IGNORABLE) { |
| // non continuation |
| torder = CE_PRIMARY_MASK_; |
| tShifted = false; |
| } |
| else { |
| tShifted = true; |
| } |
| } |
| torder >>>= CE_PRIMARY_SHIFT_; |
| |
| if (sorder == torder) { |
| if (cebuffer[0][soffset - 1] |
| == CollationElementIterator.NULLORDER) { |
| break; |
| } |
| } |
| else { |
| if (cebuffer[0][soffset - 1] == |
| CollationElementIterator.NULLORDER) { |
| return -1; |
| } |
| if (cebuffer[1][toffset - 1] == |
| CollationElementIterator.NULLORDER) { |
| return 1; |
| } |
| return (sorder < torder) ? -1 : 1; |
| } |
| } |
| return 0; |
| } |
| |
| /** |
| * Internal function. Does byte level string compare. Used by strcoll if |
| * strength == identical and strings are otherwise equal. This is a rare |
| * case. Comparison must be done on NFD normalized strings. FCD is not good |
| * enough. |
| * @param source text |
| * @param target text |
| * @param offset of the first difference in the text strings |
| * @param normalize flag indicating if we are to normalize the text before |
| * comparison |
| * @return 1 if source is greater than target, -1 less than and 0 if equals |
| */ |
| private static final int doIdenticalCompare(String source, String target, |
| int offset, boolean normalize) |
| |
| { |
| if (normalize) { |
| if (Normalizer.quickCheck(source, Normalizer.NFD) |
| != Normalizer.YES) { |
| source = Normalizer.decompose(source, false); |
| } |
| |
| if (Normalizer.quickCheck(target, Normalizer.NFD) |
| != Normalizer.YES) { |
| target = Normalizer.decompose(target, false); |
| } |
| offset = 0; |
| } |
| |
| return doStringCompare(source, target, offset); |
| } |
| |
| /** |
| * Compares string for their codepoint order. |
| * This comparison handles surrogate characters and place them after the |
| * all non surrogate characters. |
| * @param source text |
| * @param target text |
| * @param offset start offset for comparison |
| * @return 1 if source is greater than target, -1 less than and 0 if equals |
| */ |
| private static final int doStringCompare(String source, |
| String target, |
| int offset) |
| { |
| // compare identical prefixes - they do not need to be fixed up |
| char schar = 0; |
| char tchar = 0; |
| int slength = source.length(); |
| int tlength = target.length(); |
| int minlength = Math.min(slength, tlength); |
| while (offset < minlength) { |
| schar = source.charAt(offset); |
| tchar = target.charAt(offset ++); |
| if (schar != tchar) { |
| break; |
| } |
| } |
| |
| if (schar == tchar && offset == minlength) { |
| if (slength > minlength) { |
| return 1; |
| } |
| if (tlength > minlength) { |
| return -1; |
| } |
| return 0; |
| } |
| |
| // if both values are in or above the surrogate range, Fix them up. |
| if (schar >= UTF16.LEAD_SURROGATE_MIN_VALUE |
| && tchar >= UTF16.LEAD_SURROGATE_MIN_VALUE) { |
| schar = fixupUTF16(schar); |
| tchar = fixupUTF16(tchar); |
| } |
| |
| // now c1 and c2 are in UTF-32-compatible order |
| return (schar < tchar) ? -1 : 1; // schar and tchar has to be different |
| } |
| |
| /** |
| * Rotate surrogates to the top to get code point order |
| */ |
| private static final char fixupUTF16(char ch) |
| { |
| if (ch >= 0xe000) { |
| ch -= 0x800; |
| } |
| else { |
| ch += 0x2000; |
| } |
| return ch; |
| } |
| |
| /** |
| * Checks that the source after offset is ignorable |
| * @param source text string to check |
| * @param offset |
| * @return true if source after offset is ignorable. false otherwise |
| */ |
| private final boolean checkIgnorable(String source, int offset) |
| |
| { |
| StringCharacterIterator siter = new StringCharacterIterator(source, |
| offset, source.length(), offset); |
| CollationElementIterator coleiter = new CollationElementIterator( |
| siter, this); |
| int ce = coleiter.next(); |
| while (ce != CollationElementIterator.NULLORDER) { |
| if (ce != CollationElementIterator.IGNORABLE) { |
| return false; |
| } |
| ce = coleiter.next(); |
| } |
| return true; |
| } |
| |
| /** |
| * Resets the internal case data members and compression values. |
| */ |
| private void updateInternalState() |
| { |
| if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) { |
| m_caseSwitch_ = CASE_SWITCH_; |
| } |
| else { |
| m_caseSwitch_ = NO_CASE_SWITCH_; |
| } |
| |
| if (m_isCaseLevel_ || m_caseFirst_ == AttributeValue.OFF_) { |
| m_mask3_ = CE_REMOVE_CASE_; |
| m_common3_ = COMMON_NORMAL_3_; |
| m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_OFF_; |
| m_top3_ = COMMON_TOP_CASE_SWITCH_OFF_3_; |
| m_bottom3_ = COMMON_BOTTOM_3_; |
| } |
| else { |
| m_mask3_ = CE_KEEP_CASE_; |
| m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_ON_; |
| if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) { |
| m_common3_ = COMMON_UPPER_FIRST_3_; |
| m_top3_ = COMMON_TOP_CASE_SWITCH_UPPER_3_; |
| m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_UPPER_3_; |
| } else { |
| m_common3_ = COMMON_NORMAL_3_; |
| m_top3_ = COMMON_TOP_CASE_SWITCH_LOWER_3_; |
| m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_LOWER_3_; |
| } |
| } |
| |
| // Set the compression values |
| int total3 = m_top3_ - COMMON_BOTTOM_3_ - 1; |
| // we multilply double with int, but need only int |
| m_topCount3_ = (int)(PROPORTION_3_ * total3); |
| m_bottomCount3_ = total3 - m_topCount3_; |
| |
| if (!m_isCaseLevel_ && getStrength() == AttributeValue.TERTIARY_ |
| && !m_isFrenchCollation_ && !m_isAlternateHandlingShifted_) { |
| m_isSimple3_ = true; |
| } |
| else { |
| m_isSimple3_ = false; |
| } |
| } |
| |
| /** |
| * Initializes the RuleBasedCollator |
| */ |
| private final void init() |
| { |
| for (m_minUnsafe_ = 0; m_minUnsafe_ < DEFAULT_MIN_HEURISTIC_; |
| m_minUnsafe_ ++) { |
| // Find the smallest unsafe char. |
| if (isUnsafe(m_minUnsafe_)) { |
| break; |
| } |
| } |
| |
| for (m_minContractionEnd_ = 0; |
| m_minContractionEnd_ < DEFAULT_MIN_HEURISTIC_; |
| m_minContractionEnd_ ++) { |
| // Find the smallest contraction-ending char. |
| if (isContractionEnd(m_minContractionEnd_)) { |
| break; |
| } |
| } |
| setStrength(m_defaultStrength_); |
| setDecomposition(m_defaultDecomposition_); |
| m_variableTopValue_ = m_defaultVariableTopValue_; |
| m_isFrenchCollation_ = m_defaultIsFrenchCollation_; |
| m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_; |
| m_isCaseLevel_ = m_defaultIsCaseLevel_; |
| m_caseFirst_ = m_defaultCaseFirst_; |
| m_isHiragana4_ = m_defaultIsHiragana4_; |
| updateInternalState(); |
| } |
| } |