| /** |
| ******************************************************************************* |
| * Copyright (C) 1996-2008, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| */ |
| package com.ibm.icu.text; |
| |
| import java.text.ParseException; |
| import java.util.Hashtable; |
| import java.util.Arrays; |
| import com.ibm.icu.lang.UCharacter; |
| import com.ibm.icu.impl.UCharacterProperty; |
| |
| /** |
| * Class for parsing collation rules, produces a list of tokens that will be |
| * turned into collation elements |
| * @author Syn Wee Quek |
| * @since release 2.2, June 7 2002 |
| */ |
| final class CollationRuleParser |
| { |
| // public data members --------------------------------------------------- |
| |
| // package private constructors ------------------------------------------ |
| |
    /**
     * <p>Creates a rule parser for the argument collation rules.
     * Extracts the copy/remove UnicodeSets from the rules (see m_copySet_ /
     * m_removeSet_), normalizes the rules to NFD and trims them, then
     * initializes the parser state. Tokenization itself is driven by the
     * caller via assembleTokenList().
     * Please see the RuleBasedCollator class description for more details on
     * the collation rule syntax.</p>
     * @see java.util.Locale
     * @param rules the collation rules to build the collation table from.
     * @exception ParseException thrown when argument rules have an invalid
     *            syntax.
     */
    CollationRuleParser(String rules) throws ParseException
    {
        // pull the [copy]/[remove] sets out of the raw rules before normalizing
        extractSetsFromRules(rules);
        // parse the NFD form; leading/trailing whitespace is not significant
        m_source_ = new StringBuffer(Normalizer.decompose(rules, false).trim());
        m_rules_ = m_source_.toString();
        m_current_ = 0;
        // extra expansion characters are appended to m_source_ after the rules,
        // so the write position starts at the current end
        m_extraCurrent_ = m_source_.length();
        m_variableTop_ = null;
        m_parsedToken_ = new ParsedToken();
        m_hashTable_ = new Hashtable();
        // seed the option set with the UCA's default attribute values
        m_options_ = new OptionSet(RuleBasedCollator.UCA_);
        m_listHeader_ = new TokenListHeader[512];
        m_resultLength_ = 0;
        // call assembleTokenList() manually, so that we can
        // init a parser and manually parse tokens
        //assembleTokenList();
    }
| |
| // package private inner classes ----------------------------------------- |
| |
| /** |
| * Collation options set |
| */ |
| static class OptionSet |
| { |
| // package private constructor --------------------------------------- |
| |
| /** |
| * Initializes the option set with the argument collators |
| * @param collator option to use |
| */ |
| OptionSet(RuleBasedCollator collator) |
| { |
| m_variableTopValue_ = collator.m_variableTopValue_; |
| m_isFrenchCollation_ = collator.isFrenchCollation(); |
| m_isAlternateHandlingShifted_ |
| = collator.isAlternateHandlingShifted(); |
| m_caseFirst_ = collator.m_caseFirst_; |
| m_isCaseLevel_ = collator.isCaseLevel(); |
| m_decomposition_ = collator.getDecomposition(); |
| m_strength_ = collator.getStrength(); |
| m_isHiragana4_ = collator.m_isHiragana4_; |
| } |
| |
| // package private data members -------------------------------------- |
| |
| int m_variableTopValue_; |
| boolean m_isFrenchCollation_; |
| /** |
| * Attribute for handling variable elements |
| */ |
| boolean m_isAlternateHandlingShifted_; |
| /** |
| * who goes first, lower case or uppercase |
| */ |
| int m_caseFirst_; |
| /** |
| * do we have an extra case level |
| */ |
| boolean m_isCaseLevel_; |
| /** |
| * attribute for normalization |
| */ |
| int m_decomposition_; |
| /** |
| * attribute for strength |
| */ |
| int m_strength_; |
| /** |
| * attribute for special Hiragana |
| */ |
| boolean m_isHiragana4_; |
| } |
| |
    /**
     * List of tokens used by the collation rules.
     * One header exists per reset in the rules; it anchors a doubly linked
     * list of tokens tailored relative to that reset.
     */
    static class TokenListHeader
    {
        Token m_first_; // first token of the doubly linked list
        Token m_last_;  // last token of the doubly linked list
        Token m_reset_; // the reset token this list hangs off
        // true when the reset is an indirect position, e.g. [top] (see
        // INDIRECT_BOUNDARIES_)
        boolean m_indirect_;
        int m_baseCE_;     // CE of the reset position
        int m_baseContCE_; // continuation CE of the reset position
        int m_nextCE_;     // CE bound following the reset position
        int m_nextContCE_;
        int m_previousCE_; // CE bound preceding the reset position
        int m_previousContCE_;
        // per-strength bookkeeping used when CEs are generated for the list;
        // exact use is in the builder — sized by strength count
        int m_pos_[] = new int[Collator.IDENTICAL + 1];
        int m_gapsLo_[] = new int[3 * (Collator.TERTIARY + 1)];
        int m_gapsHi_[] = new int[3 * (Collator.TERTIARY + 1)];
        int m_numStr_[] = new int[3 * (Collator.TERTIARY + 1)];
        Token m_fStrToken_[] = new Token[Collator.TERTIARY + 1];
        Token m_lStrToken_[] = new Token[Collator.TERTIARY + 1];
    }
| |
| /** |
| * Token wrapper for collation rules |
| */ |
| static class Token |
| { |
| // package private data members --------------------------------------- |
| |
| int m_CE_[]; |
| int m_CELength_; |
| int m_expCE_[]; |
| int m_expCELength_; |
| int m_source_; |
| int m_expansion_; |
| int m_prefix_; |
| int m_strength_; |
| int m_toInsert_; |
| int m_polarity_; // 1 for <, <<, <<<, , ; and 0 for >, >>, >>> |
| TokenListHeader m_listHeader_; |
| Token m_previous_; |
| Token m_next_; |
| StringBuffer m_rules_; |
| char m_flags_; |
| |
| // package private constructors --------------------------------------- |
| |
| Token() |
| { |
| m_CE_ = new int[128]; |
| m_expCE_ = new int[128]; |
| // TODO: this should also handle reverse |
| m_polarity_ = TOKEN_POLARITY_POSITIVE_; |
| m_next_ = null; |
| m_previous_ = null; |
| m_CELength_ = 0; |
| m_expCELength_ = 0; |
| } |
| |
| // package private methods -------------------------------------------- |
| |
| /** |
| * Hashcode calculation for token |
| * @return the hashcode |
| */ |
| public int hashCode() |
| { |
| int result = 0; |
| int len = (m_source_ & 0xFF000000) >>> 24; |
| int inc = ((len - 32) / 32) + 1; |
| |
| int start = m_source_ & 0x00FFFFFF; |
| int limit = start + len; |
| |
| while (start < limit) { |
| result = (result * 37) + m_rules_.charAt(start); |
| start += inc; |
| } |
| return result; |
| } |
| |
| /** |
| * Equals calculation |
| * @param target object to compare |
| * @return true if target is the same as this object |
| */ |
| public boolean equals(Object target) |
| { |
| if (target == this) { |
| return true; |
| } |
| if (target instanceof Token) { |
| Token t = (Token)target; |
| int sstart = m_source_ & 0x00FFFFFF; |
| int tstart = t.m_source_ & 0x00FFFFFF; |
| int slimit = (m_source_ & 0xFF000000) >> 24; |
| int tlimit = (m_source_ & 0xFF000000) >> 24; |
| |
| int end = sstart + slimit - 1; |
| |
| if (m_source_ == 0 || t.m_source_ == 0) { |
| return false; |
| } |
| if (slimit != tlimit) { |
| return false; |
| } |
| if (m_source_ == t.m_source_) { |
| return true; |
| } |
| |
| while (sstart < end |
| && m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart)) |
| { |
| ++ sstart; |
| ++ tstart; |
| } |
| if (m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart)) { |
| return true; |
| } |
| } |
| return false; |
| } |
| } |
| |
    // package private data member -------------------------------------------

    /**
     * Strength marker indicating that a token is a reset, i.e. &amp; in the rules
     */
    static final int TOKEN_RESET_ = 0xDEADBEEF;

    /**
     * Number of token list headers currently in use in m_listHeader_
     */
    int m_resultLength_;
    /**
     * List of parsed tokens; one header per reset in the rules
     */
    TokenListHeader m_listHeader_[];
    /**
     * Variable top token, set when a token carries the variable top flag
     */
    Token m_variableTop_;
    /**
     * Collation options, seeded from the UCA defaults and updated by option
     * tags in the rules
     */
    OptionSet m_options_;
    /**
     * Normalized collation rules with some extra characters appended for
     * unquoting/expansions (see m_extraCurrent_)
     */
    StringBuffer m_source_;
    /**
     * Hash table to keep all tokens, keyed and valued by the token itself
     */
    Hashtable m_hashTable_;
| |
| // package private method ------------------------------------------------ |
| |
| void setDefaultOptionsInCollator(RuleBasedCollator collator) |
| { |
| collator.m_defaultStrength_ = m_options_.m_strength_; |
| collator.m_defaultDecomposition_ = m_options_.m_decomposition_; |
| collator.m_defaultIsFrenchCollation_ = m_options_.m_isFrenchCollation_; |
| collator.m_defaultIsAlternateHandlingShifted_ |
| = m_options_.m_isAlternateHandlingShifted_; |
| collator.m_defaultIsCaseLevel_ = m_options_.m_isCaseLevel_; |
| collator.m_defaultCaseFirst_ = m_options_.m_caseFirst_; |
| collator.m_defaultIsHiragana4_ = m_options_.m_isHiragana4_; |
| collator.m_defaultVariableTopValue_ = m_options_.m_variableTopValue_; |
| } |
| |
| // private inner classes ------------------------------------------------- |
| |
| /** |
| * This is a token that has been parsed but not yet processed. Used to |
| * reduce the number of arguments in the parser |
| */ |
| private static class ParsedToken |
| { |
| // private constructor ---------------------------------------------- |
| |
| /** |
| * Empty constructor |
| */ |
| ParsedToken() |
| { |
| m_charsLen_ = 0; |
| m_charsOffset_ = 0; |
| m_extensionLen_ = 0; |
| m_extensionOffset_ = 0; |
| m_prefixLen_ = 0; |
| m_prefixOffset_ = 0; |
| m_flags_ = 0; |
| m_strength_ = TOKEN_UNSET_; |
| } |
| |
| // private data members --------------------------------------------- |
| |
| int m_strength_; |
| int m_charsOffset_; |
| int m_charsLen_; |
| int m_extensionOffset_; |
| int m_extensionLen_; |
| int m_prefixOffset_; |
| int m_prefixLen_; |
| char m_flags_; |
| char m_indirectIndex_; |
| } |
| |
| /** |
| * Boundary wrappers |
| */ |
| private static class IndirectBoundaries |
| { |
| // package private constructor --------------------------------------- |
| |
| IndirectBoundaries(int startce[], int limitce[]) |
| { |
| // Set values for the top - TODO: once we have values for all the |
| // indirects, we are going to initalize here. |
| m_startCE_ = startce[0]; |
| m_startContCE_ = startce[1]; |
| if (limitce != null) { |
| m_limitCE_ = limitce[0]; |
| m_limitContCE_ = limitce[1]; |
| } |
| else { |
| m_limitCE_ = 0; |
| m_limitContCE_ = 0; |
| } |
| } |
| |
| // package private data members -------------------------------------- |
| |
| int m_startCE_; |
| int m_startContCE_; |
| int m_limitCE_; |
| int m_limitContCE_; |
| } |
| |
| /** |
| * Collation option rule tag |
| */ |
| private static class TokenOption |
| { |
| // package private constructor --------------------------------------- |
| |
| TokenOption(String name, int attribute, String suboptions[], |
| int suboptionattributevalue[]) |
| { |
| m_name_ = name; |
| m_attribute_ = attribute; |
| m_subOptions_ = suboptions; |
| m_subOptionAttributeValues_ = suboptionattributevalue; |
| } |
| |
| // package private data member --------------------------------------- |
| |
| private String m_name_; |
| private int m_attribute_; |
| private String m_subOptions_[]; |
| private int m_subOptionAttributeValues_[]; |
| } |
| |
    // private variables -----------------------------------------------------

    /**
     * Current parsed token, reused across parsing calls
     */
    private ParsedToken m_parsedToken_;
    /**
     * Collation rule; String snapshot of m_source_ taken at construction
     */
    private String m_rules_;
    // current parse position within the rules
    private int m_current_;
    /**
     * End of the option while reading.
     * Need it for UnicodeSet reading support.
     */
    private int m_optionEnd_;
    /*
     * Current offset in m_source
     */
    //private int m_sourceLimit_;
    /**
     * Offset into m_source_ for the extra expansion characters
     */
    private int m_extraCurrent_;

    /**
     * UnicodeSet that contains code points to be copied from the UCA
     */
    UnicodeSet m_copySet_;

    /**
     * UnicodeSet that contains code points for which we want to remove
     * UCA contractions. It implies copying of these code points from
     * the UCA.
     */
    UnicodeSet m_removeSet_;
    /*
     * This is space for the extra strings that need to be unquoted during the
     * parsing of the rules
     */
    //private static final int TOKEN_EXTRA_RULE_SPACE_SIZE_ = 2048;
    /**
     * Indicator that the token strength is not set yet
     */
    private static final int TOKEN_UNSET_ = 0xFFFFFFFF;
    /*
     * Indicator that the rule is in the > polarity, ie everything on the
     * right of the rule is less than
     */
    //private static final int TOKEN_POLARITY_NEGATIVE_ = 0;
    /**
     * Indicator that the rule is in the < polarity, ie everything on the
     * right of the rule is greater than
     */
    private static final int TOKEN_POLARITY_POSITIVE_ = 1;
    /**
     * Flag mask to determine if top is set
     */
    private static final int TOKEN_TOP_MASK_ = 0x04;
    /**
     * Flag mask to determine if variable top is set
     */
    private static final int TOKEN_VARIABLE_TOP_MASK_ = 0x08;
    /**
     * Flag mask to determine if a before attribute is set; the masked value
     * is the before strength plus one
     */
    private static final int TOKEN_BEFORE_ = 0x03;
    /**
     * For use in parsing token options
     */
    private static final int TOKEN_SUCCESS_MASK_ = 0x10;

    /**
     * These values are used for finding CE values for indirect positioning.
     * Indirect positioning is a mechanism for allowing resets on symbolic
     * values. It only works for resets and you cannot tailor indirect names.
     * An indirect name can define either an anchor point or a range. An anchor
     * point behaves in exactly the same way as a code point in reset would,
     * except that it cannot be tailored. A range (we currently only know of
     * the [top] range) will explicitly set the upper bound for generated CEs,
     * thus allowing for better control over how many CEs can be squeezed
     * between in the range without performance penalty. In that respect, we use
     * [top] for tailoring of locales that use CJK characters. Other indirect
     * values are currently a pure convenience, they can be used to assure that
     * the CEs will be always positioned in the same place relative to a point
     * with known properties (e.g. first primary ignorable).
     */
    private static final IndirectBoundaries INDIRECT_BOUNDARIES_[];

    // /**
    //  * Inverse UCA constants
    //  */
    // private static final int INVERSE_SIZE_MASK_ = 0xFFF00000;
    // private static final int INVERSE_OFFSET_MASK_ = 0x000FFFFF;
    // private static final int INVERSE_SHIFT_VALUE_ = 20;

    /**
     * Collation option tags
     * [last variable] last variable value
     * [last primary ignorable] largest CE for primary ignorable
     * [last secondary ignorable] largest CE for secondary ignorable
     * [last tertiary ignorable] largest CE for tertiary ignorable
     * [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
     */
    private static final TokenOption RULES_OPTIONS_[];
| |
| static |
| { |
| INDIRECT_BOUNDARIES_ = new IndirectBoundaries[15]; |
| // UCOL_RESET_TOP_VALUE |
| INDIRECT_BOUNDARIES_[0] = new IndirectBoundaries( |
| RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_, |
| RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_); |
| // UCOL_FIRST_PRIMARY_IGNORABLE |
| INDIRECT_BOUNDARIES_[1] = new IndirectBoundaries( |
| RuleBasedCollator.UCA_CONSTANTS_.FIRST_PRIMARY_IGNORABLE_, |
| null); |
| // UCOL_LAST_PRIMARY_IGNORABLE |
| INDIRECT_BOUNDARIES_[2] = new IndirectBoundaries( |
| RuleBasedCollator.UCA_CONSTANTS_.LAST_PRIMARY_IGNORABLE_, |
| null); |
| |
| // UCOL_FIRST_SECONDARY_IGNORABLE |
| INDIRECT_BOUNDARIES_[3] = new IndirectBoundaries( |
| RuleBasedCollator.UCA_CONSTANTS_.FIRST_SECONDARY_IGNORABLE_, |
| null); |
| // UCOL_LAST_SECONDARY_IGNORABLE |
| INDIRECT_BOUNDARIES_[4] = new IndirectBoundaries( |
| RuleBasedCollator.UCA_CONSTANTS_.LAST_SECONDARY_IGNORABLE_, |
| null); |
| // UCOL_FIRST_TERTIARY_IGNORABLE |
| INDIRECT_BOUNDARIES_[5] = new IndirectBoundaries( |
| RuleBasedCollator.UCA_CONSTANTS_.FIRST_TERTIARY_IGNORABLE_, |
| null); |
| // UCOL_LAST_TERTIARY_IGNORABLE |
| INDIRECT_BOUNDARIES_[6] = new IndirectBoundaries( |
| RuleBasedCollator.UCA_CONSTANTS_.LAST_TERTIARY_IGNORABLE_, |
| null); |
| // UCOL_FIRST_VARIABLE; |
| INDIRECT_BOUNDARIES_[7] = new IndirectBoundaries( |
| RuleBasedCollator.UCA_CONSTANTS_.FIRST_VARIABLE_, |
| null); |
| // UCOL_LAST_VARIABLE |
| INDIRECT_BOUNDARIES_[8] = new IndirectBoundaries( |
| RuleBasedCollator.UCA_CONSTANTS_.LAST_VARIABLE_, |
| null); |
| // UCOL_FIRST_NON_VARIABLE |
| INDIRECT_BOUNDARIES_[9] = new IndirectBoundaries( |
| RuleBasedCollator.UCA_CONSTANTS_.FIRST_NON_VARIABLE_, |
| null); |
| // UCOL_LAST_NON_VARIABLE |
| INDIRECT_BOUNDARIES_[10] = new IndirectBoundaries( |
| RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_, |
| RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_); |
| // UCOL_FIRST_IMPLICIT |
| INDIRECT_BOUNDARIES_[11] = new IndirectBoundaries( |
| RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_, |
| null); |
| // UCOL_LAST_IMPLICIT |
| INDIRECT_BOUNDARIES_[12] = new IndirectBoundaries( |
| RuleBasedCollator.UCA_CONSTANTS_.LAST_IMPLICIT_, |
| RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_); |
| // UCOL_FIRST_TRAILING |
| INDIRECT_BOUNDARIES_[13] = new IndirectBoundaries( |
| RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_, |
| null); |
| // UCOL_LAST_TRAILING |
| INDIRECT_BOUNDARIES_[14] = new IndirectBoundaries( |
| RuleBasedCollator.UCA_CONSTANTS_.LAST_TRAILING_, |
| null); |
| INDIRECT_BOUNDARIES_[14].m_limitCE_ |
| = RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_SPECIAL_MIN_ << 24; |
| |
| RULES_OPTIONS_ = new TokenOption[19]; |
| String option[] = {"non-ignorable", "shifted"}; |
| int value[] = {RuleBasedCollator.AttributeValue.NON_IGNORABLE_, |
| RuleBasedCollator.AttributeValue.SHIFTED_}; |
| RULES_OPTIONS_[0] = new TokenOption("alternate", |
| RuleBasedCollator.Attribute.ALTERNATE_HANDLING_, |
| option, value); |
| option = new String[1]; |
| option[0] = "2"; |
| value = new int[1]; |
| value[0] = RuleBasedCollator.AttributeValue.ON_; |
| RULES_OPTIONS_[1] = new TokenOption("backwards", |
| RuleBasedCollator.Attribute.FRENCH_COLLATION_, |
| option, value); |
| String offonoption[] = new String[2]; |
| offonoption[0] = "off"; |
| offonoption[1] = "on"; |
| int offonvalue[] = new int[2]; |
| offonvalue[0] = RuleBasedCollator.AttributeValue.OFF_; |
| offonvalue[1] = RuleBasedCollator.AttributeValue.ON_; |
| RULES_OPTIONS_[2] = new TokenOption("caseLevel", |
| RuleBasedCollator.Attribute.CASE_LEVEL_, |
| offonoption, offonvalue); |
| option = new String[3]; |
| option[0] = "lower"; |
| option[1] = "upper"; |
| option[2] = "off"; |
| value = new int[3]; |
| value[0] = RuleBasedCollator.AttributeValue.LOWER_FIRST_; |
| value[1] = RuleBasedCollator.AttributeValue.UPPER_FIRST_; |
| value[2] = RuleBasedCollator.AttributeValue.OFF_; |
| RULES_OPTIONS_[3] = new TokenOption("caseFirst", |
| RuleBasedCollator.Attribute.CASE_FIRST_, |
| option, value); |
| RULES_OPTIONS_[4] = new TokenOption("normalization", |
| RuleBasedCollator.Attribute.NORMALIZATION_MODE_, |
| offonoption, offonvalue); |
| RULES_OPTIONS_[5] = new TokenOption("hiraganaQ", |
| RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_, |
| offonoption, offonvalue); |
| option = new String[5]; |
| option[0] = "1"; |
| option[1] = "2"; |
| option[2] = "3"; |
| option[3] = "4"; |
| option[4] = "I"; |
| value = new int[5]; |
| value[0] = RuleBasedCollator.AttributeValue.PRIMARY_; |
| value[1] = RuleBasedCollator.AttributeValue.SECONDARY_; |
| value[2] = RuleBasedCollator.AttributeValue.TERTIARY_; |
| value[3] = RuleBasedCollator.AttributeValue.QUATERNARY_; |
| value[4] = RuleBasedCollator.AttributeValue.IDENTICAL_; |
| RULES_OPTIONS_[6] = new TokenOption("strength", |
| RuleBasedCollator.Attribute.STRENGTH_, |
| option, value); |
| RULES_OPTIONS_[7] = new TokenOption("variable top", |
| RuleBasedCollator.Attribute.LIMIT_, |
| null, null); |
| RULES_OPTIONS_[8] = new TokenOption("rearrange", |
| RuleBasedCollator.Attribute.LIMIT_, |
| null, null); |
| option = new String[3]; |
| option[0] = "1"; |
| option[1] = "2"; |
| option[2] = "3"; |
| value = new int[3]; |
| value[0] = RuleBasedCollator.AttributeValue.PRIMARY_; |
| value[1] = RuleBasedCollator.AttributeValue.SECONDARY_; |
| value[2] = RuleBasedCollator.AttributeValue.TERTIARY_; |
| RULES_OPTIONS_[9] = new TokenOption("before", |
| RuleBasedCollator.Attribute.LIMIT_, |
| option, value); |
| RULES_OPTIONS_[10] = new TokenOption("top", |
| RuleBasedCollator.Attribute.LIMIT_, |
| null, null); |
| String firstlastoption[] = new String[7]; |
| firstlastoption[0] = "primary"; |
| firstlastoption[1] = "secondary"; |
| firstlastoption[2] = "tertiary"; |
| firstlastoption[3] = "variable"; |
| firstlastoption[4] = "regular"; |
| firstlastoption[5] = "implicit"; |
| firstlastoption[6] = "trailing"; |
| |
| int firstlastvalue[] = new int[7]; |
| Arrays.fill(firstlastvalue, RuleBasedCollator.AttributeValue.PRIMARY_); |
| |
| RULES_OPTIONS_[11] = new TokenOption("first", |
| RuleBasedCollator.Attribute.LIMIT_, |
| firstlastoption, firstlastvalue); |
| RULES_OPTIONS_[12] = new TokenOption("last", |
| RuleBasedCollator.Attribute.LIMIT_, |
| firstlastoption, firstlastvalue); |
| RULES_OPTIONS_[13] = new TokenOption("optimize", |
| RuleBasedCollator.Attribute.LIMIT_, |
| null, null); |
| RULES_OPTIONS_[14] = new TokenOption("suppressContractions", |
| RuleBasedCollator.Attribute.LIMIT_, |
| null, null); |
| RULES_OPTIONS_[15] = new TokenOption("undefined", |
| RuleBasedCollator.Attribute.LIMIT_, |
| null, null); |
| RULES_OPTIONS_[16] = new TokenOption("scriptOrder", |
| RuleBasedCollator.Attribute.LIMIT_, |
| null, null); |
| RULES_OPTIONS_[17] = new TokenOption("charsetname", |
| RuleBasedCollator.Attribute.LIMIT_, |
| null, null); |
| RULES_OPTIONS_[18] = new TokenOption("charset", |
| RuleBasedCollator.Attribute.LIMIT_, |
| null, null); |
| } |
| |
    /**
     * Utility data members
     */
    // scratch token used as a hash-table lookup key, avoiding an allocation
    // per lookup (see assembleTokenList)
    private Token m_utilToken_ = new Token();
    // reusable collation element iterator over the UCA; target text is set
    // before each use — TODO confirm usage sites outside this chunk
    private CollationElementIterator m_UCAColEIter_
                      = RuleBasedCollator.UCA_.getCollationElementIterator("");
    // two-slot scratch buffer, presumably a CE and its continuation CE —
    // verify against usage sites
    private int m_utilCEBuffer_[] = new int[2];
| |
| // private methods ------------------------------------------------------- |
| |
| /** |
| * Assembles the token list |
| * @exception ParseException thrown when rules syntax fails |
| */ |
| int assembleTokenList() throws ParseException |
| { |
| Token lastToken = null; |
| m_parsedToken_.m_strength_ = TOKEN_UNSET_; |
| int sourcelimit = m_source_.length(); |
| int expandNext = 0; |
| |
| while (m_current_ < sourcelimit) { |
| m_parsedToken_.m_prefixOffset_ = 0; |
| if (parseNextToken(lastToken == null) < 0) { |
| // we have reached the end |
| continue; |
| } |
| char specs = m_parsedToken_.m_flags_; |
| boolean variableTop = ((specs & TOKEN_VARIABLE_TOP_MASK_) != 0); |
| boolean top = ((specs & TOKEN_TOP_MASK_) != 0); |
| int lastStrength = TOKEN_UNSET_; |
| if (lastToken != null) { |
| lastStrength = lastToken.m_strength_; |
| } |
| m_utilToken_.m_source_ = m_parsedToken_.m_charsLen_ << 24 |
| | m_parsedToken_.m_charsOffset_; |
| m_utilToken_.m_rules_ = m_source_; |
| // 4 Lookup each source in the CharsToToken map, and find a |
| // sourcetoken |
| Token sourceToken = (Token)m_hashTable_.get(m_utilToken_); |
| if (m_parsedToken_.m_strength_ != TOKEN_RESET_) { |
| if (lastToken == null) { |
| // this means that rules haven't started properly |
| throwParseException(m_source_.toString(), 0); |
| } |
| // 6 Otherwise (when relation != reset) |
| if (sourceToken == null) { |
| // If sourceToken is null, create new one |
| sourceToken = new Token(); |
| sourceToken.m_rules_ = m_source_; |
| sourceToken.m_source_ = m_parsedToken_.m_charsLen_ << 24 |
| | m_parsedToken_.m_charsOffset_; |
| sourceToken.m_prefix_ = m_parsedToken_.m_prefixLen_ << 24 |
| | m_parsedToken_.m_prefixOffset_; |
| // TODO: this should also handle reverse |
| sourceToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_; |
| sourceToken.m_next_ = null; |
| sourceToken.m_previous_ = null; |
| sourceToken.m_CELength_ = 0; |
| sourceToken.m_expCELength_ = 0; |
| m_hashTable_.put(sourceToken, sourceToken); |
| } |
| else { |
| // we could have fished out a reset here |
| if (sourceToken.m_strength_ != TOKEN_RESET_ |
| && lastToken != sourceToken) { |
| // otherwise remove sourceToken from where it was. |
| if (sourceToken.m_next_ != null) { |
| if (sourceToken.m_next_.m_strength_ |
| > sourceToken.m_strength_) { |
| sourceToken.m_next_.m_strength_ |
| = sourceToken.m_strength_; |
| } |
| sourceToken.m_next_.m_previous_ |
| = sourceToken.m_previous_; |
| } |
| else { |
| sourceToken.m_listHeader_.m_last_ |
| = sourceToken.m_previous_; |
| } |
| if (sourceToken.m_previous_ != null) { |
| sourceToken.m_previous_.m_next_ |
| = sourceToken.m_next_; |
| } |
| else { |
| sourceToken.m_listHeader_.m_first_ |
| = sourceToken.m_next_; |
| } |
| sourceToken.m_next_ = null; |
| sourceToken.m_previous_ = null; |
| } |
| } |
| sourceToken.m_strength_ = m_parsedToken_.m_strength_; |
| sourceToken.m_listHeader_ = lastToken.m_listHeader_; |
| |
| // 1. Find the strongest strength in each list, and set |
| // strongestP and strongestN accordingly in the headers. |
| if (lastStrength == TOKEN_RESET_ |
| || sourceToken.m_listHeader_.m_first_ == null) { |
| // If LAST is a reset insert sourceToken in the list. |
| if (sourceToken.m_listHeader_.m_first_ == null) { |
| sourceToken.m_listHeader_.m_first_ = sourceToken; |
| sourceToken.m_listHeader_.m_last_ = sourceToken; |
| } |
| else { // we need to find a place for us |
| // and we'll get in front of the same strength |
| if (sourceToken.m_listHeader_.m_first_.m_strength_ |
| <= sourceToken.m_strength_) { |
| sourceToken.m_next_ |
| = sourceToken.m_listHeader_.m_first_; |
| sourceToken.m_next_.m_previous_ = sourceToken; |
| sourceToken.m_listHeader_.m_first_ = sourceToken; |
| sourceToken.m_previous_ = null; |
| } |
| else { |
| lastToken = sourceToken.m_listHeader_.m_first_; |
| while (lastToken.m_next_ != null |
| && lastToken.m_next_.m_strength_ |
| > sourceToken.m_strength_) { |
| lastToken = lastToken.m_next_; |
| } |
| if (lastToken.m_next_ != null) { |
| lastToken.m_next_.m_previous_ = sourceToken; |
| } |
| else { |
| sourceToken.m_listHeader_.m_last_ |
| = sourceToken; |
| } |
| sourceToken.m_previous_ = lastToken; |
| sourceToken.m_next_ = lastToken.m_next_; |
| lastToken.m_next_ = sourceToken; |
| } |
| } |
| } |
| else { |
| // Otherwise (when LAST is not a reset) |
| // if polarity (LAST) == polarity(relation), insert |
| // sourceToken after LAST, otherwise insert before. |
| // when inserting after or before, search to the next |
| // position with the same strength in that direction. |
| // (This is called postpone insertion). |
| if (sourceToken != lastToken) { |
| if (lastToken.m_polarity_ == sourceToken.m_polarity_) { |
| while (lastToken.m_next_ != null |
| && lastToken.m_next_.m_strength_ |
| > sourceToken.m_strength_) { |
| lastToken = lastToken.m_next_; |
| } |
| sourceToken.m_previous_ = lastToken; |
| if (lastToken.m_next_ != null) { |
| lastToken.m_next_.m_previous_ = sourceToken; |
| } |
| else { |
| sourceToken.m_listHeader_.m_last_ = sourceToken; |
| } |
| sourceToken.m_next_ = lastToken.m_next_; |
| lastToken.m_next_ = sourceToken; |
| } |
| else { |
| while (lastToken.m_previous_ != null |
| && lastToken.m_previous_.m_strength_ |
| > sourceToken.m_strength_) { |
| lastToken = lastToken.m_previous_; |
| } |
| sourceToken.m_next_ = lastToken; |
| if (lastToken.m_previous_ != null) { |
| lastToken.m_previous_.m_next_ = sourceToken; |
| } |
| else { |
| sourceToken.m_listHeader_.m_first_ |
| = sourceToken; |
| } |
| sourceToken.m_previous_ = lastToken.m_previous_; |
| lastToken.m_previous_ = sourceToken; |
| } |
| } |
| else { // repeated one thing twice in rules, stay with the |
| // stronger strength |
| if (lastStrength < sourceToken.m_strength_) { |
| sourceToken.m_strength_ = lastStrength; |
| } |
| } |
| } |
| // if the token was a variable top, we're gonna put it in |
| if (variableTop == true && m_variableTop_ == null) { |
| variableTop = false; |
| m_variableTop_ = sourceToken; |
| } |
| // Treat the expansions. |
| // There are two types of expansions: explicit (x / y) and |
| // reset based propagating expansions |
| // (&abc * d * e <=> &ab * d / c * e / c) |
| // if both of them are in effect for a token, they are combined. |
| sourceToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24 |
| | m_parsedToken_.m_extensionOffset_; |
| if (expandNext != 0) { |
| if (sourceToken.m_strength_ == RuleBasedCollator.PRIMARY) { |
| // primary strength kills off the implicit expansion |
| expandNext = 0; |
| } |
| else if (sourceToken.m_expansion_ == 0) { |
| // if there is no expansion, implicit is just added to |
| // the token |
| sourceToken.m_expansion_ = expandNext; |
| } |
| else { |
| // there is both explicit and implicit expansion. |
| // We need to make a combination |
| int start = expandNext & 0xFFFFFF; |
| int size = expandNext >>> 24; |
| if (size > 0) { |
| m_source_.append(m_source_.substring(start, |
| start + size)); |
| } |
| start = m_parsedToken_.m_extensionOffset_; |
| m_source_.append(m_source_.substring(start, |
| start + m_parsedToken_.m_extensionLen_)); |
| sourceToken.m_expansion_ = (size |
| + m_parsedToken_.m_extensionLen_) << 24 |
| | m_extraCurrent_; |
| m_extraCurrent_ += size + m_parsedToken_.m_extensionLen_; |
| } |
| } |
| // if the previous token was a reset before, the strength of this |
| // token must match the strength of before. Otherwise we have an |
| // undefined situation. |
| // In other words, we currently have a cludge which we use to |
| // represent &a >> x. This is written as &[before 2]a << x. |
| if((lastToken.m_flags_ & TOKEN_BEFORE_) != 0) { |
| int beforeStrength = (lastToken.m_flags_ & TOKEN_BEFORE_) - 1; |
| if(beforeStrength != sourceToken.m_strength_) { |
| throwParseException(m_source_.toString(), m_current_); |
| } |
| } |
| |
| } |
| else { |
| if (lastToken != null && lastStrength == TOKEN_RESET_) { |
| // if the previous token was also a reset, this means that |
| // we have two consecutive resets and we want to remove the |
| // previous one if empty |
| if (m_resultLength_ > 0 && m_listHeader_[m_resultLength_ - 1].m_first_ == null) { |
| m_resultLength_ --; |
| } |
| } |
| if (sourceToken == null) { |
| // this is a reset, but it might still be somewhere in the |
| // tailoring, in shorter form |
| int searchCharsLen = m_parsedToken_.m_charsLen_; |
| while (searchCharsLen > 1 && sourceToken == null) { |
| searchCharsLen --; |
| // key = searchCharsLen << 24 | charsOffset; |
| m_utilToken_.m_source_ = searchCharsLen << 24 |
| | m_parsedToken_.m_charsOffset_; |
| m_utilToken_.m_rules_ = m_source_; |
| sourceToken = (Token)m_hashTable_.get(m_utilToken_); |
| } |
| if (sourceToken != null) { |
| expandNext = (m_parsedToken_.m_charsLen_ |
| - searchCharsLen) << 24 |
| | (m_parsedToken_.m_charsOffset_ |
| + searchCharsLen); |
| } |
| } |
| if ((specs & TOKEN_BEFORE_) != 0) { |
| if (top == false) { |
| // we're doing before & there is no indirection |
| int strength = (specs & TOKEN_BEFORE_) - 1; |
| if (sourceToken != null |
| && sourceToken.m_strength_ != TOKEN_RESET_) { |
| // this is a before that is already ordered in the UCA |
| // - so we need to get the previous with good strength |
| while (sourceToken.m_strength_ > strength |
| && sourceToken.m_previous_ != null) { |
| sourceToken = sourceToken.m_previous_; |
| } |
| // here, either we hit the strength or NULL |
| if (sourceToken.m_strength_ == strength) { |
| if (sourceToken.m_previous_ != null) { |
| sourceToken = sourceToken.m_previous_; |
| } |
| else { // start of list |
| sourceToken |
| = sourceToken.m_listHeader_.m_reset_; |
| } |
| } |
| else { // we hit NULL, we should be doing the else part |
| sourceToken |
| = sourceToken.m_listHeader_.m_reset_; |
| sourceToken = getVirginBefore(sourceToken, |
| strength); |
| } |
| } |
| else { |
| sourceToken |
| = getVirginBefore(sourceToken, strength); |
| } |
| } |
| else { |
| // this is both before and indirection |
| top = false; |
| m_listHeader_[m_resultLength_] = new TokenListHeader(); |
| m_listHeader_[m_resultLength_].m_previousCE_ = 0; |
| m_listHeader_[m_resultLength_].m_previousContCE_ = 0; |
| m_listHeader_[m_resultLength_].m_indirect_ = true; |
| // we need to do slightly more work. we need to get the |
| // baseCE using the inverse UCA & getPrevious. The next |
| // bound is not set, and will be decided in ucol_bld |
| int strength = (specs & TOKEN_BEFORE_) - 1; |
| int baseCE = INDIRECT_BOUNDARIES_[ |
| m_parsedToken_.m_indirectIndex_].m_startCE_; |
| int baseContCE = INDIRECT_BOUNDARIES_[ |
| m_parsedToken_.m_indirectIndex_].m_startContCE_; |
| int ce[] = new int[2]; |
| if((baseCE >>> 24 >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_) |
| && (baseCE >>> 24 <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)) { /* implicits - */ |
| int primary = baseCE & RuleBasedCollator.CE_PRIMARY_MASK_ | (baseContCE & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16; |
| int raw = RuleBasedCollator.impCEGen_.getRawFromImplicit(primary); |
| int primaryCE = RuleBasedCollator.impCEGen_.getImplicitFromRaw(raw-1); |
| ce[0] = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505; |
| ce[1] = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_ | RuleBasedCollator.CE_CONTINUATION_MARKER_; |
| } else { |
| CollationParsedRuleBuilder.InverseUCA invuca |
| = CollationParsedRuleBuilder.INVERSE_UCA_; |
| invuca.getInversePrevCE(baseCE, baseContCE, strength, |
| ce); |
| } |
| m_listHeader_[m_resultLength_].m_baseCE_ = ce[0]; |
| m_listHeader_[m_resultLength_].m_baseContCE_ = ce[1]; |
| m_listHeader_[m_resultLength_].m_nextCE_ = 0; |
| m_listHeader_[m_resultLength_].m_nextContCE_ = 0; |
| |
| sourceToken = new Token(); |
| expandNext = initAReset(0, sourceToken); |
| } |
| } |
| // 5 If the relation is a reset: |
| // If sourceToken is null |
| // Create new list, create new sourceToken, make the baseCE |
| // from source, put the sourceToken in ListHeader of the new |
| // list |
| if (sourceToken == null) { |
| if (m_listHeader_[m_resultLength_] == null) { |
| m_listHeader_[m_resultLength_] = new TokenListHeader(); |
| } |
| // 3 Consider each item: relation, source, and expansion: |
| // e.g. ...< x / y ... |
| // First convert all expansions into normal form. |
| // Examples: |
| // If "xy" doesn't occur earlier in the list or in the UCA, |
| // convert &xy * c * d * ... into &x * c/y * d * ... |
| // Note: reset values can never have expansions, although |
| // they can cause the very next item to have one. They may |
| // be contractions, if they are found earlier in the list. |
| if (top == false) { |
| CollationElementIterator coleiter |
| = RuleBasedCollator.UCA_.getCollationElementIterator( |
| m_source_.substring(m_parsedToken_.m_charsOffset_, |
| m_parsedToken_.m_charsOffset_ |
| + m_parsedToken_.m_charsLen_)); |
| |
| int CE = coleiter.next(); |
| // offset to the character in the full rule string |
| int expand = coleiter.getOffset() |
| + m_parsedToken_.m_charsOffset_; |
| int SecondCE = coleiter.next(); |
| |
| m_listHeader_[m_resultLength_].m_baseCE_ |
| = CE & 0xFFFFFF3F; |
| if (RuleBasedCollator.isContinuation(SecondCE)) { |
| m_listHeader_[m_resultLength_].m_baseContCE_ |
| = SecondCE; |
| } |
| else { |
| m_listHeader_[m_resultLength_].m_baseContCE_ = 0; |
| } |
| m_listHeader_[m_resultLength_].m_nextCE_ = 0; |
| m_listHeader_[m_resultLength_].m_nextContCE_ = 0; |
| m_listHeader_[m_resultLength_].m_previousCE_ = 0; |
| m_listHeader_[m_resultLength_].m_previousContCE_ = 0; |
| m_listHeader_[m_resultLength_].m_indirect_ = false; |
| sourceToken = new Token(); |
| expandNext = initAReset(expand, sourceToken); |
| } |
| else { // top == TRUE |
| top = false; |
| m_listHeader_[m_resultLength_].m_previousCE_ = 0; |
| m_listHeader_[m_resultLength_].m_previousContCE_ = 0; |
| m_listHeader_[m_resultLength_].m_indirect_ = true; |
| IndirectBoundaries ib = INDIRECT_BOUNDARIES_[ |
| m_parsedToken_.m_indirectIndex_]; |
| m_listHeader_[m_resultLength_].m_baseCE_ |
| = ib.m_startCE_; |
| m_listHeader_[m_resultLength_].m_baseContCE_ |
| = ib.m_startContCE_; |
| m_listHeader_[m_resultLength_].m_nextCE_ |
| = ib.m_limitCE_; |
| m_listHeader_[m_resultLength_].m_nextContCE_ |
| = ib.m_limitContCE_; |
| sourceToken = new Token(); |
| expandNext = initAReset(0, sourceToken); |
| } |
| } |
| else { // reset to something already in rules |
| top = false; |
| } |
| } |
| // 7 After all this, set LAST to point to sourceToken, and goto |
| // step 3. |
| lastToken = sourceToken; |
| } |
| |
| if (m_resultLength_ > 0 |
| && m_listHeader_[m_resultLength_ - 1].m_first_ == null) { |
| m_resultLength_ --; |
| } |
| return m_resultLength_; |
| } |
| |
| /** |
| * Formats and throws a ParseException |
| * @param rules collation rule that failed |
| * @param offset failed offset in rules |
| * @throws ParseException with failure information |
| */ |
| private static final void throwParseException(String rules, int offset) |
| throws ParseException |
| { |
| // for pre-context |
| String precontext = rules.substring(0, offset); |
| String postcontext = rules.substring(offset, rules.length()); |
| StringBuffer error = new StringBuffer( |
| "Parse error occurred in rule at offset "); |
| error.append(offset); |
| error.append("\n after the prefix \""); |
| error.append(precontext); |
| error.append("\" before the suffix \""); |
| error.append(postcontext); |
| throw new ParseException(error.toString(), offset); |
| } |
| |
| private final boolean doSetTop() { |
| m_parsedToken_.m_charsOffset_ = m_extraCurrent_; |
| m_source_.append((char)0xFFFE); |
| IndirectBoundaries ib = |
| INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_]; |
| m_source_.append((char)(ib.m_startCE_ >> 16)); |
| m_source_.append((char)(ib.m_startCE_ & 0xFFFF)); |
| m_extraCurrent_ += 3; |
| if (INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_ |
| ].m_startContCE_ == 0) { |
| m_parsedToken_.m_charsLen_ = 3; |
| } |
| else { |
| m_source_.append((char)(INDIRECT_BOUNDARIES_[ |
| m_parsedToken_.m_indirectIndex_ |
| ].m_startContCE_ >> 16)); |
| m_source_.append((char)(INDIRECT_BOUNDARIES_[ |
| m_parsedToken_.m_indirectIndex_ |
| ].m_startContCE_ & 0xFFFF)); |
| m_extraCurrent_ += 2; |
| m_parsedToken_.m_charsLen_ = 5; |
| } |
| return true; |
| } |
| |
| private static boolean isCharNewLine(char c) { |
| switch (c) { |
| case 0x000A: /* LF */ |
| case 0x000D: /* CR */ |
| case 0x000C: /* FF */ |
| case 0x0085: /* NEL */ |
| case 0x2028: /* LS */ |
| case 0x2029: /* PS */ |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
| /** |
| * Getting the next token |
| * |
| * @param startofrules |
| * flag indicating if we are at the start of rules |
| * @return the offset of the rules |
| * @exception ParseException |
| * thrown when rule parsing fails |
| */ |
| private int parseNextToken(boolean startofrules) throws ParseException |
| { |
| // parsing part |
| boolean variabletop = false; |
| boolean top = false; |
| boolean inchars = true; |
| boolean inquote = false; |
| boolean wasinquote = false; |
| byte before = 0; |
| boolean isescaped = false; |
| int /*newcharslen = 0,*/ newextensionlen = 0; |
| int /*charsoffset = 0,*/ extensionoffset = 0; |
| int newstrength = TOKEN_UNSET_; |
| |
| m_parsedToken_.m_charsLen_ = 0; |
| m_parsedToken_.m_charsOffset_ = 0; |
| m_parsedToken_.m_prefixOffset_ = 0; |
| m_parsedToken_.m_prefixLen_ = 0; |
| m_parsedToken_.m_indirectIndex_ = 0; |
| |
| int limit = m_rules_.length(); |
| while (m_current_ < limit) { |
| char ch = m_source_.charAt(m_current_); |
| if (inquote) { |
| if (ch == 0x0027) { // '\'' |
| inquote = false; |
| } |
| else { |
| if ((m_parsedToken_.m_charsLen_ == 0) || inchars) { |
| if (m_parsedToken_.m_charsLen_ == 0) { |
| m_parsedToken_.m_charsOffset_ = m_extraCurrent_; |
| } |
| m_parsedToken_.m_charsLen_ ++; |
| } |
| else { |
| if (newextensionlen == 0) { |
| extensionoffset = m_extraCurrent_; |
| } |
| newextensionlen ++; |
| } |
| } |
| } |
| else if (isescaped) { |
| isescaped = false; |
| if (newstrength == TOKEN_UNSET_) { |
| throwParseException(m_rules_, m_current_); |
| } |
| if (ch != 0 && m_current_ != limit) { |
| if (inchars) { |
| if (m_parsedToken_.m_charsLen_ == 0) { |
| m_parsedToken_.m_charsOffset_ = m_current_; |
| } |
| m_parsedToken_.m_charsLen_ ++; |
| } |
| else { |
| if (newextensionlen == 0) { |
| extensionoffset = m_current_; |
| } |
| newextensionlen ++; |
| } |
| } |
| } |
| else { |
| if (!UCharacterProperty.isRuleWhiteSpace(ch)) { |
| // Sets the strength for this entry |
| switch (ch) { |
| case 0x003D : // '=' |
| if (newstrength != TOKEN_UNSET_) { |
| return doEndParseNextToken(newstrength, |
| top, |
| extensionoffset, |
| newextensionlen, |
| variabletop, before); |
| } |
| // if we start with strength, we'll reset to top |
| if (startofrules == true) { |
| m_parsedToken_.m_indirectIndex_ = 5; |
| top = doSetTop(); |
| return doEndParseNextToken(TOKEN_RESET_, |
| top, |
| extensionoffset, |
| newextensionlen, |
| variabletop, before); |
| } |
| newstrength = Collator.IDENTICAL; |
| break; |
| case 0x002C : // ',' |
| if (newstrength != TOKEN_UNSET_) { |
| return doEndParseNextToken(newstrength, |
| top, |
| extensionoffset, |
| newextensionlen, |
| variabletop, before); |
| } |
| // if we start with strength, we'll reset to top |
| if (startofrules == true) { |
| m_parsedToken_.m_indirectIndex_ = 5; |
| top = doSetTop(); |
| return doEndParseNextToken(TOKEN_RESET_, |
| top, |
| extensionoffset, |
| newextensionlen, |
| variabletop, before); |
| } |
| newstrength = Collator.TERTIARY; |
| break; |
| case 0x003B : // ';' |
| if (newstrength != TOKEN_UNSET_) { |
| return doEndParseNextToken(newstrength, |
| top, |
| extensionoffset, |
| newextensionlen, |
| variabletop, before); |
| } |
| // if we start with strength, we'll reset to top |
| if (startofrules == true) { |
| m_parsedToken_.m_indirectIndex_ = 5; |
| top = doSetTop(); |
| return doEndParseNextToken(TOKEN_RESET_, |
| top, |
| extensionoffset, |
| newextensionlen, |
| variabletop, before); |
| } |
| newstrength = Collator.SECONDARY; |
| break; |
| case 0x003C : // '<' |
| if (newstrength != TOKEN_UNSET_) { |
| return doEndParseNextToken(newstrength, |
| top, |
| extensionoffset, |
| newextensionlen, |
| variabletop, before); |
| } |
| // if we start with strength, we'll reset to top |
| if (startofrules == true) { |
| m_parsedToken_.m_indirectIndex_ = 5; |
| top = doSetTop(); |
| return doEndParseNextToken(TOKEN_RESET_, |
| top, |
| extensionoffset, |
| newextensionlen, |
| variabletop, before); |
| } |
| // before this, do a scan to verify whether this is |
| // another strength |
| if (m_source_.charAt(m_current_ + 1) == 0x003C) { |
| m_current_ ++; |
| if (m_source_.charAt(m_current_ + 1) == 0x003C) { |
| m_current_ ++; // three in a row! |
| newstrength = Collator.TERTIARY; |
| } |
| else { // two in a row |
| newstrength = Collator.SECONDARY; |
| } |
| } |
| else { // just one |
| newstrength = Collator.PRIMARY; |
| } |
| break; |
| case 0x0026 : // '&' |
| if (newstrength != TOKEN_UNSET_) { |
| return doEndParseNextToken(newstrength, |
| top, |
| extensionoffset, |
| newextensionlen, |
| variabletop, before); |
| } |
| newstrength = TOKEN_RESET_; // PatternEntry::RESET = 0 |
| break; |
| case 0x005b : // '[' |
| // options - read an option, analyze it |
| m_optionEnd_ = m_rules_.indexOf(0x005d, m_current_); |
| if (m_optionEnd_ != -1) { // ']' |
| byte result = readAndSetOption(); |
| m_current_ = m_optionEnd_; |
| if ((result & TOKEN_TOP_MASK_) != 0) { |
| if (newstrength == TOKEN_RESET_) { |
| top = doSetTop(); |
| if (before != 0) { |
| // This is a combination of before and |
| // indirection like |
| // '&[before 2][first regular]<b' |
| m_source_.append((char)0x002d); |
| m_source_.append((char)before); |
| m_extraCurrent_ += 2; |
| m_parsedToken_.m_charsLen_ += 2; |
| } |
| m_current_ ++; |
| return doEndParseNextToken(newstrength, |
| true, |
| extensionoffset, |
| newextensionlen, |
| variabletop, before); |
| } |
| else { |
| throwParseException(m_rules_, m_current_); |
| } |
| } |
| else if ((result & TOKEN_VARIABLE_TOP_MASK_) != 0) { |
| if (newstrength != TOKEN_RESET_ |
| && newstrength != TOKEN_UNSET_) { |
| variabletop = true; |
| m_parsedToken_.m_charsOffset_ |
| = m_extraCurrent_; |
| m_source_.append((char)0xFFFF); |
| m_extraCurrent_ ++; |
| m_current_ ++; |
| m_parsedToken_.m_charsLen_ = 1; |
| return doEndParseNextToken(newstrength, |
| top, |
| extensionoffset, |
| newextensionlen, |
| variabletop, before); |
| } |
| else { |
| throwParseException(m_rules_, m_current_); |
| } |
| } |
| else if ((result & TOKEN_BEFORE_) != 0){ |
| if (newstrength == TOKEN_RESET_) { |
| before = (byte)(result & TOKEN_BEFORE_); |
| } |
| else { |
| throwParseException(m_rules_, m_current_); |
| } |
| } |
| } |
| break; |
| case 0x002F : // '/' |
| wasinquote = false; // if we were copying source |
| // characters, we want to stop now |
| inchars = false; // we're now processing expansion |
| break; |
| case 0x005C : // back slash for escaped chars |
| isescaped = true; |
| break; |
| // found a quote, we're gonna start copying |
| case 0x0027 : //'\'' |
| if (newstrength == TOKEN_UNSET_) { |
| // quote is illegal until we have a strength |
| throwParseException(m_rules_, m_current_); |
| } |
| inquote = true; |
| if (inchars) { // we're doing characters |
| if (wasinquote == false) { |
| m_parsedToken_.m_charsOffset_ = m_extraCurrent_; |
| } |
| if (m_parsedToken_.m_charsLen_ != 0) { |
| m_source_.append(m_source_.substring( |
| m_current_ - m_parsedToken_.m_charsLen_, |
| m_current_)); |
| m_extraCurrent_ += m_parsedToken_.m_charsLen_; |
| } |
| m_parsedToken_.m_charsLen_ ++; |
| } |
| else { // we're doing an expansion |
| if (wasinquote == false) { |
| extensionoffset = m_extraCurrent_; |
| } |
| if (newextensionlen != 0) { |
| m_source_.append(m_source_.substring( |
| m_current_ - newextensionlen, |
| m_current_)); |
| m_extraCurrent_ += newextensionlen; |
| } |
| newextensionlen ++; |
| } |
| wasinquote = true; |
| m_current_ ++; |
| ch = m_source_.charAt(m_current_); |
| if (ch == 0x0027) { // copy the double quote |
| m_source_.append(ch); |
| m_extraCurrent_ ++; |
| inquote = false; |
| } |
| break; |
| // '@' is french only if the strength is not currently set |
| // if it is, it's just a regular character in collation |
| case 0x0040 : // '@' |
| if (newstrength == TOKEN_UNSET_) { |
| m_options_.m_isFrenchCollation_ = true; |
| break; |
| } |
| case 0x007C : //| |
| // this means we have actually been reading prefix part |
| // we want to store read characters to the prefix part |
| // and continue reading the characters (proper way |
| // would be to restart reading the chars, but in that |
| // case we would have to complicate the token hasher, |
| // which I do not intend to play with. Instead, we will |
| // do prefixes when prefixes are due (before adding the |
| // elements). |
| m_parsedToken_.m_prefixOffset_ |
| = m_parsedToken_.m_charsOffset_; |
| m_parsedToken_.m_prefixLen_ |
| = m_parsedToken_.m_charsLen_; |
| if (inchars) { // we're doing characters |
| if (wasinquote == false) { |
| m_parsedToken_.m_charsOffset_ = m_extraCurrent_; |
| } |
| if (m_parsedToken_.m_charsLen_ != 0) { |
| String prefix = m_source_.substring( |
| m_current_ - m_parsedToken_.m_charsLen_, |
| m_current_); |
| m_source_.append(prefix); |
| m_extraCurrent_ += m_parsedToken_.m_charsLen_; |
| } |
| m_parsedToken_.m_charsLen_ ++; |
| } |
| wasinquote = true; |
| do { |
| m_current_ ++; |
| ch = m_source_.charAt(m_current_); |
| // skip whitespace between '|' and the character |
| } while (UCharacterProperty.isRuleWhiteSpace(ch)); |
| break; |
| case 0x0023: // '#' // this is a comment, skip everything through the end of line |
| do { |
| m_current_ ++; |
| ch = m_source_.charAt(m_current_); |
| } while (!isCharNewLine(ch)); |
| break; |
| case 0x0021: // '!' // ignoring java set thai reordering |
| break; |
| default : |
| if (newstrength == TOKEN_UNSET_) { |
| throwParseException(m_rules_, m_current_); |
| } |
| if (isSpecialChar(ch) && (inquote == false)) { |
| throwParseException(m_rules_, m_current_); |
| } |
| if (ch == 0x0000 && m_current_ + 1 == limit) { |
| break; |
| } |
| if (inchars) { |
| if (m_parsedToken_.m_charsLen_ == 0) { |
| m_parsedToken_.m_charsOffset_ = m_current_; |
| } |
| m_parsedToken_.m_charsLen_++; |
| } |
| else { |
| if (newextensionlen == 0) { |
| extensionoffset = m_current_; |
| } |
| newextensionlen ++; |
| } |
| break; |
| } |
| } |
| } |
| if (wasinquote) { |
| if (ch != 0x27) { |
| m_source_.append(ch); |
| m_extraCurrent_ ++; |
| } |
| } |
| m_current_ ++; |
| } |
| return doEndParseNextToken(newstrength, top, |
| extensionoffset, newextensionlen, |
| variabletop, before); |
| } |
| |
| /** |
| * End the next parse token |
| * @param newstrength new strength |
| * @return offset in rules, -1 for end of rules |
| */ |
| private int doEndParseNextToken(int newstrength, /*int newcharslen,*/ |
| boolean top, /*int charsoffset,*/ |
| int extensionoffset, int newextensionlen, |
| boolean variabletop, int before) |
| throws ParseException |
| { |
| if (newstrength == TOKEN_UNSET_) { |
| return -1; |
| } |
| if (m_parsedToken_.m_charsLen_ == 0 && top == false) { |
| throwParseException(m_rules_, m_current_); |
| } |
| |
| m_parsedToken_.m_strength_ = newstrength; |
| //m_parsedToken_.m_charsOffset_ = charsoffset; |
| //m_parsedToken_.m_charsLen_ = newcharslen; |
| m_parsedToken_.m_extensionOffset_ = extensionoffset; |
| m_parsedToken_.m_extensionLen_ = newextensionlen; |
| m_parsedToken_.m_flags_ = (char) |
| ((variabletop ? TOKEN_VARIABLE_TOP_MASK_ : 0) |
| | (top ? TOKEN_TOP_MASK_ : 0) | before); |
| return m_current_; |
| } |
| |
| /** |
| * Token before this element |
| * @param sourcetoken |
| * @param strength collation strength |
| * @return the token before source token |
| * @exception ParseException thrown when rules have the wrong syntax |
| */ |
| private Token getVirginBefore(Token sourcetoken, int strength) |
| throws ParseException |
| { |
| // this is a virgin before - we need to fish the anchor from the UCA |
| if (sourcetoken != null) { |
| int offset = sourcetoken.m_source_ & 0xFFFFFF; |
| m_UCAColEIter_.setText(m_source_.substring(offset, offset + 1)); |
| } |
| else { |
| m_UCAColEIter_.setText( |
| m_source_.substring(m_parsedToken_.m_charsOffset_, |
| m_parsedToken_.m_charsOffset_ + 1)); |
| } |
| |
| int basece = m_UCAColEIter_.next() & 0xFFFFFF3F; |
| int basecontce = m_UCAColEIter_.next(); |
| if (basecontce == CollationElementIterator.NULLORDER) { |
| basecontce = 0; |
| } |
| |
| int ch = 0; |
| |
| |
| if((basece >>> 24 >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_) |
| && (basece >>> 24 <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)) { /* implicits - */ |
| |
| int primary = basece & RuleBasedCollator.CE_PRIMARY_MASK_ | (basecontce & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16; |
| int raw = RuleBasedCollator.impCEGen_.getRawFromImplicit(primary); |
| ch = RuleBasedCollator.impCEGen_.getCodePointFromRaw(raw-1); |
| int primaryCE = RuleBasedCollator.impCEGen_.getImplicitFromRaw(raw-1); |
| m_utilCEBuffer_[0] = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505; |
| m_utilCEBuffer_[1] = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_ | RuleBasedCollator.CE_CONTINUATION_MARKER_; |
| |
| m_parsedToken_.m_charsOffset_ = m_extraCurrent_; |
| m_source_.append('\uFFFE'); |
| m_source_.append((char)ch); |
| m_extraCurrent_ += 2; |
| m_parsedToken_.m_charsLen_++; |
| |
| m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24) |
| | m_parsedToken_.m_charsOffset_; |
| m_utilToken_.m_rules_ = m_source_; |
| sourcetoken = (Token)m_hashTable_.get(m_utilToken_); |
| |
| if(sourcetoken == null) { |
| m_listHeader_[m_resultLength_] = new TokenListHeader(); |
| m_listHeader_[m_resultLength_].m_baseCE_ |
| = m_utilCEBuffer_[0] & 0xFFFFFF3F; |
| if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) { |
| m_listHeader_[m_resultLength_].m_baseContCE_ |
| = m_utilCEBuffer_[1]; |
| } |
| else { |
| m_listHeader_[m_resultLength_].m_baseContCE_ = 0; |
| } |
| m_listHeader_[m_resultLength_].m_nextCE_ = 0; |
| m_listHeader_[m_resultLength_].m_nextContCE_ = 0; |
| m_listHeader_[m_resultLength_].m_previousCE_ = 0; |
| m_listHeader_[m_resultLength_].m_previousContCE_ = 0; |
| m_listHeader_[m_resultLength_].m_indirect_ = false; |
| |
| sourcetoken = new Token(); |
| initAReset(-1, sourcetoken); |
| } |
| |
| } else { |
| |
| // first ce and second ce m_utilCEBuffer_ |
| /*int invpos = */CollationParsedRuleBuilder.INVERSE_UCA_.getInversePrevCE( |
| basece, basecontce, |
| strength, m_utilCEBuffer_); |
| // we got the previous CE. Now we need to see if the difference between |
| // the two CEs is really of the requested strength. |
| // if it's a bigger difference (we asked for secondary and got primary), we |
| // need to modify the CE. |
| if(CollationParsedRuleBuilder.INVERSE_UCA_.getCEStrengthDifference(basece, basecontce, m_utilCEBuffer_[0], m_utilCEBuffer_[1]) < strength) { |
| // adjust the strength |
| // now we are in the situation where our baseCE should actually be modified in |
| // order to get the CE in the right position. |
| if(strength == Collator.SECONDARY) { |
| m_utilCEBuffer_[0] = basece - 0x0200; |
| } else { // strength == UCOL_TERTIARY |
| m_utilCEBuffer_[0] = basece - 0x02; |
| } |
| if(RuleBasedCollator.isContinuation(basecontce)) { |
| if(strength == Collator.SECONDARY) { |
| m_utilCEBuffer_[1] = basecontce - 0x0200; |
| } else { // strength == UCOL_TERTIARY |
| m_utilCEBuffer_[1] = basecontce - 0x02; |
| } |
| } |
| } |
| |
| /* |
| // the code below relies on getting a code point from the inverse table, in order to be |
| // able to merge the situations like &x < 9 &[before 1]a < d. This won't work: |
| // 1. There are many code points that have the same CE |
| // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken. |
| // Also, in case when there is no equivalent strength before an element, we have to actually |
| // construct one. For example, &[before 2]a << x won't result in x << a, because the element |
| // before a is a primary difference. |
| ch = CollationParsedRuleBuilder.INVERSE_UCA_.m_table_[3 * invpos |
| + 2]; |
| if ((ch & INVERSE_SIZE_MASK_) != 0) { |
| int offset = ch & INVERSE_OFFSET_MASK_; |
| ch = CollationParsedRuleBuilder.INVERSE_UCA_.m_continuations_[ |
| offset]; |
| } |
| m_source_.append((char)ch); |
| m_extraCurrent_ ++; |
| m_parsedToken_.m_charsOffset_ = m_extraCurrent_ - 1; |
| m_parsedToken_.m_charsLen_ = 1; |
| |
| // We got an UCA before. However, this might have been tailored. |
| // example: |
| // &\u30ca = \u306a |
| // &[before 3]\u306a<<<\u306a|\u309d |
| |
| m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24) |
| | m_parsedToken_.m_charsOffset_; |
| m_utilToken_.m_rules_ = m_source_; |
| sourcetoken = (Token)m_hashTable_.get(m_utilToken_); |
| */ |
| |
| // here is how it should be. The situation such as &[before 1]a < x, should be |
| // resolved exactly as if we wrote &a > x. |
| // therefore, I don't really care if the UCA value before a has been changed. |
| // However, I do care if the strength between my element and the previous element |
| // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll |
| // have to construct the base CE. |
| |
| // if we found a tailored thing, we have to use the UCA value and |
| // construct a new reset token with constructed name |
| //if (sourcetoken != null && sourcetoken.m_strength_ != TOKEN_RESET_) { |
| // character to which we want to anchor is already tailored. |
| // We need to construct a new token which will be the anchor point |
| //m_source_.setCharAt(m_extraCurrent_ - 1, '\uFFFE'); |
| //m_source_.append(ch); |
| //m_extraCurrent_ ++; |
| //m_parsedToken_.m_charsLen_ ++; |
| // grab before |
| m_parsedToken_.m_charsOffset_ -= 10; |
| m_parsedToken_.m_charsLen_ += 10; |
| m_listHeader_[m_resultLength_] = new TokenListHeader(); |
| m_listHeader_[m_resultLength_].m_baseCE_ |
| = m_utilCEBuffer_[0] & 0xFFFFFF3F; |
| if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) { |
| m_listHeader_[m_resultLength_].m_baseContCE_ |
| = m_utilCEBuffer_[1]; |
| } |
| else { |
| m_listHeader_[m_resultLength_].m_baseContCE_ = 0; |
| } |
| m_listHeader_[m_resultLength_].m_nextCE_ = 0; |
| m_listHeader_[m_resultLength_].m_nextContCE_ = 0; |
| m_listHeader_[m_resultLength_].m_previousCE_ = 0; |
| m_listHeader_[m_resultLength_].m_previousContCE_ = 0; |
| m_listHeader_[m_resultLength_].m_indirect_ = false; |
| sourcetoken = new Token(); |
| initAReset(-1, sourcetoken); |
| //} |
| } |
| return sourcetoken; |
| } |
| |
| /** |
| * Processing Description. |
| * 1. Build a m_listHeader_. Each list has a header, which contains two lists |
| * (positive and negative), a reset token, a baseCE, nextCE, and |
| * previousCE. The lists and reset may be null. |
| * 2. As you process, you keep a LAST pointer that points to the last token |
| * you handled. |
| * @param expand string offset, -1 for null strings |
| * @param targetToken token to update |
| * @return expandnext offset |
| * @throws ParseException thrown when rules syntax failed |
| */ |
| private int initAReset(int expand, Token targetToken) throws ParseException |
| { |
| if (m_resultLength_ == m_listHeader_.length - 1) { |
| // Unfortunately, this won't work, as we store addresses of lhs in |
| // token |
| TokenListHeader temp[] = new TokenListHeader[m_resultLength_ << 1]; |
| System.arraycopy(m_listHeader_, 0, temp, 0, m_resultLength_ + 1); |
| m_listHeader_ = temp; |
| } |
| // do the reset thing |
| targetToken.m_rules_ = m_source_; |
| targetToken.m_source_ = m_parsedToken_.m_charsLen_ << 24 |
| | m_parsedToken_.m_charsOffset_; |
| targetToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24 |
| | m_parsedToken_.m_extensionOffset_; |
| // keep the flags around so that we know about before |
| targetToken.m_flags_ = m_parsedToken_.m_flags_; |
| |
| if (m_parsedToken_.m_prefixOffset_ != 0) { |
| throwParseException(m_rules_, m_parsedToken_.m_charsOffset_ - 1); |
| } |
| |
| targetToken.m_prefix_ = 0; |
| // TODO: this should also handle reverse |
| targetToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_; |
| targetToken.m_strength_ = TOKEN_RESET_; |
| targetToken.m_next_ = null; |
| targetToken.m_previous_ = null; |
| targetToken.m_CELength_ = 0; |
| targetToken.m_expCELength_ = 0; |
| targetToken.m_listHeader_ = m_listHeader_[m_resultLength_]; |
| m_listHeader_[m_resultLength_].m_first_ = null; |
| m_listHeader_[m_resultLength_].m_last_ = null; |
| m_listHeader_[m_resultLength_].m_first_ = null; |
| m_listHeader_[m_resultLength_].m_last_ = null; |
| m_listHeader_[m_resultLength_].m_reset_ = targetToken; |
| |
| /* 3 Consider each item: relation, source, and expansion: |
| * e.g. ...< x / y ... |
| * First convert all expansions into normal form. Examples: |
| * If "xy" doesn't occur earlier in the list or in the UCA, convert |
| * &xy * c * d * ... into &x * c/y * d * ... |
| * Note: reset values can never have expansions, although they can |
| * cause the very next item to have one. They may be contractions, if |
| * they are found earlier in the list. |
| */ |
| int result = 0; |
| if (expand > 0) { |
| // check to see if there is an expansion |
| if (m_parsedToken_.m_charsLen_ > 1) { |
| targetToken.m_source_ = ((expand |
| - m_parsedToken_.m_charsOffset_ ) |
| << 24) |
| | m_parsedToken_.m_charsOffset_; |
| result = ((m_parsedToken_.m_charsLen_ |
| + m_parsedToken_.m_charsOffset_ - expand) << 24) |
| | expand; |
| } |
| } |
| |
| m_resultLength_ ++; |
| m_hashTable_.put(targetToken, targetToken); |
| return result; |
| } |
| |
| /** |
| * Checks if an character is special |
| * @param ch character to test |
| * @return true if the character is special |
| */ |
| private static final boolean isSpecialChar(char ch) |
| { |
| return (ch <= 0x002F && ch >= 0x0020) || (ch <= 0x003F && ch >= 0x003A) |
| || (ch <= 0x0060 && ch >= 0x005B) |
| || (ch <= 0x007E && ch >= 0x007D) || ch == 0x007B; |
| } |
| |
| private |
| UnicodeSet readAndSetUnicodeSet(String source, int start) throws ParseException |
| { |
| while(source.charAt(start) != '[') { /* advance while we find the first '[' */ |
| start++; |
| } |
| // now we need to get a balanced set of '[]'. The problem is that a set can have |
| // many, and *end point to the first closing '[' |
| int noOpenBraces = 1; |
| int current = 1; // skip the opening brace |
| while(start+current < source.length() && noOpenBraces != 0) { |
| if(source.charAt(start+current) == '[') { |
| noOpenBraces++; |
| } else if(source.charAt(start+current) == ']') { // closing brace |
| noOpenBraces--; |
| } |
| current++; |
| } |
| //int nextBrace = -1; |
| |
| if(noOpenBraces != 0 || (/*nextBrace =*/ source.indexOf("]", start+current) /*']'*/) == -1) { |
| throwParseException(m_rules_, start); |
| } |
| return new UnicodeSet(source.substring(start, start+current)); //uset_openPattern(start, current); |
| } |
| |
| |
    /** In C, the option-argument position is passed back to the caller by
     * reference. We simulate that out-parameter with this private field,
     * which readOption fills with the argument's start offset.
     */
| private int m_optionarg_ = 0; |
| |
| private int readOption(String rules, int start, int optionend) |
| { |
| m_optionarg_ = 0; |
| int i = 0; |
| while (i < RULES_OPTIONS_.length) { |
| String option = RULES_OPTIONS_[i].m_name_; |
| int optionlength = option.length(); |
| if (rules.length() > start + optionlength |
| && option.equalsIgnoreCase(rules.substring(start, |
| start + optionlength))) { |
| if (optionend - start > optionlength) { |
| m_optionarg_ = start + optionlength; |
| // start of the options, skip space |
| while (m_optionarg_ < optionend && UCharacter.isWhitespace(rules.charAt(m_optionarg_))) |
| { // eat whitespace |
| m_optionarg_ ++; |
| } |
| } |
| break; |
| } |
| i ++; |
| } |
| if(i == RULES_OPTIONS_.length) { |
| i = -1; |
| } |
| return i; |
| } |
| /** |
| * Reads and set collation options |
| * @return TOKEN_SUCCESS if option is set correct, 0 otherwise |
| * @exception ParseException thrown when options in rules are wrong |
| */ |
| private byte readAndSetOption() throws ParseException |
| { |
| int start = m_current_ + 1; // skip opening '[' |
| int i = readOption(m_rules_, start, m_optionEnd_); |
| |
| int optionarg = m_optionarg_; |
| |
| if (i < 0) { |
| throwParseException(m_rules_, start); |
| } |
| |
| if (i < 7) { |
| if (optionarg != 0) { |
| for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; |
| j ++) { |
| String subname = RULES_OPTIONS_[i].m_subOptions_[j]; |
| int size = optionarg + subname.length(); |
| if (m_rules_.length() > size |
| && subname.equalsIgnoreCase(m_rules_.substring( |
| optionarg, size))) { |
| setOptions(m_options_, RULES_OPTIONS_[i].m_attribute_, |
| RULES_OPTIONS_[i].m_subOptionAttributeValues_[j]); |
| return TOKEN_SUCCESS_MASK_; |
| } |
| } |
| } |
| throwParseException(m_rules_, optionarg); |
| } |
| else if (i == 7) { // variable top |
| return TOKEN_SUCCESS_MASK_ | TOKEN_VARIABLE_TOP_MASK_; |
| } |
| else if (i == 8) { // rearange |
| return TOKEN_SUCCESS_MASK_; |
| } |
| else if (i == 9) { // before |
| if (optionarg != 0) { |
| for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; |
| j ++) { |
| String subname = RULES_OPTIONS_[i].m_subOptions_[j]; |
| int size = optionarg + subname.length(); |
| if (m_rules_.length() > size |
| && subname.equalsIgnoreCase( |
| m_rules_.substring(optionarg, |
| optionarg + subname.length()))) { |
| return (byte)(TOKEN_SUCCESS_MASK_ |
| | RULES_OPTIONS_[i].m_subOptionAttributeValues_[j] |
| + 1); |
| } |
| } |
| } |
| throwParseException(m_rules_, optionarg); |
| } |
| else if (i == 10) { // top, we are going to have an array with |
| // structures of limit CEs index to this array will be |
| // src->parsedToken.indirectIndex |
| m_parsedToken_.m_indirectIndex_ = 0; |
| return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_; |
| } |
| else if (i < 13) { // first, last |
| for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; j ++) { |
| String subname = RULES_OPTIONS_[i].m_subOptions_[j]; |
| int size = optionarg + subname.length(); |
| if (m_rules_.length() > size |
| && subname.equalsIgnoreCase(m_rules_.substring(optionarg, |
| size))) { |
| m_parsedToken_.m_indirectIndex_ = (char)(i - 10 + (j << 1)); |
| return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_; |
| } |
| } |
| throwParseException(m_rules_, optionarg); |
| } |
| else if(i == 13 || i == 14) { // copy and remove are handled before normalization |
| // we need to move end here |
| int noOpenBraces = 1; |
| m_current_++; // skip opening brace |
| while(m_current_ < m_source_.length() && noOpenBraces != 0) { |
| if(m_source_.charAt(m_current_) == '[') { |
| noOpenBraces++; |
| } else if(m_source_.charAt(m_current_) == ']') { // closing brace |
| noOpenBraces--; |
| } |
| m_current_++; |
| } |
| m_optionEnd_ = m_current_-1; |
| return TOKEN_SUCCESS_MASK_; |
| } |
| else { |
| throwParseException(m_rules_, optionarg); |
| } |
| return TOKEN_SUCCESS_MASK_; // we will never reach here. |
| } |
| |
| /** |
| * Set collation option |
| * @param optionset option set to set |
| * @param attribute type to set |
| * @param value attribute value |
| */ |
| private void setOptions(OptionSet optionset, int attribute, int value) |
| { |
| switch (attribute) { |
| case RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_ : |
| optionset.m_isHiragana4_ |
| = (value == RuleBasedCollator.AttributeValue.ON_); |
| break; |
| case RuleBasedCollator.Attribute.FRENCH_COLLATION_ : |
| optionset.m_isFrenchCollation_ |
| = (value == RuleBasedCollator.AttributeValue.ON_); |
| break; |
| case RuleBasedCollator.Attribute.ALTERNATE_HANDLING_ : |
| optionset.m_isAlternateHandlingShifted_ |
| = (value |
| == RuleBasedCollator.AttributeValue.SHIFTED_); |
| break; |
| case RuleBasedCollator.Attribute.CASE_FIRST_ : |
| optionset.m_caseFirst_ = value; |
| break; |
| case RuleBasedCollator.Attribute.CASE_LEVEL_ : |
| optionset.m_isCaseLevel_ |
| = (value == RuleBasedCollator.AttributeValue.ON_); |
| break; |
| case RuleBasedCollator.Attribute.NORMALIZATION_MODE_ : |
| if (value == RuleBasedCollator.AttributeValue.ON_) { |
| value = Collator.CANONICAL_DECOMPOSITION; |
| } |
| optionset.m_decomposition_ = value; |
| break; |
| case RuleBasedCollator.Attribute.STRENGTH_ : |
| optionset.m_strength_ = value; |
| break; |
| default : |
| break; |
| } |
| } |
| |
| UnicodeSet getTailoredSet() throws ParseException |
| { |
| boolean startOfRules = true; |
| UnicodeSet tailored = new UnicodeSet(); |
| String pattern; |
| CanonicalIterator it = new CanonicalIterator(""); |
| |
| m_parsedToken_.m_strength_ = TOKEN_UNSET_; |
| int sourcelimit = m_source_.length(); |
| //int expandNext = 0; |
| |
| while (m_current_ < sourcelimit) { |
| m_parsedToken_.m_prefixOffset_ = 0; |
| if (parseNextToken(startOfRules) < 0) { |
| // we have reached the end |
| continue; |
| } |
| startOfRules = false; |
| // The idea is to tokenize the rule set. For each non-reset token, |
| // we add all the canonicaly equivalent FCD sequences |
| if(m_parsedToken_.m_strength_ != TOKEN_RESET_) { |
| it.setSource(m_source_.substring( |
| m_parsedToken_.m_charsOffset_, |
| m_parsedToken_.m_charsOffset_+m_parsedToken_.m_charsLen_)); |
| pattern = it.next(); |
| while(pattern != null) { |
| if(Normalizer.quickCheck(pattern, Normalizer.FCD,0) != Normalizer.NO) { |
| tailored.add(pattern); |
| } |
| pattern = it.next(); |
| } |
| } |
| } |
| return tailored; |
| } |
| |
| final private void extractSetsFromRules(String rules) throws ParseException { |
| int optionNumber = -1; |
| int setStart = 0; |
| int i = 0; |
| while(i < rules.length()) { |
| if(rules.charAt(i) == 0x005B) { |
| optionNumber = readOption(rules, i+1, rules.length()); |
| setStart = m_optionarg_; |
| if(optionNumber == 13) { /* copy - parts of UCA to tailoring */ |
| UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart); |
| if(m_copySet_ == null) { |
| m_copySet_ = newSet; |
| } else { |
| m_copySet_.addAll(newSet); |
| } |
| } else if(optionNumber == 14) { |
| UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart); |
| if(m_removeSet_ == null) { |
| m_removeSet_ = newSet; |
| } else { |
| m_removeSet_.addAll(newSet); |
| } |
| } |
| } |
| i++; |
| } |
| } |
| } |