| /* |
| ********************************************************************** |
| * Copyright (C) 1999, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| * Date Name Description |
| * 11/17/99 aliu Creation. |
| ********************************************************************** |
| */ |
| #include "rbt_pars.h" |
| #include "unicode/rbt.h" |
| #include "rbt_rule.h" |
| #include "unirange.h" |
| #include "rbt_data.h" |
| #include "unicode/uniset.h" |
| #include "cstring.h" |
| #include "unicode/parsepos.h" |
| #include "symtable.h" |
| #include "unicode/parseerr.h" |
| |
| // Operators |
| const UChar TransliterationRuleParser::VARIABLE_DEF_OP = 0x003D/*=*/; |
| const UChar TransliterationRuleParser::FORWARD_RULE_OP = 0x003E/*>*/; |
| const UChar TransliterationRuleParser::REVERSE_RULE_OP = 0x003C/*<*/; |
| const UChar TransliterationRuleParser::FWDREV_RULE_OP = 0x007E/*~*/; // internal rep of <> op |
| const UnicodeString TransliterationRuleParser::OPERATORS = UNICODE_STRING("=><", 3); |
| |
| // Other special characters |
| const UChar TransliterationRuleParser::QUOTE = 0x0027/*'*/; |
| const UChar TransliterationRuleParser::ESCAPE = 0x005C/*\*/; |
| const UChar TransliterationRuleParser::END_OF_RULE = 0x003B/*;*/; |
| const UChar TransliterationRuleParser::RULE_COMMENT_CHAR = 0x0023/*#*/; |
| |
| const UChar TransliterationRuleParser::VARIABLE_REF_OPEN = 0x007B/*{*/; |
| const UChar TransliterationRuleParser::VARIABLE_REF_CLOSE = 0x007D/*}*/; |
| const UChar TransliterationRuleParser::CONTEXT_OPEN = 0x0028/*(*/; |
| const UChar TransliterationRuleParser::CONTEXT_CLOSE = 0x0029/*)*/; |
| const UChar TransliterationRuleParser::SET_OPEN = 0x005B/*[*/; |
| const UChar TransliterationRuleParser::SET_CLOSE = 0x005D/*]*/; |
| const UChar TransliterationRuleParser::CURSOR_POS = 0x007C/*|*/; |
| |
| //---------------------------------------------------------------------- |
| // BEGIN ParseData |
| //---------------------------------------------------------------------- |
| |
| /** |
| * This class implements the SymbolTable interface. It is used |
| * during parsing to give UnicodeSet access to variables that |
| * have been defined so far. Note that it uses setVariablesVector, |
| * _not_ data.setVariables. |
| */ |
| class ParseData : public SymbolTable { |
| public: |
| const TransliterationRuleData* data; // alias |
| |
| const UVector* setVariablesVector; // alias |
| |
| ParseData(const TransliterationRuleData* data = 0, |
| const UVector* setVariablesVector = 0); |
| |
| /** |
| * Lookup the object associated with this string and return it. |
| * Return U_ILLEGAL_ARGUMENT_ERROR status if the name does not |
| * exist. Return a non-NULL set if the name is mapped to a set; |
| * otherwise return a NULL set. |
| */ |
| virtual void lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set, |
| UErrorCode& status) const; |
| }; |
| |
| ParseData::ParseData(const TransliterationRuleData* d, |
| const UVector* sets) : |
| data(d), setVariablesVector(sets) {} |
| |
| /** |
| * Implement SymbolTable API. Lookup a variable, returning |
| * either a Character, a UnicodeSet, or null. |
| */ |
| void ParseData::lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set, |
| UErrorCode& status) const { |
| c = data->lookupVariable(name, status); |
| if (U_SUCCESS(status)) { |
| int32_t i = c - data->setVariablesBase; |
| set = (i < setVariablesVector->size()) ? |
| (UnicodeSet*) setVariablesVector->elementAt(i) : 0; |
| } |
| } |
| |
| //---------------------------------------------------------------------- |
| // END ParseData |
| //---------------------------------------------------------------------- |
| |
| TransliterationRuleData* |
| TransliterationRuleParser::parse(const UnicodeString& rules, |
| RuleBasedTransliterator::Direction direction, |
| ParseError* parseError) { |
| TransliterationRuleParser parser(rules, direction, parseError); |
| parser.parseRules(); |
| if (U_FAILURE(parser.status)) { |
| delete parser.data; |
| parser.data = 0; |
| } |
| return parser.data; |
| } |
| |
| /** |
| * @param rules list of rules, separated by newline characters |
| * @exception IllegalArgumentException if there is a syntax error in the |
| * rules |
| */ |
| TransliterationRuleParser::TransliterationRuleParser( |
| const UnicodeString& theRules, |
| RuleBasedTransliterator::Direction theDirection, |
| ParseError* theParseError) : |
| rules(theRules), direction(theDirection), data(0), parseError(theParseError) { |
| parseData = new ParseData(0, &setVariablesVector); |
| } |
| |
| /** |
| * Destructor. |
| */ |
| TransliterationRuleParser::~TransliterationRuleParser() { |
| delete parseData; |
| } |
| |
| /** |
| * Parse the given string as a sequence of rules, separated by newline |
| * characters ('\n'), and cause this object to implement those rules. Any |
| * previous rules are discarded. Typically this method is called exactly |
| * once, during construction. |
| * @exception IllegalArgumentException if there is a syntax error in the |
| * rules |
| */ |
| void TransliterationRuleParser::parseRules(void) { |
| status = U_ZERO_ERROR; |
| |
| delete data; |
| data = new TransliterationRuleData(status); |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| parseData->data = data; |
| setVariablesVector.removeAllElements(); |
| if (parseError != 0) { |
| parseError->code = 0; |
| } |
| determineVariableRange(); |
| |
| int32_t pos = 0; |
| int32_t limit = rules.length(); |
| while (pos < limit && U_SUCCESS(status)) { |
| UChar c = rules.charAt(pos++); |
| if (Unicode::isWhitespace(c)) { |
| // Ignore leading whitespace. Note that this is not |
| // Unicode spaces, but Java spaces -- a subset, |
| // representing whitespace likely to be seen in code. |
| continue; |
| } |
| // Skip lines starting with the comment character |
| if (c == RULE_COMMENT_CHAR) { |
| pos = rules.indexOf((UChar)0x000A /*\n*/, pos) + 1; |
| if (pos == 0) { |
| break; // No "\n" found; rest of rule is a commnet |
| } |
| continue; // Either fall out or restart with next line |
| } |
| // We've found the start of a rule. c is its first |
| // character, and pos points past c. Lexically parse the |
| // rule into component pieces. |
| pos = parseRule(--pos, limit); |
| } |
| |
| // Convert the set vector to an array |
| data->setVariablesLength = setVariablesVector.size(); |
| data->setVariables = new UnicodeSet*[data->setVariablesLength]; |
| // orphanElement removes the given element and shifts all other |
| // elements down. For performance (and code clarity) we work from |
| // the end back to index 0. |
| for (int32_t i=data->setVariablesLength; i>0; ) { |
| --i; |
| data->setVariables[i] = |
| (UnicodeSet*) setVariablesVector.orphanElementAt(i); |
| } |
| |
| // Index the rules |
| if (U_SUCCESS(status)) { |
| data->ruleSet.freeze(*data, status); |
| } |
| } |
| |
| /** |
| * MAIN PARSER. Parse the next rule in the given rule string, starting |
| * at pos. Return the index after the last character parsed. Do not |
| * parse characters at or after limit. |
| * |
| * Important: The character at pos must be a non-whitespace character |
| * that is not the comment character. |
| * |
| * This method handles quoting, escaping, and whitespace removal. It |
| * parses the end-of-rule character. It recognizes context and cursor |
| * indicators. Once it does a lexical breakdown of the rule at pos, it |
| * creates a rule object and adds it to our rule list. |
| */ |
| int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) { |
| // Locate the left side, operator, and right side |
| int32_t start = pos; |
| UChar op = 0; |
| |
| UnicodeString buf; |
| int32_t cursor = -1; // position of cursor in buf |
| int32_t ante = -1; // position of ante context marker ')' in buf |
| int32_t post = -1; // position of post context marker '(' in buf |
| int32_t postClose = -1; // position of post context close ')' in buf |
| |
| // Assigned to buf and its adjuncts after the LHS has been |
| // parsed. Thereafter, buf etc. refer to the RHS. |
| UnicodeString left; |
| int32_t leftCursor = -1, leftAnte = -1, leftPost = -1, leftPostClose = -1; |
| |
| UnicodeString scratch; |
| |
| while (pos < limit) { |
| UChar c = rules.charAt(pos++); |
| if (Unicode::isWhitespace(c)) { |
| // Ignore whitespace. Note that this is not Unicode |
| // spaces, but Java spaces -- a subset, representing |
| // whitespace likely to be seen in code. |
| continue; |
| } |
| // Handle escapes |
| if (c == ESCAPE) { |
| if (pos == limit) { |
| return syntaxError(RuleBasedTransliterator::TRAILING_BACKSLASH, rules, start); |
| } |
| // Parse \uXXXX escapes |
| c = rules.charAt(pos++); |
| if (c == 0x0075/*u*/) { |
| if ((pos+4) > limit) { |
| return syntaxError(RuleBasedTransliterator::MALFORMED_UNICODE_ESCAPE, rules, start); |
| } |
| c = (UChar)0x0000; |
| for (int32_t plim=pos+4; pos<plim; ++pos) { // [sic] |
| int32_t digit = Unicode::digit(rules.charAt(pos), 16); |
| if (digit<0) { |
| return syntaxError(RuleBasedTransliterator::MALFORMED_UNICODE_ESCAPE, rules, start); |
| } |
| c = (UChar) ((c << 4) | digit); |
| } |
| } |
| |
| buf.append(c); |
| continue; |
| } |
| // Handle quoted matter |
| if (c == QUOTE) { |
| int32_t iq = rules.indexOf(QUOTE, pos); |
| if (iq == pos) { |
| buf.append(c); // Parse [''] outside quotes as ['] |
| ++pos; |
| } else { |
| /* This loop picks up a segment of quoted text of the |
| * form 'aaaa' each time through. If this segment |
| * hasn't really ended ('aaaa''bbbb') then it keeps |
| * looping, each time adding on a new segment. When it |
| * reaches the final quote it breaks. |
| */ |
| for (;;) { |
| if (iq < 0) { |
| return syntaxError(RuleBasedTransliterator::UNTERMINATED_QUOTE, rules, start); |
| } |
| scratch.truncate(0); |
| rules.extractBetween(pos, iq, scratch); |
| buf.append(scratch); |
| pos = iq+1; |
| if (pos < limit && rules.charAt(pos) == QUOTE) { |
| // Parse [''] inside quotes as ['] |
| iq = rules.indexOf(QUOTE, pos+1); |
| // Continue looping |
| } else { |
| break; |
| } |
| } |
| } |
| continue; |
| } |
| if (OPERATORS.indexOf(c) >= 0) { |
| if (op != 0) { |
| return syntaxError(RuleBasedTransliterator::UNQUOTED_SPECIAL, rules, start); |
| } |
| // Found an operator char. Check for forward-reverse operator. |
| if (c == REVERSE_RULE_OP && |
| (pos < limit && rules.charAt(pos) == FORWARD_RULE_OP)) { |
| ++pos; |
| op = FWDREV_RULE_OP; |
| } else { |
| op = c; |
| } |
| left = buf; // lhs |
| leftCursor = cursor; |
| leftAnte = ante; |
| leftPost = post; |
| leftPostClose = postClose; |
| |
| buf.truncate(0); |
| cursor = ante = post = postClose = -1; |
| continue; |
| } |
| if (c == END_OF_RULE) { |
| break; |
| } |
| switch (c) { |
| case VARIABLE_REF_OPEN: |
| { |
| int32_t j = rules.indexOf(VARIABLE_REF_CLOSE, pos); |
| if (pos == j || j < 0) { // empty or unterminated |
| return syntaxError(RuleBasedTransliterator::MALFORMED_VARIABLE_REFERENCE, rules, start); |
| } |
| scratch.truncate(0); |
| rules.extractBetween(pos, j, scratch); |
| pos = j+1; |
| UChar v = data->lookupVariable(scratch, status); |
| if (U_FAILURE(status)) { |
| return syntaxError(RuleBasedTransliterator::UNDEFINED_VARIABLE, rules, start); |
| } |
| buf.append(v); |
| } |
| break; |
| case CONTEXT_OPEN: |
| if (post >= 0) { |
| return syntaxError(RuleBasedTransliterator::MULTIPLE_POST_CONTEXTS, rules, start); |
| } |
| // Ignore CONTEXT_OPEN if buffer length is zero -- that means |
| // this is the optional opening delimiter for the ante context. |
| if (buf.length() > 0) { |
| post = buf.length(); |
| } |
| break; |
| case CONTEXT_CLOSE: |
| if (postClose >= 0) { |
| return syntaxError(RuleBasedTransliterator::UNEXPECTED_CLOSE_CONTEXT, rules, start); |
| } |
| if (post >= 0) { |
| // This is probably the optional closing delimiter |
| // for the post context; save the pos and check later. |
| postClose = buf.length(); |
| } else if (ante >= 0) { |
| return syntaxError(RuleBasedTransliterator::MULTIPLE_ANTE_CONTEXTS, rules, start); |
| } else { |
| ante = buf.length(); |
| } |
| break; |
| case SET_OPEN: { |
| ParsePosition pp(pos-1); // Backup to opening '[' |
| buf.append(registerSet(new UnicodeSet(rules, pp, *parseData, status))); |
| if (U_FAILURE(status)) { |
| return syntaxError(RuleBasedTransliterator::MALFORMED_SET, rules, start); |
| } |
| pos = pp.getIndex(); } |
| break; |
| case VARIABLE_REF_CLOSE: |
| case SET_CLOSE: |
| return syntaxError(RuleBasedTransliterator::UNQUOTED_SPECIAL, rules, start); |
| case CURSOR_POS: |
| if (cursor >= 0) { |
| return syntaxError(RuleBasedTransliterator::MULTIPLE_CURSORS, rules, start); |
| } |
| cursor = buf.length(); |
| break; |
| default: |
| buf.append(c); |
| break; |
| } |
| } |
| if (op == 0) { |
| return syntaxError(RuleBasedTransliterator::MISSING_OPERATOR, rules, start); |
| } |
| |
| // Check context close parameters |
| if ((leftPostClose >= 0 && leftPostClose != left.length()) || |
| (postClose >= 0 && postClose != buf.length())) { |
| return syntaxError(RuleBasedTransliterator::TEXT_AFTER_CLOSE_CONTEXT, rules, start); |
| } |
| |
| // Context is only allowed on the input side; that is, the left side |
| // for forward rules. Cursors are only allowed on the output side; |
| // that is, the right side for forward rules. Bidirectional rules |
| // ignore elements that do not apply. |
| |
| switch (op) { |
| case VARIABLE_DEF_OP: |
| // LHS is the name. RHS is a single character, either a literal |
| // or a set (already parsed). If RHS is longer than one |
| // character, it is either a multi-character string, or multiple |
| // sets, or a mixture of chars and sets -- syntax error. |
| if (buf.length() != 1) { |
| return syntaxError(RuleBasedTransliterator::MALFORMED_RHS, rules, start); |
| } |
| if (data->isVariableDefined(left)) { |
| return syntaxError(RuleBasedTransliterator::DUPLICATE_VARIABLE_DEFINITION, rules, start); |
| } |
| data->defineVariable(left, buf.charAt(0), status); |
| break; |
| |
| case FORWARD_RULE_OP: |
| if (direction == RuleBasedTransliterator::FORWARD) { |
| if (ante >= 0 || post >= 0 || leftCursor >= 0) { |
| return syntaxError(RuleBasedTransliterator::MALFORMED_RULE, rules, start); |
| } |
| data->ruleSet.addRule(new TransliterationRule( |
| left, leftAnte, leftPost, |
| buf, cursor, status), status); |
| } // otherwise ignore the rule; it's not the direction we want |
| break; |
| |
| case REVERSE_RULE_OP: |
| if (direction == RuleBasedTransliterator::REVERSE) { |
| if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) { |
| return syntaxError(RuleBasedTransliterator::MALFORMED_RULE, rules, start); |
| } |
| data->ruleSet.addRule(new TransliterationRule( |
| buf, ante, post, |
| left, leftCursor, status), status); |
| } // otherwise ignore the rule; it's not the direction we want |
| break; |
| |
| case FWDREV_RULE_OP: |
| if (direction == RuleBasedTransliterator::FORWARD) { |
| // The output side is the right; trim off any context |
| if (post >= 0) { |
| buf.remove(post); |
| } |
| if (ante >= 0) { |
| buf.removeBetween(0, ante); |
| } |
| data->ruleSet.addRule(new TransliterationRule( |
| left, leftAnte, leftPost, |
| buf, cursor, status), status); |
| } else { |
| // The output side is the left; trim off any context |
| if (leftPost >= 0) { |
| left.remove(leftPost); |
| } |
| if (leftAnte >= 0) { |
| left.removeBetween(0, leftAnte); |
| } |
| data->ruleSet.addRule(new TransliterationRule( |
| buf, ante, post, |
| left, leftCursor, status), status); |
| } |
| break; |
| } |
| |
| return pos; |
| } |
| |
| /** |
| * Called by main parser upon syntax error. Search the rule string |
| * for the probable end of the rule. Of course, if the error is that |
| * the end of rule marker is missing, then the rule end will not be found. |
| * In any case the rule start will be correctly reported. |
| * @param msg error description |
| * @param rule pattern string |
| * @param start position of first character of current rule |
| */ |
| int32_t TransliterationRuleParser::syntaxError(int32_t parseErrorCode, |
| const UnicodeString& rule, |
| int32_t start) { |
| if (parseError != 0) { |
| parseError->code = parseErrorCode; |
| parseError->line = 0; // We don't return a line # |
| parseError->offset = start; // Character offset from rule start |
| int32_t end = quotedIndexOf(rule, start, rule.length(), END_OF_RULE); |
| if (end < 0) { |
| end = rule.length(); |
| } |
| rule.extractBetween(start, end, parseError->context); // Current rule |
| } |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return start; |
| } |
| |
| /** |
| * Allocate a private-use substitution character for the given set, |
| * register it in the setVariables hash, and return the substitution |
| * character. |
| */ |
| UChar TransliterationRuleParser::registerSet(UnicodeSet* adoptedSet) { |
| if (variableNext >= variableLimit) { |
| // throw new RuntimeException("Private use variables exhausted"); |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return 0; |
| } |
| setVariablesVector.addElement(adoptedSet); |
| return variableNext++; |
| } |
| |
| /** |
| * Determines what part of the private use region of Unicode we can use for |
| * variable stand-ins. The correct way to do this is as follows: Parse each |
| * rule, and for forward and reverse rules, take the FROM expression, and |
| * make a hash of all characters used. The TO expression should be ignored. |
| * When done, everything not in the hash is available for use. In practice, |
| * this method may employ some other algorithm for improved speed. |
| */ |
| void TransliterationRuleParser::determineVariableRange(void) { |
| UnicodeRange privateUse(0xE000, 0x1900); // Private use area |
| |
| UnicodeRange* r = privateUse.largestUnusedSubrange(rules); |
| |
| data->setVariablesBase = variableNext = variableLimit = (UChar) 0; |
| |
| if (r != 0) { |
| data->setVariablesBase = variableNext = r->start; |
| variableLimit = (UChar) (r->start + r->length); |
| delete r; |
| } |
| |
| if (variableNext >= variableLimit) { |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| } |
| } |
| |
| /** |
| * Returns the index of a character, ignoring quoted text. |
| * For example, in the string "abc'hide'h", the 'h' in "hide" will not be |
| * found by a search for 'h'. |
| */ |
| int32_t TransliterationRuleParser::quotedIndexOf(const UnicodeString& text, |
| int32_t start, int32_t limit, |
| UChar charToFind) { |
| for (int32_t i=start; i<limit; ++i) { |
| UChar c = text.charAt(i); |
| if (c == ESCAPE) { |
| ++i; |
| } else if (c == QUOTE) { |
| while (++i < limit |
| && text.charAt(i) != QUOTE) {} |
| } else if (c == charToFind) { |
| return i; |
| } |
| } |
| return -1; |
| } |