| /* |
| ********************************************************************** |
| * Copyright (C) 1999, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| * Date Name Description |
| * 11/17/99 aliu Creation. |
| ********************************************************************** |
| */ |
| #include "rbt_pars.h" |
| #include "unicode/rbt.h" |
| #include "rbt_rule.h" |
| #include "unirange.h" |
| #include "rbt_data.h" |
| #include "unicode/uniset.h" |
| #include "cstring.h" |
| #include "unicode/parsepos.h" |
| |
| // Operators |
| const UChar TransliterationRuleParser::VARIABLE_DEF_OP = '='; |
| const UChar TransliterationRuleParser::FORWARD_RULE_OP = '>'; |
| const UChar TransliterationRuleParser::REVERSE_RULE_OP = '<'; |
| const UChar TransliterationRuleParser::FWDREV_RULE_OP = '~'; // internal rep of <> op |
| const UnicodeString TransliterationRuleParser::OPERATORS = UNICODE_STRING("=><", 3); |
| |
| // Other special characters |
| const UChar TransliterationRuleParser::QUOTE = '\''; |
| const UChar TransliterationRuleParser::ESCAPE = '\\'; |
| const UChar TransliterationRuleParser::END_OF_RULE = ';'; |
| const UChar TransliterationRuleParser::RULE_COMMENT_CHAR = '#'; |
| |
| const UChar TransliterationRuleParser::VARIABLE_REF_OPEN = '{'; |
| const UChar TransliterationRuleParser::VARIABLE_REF_CLOSE = '}'; |
| const UChar TransliterationRuleParser::CONTEXT_OPEN = '('; |
| const UChar TransliterationRuleParser::CONTEXT_CLOSE = ')'; |
| const UChar TransliterationRuleParser::SET_OPEN = '['; |
| const UChar TransliterationRuleParser::SET_CLOSE = ']'; |
| const UChar TransliterationRuleParser::CURSOR_POS = '|'; |
| |
| |
| TransliterationRuleData* |
| TransliterationRuleParser::parse(const UnicodeString& rules, |
| RuleBasedTransliterator::Direction direction) { |
| TransliterationRuleParser parser(rules, direction); |
| parser.parseRules(); |
| if (U_FAILURE(parser.status)) { |
| delete parser.data; |
| parser.data = 0; |
| } |
| return parser.data; |
| } |
| |
| /** |
| * @param rules list of rules, separated by newline characters |
| * @exception IllegalArgumentException if there is a syntax error in the |
| * rules |
| */ |
| TransliterationRuleParser::TransliterationRuleParser( |
| const UnicodeString& theRules, |
| RuleBasedTransliterator::Direction theDirection) : |
| rules(theRules), direction(theDirection), data(0) {} |
| |
| /** |
| * Parse the given string as a sequence of rules, separated by newline |
| * characters ('\n'), and cause this object to implement those rules. Any |
| * previous rules are discarded. Typically this method is called exactly |
| * once, during construction. |
| * @exception IllegalArgumentException if there is a syntax error in the |
| * rules |
| */ |
| void TransliterationRuleParser::parseRules(void) { |
| status = U_ZERO_ERROR; |
| |
| delete data; |
| data = new TransliterationRuleData(status); |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| determineVariableRange(); |
| |
| int32_t pos = 0; |
| int32_t limit = rules.length(); |
| while (pos < limit && U_SUCCESS(status)) { |
| UChar c = rules.charAt(pos++); |
| if (Unicode::isWhitespace(c)) { |
| // Ignore leading whitespace. Note that this is not |
| // Unicode spaces, but Java spaces -- a subset, |
| // representing whitespace likely to be seen in code. |
| continue; |
| } |
| // Skip lines starting with the comment character |
| if (c == RULE_COMMENT_CHAR) { |
| pos = rules.indexOf("\n", pos) + 1; |
| if (pos == 0) { |
| break; // No "\n" found; rest of rule is a commnet |
| } |
| continue; // Either fall out or restart with next line |
| } |
| // We've found the start of a rule. c is its first |
| // character, and pos points past c. Lexically parse the |
| // rule into component pieces. |
| pos = parseRule(--pos, limit); |
| } |
| |
| // Index the rules |
| if (U_SUCCESS(status)) { |
| data->ruleSet.freeze(*data, status); |
| } |
| } |
| |
| /** |
| * MAIN PARSER. Parse the next rule in the given rule string, starting |
| * at pos. Return the index after the last character parsed. Do not |
| * parse characters at or after limit. |
| * |
| * Important: The character at pos must be a non-whitespace character |
| * that is not the comment character. |
| * |
| * This method handles quoting, escaping, and whitespace removal. It |
| * parses the end-of-rule character. It recognizes context and cursor |
| * indicators. Once it does a lexical breakdown of the rule at pos, it |
| * creates a rule object and adds it to our rule list. |
| */ |
| int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) { |
| // Locate the left side, operator, and right side |
| int32_t start = pos; |
| UChar op = 0; |
| |
| UnicodeString buf; |
| int32_t cursor = -1; // position of cursor in buf |
| int32_t ante = -1; // position of ante context marker ')' in buf |
| int32_t post = -1; // position of post context marker '(' in buf |
| int32_t postClose = -1; // position of post context close ')' in buf |
| |
| // Assigned to buf and its adjuncts after the LHS has been |
| // parsed. Thereafter, buf etc. refer to the RHS. |
| UnicodeString left; |
| int32_t leftCursor = -1, leftAnte = -1, leftPost = -1, leftPostClose = -1; |
| |
| UnicodeString scratch; |
| |
| while (pos < limit) { |
| UChar c = rules.charAt(pos++); |
| if (Unicode::isWhitespace(c)) { |
| // Ignore whitespace. Note that this is not Unicode |
| // spaces, but Java spaces -- a subset, representing |
| // whitespace likely to be seen in code. |
| continue; |
| } |
| // Handle escapes |
| if (c == ESCAPE) { |
| if (pos == limit) { |
| return syntaxError("Trailing backslash", rules, start); |
| } |
| // Parse \uXXXX escapes |
| c = rules.charAt(pos++); |
| if (c == 'u') { |
| if ((pos+4) > limit) { |
| return syntaxError("Malformed Unicode escape", rules, start); |
| } |
| c = (UChar)0x0000; |
| for (int32_t plim=pos+4; pos<plim; ++pos) { // [sic] |
| int32_t digit = Unicode::digit(rules.charAt(pos), 16); |
| if (digit<0) { |
| return syntaxError("Malformed Unicode escape", rules, start); |
| } |
| c = (UChar) ((c << 4) | digit); |
| } |
| } |
| |
| buf.append(c); |
| continue; |
| } |
| // Handle quoted matter |
| if (c == QUOTE) { |
| int32_t iq = rules.indexOf(QUOTE, pos); |
| if (iq == pos) { |
| buf.append(c); // Parse [''] outside quotes as ['] |
| ++pos; |
| } else { |
| /* This loop picks up a segment of quoted text of the |
| * form 'aaaa' each time through. If this segment |
| * hasn't really ended ('aaaa''bbbb') then it keeps |
| * looping, each time adding on a new segment. When it |
| * reaches the final quote it breaks. |
| */ |
| for (;;) { |
| if (iq < 0) { |
| return syntaxError("Unterminated quote", rules, start); |
| } |
| scratch.truncate(0); |
| rules.extractBetween(pos, iq, scratch); |
| buf.append(scratch); |
| pos = iq+1; |
| if (pos < limit && rules.charAt(pos) == QUOTE) { |
| // Parse [''] inside quotes as ['] |
| iq = rules.indexOf(QUOTE, pos+1); |
| // Continue looping |
| } else { |
| break; |
| } |
| } |
| } |
| continue; |
| } |
| if (OPERATORS.indexOf(c) >= 0) { |
| if (op != 0) { |
| return syntaxError("Unquoted special", rules, start); |
| } |
| // Found an operator char. Check for forward-reverse operator. |
| if (c == REVERSE_RULE_OP && |
| (pos < limit && rules.charAt(pos) == FORWARD_RULE_OP)) { |
| ++pos; |
| op = FWDREV_RULE_OP; |
| } else { |
| op = c; |
| } |
| left = buf; // lhs |
| leftCursor = cursor; |
| leftAnte = ante; |
| leftPost = post; |
| leftPostClose = postClose; |
| |
| buf.truncate(0); |
| cursor = ante = post = postClose = -1; |
| continue; |
| } |
| if (c == END_OF_RULE) { |
| break; |
| } |
| switch (c) { |
| case VARIABLE_REF_OPEN: |
| { |
| int32_t j = rules.indexOf(VARIABLE_REF_CLOSE, pos); |
| if (pos == j || j < 0) { // empty or unterminated |
| return syntaxError("Malformed variable reference", rules, start); |
| } |
| scratch.truncate(0); |
| rules.extractBetween(pos, j, scratch); |
| pos = j+1; |
| UChar v = data->lookupVariable(scratch, status); |
| if (U_FAILURE(status)) { |
| return syntaxError("Undefined variable", rules, start); |
| } |
| buf.append(v); |
| } |
| break; |
| case CONTEXT_OPEN: |
| if (post >= 0) { |
| return syntaxError("Multiple post contexts", rules, start); |
| } |
| // Ignore CONTEXT_OPEN if buffer length is zero -- that means |
| // this is the optional opening delimiter for the ante context. |
| if (buf.length() > 0) { |
| post = buf.length(); |
| } |
| break; |
| case CONTEXT_CLOSE: |
| if (postClose >= 0) { |
| return syntaxError("Unexpected ')'", rules, start); |
| } |
| if (post >= 0) { |
| // This is probably the optional closing delimiter |
| // for the post context; save the pos and check later. |
| postClose = buf.length(); |
| } else if (ante >= 0) { |
| return syntaxError("Multiple ante contexts", rules, start); |
| } else { |
| ante = buf.length(); |
| } |
| break; |
| case SET_OPEN: { |
| ParsePosition pp(pos-1); // Backup to opening '[' |
| buf.append(registerSet(new UnicodeSet(rules, pp, data, status))); |
| if (U_FAILURE(status)) { |
| return syntaxError("Invalid set", rules, start); |
| } |
| pos = pp.getIndex(); } |
| break; |
| case VARIABLE_REF_CLOSE: |
| case SET_CLOSE: |
| return syntaxError("Unquoted special", rules, start); |
| case CURSOR_POS: |
| if (cursor >= 0) { |
| return syntaxError("Multiple cursors", rules, start); |
| } |
| cursor = buf.length(); |
| break; |
| default: |
| buf.append(c); |
| break; |
| } |
| } |
| if (op == 0) { |
| return syntaxError("No operator", rules, start); |
| } |
| |
| // Check context close parameters |
| if ((leftPostClose >= 0 && leftPostClose != left.length()) || |
| (postClose >= 0 && postClose != buf.length())) { |
| return syntaxError("Extra text after ]", rules, start); |
| } |
| |
| // Context is only allowed on the input side; that is, the left side |
| // for forward rules. Cursors are only allowed on the output side; |
| // that is, the right side for forward rules. Bidirectional rules |
| // ignore elements that do not apply. |
| |
| switch (op) { |
| case VARIABLE_DEF_OP: |
| // LHS is the name. RHS is a single character, either a literal |
| // or a set (already parsed). If RHS is longer than one |
| // character, it is either a multi-character string, or multiple |
| // sets, or a mixture of chars and sets -- syntax error. |
| if (buf.length() != 1) { |
| return syntaxError("Malformed RHS", rules, start); |
| } |
| if (data->isVariableDefined(left)) { |
| return syntaxError("Duplicate definition", rules, start); |
| } |
| data->defineVariable(left, buf.charAt(0), status); |
| break; |
| |
| case FORWARD_RULE_OP: |
| if (direction == RuleBasedTransliterator::FORWARD) { |
| if (ante >= 0 || post >= 0 || leftCursor >= 0) { |
| return syntaxError("Malformed rule", rules, start); |
| } |
| data->ruleSet.addRule(new TransliterationRule( |
| left, leftAnte, leftPost, |
| buf, cursor, status), status); |
| } // otherwise ignore the rule; it's not the direction we want |
| break; |
| |
| case REVERSE_RULE_OP: |
| if (direction == RuleBasedTransliterator::REVERSE) { |
| if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) { |
| return syntaxError("Malformed rule", rules, start); |
| } |
| data->ruleSet.addRule(new TransliterationRule( |
| buf, ante, post, |
| left, leftCursor, status), status); |
| } // otherwise ignore the rule; it's not the direction we want |
| break; |
| |
| case FWDREV_RULE_OP: |
| if (direction == RuleBasedTransliterator::FORWARD) { |
| // The output side is the right; trim off any context |
| if (post >= 0) { |
| buf.remove(post); |
| } |
| if (ante >= 0) { |
| buf.removeBetween(0, ante); |
| } |
| data->ruleSet.addRule(new TransliterationRule( |
| left, leftAnte, leftPost, |
| buf, cursor, status), status); |
| } else { |
| // The output side is the left; trim off any context |
| if (leftPost >= 0) { |
| left.remove(leftPost); |
| } |
| if (leftAnte >= 0) { |
| left.removeBetween(0, leftAnte); |
| } |
| data->ruleSet.addRule(new TransliterationRule( |
| buf, ante, post, |
| left, leftCursor, status), status); |
| } |
| break; |
| } |
| |
| return pos; |
| } |
| |
| /** |
| * Called by main parser upon syntax error. Search the rule string |
| * for the probable end of the rule. Of course, if the error is that |
| * the end of rule marker is missing, then the rule end will not be found. |
| * In any case the rule start will be correctly reported. |
| * @param msg error description |
| * @param rule pattern string |
| * @param start position of first character of current rule |
| */ |
| int32_t TransliterationRuleParser::syntaxError(const char* /*msg*/, |
| const UnicodeString& /*rule*/, |
| int32_t start) { |
| //| int end = quotedIndexOf(rule, start, rule.length(), ";"); |
| //| if (end < 0) { |
| //| end = rule.length(); |
| //| } |
| //| throw new IllegalArgumentException(msg + " in " + |
| //| rule.substring(start, end)); |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return start; |
| } |
| |
| /** |
| * Allocate a private-use substitution character for the given set, |
| * register it in the setVariables hash, and return the substitution |
| * character. |
| */ |
| UChar TransliterationRuleParser::registerSet(UnicodeSet* adoptedSet) { |
| if (variableNext >= variableLimit) { |
| // throw new RuntimeException("Private use variables exhausted"); |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| return 0; |
| } |
| UChar c = variableNext++; |
| data->defineSet(c, adoptedSet, status); |
| return c; |
| } |
| |
| /** |
| * Determines what part of the private use region of Unicode we can use for |
| * variable stand-ins. The correct way to do this is as follows: Parse each |
| * rule, and for forward and reverse rules, take the FROM expression, and |
| * make a hash of all characters used. The TO expression should be ignored. |
| * When done, everything not in the hash is available for use. In practice, |
| * this method may employ some other algorithm for improved speed. |
| */ |
| void TransliterationRuleParser::determineVariableRange(void) { |
| UnicodeRange privateUse(0xE000, 0x1900); // Private use area |
| |
| UnicodeRange* r = privateUse.largestUnusedSubrange(rules); |
| |
| variableNext = variableLimit = (UChar) 0; |
| |
| if (r != 0) { |
| variableNext = r->start; |
| variableLimit = (UChar) (r->start + r->length); |
| delete r; |
| } |
| |
| if (variableNext >= variableLimit) { |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| } |
| } |
| |
| /** |
| * Returns the index of the first character in a set, ignoring quoted text. |
| * For example, in the string "abc'hide'h", the 'h' in "hide" will not be |
| * found by a search for "h". Unlike String.indexOf(), this method searches |
| * not for a single character, but for any character of the string |
| * <code>setOfChars</code>. |
| * @param text text to be searched |
| * @param start the beginning index, inclusive; <code>0 <= start |
| * <= limit</code>. |
| * @param limit the ending index, exclusive; <code>start <= limit |
| * <= text.length()</code>. |
| * @param setOfChars string with one or more distinct characters |
| * @return Offset of the first character in <code>setOfChars</code> |
| * found, or -1 if not found. |
| * @see #indexOf |
| */ |
| int32_t TransliterationRuleParser::quotedIndexOf(const UnicodeString& text, |
| int32_t start, int32_t limit, |
| const UnicodeString& setOfChars) { |
| for (int32_t i=start; i<limit; ++i) { |
| UChar c = text.charAt(i); |
| if (c == QUOTE) { |
| while (++i < limit |
| && text.charAt(i) != QUOTE) {} |
| } else if (setOfChars.indexOf(c) >= 0) { |
| return i; |
| } |
| } |
| return -1; |
| } |