// source/i18n/rbt_pars.cpp - external/github.com/unicode-org/icu - Git at Google

 /*
 **********************************************************************
 *   Copyright (C) 1999, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
 *   11/17/99    aliu        Creation.
 **********************************************************************
 */
 #include "rbt_pars.h"
 #include "unicode/rbt.h"
 #include "rbt_rule.h"
 #include "unirange.h"
 #include "rbt_data.h"
 #include "unicode/uniset.h"
 #include "cstring.h"
 #include "unicode/parsepos.h"

 // Operators.  A rule has exactly one of these between its left and right
 // sides; parseRule() rejects a second unquoted operator character.
 const UChar TransliterationRuleParser::VARIABLE_DEF_OP = '=';
 const UChar TransliterationRuleParser::FORWARD_RULE_OP = '>';
 const UChar TransliterationRuleParser::REVERSE_RULE_OP = '<';
 const UChar TransliterationRuleParser::FWDREV_RULE_OP  = '~'; // internal rep of <> op
 // The characters that can start an operator.  The two-character "<>" form
 // is recognized in parseRule() as '<' immediately followed by '>'.
 const UnicodeString TransliterationRuleParser::OPERATORS = UNICODE_STRING("=><", 3);

 // Other special characters
 const UChar TransliterationRuleParser::QUOTE = '\'';
 const UChar TransliterationRuleParser::ESCAPE = '\\';
 const UChar TransliterationRuleParser::END_OF_RULE = ';';
 const UChar TransliterationRuleParser::RULE_COMMENT_CHAR = '#';

 // Delimiters recognized inside a rule: {name} variable references,
 // ( ) context markers, [ ] sets, and the cursor marker.
 const UChar TransliterationRuleParser::VARIABLE_REF_OPEN = '{';
 const UChar TransliterationRuleParser::VARIABLE_REF_CLOSE = '}';
 const UChar TransliterationRuleParser::CONTEXT_OPEN = '(';
 const UChar TransliterationRuleParser::CONTEXT_CLOSE = ')';
 const UChar TransliterationRuleParser::SET_OPEN = '[';
 const UChar TransliterationRuleParser::SET_CLOSE = ']';
 const UChar TransliterationRuleParser::CURSOR_POS = '|';


 /**
  * Convenience entry point: build a temporary parser over
  * <code>rules</code>, run it, and hand back the rule data it produced.
  * @param rules the rule source text
  * @param direction the direction whose rules should be retained
  * @return the parsed rule data, or 0 if parsing failed.  The temporary
  * parser does not delete the data on success, so the caller presumably
  * takes ownership -- confirm against the class declaration.
  */
 TransliterationRuleData*
 TransliterationRuleParser::parse(const UnicodeString& rules,
                                  RuleBasedTransliterator::Direction direction) {
     TransliterationRuleParser worker(rules, direction);
     worker.parseRules();

     if (U_SUCCESS(worker.status)) {
         return worker.data;
     }

     // Parsing failed: discard whatever was built and report no result.
     delete worker.data;
     worker.data = 0;
     return 0;
 }

 /**
  * Construct a parser over the given rule text.  Nothing is parsed here:
  * the constructor only records its arguments and starts with no rule
  * data.  Parsing, and any resulting error status, happens in
  * parseRules().
  * @param theRules the rule source text to be parsed
  * @param theDirection the direction whose rules should be retained
  */
 TransliterationRuleParser::TransliterationRuleParser(
         const UnicodeString& theRules,
         RuleBasedTransliterator::Direction theDirection)
     : rules(theRules),
       direction(theDirection),
       data(0)
 {
 }

 /**
  * Parse the given string as a sequence of rules, separated by newline
  * characters ('\n'), and cause this object to implement those rules.  Any
  * previous rules are discarded.  Typically this method is called exactly
  * once, during construction.
  * @exception IllegalArgumentException if there is a syntax error in the
  * rules
  */
 void TransliterationRuleParser::parseRules(void) {
     status = U_ZERO_ERROR;

     delete data;
     data = new TransliterationRuleData(status);
     if (U_FAILURE(status)) {
         return;
     }

     determineVariableRange();

     int32_t pos = 0;
     int32_t limit = rules.length();
     while (pos < limit && U_SUCCESS(status)) {
         UChar c = rules.charAt(pos++);
         if (Unicode::isWhitespace(c)) {
             // Ignore leading whitespace.  Note that this is not
             // Unicode spaces, but Java spaces -- a subset,
             // representing whitespace likely to be seen in code.
             continue;
         }
         // Skip lines starting with the comment character
         if (c == RULE_COMMENT_CHAR) {
             pos = rules.indexOf("\n", pos) + 1;
             if (pos == 0) {
                 break; // No "\n" found; rest of rule is a commnet
             }
             continue; // Either fall out or restart with next line
         }
         // We've found the start of a rule.  c is its first
         // character, and pos points past c.  Lexically parse the
         // rule into component pieces.
         pos = parseRule(--pos, limit);
     }

     // Index the rules
     if (U_SUCCESS(status)) {
         data->ruleSet.freeze(*data, status);
     }
 }

 /**
  * MAIN PARSER.  Parse the next rule in the given rule string, starting
  * at pos.  Return the index after the last character parsed.  Do not
  * parse characters at or after limit.
  *
  * Important:  The character at pos must be a non-whitespace character
  * that is not the comment character.
  *
  * This method handles quoting, escaping, and whitespace removal.  It
  * parses the end-of-rule character.  It recognizes context and cursor
  * indicators.  Once it does a lexical breakdown of the rule at pos, it
  * creates a rule object and adds it to our rule list.
  *
  * On any syntax error the parser status is set to
  * U_ILLEGAL_ARGUMENT_ERROR (through syntaxError()) and the start of the
  * rule is returned instead of an advanced position.
  *
  * @param pos index in <code>rules</code> of the first character of the
  * rule
  * @param limit index at which parsing must stop (exclusive)
  * @return the index just past the last character consumed, or the
  * original <code>pos</code> if a syntax error was detected
  */
 int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
     // Locate the left side, operator, and right side
     int32_t start = pos;
     UChar op = 0; // 0 until an operator has been seen

     // Text of the side currently being parsed, with quotes, escapes and
     // whitespace resolved and markers removed.  The marker positions
     // below are offsets into buf; -1 means "not seen".
     UnicodeString buf;
     int32_t cursor = -1; // position of cursor in buf
     int32_t ante = -1;   // position of ante context marker ')' in buf
     int32_t post = -1;   // position of post context marker '(' in buf
     int32_t postClose = -1; // position of post context close ')' in buf

     // Assigned to buf and its adjuncts after the LHS has been
     // parsed.  Thereafter, buf etc. refer to the RHS.
     UnicodeString left;
     int32_t leftCursor = -1, leftAnte = -1, leftPost = -1, leftPostClose = -1;

     // Reusable temporary for substrings extracted from the rule text.
     UnicodeString scratch;

     while (pos < limit) {
         UChar c = rules.charAt(pos++);
         if (Unicode::isWhitespace(c)) {
             // Ignore whitespace.  Note that this is not Unicode
             // spaces, but Java spaces -- a subset, representing
             // whitespace likely to be seen in code.
             continue;
         }
         // Handle escapes
         if (c == ESCAPE) {
             if (pos == limit) {
                 return syntaxError("Trailing backslash", rules, start);
             }
             // Parse \uXXXX escapes
             c = rules.charAt(pos++);
             if (c == 'u') {
                 // Exactly four hex digits must follow the 'u'.
                 if ((pos+4) > limit) {
                     return syntaxError("Malformed Unicode escape", rules, start);
                 }
                 c = (UChar)0x0000;
                 for (int32_t plim=pos+4; pos<plim; ++pos) { // [sic]
                     int32_t digit = Unicode::digit(rules.charAt(pos), 16);
                     if (digit<0) {
                         return syntaxError("Malformed Unicode escape", rules, start);
                     }
                     c = (UChar) ((c << 4) | digit);
                 }
             }

             // Any other escaped character is taken literally, so the
             // special-character handling below is bypassed.
             buf.append(c);
             continue;
         }
         // Handle quoted matter
         if (c == QUOTE) {
             int32_t iq = rules.indexOf(QUOTE, pos);
             if (iq == pos) {
                 buf.append(c); // Parse [''] outside quotes as [']
                 ++pos;
             } else {
                 /* This loop picks up a segment of quoted text of the
                  * form 'aaaa' each time through.  If this segment
                  * hasn't really ended ('aaaa''bbbb') then it keeps
                  * looping, each time adding on a new segment.  When it
                  * reaches the final quote it breaks.
                  */
                 for (;;) {
                     if (iq < 0) {
                         return syntaxError("Unterminated quote", rules, start);
                     }
                     scratch.truncate(0);
                     rules.extractBetween(pos, iq, scratch);
                     buf.append(scratch);
                     pos = iq+1;
                     if (pos < limit && rules.charAt(pos) == QUOTE) {
                         // Parse [''] inside quotes as [']
                         // pos is left on the second quote of the pair,
                         // so the next extraction starts with a literal
                         // quote character.
                         iq = rules.indexOf(QUOTE, pos+1);
                         // Continue looping
                     } else {
                         break;
                     }
                 }
             }
             continue;
         }
         if (OPERATORS.indexOf(c) >= 0) {
             // Only one unquoted operator is allowed per rule.
             if (op != 0) {
                 return syntaxError("Unquoted special", rules, start);
             }
             // Found an operator char.  Check for forward-reverse operator.
             if (c == REVERSE_RULE_OP &&
                 (pos < limit && rules.charAt(pos) == FORWARD_RULE_OP)) {
                 ++pos;
                 op = FWDREV_RULE_OP;
             } else {
                 op = c;
             }
             // Everything accumulated so far is the left-hand side; save
             // it and reset the working state for the right-hand side.
             left = buf; // lhs
             leftCursor = cursor;
             leftAnte = ante;
             leftPost = post;
             leftPostClose = postClose;

             buf.truncate(0);
             cursor = ante = post = postClose = -1;
             continue;
         }
         if (c == END_OF_RULE) {
             // pos already points past the terminator.
             break;
         }
         switch (c) {
         case VARIABLE_REF_OPEN:
             {
                 // {name}: replace the reference with the single
                 // character the variable was defined as.
                 int32_t j = rules.indexOf(VARIABLE_REF_CLOSE, pos);
                 if (pos == j || j < 0) { // empty or unterminated
                     return syntaxError("Malformed variable reference", rules, start);
                 }
                 scratch.truncate(0);
                 rules.extractBetween(pos, j, scratch);
                 pos = j+1;
                 UChar v = data->lookupVariable(scratch, status);
                 if (U_FAILURE(status)) {
                     return syntaxError("Undefined variable", rules, start);
                 }
                 buf.append(v);
             }
             break;
         case CONTEXT_OPEN:
             if (post >= 0) {
                 return syntaxError("Multiple post contexts", rules, start);
             }
             // Ignore CONTEXT_OPEN if buffer length is zero -- that means
             // this is the optional opening delimiter for the ante context.
             if (buf.length() > 0) {
                 post = buf.length();
             }
             break;
         case CONTEXT_CLOSE:
             if (postClose >= 0) {
                 return syntaxError("Unexpected ')'", rules, start);
             }
             if (post >= 0) {
                 // This is probably the optional closing delimiter
                 // for the post context; save the pos and check later.
                 postClose = buf.length();
             } else if (ante >= 0) {
                 return syntaxError("Multiple ante contexts", rules, start);
             } else {
                 ante = buf.length();
             }
             break;
         case SET_OPEN: {
             // The UnicodeSet constructor parses the bracketed pattern
             // itself and advances pp; the set is then represented in
             // buf by the stand-in character registerSet() returns.
             ParsePosition pp(pos-1); // Backup to opening '['
             buf.append(registerSet(new UnicodeSet(rules, pp, data, status)));
             if (U_FAILURE(status)) {
                 return syntaxError("Invalid set", rules, start);
             }
             pos = pp.getIndex(); }
             break;
         case VARIABLE_REF_CLOSE:
         case SET_CLOSE:
             // A closing delimiter with no matching opener.
             return syntaxError("Unquoted special", rules, start);
         case CURSOR_POS:
             if (cursor >= 0) {
                 return syntaxError("Multiple cursors", rules, start);
             }
             cursor = buf.length();
             break;
         default:
             buf.append(c);
             break;
         }
     }
     if (op == 0) {
         return syntaxError("No operator", rules, start);
     }

     // Check context close parameters
     // A post-context ')' must be the last thing on its side.
     // NOTE(review): the message says ']' although the check concerns
     // ')'; syntaxError() currently discards the message.
     if ((leftPostClose >= 0 && leftPostClose != left.length()) ||
         (postClose >= 0 && postClose != buf.length())) {
         return syntaxError("Extra text after ]", rules, start);
     }

     // Context is only allowed on the input side; that is, the left side
     // for forward rules.  Cursors are only allowed on the output side;
     // that is, the right side for forward rules.  Bidirectional rules
     // ignore elements that do not apply.

     switch (op) {
     case VARIABLE_DEF_OP:
         // LHS is the name.  RHS is a single character, either a literal
         // or a set (already parsed).  If RHS is longer than one
         // character, it is either a multi-character string, or multiple
         // sets, or a mixture of chars and sets -- syntax error.
         if (buf.length() != 1) {
             return syntaxError("Malformed RHS", rules, start);
         }
         if (data->isVariableDefined(left)) {
             return syntaxError("Duplicate definition", rules, start);
         }
         data->defineVariable(left, buf.charAt(0), status);
         break;

     case FORWARD_RULE_OP:
         if (direction == RuleBasedTransliterator::FORWARD) {
             // Input is the left side, output is the right side.
             if (ante >= 0 || post >= 0 || leftCursor >= 0) {
                 return syntaxError("Malformed rule", rules, start);
             }
             data->ruleSet.addRule(new TransliterationRule(
                                      left, leftAnte, leftPost,
                                      buf, cursor, status), status);
         } // otherwise ignore the rule; it's not the direction we want
         break;

     case REVERSE_RULE_OP:
         if (direction == RuleBasedTransliterator::REVERSE) {
             // Input is the right side, output is the left side.
             if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) {
                 return syntaxError("Malformed rule", rules, start);
             }
             data->ruleSet.addRule(new TransliterationRule(
                                      buf, ante, post,
                                      left, leftCursor, status), status);
         } // otherwise ignore the rule; it's not the direction we want
         break;

     case FWDREV_RULE_OP:
         // NOTE(review): in both branches the cursor offset was measured
         // before the context was trimmed from the output side and is
         // passed on unadjusted -- confirm that is intended.
         if (direction == RuleBasedTransliterator::FORWARD) {
             // The output side is the right; trim off any context
             if (post >= 0) {
                 buf.remove(post);
             }
             if (ante >= 0) {
                 buf.removeBetween(0, ante);
             }
             data->ruleSet.addRule(new TransliterationRule(
                                      left, leftAnte, leftPost,
                                      buf, cursor, status), status);
         } else {
             // The output side is the left; trim off any context
             if (leftPost >= 0) {
                 left.remove(leftPost);
             }
             if (leftAnte >= 0) {
                 left.removeBetween(0, leftAnte);
             }
             data->ruleSet.addRule(new TransliterationRule(
                                      buf, ante, post,
                                      left, leftCursor, status), status);
         }
         break;
     }

     return pos;
 }

 /**
  * Called by the main parser upon a syntax error.  The Java original
  * located the probable end of the rule and threw an
  * IllegalArgumentException quoting it; this port only records the
  * failure by setting the parser status to U_ILLEGAL_ARGUMENT_ERROR.
  * The message and rule text are accepted but not used.
  * @param msg error description (unused)
  * @param rule pattern string (unused)
  * @param start position of first character of current rule
  * @return <code>start</code>, unchanged
  */
 int32_t TransliterationRuleParser::syntaxError(
         const char* /*msg*/,
         const UnicodeString& /*rule*/,
         int32_t start) {
     status = U_ILLEGAL_ARGUMENT_ERROR;
     return start;
 }

 /**
  * Allocate a private-use substitution character for the given set,
  * register the pair with the rule data, and return the substitution
  * character.
  *
  * This method takes over the set in every case.  If no substitution
  * characters remain, the set is deleted, the parser status is set to
  * U_ILLEGAL_ARGUMENT_ERROR, and 0 is returned.
  * @param adoptedSet the set to register; must not be used by the
  * caller afterwards
  * @return the stand-in character for the set, or 0 if the range of
  * stand-in characters is exhausted
  */
 UChar TransliterationRuleParser::registerSet(UnicodeSet* adoptedSet) {
     if (variableNext >= variableLimit) {
         // throw new RuntimeException("Private use variables exhausted");
         // The caller passes a freshly allocated set and keeps no
         // pointer to it, so it has to be released here or it leaks.
         delete adoptedSet;
         status = U_ILLEGAL_ARGUMENT_ERROR;
         return 0;
     }
     UChar c = variableNext++;
     data->defineSet(c, adoptedSet, status);
     return c;
 }

 /**
  * Determines what part of the private use region of Unicode we can use for
  * variable stand-ins.  The correct way to do this is as follows: Parse each
  * rule, and for forward and reverse rules, take the FROM expression, and
  * make a hash of all characters used.  The TO expression should be ignored.
  * When done, everything not in the hash is available for use.  In practice,
  * this method may employ some other algorithm for improved speed.
  */
 void TransliterationRuleParser::determineVariableRange(void) {
     UnicodeRange privateUse(0xE000, 0x1900); // Private use area

     UnicodeRange* r = privateUse.largestUnusedSubrange(rules);

     variableNext = variableLimit = (UChar) 0;

     if (r != 0) {
         variableNext = r->start;
         variableLimit = (UChar) (r->start + r->length);
         delete r;
     }

     if (variableNext >= variableLimit) {
         status = U_ILLEGAL_ARGUMENT_ERROR;
     }
 }

 /**
  * Finds the first character of <code>text</code>, outside quoted
  * matter, that belongs to <code>setOfChars</code>.  Text between a
  * quote character and the next quote character is skipped, so in the
  * string "abc'hide'h" a search for "h" finds only the final 'h'.  An
  * unterminated quote hides everything up to <code>limit</code>.
  * @param text text to be searched
  * @param start the beginning index, inclusive; <code>0 <= start
  * <= limit</code>.
  * @param limit the ending index, exclusive; <code>start <= limit
  * <= text.length()</code>.
  * @param setOfChars string with one or more distinct characters
  * @return Offset of the first character in <code>setOfChars</code>
  * found, or -1 if not found.
  * @see #indexOf
  */
 int32_t TransliterationRuleParser::quotedIndexOf(const UnicodeString& text,
                                                  int32_t start, int32_t limit,
                                                  const UnicodeString& setOfChars) {
     int32_t index = start;
     while (index < limit) {
         const UChar ch = text.charAt(index);
         if (ch == QUOTE) {
             // Advance to the matching close quote, or to limit if the
             // quote is never closed.
             do {
                 ++index;
             } while (index < limit && text.charAt(index) != QUOTE);
         } else if (setOfChars.indexOf(ch) >= 0) {
             return index;
         }
         ++index;
     }
     return -1;
 }
	/*
	**********************************************************************
	* Copyright (C) 1999, International Business Machines
	* Corporation and others. All Rights Reserved.
	**********************************************************************
	* Date Name Description
	* 11/17/99 aliu Creation.
	**********************************************************************
	*/
	#include "rbt_pars.h"
	#include "unicode/rbt.h"
	#include "rbt_rule.h"
	#include "unirange.h"
	#include "rbt_data.h"
	#include "unicode/uniset.h"
	#include "cstring.h"
	#include "unicode/parsepos.h"

	// Operators.  A rule has exactly one of these between its left and right
	// sides; parseRule() rejects a second unquoted operator character.
	const UChar TransliterationRuleParser::VARIABLE_DEF_OP = '=';
	const UChar TransliterationRuleParser::FORWARD_RULE_OP = '>';
	const UChar TransliterationRuleParser::REVERSE_RULE_OP = '<';
	const UChar TransliterationRuleParser::FWDREV_RULE_OP = '~'; // internal rep of <> op
	// The characters that can start an operator.  The two-character "<>" form
	// is recognized in parseRule() as '<' immediately followed by '>'.
	const UnicodeString TransliterationRuleParser::OPERATORS = UNICODE_STRING("=><", 3);

	// Other special characters
	const UChar TransliterationRuleParser::QUOTE = '\'';
	const UChar TransliterationRuleParser::ESCAPE = '\\';
	const UChar TransliterationRuleParser::END_OF_RULE = ';';
	const UChar TransliterationRuleParser::RULE_COMMENT_CHAR = '#';

	// Delimiters recognized inside a rule: {name} variable references,
	// ( ) context markers, [ ] sets, and the cursor marker.
	const UChar TransliterationRuleParser::VARIABLE_REF_OPEN = '{';
	const UChar TransliterationRuleParser::VARIABLE_REF_CLOSE = '}';
	const UChar TransliterationRuleParser::CONTEXT_OPEN = '(';
	const UChar TransliterationRuleParser::CONTEXT_CLOSE = ')';
	const UChar TransliterationRuleParser::SET_OPEN = '[';
	const UChar TransliterationRuleParser::SET_CLOSE = ']';
	// Plain '|'; a backslash before it is not a standard escape sequence.
	const UChar TransliterationRuleParser::CURSOR_POS = '|';


	/**
	 * Convenience entry point: build a temporary parser over
	 * <code>rules</code>, run it, and hand back the rule data it produced.
	 * @param rules the rule source text
	 * @param direction the direction whose rules should be retained
	 * @return the parsed rule data, or 0 if parsing failed.  The temporary
	 * parser does not delete the data on success, so the caller presumably
	 * takes ownership -- confirm against the class declaration.
	 */
	TransliterationRuleData*
	TransliterationRuleParser::parse(const UnicodeString& rules,
	                                 RuleBasedTransliterator::Direction direction) {
	    TransliterationRuleParser worker(rules, direction);
	    worker.parseRules();

	    if (U_SUCCESS(worker.status)) {
	        return worker.data;
	    }

	    // Parsing failed: discard whatever was built and report no result.
	    delete worker.data;
	    worker.data = 0;
	    return 0;
	}

	/**
	 * Construct a parser over the given rule text.  Nothing is parsed here:
	 * the constructor only records its arguments and starts with no rule
	 * data.  Parsing, and any resulting error status, happens in
	 * parseRules().
	 * @param theRules the rule source text to be parsed
	 * @param theDirection the direction whose rules should be retained
	 */
	TransliterationRuleParser::TransliterationRuleParser(
	        const UnicodeString& theRules,
	        RuleBasedTransliterator::Direction theDirection)
	    : rules(theRules),
	      direction(theDirection),
	      data(0)
	{
	}

	/**
	* Parse the given string as a sequence of rules, separated by newline
	* characters ('\n'), and cause this object to implement those rules. Any
	* previous rules are discarded. Typically this method is called exactly
	* once, during construction.
	* @exception IllegalArgumentException if there is a syntax error in the
	* rules
	*/
	void TransliterationRuleParser::parseRules(void) {
	status = U_ZERO_ERROR;

	delete data;
	data = new TransliterationRuleData(status);
	if (U_FAILURE(status)) {
	return;
	}

	determineVariableRange();

	int32_t pos = 0;
	int32_t limit = rules.length();
	while (pos < limit && U_SUCCESS(status)) {
	UChar c = rules.charAt(pos++);
	if (Unicode::isWhitespace(c)) {
	// Ignore leading whitespace. Note that this is not
	// Unicode spaces, but Java spaces -- a subset,
	// representing whitespace likely to be seen in code.
	continue;
	}
	// Skip lines starting with the comment character
	if (c == RULE_COMMENT_CHAR) {
	pos = rules.indexOf("\n", pos) + 1;
	if (pos == 0) {
	break; // No "\n" found; rest of rule is a commnet
	}
	continue; // Either fall out or restart with next line
	}
	// We've found the start of a rule. c is its first
	// character, and pos points past c. Lexically parse the
	// rule into component pieces.
	pos = parseRule(--pos, limit);
	}

	// Index the rules
	if (U_SUCCESS(status)) {
	data->ruleSet.freeze(*data, status);
	}
	}

	/**
	 * MAIN PARSER.  Parse the next rule in the given rule string, starting
	 * at pos.  Return the index after the last character parsed.  Do not
	 * parse characters at or after limit.
	 *
	 * Important:  The character at pos must be a non-whitespace character
	 * that is not the comment character.
	 *
	 * This method handles quoting, escaping, and whitespace removal.  It
	 * parses the end-of-rule character.  It recognizes context and cursor
	 * indicators.  Once it does a lexical breakdown of the rule at pos, it
	 * creates a rule object and adds it to our rule list.
	 *
	 * On any syntax error the parser status is set to
	 * U_ILLEGAL_ARGUMENT_ERROR (through syntaxError()) and the start of the
	 * rule is returned instead of an advanced position.
	 *
	 * @param pos index in <code>rules</code> of the first character of the
	 * rule
	 * @param limit index at which parsing must stop (exclusive)
	 * @return the index just past the last character consumed, or the
	 * original <code>pos</code> if a syntax error was detected
	 */
	int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
	    // Locate the left side, operator, and right side
	    int32_t start = pos;
	    UChar op = 0; // 0 until an operator has been seen

	    // Text of the side currently being parsed, with quotes, escapes and
	    // whitespace resolved and markers removed.  The marker positions
	    // below are offsets into buf; -1 means "not seen".
	    UnicodeString buf;
	    int32_t cursor = -1; // position of cursor in buf
	    int32_t ante = -1; // position of ante context marker ')' in buf
	    int32_t post = -1; // position of post context marker '(' in buf
	    int32_t postClose = -1; // position of post context close ')' in buf

	    // Assigned to buf and its adjuncts after the LHS has been
	    // parsed.  Thereafter, buf etc. refer to the RHS.
	    UnicodeString left;
	    int32_t leftCursor = -1, leftAnte = -1, leftPost = -1, leftPostClose = -1;

	    // Reusable temporary for substrings extracted from the rule text.
	    UnicodeString scratch;

	    while (pos < limit) {
	        UChar c = rules.charAt(pos++);
	        if (Unicode::isWhitespace(c)) {
	            // Ignore whitespace.  Note that this is not Unicode
	            // spaces, but Java spaces -- a subset, representing
	            // whitespace likely to be seen in code.
	            continue;
	        }
	        // Handle escapes
	        if (c == ESCAPE) {
	            if (pos == limit) {
	                return syntaxError("Trailing backslash", rules, start);
	            }
	            // Parse \uXXXX escapes
	            c = rules.charAt(pos++);
	            if (c == 'u') {
	                // Exactly four hex digits must follow the 'u'.
	                if ((pos+4) > limit) {
	                    return syntaxError("Malformed Unicode escape", rules, start);
	                }
	                c = (UChar)0x0000;
	                for (int32_t plim=pos+4; pos<plim; ++pos) { // [sic]
	                    int32_t digit = Unicode::digit(rules.charAt(pos), 16);
	                    if (digit<0) {
	                        return syntaxError("Malformed Unicode escape", rules, start);
	                    }
	                    c = (UChar) ((c << 4) | digit);
	                }
	            }

	            // Any other escaped character is taken literally, so the
	            // special-character handling below is bypassed.
	            buf.append(c);
	            continue;
	        }
	        // Handle quoted matter
	        if (c == QUOTE) {
	            int32_t iq = rules.indexOf(QUOTE, pos);
	            if (iq == pos) {
	                buf.append(c); // Parse [''] outside quotes as [']
	                ++pos;
	            } else {
	                /* This loop picks up a segment of quoted text of the
	                 * form 'aaaa' each time through.  If this segment
	                 * hasn't really ended ('aaaa''bbbb') then it keeps
	                 * looping, each time adding on a new segment.  When it
	                 * reaches the final quote it breaks.
	                 */
	                for (;;) {
	                    if (iq < 0) {
	                        return syntaxError("Unterminated quote", rules, start);
	                    }
	                    scratch.truncate(0);
	                    rules.extractBetween(pos, iq, scratch);
	                    buf.append(scratch);
	                    pos = iq+1;
	                    if (pos < limit && rules.charAt(pos) == QUOTE) {
	                        // Parse [''] inside quotes as [']
	                        // pos is left on the second quote of the pair,
	                        // so the next extraction starts with a literal
	                        // quote character.
	                        iq = rules.indexOf(QUOTE, pos+1);
	                        // Continue looping
	                    } else {
	                        break;
	                    }
	                }
	            }
	            continue;
	        }
	        if (OPERATORS.indexOf(c) >= 0) {
	            // Only one unquoted operator is allowed per rule.
	            if (op != 0) {
	                return syntaxError("Unquoted special", rules, start);
	            }
	            // Found an operator char.  Check for forward-reverse operator.
	            if (c == REVERSE_RULE_OP &&
	                (pos < limit && rules.charAt(pos) == FORWARD_RULE_OP)) {
	                ++pos;
	                op = FWDREV_RULE_OP;
	            } else {
	                op = c;
	            }
	            // Everything accumulated so far is the left-hand side; save
	            // it and reset the working state for the right-hand side.
	            left = buf; // lhs
	            leftCursor = cursor;
	            leftAnte = ante;
	            leftPost = post;
	            leftPostClose = postClose;

	            buf.truncate(0);
	            cursor = ante = post = postClose = -1;
	            continue;
	        }
	        if (c == END_OF_RULE) {
	            // pos already points past the terminator.
	            break;
	        }
	        switch (c) {
	        case VARIABLE_REF_OPEN:
	            {
	                // {name}: replace the reference with the single
	                // character the variable was defined as.
	                int32_t j = rules.indexOf(VARIABLE_REF_CLOSE, pos);
	                if (pos == j || j < 0) { // empty or unterminated
	                    return syntaxError("Malformed variable reference", rules, start);
	                }
	                scratch.truncate(0);
	                rules.extractBetween(pos, j, scratch);
	                pos = j+1;
	                UChar v = data->lookupVariable(scratch, status);
	                if (U_FAILURE(status)) {
	                    return syntaxError("Undefined variable", rules, start);
	                }
	                buf.append(v);
	            }
	            break;
	        case CONTEXT_OPEN:
	            if (post >= 0) {
	                return syntaxError("Multiple post contexts", rules, start);
	            }
	            // Ignore CONTEXT_OPEN if buffer length is zero -- that means
	            // this is the optional opening delimiter for the ante context.
	            if (buf.length() > 0) {
	                post = buf.length();
	            }
	            break;
	        case CONTEXT_CLOSE:
	            if (postClose >= 0) {
	                return syntaxError("Unexpected ')'", rules, start);
	            }
	            if (post >= 0) {
	                // This is probably the optional closing delimiter
	                // for the post context; save the pos and check later.
	                postClose = buf.length();
	            } else if (ante >= 0) {
	                return syntaxError("Multiple ante contexts", rules, start);
	            } else {
	                ante = buf.length();
	            }
	            break;
	        case SET_OPEN: {
	            // The UnicodeSet constructor parses the bracketed pattern
	            // itself and advances pp; the set is then represented in
	            // buf by the stand-in character registerSet() returns.
	            ParsePosition pp(pos-1); // Backup to opening '['
	            buf.append(registerSet(new UnicodeSet(rules, pp, data, status)));
	            if (U_FAILURE(status)) {
	                return syntaxError("Invalid set", rules, start);
	            }
	            pos = pp.getIndex(); }
	            break;
	        case VARIABLE_REF_CLOSE:
	        case SET_CLOSE:
	            // A closing delimiter with no matching opener.
	            return syntaxError("Unquoted special", rules, start);
	        case CURSOR_POS:
	            if (cursor >= 0) {
	                return syntaxError("Multiple cursors", rules, start);
	            }
	            cursor = buf.length();
	            break;
	        default:
	            buf.append(c);
	            break;
	        }
	    }
	    if (op == 0) {
	        return syntaxError("No operator", rules, start);
	    }

	    // Check context close parameters
	    // A post-context ')' must be the last thing on its side.
	    // NOTE(review): the message says ']' although the check concerns
	    // ')'; syntaxError() currently discards the message.
	    if ((leftPostClose >= 0 && leftPostClose != left.length()) ||
	        (postClose >= 0 && postClose != buf.length())) {
	        return syntaxError("Extra text after ]", rules, start);
	    }

	    // Context is only allowed on the input side; that is, the left side
	    // for forward rules.  Cursors are only allowed on the output side;
	    // that is, the right side for forward rules.  Bidirectional rules
	    // ignore elements that do not apply.

	    switch (op) {
	    case VARIABLE_DEF_OP:
	        // LHS is the name.  RHS is a single character, either a literal
	        // or a set (already parsed).  If RHS is longer than one
	        // character, it is either a multi-character string, or multiple
	        // sets, or a mixture of chars and sets -- syntax error.
	        if (buf.length() != 1) {
	            return syntaxError("Malformed RHS", rules, start);
	        }
	        if (data->isVariableDefined(left)) {
	            return syntaxError("Duplicate definition", rules, start);
	        }
	        data->defineVariable(left, buf.charAt(0), status);
	        break;

	    case FORWARD_RULE_OP:
	        if (direction == RuleBasedTransliterator::FORWARD) {
	            // Input is the left side, output is the right side.
	            if (ante >= 0 || post >= 0 || leftCursor >= 0) {
	                return syntaxError("Malformed rule", rules, start);
	            }
	            data->ruleSet.addRule(new TransliterationRule(
	                                     left, leftAnte, leftPost,
	                                     buf, cursor, status), status);
	        } // otherwise ignore the rule; it's not the direction we want
	        break;

	    case REVERSE_RULE_OP:
	        if (direction == RuleBasedTransliterator::REVERSE) {
	            // Input is the right side, output is the left side.
	            if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) {
	                return syntaxError("Malformed rule", rules, start);
	            }
	            data->ruleSet.addRule(new TransliterationRule(
	                                     buf, ante, post,
	                                     left, leftCursor, status), status);
	        } // otherwise ignore the rule; it's not the direction we want
	        break;

	    case FWDREV_RULE_OP:
	        // NOTE(review): in both branches the cursor offset was measured
	        // before the context was trimmed from the output side and is
	        // passed on unadjusted -- confirm that is intended.
	        if (direction == RuleBasedTransliterator::FORWARD) {
	            // The output side is the right; trim off any context
	            if (post >= 0) {
	                buf.remove(post);
	            }
	            if (ante >= 0) {
	                buf.removeBetween(0, ante);
	            }
	            data->ruleSet.addRule(new TransliterationRule(
	                                     left, leftAnte, leftPost,
	                                     buf, cursor, status), status);
	        } else {
	            // The output side is the left; trim off any context
	            if (leftPost >= 0) {
	                left.remove(leftPost);
	            }
	            if (leftAnte >= 0) {
	                left.removeBetween(0, leftAnte);
	            }
	            data->ruleSet.addRule(new TransliterationRule(
	                                     buf, ante, post,
	                                     left, leftCursor, status), status);
	        }
	        break;
	    }

	    return pos;
	}

	/**
	 * Called by the main parser upon a syntax error.  The Java original
	 * located the probable end of the rule and threw an
	 * IllegalArgumentException quoting it; this port only records the
	 * failure by setting the parser status to U_ILLEGAL_ARGUMENT_ERROR.
	 * The message and rule text are accepted but not used, which is why
	 * their parameter names are commented out.
	 * @param msg error description (unused)
	 * @param rule pattern string (unused)
	 * @param start position of first character of current rule
	 * @return <code>start</code>, unchanged
	 */
	int32_t TransliterationRuleParser::syntaxError(const char* /*msg*/,
	                                               const UnicodeString& /*rule*/,
	                                               int32_t start) {
	    status = U_ILLEGAL_ARGUMENT_ERROR;
	    return start;
	}

	/**
	 * Allocate a private-use substitution character for the given set,
	 * register the pair with the rule data, and return the substitution
	 * character.
	 *
	 * This method takes over the set in every case.  If no substitution
	 * characters remain, the set is deleted, the parser status is set to
	 * U_ILLEGAL_ARGUMENT_ERROR, and 0 is returned.
	 * @param adoptedSet the set to register; must not be used by the
	 * caller afterwards
	 * @return the stand-in character for the set, or 0 if the range of
	 * stand-in characters is exhausted
	 */
	UChar TransliterationRuleParser::registerSet(UnicodeSet* adoptedSet) {
	    if (variableNext >= variableLimit) {
	        // throw new RuntimeException("Private use variables exhausted");
	        // The caller passes a freshly allocated set and keeps no
	        // pointer to it, so it has to be released here or it leaks.
	        delete adoptedSet;
	        status = U_ILLEGAL_ARGUMENT_ERROR;
	        return 0;
	    }
	    UChar c = variableNext++;
	    data->defineSet(c, adoptedSet, status);
	    return c;
	}

	/**
	* Determines what part of the private use region of Unicode we can use for
	* variable stand-ins. The correct way to do this is as follows: Parse each
	* rule, and for forward and reverse rules, take the FROM expression, and
	* make a hash of all characters used. The TO expression should be ignored.
	* When done, everything not in the hash is available for use. In practice,
	* this method may employ some other algorithm for improved speed.
	*/
	void TransliterationRuleParser::determineVariableRange(void) {
	UnicodeRange privateUse(0xE000, 0x1900); // Private use area

	UnicodeRange* r = privateUse.largestUnusedSubrange(rules);

	variableNext = variableLimit = (UChar) 0;

	if (r != 0) {
	variableNext = r->start;
	variableLimit = (UChar) (r->start + r->length);
	delete r;
	}

	if (variableNext >= variableLimit) {
	status = U_ILLEGAL_ARGUMENT_ERROR;
	}
	}

	/**
	 * Finds the first character of <code>text</code>, outside quoted
	 * matter, that belongs to <code>setOfChars</code>.  Text between a
	 * quote character and the next quote character is skipped, so in the
	 * string "abc'hide'h" a search for "h" finds only the final 'h'.  An
	 * unterminated quote hides everything up to <code>limit</code>.
	 * @param text text to be searched
	 * @param start the beginning index, inclusive; <code>0 <= start
	 * <= limit</code>.
	 * @param limit the ending index, exclusive; <code>start <= limit
	 * <= text.length()</code>.
	 * @param setOfChars string with one or more distinct characters
	 * @return Offset of the first character in <code>setOfChars</code>
	 * found, or -1 if not found.
	 * @see #indexOf
	 */
	int32_t TransliterationRuleParser::quotedIndexOf(const UnicodeString& text,
	                                                 int32_t start, int32_t limit,
	                                                 const UnicodeString& setOfChars) {
	    int32_t index = start;
	    while (index < limit) {
	        const UChar ch = text.charAt(index);
	        if (ch == QUOTE) {
	            // Advance to the matching close quote, or to limit if the
	            // quote is never closed.
	            do {
	                ++index;
	            } while (index < limit && text.charAt(index) != QUOTE);
	        } else if (setOfChars.indexOf(ch) >= 0) {
	            return index;
	        }
	        ++index;
	    }
	    return -1;
	}