blob: a459696fa0ede72aa1f8cb35c00d746cc74546c0 [file] [log] [blame]
/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#include "rbt_pars.h"
#include "unicode/rbt.h"
#include "rbt_rule.h"
#include "unirange.h"
#include "rbt_data.h"
#include "unicode/uniset.h"
#include "cstring.h"
#include "unicode/parsepos.h"
#include "symtable.h"
#include "unicode/parseerr.h"
// Operators
// Each rule has the shape "LHS <op> RHS ;".  The single-character
// operators below are recognized directly in the rule text.  The
// bidirectional operator is written "<>" in the rules; parseRule()
// detects it by looking one character past '<' and records it
// internally as FWDREV_RULE_OP.
const UChar TransliterationRuleParser::VARIABLE_DEF_OP = 0x003D/*=*/;
const UChar TransliterationRuleParser::FORWARD_RULE_OP = 0x003E/*>*/;
const UChar TransliterationRuleParser::REVERSE_RULE_OP = 0x003C/*<*/;
const UChar TransliterationRuleParser::FWDREV_RULE_OP = 0x007E/*~*/; // internal rep of <> op
// Characters that can start an operator.  FWDREV_RULE_OP is
// deliberately absent: it never appears literally as an operator.
const UnicodeString TransliterationRuleParser::OPERATORS = UNICODE_STRING("=><", 3);
// Other special characters
// QUOTE delimits literal text, ESCAPE introduces a single escaped
// character or a \uXXXX sequence, END_OF_RULE terminates a rule, and
// RULE_COMMENT_CHAR starts a comment that runs to the end of the line.
const UChar TransliterationRuleParser::QUOTE = 0x0027/*'*/;
const UChar TransliterationRuleParser::ESCAPE = 0x005C/*\*/;
const UChar TransliterationRuleParser::END_OF_RULE = 0x003B/*;*/;
const UChar TransliterationRuleParser::RULE_COMMENT_CHAR = 0x0023/*#*/;
// "{name}" references a previously defined variable.
const UChar TransliterationRuleParser::VARIABLE_REF_OPEN = 0x007B/*{*/;
const UChar TransliterationRuleParser::VARIABLE_REF_CLOSE = 0x007D/*}*/;
// Parentheses delimit the ante and post context of a rule side.
const UChar TransliterationRuleParser::CONTEXT_OPEN = 0x0028/*(*/;
const UChar TransliterationRuleParser::CONTEXT_CLOSE = 0x0029/*)*/;
// "[...]" is an inline UnicodeSet pattern.
const UChar TransliterationRuleParser::SET_OPEN = 0x005B/*[*/;
const UChar TransliterationRuleParser::SET_CLOSE = 0x005D/*]*/;
// Marks the cursor position on the output side of a rule.
const UChar TransliterationRuleParser::CURSOR_POS = 0x007C/*|*/;
//----------------------------------------------------------------------
// BEGIN ParseData
//----------------------------------------------------------------------
/**
* This class implements the SymbolTable interface. It is used
* during parsing to give UnicodeSet access to variables that
* have been defined so far. Note that it uses setVariablesVector,
* _not_ data.setVariables: while parsing is in progress the sets
* live in the parser's vector, and they are only copied into
* data.setVariables at the end of parseRules().
*
* Both pointers are aliases; this object owns neither of them.
*/
class ParseData : public SymbolTable {
public:
const TransliterationRuleData* data; // alias, not owned; assigned by parseRules()
const UVector* setVariablesVector; // alias, not owned; sets registered so far
/**
* Construct a symbol table view over the given rule data and set
* vector. Either argument may be omitted, in which case the
* corresponding member is null until it is assigned later.
*/
ParseData(const TransliterationRuleData* data = 0,
const UVector* setVariablesVector = 0);
/**
* Lookup the object associated with this string and return it.
* Return U_ILLEGAL_ARGUMENT_ERROR status if the name does not
* exist. Return a non-NULL set if the name is mapped to a set;
* otherwise return a NULL set.
* @param name variable name to look up
* @param c receives the character the variable is mapped to
* @param set receives the set the variable stands for, if any
* @param status receives the error status of the lookup
*/
virtual void lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,
UErrorCode& status) const;
};
/**
 * Store the two aliased pointers.  Neither object is copied or adopted.
 */
ParseData::ParseData(const TransliterationRuleData* ruleData,
                     const UVector* registeredSets) {
    data = ruleData;
    setVariablesVector = registeredSets;
}
/**
* Implement SymbolTable API. Lookup a variable, returning
* either a Character, a UnicodeSet, or null.
*/
void ParseData::lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,
UErrorCode& status) const {
c = data->lookupVariable(name, status);
if (U_SUCCESS(status)) {
int32_t i = c - data->setVariablesBase;
set = (i < setVariablesVector->size()) ?
(UnicodeSet*) setVariablesVector->elementAt(i) : 0;
}
}
//----------------------------------------------------------------------
// END ParseData
//----------------------------------------------------------------------
/**
 * Parse a complete rule string in one step.
 *
 * @param rules      the rule text
 * @param direction  which direction's rules to keep
 * @param parseError if non-null, receives details of a syntax error
 * @return the parsed rule data, or 0 if parsing failed.  The
 * temporary parser's destructor does not delete the data it hands
 * back here.
 */
TransliterationRuleData*
TransliterationRuleParser::parse(const UnicodeString& rules,
                                 RuleBasedTransliterator::Direction direction,
                                 ParseError* parseError) {
    TransliterationRuleParser parser(rules, direction, parseError);
    parser.parseRules();

    if (U_SUCCESS(parser.status)) {
        return parser.data;
    }

    // Parsing failed: discard the partially built data.
    delete parser.data;
    parser.data = 0;
    return 0;
}
/**
 * Construct a parser for the given rules.  No parsing happens here;
 * call parseRules() to do the work.  Syntax errors are reported
 * through the status member and the optional ParseError, not by
 * throwing.
 *
 * @param theRules      the rule text to parse
 * @param theDirection  which direction's rules to keep
 * @param theParseError if non-null, receives details of a syntax error
 */
TransliterationRuleParser::TransliterationRuleParser(
        const UnicodeString& theRules,
        RuleBasedTransliterator::Direction theDirection,
        ParseError* theParseError) :
    rules(theRules),
    direction(theDirection),
    data(0),
    parseError(theParseError)
{
    // The symbol table aliases our set vector right away; its data
    // pointer stays null until parseRules() allocates the rule data.
    ParseData* symbols = new ParseData(0, &setVariablesVector);
    parseData = symbols;
}
/**
 * Destructor.  Releases the symbol table allocated by the constructor.
 * The parsed rule data is deliberately not deleted here: parse()
 * either returns it to the caller or deletes it itself on failure.
 */
TransliterationRuleParser::~TransliterationRuleParser()
{
    delete parseData;
}
/**
* Parse the given string as a sequence of rules, separated by newline
* characters ('\n'), and cause this object to implement those rules. Any
* previous rules are discarded. Typically this method is called exactly
* once, during construction.
* @exception IllegalArgumentException if there is a syntax error in the
* rules
*/
void TransliterationRuleParser::parseRules(void) {
status = U_ZERO_ERROR;
delete data;
data = new TransliterationRuleData(status);
if (U_FAILURE(status)) {
return;
}
parseData->data = data;
setVariablesVector.removeAllElements();
if (parseError != 0) {
parseError->code = 0;
}
determineVariableRange();
int32_t pos = 0;
int32_t limit = rules.length();
while (pos < limit && U_SUCCESS(status)) {
UChar c = rules.charAt(pos++);
if (Unicode::isWhitespace(c)) {
// Ignore leading whitespace. Note that this is not
// Unicode spaces, but Java spaces -- a subset,
// representing whitespace likely to be seen in code.
continue;
}
// Skip lines starting with the comment character
if (c == RULE_COMMENT_CHAR) {
pos = rules.indexOf((UChar)0x000A /*\n*/, pos) + 1;
if (pos == 0) {
break; // No "\n" found; rest of rule is a commnet
}
continue; // Either fall out or restart with next line
}
// We've found the start of a rule. c is its first
// character, and pos points past c. Lexically parse the
// rule into component pieces.
pos = parseRule(--pos, limit);
}
// Convert the set vector to an array
data->setVariablesLength = setVariablesVector.size();
data->setVariables = new UnicodeSet*[data->setVariablesLength];
// orphanElement removes the given element and shifts all other
// elements down. For performance (and code clarity) we work from
// the end back to index 0.
for (int32_t i=data->setVariablesLength; i>0; ) {
--i;
data->setVariables[i] =
(UnicodeSet*) setVariablesVector.orphanElementAt(i);
}
// Index the rules
if (U_SUCCESS(status)) {
data->ruleSet.freeze(*data, status);
}
}
/**
* MAIN PARSER. Parse the next rule in the given rule string, starting
* at pos. Return the index after the last character parsed. Do not
* parse characters at or after limit.
*
* Important: The character at pos must be a non-whitespace character
* that is not the comment character.
*
* This method handles quoting, escaping, and whitespace removal. It
* parses the end-of-rule character. It recognizes context and cursor
* indicators. Once it does a lexical breakdown of the rule at pos, it
* creates a rule object and adds it to our rule list.
*
* The rule is scanned in a single pass. Text is accumulated in buf
* together with the positions of any cursor and context markers. When
* the operator is seen, the accumulated state is moved aside as the
* left-hand side and accumulation restarts for the right-hand side.
*
* @param pos index of the first character of the rule
* @param limit index past the last character that may be parsed
* @return the index after the rule on success. On a syntax error the
* value returned by syntaxError() is returned, which is the start of
* the offending rule, and the status member is set to a failure code.
*/
int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
// Locate the left side, operator, and right side
int32_t start = pos;
UChar op = 0; // 0 until an operator has been seen
UnicodeString buf;
int32_t cursor = -1; // position of cursor in buf
int32_t ante = -1; // position of ante context marker ')' in buf
int32_t post = -1; // position of post context marker '(' in buf
int32_t postClose = -1; // position of post context close ')' in buf
// Assigned to buf and its adjuncts after the LHS has been
// parsed. Thereafter, buf etc. refer to the RHS.
UnicodeString left;
int32_t leftCursor = -1, leftAnte = -1, leftPost = -1, leftPostClose = -1;
UnicodeString scratch; // reusable temporary for extracted substrings
while (pos < limit) {
UChar c = rules.charAt(pos++);
if (Unicode::isWhitespace(c)) {
// Ignore whitespace. Note that this is not Unicode
// spaces, but Java spaces -- a subset, representing
// whitespace likely to be seen in code.
continue;
}
// Handle escapes: a backslash makes the next character literal,
// except that "\u" introduces exactly four hex digits.
if (c == ESCAPE) {
if (pos == limit) {
return syntaxError(RuleBasedTransliterator::TRAILING_BACKSLASH, rules, start);
}
// Parse \uXXXX escapes
c = rules.charAt(pos++);
if (c == 0x0075/*u*/) {
if ((pos+4) > limit) {
return syntaxError(RuleBasedTransliterator::MALFORMED_UNICODE_ESCAPE, rules, start);
}
c = (UChar)0x0000;
// Accumulate the four hex digits, most significant first.
for (int32_t plim=pos+4; pos<plim; ++pos) { // [sic] pos itself is the loop counter
int32_t digit = Unicode::digit(rules.charAt(pos), 16);
if (digit<0) {
return syntaxError(RuleBasedTransliterator::MALFORMED_UNICODE_ESCAPE, rules, start);
}
c = (UChar) ((c << 4) | digit);
}
}
buf.append(c);
continue;
}
// Handle quoted matter
if (c == QUOTE) {
// iq is the index of the next quote at or after pos, or < 0
int32_t iq = rules.indexOf(QUOTE, pos);
if (iq == pos) {
buf.append(c); // Parse [''] outside quotes as [']
++pos;
} else {
/* This loop picks up a segment of quoted text of the
* form 'aaaa' each time through. If this segment
* hasn't really ended ('aaaa''bbbb') then it keeps
* looping, each time adding on a new segment. When it
* reaches the final quote it breaks.
*/
for (;;) {
if (iq < 0) {
return syntaxError(RuleBasedTransliterator::UNTERMINATED_QUOTE, rules, start);
}
scratch.truncate(0);
rules.extractBetween(pos, iq, scratch);
buf.append(scratch);
pos = iq+1;
if (pos < limit && rules.charAt(pos) == QUOTE) {
// Parse [''] inside quotes as [']
// pos now sits on the second quote of the pair, so the
// next extracted segment begins with a literal quote.
iq = rules.indexOf(QUOTE, pos+1);
// Continue looping
} else {
break;
}
}
}
continue;
}
if (OPERATORS.indexOf(c) >= 0) {
// Only one operator is allowed per rule.
if (op != 0) {
return syntaxError(RuleBasedTransliterator::UNQUOTED_SPECIAL, rules, start);
}
// Found an operator char. Check for forward-reverse operator.
if (c == REVERSE_RULE_OP &&
(pos < limit && rules.charAt(pos) == FORWARD_RULE_OP)) {
++pos;
op = FWDREV_RULE_OP;
} else {
op = c;
}
// Everything gathered so far is the left-hand side; save it
// and reset the accumulators for the right-hand side.
left = buf; // lhs
leftCursor = cursor;
leftAnte = ante;
leftPost = post;
leftPostClose = postClose;
buf.truncate(0);
cursor = ante = post = postClose = -1;
continue;
}
if (c == END_OF_RULE) {
break;
}
switch (c) {
case VARIABLE_REF_OPEN:
{
// "{name}": replace the reference by the variable's character.
int32_t j = rules.indexOf(VARIABLE_REF_CLOSE, pos);
if (pos == j || j < 0) { // empty or unterminated
return syntaxError(RuleBasedTransliterator::MALFORMED_VARIABLE_REFERENCE, rules, start);
}
scratch.truncate(0);
rules.extractBetween(pos, j, scratch);
pos = j+1;
UChar v = data->lookupVariable(scratch, status);
if (U_FAILURE(status)) {
return syntaxError(RuleBasedTransliterator::UNDEFINED_VARIABLE, rules, start);
}
buf.append(v);
}
break;
case CONTEXT_OPEN:
if (post >= 0) {
return syntaxError(RuleBasedTransliterator::MULTIPLE_POST_CONTEXTS, rules, start);
}
// Ignore CONTEXT_OPEN if buffer length is zero -- that means
// this is the optional opening delimiter for the ante context.
if (buf.length() > 0) {
post = buf.length();
}
break;
case CONTEXT_CLOSE:
if (postClose >= 0) {
return syntaxError(RuleBasedTransliterator::UNEXPECTED_CLOSE_CONTEXT, rules, start);
}
if (post >= 0) {
// This is probably the optional closing delimiter
// for the post context; save the pos and check later.
postClose = buf.length();
} else if (ante >= 0) {
return syntaxError(RuleBasedTransliterator::MULTIPLE_ANTE_CONTEXTS, rules, start);
} else {
// No post context is open, so this ')' ends the ante context.
ante = buf.length();
}
break;
case SET_OPEN: {
// Inline set pattern. The UnicodeSet constructor parses from the
// '[' and leaves pp after the pattern; the set is represented in
// buf by the stand-in character that registerSet() returns.
// NOTE(review): the set is registered before status is checked,
// so a set that failed to parse still ends up in the set vector.
ParsePosition pp(pos-1); // Backup to opening '['
buf.append(registerSet(new UnicodeSet(rules, pp, *parseData, status)));
if (U_FAILURE(status)) {
return syntaxError(RuleBasedTransliterator::MALFORMED_SET, rules, start);
}
pos = pp.getIndex(); }
break;
case VARIABLE_REF_CLOSE:
case SET_CLOSE:
// A closing delimiter with no matching opener.
return syntaxError(RuleBasedTransliterator::UNQUOTED_SPECIAL, rules, start);
case CURSOR_POS:
if (cursor >= 0) {
return syntaxError(RuleBasedTransliterator::MULTIPLE_CURSORS, rules, start);
}
cursor = buf.length();
break;
default:
// Ordinary character: copy it through.
buf.append(c);
break;
}
}
// From here on, left/left* describe the LHS and buf/cursor/ante/post
// describe the RHS.
if (op == 0) {
return syntaxError(RuleBasedTransliterator::MISSING_OPERATOR, rules, start);
}
// Check context close parameters: a post context close, if present,
// must be the last thing on its side of the rule.
if ((leftPostClose >= 0 && leftPostClose != left.length()) ||
(postClose >= 0 && postClose != buf.length())) {
return syntaxError(RuleBasedTransliterator::TEXT_AFTER_CLOSE_CONTEXT, rules, start);
}
// Context is only allowed on the input side; that is, the left side
// for forward rules. Cursors are only allowed on the output side;
// that is, the right side for forward rules. Bidirectional rules
// ignore elements that do not apply.
switch (op) {
case VARIABLE_DEF_OP:
// LHS is the name. RHS is a single character, either a literal
// or a set (already parsed). If RHS is longer than one
// character, it is either a multi-character string, or multiple
// sets, or a mixture of chars and sets -- syntax error.
if (buf.length() != 1) {
return syntaxError(RuleBasedTransliterator::MALFORMED_RHS, rules, start);
}
if (data->isVariableDefined(left)) {
return syntaxError(RuleBasedTransliterator::DUPLICATE_VARIABLE_DEFINITION, rules, start);
}
data->defineVariable(left, buf.charAt(0), status);
break;
case FORWARD_RULE_OP:
if (direction == RuleBasedTransliterator::FORWARD) {
// Reject context on the output (right) side and a cursor on the
// input (left) side.
if (ante >= 0 || post >= 0 || leftCursor >= 0) {
return syntaxError(RuleBasedTransliterator::MALFORMED_RULE, rules, start);
}
data->ruleSet.addRule(new TransliterationRule(
left, leftAnte, leftPost,
buf, cursor, status), status);
} // otherwise ignore the rule; it's not the direction we want
break;
case REVERSE_RULE_OP:
if (direction == RuleBasedTransliterator::REVERSE) {
// Mirror image of the forward case: the right side is the input
// and the left side is the output.
if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) {
return syntaxError(RuleBasedTransliterator::MALFORMED_RULE, rules, start);
}
data->ruleSet.addRule(new TransliterationRule(
buf, ante, post,
left, leftCursor, status), status);
} // otherwise ignore the rule; it's not the direction we want
break;
case FWDREV_RULE_OP:
// Both sides may carry context and a cursor; whichever does not
// apply to the requested direction is dropped rather than rejected.
if (direction == RuleBasedTransliterator::FORWARD) {
// The output side is the right; trim off any context
// The post context is removed first so that the ante index is
// still valid when the leading text is removed.
// NOTE(review): cursor was recorded as an offset into the
// untrimmed buf and is passed on unchanged after the ante
// context is removed -- confirm this is intended.
if (post >= 0) {
buf.remove(post);
}
if (ante >= 0) {
buf.removeBetween(0, ante);
}
data->ruleSet.addRule(new TransliterationRule(
left, leftAnte, leftPost,
buf, cursor, status), status);
} else {
// The output side is the left; trim off any context
// NOTE(review): as above, leftCursor is not adjusted after
// the ante context is removed -- confirm this is intended.
if (leftPost >= 0) {
left.remove(leftPost);
}
if (leftAnte >= 0) {
left.removeBetween(0, leftAnte);
}
data->ruleSet.addRule(new TransliterationRule(
buf, ante, post,
left, leftCursor, status), status);
}
break;
}
return pos;
}
/**
 * Called by main parser upon syntax error.  Sets the status member to
 * U_ILLEGAL_ARGUMENT_ERROR and, if the caller supplied a ParseError,
 * fills it in with the error code, the offset of the rule, and the
 * text of the rule.  The end of the rule is found by searching for
 * the end-of-rule marker outside quotes; if the marker is missing,
 * the rest of the string is reported.  In any case the rule start is
 * correctly reported.
 *
 * @param parseErrorCode code identifying the kind of syntax error
 * @param rule  the complete rule string being parsed
 * @param start position of first character of current rule
 * @return start, unchanged
 */
int32_t TransliterationRuleParser::syntaxError(int32_t parseErrorCode,
                                               const UnicodeString& rule,
                                               int32_t start) {
    status = U_ILLEGAL_ARGUMENT_ERROR;

    if (parseError == 0) {
        return start; // nobody asked for details
    }

    parseError->code = parseErrorCode;
    parseError->line = 0; // We don't return a line #
    parseError->offset = start; // Character offset from rule start

    const int32_t textLimit = rule.length();
    int32_t ruleEnd = quotedIndexOf(rule, start, textLimit, END_OF_RULE);
    if (ruleEnd < 0) {
        ruleEnd = textLimit;
    }
    rule.extractBetween(start, ruleEnd, parseError->context); // Current rule

    return start;
}
/**
 * Allocate a private-use substitution character for the given set,
 * append the set to setVariablesVector, and return the substitution
 * character.
 *
 * The set is adopted in every case.  If no stand-in characters remain,
 * the set is deleted, the status member is set to
 * U_ILLEGAL_ARGUMENT_ERROR, and 0 is returned.
 *
 * @param adoptedSet the set to register; ownership passes to this
 * parser
 * @return the stand-in character for the set, or 0 on failure
 */
UChar TransliterationRuleParser::registerSet(UnicodeSet* adoptedSet) {
    if (variableNext >= variableLimit) {
        // Private use variables exhausted.  The caller keeps no
        // pointer to the set, so it must be released here or it leaks.
        delete adoptedSet;
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    setVariablesVector.addElement(adoptedSet);
    return variableNext++;
}
/**
* Determines what part of the private use region of Unicode we can use for
* variable stand-ins. The correct way to do this is as follows: Parse each
* rule, and for forward and reverse rules, take the FROM expression, and
* make a hash of all characters used. The TO expression should be ignored.
* When done, everything not in the hash is available for use. In practice,
* this method may employ some other algorithm for improved speed.
*/
void TransliterationRuleParser::determineVariableRange(void) {
UnicodeRange privateUse(0xE000, 0x1900); // Private use area
UnicodeRange* r = privateUse.largestUnusedSubrange(rules);
data->setVariablesBase = variableNext = variableLimit = (UChar) 0;
if (r != 0) {
data->setVariablesBase = variableNext = r->start;
variableLimit = (UChar) (r->start + r->length);
delete r;
}
if (variableNext >= variableLimit) {
status = U_ILLEGAL_ARGUMENT_ERROR;
}
}
/**
 * Returns the index of a character, ignoring quoted text.
 * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
 * found by a search for 'h'.  A character that follows the escape
 * character is skipped as well.
 *
 * @param text       the string to search
 * @param start      index at which to begin searching
 * @param limit      index at which to stop searching (exclusive)
 * @param charToFind the character to look for
 * @return the index of the first unquoted, unescaped occurrence of
 * charToFind in [start, limit), or -1 if there is none
 */
int32_t TransliterationRuleParser::quotedIndexOf(const UnicodeString& text,
                                                 int32_t start, int32_t limit,
                                                 UChar charToFind) {
    int32_t i = start;
    while (i < limit) {
        const UChar c = text.charAt(i);
        if (c == ESCAPE) {
            // Step onto the escaped character; it is skipped below.
            ++i;
        } else if (c == QUOTE) {
            // Advance to the closing quote (or to limit if unterminated).
            do {
                ++i;
            } while (i < limit && text.charAt(i) != QUOTE);
        } else if (c == charToFind) {
            return i;
        }
        ++i;
    }
    return -1;
}