// blob: 9a956da6b887fb9555cbd0d267a7a962c1b0908c [file] [log] [blame]
/*
**********************************************************************
* Copyright (c) 2001, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliteratorParser.java,v $
* $Date: 2001/10/24 00:03:38 $
* $Revision: 1.7 $
**********************************************************************
*/
package com.ibm.text;
import com.ibm.text.resources.ResourceReader;
import com.ibm.util.Utility;
import java.util.Stack;
import java.util.Vector;
import java.text.ParsePosition;
class TransliteratorParser {
//----------------------------------------------------------------------
// Data members
//----------------------------------------------------------------------
/**
* PUBLIC data member containing the parsed data object, or null if
* there were no rules.
*/
public RuleBasedTransliterator.Data data;
/**
* PUBLIC data member.
* The block of ::IDs, both at the top and at the bottom.
* Inserted into these may be additional rules at the
* idSplitPoint.
*/
public String idBlock;
/**
* PUBLIC data member.
* In a compound RBT, the index at which the RBT rules are
* inserted into the ID block. Index 0 means before any IDs
* in the block. Index idBlock.length() means after all IDs
* in the block. Index is a string index.
*/
public int idSplitPoint;
/**
* PUBLIC data member containing the parsed compound filter, if any.
*/
public UnicodeSet compoundFilter;
/**
* The number of rules parsed. This tells us if there were
* any actual transliterator rules, or if there were just ::ID
* block IDs. Reset to zero at the start of parseRules() and
* incremented each time parseRule() returns normally; if it is
* still zero when parsing finishes, the public member data is
* set to null.
*/
private int ruleCount;
/**
* The direction argument given to parseRules(). It is compared
* against Transliterator.FORWARD and Transliterator.REVERSE to
* decide which rules apply and where a compound filter may
* legally appear.
*/
private int direction;
/**
* Temporary symbol table used during parsing.
*/
private ParseData parseData;
/**
* Temporary vector of set variables. When parsing is complete, this
* is copied into the array data.variables. As with data.variables,
* element 0 corresponds to character data.variablesBase.
*/
private Vector variablesVector;
/**
* The next available stand-in for variables. This starts at some point in
* the private use area (discovered dynamically) and increments up toward
* <code>variableLimit</code>. At any point during parsing, available
* variables are <code>variableNext..variableLimit-1</code>.
*/
private char variableNext;
/**
* The last available stand-in for variables. This is discovered
* dynamically. At any point during parsing, available variables are
* <code>variableNext..variableLimit-1</code>. During variable definition
* we use the special value variableLimit-1 as a placeholder.
*/
private char variableLimit;
/**
* When we encounter an undefined variable, we do not immediately signal
* an error, in case we are defining this variable, e.g., "$a = [a-z];".
* Instead, we save the name of the undefined variable, and substitute
* in the placeholder char variableLimit - 1, and decrement
* variableLimit.
*/
private String undefinedVariableName;
/**
* The stand-in character for the 'dot' set, represented by '.' in
* patterns. This is allocated the first time it is needed, and
* reused thereafter.
*/
private int dotStandIn = -1;
//----------------------------------------------------------------------
// Constants
//----------------------------------------------------------------------
// Indicator for ID blocks
private static final String ID_TOKEN = "::";
// Length of ID_TOKEN in chars; must be kept in step with the literal above
private static final int ID_TOKEN_LEN = 2;
// Operators
private static final char VARIABLE_DEF_OP = '=';
private static final char FORWARD_RULE_OP = '>';
private static final char REVERSE_RULE_OP = '<';
private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op
// Characters that terminate one side of a rule. FWDREV_RULE_OP is
// absent on purpose: parseRule() synthesizes it when it sees
// REVERSE_RULE_OP immediately followed by FORWARD_RULE_OP.
private static final String OPERATORS = "=><";
// Other special characters
// Delimits 'quoted text'; a doubled quote stands for one literal quote
private static final char QUOTE = '\'';
// Introduces a backslash escape, decoded with Utility.unescapeAt()
private static final char ESCAPE = '\\';
// Terminates a rule
private static final char END_OF_RULE = ';';
// Starts a comment that runs to the next "\n" (or to the end of the text)
private static final char RULE_COMMENT_CHAR = '#';
private static final char CONTEXT_ANTE = '{'; // ante{key
private static final char CONTEXT_POST = '}'; // key}post
// Marks the cursor position within a rule half
private static final char CURSOR_POS = '|';
// Each occurrence moves the cursor offset one step further from the
// cursor; only valid at the very start or the very end of a rule half
private static final char CURSOR_OFFSET = '@';
// Start anchor; only accepted before any other text of a rule half
private static final char ANCHOR_START = '^';
// Quantifiers: zero or more, one or more, zero or one
private static final char KLEENE_STAR = '*';
private static final char ONE_OR_MORE = '+';
private static final char ZERO_OR_ONE = '?';
// Replaced in the parsed text by the parser's 'dot' stand-in character
private static final char DOT = '.';
// NOTE(review): presumably the UnicodeSet pattern that DOT stands for;
// it is not referenced in the part of the file visible here -- confirm.
private static final String DOT_SET = "[^[:Zp:][:Zl:]\r\n$]";
// By definition, the ANCHOR_END special character is a
// trailing SymbolTable.SYMBOL_REF character.
// private static final char ANCHOR_END = '$';
// Segments of the input string are delimited by "(" and ")". In the
// output string these segments are referenced as "$1" through "$9".
private static final char SEGMENT_OPEN = '(';
private static final char SEGMENT_CLOSE = ')';
//----------------------------------------------------------------------
// class ParseData
//----------------------------------------------------------------------
/**
 * SymbolTable implementation used while a rule set is being parsed.
 * It lets UnicodeSet resolve the variables defined so far. Set
 * lookups go through the enclosing parser's variablesVector, _not_
 * data.variables, because that array is only built once parsing has
 * finished.
 */
private class ParseData implements SymbolTable {

    /**
     * Implement SymbolTable API. Returns the value stored under the
     * given name in data.variableNames, or null if there is none.
     */
    public char[] lookup(String name) {
        Object value = data.variableNames.get(name);
        return (char[]) value;
    }

    /**
     * Implement SymbolTable API. Maps a stand-in character to the
     * set registered for it during this parse, or returns null when
     * ch lies outside the range of stand-ins allocated so far.
     */
    public UnicodeSet lookupSet(int ch) {
        // data.lookupSet() cannot be used here because the set
        // array has not been constructed yet.
        int index = ch - data.variablesBase;
        if (index < 0 || index >= variablesVector.size()) {
            return null;
        }
        return (UnicodeSet) variablesVector.elementAt(index);
    }

    /**
     * Implement SymbolTable API. Parse out a symbol reference name:
     * a Unicode identifier start followed by zero or more Unicode
     * identifier part characters, beginning at pos and never reading
     * at or beyond limit. On success pos is advanced past the name
     * and the name is returned; otherwise pos is left untouched and
     * null is returned.
     */
    public String parseReference(String text, ParsePosition pos, int limit) {
        final int start = pos.getIndex();
        int end = start;
        for (; end < limit; ++end) {
            char c = text.charAt(end);
            if (!Character.isUnicodeIdentifierPart(c)) {
                break;
            }
            if (end == start && !Character.isUnicodeIdentifierStart(c)) {
                break;
            }
        }
        if (end == start) {
            // No valid name chars
            return null;
        }
        pos.setIndex(end);
        return text.substring(start, end);
    }
}
//----------------------------------------------------------------------
// classes RuleBody, RuleArray, and RuleReader
//----------------------------------------------------------------------
/**
 * A private abstract class representing the interface to rule
 * source code that is broken up into lines. Handles the folding
 * of lines terminated by a backslash. This folding is limited; it
 * does not account for comments, quotes, or escapes, so its use
 * should be limited.
 */
private static abstract class RuleBody {

    /**
     * Retrieve the next line of the source, or return null if there
     * is none. A line whose last character is a backslash is joined
     * with the line that follows it (the backslash itself is
     * dropped), repeatedly, without regard for comments, quotes, or
     * escapes. If the source ends in the middle of a fold, whatever
     * has been accumulated so far is returned.
     */
    String nextLine() {
        String line = handleNextLine();
        if (!endsWithBackslash(line)) {
            return line;
        }
        StringBuffer folded = new StringBuffer(line);
        for (;;) {
            // Drop the trailing backslash before gluing on the next line
            folded.setLength(folded.length() - 1);
            line = handleNextLine();
            if (line == null) {
                break;
            }
            folded.append(line);
            if (!endsWithBackslash(line)) {
                break;
            }
        }
        return folded.toString();
    }

    /**
     * Return true if s is non-null, non-empty and ends in a backslash.
     */
    private static boolean endsWithBackslash(String s) {
        return s != null
            && s.length() > 0
            && s.charAt(s.length() - 1) == '\\';
    }

    /**
     * Reset to the first line of the source.
     */
    abstract void reset();

    /**
     * Subclass method to return the next line of the source, or
     * null when the source is exhausted.
     */
    abstract String handleNextLine();
}
/**
 * RuleBody subclass that serves lines from a String[] array, one
 * element per line, in index order.
 */
private static class RuleArray extends RuleBody {
    String[] array;
    int i; // index of the next element to hand out

    public RuleArray(String[] array) {
        this.array = array;
        this.i = 0;
    }

    /**
     * Return the next array element, or null once every element has
     * been returned.
     */
    public String handleNextLine() {
        if (i >= array.length) {
            return null;
        }
        String line = array[i];
        ++i;
        return line;
    }

    /**
     * Start over from the first array element.
     */
    public void reset() {
        i = 0;
    }
}
/**
 * RuleBody subclass that serves lines from a ResourceReader.
 */
private static class RuleReader extends RuleBody {
    ResourceReader reader;

    public RuleReader(ResourceReader reader) {
        this.reader = reader;
    }

    /**
     * Return the next line from the reader. An IOException raised by
     * the reader is treated the same as end of input: null is
     * returned.
     */
    public String handleNextLine() {
        String line = null;
        try {
            line = reader.readLine();
        } catch (java.io.IOException e) {
            // NOTE(review): the failure is dropped silently, so a read
            // error is indistinguishable from a shorter rule file.
        }
        return line;
    }

    /**
     * Rewind the underlying reader.
     */
    public void reset() {
        reader.reset();
    }
}
//----------------------------------------------------------------------
// class Segments
//----------------------------------------------------------------------
/**
 * Segments are parentheses-enclosed regions of the input string.
 * These are referenced in the output string using the notation $1,
 * $2, etc. Numbering is in order of appearance of the left
 * parenthesis. Number is one-based. Segments are defined as start,
 * limit pairs. Segments may nest.
 *
 * During parsing, segment data is encoded in an object of class
 * Segments. At runtime, the same data is encoded in compact form as
 * an array of integers in a TransliterationRule. The runtime encoding
 * must satisfy three goals:
 *
 * 1. Iterate over the offsets in a pattern, from left to right,
 * and indicate all segment boundaries, in order. This is done
 * during matching.
 *
 * 2. Given a reference $n, produce the start and limit offsets
 * for that segment. This is done during replacement.
 *
 * 3. Similar to goal 1, but in addition, indicate whether each
 * segment boundary is a start or a limit, in other words, whether
 * each is an open paren or a close paren. This is required by
 * the toRule() method.
 *
 * Goal 1 must be satisfied at high speed since this is done during
 * matching. Goal 2 is next most important. Goal 3 is not performance
 * critical since it is only needed by toRule().
 *
 * The array of integers is actually two arrays concatenated. The
 * first gives the index values of the open and close parentheses in
 * the order they appear. The second maps segment numbers to the
 * indices of the first array. The two arrays have the same length.
 * Iterating over the first array satisfies goal 1. Indexing into the
 * second array satisfies goal 2. Goal 3 is satisfied by iterating
 * over the second array and constructing the required data when
 * needed. This is what toRule() does.
 *
 * Example: (a b(c d)e f)
 *          0 1 2 3 4 5 6
 *
 * First array: Indices are 0, 2, 4, and 6.
 * Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the
 * second array is 0, 3, 1 2 -- these give the indices in the
 * first array at which $1:open, $1:close, $2:open, and $2:close
 * occur.
 *
 * The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1
 *
 * Each subarray is terminated with a -1, and two leading entries
 * give the number of segments and the offset to the first entry
 * of the second array. In addition, the second array values are
 * all offset by 2 so they index directly into the final array.
 * The total array size is 4*segments[0] + 4. The second index is
 * 2*segments[0] + 3.
 *
 * In the output string, a segment reference is indicated by a
 * character in a special range, as defined by
 * RuleBasedTransliterator.Data.
 *
 * Most rules have no segments, in which case segments is null, and the
 * output string need not be checked for segment reference characters.
 *
 * See also rbt_rule.h/cpp.
 */
private static class Segments {

    private Vector offsets;     // holds Integer objects
    private Vector isOpenParen; // holds Boolean objects

    // Offset recorded for the i-th parenthesis
    private int offset(int i) {
        return ((Integer) offsets.elementAt(i)).intValue();
    }

    // True if the i-th parenthesis is an open paren
    private boolean isOpen(int i) {
        return ((Boolean) isOpenParen.elementAt(i)).booleanValue();
    }

    // size of the Vectors
    private int size() {
        // assert(offsets.size() == isOpenParen.size());
        return offsets.size();
    }

    public Segments() {
        offsets = new Vector();
        isOpenParen = new Vector();
    }

    /**
     * Record a parenthesis seen at the given offset in the parsed
     * text. Parentheses must be added in order of appearance.
     */
    public void addParenthesisAt(int offset, boolean isOpen) {
        offsets.addElement(new Integer(offset));
        isOpenParen.addElement(new Boolean(isOpen));
    }

    /**
     * Return the offset of the most recently added parenthesis, or
     * -1 if none has been added. When a parenthesis exists,
     * isOpenParen[0] is set to whether it is an open paren;
     * otherwise the array is left untouched.
     */
    public int getLastParenOffset(boolean[] isOpenParen) {
        if (size() == 0) {
            return -1;
        }
        isOpenParen[0] = isOpen(size()-1);
        return offset(size()-1);
    }

    /**
     * Remove the last (rightmost) segment. Store its offsets in
     * start[0] and limit[0], and then collapse every parenthesis from
     * its open paren onward so that the segment covers exactly
     * [start, start+1): open parens move to start, close parens to
     * start+1.
     *
     * The caller is expected to have called getLastParenOffset() and
     * validated that there is at least one parenthesis and that the
     * last one is a close paren.
     *
     * @return false if there are no parentheses at all or if the
     * last close paren has no matching open paren; true otherwise
     */
    public boolean extractLastParenSubstring(int[] start, int[] limit) {
        if (size() == 0) {
            return false;
        }
        int i = size() - 1;
        int n = 1; // count of close parens we still need to match
        // Record position of the last close paren
        limit[0] = offset(i);
        // Scan leftward from the paren before the last one. Stop ON the
        // open paren that brings the count to zero so that i indexes it.
        // (The index must move on every iteration; without that, nested
        // segments never terminate the scan.)
        for (--i; i >= 0; --i) {
            n += isOpen(i) ? -1 : 1;
            if (n == 0) {
                break;
            }
        }
        if (n != 0) {
            return false;
        }
        // Here i >= 0 and isOpen(i) is true.
        start[0] = offset(i);
        // Reset all segment pairs from i to size() - 1 to [start, start+1).
        while (i < size()) {
            int o = isOpen(i) ? start[0] : (start[0]+1);
            offsets.setElementAt(new Integer(o), i);
            ++i;
        }
        return true;
    }

    /**
     * Build the compact int[] encoding described in the class
     * comment. Assume caller has already gotten a TRUE validate().
     */
    public int[] createArray() {
        int c = count(); // number of segments
        int arrayLen = 4*c + 4;
        int[] array = new int[arrayLen];
        int a2offset = 2*c + 3; // offset to array 2
        array[0] = c;
        array[1] = a2offset;
        int i;
        for (i=0; i<2*c; ++i) {
            array[2+i] = offset(i);
        }
        array[a2offset-1] = -1;
        array[arrayLen-1] = -1;
        // Now walk through and match up segment numbers with parentheses.
        // Number segments from 0. We're going to offset all entries by 2
        // to skip the first two elements, array[0] and array[1].
        Stack stack = new Stack();
        int nextOpen = 0; // seg # of next open, 0-based
        for (i=0; i<2*c; ++i) {
            boolean open = isOpen(i);
            // Let seg be the zero-based segment number.
            // Open parens are at 2*seg in array 2.
            // Close parens are at 2*seg+1 in array 2.
            if (open) {
                array[a2offset + 2*nextOpen] = 2+i;
                stack.push(new Integer(nextOpen));
                ++nextOpen;
            } else {
                int nextClose = ((Integer) stack.pop()).intValue();
                array[a2offset + 2*nextClose+1] = 2+i;
            }
        }
        // assert(stack.empty());
        return array;
    }

    /**
     * Check that the recorded parentheses form well-nested pairs:
     * at least two parens, an even number of them, the first one
     * open, never more closes than opens at any point, and
     * everything closed at the end.
     */
    public boolean validate() {
        if ((size() < 2) || (size() % 2 != 0) || !isOpen(0)) {
            return false;
        }
        int n = 0;
        for (int i=0; i<size(); ++i) {
            n += isOpen(i) ? 1 : -1;
            if (n < 0) {
                return false;
            }
        }
        return n == 0;
    }

    /**
     * Number of segments.
     * Assume caller has already gotten a TRUE validate().
     */
    public int count() {
        // assert(validate());
        return size() / 2;
    }
}
//----------------------------------------------------------------------
// class RuleHalf
//----------------------------------------------------------------------
/**
* A class representing one side of a rule. This class knows how to
* parse half of a rule. It is tightly coupled to the method
* TransliteratorParser.parseRule().
*/
private static class RuleHalf {
public String text;
public int cursor = -1; // position of cursor in text
public int ante = -1; // position of ante context marker '{' in text
public int post = -1; // position of post context marker '}' in text
// Record the position of the segment substrings and references. A
// given side should have segments or segment references, but not
// both.
public Segments segments = null;
public int maxRef = -1; // index of largest ref (1..9)
// Record the offset to the cursor either to the left or to the
// right of the key. This is indicated by characters on the output
// side that allow the cursor to be positioned arbitrarily within
// the matching text. For example, abc{def} > | @@@ xyz; changes
// def to xyz and moves the cursor to before abc. Offset characters
// must be at the start or end, and they cannot move the cursor past
// the ante- or postcontext text. Placeholders are only valid in
// output text.
public int cursorOffset = 0; // only nonzero on output side
public boolean anchorStart = false;
public boolean anchorEnd = false;
/**
 * Parse one side of a rule, stopping at either the limit,
 * the END_OF_RULE character, or an operator. The parsed text
 * (with whitespace removed, quotes and escapes resolved, and
 * sets, variables, segment references, dots and quantified
 * items replaced by stand-in characters) is stored in
 * <code>text</code>; positions of context markers, the cursor,
 * cursor offsets, anchors and segments are stored in the
 * corresponding fields of this object.
 *
 * @param rule the rule source
 * @param pos index in rule at which to start parsing
 * @param limit index in rule at or after which nothing is parsed
 * @param parser the owning parser; used to parse sets, resolve
 * variables and allocate stand-in characters
 * @return the pos of the terminating character (or limit)
 */
public int parse(String rule, int pos, int limit,
                 TransliteratorParser parser) {
    int start = pos;
    StringBuffer buf = new StringBuffer();
    ParsePosition pp = null;
    int cursorOffsetPos = 0; // Position of first CURSOR_OFFSET on _right_
    int quoteStart = -1; // Most recent 'single quoted string'
    int quoteLimit = -1;
    int varStart = -1; // Most recent $variableReference
    int varLimit = -1;
    int[] iref = new int[1];

    main:
    while (pos < limit) {
        char c = rule.charAt(pos++);
        if (Character.isWhitespace(c)) {
            // Ignore whitespace. Note that this is not Unicode
            // spaces, but Java spaces -- a subset, representing
            // whitespace likely to be seen in code.
            continue;
        }
        if (OPERATORS.indexOf(c) >= 0) {
            --pos; // Backup to point to operator
            break main;
        }
        if (anchorEnd) {
            // Text after a presumed end anchor is a syntax err
            syntaxError("Malformed variable reference", rule, start);
        }
        if (UnicodeSet.resemblesPattern(rule, pos-1)) {
            if (pp == null) {
                pp = new ParsePosition(0);
            }
            pp.setIndex(pos-1); // Backup to opening '['
            buf.append(parser.parseSet(rule, pp));
            pos = pp.getIndex();
            continue;
        }
        // Handle escapes
        if (c == ESCAPE) {
            if (pos == limit) {
                syntaxError("Trailing backslash", rule, start);
            }
            iref[0] = pos;
            int escaped = Utility.unescapeAt(rule, iref);
            pos = iref[0];
            if (escaped == -1) {
                syntaxError("Malformed escape", rule, start);
            }
            UTF16.append(buf, escaped);
            continue;
        }
        // Handle quoted matter
        if (c == QUOTE) {
            int iq = rule.indexOf(QUOTE, pos);
            if (iq == pos) {
                buf.append(c); // Parse [''] outside quotes as [']
                ++pos;
            } else {
                /* This loop picks up a segment of quoted text of the
                 * form 'aaaa' each time through. If this segment
                 * hasn't really ended ('aaaa''bbbb') then it keeps
                 * looping, each time adding on a new segment. When it
                 * reaches the final quote it breaks.
                 */
                quoteStart = buf.length();
                for (;;) {
                    if (iq < 0) {
                        syntaxError("Unterminated quote", rule, start);
                    }
                    buf.append(rule.substring(pos, iq));
                    pos = iq+1;
                    if (pos < limit && rule.charAt(pos) == QUOTE) {
                        // Parse [''] inside quotes as [']
                        iq = rule.indexOf(QUOTE, pos+1);
                        // Continue looping
                    } else {
                        break;
                    }
                }
                quoteLimit = buf.length();
            }
            continue;
        }
        switch (c) {
        case ANCHOR_START:
            if (buf.length() == 0 && !anchorStart) {
                anchorStart = true;
            } else {
                syntaxError("Misplaced anchor start",
                            rule, start);
            }
            break;
        case SEGMENT_OPEN:
        case SEGMENT_CLOSE:
            // Handle segment definitions "(" and ")"
            // Parse "(", ")"
            if (segments == null) {
                segments = new Segments();
            }
            segments.addParenthesisAt(buf.length(), c == SEGMENT_OPEN);
            break;
        case END_OF_RULE:
            --pos; // Backup to point to END_OF_RULE
            break main;
        case SymbolTable.SYMBOL_REF:
            // Handle variable references and segment references "$1" .. "$9"
            {
                // A variable reference must be followed immediately
                // by a Unicode identifier start and zero or more
                // Unicode identifier part characters, or by a digit
                // 1..9 if it is a segment reference.
                if (pos == limit) {
                    // A variable ref character at the end acts as
                    // an anchor to the context limit, as in perl.
                    anchorEnd = true;
                    break;
                }
                // Parse "$1" "$2" .. "$9" .. (no upper limit)
                c = rule.charAt(pos);
                int r = Character.digit(c, 10);
                if (r >= 1 && r <= 9) {
                    ++pos;
                    while (pos < limit) {
                        c = rule.charAt(pos);
                        int d = Character.digit(c, 10);
                        if (d < 0) {
                            break;
                        }
                        // Refuse to overflow int (max 2147483647)
                        if (r > 214748364 ||
                            (r == 214748364 && d > 7)) {
                            syntaxError("Undefined segment reference",
                                        rule, start);
                        }
                        r = 10*r + d;
                        // Consume the digit. Without this the same
                        // digit is re-read until the overflow check
                        // fires, so "$10" and above could never parse.
                        ++pos;
                    }
                    if (r > maxRef) {
                        maxRef = r;
                    }
                    buf.append(parser.getSegmentStandin(r));
                } else {
                    if (pp == null) { // Lazy create
                        pp = new ParsePosition(0);
                    }
                    pp.setIndex(pos);
                    String name = parser.parseData.
                        parseReference(rule, pp, limit);
                    if (name == null) {
                        // This means the '$' was not followed by a
                        // valid name. Try to interpret it as an
                        // end anchor then. If this also doesn't work
                        // (if we see a following character) then signal
                        // an error.
                        anchorEnd = true;
                        break;
                    }
                    pos = pp.getIndex();
                    // If this is a variable definition statement,
                    // then the LHS variable will be undefined. In
                    // that case appendVariableDef() will append the
                    // special placeholder char variableLimit-1.
                    varStart = buf.length();
                    parser.appendVariableDef(name, buf);
                    varLimit = buf.length();
                }
            }
            break;
        case CONTEXT_ANTE:
            if (ante >= 0) {
                syntaxError("Multiple ante contexts", rule, start);
            }
            ante = buf.length();
            break;
        case CONTEXT_POST:
            if (post >= 0) {
                syntaxError("Multiple post contexts", rule, start);
            }
            post = buf.length();
            break;
        case CURSOR_POS:
            if (cursor >= 0) {
                syntaxError("Multiple cursors", rule, start);
            }
            cursor = buf.length();
            break;
        case CURSOR_OFFSET:
            if (cursorOffset < 0) {
                // Already counting leading offsets: still no text allowed
                if (buf.length() > 0) {
                    syntaxError("Misplaced " + c, rule, start);
                }
                --cursorOffset;
            } else if (cursorOffset > 0) {
                // Already counting trailing offsets: they must be
                // contiguous and precede the cursor
                if (buf.length() != cursorOffsetPos || cursor >= 0) {
                    syntaxError("Misplaced " + c, rule, start);
                }
                ++cursorOffset;
            } else {
                if (cursor == 0 && buf.length() == 0) {
                    cursorOffset = -1;
                } else if (cursor < 0) {
                    cursorOffsetPos = buf.length();
                    cursorOffset = 1;
                } else {
                    syntaxError("Misplaced " + c, rule, start);
                }
            }
            break;
        case DOT:
            buf.append(parser.getDotStandIn());
            break;
        case KLEENE_STAR:
        case ONE_OR_MORE:
        case ZERO_OR_ONE:
            // Quantifiers. We handle single characters, quoted strings,
            // variable references, and segments.
            // a+ matches aaa
            // 'foo'+ matches foofoofoo
            // $v+ matches xyxyxy if $v == xy
            // (seg)+ matches segsegseg
            {
                int qstart, qlimit;
                boolean[] isOpenParen = new boolean[1];
                boolean isSegment = false;
                if (segments != null &&
                    segments.getLastParenOffset(isOpenParen) == buf.length()) {
                    // The */+ immediately follows a segment
                    if (isOpenParen[0]) {
                        syntaxError("Misplaced quantifier", rule, start);
                    }
                    int[] startparam = new int[1];
                    int[] limitparam = new int[1];
                    if (!segments.extractLastParenSubstring(startparam, limitparam)) {
                        syntaxError("Mismatched segment delimiters", rule, start);
                    }
                    qstart = startparam[0];
                    qlimit = limitparam[0];
                    isSegment = true;
                } else {
                    // The */+ follows an isolated character or quote
                    // or variable reference
                    if (buf.length() == quoteLimit) {
                        // The */+ follows a 'quoted string'
                        qstart = quoteStart;
                        qlimit = quoteLimit;
                    } else if (buf.length() == varLimit) {
                        // The */+ follows a $variableReference
                        qstart = varStart;
                        qlimit = varLimit;
                    } else {
                        // The */+ follows a single character
                        if (buf.length() == 0) {
                            // ...except that there is nothing to
                            // quantify. Report it as a syntax error
                            // instead of failing later on a negative
                            // buffer index.
                            syntaxError("Misplaced quantifier", rule, start);
                        }
                        qstart = buf.length() - 1;
                        qlimit = qstart + 1;
                    }
                }
                UnicodeMatcher m =
                    new StringMatcher(buf.toString(), qstart, qlimit,
                                      isSegment, parser.data);
                int min = 0;
                int max = Quantifier.MAX;
                switch (c) {
                case ONE_OR_MORE:
                    min = 1;
                    break;
                case ZERO_OR_ONE:
                    min = 0;
                    max = 1;
                    break;
                // case KLEENE_STAR:
                //    do nothing -- min, max already set
                }
                m = new Quantifier(m, min, max);
                // Replace the quantified text with a single stand-in
                buf.setLength(qstart);
                buf.append(parser.generateStandInFor(m));
            }
            break;
        default:
            // Disallow unquoted characters other than [0-9A-Za-z]
            // in the printable ASCII range. These characters are
            // reserved for possible future use.
            if (c >= 0x0021 && c <= 0x007E &&
                !((c >= '0' && c <= '9') ||
                  (c >= 'A' && c <= 'Z') ||
                  (c >= 'a' && c <= 'z'))) {
                syntaxError("Unquoted " + c, rule, start);
            }
            buf.append(c);
            break;
        }
    }
    // Trailing cursor offsets must sit directly before the cursor
    if (cursorOffset > 0 && cursor != cursorOffsetPos) {
        syntaxError("Misplaced " + CURSOR_POS, rule, start);
    }
    text = buf.toString();
    return pos;
}
/**
 * Remove context: trim text down to the part between the ante
 * and post context markers (a missing marker means the start or
 * the end of text, respectively), then clear both markers and
 * both anchor flags.
 */
void removeContext() {
    int keyStart = (ante < 0) ? 0 : ante;
    int keyLimit = (post < 0) ? text.length() : post;
    text = text.substring(keyStart, keyLimit);
    ante = -1;
    post = -1;
    anchorStart = false;
    anchorEnd = false;
}
/**
 * Create and return an int[] array of segments, or null if this
 * rule half recorded no segment parentheses.
 */
int[] createSegments() {
    if (segments == null) {
        return null;
    }
    return segments.createArray();
}
}
//----------------------------------------------------------------------
// PUBLIC methods
//----------------------------------------------------------------------
/**
* Constructor. Does no work of its own: the parse state (data,
* direction, ruleCount, compoundFilter, variablesVector, parseData,
* idSplitPoint) is set up at the start of parseRules(), which both
* public parse() methods call.
*/
public TransliteratorParser() {
}
/**
 * Parse a set of rules supplied as a single string. After the
 * parse completes, examine the public data members for results.
 *
 * @param rules the rule source
 * @param direction the direction to parse for; passed through to
 * parseRules()
 */
public void parse(String rules, int direction) {
    String[] lines = { rules };
    RuleBody body = new RuleArray(lines);
    parseRules(body, direction);
}
/**
 * Parse a set of rules read line by line from a ResourceReader.
 * After the parse completes, examine the public data members for
 * results.
 *
 * @param rules the reader supplying the rule source
 * @param direction the direction to parse for; passed through to
 * parseRules()
 */
public void parse(ResourceReader rules, int direction) {
    RuleBody body = new RuleReader(rules);
    parseRules(body, direction);
}
//----------------------------------------------------------------------
// PRIVATE methods
//----------------------------------------------------------------------
/**
 * Parse an array of zero or more rules. The strings in the array are
 * treated as if they were concatenated together, with rule terminators
 * inserted between array elements if not present already.
 *
 * Any previous rules are discarded. Typically this method is called exactly
 * once, during construction.
 *
 * The member this.data will be set to null if there are no rules.
 *
 * Errors are collected rather than thrown one at a time: parsing
 * resumes after each bad rule, and all collected messages are reported
 * together, one per line, once parsing stops.
 *
 * @param ruleArray the source of rule lines; it is reset before use
 * @param dir the direction to parse for; compared against
 * Transliterator.FORWARD and Transliterator.REVERSE
 * @exception IllegalArgumentException if there is a syntax error in the
 * rules
 */
void parseRules(RuleBody ruleArray, int dir) {
    data = new RuleBasedTransliterator.Data();
    direction = dir;
    ruleCount = 0;
    compoundFilter = null;

    // By default, rules use part of the private use area
    // E000..F8FF for variables and other stand-ins. Currently
    // the range F000..F8FF is typically sufficient. The 'use
    // variable range' pragma allows rule sets to modify this.
    setVariableRange(0xF000, 0xF8FF);

    variablesVector = new Vector();
    parseData = new ParseData();

    StringBuffer errors = null;
    int errorCount = 0;

    ruleArray.reset();

    StringBuffer idBlockResult = new StringBuffer();
    idSplitPoint = -1;
    // The mode marks whether we are in the header ::id block, the
    // rule block, or the footer ::id block.
    // mode == 0: start: rule->1, ::id->0
    // mode == 1: in rules: rule->1, ::id->2
    // mode == 2: in footer rule block: rule->ERROR, ::id->2
    int mode = 0;

    // The compound filter offset is an index into idBlockResult.
    // If it is 0, then the compound filter occurred at the start,
    // and it is the offset to the _start_ of the compound filter
    // pattern. Otherwise it is the offset to the _limit_ of the
    // compound filter pattern within idBlockResult.
    int compoundFilterOffset = -1;

    main:
    for (;;) {
        String rule = ruleArray.nextLine();
        if (rule == null) {
            break;
        }
        int pos = 0;
        int limit = rule.length();
        while (pos < limit) {
            char c = rule.charAt(pos++);
            if (Character.isWhitespace(c)) {
                // Ignore leading whitespace. Note that this is not
                // Unicode spaces, but Java spaces -- a subset,
                // representing whitespace likely to be seen in code.
                continue;
            }
            // Skip lines starting with the comment character
            if (c == RULE_COMMENT_CHAR) {
                pos = rule.indexOf("\n", pos) + 1;
                if (pos == 0) {
                    break; // No "\n" found; rest of rule is a comment
                }
                continue; // Either fall out or restart with next line
            }
            // Often a rule file contains multiple errors. It's
            // convenient to the rule author if these are all reported
            // at once. We keep parsing rules even after a failure, up
            // to a specified limit, and report all errors at once.
            try {
                // We've found the start of a rule or ID. c is its first
                // character, and pos points past c.
                --pos;
                // Look for an ID token. Must have at least ID_TOKEN_LEN + 1
                // chars left.
                if ((pos + ID_TOKEN_LEN + 1) <= limit &&
                    rule.regionMatches(pos, ID_TOKEN, 0, ID_TOKEN_LEN)) {
                    pos += ID_TOKEN_LEN;
                    // Skip whitespace after the token. The bound is
                    // tested before each read so that a "::" followed
                    // only by whitespace cannot index past the end.
                    while (pos < limit &&
                           UCharacter.isWhitespace(rule.charAt(pos))) {
                        ++pos;
                    }
                    int lengthBefore = idBlockResult.length();
                    if (mode == 1) {
                        mode = 2;
                        idSplitPoint = lengthBefore;
                    }
                    int[] p = new int[] { pos };
                    boolean[] sawDelim = new boolean[1];
                    UnicodeSet[] cpdFilter = new UnicodeSet[1];
                    if (pos < limit) {
                        Transliterator.parseID(rule, idBlockResult, p, sawDelim, cpdFilter, direction, false);
                    }
                    if (p[0] == pos || !sawDelim[0]) {
                        // Invalid ::id. Quote the source up to the next
                        // rule terminator (or the end of the text),
                        // never indexing beyond the string.
                        int i1 = rule.indexOf(END_OF_RULE, pos);
                        if (i1 < 0) {
                            i1 = rule.length();
                        }
                        throw new IllegalArgumentException("Invalid ::ID " +
                                                           rule.substring(pos, i1));
                    }
                    if (cpdFilter[0] != null) {
                        if (compoundFilter != null) {
                            // Multiple compound filters
                            throw new IllegalArgumentException("Multiple compound filters");
                        }
                        compoundFilter = cpdFilter[0];
                        compoundFilterOffset = (direction == Transliterator.FORWARD) ?
                            lengthBefore : idBlockResult.length();
                    }
                    pos = p[0];
                } else if (resemblesPragma(rule, pos, limit)) {
                    int ppp = parsePragma(rule, pos, limit);
                    if (ppp < 0) {
                        throw new IllegalArgumentException("Unrecognized pragma: " +
                                                           rule.substring(pos));
                    }
                    pos = ppp;
                } else {
                    // Parse a rule
                    pos = parseRule(rule, pos, limit);
                    ++ruleCount;
                    if (mode == 2) {
                        // ::id in illegal position (because a rule
                        // occurred after the ::id footer block)
                        // NOTE(review): pos is already past this rule
                        // here, so the recovery skip in the catch block
                        // starts from the following text -- confirm
                        // that is intended.
                        throw new IllegalArgumentException("::ID in illegal position");
                    }
                    mode = 1;
                }
            } catch (IllegalArgumentException e) {
                if (errorCount == 30) {
                    errors.append("\nMore than 30 errors; further messages squelched");
                    break main;
                }
                if (errors == null) {
                    errors = new StringBuffer(e.getMessage());
                } else {
                    errors.append("\n" + e.getMessage());
                }
                ++errorCount;
                pos = ruleEnd(rule, pos, limit) + 1; // +1 advances past ';'
            }
        }
    }

    idBlock = idBlockResult.toString();

    // Convert the set vector to an array
    data.variables = new UnicodeMatcher[variablesVector.size()];
    variablesVector.copyInto(data.variables);
    variablesVector = null;

    // Do more syntax checking and index the rules
    try {
        if (compoundFilter != null) {
            if ((direction == Transliterator.FORWARD &&
                 compoundFilterOffset != 0) ||
                (direction == Transliterator.REVERSE &&
                 compoundFilterOffset != idBlock.length())) {
                throw new IllegalArgumentException("Compound filters misplaced");
            }
        }

        data.ruleSet.freeze();

        if (idSplitPoint < 0) {
            idSplitPoint = idBlock.length();
        }

        if (ruleCount == 0) {
            data = null;
        }
    } catch (IllegalArgumentException e) {
        if (errors == null) {
            errors = new StringBuffer(e.getMessage());
        } else {
            errors.append("\n").append(e.getMessage());
        }
    }

    if (errors != null) {
        throw new IllegalArgumentException(errors.toString());
    }
}
/**
* MAIN PARSER. Parse the next rule in the given rule string, starting
* at pos. Return the index after the last character parsed. Do not
* parse characters at or after limit.
*
* Important: The character at pos must be a non-whitespace character
* that is not the comment character.
*
* This method handles quoting, escaping, and whitespace removal. It
* parses the end-of-rule character. It recognizes context and cursor
* indicators. Once it does a lexical breakdown of the rule at pos, it
* creates a rule object and adds it to our rule list.
*
* This method is tightly coupled to the inner class RuleHalf.
*/
private int parseRule(String rule, int pos, int limit) {
// Parses a single statement (a transliteration rule or a variable
// definition) starting at 'pos' and returns the position just past it.
// Every syntaxError() call below throws IllegalArgumentException, so
// execution never continues past one of those calls.
// Locate the left side, operator, and right side
int start = pos;
char operator = 0;
RuleHalf left = new RuleHalf();
RuleHalf right = new RuleHalf();
// Forget any undefined variable recorded while parsing a previous rule;
// appendVariableDef() records at most one such name per rule.
undefinedVariableName = null;
pos = left.parse(rule, pos, limit, this);
// The operator character is read and 'pos' is advanced past it as a
// side effect of evaluating this condition.
if (pos == limit ||
OPERATORS.indexOf(operator = rule.charAt(pos++)) < 0) {
syntaxError("No operator", rule, start);
}
// Found an operator char. Check for forward-reverse operator.
// A reverse operator immediately followed by a forward operator is
// folded into the single two-way operator FWDREV_RULE_OP.
if (operator == REVERSE_RULE_OP &&
(pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
++pos;
operator = FWDREV_RULE_OP;
}
pos = right.parse(rule, pos, limit, this);
// If text remains, the right half must have stopped at the
// end-of-rule marker; reaching 'limit' without one is accepted.
if (pos < limit) {
if (rule.charAt(pos) == END_OF_RULE) {
++pos;
} else {
// RuleHalf parser must have terminated at an operator
syntaxError("Unquoted operator", rule, start);
}
}
if (operator == VARIABLE_DEF_OP) {
// LHS is the name. RHS is a single character, either a literal
// or a set (already parsed). If RHS is longer than one
// character, it is either a multi-character string, or multiple
// sets, or a mixture of chars and sets -- syntax error.
// We expect to see a single undefined variable (the one being
// defined).
if (undefinedVariableName == null) {
syntaxError("Missing '$' or duplicate definition", rule, start);
}
// appendVariableDef() emits the placeholder (char) --variableLimit for
// the undefined name, so the LHS must consist of exactly that one
// character, which now equals variableLimit.
if (left.text.length() != 1 || left.text.charAt(0) != variableLimit) {
syntaxError("Malformed LHS", rule, start);
}
if (left.anchorStart || left.anchorEnd ||
right.anchorStart || right.anchorEnd) {
syntaxError("Malformed variable def", rule, start);
}
// We allow anything on the right, including an empty string.
// Store the parsed right-hand text as the variable's value.
int n = right.text.length();
char[] value = new char[n];
right.text.getChars(0, n, value, 0);
data.variableNames.put(undefinedVariableName, value);
// Give back the placeholder slot taken by appendVariableDef().
++variableLimit;
return pos;
}
// If this is not a variable definition rule, we shouldn't have
// any undefined variable names.
if (undefinedVariableName != null) {
syntaxError("Undefined variable $" + undefinedVariableName,
rule, start);
}
// If the direction we want doesn't match the rule
// direction, do nothing.
// (Two-way rules apply to both directions and are never skipped.)
if (operator != FWDREV_RULE_OP &&
((direction == Transliterator.FORWARD) != (operator == FORWARD_RULE_OP))) {
return pos;
}
// Transform the rule into a forward rule by swapping the
// sides if necessary.
if (direction == Transliterator.REVERSE) {
RuleHalf temp = left;
left = right;
right = temp;
}
// Remove non-applicable elements in forward-reverse
// rules. Bidirectional rules ignore elements that do not
// apply.
// After the optional swap above, 'left' is the input side and 'right'
// the output side: context and segments are dropped from the output,
// cursor and segment references from the input.
if (operator == FWDREV_RULE_OP) {
right.removeContext();
right.segments = null;
left.cursor = left.maxRef = -1;
left.cursorOffset = 0;
}
// Normalize context
// A negative ante/post is replaced by the corresponding end of the
// text, i.e. 0 and left.text.length().
if (left.ante < 0) {
left.ante = 0;
}
if (left.post < 0) {
left.post = left.text.length();
}
// Context is only allowed on the input side. Cursors are only
// allowed on the output side. Segment delimiters can only appear
// on the left, and references on the right. Cursor offset
// cannot appear without an explicit cursor. Cursor offset
// cannot place the cursor outside the limits of the context.
// Anchors are only allowed on the input side.
if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
right.segments != null || left.maxRef >= 0 ||
(right.cursorOffset != 0 && right.cursor < 0) ||
// - The following two checks were used to ensure that
// - the cursor offset stayed within the ante- or postcontext.
// - However, with the addition of quantifiers, we have to
// - allow arbitrary cursor offsets and do runtime checking.
//(right.cursorOffset > (left.text.length() - left.post)) ||
//(-right.cursorOffset > left.ante) ||
right.anchorStart || right.anchorEnd) {
syntaxError("Malformed rule", rule, start);
}
// Check integrity of segments and segment references. Each
// segment's start must have a corresponding limit, and the
// references must not refer to segments that do not exist.
// NOTE(review): when left.segments is null, right.maxRef is not
// checked here, so a reference with no segments at all is not
// rejected by this method -- confirm whether that is caught elsewhere.
if (left.segments != null) {
if (!left.segments.validate()) {
syntaxError("Missing segment close", rule, start);
}
int n = left.segments.count();
if (right.maxRef > n) {
syntaxError("Undefined segment reference", rule, start);
}
}
// Build the rule from the input side (text, context bounds, segments,
// anchors) and the output side (text, cursor, cursor offset) and add
// it to the rule set.
data.ruleSet.addRule(new TransliterationRule(
left.text, left.ante, left.post,
right.text, right.cursor, right.cursorOffset,
left.createSegments(),
left.anchorStart, left.anchorEnd,
data));
return pos;
}
/**
* Set the variable range to [start, end] (inclusive).
*/
private void setVariableRange(int start, int end) {
    // Configures the block of stand-in characters [start, end] used for
    // variables (allocated upward from 'start') and segment references
    // (allocated downward from 'end').
    //
    // Throws IllegalArgumentException if the range is empty, negative, or
    // extends beyond 0xFFFF.
    if (start > end || start < 0 || end > 0xFFFF) {
        throw new IllegalArgumentException("Invalid variable range " + start + ", " + end);
    }
    // Segment references work down; variables work up. We don't
    // know how many of each we will need.
    data.segmentBase = (char) end;
    data.segmentCount = 0;
    data.variablesBase = variableNext = (char) start; // first private use
    // variableLimit is an exclusive bound held as a char. For end == 0xFFFF
    // the value end + 1 does not fit: (char) 0x10000 wraps to 0, which would
    // make every later "variableNext >= variableLimit" test report that the
    // stand-ins are exhausted. Clamp the bound to 0xFFFF in that case; the
    // top slot then remains available only as the first segment stand-in.
    variableLimit = (char) Math.min(end + 1, 0xFFFF);
}
/**
* Set the maximum backup to 'backup', in response to a pragma
* statement.
*/
private void pragmaMaximumBackup(int backup) {
// Not implemented yet: the 'backup' value handed over by parsePragma()
// is ignored, so a "use maximum backup" pragma is accepted but has no
// effect.
//TODO Finish
}
/**
* Begin normalizing all rules using the given mode, in response
* to a pragma statement.
*/
private void pragmaNormalizeRules(Normalizer.Mode mode) {
// Not implemented yet: the 'mode' handed over by parsePragma() is
// ignored, so the "use nfd rules" / "use nfc rules" pragmas are accepted
// but have no effect.
//TODO Finish
}
/**
* Return true if the given rule looks like a pragma.
* @param rule the rule text being parsed.
* @param pos offset to the first non-whitespace character
* of the rule.
* @param limit offset just past the last character of the rule.
*/
static boolean resemblesPragma(String rule, int pos, int limit) {
    // A pragma is recognized by its leading keyword: the text at 'pos'
    // must match /use\s/i. A negative result from parsePattern() means
    // the pattern did not match.
    final int matchEnd = Utility.parsePattern(rule, pos, limit, "use ", null);
    return matchEnd >= 0;
}
/**
* Parse a pragma. This method assumes resemblesPragma() has
* already returned true.
* @param rule the rule text being parsed.
* @param pos offset to the first non-whitespace character
* of the rule.
* @param limit offset just past the last character of the rule.
* @return the position index after the final ';' of the pragma,
* or -1 on failure.
*/
private int parsePragma(String rule, int pos, int limit) {
    // Receives the integer arguments ('#') of whichever pattern matches.
    final int[] args = new int[2];
    // resemblesPragma() has already returned true, so 'pos' points at
    // /use\s/i and those 4 characters can be skipped outright.
    final int body = pos + 4;
    // Recognized pragmas, tried in this order:
    //   use variable range 0xE000 0xEFFF;
    //   use maximum backup 16;
    //   use nfd rules;
    //   use nfc rules;
    int end;
    if ((end = Utility.parsePattern(rule, body, limit, "~variable range # #~;", args)) >= 0) {
        setVariableRange(args[0], args[1]);
    } else if ((end = Utility.parsePattern(rule, body, limit, "~maximum backup #~;", args)) >= 0) {
        pragmaMaximumBackup(args[0]);
    } else if ((end = Utility.parsePattern(rule, body, limit, "~nfd rules~;", null)) >= 0) {
        pragmaNormalizeRules(Normalizer.DECOMP);
    } else if ((end = Utility.parsePattern(rule, body, limit, "~nfc rules~;", null)) >= 0) {
        pragmaNormalizeRules(Normalizer.COMPOSE);
    } else {
        // Syntax error: unable to parse pragma
        end = -1;
    }
    return end;
}
/**
* Throw an exception indicating a syntax error. Search the rule string
* for the probable end of the rule. Of course, if the error is that
* the end of rule marker is missing, then the rule end will not be found.
* In any case the rule start will be correctly reported.
* @param msg error description
* @param rule pattern string
* @param start position of first character of current rule
*/
static final void syntaxError(String msg, String rule, int start) {
    // Report the offending rule text from its start up to the probable end
    // of the rule (or the end of the string when no terminator is found).
    final int stop = ruleEnd(rule, start, rule.length());
    final String offending = Utility.escape(rule.substring(start, stop));
    final String description = msg + " in \"" + offending + '"';
    throw new IllegalArgumentException(description);
}
static final int ruleEnd(String rule, int start, int limit) {
    // Index of the first unquoted, unescaped ';' in [start, limit), or
    // 'limit' itself when there is none.
    final int terminator = quotedIndexOf(rule, start, limit, ";");
    return (terminator < 0) ? limit : terminator;
}
/**
* Parse a UnicodeSet out, store it, and return the stand-in character
* used to represent it.
*/
private final char parseSet(String rule, ParsePosition pos) {
    // Construct the set from the rule text, the parse position and the
    // parser's symbol table.
    final UnicodeSet parsed = new UnicodeSet(rule, pos, parseData);
    // Bail out before compacting if no stand-in character is left.
    final boolean exhausted = variableNext >= variableLimit;
    if (exhausted) {
        throw new RuntimeException("Private use variables exhausted");
    }
    parsed.compact();
    // Store the set and hand back the character that represents it.
    final char standIn = generateStandInFor(parsed);
    return standIn;
}
/**
* Generate and return a stand-in for a new UnicodeMatcher. Store
* the matcher.
*/
char generateStandInFor(UnicodeMatcher matcher) {
    // assert(matcher != null);
    // Stand-ins are handed out upward from variableNext; variableLimit is
    // the exclusive upper bound.
    if (variableNext < variableLimit) {
        variablesVector.addElement(matcher);
        final char standIn = variableNext;
        ++variableNext;
        return standIn;
    }
    throw new RuntimeException("Private use variables exhausted");
}
/**
* Return the stand-in for the dot set. It is allocated the first
* time and reused thereafter.
*/
char getDotStandIn() {
    // -1 marks "not allocated yet"; any other value is the cached stand-in.
    if (dotStandIn != -1) {
        return (char) dotStandIn;
    }
    // First request: allocate a stand-in for the dot set and remember it.
    final char standIn = generateStandInFor(new UnicodeSet(DOT_SET));
    dotStandIn = standIn;
    return standIn;
}
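/**
* Append the value of the given variable name to the given
* StringBuffer. If the name is undefined and no other undefined name
* has been recorded for the current rule, a placeholder character is
* appended instead and the name is remembered, so that variable
* definition statements can be parsed.
* @exception IllegalArgumentException if the name is undefined and
* another undefined name has already been recorded.
*/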
private void appendVariableDef(String name, StringBuffer buf) {
    final char[] definition = (char[]) data.variableNames.get(name);

    // Known variable: splice in its definition and we are done.
    if (definition != null) {
        buf.append(definition);
        return;
    }

    // One undefined variable is tolerated so that variable definition
    // statements work; a second one is an error.
    if (undefinedVariableName != null) {
        throw new IllegalArgumentException("Undefined variable $"
                                           + name);
    }

    // First undefined variable: remember its name and emit a placeholder,
    // taken by lowering variableLimit by one.
    undefinedVariableName = name;
    if (variableNext >= variableLimit) {
        throw new RuntimeException("Private use variables exhausted");
    }
    buf.append((char) --variableLimit);
}
char getSegmentStandin(int r) {
    // assert(r>=1);
    // References up to the current segment count need no bookkeeping.
    if (r <= data.segmentCount) {
        return data.getSegmentStandin(r);
    }
    // A higher reference than any seen so far: grow the segment count and
    // lower variableLimit to data.segmentBase - r + 1, then make sure the
    // variables growing upward have not reached that bound.
    data.segmentCount = r;
    variableLimit = (char) (data.segmentBase - r + 1);
    if (variableNext >= variableLimit) {
        throw new IllegalArgumentException("Too many variables / segments");
    }
    return data.getSegmentStandin(r);
}
/**
* Returns the index of the first character in a set, ignoring quoted text.
* For example, in the string "abc'hide'h", the 'h' in "hide" will not be
* found by a search for "h". Unlike String.indexOf(), this method searches
* not for a single character, but for any character of the string
* <code>setOfChars</code>.
* @param text text to be searched
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param setOfChars string with one or more distinct characters
* @return Offset of the first character in <code>setOfChars</code>
* found, or -1 if not found.
* @see #indexOf
*/
private static int quotedIndexOf(String text, int start, int limit,
                                 String setOfChars) {
    int index = start;
    while (index < limit) {
        final char c = text.charAt(index);
        if (c == ESCAPE) {
            // Skip the escape character and the character it protects.
            index += 2;
        } else if (c == QUOTE) {
            // Advance to the closing quote (or to limit if unterminated),
            // then step past it.
            do {
                ++index;
            } while (index < limit && text.charAt(index) != QUOTE);
            ++index;
        } else if (setOfChars.indexOf(c) >= 0) {
            // Unquoted, unescaped member of the search set.
            return index;
        } else {
            ++index;
        }
    }
    return -1;
}
}
//eof