| /* |
| ********************************************************************** |
| * Copyright (c) 2001, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliteratorParser.java,v $ |
| * $Date: 2001/10/24 00:03:38 $ |
| * $Revision: 1.7 $ |
| ********************************************************************** |
| */ |
| package com.ibm.text; |
| |
| import com.ibm.text.resources.ResourceReader; |
| import com.ibm.util.Utility; |
| import java.util.Stack; |
| import java.util.Vector; |
| import java.text.ParsePosition; |
| |
| class TransliteratorParser { |
| |
| //---------------------------------------------------------------------- |
| // Data members |
| //---------------------------------------------------------------------- |
| |
| /** |
| * PUBLIC data member containing the parsed data object, or null if |
| * there were no rules. |
| */ |
| public RuleBasedTransliterator.Data data; |
| |
| /** |
| * PUBLIC data member. |
| * The block of ::IDs, both at the top and at the bottom. |
| * Inserted into these may be additional rules at the |
| * idSplitPoint. |
| */ |
| public String idBlock; |
| |
| /** |
| * PUBLIC data member. |
| * In a compound RBT, the index at which the RBT rules are |
| * inserted into the ID block. Index 0 means before any IDs |
| * in the block. Index idBlock.length() means after all IDs |
| * in the block. Index is a string index. |
| */ |
| public int idSplitPoint; |
| |
| /** |
| * PUBLIC data member containing the parsed compound filter, if any. |
| */ |
| public UnicodeSet compoundFilter; |
| |
| |
| // The number of rules parsed. This tells us if there were |
| // any actual transliterator rules, or if there were just ::ID |
| // block IDs. |
| private int ruleCount; |
| |
| private int direction; |
| |
| /** |
| * Temporary symbol table used during parsing. |
| */ |
| private ParseData parseData; |
| |
| /** |
| * Temporary vector of set variables. When parsing is complete, this |
| * is copied into the array data.variables. As with data.variables, |
| * element 0 corresponds to character data.variablesBase. |
| */ |
| private Vector variablesVector; |
| |
| /** |
| * The next available stand-in for variables. This starts at some point in |
| * the private use area (discovered dynamically) and increments up toward |
| * <code>variableLimit</code>. At any point during parsing, available |
| * variables are <code>variableNext..variableLimit-1</code>. |
| */ |
| private char variableNext; |
| |
| /** |
| * The last available stand-in for variables. This is discovered |
| * dynamically. At any point during parsing, available variables are |
| * <code>variableNext..variableLimit-1</code>. During variable definition |
| * we use the special value variableLimit-1 as a placeholder. |
| */ |
| private char variableLimit; |
| |
| /** |
| * When we encounter an undefined variable, we do not immediately signal |
| * an error, in case we are defining this variable, e.g., "$a = [a-z];". |
| * Instead, we save the name of the undefined variable, and substitute |
| * in the placeholder char variableLimit - 1, and decrement |
| * variableLimit. |
| */ |
| private String undefinedVariableName; |
| |
| /** |
| * The stand-in character for the 'dot' set, represented by '.' in |
| * patterns. This is allocated the first time it is needed, and |
| * reused thereafter. |
| */ |
| private int dotStandIn = -1; |
| |
| //---------------------------------------------------------------------- |
| // Constants |
| //---------------------------------------------------------------------- |
| |
| // Indicator for ID blocks |
| private static final String ID_TOKEN = "::"; |
| private static final int ID_TOKEN_LEN = 2; |
| |
| // Operators |
| private static final char VARIABLE_DEF_OP = '='; |
| private static final char FORWARD_RULE_OP = '>'; |
| private static final char REVERSE_RULE_OP = '<'; |
| private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op |
| |
| private static final String OPERATORS = "=><"; |
| |
| // Other special characters |
| private static final char QUOTE = '\''; |
| private static final char ESCAPE = '\\'; |
| private static final char END_OF_RULE = ';'; |
| private static final char RULE_COMMENT_CHAR = '#'; |
| |
| private static final char CONTEXT_ANTE = '{'; // ante{key |
| private static final char CONTEXT_POST = '}'; // key}post |
| private static final char CURSOR_POS = '|'; |
| private static final char CURSOR_OFFSET = '@'; |
| private static final char ANCHOR_START = '^'; |
| |
| private static final char KLEENE_STAR = '*'; |
| private static final char ONE_OR_MORE = '+'; |
| private static final char ZERO_OR_ONE = '?'; |
| |
| private static final char DOT = '.'; |
| private static final String DOT_SET = "[^[:Zp:][:Zl:]\r\n$]"; |
| |
| // By definition, the ANCHOR_END special character is a |
| // trailing SymbolTable.SYMBOL_REF character. |
| // private static final char ANCHOR_END = '$'; |
| |
| // Segments of the input string are delimited by "(" and ")". In the |
| // output string these segments are referenced as "$1" through "$9". |
| private static final char SEGMENT_OPEN = '('; |
| private static final char SEGMENT_CLOSE = ')'; |
| |
| //---------------------------------------------------------------------- |
| // class ParseData |
| //---------------------------------------------------------------------- |
| |
| /** |
| * This class implements the SymbolTable interface. It is used |
| * during parsing to give UnicodeSet access to variables that |
| * have been defined so far. Note that it uses variablesVector, |
| * _not_ data.variables. |
| */ |
| private class ParseData implements SymbolTable { |
| |
| /** |
| * Implement SymbolTable API. |
| */ |
| public char[] lookup(String name) { |
| return (char[]) data.variableNames.get(name); |
| } |
| |
| /** |
| * Implement SymbolTable API. |
| */ |
| public UnicodeSet lookupSet(int ch) { |
| // Note that we cannot use data.lookupSet() because the |
| // set array has not been constructed yet. |
| int i = ch - data.variablesBase; |
| if (i >= 0 && i < variablesVector.size()) { |
| return (UnicodeSet) variablesVector.elementAt(i); |
| } |
| return null; |
| } |
| |
| /** |
| * Implement SymbolTable API. Parse out a symbol reference |
| * name. |
| */ |
| public String parseReference(String text, ParsePosition pos, int limit) { |
| int start = pos.getIndex(); |
| int i = start; |
| while (i < limit) { |
| char c = text.charAt(i); |
| if ((i==start && !Character.isUnicodeIdentifierStart(c)) || |
| !Character.isUnicodeIdentifierPart(c)) { |
| break; |
| } |
| ++i; |
| } |
| if (i == start) { // No valid name chars |
| return null; |
| } |
| pos.setIndex(i); |
| return text.substring(start, i); |
| } |
| } |
| |
| //---------------------------------------------------------------------- |
| // classes RuleBody, RuleArray, and RuleReader |
| //---------------------------------------------------------------------- |
| |
| /** |
| * A private abstract class representing the interface to rule |
| * source code that is broken up into lines. Handles the |
| * folding of lines terminated by a backslash. This folding |
| * is limited; it does not account for comments, quotes, or |
| * escapes, so its use to be limited. |
| */ |
| private static abstract class RuleBody { |
| |
| /** |
| * Retrieve the next line of the source, or return null if |
| * none. Folds lines terminated by a backslash into the |
| * next line, without regard for comments, quotes, or |
| * escapes. |
| */ |
| String nextLine() { |
| String s = handleNextLine(); |
| if (s != null && |
| s.length() > 0 && |
| s.charAt(s.length() - 1) == '\\') { |
| |
| StringBuffer b = new StringBuffer(s); |
| do { |
| b.deleteCharAt(b.length()-1); |
| s = handleNextLine(); |
| if (s == null) { |
| break; |
| } |
| b.append(s); |
| } while (s.length() > 0 && |
| s.charAt(s.length() - 1) == '\\'); |
| |
| s = b.toString(); |
| } |
| return s; |
| } |
| |
| /** |
| * Reset to the first line of the source. |
| */ |
| abstract void reset(); |
| |
| /** |
| * Subclass method to return the next line of the source. |
| */ |
| abstract String handleNextLine(); |
| }; |
| |
| /** |
| * RuleBody subclass for a String[] array. |
| */ |
| private static class RuleArray extends RuleBody { |
| String[] array; |
| int i; |
| public RuleArray(String[] array) { this.array = array; i = 0; } |
| public String handleNextLine() { |
| return (i < array.length) ? array[i++] : null; |
| } |
| public void reset() { |
| i = 0; |
| } |
| }; |
| |
| /** |
| * RuleBody subclass for a ResourceReader. |
| */ |
| private static class RuleReader extends RuleBody { |
| ResourceReader reader; |
| public RuleReader(ResourceReader reader) { this.reader = reader; } |
| public String handleNextLine() { |
| try { |
| return reader.readLine(); |
| } catch (java.io.IOException e) {} |
| return null; |
| } |
| public void reset() { |
| reader.reset(); |
| } |
| }; |
| |
| //---------------------------------------------------------------------- |
| // class Segments |
| //---------------------------------------------------------------------- |
| |
| /** |
| * Segments are parentheses-enclosed regions of the input string. |
| * These are referenced in the output string using the notation $1, |
| * $2, etc. Numbering is in order of appearance of the left |
| * parenthesis. Number is one-based. Segments are defined as start, |
| * limit pairs. Segments may nest. |
| * |
| * During parsing, segment data is encoded in an object of class |
| * Segments. At runtime, the same data is encoded in compact form as |
| * an array of integers in a TransliterationRule. The runtime encoding |
| * must satisfy three goals: |
| * |
| * 1. Iterate over the offsets in a pattern, from left to right, |
| * and indicate all segment boundaries, in order. This is done |
| * during matching. |
| * |
| * 2. Given a reference $n, produce the start and limit offsets |
| * for that segment. This is done during replacement. |
| * |
| * 3. Similar to goal 1, but in addition, indicate whether each |
| * segment boundary is a start or a limit, in other words, whether |
| * each is an open paren or a close paren. This is required by |
| * the toRule() method. |
| * |
| * Goal 1 must be satisfied at high speed since this is done during |
| * matching. Goal 2 is next most important. Goal 3 is not performance |
| * critical since it is only needed by toRule(). |
| * |
| * The array of integers is actually two arrays concatenated. The |
| * first gives the index values of the open and close parentheses in |
| * the order they appear. The second maps segment numbers to the |
| * indices of the first array. The two arrays have the same length. |
| * Iterating over the first array satisfies goal 1. Indexing into the |
| * second array satisfies goal 2. Goal 3 is satisfied by iterating |
| * over the second array and constructing the required data when |
| * needed. This is what toRule() does. |
| * |
| * Example: (a b(c d)e f) |
| * 0 1 2 3 4 5 6 |
| * |
| * First array: Indices are 0, 2, 4, and 6. |
| |
| * Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the |
| * second array is 0, 3, 1 2 -- these give the indices in the |
| * first array at which $1:open, $1:close, $2:open, and $2:close |
| * occur. |
| * |
| * The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1 |
| * |
| * Each subarray is terminated with a -1, and two leading entries |
| * give the number of segments and the offset to the first entry |
| * of the second array. In addition, the second array value are |
| * all offset by 2 so they index directly into the final array. |
| * The total array size is 4*segments[0] + 4. The second index is |
| * 2*segments[0] + 3. |
| * |
| * In the output string, a segment reference is indicated by a |
| * character in a special range, as defined by |
| * RuleBasedTransliterator.Data. |
| * |
| * Most rules have no segments, in which case segments is null, and the |
| * output string need not be checked for segment reference characters. |
| * |
| * See also rbt_rule.h/cpp. |
| */ |
| private static class Segments { |
| |
| private Vector offsets; // holds Integer objects |
| |
| private Vector isOpenParen; // holds Boolean objects |
| |
| private int offset(int i) { |
| return ((Integer) offsets.elementAt(i)).intValue(); |
| } |
| |
| private boolean isOpen(int i) { |
| return ((Boolean) isOpenParen.elementAt(i)).booleanValue(); |
| } |
| |
| // size of the Vectors |
| private int size() { |
| // assert(offset.size() == isOpenParen.size()); |
| return offsets.size(); |
| } |
| |
| public Segments() { |
| offsets = new Vector(); |
| isOpenParen = new Vector(); |
| } |
| |
| public void addParenthesisAt(int offset, boolean isOpen) { |
| offsets.addElement(new Integer(offset)); |
| isOpenParen.addElement(new Boolean(isOpen)); |
| } |
| |
| public int getLastParenOffset(boolean[] isOpenParen) { |
| if (size() == 0) { |
| return -1; |
| } |
| isOpenParen[0] = isOpen(size()-1); |
| return offset(size()-1); |
| } |
| |
| // Remove the last (rightmost) segment. Store its offsets in start |
| // and limit, and then convert all offsets at or after start to be |
| // equal to start. Upon failure, return FALSE. Assume that the |
| // caller has already called getLastParenOffset() and validated that |
| // there is at least one parenthesis and that the last one is a close |
| // paren. |
| public boolean extractLastParenSubstring(int[] start, int[] limit) { |
| // assert(offsets.size() > 0); |
| // assert(isOpenParen.elementAt(isOpenParen.size()-1) == 0); |
| int i = size() - 1; |
| int n = 1; // count of close parens we need to match |
| // Record position of the last close paren |
| limit[0] = offset(i); |
| --i; // back up to the one before the last one |
| while (i >= 0 && n != 0) { |
| n += isOpen(i) ? -1 : 1; |
| } |
| if (n != 0) { |
| return false; |
| } |
| // assert(i>=0); |
| start[0] = offset(i); |
| // Reset all segment pairs from i to size() - 1 to [start, start+1). |
| while (i<size()) { |
| int o = isOpen(i) ? start[0] : (start[0]+1); |
| offsets.setElementAt(new Integer(o), i); |
| ++i; |
| } |
| return true; |
| } |
| |
| // Assume caller has already gotten a TRUE validate(). |
| public int[] createArray() { |
| int c = count(); // number of segments |
| int arrayLen = 4*c + 4; |
| int[] array = new int[arrayLen]; |
| int a2offset = 2*c + 3; // offset to array 2 |
| |
| array[0] = c; |
| array[1] = a2offset; |
| int i; |
| for (i=0; i<2*c; ++i) { |
| array[2+i] = offset(i); |
| } |
| array[a2offset-1] = -1; |
| array[arrayLen-1] = -1; |
| // Now walk through and match up segment numbers with parentheses. |
| // Number segments from 0. We're going to offset all entries by 2 |
| // to skip the first two elements, array[0] and array[1]. |
| Stack stack = new Stack(); |
| int nextOpen = 0; // seg # of next open, 0-based |
| for (i=0; i<2*c; ++i) { |
| boolean open = isOpen(i); |
| // Let seg be the zero-based segment number. |
| // Open parens are at 2*seg in array 2. |
| // Close parens are at 2*seg+1 in array 2. |
| if (open) { |
| array[a2offset + 2*nextOpen] = 2+i; |
| stack.push(new Integer(nextOpen)); |
| ++nextOpen; |
| } else { |
| int nextClose = ((Integer) stack.pop()).intValue(); |
| array[a2offset + 2*nextClose+1] = 2+i; |
| } |
| } |
| // assert(stack.empty()); |
| |
| return array; |
| } |
| |
| public boolean validate() { |
| // want number of parens >= 2 |
| // want number of parens to be even |
| // want first paren '(' |
| // want parens to match up in the end |
| if ((size() < 2) || (size() % 2 != 0) || !isOpen(0)) { |
| return false; |
| } |
| int n = 0; |
| for (int i=0; i<size(); ++i) { |
| n += isOpen(i) ? 1 : -1; |
| if (n < 0) { |
| return false; |
| } |
| } |
| return n == 0; |
| } |
| |
| // Number of segments |
| // Assume caller has already gotten a TRUE validate(). |
| public int count() { |
| // assert(validate()); |
| return size() / 2; |
| } |
| } |
| |
| //---------------------------------------------------------------------- |
| // class RuleHalf |
| //---------------------------------------------------------------------- |
| |
| /** |
| * A class representing one side of a rule. This class knows how to |
| * parse half of a rule. It is tightly coupled to the method |
| * TransliteratorParser.parseRule(). |
| */ |
| private static class RuleHalf { |
| |
| public String text; |
| |
| public int cursor = -1; // position of cursor in text |
| public int ante = -1; // position of ante context marker '{' in text |
| public int post = -1; // position of post context marker '}' in text |
| |
| // Record the position of the segment substrings and references. A |
| // given side should have segments or segment references, but not |
| // both. |
| public Segments segments = null; |
| public int maxRef = -1; // index of largest ref (1..9) |
| |
| // Record the offset to the cursor either to the left or to the |
| // right of the key. This is indicated by characters on the output |
| // side that allow the cursor to be positioned arbitrarily within |
| // the matching text. For example, abc{def} > | @@@ xyz; changes |
| // def to xyz and moves the cursor to before abc. Offset characters |
| // must be at the start or end, and they cannot move the cursor past |
| // the ante- or postcontext text. Placeholders are only valid in |
| // output text. |
| public int cursorOffset = 0; // only nonzero on output side |
| |
| public boolean anchorStart = false; |
| public boolean anchorEnd = false; |
| |
| /** |
| * Parse one side of a rule, stopping at either the limit, |
| * the END_OF_RULE character, or an operator. Return |
| * the pos of the terminating character (or limit). |
| */ |
| public int parse(String rule, int pos, int limit, |
| TransliteratorParser parser) { |
| int start = pos; |
| StringBuffer buf = new StringBuffer(); |
| ParsePosition pp = null; |
| int cursorOffsetPos = 0; // Position of first CURSOR_OFFSET on _right_ |
| boolean done = false; |
| int quoteStart = -1; // Most recent 'single quoted string' |
| int quoteLimit = -1; |
| int varStart = -1; // Most recent $variableReference |
| int varLimit = -1; |
| int[] iref = new int[1]; |
| |
| main: |
| while (pos < limit && !done) { |
| char c = rule.charAt(pos++); |
| if (Character.isWhitespace(c)) { |
| // Ignore whitespace. Note that this is not Unicode |
| // spaces, but Java spaces -- a subset, representing |
| // whitespace likely to be seen in code. |
| continue; |
| } |
| if (OPERATORS.indexOf(c) >= 0) { |
| --pos; // Backup to point to operator |
| break main; |
| } |
| if (anchorEnd) { |
| // Text after a presumed end anchor is a syntax err |
| syntaxError("Malformed variable reference", rule, start); |
| } |
| if (UnicodeSet.resemblesPattern(rule, pos-1)) { |
| if (pp == null) { |
| pp = new ParsePosition(0); |
| } |
| pp.setIndex(pos-1); // Backup to opening '[' |
| buf.append(parser.parseSet(rule, pp)); |
| pos = pp.getIndex(); |
| continue; |
| } |
| // Handle escapes |
| if (c == ESCAPE) { |
| if (pos == limit) { |
| syntaxError("Trailing backslash", rule, start); |
| } |
| iref[0] = pos; |
| int escaped = Utility.unescapeAt(rule, iref); |
| pos = iref[0]; |
| if (escaped == -1) { |
| syntaxError("Malformed escape", rule, start); |
| } |
| UTF16.append(buf, escaped); |
| continue; |
| } |
| // Handle quoted matter |
| if (c == QUOTE) { |
| int iq = rule.indexOf(QUOTE, pos); |
| if (iq == pos) { |
| buf.append(c); // Parse [''] outside quotes as ['] |
| ++pos; |
| } else { |
| /* This loop picks up a segment of quoted text of the |
| * form 'aaaa' each time through. If this segment |
| * hasn't really ended ('aaaa''bbbb') then it keeps |
| * looping, each time adding on a new segment. When it |
| * reaches the final quote it breaks. |
| */ |
| quoteStart = buf.length(); |
| for (;;) { |
| if (iq < 0) { |
| syntaxError("Unterminated quote", rule, start); |
| } |
| buf.append(rule.substring(pos, iq)); |
| pos = iq+1; |
| if (pos < limit && rule.charAt(pos) == QUOTE) { |
| // Parse [''] inside quotes as ['] |
| iq = rule.indexOf(QUOTE, pos+1); |
| // Continue looping |
| } else { |
| break; |
| } |
| } |
| quoteLimit = buf.length(); |
| } |
| continue; |
| } |
| switch (c) { |
| case ANCHOR_START: |
| if (buf.length() == 0 && !anchorStart) { |
| anchorStart = true; |
| } else { |
| syntaxError("Misplaced anchor start", |
| rule, start); |
| } |
| break; |
| case SEGMENT_OPEN: |
| case SEGMENT_CLOSE: |
| // Handle segment definitions "(" and ")" |
| // Parse "(", ")" |
| if (segments == null) { |
| segments = new Segments(); |
| } |
| segments.addParenthesisAt(buf.length(), c == SEGMENT_OPEN); |
| break; |
| case END_OF_RULE: |
| --pos; // Backup to point to END_OF_RULE |
| break main; |
| case SymbolTable.SYMBOL_REF: |
| // Handle variable references and segment references "$1" .. "$9" |
| { |
| // A variable reference must be followed immediately |
| // by a Unicode identifier start and zero or more |
| // Unicode identifier part characters, or by a digit |
| // 1..9 if it is a segment reference. |
| if (pos == limit) { |
| // A variable ref character at the end acts as |
| // an anchor to the context limit, as in perl. |
| anchorEnd = true; |
| break; |
| } |
| // Parse "$1" "$2" .. "$9" .. (no upper limit) |
| c = rule.charAt(pos); |
| int r = Character.digit(c, 10); |
| if (r >= 1 && r <= 9) { |
| ++pos; |
| while (pos < limit) { |
| c = rule.charAt(pos); |
| int d = Character.digit(c, 10); |
| if (d < 0) { |
| break; |
| } |
| if (r > 214748364 || |
| (r == 214748364 && d > 7)) { |
| syntaxError("Undefined segment reference", |
| rule, start); |
| } |
| r = 10*r + d; |
| } |
| if (r > maxRef) { |
| maxRef = r; |
| } |
| buf.append(parser.getSegmentStandin(r)); |
| } else { |
| if (pp == null) { // Lazy create |
| pp = new ParsePosition(0); |
| } |
| pp.setIndex(pos); |
| String name = parser.parseData. |
| parseReference(rule, pp, limit); |
| if (name == null) { |
| // This means the '$' was not followed by a |
| // valid name. Try to interpret it as an |
| // end anchor then. If this also doesn't work |
| // (if we see a following character) then signal |
| // an error. |
| anchorEnd = true; |
| break; |
| } |
| pos = pp.getIndex(); |
| // If this is a variable definition statement, |
| // then the LHS variable will be undefined. In |
| // that case appendVariableDef() will append the |
| // special placeholder char variableLimit-1. |
| varStart = buf.length(); |
| parser.appendVariableDef(name, buf); |
| varLimit = buf.length(); |
| } |
| } |
| break; |
| case CONTEXT_ANTE: |
| if (ante >= 0) { |
| syntaxError("Multiple ante contexts", rule, start); |
| } |
| ante = buf.length(); |
| break; |
| case CONTEXT_POST: |
| if (post >= 0) { |
| syntaxError("Multiple post contexts", rule, start); |
| } |
| post = buf.length(); |
| break; |
| case CURSOR_POS: |
| if (cursor >= 0) { |
| syntaxError("Multiple cursors", rule, start); |
| } |
| cursor = buf.length(); |
| break; |
| case CURSOR_OFFSET: |
| if (cursorOffset < 0) { |
| if (buf.length() > 0) { |
| syntaxError("Misplaced " + c, rule, start); |
| } |
| --cursorOffset; |
| } else if (cursorOffset > 0) { |
| if (buf.length() != cursorOffsetPos || cursor >= 0) { |
| syntaxError("Misplaced " + c, rule, start); |
| } |
| ++cursorOffset; |
| } else { |
| if (cursor == 0 && buf.length() == 0) { |
| cursorOffset = -1; |
| } else if (cursor < 0) { |
| cursorOffsetPos = buf.length(); |
| cursorOffset = 1; |
| } else { |
| syntaxError("Misplaced " + c, rule, start); |
| } |
| } |
| break; |
| case DOT: |
| buf.append(parser.getDotStandIn()); |
| break; |
| case KLEENE_STAR: |
| case ONE_OR_MORE: |
| case ZERO_OR_ONE: |
| // Quantifiers. We handle single characters, quoted strings, |
| // variable references, and segments. |
| // a+ matches aaa |
| // 'foo'+ matches foofoofoo |
| // $v+ matches xyxyxy if $v == xy |
| // (seg)+ matches segsegseg |
| { |
| int qstart, qlimit; |
| boolean[] isOpenParen = new boolean[1]; |
| boolean isSegment = false; |
| if (segments != null && |
| segments.getLastParenOffset(isOpenParen) == buf.length()) { |
| // The */+ immediately follows a segment |
| if (isOpenParen[0]) { |
| syntaxError("Misplaced quantifier", rule, start); |
| } |
| int[] startparam = new int[1]; |
| int[] limitparam = new int[1]; |
| if (!segments.extractLastParenSubstring(startparam, limitparam)) { |
| syntaxError("Mismatched segment delimiters", rule, start); |
| } |
| qstart = startparam[0]; |
| qlimit = limitparam[0]; |
| isSegment = true; |
| } else { |
| // The */+ follows an isolated character or quote |
| // or variable reference |
| if (buf.length() == quoteLimit) { |
| // The */+ follows a 'quoted string' |
| qstart = quoteStart; |
| qlimit = quoteLimit; |
| } else if (buf.length() == varLimit) { |
| // The */+ follows a $variableReference |
| qstart = varStart; |
| qlimit = varLimit; |
| } else { |
| // The */+ follows a single character |
| qstart = buf.length() - 1; |
| qlimit = qstart + 1; |
| } |
| } |
| UnicodeMatcher m = |
| new StringMatcher(buf.toString(), qstart, qlimit, |
| isSegment, parser.data); |
| int min = 0; |
| int max = Quantifier.MAX; |
| switch (c) { |
| case ONE_OR_MORE: |
| min = 1; |
| break; |
| case ZERO_OR_ONE: |
| min = 0; |
| max = 1; |
| break; |
| // case KLEENE_STAR: |
| // do nothing -- min, max already set |
| } |
| m = new Quantifier(m, min, max); |
| buf.setLength(qstart); |
| buf.append(parser.generateStandInFor(m)); |
| } |
| break; |
| default: |
| // Disallow unquoted characters other than [0-9A-Za-z] |
| // in the printable ASCII range. These characters are |
| // reserved for possible future use. |
| if (c >= 0x0021 && c <= 0x007E && |
| !((c >= '0' && c <= '9') || |
| (c >= 'A' && c <= 'Z') || |
| (c >= 'a' && c <= 'z'))) { |
| syntaxError("Unquoted " + c, rule, start); |
| } |
| buf.append(c); |
| break; |
| } |
| } |
| |
| if (cursorOffset > 0 && cursor != cursorOffsetPos) { |
| syntaxError("Misplaced " + CURSOR_POS, rule, start); |
| } |
| text = buf.toString(); |
| return pos; |
| } |
| |
| /** |
| * Remove context. |
| */ |
| void removeContext() { |
| text = text.substring(ante < 0 ? 0 : ante, |
| post < 0 ? text.length() : post); |
| ante = post = -1; |
| anchorStart = anchorEnd = false; |
| } |
| |
| /** |
| * Create and return an int[] array of segments. |
| */ |
| int[] createSegments() { |
| return (segments == null) ? null : segments.createArray(); |
| } |
| } |
| |
| //---------------------------------------------------------------------- |
| // PUBLIC methods |
| //---------------------------------------------------------------------- |
| |
| /** |
| * Constructor. |
| */ |
| public TransliteratorParser() { |
| } |
| |
| /** |
| * Parse a set of rules. After the parse completes, examine the public |
| * data members for results. |
| */ |
| public void parse(String rules, int direction) { |
| parseRules(new RuleArray(new String[] { rules }), direction); |
| } |
| |
| /** |
| * Parse a set of rules. After the parse completes, examine the public |
| * data members for results. |
| */ |
| public void parse(ResourceReader rules, int direction) { |
| parseRules(new RuleReader(rules), direction); |
| } |
| |
| //---------------------------------------------------------------------- |
| // PRIVATE methods |
| //---------------------------------------------------------------------- |
| |
| /** |
| * Parse an array of zero or more rules. The strings in the array are |
| * treated as if they were concatenated together, with rule terminators |
| * inserted between array elements if not present already. |
| * |
| * Any previous rules are discarded. Typically this method is called exactly |
| * once, during construction. |
| * |
| * The member this.data will be set to null if there are no rules. |
| * |
| * @exception IllegalArgumentException if there is a syntax error in the |
| * rules |
| */ |
| void parseRules(RuleBody ruleArray, int dir) { |
| data = new RuleBasedTransliterator.Data(); |
| direction = dir; |
| ruleCount = 0; |
| compoundFilter = null; |
| |
| // By default, rules use part of the private use area |
| // E000..F8FF for variables and other stand-ins. Currently |
| // the range F000..F8FF is typically sufficient. The 'use |
| // variable range' pragma allows rule sets to modify this. |
| setVariableRange(0xF000, 0xF8FF); |
| |
| variablesVector = new Vector(); |
| parseData = new ParseData(); |
| |
| StringBuffer errors = null; |
| int errorCount = 0; |
| |
| ruleArray.reset(); |
| |
| StringBuffer idBlockResult = new StringBuffer(); |
| idSplitPoint = -1; |
| // The mode marks whether we are in the header ::id block, the |
| // rule block, or the footer ::id block. |
| // mode == 0: start: rule->1, ::id->0 |
| // mode == 1: in rules: rule->1, ::id->2 |
| // mode == 2: in footer rule block: rule->ERROR, ::id->2 |
| int mode = 0; |
| |
| // The compound filter offset is an index into idBlockResult. |
| // If it is 0, then the compound filter occurred at the start, |
| // and it is the offset to the _start_ of the compound filter |
| // pattern. Otherwise it is the offset to the _limit_ of the |
| // compound filter pattern within idBlockResult. |
| this.compoundFilter = null; |
| int compoundFilterOffset = -1; |
| |
| main: |
| for (;;) { |
| String rule = ruleArray.nextLine(); |
| if (rule == null) { |
| break; |
| } |
| int pos = 0; |
| int limit = rule.length(); |
| while (pos < limit) { |
| char c = rule.charAt(pos++); |
| if (Character.isWhitespace(c)) { |
| // Ignore leading whitespace. Note that this is not |
| // Unicode spaces, but Java spaces -- a subset, |
| // representing whitespace likely to be seen in code. |
| continue; |
| } |
| // Skip lines starting with the comment character |
| if (c == RULE_COMMENT_CHAR) { |
| pos = rule.indexOf("\n", pos) + 1; |
| if (pos == 0) { |
| break; // No "\n" found; rest of rule is a commnet |
| } |
| continue; // Either fall out or restart with next line |
| } |
| // Often a rule file contains multiple errors. It's |
| // convenient to the rule author if these are all reported |
| // at once. We keep parsing rules even after a failure, up |
| // to a specified limit, and report all errors at once. |
| try { |
| // We've found the start of a rule or ID. c is its first |
| // character, and pos points past c. |
| --pos; |
| // Look for an ID token. Must have at least ID_TOKEN_LEN + 1 |
| // chars left. |
| if ((pos + ID_TOKEN_LEN + 1) <= limit && |
| rule.regionMatches(pos, ID_TOKEN, 0, ID_TOKEN_LEN)) { |
| pos += ID_TOKEN_LEN; |
| c = rule.charAt(pos); |
| while (UCharacter.isWhitespace(c) && pos < limit) { |
| ++pos; |
| c = rule.charAt(pos); |
| } |
| int lengthBefore = idBlockResult.length(); |
| if (mode == 1) { |
| mode = 2; |
| idSplitPoint = lengthBefore; |
| } |
| int[] p = new int[] { pos }; |
| boolean[] sawDelim = new boolean[1]; |
| UnicodeSet[] cpdFilter = new UnicodeSet[1]; |
| Transliterator.parseID(rule, idBlockResult, p, sawDelim, cpdFilter, direction, false); |
| if (p[0] == pos || !sawDelim[0]) { |
| // Invalid ::id |
| int i1 = pos + 2; |
| while (i1 < rule.length() && rule.charAt(i1) != ';') { |
| ++i1; |
| } |
| throw new IllegalArgumentException("Invalid ::ID " + |
| rule.substring(pos, i1)); |
| } |
| if (cpdFilter[0] != null) { |
| if (compoundFilter != null) { |
| // Multiple compound filters |
| throw new IllegalArgumentException("Multiple compound filters"); |
| } |
| compoundFilter = cpdFilter[0]; |
| compoundFilterOffset = (direction == Transliterator.FORWARD) ? |
| lengthBefore : idBlockResult.length(); |
| } |
| pos = p[0]; |
| } else if (resemblesPragma(rule, pos, limit)) { |
| int ppp = parsePragma(rule, pos, limit); |
| if (ppp < 0) { |
| throw new IllegalArgumentException("Unrecognized pragma: " + |
| rule.substring(pos)); |
| } |
| pos = ppp; |
| } else { |
| // Parse a rule |
| pos = parseRule(rule, pos, limit); |
| ++ruleCount; |
| if (mode == 2) { |
| // ::id in illegal position (because a rule |
| // occurred after the ::id footer block) |
| throw new IllegalArgumentException("::ID in illegal position"); |
| } |
| mode = 1; |
| } |
| } catch (IllegalArgumentException e) { |
| if (errorCount == 30) { |
| errors.append("\nMore than 30 errors; further messages squelched"); |
| break main; |
| } |
| if (errors == null) { |
| errors = new StringBuffer(e.getMessage()); |
| } else { |
| errors.append("\n" + e.getMessage()); |
| } |
| ++errorCount; |
| pos = ruleEnd(rule, pos, limit) + 1; // +1 advances past ';' |
| } |
| } |
| } |
| |
| idBlock = idBlockResult.toString(); |
| |
| // Convert the set vector to an array |
| data.variables = new UnicodeMatcher[variablesVector.size()]; |
| variablesVector.copyInto(data.variables); |
| variablesVector = null; |
| |
| // Do more syntax checking and index the rules |
| try { |
| if (compoundFilter != null) { |
| if ((direction == Transliterator.FORWARD && |
| compoundFilterOffset != 0) || |
| (direction == Transliterator.REVERSE && |
| compoundFilterOffset != idBlock.length())) { |
| throw new IllegalArgumentException("Compound filters misplaced"); |
| } |
| } |
| |
| data.ruleSet.freeze(); |
| |
| if (idSplitPoint < 0) { |
| idSplitPoint = idBlock.length(); |
| } |
| |
| if (ruleCount == 0) { |
| data = null; |
| } |
| } catch (IllegalArgumentException e) { |
| if (errors == null) { |
| errors = new StringBuffer(e.getMessage()); |
| } else { |
| errors.append("\n").append(e.getMessage()); |
| } |
| } |
| |
| if (errors != null) { |
| throw new IllegalArgumentException(errors.toString()); |
| } |
| } |
| |
| /** |
| * MAIN PARSER. Parse the next rule in the given rule string, starting |
| * at pos. Return the index after the last character parsed. Do not |
| * parse characters at or after limit. |
| * |
| * Important: The character at pos must be a non-whitespace character |
| * that is not the comment character. |
| * |
| * This method handles quoting, escaping, and whitespace removal. It |
| * parses the end-of-rule character. It recognizes context and cursor |
| * indicators. Once it does a lexical breakdown of the rule at pos, it |
| * creates a rule object and adds it to our rule list. |
| * |
| * This method is tightly coupled to the inner class RuleHalf. |
| */ |
| private int parseRule(String rule, int pos, int limit) { |
| // Locate the left side, operator, and right side |
| int start = pos; |
| char operator = 0; |
| |
| RuleHalf left = new RuleHalf(); |
| RuleHalf right = new RuleHalf(); |
| |
| undefinedVariableName = null; |
| pos = left.parse(rule, pos, limit, this); |
| |
| if (pos == limit || |
| OPERATORS.indexOf(operator = rule.charAt(pos++)) < 0) { |
| syntaxError("No operator", rule, start); |
| } |
| |
| // Found an operator char. Check for forward-reverse operator. |
| if (operator == REVERSE_RULE_OP && |
| (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) { |
| ++pos; |
| operator = FWDREV_RULE_OP; |
| } |
| |
| pos = right.parse(rule, pos, limit, this); |
| |
| if (pos < limit) { |
| if (rule.charAt(pos) == END_OF_RULE) { |
| ++pos; |
| } else { |
| // RuleHalf parser must have terminated at an operator |
| syntaxError("Unquoted operator", rule, start); |
| } |
| } |
| |
| if (operator == VARIABLE_DEF_OP) { |
| // LHS is the name. RHS is a single character, either a literal |
| // or a set (already parsed). If RHS is longer than one |
| // character, it is either a multi-character string, or multiple |
| // sets, or a mixture of chars and sets -- syntax error. |
| |
| // We expect to see a single undefined variable (the one being |
| // defined). |
| if (undefinedVariableName == null) { |
| syntaxError("Missing '$' or duplicate definition", rule, start); |
| } |
| if (left.text.length() != 1 || left.text.charAt(0) != variableLimit) { |
| syntaxError("Malformed LHS", rule, start); |
| } |
| if (left.anchorStart || left.anchorEnd || |
| right.anchorStart || right.anchorEnd) { |
| syntaxError("Malformed variable def", rule, start); |
| } |
| // We allow anything on the right, including an empty string. |
| int n = right.text.length(); |
| char[] value = new char[n]; |
| right.text.getChars(0, n, value, 0); |
| data.variableNames.put(undefinedVariableName, value); |
| |
| ++variableLimit; |
| return pos; |
| } |
| |
| // If this is not a variable definition rule, we shouldn't have |
| // any undefined variable names. |
| if (undefinedVariableName != null) { |
| syntaxError("Undefined variable $" + undefinedVariableName, |
| rule, start); |
| } |
| |
| // If the direction we want doesn't match the rule |
| // direction, do nothing. |
| if (operator != FWDREV_RULE_OP && |
| ((direction == Transliterator.FORWARD) != (operator == FORWARD_RULE_OP))) { |
| return pos; |
| } |
| |
| // Transform the rule into a forward rule by swapping the |
| // sides if necessary. |
| if (direction == Transliterator.REVERSE) { |
| RuleHalf temp = left; |
| left = right; |
| right = temp; |
| } |
| |
| // Remove non-applicable elements in forward-reverse |
| // rules. Bidirectional rules ignore elements that do not |
| // apply. |
| if (operator == FWDREV_RULE_OP) { |
| right.removeContext(); |
| right.segments = null; |
| left.cursor = left.maxRef = -1; |
| left.cursorOffset = 0; |
| } |
| |
| // Normalize context |
| if (left.ante < 0) { |
| left.ante = 0; |
| } |
| if (left.post < 0) { |
| left.post = left.text.length(); |
| } |
| |
| // Context is only allowed on the input side. Cursors are only |
| // allowed on the output side. Segment delimiters can only appear |
| // on the left, and references on the right. Cursor offset |
| // cannot appear without an explicit cursor. Cursor offset |
| // cannot place the cursor outside the limits of the context. |
| // Anchors are only allowed on the input side. |
| if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 || |
| right.segments != null || left.maxRef >= 0 || |
| (right.cursorOffset != 0 && right.cursor < 0) || |
| // - The following two checks were used to ensure that the |
| // - the cursor offset stayed within the ante- or postcontext. |
| // - However, with the addition of quantifiers, we have to |
| // - allow arbitrary cursor offsets and do runtime checking. |
| //(right.cursorOffset > (left.text.length() - left.post)) || |
| //(-right.cursorOffset > left.ante) || |
| right.anchorStart || right.anchorEnd) { |
| syntaxError("Malformed rule", rule, start); |
| } |
| |
| // Check integrity of segments and segment references. Each |
| // segment's start must have a corresponding limit, and the |
| // references must not refer to segments that do not exist. |
| if (left.segments != null) { |
| if (!left.segments.validate()) { |
| syntaxError("Missing segment close", rule, start); |
| } |
| int n = left.segments.count(); |
| if (right.maxRef > n) { |
| syntaxError("Undefined segment reference", rule, start); |
| } |
| } |
| |
| data.ruleSet.addRule(new TransliterationRule( |
| left.text, left.ante, left.post, |
| right.text, right.cursor, right.cursorOffset, |
| left.createSegments(), |
| left.anchorStart, left.anchorEnd, |
| data)); |
| |
| return pos; |
| } |
| |
| /** |
| * Set the variable range to [start, end] (inclusive). |
| */ |
| private void setVariableRange(int start, int end) { |
| if (start > end || start < 0 || end > 0xFFFF) { |
| throw new IllegalArgumentException("Invalid variable range " + start + ", " + end); |
| } |
| |
| // Segment references work down; variables work up. We don't |
| // know how many of each we will need. |
| data.segmentBase = (char) end; |
| data.segmentCount = 0; |
| data.variablesBase = variableNext = (char) start; // first private use |
| variableLimit = (char) (end + 1); |
| } |
| |
| /** |
| * Set the maximum backup to 'backup', in response to a pragma |
| * statement. |
| */ |
| private void pragmaMaximumBackup(int backup) { |
| //TODO Finish |
| } |
| |
| /** |
| * Begin normalizing all rules using the given mode, in response |
| * to a pragma statement. |
| */ |
| private void pragmaNormalizeRules(Normalizer.Mode mode) { |
| //TODO Finish |
| } |
| |
| /** |
| * Return true if the given rule looks like a pragma. |
| * @param pos offset to the first non-whitespace character |
| * of the rule. |
| * @param limit pointer past the last character of the rule. |
| */ |
| static boolean resemblesPragma(String rule, int pos, int limit) { |
| // Must start with /use\s/i |
| return Utility.parsePattern(rule, pos, limit, "use ", null) >= 0; |
| } |
| |
| /** |
| * Parse a pragma. This method assumes resemblesPragma() has |
| * already returned true. |
| * @param pos offset to the first non-whitespace character |
| * of the rule. |
| * @param limit pointer past the last character of the rule. |
| * @return the position index after the final ';' of the pragma, |
| * or -1 on failure. |
| */ |
| private int parsePragma(String rule, int pos, int limit) { |
| int[] array = new int[2]; |
| |
| // resemblesPragma() has already returned true, so we |
| // know that pos points to /use\s/i; we can skip 4 characters |
| // immediately |
| pos += 4; |
| |
| // Here are the pragmas we recognize: |
| // use variable range 0xE000 0xEFFF; |
| // use maximum backup 16; |
| // use nfd rules; |
| int p = Utility.parsePattern(rule, pos, limit, "~variable range # #~;", array); |
| if (p >= 0) { |
| setVariableRange(array[0], array[1]); |
| return p; |
| } |
| |
| p = Utility.parsePattern(rule, pos, limit, "~maximum backup #~;", array); |
| if (p >= 0) { |
| pragmaMaximumBackup(array[0]); |
| return p; |
| } |
| |
| p = Utility.parsePattern(rule, pos, limit, "~nfd rules~;", null); |
| if (p >= 0) { |
| pragmaNormalizeRules(Normalizer.DECOMP); |
| return p; |
| } |
| |
| p = Utility.parsePattern(rule, pos, limit, "~nfc rules~;", null); |
| if (p >= 0) { |
| pragmaNormalizeRules(Normalizer.COMPOSE); |
| return p; |
| } |
| |
| // Syntax error: unable to parse pragma |
| return -1; |
| } |
| |
| /** |
| * Throw an exception indicating a syntax error. Search the rule string |
| * for the probable end of the rule. Of course, if the error is that |
| * the end of rule marker is missing, then the rule end will not be found. |
| * In any case the rule start will be correctly reported. |
| * @param msg error description |
| * @param rule pattern string |
| * @param start position of first character of current rule |
| */ |
| static final void syntaxError(String msg, String rule, int start) { |
| int end = ruleEnd(rule, start, rule.length()); |
| throw new IllegalArgumentException(msg + " in \"" + |
| Utility.escape(rule.substring(start, end)) + '"'); |
| } |
| |
| static final int ruleEnd(String rule, int start, int limit) { |
| int end = quotedIndexOf(rule, start, limit, ";"); |
| if (end < 0) { |
| end = limit; |
| } |
| return end; |
| } |
| |
| /** |
| * Parse a UnicodeSet out, store it, and return the stand-in character |
| * used to represent it. |
| */ |
| private final char parseSet(String rule, ParsePosition pos) { |
| UnicodeSet set = new UnicodeSet(rule, pos, parseData); |
| if (variableNext >= variableLimit) { |
| throw new RuntimeException("Private use variables exhausted"); |
| } |
| set.compact(); |
| return generateStandInFor(set); |
| } |
| |
| /** |
| * Generate and return a stand-in for a new UnicodeMatcher. Store |
| * the matcher. |
| */ |
| char generateStandInFor(UnicodeMatcher matcher) { |
| // assert(matcher != null); |
| if (variableNext >= variableLimit) { |
| throw new RuntimeException("Private use variables exhausted"); |
| } |
| variablesVector.addElement(matcher); |
| return variableNext++; |
| } |
| |
| /** |
| * Return the stand-in for the dot set. It is allocated the first |
| * time and reused thereafter. |
| */ |
| char getDotStandIn() { |
| if (dotStandIn == -1) { |
| dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET)); |
| } |
| return (char) dotStandIn; |
| } |
| |
| /** |
| * Append the value of the given variable name to the given |
| * StringBuffer. |
| * @exception IllegalArgumentException if the name is unknown. |
| */ |
| private void appendVariableDef(String name, StringBuffer buf) { |
| char[] ch = (char[]) data.variableNames.get(name); |
| if (ch == null) { |
| // We allow one undefined variable so that variable definition |
| // statements work. For the first undefined variable we return |
| // the special placeholder variableLimit-1, and save the variable |
| // name. |
| if (undefinedVariableName == null) { |
| undefinedVariableName = name; |
| if (variableNext >= variableLimit) { |
| throw new RuntimeException("Private use variables exhausted"); |
| } |
| buf.append((char) --variableLimit); |
| } else { |
| throw new IllegalArgumentException("Undefined variable $" |
| + name); |
| } |
| } else { |
| buf.append(ch); |
| } |
| } |
| |
| char getSegmentStandin(int r) { |
| // assert(r>=1); |
| if (r > data.segmentCount) { |
| data.segmentCount = r; |
| variableLimit = (char) (data.segmentBase - r + 1); |
| if (variableNext >= variableLimit) { |
| throw new IllegalArgumentException("Too many variables / segments"); |
| } |
| } |
| return data.getSegmentStandin(r); |
| } |
| |
| /** |
| * Returns the index of the first character in a set, ignoring quoted text. |
| * For example, in the string "abc'hide'h", the 'h' in "hide" will not be |
| * found by a search for "h". Unlike String.indexOf(), this method searches |
| * not for a single character, but for any character of the string |
| * <code>setOfChars</code>. |
| * @param text text to be searched |
| * @param start the beginning index, inclusive; <code>0 <= start |
| * <= limit</code>. |
| * @param limit the ending index, exclusive; <code>start <= limit |
| * <= text.length()</code>. |
| * @param setOfChars string with one or more distinct characters |
| * @return Offset of the first character in <code>setOfChars</code> |
| * found, or -1 if not found. |
| * @see #indexOf |
| */ |
| private static int quotedIndexOf(String text, int start, int limit, |
| String setOfChars) { |
| for (int i=start; i<limit; ++i) { |
| char c = text.charAt(i); |
| if (c == ESCAPE) { |
| ++i; |
| } else if (c == QUOTE) { |
| while (++i < limit |
| && text.charAt(i) != QUOTE) {} |
| } else if (setOfChars.indexOf(c) >= 0) { |
| return i; |
| } |
| } |
| return -1; |
| } |
| } |
| |
| //eof |