blob: 0a1d30b30931a1c14ba75cd4f84ee73f7b3161f2 [file] [log] [blame]
/*
*******************************************************************************
* Copyright (C) 2002-2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.dev.util;
import java.text.ParsePosition;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.SymbolTable;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeMatcher;
import com.ibm.icu.text.UnicodeSet;
public class Tokenizer {
/** Text being tokenized; assigned by setSource(String). */
protected String source;
/** Collects the characters of the string token currently being read by next(). */
protected StringBuffer buffer = new StringBuffer();
/** Value accumulated for the most recent NUMBER token. */
protected long number;
/** Set parsed for the most recent UNICODESET token; null until one has been read. */
protected UnicodeSet unicodeSet;
/** Offset into source of the next code unit that will be read. */
protected int index;
/** True between a call to backup() and the next() call that consumes it. */
boolean backedup;
/** Value of index recorded by next() for backup(); -1 until a token has been scanned. */
protected int lastIndex = -1;
/** Value of index saved by backup() and restored by the following next(). */
protected int nextIndex;
/** Token type most recently returned by next(); replayed after a backup(). */
int lastValue = BACKEDUP_TOO_FAR;
/** Variable table passed to the UnicodeSet parser when a '[' token is read. */
TokenSymbolTable symbolTable = new TokenSymbolTable();
// Characters that start quoting (') and escaping (\) inside a string token.
private static final char
QUOTE = '\'',
BSLASH = '\\';
// The two quoting characters as a set; they are excluded from the syntax
// and whitespace sets (see SYNTAX below and fixSets()).
private static final UnicodeSet QUOTERS = new UnicodeSet().add(QUOTE).add(BSLASH);
// Default whitespace: U+0009..U+000D, U+0020, U+0085, U+200E, U+200F, U+2028, U+2029.
// NOTE(review): this list looks like Unicode Pattern_White_Space -- confirm.
private static final UnicodeSet WHITESPACE = new UnicodeSet("[" +
"\\u0009-\\u000D\\u0020\\u0085\\u200E\\u200F\\u2028\\u2029" +
"]");
// Default syntax characters: each one is returned by next() as its own token.
// The quoting characters and '$' are removed after the pattern is parsed.
// NOTE(review): the ranges resemble Unicode Pattern_Syntax -- confirm.
private static final UnicodeSet SYNTAX = new UnicodeSet("[" +
"\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E" +
"\\u00A1-\\u00A7\\u00A9\\u00AB-\\u00AC\\u00AE" +
"\\u00B0-\\u00B1\\u00B6\\u00B7\\u00BB\\u00BF\\u00D7\\u00F7" +
"\\u2010-\\u2027\\u2030-\\u205E\\u2190-\\u2BFF" +
"\\u3001\\u3003\\u3008-\\u3020\\u3030" +
"\\uFD3E\\uFD3F\\uFE45\\uFE46" +
"]").removeAll(QUOTERS).remove('$');
// Code points that terminate a '#' comment in next().
private static final UnicodeSet NEWLINE = new UnicodeSet("[\\u000A\\u000D\\u0085\\u2028\\u2029]");
//private static final UnicodeSet DECIMAL = new UnicodeSet("[:Nd:]");
// Default set of code points that end an unquoted string: whitespace plus syntax.
private static final UnicodeSet NON_STRING = new UnicodeSet()
.addAll(WHITESPACE)
.addAll(SYNTAX);
// Per-instance sets. They initially reference the shared static defaults above
// (not copies), so the objects returned by getWhiteSpace()/getSyntax() should
// not be mutated by callers; use setWhiteSpace()/setSyntax() to replace them.
protected UnicodeSet whiteSpace = WHITESPACE;
protected UnicodeSet syntax = SYNTAX;
// Rebuilt by fixSets() as syntax + whiteSpace whenever either set is replaced.
private UnicodeSet non_string = NON_STRING;
/**
 * Re-establishes the relationships between the character sets after
 * syntax or whiteSpace has been replaced: syntax contains no quoting
 * characters and no whitespace, whiteSpace contains no quoting characters,
 * and non_string is the union of the two.
 * A set is cloned before being trimmed, so the set supplied by the caller
 * is never modified.
 */
private void fixSets() {
    boolean syntaxNeedsTrim = syntax.containsSome(QUOTERS) || syntax.containsSome(whiteSpace);
    if (syntaxNeedsTrim) {
        UnicodeSet trimmedSyntax = (UnicodeSet) syntax.clone();
        trimmedSyntax.removeAll(QUOTERS);
        trimmedSyntax.removeAll(whiteSpace);
        syntax = trimmedSyntax;
    }
    boolean spaceNeedsTrim = whiteSpace.containsSome(QUOTERS);
    if (spaceNeedsTrim) {
        UnicodeSet trimmedSpace = (UnicodeSet) whiteSpace.clone();
        trimmedSpace.removeAll(QUOTERS);
        whiteSpace = trimmedSpace;
    }
    UnicodeSet terminators = new UnicodeSet(syntax);
    terminators.addAll(whiteSpace);
    non_string = terminators;
}
/**
 * Sets the text to tokenize and rewinds to its start.
 * <p>
 * Any token state left over from the previous source is discarded. Without
 * this, a backup() still pending from the old text would make the next call
 * to next() return the old token and restore an index belonging to the old
 * text.
 *
 * @param source the text to tokenize
 * @return this tokenizer, for chaining
 */
public Tokenizer setSource(String source) {
    this.source = source;
    this.index = 0;
    // Return to the same "nothing scanned yet" state a new instance starts in.
    this.backedup = false;
    this.lastIndex = -1;
    this.lastValue = BACKEDUP_TOO_FAR;
    return this; // for chaining
}
/**
 * Moves the read position to the given offset in the source.
 * <p>
 * Any pending backup() and the remembered previous token are discarded.
 * Otherwise the next call to next() would overwrite the offset set here
 * with the one saved by backup() and return the stale token.
 *
 * @param index offset into the source at which scanning resumes
 * @return this tokenizer, for chaining
 */
public Tokenizer setIndex(int index) {
    this.index = index;
    // The previous token no longer corresponds to the new position.
    this.backedup = false;
    this.lastIndex = -1;
    this.lastValue = BACKEDUP_TOO_FAR;
    return this; // for chaining
}
// Token types returned by next(). All are negative; a non-negative return
// value from next() is the code point of a syntax character.

/** The end of the source was reached. */
public static final int DONE = -1;
/** A run of decimal digits was read; see getNumber(). */
public static final int NUMBER = -2;
/** A string was read; see getString(). */
public static final int STRING = -3;
/** A UnicodeSet pattern starting with '[' was read; see getUnicodeSet(). */
public static final int UNICODESET = -4;
/** The source ended inside a quoted section or directly after a backslash. */
public static final int UNTERMINATED_QUOTE = -5;
/** Replayed by next() after a backup() when no token had been scanned yet. */
public static final int BACKEDUP_TOO_FAR = -6;
// States of the string scanner in next().
// Warning: the numeric order matters. next() treats every state greater than
// IN_STRING as "input ended in the middle of a quote or escape", so
// AFTER_QUOTE must stay below IN_STRING and the other two above it.
// Values 0..2 were FIRST, IN_NUMBER and IN_SPACE; they are no longer used.
private static final int AFTER_QUOTE = 3;
private static final int IN_STRING = 4;
private static final int AFTER_BSLASH = 5;
private static final int IN_QUOTE = 6;
/**
 * Formats a token for debugging output.
 * <p>
 * The result is wrapped in '@' when the token was replayed after a backup
 * and in '*' otherwise. The caller must capture the backed-up flag
 * <em>before</em> calling next(), because next() clears it; that captured
 * value is what decides the marker (previously the parameter was ignored and
 * the live field was read instead).
 *
 * @param type a token type as returned by next()
 * @param backedupBefore whether a backup was pending before the token was read
 * @return a short description of the token
 */
public String toString(int type, boolean backedupBefore) {
    String s = backedupBefore ? "@" : "*";
    switch (type) {
    case DONE:
        return s + "Done" + s;
    case BACKEDUP_TOO_FAR:
        return s + "Illegal Backup" + s;
    case UNTERMINATED_QUOTE:
        return s + "Unterminated Quote=" + getString() + s;
    case STRING:
        return s + "s=" + getString() + s;
    case NUMBER:
        return s + "n=" + getNumber() + s;
    case UNICODESET:
        // "u=" so a set token is distinguishable from a NUMBER token ("n=").
        return s + "u=" + getUnicodeSet() + s;
    default:
        // Any other value is the code point of a syntax character.
        return s + "c=" + usf.getName(type, true) + s;
    }
}
// Formatter used by toString(int, boolean) to name a syntax-character token.
private static final BagFormatter usf = new BagFormatter();
/**
 * Pushes the most recently read token back, so that the following call to
 * next() returns it again. Only a single level of backup is supported.
 *
 * @throws IllegalArgumentException if a backup is already pending
 */
public void backup() {
    if (backedup) {
        throw new IllegalArgumentException("backup too far");
    }
    // Save the current position so next() can restore it when replaying.
    nextIndex = index;
    index = lastIndex;
    backedup = true;
}
/*
public int next2() {
boolean backedupBefore = backedup;
int result = next();
System.out.println(toString(result, backedupBefore));
return result;
}
*/
/**
 * Reads the next token from the source.
 * <p>
 * Leading whitespace and '#' comments (which run to the next newline) are
 * skipped. The token is then classified as follows:
 * <ul>
 * <li>'[' starts a UnicodeSet pattern, parsed with the symbol table;
 *     result via getUnicodeSet().</li>
 * <li>A syntax character is returned as its own code point.</li>
 * <li>A run of decimal digits is a number; result via getNumber().</li>
 * <li>Anything else is a string, ended by a syntax or whitespace character;
 *     result via getString(). Inside a string, '...' quotes text literally,
 *     '' directly after a closing quote yields a literal quote and re-enters
 *     the quoted section, and a backslash escapes the next character
 *     (\n, \r and \t are translated, any other character is taken as is).</li>
 * </ul>
 * If a backup() is pending, the previous token is returned again without
 * rescanning.
 *
 * @return DONE, UNICODESET, NUMBER, STRING or UNTERMINATED_QUOTE, or the
 *         (non-negative) code point of a syntax character
 */
public int next() {
    // A pending backup() replays the previous token instead of rescanning.
    if (backedup) {
        backedup = false;
        index = nextIndex;
        return lastValue;
    }
    int cp = 0;
    boolean inComment = false;
    // clean off any leading whitespace or comments
    while (true) {
        if (index >= source.length()) {
            return lastValue = DONE;
        }
        cp = nextChar();
        if (inComment) {
            if (NEWLINE.contains(cp)) {
                inComment = false;
            }
        } else if (cp == '#') {
            inComment = true;
        } else if (!whiteSpace.contains(cp)) {
            break;
        }
    }
    // record the last index in case we have to backup
    // (note: this is the offset just after the token's first code point)
    lastIndex = index;

    if (cp == '[') {
        // Hand the text to UnicodeSet starting at the bracket itself.
        ParsePosition pos = new ParsePosition(index - 1);
        unicodeSet = new UnicodeSet(source, pos, symbolTable);
        index = pos.getIndex();
        return lastValue = UNICODESET;
    }

    // get syntax character
    if (syntax.contains(cp)) {
        return lastValue = cp;
    }

    // get number, if there is one
    if (UCharacter.getType(cp) == Character.DECIMAL_DIGIT_NUMBER) {
        number = UCharacter.getNumericValue(cp);
        while (index < source.length()) {
            cp = nextChar();
            if (UCharacter.getType(cp) != Character.DECIMAL_DIGIT_NUMBER) {
                index -= UTF16.getCharCount(cp); // un-read the non-digit
                break;
            }
            number *= 10;
            number += UCharacter.getNumericValue(cp);
        }
        return lastValue = NUMBER;
    }

    // Otherwise it is a string; cp is its first code point.
    buffer.setLength(0);
    int status = IN_STRING;
    main:
    while (true) {
        switch (status) {
        case AFTER_QUOTE: // check for double ''?
            if (cp == QUOTE) {
                UTF16.append(buffer, QUOTE);
                status = IN_QUOTE;
                break;
            }
            // Not a doubled quote, so the quoted section is really over.
            // Leave AFTER_QUOTE now; otherwise a quote appearing later in
            // the same token (as in 'a'b'c') would wrongly be taken for the
            // second half of a doubled quote.
            status = IN_STRING;
            // OTHERWISE FALL THROUGH!!!
        case IN_STRING:
            if (cp == QUOTE) {
                status = IN_QUOTE;
            } else if (cp == BSLASH) {
                status = AFTER_BSLASH;
            } else if (non_string.contains(cp)) {
                index -= UTF16.getCharCount(cp); // un-read the terminator
                break main;
            } else {
                UTF16.append(buffer, cp);
            }
            break;
        case IN_QUOTE:
            if (cp == QUOTE) {
                status = AFTER_QUOTE;
            } else {
                UTF16.append(buffer, cp);
            }
            break;
        case AFTER_BSLASH:
            switch (cp) {
            case 'n': cp = '\n'; break;
            case 'r': cp = '\r'; break;
            case 't': cp = '\t'; break;
            }
            UTF16.append(buffer, cp);
            status = IN_STRING;
            break;
        default:
            throw new IllegalArgumentException("Internal Error");
        }
        if (index >= source.length()) {
            break;
        }
        cp = nextChar();
    }
    // AFTER_BSLASH and IN_QUOTE are the states above IN_STRING: the input
    // ended before the escape or the quoted section was completed.
    if (status > IN_STRING) {
        return lastValue = UNTERMINATED_QUOTE;
    }
    return lastValue = STRING;
}
/**
 * Returns the text of the most recently scanned string token.
 *
 * @return a copy of the current contents of the string buffer
 */
public String getString() {
    final String text = buffer.toString();
    return text;
}
/**
 * Returns the source text with the marker "$$$" inserted at the current
 * read position, for debugging.
 * <p>
 * This never throws: with no source set only the marker is returned, and a
 * position outside the text (index is -1 after a backup() issued before any
 * token was read) is clamped to the nearest end.
 *
 * @return the source with "$$$" at the current position
 */
public String toString() {
    if (source == null) {
        return "$$$";
    }
    int split = Math.max(0, Math.min(index, source.length()));
    return source.substring(0, split) + "$$$" + source.substring(split);
}
/**
 * Returns the value of the most recently scanned NUMBER token.
 *
 * @return the accumulated decimal value
 */
public long getNumber() {
    return this.number;
}
/**
 * Returns the set parsed for the most recently scanned UNICODESET token.
 *
 * @return the parsed set, or null if no such token has been read yet
 */
public UnicodeSet getUnicodeSet() {
    return this.unicodeSet;
}
/**
 * Reads the code point at the current position and advances past it.
 *
 * @return the code point that was read
 */
private int nextChar() {
    final int codePoint = UTF16.charAt(source, index);
    index = index + UTF16.getCharCount(codePoint);
    return codePoint;
}
/**
 * Returns the current read position.
 *
 * @return the offset into the source of the next code unit to be read
 */
public int getIndex() {
    return this.index;
}
/**
 * Returns the text being tokenized.
 *
 * @return the string last passed to setSource, or null if none was set
 */
public String getSource() {
    return this.source;
}
/**
 * Returns the set of characters that are returned as single-character tokens.
 * The set itself is returned, not a copy.
 *
 * @return the current syntax set
 */
public UnicodeSet getSyntax() {
    return this.syntax;
}
/**
 * Returns the set of characters skipped between tokens.
 * The set itself is returned, not a copy.
 *
 * @return the current whitespace set
 */
public UnicodeSet getWhiteSpace() {
    return this.whiteSpace;
}
/**
 * Replaces the set of syntax characters, then re-derives the dependent sets.
 * Quoting characters and current whitespace characters are excluded from
 * the effective set.
 *
 * @param set the new syntax characters
 */
public void setSyntax(UnicodeSet set) {
    this.syntax = set;
    this.fixSets();
}
/**
 * Replaces the set of whitespace characters, then re-derives the dependent
 * sets. Quoting characters are excluded from the effective set, and the new
 * whitespace characters are removed from the syntax set.
 *
 * @param set the new whitespace characters
 */
public void setWhiteSpace(UnicodeSet set) {
    this.whiteSpace = set;
    this.fixSets();
}
/**
 * Returns the names of all variables that have been looked up in the symbol
 * table, each with its leading '$'. The live set is returned, not a copy.
 *
 * @return the set of looked-up variable names
 */
public Set getLookedUpItems() {
    final Set lookedUp = symbolTable.itemsLookedUp;
    return lookedUp;
}
/**
 * Defines a variable for use inside UnicodeSet patterns.
 * The body is the slice of value from start up to, but not including, the
 * character just before limit.
 *
 * @param var the variable name including its leading '$'
 * @param value the text that contains the variable's body
 * @param start offset in value of the first character of the body
 * @param limit offset in value just past the terminating ';'
 */
public void addSymbol(String var, String value, int start, int limit) {
    // the limit is after the ';', so the body stops one character earlier
    final int bodyEnd = limit - 1;
    final char[] chars = new char[bodyEnd - start];
    value.getChars(start, bodyEnd, chars, 0);
    symbolTable.add(var, chars);
}
/**
 * Symbol table handed to the UnicodeSet parser so that patterns may refer to
 * variables defined through addSymbol. It also records every name that is
 * looked up.
 */
public class TokenSymbolTable implements SymbolTable {
    /** Variable bodies (char[]) keyed by variable name without the leading '$'. */
    Map contents = new HashMap();
    /** Every name passed to lookup, stored with a leading '$'. */
    Set itemsLookedUp = new HashSet();

    /**
     * Defines or redefines a variable.
     *
     * @param var the variable name including its leading '$'
     * @param body the replacement text for the variable
     */
    public void add(String var, char[] body) {
        // start from 1 to avoid the $
        contents.put(var.substring(1), body);
    }

    /**
     * Looks up a variable and records that it was requested.
     *
     * @param s the variable name without the leading '$'
     * @return the variable's body, or null if it is not defined
     * @see com.ibm.icu.text.SymbolTable#lookup(java.lang.String)
     */
    public char[] lookup(String s) {
        itemsLookedUp.add('$' + s);
        return (char[]) contents.get(s);
    }

    /**
     * Not supported by this table; always returns null.
     *
     * @see com.ibm.icu.text.SymbolTable#lookupMatcher(int)
     */
    public UnicodeMatcher lookupMatcher(int ch) {
        return null;
    }

    /**
     * Parses a variable name made of Unicode identifier-part characters,
     * starting at pos and not going past limit.
     * <p>
     * As the SymbolTable contract requires, null is returned and pos is left
     * unchanged when no name character is found. This lets the caller treat
     * an isolated '$' as a literal instead of looking up an empty name.
     *
     * @param text the text being parsed
     * @param pos on entry, the offset just after the '$'; on success it is
     *        advanced past the name
     * @param limit offset at which parsing must stop
     * @return the name that was parsed, or null if there is none
     * @see com.ibm.icu.text.SymbolTable#parseReference(java.lang.String, java.text.ParsePosition, int)
     */
    public String parseReference(String text, ParsePosition pos, int limit) {
        final int start = pos.getIndex();
        int i = start;
        while (i < limit) {
            int cp = UTF16.charAt(text, i);
            if (!com.ibm.icu.lang.UCharacter.isUnicodeIdentifierPart(cp)) {
                break;
            }
            i += UTF16.getCharCount(cp);
        }
        if (i == start) {
            // no valid name characters: report "no reference", pos untouched
            return null;
        }
        pos.setIndex(i);
        return text.substring(start, i);
    }
}
}