| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ******************************************************************************* |
| * Copyright (C) 2006-2009, Google, International Business Machines Corporation * |
| * and others. All Rights Reserved. * |
| ******************************************************************************* |
| */ |
| package com.ibm.icu.impl; |
| |
| import com.ibm.icu.text.UTF16; |
| import com.ibm.icu.text.UnicodeSet; |
| |
| /** |
| * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and symple syntax. |
| * The '' (two quotes) is treated as a single quote, inside or outside a quote |
| * <ul> |
| * <li>Any ignorable characters are ignored in parsing.</li> |
| * <li>Any syntax characters are broken into separate tokens</li> |
| * <li>Quote characters can be specified: '...', "...", and \x </li> |
| * <li>Other characters are treated as literals</li> |
| * </ul> |
| */ |
| public class PatternTokenizer { |
| // settings used in the interpretation of the pattern |
| private UnicodeSet ignorableCharacters = new UnicodeSet(); |
| private UnicodeSet syntaxCharacters = new UnicodeSet(); |
| private UnicodeSet extraQuotingCharacters = new UnicodeSet(); |
| private UnicodeSet escapeCharacters = new UnicodeSet(); |
| private boolean usingSlash = false; |
| private boolean usingQuote = false; |
| |
| // transient data, set when needed. Null it out for any changes in the above fields. |
| private transient UnicodeSet needingQuoteCharacters = null; |
| |
| // data about the current pattern being parsed. start gets moved as we go along. |
| private int start; |
| private int limit; |
| private String pattern; |
| |
| public UnicodeSet getIgnorableCharacters() { |
| return (UnicodeSet) ignorableCharacters.clone(); |
| } |
| /** |
| * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]"); |
| * @param ignorableCharacters Characters to be ignored. |
| * @return A PatternTokenizer object in which characters are specified as ignored characters. |
| */ |
| public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) { |
| this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone(); |
| needingQuoteCharacters = null; |
| return this; |
| } |
| public UnicodeSet getSyntaxCharacters() { |
| return (UnicodeSet) syntaxCharacters.clone(); |
| } |
| public UnicodeSet getExtraQuotingCharacters() { |
| return (UnicodeSet) extraQuotingCharacters.clone(); |
| } |
| /** |
| * Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]") |
| * @param syntaxCharacters Characters to be set as syntax characters. |
| * @return A PatternTokenizer object in which characters are specified as syntax characters. |
| */ |
| public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) { |
| this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone(); |
| needingQuoteCharacters = null; |
| return this; |
| } |
| /** |
| * Sets the extra characters to be quoted in literals |
| * @param syntaxCharacters Characters to be set as extra quoting characters. |
| * @return A PatternTokenizer object in which characters are specified as extra quoting characters. |
| */ |
| public PatternTokenizer setExtraQuotingCharacters(UnicodeSet syntaxCharacters) { |
| this.extraQuotingCharacters = (UnicodeSet) syntaxCharacters.clone(); |
| needingQuoteCharacters = null; |
| return this; |
| } |
| |
| public UnicodeSet getEscapeCharacters() { |
| return (UnicodeSet) escapeCharacters.clone(); |
| } |
| /** |
| * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]"); |
| * @param escapeCharacters Characters to be set as escape characters. |
| * @return A PatternTokenizer object in which characters are specified as escape characters. |
| */ |
| public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) { |
| this.escapeCharacters = (UnicodeSet) escapeCharacters.clone(); |
| return this; |
| } |
| public boolean isUsingQuote() { |
| return usingQuote; |
| } |
| public PatternTokenizer setUsingQuote(boolean usingQuote) { |
| this.usingQuote = usingQuote; |
| needingQuoteCharacters = null; |
| return this; |
| } |
| public boolean isUsingSlash() { |
| return usingSlash; |
| } |
| public PatternTokenizer setUsingSlash(boolean usingSlash) { |
| this.usingSlash = usingSlash; |
| needingQuoteCharacters = null; |
| return this; |
| } |
| // public UnicodeSet getQuoteCharacters() { |
| // return (UnicodeSet) quoteCharacters.clone(); |
| // } |
| // public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) { |
| // this.quoteCharacters = (UnicodeSet) quoteCharacters.clone(); |
| // needingQuoteCharacters = null; |
| // return this; |
| // } |
| public int getLimit() { |
| return limit; |
| } |
| public PatternTokenizer setLimit(int limit) { |
| this.limit = limit; |
| return this; |
| } |
| public int getStart() { |
| return start; |
| } |
| public PatternTokenizer setStart(int start) { |
| this.start = start; |
| return this; |
| } |
| |
| public PatternTokenizer setPattern(CharSequence pattern) { |
| return setPattern(pattern.toString()); |
| } |
| |
| public PatternTokenizer setPattern(String pattern) { |
| if (pattern == null) { |
| throw new IllegalArgumentException("Inconsistent arguments"); |
| } |
| this.start = 0; |
| this.limit = pattern.length(); |
| this.pattern = pattern; |
| return this; |
| } |
| |
| public static final char SINGLE_QUOTE = '\''; |
| public static final char BACK_SLASH = '\\'; |
| private static int NO_QUOTE = -1, IN_QUOTE = -2; |
| |
| public String quoteLiteral(CharSequence string) { |
| return quoteLiteral(string.toString()); |
| } |
| |
| /** |
| * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes. |
| * @param string String passed to quote a literal string. |
| * @return A string using the available settings will place syntax, quote, or ignorable characters into quotes. |
| */ |
| public String quoteLiteral(String string) { |
| if (needingQuoteCharacters == null) { |
| needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters).addAll(extraQuotingCharacters); // .addAll(quoteCharacters) |
| if (usingSlash) needingQuoteCharacters.add(BACK_SLASH); |
| if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE); |
| } |
| StringBuffer result = new StringBuffer(); |
| int quotedChar = NO_QUOTE; |
| int cp; |
| for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) { |
| cp = UTF16.charAt(string, i); |
| if (escapeCharacters.contains(cp)) { |
| // we may have to fix up previous characters |
| if (quotedChar == IN_QUOTE) { |
| result.append(SINGLE_QUOTE); |
| quotedChar = NO_QUOTE; |
| } |
| appendEscaped(result, cp); |
| continue; |
| } |
| |
| if (needingQuoteCharacters.contains(cp)) { |
| // if we have already started a quote |
| if (quotedChar == IN_QUOTE) { |
| UTF16.append(result, cp); |
| if (usingQuote && cp == SINGLE_QUOTE) { // double it |
| result.append(SINGLE_QUOTE); |
| } |
| continue; |
| } |
| // otherwise not already in quote |
| if (usingSlash) { |
| result.append(BACK_SLASH); |
| UTF16.append(result, cp); |
| continue; |
| } |
| if (usingQuote) { |
| if (cp == SINGLE_QUOTE) { // double it and continue |
| result.append(SINGLE_QUOTE); |
| result.append(SINGLE_QUOTE); |
| continue; |
| } |
| result.append(SINGLE_QUOTE); |
| UTF16.append(result, cp); |
| quotedChar = IN_QUOTE; |
| continue; |
| } |
| // we have no choice but to use \\u or \\U |
| appendEscaped(result, cp); |
| continue; |
| } |
| // otherwise cp doesn't need quoting |
| // we may have to fix up previous characters |
| if (quotedChar == IN_QUOTE) { |
| result.append(SINGLE_QUOTE); |
| quotedChar = NO_QUOTE; |
| } |
| UTF16.append(result, cp); |
| } |
| // all done. |
| // we may have to fix up previous characters |
| if (quotedChar == IN_QUOTE) { |
| result.append(SINGLE_QUOTE); |
| } |
| return result.toString(); |
| } |
| |
| private void appendEscaped(StringBuffer result, int cp) { |
| if (cp <= 0xFFFF) { |
| result.append("\\u").append(Utility.hex(cp,4)); |
| } else { |
| result.append("\\U").append(Utility.hex(cp,8)); |
| } |
| } |
| |
| public String normalize() { |
| int oldStart = start; |
| StringBuffer result = new StringBuffer(); |
| StringBuffer buffer = new StringBuffer(); |
| while (true) { |
| buffer.setLength(0); |
| int status = next(buffer); |
| if (status == DONE) { |
| start = oldStart; |
| return result.toString(); |
| } |
| if (status != SYNTAX) { |
| result.append(quoteLiteral(buffer)); |
| } else { |
| result.append(buffer); |
| } |
| } |
| } |
| |
| public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5; |
| |
| private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4; |
| |
| public int next(StringBuffer buffer) { |
| if (start >= limit) return DONE; |
| int status = UNKNOWN; |
| int lastQuote = UNKNOWN; |
| int quoteStatus = NONE; |
| int hexCount = 0; |
| int hexValue = 0; |
| int cp; |
| main: |
| for (int i = start; i < limit; i += UTF16.getCharCount(cp)) { |
| cp = UTF16.charAt(pattern, i); |
| // if we are in a quote, then handle it. |
| switch (quoteStatus) { |
| case SLASH_START: |
| switch (cp) { |
| case 'u': |
| quoteStatus = HEX; |
| hexCount = 4; |
| hexValue = 0; |
| continue main; |
| case 'U': |
| quoteStatus = HEX; |
| hexCount = 8; |
| hexValue = 0; |
| continue main; |
| default: |
| if (usingSlash) { |
| UTF16.append(buffer, cp); |
| quoteStatus = NONE; |
| continue main; |
| } else { |
| buffer.append(BACK_SLASH); |
| quoteStatus = NONE; |
| } |
| } |
| break; // fall through to NONE |
| case HEX: |
| hexValue <<= 4; |
| hexValue += cp; |
| switch (cp) { |
| case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': |
| hexValue -= '0'; break; |
| case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': |
| hexValue -= 'a' - 10; break; |
| case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': |
| hexValue -= 'A' - 10; break; |
| default: |
| start = i; |
| return BROKEN_ESCAPE; |
| } |
| --hexCount; |
| if (hexCount == 0) { |
| quoteStatus = NONE; |
| UTF16.append(buffer, hexValue); |
| } |
| continue main; |
| case AFTER_QUOTE: |
| // see if we get another quote character |
| // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote |
| if (cp == lastQuote) { |
| UTF16.append(buffer, cp); |
| quoteStatus = NORMAL_QUOTE; |
| continue main; |
| } |
| quoteStatus = NONE; |
| break; // fall through to NONE |
| case START_QUOTE: |
| // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote |
| if (cp == lastQuote) { |
| UTF16.append(buffer, cp); |
| quoteStatus = NONE; // get out of quote, with no trace remaining |
| continue; |
| } |
| // otherwise get into quote |
| UTF16.append(buffer, cp); |
| quoteStatus = NORMAL_QUOTE; |
| continue main; |
| case NORMAL_QUOTE: |
| if (cp == lastQuote) { |
| quoteStatus = AFTER_QUOTE; // get out of quote |
| continue main; |
| } |
| UTF16.append(buffer, cp); |
| continue main; |
| } |
| |
| if (ignorableCharacters.contains(cp)) { |
| continue; |
| } |
| // do syntax characters |
| if (syntaxCharacters.contains(cp)) { |
| if (status == UNKNOWN) { |
| UTF16.append(buffer, cp); |
| start = i + UTF16.getCharCount(cp); |
| return SYNTAX; |
| } else { // LITERAL, so back up and break |
| start = i; |
| return status; |
| } |
| } |
| // otherwise it is a literal; keep on going |
| status = LITERAL; |
| if (cp == BACK_SLASH) { |
| quoteStatus = SLASH_START; |
| continue; |
| } else if (usingQuote && cp == SINGLE_QUOTE) { |
| lastQuote = cp; |
| quoteStatus = START_QUOTE; |
| continue; |
| } |
| // normal literals |
| UTF16.append(buffer, cp); |
| } |
| // handle final cleanup |
| start = limit; |
| switch (quoteStatus) { |
| case HEX: |
| status = BROKEN_ESCAPE; |
| break; |
| case SLASH_START: |
| if (usingSlash) { |
| status = BROKEN_ESCAPE; |
| } else { |
| buffer.append(BACK_SLASH); |
| } |
| break; |
| case START_QUOTE: case NORMAL_QUOTE: |
| status = BROKEN_QUOTE; |
| break; |
| } |
| return status; |
| } |
| |
| |
| } |
| //eof |