blob: 9e2ba57c02d96478aed5f1ea42ebeb88dd604a68 [file] [log] [blame]
* Copyright (c) 2001, International Business Machines
* Corporation and others. All Rights Reserved.
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/,v $
* $Date: 2001/10/25 02:23:53 $
* $Revision: 1.3 $
import java.text.*;
import java.util.*;
* INTERNAL CLASS implementing the UnicodeSet properties as outlined
* at:
* Recognized syntax:
* [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"
* \p{foo} \P{foo} - white space not allowed within "\p" or "\P"
* Other than the above restrictions, white space is ignored. Case
* is ignored except in "\p" and "\P".
* This class cannot be instantiated. It has a public static method,
* createFromPattern(), which takes a pattern to be parsed and returns
* a new UnicodeSet. Another public static method,
* resemblesPattern(), returns true if a given pattern string appears
* to be a property set pattern, and therefore should be passed in to
* createFromPattern().
* NOTE: Current implementation is incomplete. The following list
* indicates which properties are supported.
* + GeneralCategory
* CombiningClass
* BidiClass
* DecompositionType
* + NumericValue
* NumericType
* EastAsianWidth
* LineBreak
* JoiningType
* + Script
* '+' indicates a supported property.
* @author Alan Liu
* @version $RCSfile:,v $ $Revision: 1.3 $ $Date: 2001/10/25 02:23:53 $
class UnicodePropertySet {
private static final Hashtable NAME_MAP = new Hashtable();
private static final Hashtable CATEGORY_MAP = new Hashtable();
* A cache mapping character category integers, as returned by
* UCharacter.getType(), to sets. Entries are initially
* null and are created on demand.
private static final UnicodeSet[] CATEGORY_CACHE =
new UnicodeSet[UCharacterCategory.CHAR_CATEGORY_COUNT];
* A cache mapping script integers, as defined by
* UScript, to sets. Entries are initially
* null and are created on demand.
private static final UnicodeSet[] SCRIPT_CACHE =
new UnicodeSet[UScript.CODE_LIMIT];
// Special value codes
private static final int ANY = -1; // general category: all code points
// Public API
* Return true if the given position, in the given pattern, appears
* to be the start of a property set pattern [:foo:], \p{foo}, or
* \P{foo}.
public static boolean resemblesPattern(String pattern, int pos) {
// Patterns are at least 5 characters long
if ((pos+5) > pattern.length()) {
return false;
// Look for an opening [:, [:^, \p, or \P
return pattern.regionMatches(pos, "[:", 0, 2) ||
pattern.regionMatches(true, pos, "\\p", 0, 2);
* Create a UnicodeSet by parsing the given pattern at the given
* parse position.
* @param pattern the pattern string
* @param ppos on entry, the position at which to begin parsing.
* This shold be one of the locations marked '^':
* [:blah:] \p{blah} \P{blah}
* ^ % ^ % ^ %
* On return, the position after the last character parsed, that is,
* the locations marked '%'. If the parse fails, ppos is returned
* unchanged.
* @return a newly-constructed UnicodeSet object, or null upon
* failure.
public static UnicodeSet createFromPattern(String pattern, ParsePosition ppos) {
UnicodeSet set = null;
int pos = ppos.getIndex();
// On entry, ppos should point to one of the following locations:
// Minimum length is 5 characters, e.g. \p{L}
if ((pos+5) > pattern.length()) {
return null;
boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat}
boolean invert = false;
// Look for an opening [:, [:^, \p, or \P
if (pattern.regionMatches(pos, "[:", 0, 2)) {
posix = true;
pos = Utility.skipWhitespace(pattern, pos+2);
if (pos < pattern.length() && pattern.charAt(pos) == '^') {
invert = true;
} else if (pattern.regionMatches(true, pos, "\\p", 0, 2)) {
invert = (pattern.charAt(pos+1) == 'P');
pos = Utility.skipWhitespace(pattern, pos+2);
if (pos == pattern.length() || pattern.charAt(pos++) != '{') {
// Syntax error; "\p" or "\P" not followed by "{"
return null;
} else {
// Open delimiter not seen
return null;
// Look for the matching close delimiter, either :] or }
int close = pattern.indexOf(posix ? ":]" : "}", pos);
if (close < 0) {
// Syntax error; close delimiter missing
return null;
// Look for an '=' sign. If this is present, we will parse a
// medium \p{gc=Cf} or long \p{GeneralCategory=Format}
// pattern.
int equals = pattern.indexOf('=', pos);
if (equals >= 0 && equals < close) {
// Equals seen; parse medium/long pattern
String typeName = munge(pattern, pos, equals);
String valueName = munge(pattern, equals+1, close);
SetFactory factory;
factory = (SetFactory) NAME_MAP.get(typeName);
if (factory == null) {
// Syntax error; type name not recognized
return null;
set = factory.create(valueName);
} else {
// No equals seen; parse short format \p{Cf}
String shortName = munge(pattern, pos, close);
// First try general category
set = createCategorySet(shortName);
// If this fails, try script
if (set == null) {
set = createScriptSet(shortName);
if (invert) {
// Move to the limit position after the close delimiter
ppos.setIndex(close + (posix ? 2 : 1));
return set;
// Property set factory classes
// NOTE: This will change/go away when we implement UCharacter
// based property retrieval.
static interface SetFactory {
UnicodeSet create(String valueName);
static class NumericValueFactory implements SetFactory {
NumericValueFactory() {}
public UnicodeSet create(String valueName) {
double value = Double.parseDouble(valueName);
final int ivalue = (int) value;
if (ivalue != value || ivalue < 0) {
// UCharacter doesn't support negative or non-integral
// values, so just return an empty set
return new UnicodeSet();
return createSetFromFilter(new Filter() {
public boolean contains(int cp) {
return UCharacter.getUnicodeNumericValue(cp) == ivalue;
// Property set factory static methods
// NOTE: This will change/go away when we implement UCharacter
// based property retrieval.
* Given a general category value name, create a corresponding
* set and return it, or return null if the name is invalid.
* @param valueName a pre-munged general category value name
private static UnicodeSet createCategorySet(String valueName) {
Integer valueObj;
valueObj = (Integer) CATEGORY_MAP.get(valueName);
if (valueObj == null) {
return null;
int valueCode = valueObj.intValue();
UnicodeSet set = new UnicodeSet();
if (valueCode == ANY) {
return set;
for (int cat=0; cat<UCharacterCategory.CHAR_CATEGORY_COUNT; ++cat) {
if ((valueCode & (1 << cat)) != 0) {
return set;
* Given a script value name, create a corresponding set and
* return it, or return null if the name is invalid.
* @param valueName a pre-munged script value name
private static UnicodeSet createScriptSet(String valueName) {
int[] script = UScript.getCode(valueName);
if (script[0] == UScript.INVALID_CODE) {
// Syntax error; unknown short name
return null;
return new UnicodeSet(getScriptSet(script[0]));
// Utility methods
* Returns a UnicodeSet for the given category. This set is
* cached and returned again if this method is called again with
* the same parameter.
* Callers MUST NOT MODIFY the returned set.
private static UnicodeSet getCategorySet(final int cat) {
if (CATEGORY_CACHE[cat] == null) {
createSetFromFilter(new Filter() {
public boolean contains(int cp) {
return UCharacter.getType(cp) == cat;
return CATEGORY_CACHE[cat];
* Returns a UnicodeSet for the given script. This set is
* cached and returned again if this method is called again with
* the same parameter.
* Callers MUST NOT MODIFY the returned set.
private static UnicodeSet getScriptSet(final int script) {
if (SCRIPT_CACHE[script] == null) {
SCRIPT_CACHE[script] =
createSetFromFilter(new Filter() {
public boolean contains(int cp) {
return UScript.getScript(cp) == script;
return SCRIPT_CACHE[script];
* Given a string, munge it to upper case and lose the whitespace.
* So "General Category " becomes "GENERALCATEGORY". We munge all
* type and value strings, and store all type and value keys
* pre-munged.
private static String munge(String str, int start, int limit) {
StringBuffer buf = new StringBuffer();
for (int i=start; i<limit; ) {
int c = UTF16.charAt(str, i);
i += UTF16.getCharCount(c);
if (!UCharacter.isWhitespace(c)) {
UTF16.append(buf, UCharacter.toUpperCase(c));
return buf.toString();
// Generic filter-based scanning code
// NOTE: In general, we don't want to do this! This is a temporary
// implementation until we have time for something that examines
// the underlying UCharacter data structures in an intelligent
// way. Iterating over all code points is dumb. What we want to
// do, for instance, is iterate over internally-stored ranges
// of characters that have a given property.
static interface Filter {
boolean contains(int codePoint);
static UnicodeSet createSetFromFilter(Filter filter) {
// Walk through all Unicode characters, noting the start
// and end of each range for which filter.contain(c) is
// true. Add each range to a set.
UnicodeSet set = new UnicodeSet();
int start = -1;
int end = -2;
// TODO Extend this up to UnicodeSet.MAX_VALUE when we have
// better performance; i.e., when this code can get moved into
// the UCharacter class and not have to iterate over code
// points. Right now it's way too slow to iterate to 10FFFF.
for (int i=UnicodeSet.MIN_VALUE; i<=0xFFFF; ++i) {
if (filter.contains(i)) {
if ((end+1) == i) {
end = i;
} else {
if (start >= 0) {
set.add(start, end);
start = end = i;
if (start >= 0) {
set.add(start, end);
return set;
// Type and value name maps
* Add a type mapping to the name map.
private static void addType(String shortName, String longName,
SetFactory factory) {
if (true) {
if (NAME_MAP.get(shortName) != null) {
throw new InternalError("Duplicate name " + shortName);
if (NAME_MAP.get(longName) != null) {
throw new InternalError("Duplicate name " + longName);
NAME_MAP.put(shortName, factory);
NAME_MAP.put(longName, factory);
* Add a value mapping to the name map.
private static void addValue(Hashtable map,
String shortName, String longName,
int value) {
if (true) {
if (map.get(shortName) != null) {
throw new InternalError("Duplicate name " + shortName);
if (longName != null && map.get(longName) != null) {
throw new InternalError("Duplicate name " + longName);
Integer valueObj = new Integer(value);
map.put(shortName, valueObj);
if (longName != null) {
map.put(longName, valueObj);
// Class initializer: registers the supported property types in NAME_MAP
// (via addType) and the general category value names in CATEGORY_MAP
// (via addValue).
static {
// NOTE: We munge all search keys to have no whitespace
// and upper case. As such, all stored keys should have
// this format.
// Load the map with type data
addType("GC", "GENERALCATEGORY", new SetFactory() {
public UnicodeSet create(String valueName) {
return createCategorySet(valueName);
addType("NV", "NUMERICVALUE", new NumericValueFactory());
addType("SC", "SCRIPT", new SetFactory() {
public UnicodeSet create(String valueName) {
return createScriptSet(valueName);
// Load the map with value data
// General Category
addValue(CATEGORY_MAP, "ANY", null, ANY); // special case
// Each value is a bit mask with bit (1 << category code) set for every
// category the name covers; createCategorySet() tests these bits.
addValue(CATEGORY_MAP, "C", "OTHER",
(1 << UCharacterCategory.CONTROL) |
(1 << UCharacterCategory.FORMAT) |
(1 << UCharacterCategory.GENERAL_OTHER_TYPES) |
(1 << UCharacterCategory.PRIVATE_USE) |
(1 << UCharacterCategory.SURROGATE));
// NOTE(review): from here on, most argument lines end an addValue(...)
// call whose opening line (map, short name, long name) is not present
// in this view. Restore those lines from the original source; the key
// strings must not be guessed.
1 << UCharacterCategory.CONTROL);
1 << UCharacterCategory.FORMAT);
1 << UCharacterCategory.GENERAL_OTHER_TYPES);
1 << UCharacterCategory.PRIVATE_USE);
1 << UCharacterCategory.SURROGATE);
// Letter categories: a combined mask, then one mask per category.
(1 << UCharacterCategory.LOWERCASE_LETTER) |
(1 << UCharacterCategory.MODIFIER_LETTER) |
(1 << UCharacterCategory.OTHER_LETTER) |
(1 << UCharacterCategory.TITLECASE_LETTER) |
(1 << UCharacterCategory.UPPERCASE_LETTER));
1 << UCharacterCategory.LOWERCASE_LETTER);
1 << UCharacterCategory.MODIFIER_LETTER);
1 << UCharacterCategory.OTHER_LETTER);
1 << UCharacterCategory.TITLECASE_LETTER);
1 << UCharacterCategory.UPPERCASE_LETTER);
// Mark categories.
addValue(CATEGORY_MAP, "M", "MARK",
(1 << UCharacterCategory.NON_SPACING_MARK) |
(1 << UCharacterCategory.COMBINING_SPACING_MARK) |
(1 << UCharacterCategory.ENCLOSING_MARK));
1 << UCharacterCategory.NON_SPACING_MARK);
1 << UCharacterCategory.COMBINING_SPACING_MARK);
1 << UCharacterCategory.ENCLOSING_MARK);
// Number categories.
(1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER) |
(1 << UCharacterCategory.LETTER_NUMBER) |
(1 << UCharacterCategory.OTHER_NUMBER));
1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER);
1 << UCharacterCategory.LETTER_NUMBER);
1 << UCharacterCategory.OTHER_NUMBER);
// Punctuation categories.
(1 << UCharacterCategory.CONNECTOR_PUNCTUATION) |
(1 << UCharacterCategory.DASH_PUNCTUATION) |
(1 << UCharacterCategory.END_PUNCTUATION) |
(1 << UCharacterCategory.FINAL_PUNCTUATION) |
(1 << UCharacterCategory.INITIAL_PUNCTUATION) |
(1 << UCharacterCategory.OTHER_PUNCTUATION) |
(1 << UCharacterCategory.START_PUNCTUATION));
1 << UCharacterCategory.CONNECTOR_PUNCTUATION);
1 << UCharacterCategory.DASH_PUNCTUATION);
1 << UCharacterCategory.END_PUNCTUATION);
1 << UCharacterCategory.FINAL_PUNCTUATION);
1 << UCharacterCategory.INITIAL_PUNCTUATION);
1 << UCharacterCategory.OTHER_PUNCTUATION);
1 << UCharacterCategory.START_PUNCTUATION);
// Symbol categories.
(1 << UCharacterCategory.CURRENCY_SYMBOL) |
(1 << UCharacterCategory.MODIFIER_SYMBOL) |
(1 << UCharacterCategory.MATH_SYMBOL) |
(1 << UCharacterCategory.OTHER_SYMBOL));
1 << UCharacterCategory.CURRENCY_SYMBOL);
1 << UCharacterCategory.MODIFIER_SYMBOL);
1 << UCharacterCategory.MATH_SYMBOL);
1 << UCharacterCategory.OTHER_SYMBOL);
// Separator categories.
(1 << UCharacterCategory.LINE_SEPARATOR) |
(1 << UCharacterCategory.PARAGRAPH_SEPARATOR) |
(1 << UCharacterCategory.SPACE_SEPARATOR));
1 << UCharacterCategory.LINE_SEPARATOR);
1 << UCharacterCategory.PARAGRAPH_SEPARATOR);
1 << UCharacterCategory.SPACE_SEPARATOR);