source/i18n/upropset.cpp - external/github.com/unicode-org/icu - Git at Google

 /*
 **********************************************************************
 *   Copyright (c) 2001, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 * $Source: /xsrl/Nsvn/icu/icu/source/i18n/Attic/upropset.cpp,v $
 * $Date: 2001/10/24 05:42:13 $
 * $Revision: 1.3 $
 **********************************************************************
 */
 #include "upropset.h"
 #include "ustrfmt.h"
 #include "unicode/unistr.h"
 #include "unicode/uscript.h"
 #include "unicode/uniset.h"
 #include "unicode/parsepos.h"
 #include "hash.h"

 U_NAMESPACE_BEGIN

 // Maps a property type name (short or long form) to its SetFactory,
 // stored as a void*.  Created and filled by init().
 static Hashtable* NAME_MAP = NULL;

 // Maps a general category value name (short or long form) to an
 // integer: either ANY or a bit mask with bit (1 << category) set for
 // each category covered.  Created and filled by init().
 static Hashtable* CATEGORY_MAP = NULL;

 /**
  * Cache of sets indexed by general category integer.  init()
  * allocates this as an array of U_CHAR_CATEGORY_COUNT sets; each
  * entry is populated on demand by getCategorySet().
  */
 static UnicodeSet* CATEGORY_CACHE = NULL;

 /**
  * Cache of sets indexed by UScriptCode.  init() allocates this as
  * an array of USCRIPT_CODE_LIMIT sets; each entry is populated on
  * demand by getScriptSet().
  */
 static UnicodeSet* SCRIPT_CACHE = NULL;

 // Special value codes
 static const int32_t ANY = -1; // general category: all code points

 //----------------------------------------------------------------
 // Unicode string and character constants
 //----------------------------------------------------------------

 // Opening and closing delimiters of the POSIX-like syntax
 static const UChar POSIX_OPEN[]  = { 0x005B, 0x003A, 0 }; // "[:"
 static const UChar POSIX_CLOSE[] = { 0x003A, 0x005D, 0 }; // ":]"

 // Opening and closing delimiters of the Perl-like syntax
 static const UChar PERL_OPEN[]  = { 0x005C, 0x0070, 0 }; // "\\p"
 static const UChar PERL_CLOSE[] = { 0x007D, 0 };         // "}"

 // Single characters tested while parsing
 static const UChar HAT        = 0x005E; /*^*/
 static const UChar UPPER_P    = 0x0050; /*P*/
 static const UChar LEFT_BRACE = 0x007B; /*{*/
 static const UChar EQUALS     = 0x003D; /*=*/

 //----------------------------------------------------------------------
 // class _CharString
 // An identical class named CharString can be found in transreg.cpp.
 // If we find ourselves needing another copy of this utility class we
 // should probably pull it out into putil or some such place.
 //----------------------------------------------------------------------

 class _CharString {
  public:
     _CharString(const UnicodeString& str);
     ~_CharString();
     operator char*() { return ptr; }
  private:
     char buf[128];
     char* ptr;
 };

 _CharString::_CharString(const UnicodeString& str) {
     if (str.length() >= (int32_t)sizeof(buf)) {
         ptr = new char[str.length() + 8];
     } else {
         ptr = buf;
     }
     str.extract(0, 0x7FFFFFFF, ptr, "");
 }

 _CharString::~_CharString() {
     if (ptr != buf) {
         delete[] ptr;
     }
 }

 //----------------------------------------------------------------
 // Public API
 //----------------------------------------------------------------

 /**
  * Quick check for whether the text at pos looks like the beginning
  * of a property set pattern: [:foo:], \p{foo}, or \P{foo}.  Only the
  * two-character opening delimiter and the minimum remaining length
  * are examined; the pattern is not parsed.
  *
  * @param pattern the pattern string
  * @param pos offset in pattern at which to look
  * @return TRUE if an opening delimiter is present at pos and at
  * least 5 characters remain
  */
 UBool UnicodePropertySet::resemblesPattern(const UnicodeString& pattern,
                                            int32_t pos) {
     // The shortest possible pattern, e.g. \p{L}, is 5 characters
     if (pattern.length() < (pos+5)) {
         return FALSE;
     }

     // "[:" must match exactly
     if (pattern.compare(pos, 2, POSIX_OPEN) == 0) {
         return TRUE;
     }

     // "\p" is compared case-insensitively so that "\P" matches too
     if (pattern.caseCompare(pos, 2, PERL_OPEN, U_FOLD_CASE_DEFAULT) == 0) {
         return TRUE;
     }

     return FALSE;
 }

 /**
  * Parse a property set pattern starting at the given parse position
  * and build the corresponding UnicodeSet.
  *
  * Three pattern forms are handled:
  *   short   \p{Cf}                    general category, then script
  *   medium  \p{gc=Cf}                 type=value
  *   long    \p{GeneralCategory=Format}
  * Each may also be written with the [:...:] delimiters.  A '^' after
  * "[:" or an upper case 'P' in "\P" complements the resulting set.
  *
  * @param pattern the pattern string
  * @param ppos on entry, the position at which to begin parsing.
  * This should be one of the locations marked '^':
  *
  *   [:blah:]     \p{blah}     \P{blah}
  *   ^       %    ^       %    ^       %
  *
  * On return, the position after the last character parsed, that is,
  * the locations marked '%'.  If the parse fails, ppos is returned
  * unchanged.
  * @return a newly-constructed UnicodeSet object, or null upon
  * failure.
  */
 UnicodeSet* UnicodePropertySet::createFromPattern(const UnicodeString& pattern,
                                                   ParsePosition& ppos) {
     init();

     const int32_t patLen = pattern.length();
     int32_t cursor = ppos.getIndex();

     // Minimum length is 5 characters, e.g. \p{L}
     if (patLen < (cursor+5)) {
         return NULL;
     }

     UBool isPosix;  // true for [:pat:], false for \p{pat} \P{pat}
     UBool negate;   // true if the result must be complemented

     // Look for an opening [:, [:^, \p, or \P
     if (pattern.compare(cursor, 2, POSIX_OPEN) == 0) {
         isPosix = TRUE;
         negate = FALSE;
         cursor = skipWhitespace(pattern, cursor+2);
         if (cursor < patLen && pattern.charAt(cursor) == HAT) {
             negate = TRUE;
             ++cursor;
         }
     } else if (pattern.caseCompare(cursor, 2, PERL_OPEN, U_FOLD_CASE_DEFAULT) == 0) {
         isPosix = FALSE;
         negate = (pattern.charAt(cursor+1) == UPPER_P);
         cursor = skipWhitespace(pattern, cursor+2);
         if (cursor == patLen) {
             // Syntax error; nothing follows "\p" or "\P"
             return NULL;
         }
         if (pattern.charAt(cursor) != LEFT_BRACE) {
             // Syntax error; "\p" or "\P" not followed by "{"
             return NULL;
         }
         ++cursor;
     } else {
         // Open delimiter not seen
         return NULL;
     }

     // Look for the matching close delimiter, either :] or }
     const UChar* closer = isPosix ? POSIX_CLOSE : PERL_CLOSE;
     const int32_t closePos = pattern.indexOf(closer, cursor);
     if (closePos < 0) {
         // Syntax error; close delimiter missing
         return NULL;
     }

     UnicodeSet* result = NULL;

     // An '=' before the close delimiter selects the medium/long form
     const int32_t eqPos = pattern.indexOf(EQUALS, cursor);
     if (eqPos < 0 || eqPos >= closePos) {
         // No equals seen; parse short format \p{Cf}
         UnicodeString shortName = munge(pattern, cursor, closePos);

         // General category takes precedence; script is the fallback
         result = createCategorySet(shortName);
         if (result == NULL) {
             result = createScriptSet(shortName);
         }
     } else {
         // Equals seen; the type name picks the factory for the value
         UnicodeString typeName = munge(pattern, cursor, eqPos);
         UnicodeString valueName = munge(pattern, eqPos+1, closePos);
         SetFactory factory = voidPtrToSetFactory(NAME_MAP->get(typeName));
         if (factory == NULL) {
             // Syntax error; type name not recognized
             return NULL;
         }
         result = (*factory)(valueName);
     }

     // Upon failure, return NULL with ppos unchanged
     if (result == NULL) {
         return NULL;
     }

     if (negate) {
         result->complement();
     }

     // Move to the limit position after the close delimiter
     ppos.setIndex(closePos + (isPosix ? 2 : 1));

     return result;
 }

 //----------------------------------------------------------------
 // Property set factory static methods
 // NOTE: This will change/go away when we implement UCharacter
 // based property retrieval.
 //----------------------------------------------------------------

 /**
  * Filter callback for initSetFromFilter(): accepts c when its digit
  * value equals the int32_t that context points to.
  */
 static UBool _numericValueFilter(UChar32 c, void* context) {
     const int32_t* wanted = (const int32_t*) context;
     // TODO: Change this to a more generic function, like
     // u_charNumericValue (when one exists).
     return u_charDigitValue(c) == *wanted;
 }

 /**
  * Given a numeric value name, create the set of code points whose
  * digit value equals it.  Unlike the other factories this never
  * returns null: a value that cannot be matched (empty, not a
  * number, trailing text, negative, non-integral, or too large for
  * int32_t) yields an empty set.
  * @param valueName a pre-munged numeric value, e.g. "7"
  * @return a newly-constructed UnicodeSet object
  */
 UnicodeSet* UnicodePropertySet::createNumericValueSet(const UnicodeString& valueName) {
     _CharString cvalueName(valueName);
     UnicodeSet* set = new UnicodeSet();

     char* text = cvalueName;
     char* end = text;
     double value = uprv_strtod(text, &end);

     // Reject input where nothing was converted (e.g. an empty
     // string, which would otherwise read as 0) or where text
     // follows the number.
     if (end == text || *end != 0) {
         return set;
     }

     // Range-check before converting: casting a double outside the
     // int32_t range (or NaN) to an integer is undefined behavior.
     // The negated form also rejects NaN.
     if (!(value >= 0 && value <= 2147483647.0)) {
         return set;
     }

     int32_t ivalue = (int32_t) value;
     if (ivalue != value) {
         // Non-integral values are not supported, so just return an
         // empty set
         return set;
     }

     initSetFromFilter(*set, _numericValueFilter, &ivalue);
     return set;
 }

 /**
  * Build the set for a general category value name.  The name is
  * looked up in CATEGORY_MAP; a result of 0 is treated as "name not
  * found".  The special code ANY produces the set of all code
  * points; any other code is a bit mask and the result is the union
  * of the cached sets of every category whose bit is set.
  * @param valueName a pre-munged general category value name
  * @return a newly-constructed UnicodeSet object, or null if the
  * name is invalid
  */
 UnicodeSet* UnicodePropertySet::createCategorySet(const UnicodeString& valueName) {
     const int32_t mask = CATEGORY_MAP->geti(valueName);
     if (mask == 0) {
         return NULL;
     }

     UnicodeSet* result = new UnicodeSet();
     if (mask == ANY) {
         // Complementing the empty set gives every code point
         result->complement();
     } else {
         for (int32_t cat=0; cat<U_CHAR_CATEGORY_COUNT; ++cat) {
             if ((mask & (1 << cat)) == 0) {
                 continue;
             }
             result->addAll(getCategorySet(cat));
         }
     }
     return result;
 }

 /**
  * Given a script value name, create a corresponding set and
  * return it, or return null if the name is invalid.  The name is
  * resolved with uscript_getCode(); if several codes are returned,
  * only the first is used.
  * @param valueName a pre-munged script value name
  * @return a newly-constructed copy of the cached script set, or
  * null if the name is not recognized
  */
 UnicodeSet* UnicodePropertySet::createScriptSet(const UnicodeString& valueName) {
     _CharString cvalueName(valueName);
     UErrorCode ec = U_ZERO_ERROR;
     const int32_t capacity = 10;
     // Pre-seed the first slot so an untouched buffer reads as invalid
     UScriptCode script[capacity] = { USCRIPT_INVALID_CODE };

     // The returned count is not needed; only script[0] is consulted
     uscript_getCode(cvalueName, script, capacity, &ec);

     // Check the error code first; the buffer is only meaningful on
     // success
     if (U_FAILURE(ec) || script[0] == USCRIPT_INVALID_CODE) {
         // Syntax error; unknown short name
         return NULL;
     }
     return new UnicodeSet(getScriptSet(script[0]));
 }

 //----------------------------------------------------------------
 // Utility methods
 //----------------------------------------------------------------

 /**
  * Filter callback for initSetFromFilter(): accepts c when its
  * character type equals the int32_t that context points to.
  */
 static UBool _categoryFilter(UChar32 c, void* context) {
     const int32_t* wanted = (const int32_t*) context;
     return u_charType(c) == *wanted;
 }

 /**
  * Return the cached set for the given general category, building
  * it first if the cache entry is empty.  Because an empty entry
  * means "not built yet", a category that matches no scanned code
  * point is rebuilt on every call.  The index is not range-checked,
  * and the cache must already have been allocated by init().
  *
  * Callers MUST NOT MODIFY the returned set.
  *
  * @param cat general category integer used as the cache index
  * @return reference to the cached set
  */
 const UnicodeSet& UnicodePropertySet::getCategorySet(int32_t cat) {
     UnicodeSet& cached = CATEGORY_CACHE[cat];
     if (cached.isEmpty()) {
         initSetFromFilter(cached, _categoryFilter, &cat);
     }
     return cached;
 }

 /**
  * Filter callback for initSetFromFilter(): accepts c when its
  * script equals the UScriptCode that context points to.  The error
  * code filled in by uscript_getScript() is not inspected.
  */
 static UBool _scriptFilter(UChar32 c, void* context) {
     UErrorCode status = U_ZERO_ERROR;
     const UScriptCode wanted = * (const UScriptCode*) context;
     return uscript_getScript(c, &status) == wanted;
 }

 /**
  * Return the cached set for the given script, building it first
  * if the cache entry is empty.  Because an empty entry means "not
  * built yet", a script that matches no scanned code point is
  * rebuilt on every call.  The index is not range-checked, and the
  * cache must already have been allocated by init().
  *
  * Callers MUST NOT MODIFY the returned set.
  *
  * @param script script code used as the cache index
  * @return reference to the cached set
  */
 const UnicodeSet& UnicodePropertySet::getScriptSet(UScriptCode script) {
     UnicodeSet& cached = SCRIPT_CACHE[script];
     if (cached.isEmpty()) {
         initSetFromFilter(cached, _scriptFilter, &script);
     }
     return cached;
 }

 /**
  * Copy the range [start, limit) of a string, dropping every white
  * space code point.  So "General Category " becomes
  * "GeneralCategory".  All type and value strings are munged before
  * lookup, and all type and value keys are stored pre-munged.  NOTE:
  * Unlike the Java version, we do not modify the case, since we use a
  * case-insensitive compare function.
  *
  * @param str the source string
  * @param start offset of the first code unit to examine
  * @param limit offset just past the last code unit to examine
  * @return the range with white space removed
  */
 UnicodeString UnicodePropertySet::munge(const UnicodeString& str,
                                         int32_t start, int32_t limit) {
     UnicodeString stripped;
     int32_t idx = start;
     while (idx < limit) {
         const UChar32 cp = str.char32At(idx);
         // Step by whole code points so surrogate pairs stay together
         idx += UTF_CHAR_LENGTH(cp);
         if (u_isspace(cp)) {
             continue;
         }
         stripped.append(cp);
     }
     return stripped;
 }

 /**
  * Advance past any run of white space code points beginning at pos.
  *
  * @param str the string to scan
  * @param pos offset at which to start
  * @return the offset of the first non-white-space code point at or
  * after pos, or str.length() if there is none
  */
 int32_t UnicodePropertySet::skipWhitespace(const UnicodeString& str,
                                            int32_t pos) {
     const int32_t len = str.length();
     for (; pos < len; ) {
         const UChar32 cp = str.char32At(pos);
         if (!u_isspace(cp)) {
             return pos;
         }
         pos += UTF_CHAR_LENGTH(cp);
     }
     return pos;
 }

 //----------------------------------------------------------------
 // Generic filter-based scanning code
 //
 // NOTE: In general, we don't want to do this!  This is a temporary
 // implementation until we have time for something that examines
 // the underlying UCharacter data structures in an intelligent
 // way.  Iterating over all code points is dumb.  What we want to
 // do, for instance, is iterate over internally-stored ranges
 // of characters that have a given property.
 //----------------------------------------------------------------

 /**
  * Replace the contents of set with every code point accepted by
  * the filter.  Code points from UnicodeSet::MIN_VALUE through
  * U+FFFF are tested in ascending order, and consecutive accepted
  * code points are added to the set as a single range.
  *
  * @param set the set to clear and fill
  * @param filter callback deciding whether a code point belongs
  * @param context opaque pointer handed to every filter call
  */
 void UnicodePropertySet::initSetFromFilter(UnicodeSet& set, Filter filter,
                                            void* context) {
     set.clear();

     // First code point of the run in progress, or -1 when the
     // previous code point was rejected.
     int32_t runStart = -1;

     // TODO Extend this up to UnicodeSet.MAX_VALUE when we have
     // better performance; i.e., when this code can get moved into
     // the UCharacter class and not have to iterate over code
     // points.  Right now it's way too slow to iterate to 10FFFF.
     const int32_t lastScanned = 0xFFFF; /*TEMPORARY*/

     for (int32_t cp=UnicodeSet::MIN_VALUE; cp<=lastScanned; ++cp) {
         if ((*filter)((UChar32) cp, context)) {
             if (runStart < 0) {
                 runStart = cp;
             }
         } else if (runStart >= 0) {
             // The run ended at the previous code point
             set.add((UChar32)runStart, (UChar32)(cp-1));
             runStart = -1;
         }
     }

     // Flush a run that reaches the end of the scanned range
     if (runStart >= 0) {
         set.add((UChar32)runStart, (UChar32)lastScanned);
     }
 }

 //----------------------------------------------------------------
 // Type and value name maps
 //----------------------------------------------------------------

 /**
  * Add a type mapping to the name map.
  */
 void UnicodePropertySet::addType(const UnicodeString& shortName,
                                  const UnicodeString& longName,
                                  SetFactory factory) {
     UErrorCode ec = U_ZERO_ERROR;
     void* p = setFactoryToVoidPtr(factory);
     NAME_MAP->put(shortName, p, ec);
     NAME_MAP->put(longName, p, ec);
 }

 /**
  * Register an integer value in the given map under its short name
  * and, when one is supplied, its long name.  The error code filled
  * in by the hashtable is not inspected.
  *
  * @param map the table to add to
  * @param shortName pre-munged short value name
  * @param longName pre-munged long value name; an empty string
  * means there is no long name
  * @param value the integer to store; should be nonzero, since
  * createCategorySet() treats a lookup result of 0 as "not found"
  */
 void UnicodePropertySet::addValue(Hashtable* map,
                                   const UnicodeString& shortName,
                                   const UnicodeString& longName,
                                   int32_t value) {
     // assert(value != 0);
     UErrorCode status = U_ZERO_ERROR;
     map->puti(shortName, value, status);
     if (longName.length() == 0) {
         return;
     }
     map->puti(longName, value, status);
 }

 void UnicodePropertySet::init() {
     if (NAME_MAP != NULL) {
         return;
     }

     NAME_MAP = new Hashtable(TRUE);
     CATEGORY_MAP = new Hashtable(TRUE);
     CATEGORY_CACHE = new UnicodeSet[U_CHAR_CATEGORY_COUNT];
     SCRIPT_CACHE = new UnicodeSet[USCRIPT_CODE_LIMIT];

     // NOTE:  We munge all search keys to have no whitespace
     // and upper case.  As such, all stored keys should have
     // this format.

     // Load the map with type data

     addType("GC", "GENERALCATEGORY", createCategorySet);

     //addType("CC", "COMBININGCLASS", COMBINING_CLASS);
     //addType("BC", "BIDICLASS", BIDI_CLASS);
     //addType("DT", "DECOMPOSITIONTYPE", DECOMPOSITION_TYPE);

     addType("NV", "NUMERICVALUE", createNumericValueSet);

     //addType("NT", "NUMERICTYPE", NUMERIC_TYPE);
     //addType("EA", "EASTASIANWIDTH", EAST_ASIAN_WIDTH);
     //addType("LB", "LINEBREAK", LINE_BREAK);
     //addType("JT", "JOININGTYPE", JOINING_TYPE);

     addType("SC", "SCRIPT", createScriptSet);

     // Load the map with value data

     // General Category

     addValue(CATEGORY_MAP, "ANY", "", ANY); // special case

     addValue(CATEGORY_MAP, "C", "OTHER",
              (1 << U_CONTROL_CHAR) |
              (1 << U_FORMAT_CHAR) |
              (1 << U_GENERAL_OTHER_TYPES) |
              (1 << U_PRIVATE_USE_CHAR) |
              (1 << U_SURROGATE));

     addValue(CATEGORY_MAP, "CC", "CONTROL",
              1 << U_CONTROL_CHAR);
     addValue(CATEGORY_MAP, "CF", "FORMAT",
              1 << U_FORMAT_CHAR);
     addValue(CATEGORY_MAP, "CN", "UNASSIGNED",
              1 << U_GENERAL_OTHER_TYPES);
     addValue(CATEGORY_MAP, "CO", "PRIVATEUSE",
              1 << U_PRIVATE_USE_CHAR);
     addValue(CATEGORY_MAP, "CS", "SURROGATE",
              1 << U_SURROGATE);

     addValue(CATEGORY_MAP, "L", "LETTER",
              (1 << U_LOWERCASE_LETTER) |
              (1 << U_MODIFIER_LETTER) |
              (1 << U_OTHER_LETTER) |
              (1 << U_TITLECASE_LETTER) |
              (1 << U_UPPERCASE_LETTER));

     addValue(CATEGORY_MAP, "LL", "LOWERCASELETTER",
              1 << U_LOWERCASE_LETTER);
     addValue(CATEGORY_MAP, "LM", "MODIFIERLETTER",
              1 << U_MODIFIER_LETTER);
     addValue(CATEGORY_MAP, "LO", "OTHERLETTER",
              1 << U_OTHER_LETTER);
     addValue(CATEGORY_MAP, "LT", "TITLECASELETTER",
              1 << U_TITLECASE_LETTER);
     addValue(CATEGORY_MAP, "LU", "UPPERCASELETTER",
              1 << U_UPPERCASE_LETTER);

     addValue(CATEGORY_MAP, "M", "MARK",
              (1 << U_NON_SPACING_MARK) |
              (1 << U_COMBINING_SPACING_MARK) |
              (1 << U_ENCLOSING_MARK));

     addValue(CATEGORY_MAP, "MN", "NONSPACINGMARK",
              1 << U_NON_SPACING_MARK);
     addValue(CATEGORY_MAP, "MC", "SPACINGMARK",
              1 << U_COMBINING_SPACING_MARK);
     addValue(CATEGORY_MAP, "ME", "ENCLOSINGMARK",
              1 << U_ENCLOSING_MARK);

     addValue(CATEGORY_MAP, "N", "NUMBER",
              (1 << U_DECIMAL_DIGIT_NUMBER) |
              (1 << U_LETTER_NUMBER) |
              (1 << U_OTHER_NUMBER));

     addValue(CATEGORY_MAP, "ND", "DECIMALNUMBER",
              1 << U_DECIMAL_DIGIT_NUMBER);
     addValue(CATEGORY_MAP, "NL", "LETTERNUMBER",
              1 << U_LETTER_NUMBER);
     addValue(CATEGORY_MAP, "NO", "OTHERNUMBER",
              1 << U_OTHER_NUMBER);

     addValue(CATEGORY_MAP, "P", "PUNCTUATION",
              (1 << U_CONNECTOR_PUNCTUATION) |
              (1 << U_DASH_PUNCTUATION) |
              (1 << U_END_PUNCTUATION) |
              (1 << U_FINAL_PUNCTUATION) |
              (1 << U_INITIAL_PUNCTUATION) |
              (1 << U_OTHER_PUNCTUATION) |
              (1 << U_START_PUNCTUATION));

     addValue(CATEGORY_MAP, "PC", "CONNECTORPUNCTUATION",
              1 << U_CONNECTOR_PUNCTUATION);
     addValue(CATEGORY_MAP, "PD", "DASHPUNCTUATION",
              1 << U_DASH_PUNCTUATION);
     addValue(CATEGORY_MAP, "PE", "ENDPUNCTUATION",
              1 << U_END_PUNCTUATION);
     addValue(CATEGORY_MAP, "PF", "FINALPUNCTUATION",
              1 << U_FINAL_PUNCTUATION);
     addValue(CATEGORY_MAP, "PI", "INITIALPUNCTUATION",
              1 << U_INITIAL_PUNCTUATION);
     addValue(CATEGORY_MAP, "PO", "OTHERPUNCTUATION",
              1 << U_OTHER_PUNCTUATION);
     addValue(CATEGORY_MAP, "PS", "STARTPUNCTUATION",
              1 << U_START_PUNCTUATION);

     addValue(CATEGORY_MAP, "S", "SYMBOL",
              (1 << U_CURRENCY_SYMBOL) |
              (1 << U_MODIFIER_SYMBOL) |
              (1 << U_MATH_SYMBOL) |
              (1 << U_OTHER_SYMBOL));

     addValue(CATEGORY_MAP, "SC", "CURRENCYSYMBOL",
              1 << U_CURRENCY_SYMBOL);
     addValue(CATEGORY_MAP, "SK", "MODIFIERSYMBOL",
              1 << U_MODIFIER_SYMBOL);
     addValue(CATEGORY_MAP, "SM", "MATHSYMBOL",
              1 << U_MATH_SYMBOL);
     addValue(CATEGORY_MAP, "SO", "OTHERSYMBOL",
              1 << U_OTHER_SYMBOL);

     addValue(CATEGORY_MAP, "Z", "SEPARATOR",
              (1 << U_LINE_SEPARATOR) |
              (1 << U_PARAGRAPH_SEPARATOR) |
              (1 << U_SPACE_SEPARATOR));

     addValue(CATEGORY_MAP, "ZL", "LINESEPARATOR",
              1 << U_LINE_SEPARATOR);
     addValue(CATEGORY_MAP, "ZP", "PARAGRAPHSEPARATOR",
              1 << U_PARAGRAPH_SEPARATOR);
     addValue(CATEGORY_MAP, "ZS", "SPACESEPARATOR",
              1 << U_SPACE_SEPARATOR);
 }

 /**
  * Free everything allocated by init() and reset the pointers to
  * NULL, so that a later init() rebuilds the tables.  Does nothing
  * if init() has not run.
  */
 void UnicodePropertySet::cleanup() {
     if (NAME_MAP == NULL) {
         return;
     }

     delete NAME_MAP;
     NAME_MAP = NULL;

     delete CATEGORY_MAP;
     CATEGORY_MAP = NULL;

     delete[] CATEGORY_CACHE;
     CATEGORY_CACHE = NULL;

     delete[] SCRIPT_CACHE;
     SCRIPT_CACHE = NULL;
 }

 U_NAMESPACE_END

 //eof
	/*
	**********************************************************************
	* Copyright (c) 2001, International Business Machines
	* Corporation and others. All Rights Reserved.
	**********************************************************************
	* $Source: /xsrl/Nsvn/icu/icu/source/i18n/Attic/upropset.cpp,v $
	* $Date: 2001/10/24 05:42:13 $
	* $Revision: 1.3 $
	**********************************************************************
	*/
	#include "upropset.h"
	#include "ustrfmt.h"
	#include "unicode/unistr.h"
	#include "unicode/uscript.h"
	#include "unicode/uniset.h"
	#include "unicode/parsepos.h"
	#include "hash.h"

	U_NAMESPACE_BEGIN

	// Maps a property type name (short or long form) to its SetFactory,
	// stored as a void*.  Created and filled by init().
	static Hashtable* NAME_MAP = NULL;

	// Maps a general category value name (short or long form) to an
	// integer: either ANY or a bit mask with bit (1 << category) set for
	// each category covered.  Created and filled by init().
	static Hashtable* CATEGORY_MAP = NULL;

	/**
	 * Cache of sets indexed by general category integer.  init()
	 * allocates this as an array of U_CHAR_CATEGORY_COUNT sets; each
	 * entry is populated on demand by getCategorySet().
	 */
	static UnicodeSet* CATEGORY_CACHE = NULL;

	/**
	 * Cache of sets indexed by UScriptCode.  init() allocates this as
	 * an array of USCRIPT_CODE_LIMIT sets; each entry is populated on
	 * demand by getScriptSet().
	 */
	static UnicodeSet* SCRIPT_CACHE = NULL;

	// Special value codes
	static const int32_t ANY = -1; // general category: all code points

	//----------------------------------------------------------------
	// Unicode string and character constants
	//----------------------------------------------------------------

	// Opening and closing delimiters of the POSIX-like syntax
	static const UChar POSIX_OPEN[]  = { 91,58,0 }; // "[:"
	static const UChar POSIX_CLOSE[] = { 58,93,0 }; // ":]"

	// Opening and closing delimiters of the Perl-like syntax
	static const UChar PERL_OPEN[]  = { 92,112,0 }; // "\\p"
	static const UChar PERL_CLOSE[] = { 125,0 };    // "}"

	// Single characters tested while parsing
	static const UChar HAT        = 0x005E; /*^*/
	static const UChar UPPER_P    = 0x0050; /*P*/
	static const UChar LEFT_BRACE = 0x007B; /*{*/
	static const UChar EQUALS     = 0x003D; /*=*/

	//----------------------------------------------------------------------
	// class _CharString
	// An identical class named CharString can be found in transreg.cpp.
	// If we find ourselves needing another copy of this utility class we
	// should probably pull it out into putil or some such place.
	//----------------------------------------------------------------------

	class _CharString {
	public:
	_CharString(const UnicodeString& str);
	~_CharString();
	operator char*() { return ptr; }
	private:
	char buf[128];
	char* ptr;
	};

	_CharString::_CharString(const UnicodeString& str) {
	if (str.length() >= (int32_t)sizeof(buf)) {
	ptr = new char[str.length() + 8];
	} else {
	ptr = buf;
	}
	str.extract(0, 0x7FFFFFFF, ptr, "");
	}

	_CharString::~_CharString() {
	if (ptr != buf) {
	delete[] ptr;
	}
	}

	//----------------------------------------------------------------
	// Public API
	//----------------------------------------------------------------

	/**
	 * Quick check for whether the text at pos looks like the beginning
	 * of a property set pattern: [:foo:], \p{foo}, or \P{foo}.  Only the
	 * two-character opening delimiter and the minimum remaining length
	 * are examined; the pattern is not parsed.
	 *
	 * @param pattern the pattern string
	 * @param pos offset in pattern at which to look
	 * @return TRUE if an opening delimiter is present at pos and at
	 * least 5 characters remain
	 */
	UBool UnicodePropertySet::resemblesPattern(const UnicodeString& pattern,
	                                           int32_t pos) {
	    // Patterns are at least 5 characters long, e.g. \p{L}
	    if ((pos+5) > pattern.length()) {
	        return FALSE;
	    }

	    // Look for an opening [:, [:^, \p, or \P.  "\p" is compared
	    // case-insensitively so that "\P" matches too.
	    return (0 == pattern.compare(pos, 2, POSIX_OPEN)) ||
	        (0 == pattern.caseCompare(pos, 2, PERL_OPEN, U_FOLD_CASE_DEFAULT));
	}

	/**
	 * Parse a property set pattern starting at the given parse position
	 * and build the corresponding UnicodeSet.
	 *
	 * Three pattern forms are handled:
	 *   short   \p{Cf}                    general category, then script
	 *   medium  \p{gc=Cf}                 type=value
	 *   long    \p{GeneralCategory=Format}
	 * Each may also be written with the [:...:] delimiters.  A '^' after
	 * "[:" or an upper case 'P' in "\P" complements the resulting set.
	 *
	 * @param pattern the pattern string
	 * @param ppos on entry, the position at which to begin parsing.
	 * This should be one of the locations marked '^':
	 *
	 *   [:blah:]     \p{blah}     \P{blah}
	 *   ^       %    ^       %    ^       %
	 *
	 * On return, the position after the last character parsed, that is,
	 * the locations marked '%'.  If the parse fails, ppos is returned
	 * unchanged.
	 * @return a newly-constructed UnicodeSet object, or null upon
	 * failure.
	 */
	UnicodeSet* UnicodePropertySet::createFromPattern(const UnicodeString& pattern,
	                                                  ParsePosition& ppos) {
	    init();

	    UnicodeSet* set = NULL;

	    int32_t pos = ppos.getIndex();

	    // Minimum length is 5 characters, e.g. \p{L}
	    if ((pos+5) > pattern.length()) {
	        return NULL;
	    }

	    UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat}
	    UBool invert = FALSE;

	    // Look for an opening [:, [:^, \p, or \P
	    if (0 == pattern.compare(pos, 2, POSIX_OPEN)) {
	        posix = TRUE;
	        pos = skipWhitespace(pattern, pos+2);
	        if (pos < pattern.length() && pattern.charAt(pos) == HAT) {
	            ++pos;
	            invert = TRUE;
	        }
	    } else if (0 == pattern.caseCompare(pos, 2, PERL_OPEN, U_FOLD_CASE_DEFAULT)) {
	        invert = (pattern.charAt(pos+1) == UPPER_P);
	        pos = skipWhitespace(pattern, pos+2);
	        if (pos == pattern.length() || pattern.charAt(pos++) != LEFT_BRACE) {
	            // Syntax error; "\p" or "\P" not followed by "{"
	            return NULL;
	        }
	    } else {
	        // Open delimiter not seen
	        return NULL;
	    }

	    // Look for the matching close delimiter, either :] or }
	    int32_t close = pattern.indexOf(posix ? POSIX_CLOSE : PERL_CLOSE, pos);
	    if (close < 0) {
	        // Syntax error; close delimiter missing
	        return NULL;
	    }

	    // Look for an '=' sign.  If this is present before the close
	    // delimiter, we will parse a medium \p{gc=Cf} or long
	    // \p{GeneralCategory=Format} pattern.
	    int32_t equals = pattern.indexOf(EQUALS, pos);
	    if (equals >= 0 && equals < close) {
	        // Equals seen; the type name picks the factory for the value
	        UnicodeString typeName = munge(pattern, pos, equals);
	        UnicodeString valueName = munge(pattern, equals+1, close);
	        SetFactory factory;
	        factory = voidPtrToSetFactory(NAME_MAP->get(typeName));
	        if (factory == NULL) {
	            // Syntax error; type name not recognized
	            return NULL;
	        }
	        set = (*factory)(valueName);
	    } else {
	        // No equals seen; parse short format \p{Cf}
	        UnicodeString shortName = munge(pattern, pos, close);

	        // First try general category
	        set = createCategorySet(shortName);

	        // If this fails, try script
	        if (set == NULL) {
	            set = createScriptSet(shortName);
	        }
	    }

	    // Upon failure, return NULL with ppos unchanged
	    if (set == NULL) {
	        return NULL;
	    }

	    if (invert) {
	        set->complement();
	    }

	    // Move to the limit position after the close delimiter
	    ppos.setIndex(close + (posix ? 2 : 1));

	    return set;
	}

	//----------------------------------------------------------------
	// Property set factory static methods
	// NOTE: This will change/go away when we implement UCharacter
	// based property retrieval.
	//----------------------------------------------------------------

	/**
	 * Filter callback for initSetFromFilter(): accepts c when its digit
	 * value equals the int32_t that context points to.
	 */
	static UBool _numericValueFilter(UChar32 c, void* context) {
	    const int32_t* wanted = (const int32_t*) context;
	    // TODO: Change this to a more generic function, like
	    // u_charNumericValue (when one exists).
	    return u_charDigitValue(c) == *wanted;
	}

	/**
	 * Given a numeric value name, create the set of code points whose
	 * digit value equals it.  Unlike the other factories this never
	 * returns null: a value that cannot be matched (empty, not a
	 * number, trailing text, negative, non-integral, or too large for
	 * int32_t) yields an empty set.
	 * @param valueName a pre-munged numeric value, e.g. "7"
	 * @return a newly-constructed UnicodeSet object
	 */
	UnicodeSet* UnicodePropertySet::createNumericValueSet(const UnicodeString& valueName) {
	    _CharString cvalueName(valueName);
	    UnicodeSet* set = new UnicodeSet();

	    char* text = cvalueName;
	    char* end = text;
	    double value = uprv_strtod(text, &end);

	    // Reject input where nothing was converted (e.g. an empty
	    // string, which would otherwise read as 0) or where text
	    // follows the number.
	    if (end == text || *end != 0) {
	        return set;
	    }

	    // Range-check before converting: casting a double outside the
	    // int32_t range (or NaN) to an integer is undefined behavior.
	    // The negated form also rejects NaN.
	    if (!(value >= 0 && value <= 2147483647.0)) {
	        return set;
	    }

	    int32_t ivalue = (int32_t) value;
	    if (ivalue != value) {
	        // Non-integral values are not supported, so just return an
	        // empty set
	        return set;
	    }

	    initSetFromFilter(*set, _numericValueFilter, &ivalue);
	    return set;
	}

	/**
	 * Build the set for a general category value name.  The name is
	 * looked up in CATEGORY_MAP; a result of 0 is treated as "name not
	 * found".  The special code ANY produces the set of all code
	 * points; any other code is a bit mask and the result is the union
	 * of the cached sets of every category whose bit is set.
	 * @param valueName a pre-munged general category value name
	 * @return a newly-constructed UnicodeSet object, or null if the
	 * name is invalid
	 */
	UnicodeSet* UnicodePropertySet::createCategorySet(const UnicodeString& valueName) {
	    const int32_t mask = CATEGORY_MAP->geti(valueName);
	    if (mask == 0) {
	        return NULL;
	    }

	    UnicodeSet* result = new UnicodeSet();
	    if (mask == ANY) {
	        // Complementing the empty set gives every code point
	        result->complement();
	    } else {
	        for (int32_t cat=0; cat<U_CHAR_CATEGORY_COUNT; ++cat) {
	            if ((mask & (1 << cat)) == 0) {
	                continue;
	            }
	            result->addAll(getCategorySet(cat));
	        }
	    }
	    return result;
	}

	/**
	 * Given a script value name, create a corresponding set and
	 * return it, or return null if the name is invalid.  The name is
	 * resolved with uscript_getCode(); if several codes are returned,
	 * only the first is used.
	 * @param valueName a pre-munged script value name
	 * @return a newly-constructed copy of the cached script set, or
	 * null if the name is not recognized
	 */
	UnicodeSet* UnicodePropertySet::createScriptSet(const UnicodeString& valueName) {
	    _CharString cvalueName(valueName);
	    UErrorCode ec = U_ZERO_ERROR;
	    const int32_t capacity = 10;
	    // Pre-seed the first slot so an untouched buffer reads as invalid
	    UScriptCode script[capacity] = { USCRIPT_INVALID_CODE };

	    // The returned count is not needed; only script[0] is consulted
	    uscript_getCode(cvalueName, script, capacity, &ec);

	    // Check the error code first; the buffer is only meaningful on
	    // success
	    if (U_FAILURE(ec) || script[0] == USCRIPT_INVALID_CODE) {
	        // Syntax error; unknown short name
	        return NULL;
	    }
	    return new UnicodeSet(getScriptSet(script[0]));
	}

	//----------------------------------------------------------------
	// Utility methods
	//----------------------------------------------------------------

	/**
	 * Filter callback for initSetFromFilter(): accepts c when its
	 * character type equals the int32_t that context points to.
	 */
	static UBool _categoryFilter(UChar32 c, void* context) {
	    const int32_t* wanted = (const int32_t*) context;
	    return u_charType(c) == *wanted;
	}

	/**
	 * Return the cached set for the given general category, building
	 * it first if the cache entry is empty.  Because an empty entry
	 * means "not built yet", a category that matches no scanned code
	 * point is rebuilt on every call.  The index is not range-checked,
	 * and the cache must already have been allocated by init().
	 *
	 * Callers MUST NOT MODIFY the returned set.
	 *
	 * @param cat general category integer used as the cache index
	 * @return reference to the cached set
	 */
	const UnicodeSet& UnicodePropertySet::getCategorySet(int32_t cat) {
	    UnicodeSet& cached = CATEGORY_CACHE[cat];
	    if (cached.isEmpty()) {
	        initSetFromFilter(cached, _categoryFilter, &cat);
	    }
	    return cached;
	}

	/**
	 * Filter callback for initSetFromFilter(): accepts c when its
	 * script equals the UScriptCode that context points to.  The error
	 * code filled in by uscript_getScript() is not inspected.
	 */
	static UBool _scriptFilter(UChar32 c, void* context) {
	    UErrorCode status = U_ZERO_ERROR;
	    const UScriptCode wanted = * (const UScriptCode*) context;
	    return uscript_getScript(c, &status) == wanted;
	}

	/**
	 * Return the cached set for the given script, building it first
	 * if the cache entry is empty.  Because an empty entry means "not
	 * built yet", a script that matches no scanned code point is
	 * rebuilt on every call.  The index is not range-checked, and the
	 * cache must already have been allocated by init().
	 *
	 * Callers MUST NOT MODIFY the returned set.
	 *
	 * @param script script code used as the cache index
	 * @return reference to the cached set
	 */
	const UnicodeSet& UnicodePropertySet::getScriptSet(UScriptCode script) {
	    UnicodeSet& cached = SCRIPT_CACHE[script];
	    if (cached.isEmpty()) {
	        initSetFromFilter(cached, _scriptFilter, &script);
	    }
	    return cached;
	}

	/**
	 * Copy the range [start, limit) of a string, dropping every white
	 * space code point.  So "General Category " becomes
	 * "GeneralCategory".  All type and value strings are munged before
	 * lookup, and all type and value keys are stored pre-munged.  NOTE:
	 * Unlike the Java version, we do not modify the case, since we use a
	 * case-insensitive compare function.
	 *
	 * @param str the source string
	 * @param start offset of the first code unit to examine
	 * @param limit offset just past the last code unit to examine
	 * @return the range with white space removed
	 */
	UnicodeString UnicodePropertySet::munge(const UnicodeString& str,
	                                        int32_t start, int32_t limit) {
	    UnicodeString stripped;
	    int32_t idx = start;
	    while (idx < limit) {
	        const UChar32 cp = str.char32At(idx);
	        // Step by whole code points so surrogate pairs stay together
	        idx += UTF_CHAR_LENGTH(cp);
	        if (u_isspace(cp)) {
	            continue;
	        }
	        stripped.append(cp);
	    }
	    return stripped;
	}

	/**
	 * Advance past any run of white space code points beginning at pos.
	 *
	 * @param str the string to scan
	 * @param pos offset at which to start
	 * @return the offset of the first non-white-space code point at or
	 * after pos, or str.length() if there is none
	 */
	int32_t UnicodePropertySet::skipWhitespace(const UnicodeString& str,
	                                           int32_t pos) {
	    const int32_t len = str.length();
	    for (; pos < len; ) {
	        const UChar32 cp = str.char32At(pos);
	        if (!u_isspace(cp)) {
	            return pos;
	        }
	        pos += UTF_CHAR_LENGTH(cp);
	    }
	    return pos;
	}

	//----------------------------------------------------------------
	// Generic filter-based scanning code
	//
	// NOTE: In general, we don't want to do this! This is a temporary
	// implementation until we have time for something that examines
	// the underlying UCharacter data structures in an intelligent
	// way. Iterating over all code points is dumb. What we want to
	// do, for instance, is iterate over internally-stored ranges
	// of characters that have a given property.
	//----------------------------------------------------------------

	/**
	 * Replace the contents of set with every code point accepted by
	 * the filter.  Code points from UnicodeSet::MIN_VALUE through
	 * U+FFFF are tested in ascending order, and consecutive accepted
	 * code points are added to the set as a single range.
	 *
	 * @param set the set to clear and fill
	 * @param filter callback deciding whether a code point belongs
	 * @param context opaque pointer handed to every filter call
	 */
	void UnicodePropertySet::initSetFromFilter(UnicodeSet& set, Filter filter,
	                                           void* context) {
	    set.clear();

	    // Bounds of the run in progress.  The initial values make the
	    // first accepted code point start a new run, and start < 0
	    // means nothing has been accepted yet.
	    int32_t start = -1;
	    int32_t end = -2;

	    // TODO Extend this up to UnicodeSet.MAX_VALUE when we have
	    // better performance; i.e., when this code can get moved into
	    // the UCharacter class and not have to iterate over code
	    // points.  Right now it's way too slow to iterate to 10FFFF.

	    for (int32_t i=UnicodeSet::MIN_VALUE; i<=0xFFFF/*TEMPORARY*/; ++i) {
	        if ((*filter)((UChar32) i, context)) {
	            if ((end+1) == i) {
	                // Adjacent to the current run; extend it
	                end = i;
	            } else {
	                // Gap seen; flush the previous run and start anew
	                if (start >= 0) {
	                    set.add((UChar32)start, (UChar32)end);
	                }
	                start = end = i;
	            }
	        }
	    }

	    // Flush the final run, if any
	    if (start >= 0) {
	        set.add((UChar32)start, (UChar32)end);
	    }
	}

	//----------------------------------------------------------------
	// Type and value name maps
	//----------------------------------------------------------------

	/**
	* Add a type mapping to the name map.
	*/
	void UnicodePropertySet::addType(const UnicodeString& shortName,
	const UnicodeString& longName,
	SetFactory factory) {
	UErrorCode ec = U_ZERO_ERROR;
	void* p = setFactoryToVoidPtr(factory);
	NAME_MAP->put(shortName, p, ec);
	NAME_MAP->put(longName, p, ec);
	}

	/**
	 * Register an integer value in the given map under its short name
	 * and, when one is supplied, under its long name as well.
	 *
	 * @param map       the table receiving the entries
	 * @param shortName abbreviated value name; always inserted
	 * @param longName  full value name; skipped when it is empty
	 * @param value     the integer stored under each inserted key
	 */
	void UnicodePropertySet::addValue(Hashtable* map,
	                                  const UnicodeString& shortName,
	                                  const UnicodeString& longName,
	                                  int32_t value) {
	    // assert(value != 0);
	    UErrorCode status = U_ZERO_ERROR;
	    map->puti(shortName, value, status);
	    if (longName.length() == 0) {
	        // No long form for this value (e.g. a special-case entry).
	        return;
	    }
	    map->puti(longName, value, status);
	}

	void UnicodePropertySet::init() {
	if (NAME_MAP != NULL) {
	return;
	}

	NAME_MAP = new Hashtable(TRUE);
	CATEGORY_MAP = new Hashtable(TRUE);
	CATEGORY_CACHE = new UnicodeSet[U_CHAR_CATEGORY_COUNT];
	SCRIPT_CACHE = new UnicodeSet[USCRIPT_CODE_LIMIT];

	// NOTE: We munge all search keys to have no whitespace
	// and upper case. As such, all stored keys should have
	// this format.

	// Load the map with type data

	addType("GC", "GENERALCATEGORY", createCategorySet);

	//addType("CC", "COMBININGCLASS", COMBINING_CLASS);
	//addType("BC", "BIDICLASS", BIDI_CLASS);
	//addType("DT", "DECOMPOSITIONTYPE", DECOMPOSITION_TYPE);

	addType("NV", "NUMERICVALUE", createNumericValueSet);

	//addType("NT", "NUMERICTYPE", NUMERIC_TYPE);
	//addType("EA", "EASTASIANWIDTH", EAST_ASIAN_WIDTH);
	//addType("LB", "LINEBREAK", LINE_BREAK);
	//addType("JT", "JOININGTYPE", JOINING_TYPE);

	addType("SC", "SCRIPT", createScriptSet);

	// Load the map with value data

	// General Category

	addValue(CATEGORY_MAP, "ANY", "", ANY); // special case

	addValue(CATEGORY_MAP, "C", "OTHER",
	(1 << U_CONTROL_CHAR) \|
	(1 << U_FORMAT_CHAR) \|
	(1 << U_GENERAL_OTHER_TYPES) \|
	(1 << U_PRIVATE_USE_CHAR) \|
	(1 << U_SURROGATE));

	addValue(CATEGORY_MAP, "CC", "CONTROL",
	1 << U_CONTROL_CHAR);
	addValue(CATEGORY_MAP, "CF", "FORMAT",
	1 << U_FORMAT_CHAR);
	addValue(CATEGORY_MAP, "CN", "UNASSIGNED",
	1 << U_GENERAL_OTHER_TYPES);
	addValue(CATEGORY_MAP, "CO", "PRIVATEUSE",
	1 << U_PRIVATE_USE_CHAR);
	addValue(CATEGORY_MAP, "CS", "SURROGATE",
	1 << U_SURROGATE);

	addValue(CATEGORY_MAP, "L", "LETTER",
	(1 << U_LOWERCASE_LETTER) \|
	(1 << U_MODIFIER_LETTER) \|
	(1 << U_OTHER_LETTER) \|
	(1 << U_TITLECASE_LETTER) \|
	(1 << U_UPPERCASE_LETTER));

	addValue(CATEGORY_MAP, "LL", "LOWERCASELETTER",
	1 << U_LOWERCASE_LETTER);
	addValue(CATEGORY_MAP, "LM", "MODIFIERLETTER",
	1 << U_MODIFIER_LETTER);
	addValue(CATEGORY_MAP, "LO", "OTHERLETTER",
	1 << U_OTHER_LETTER);
	addValue(CATEGORY_MAP, "LT", "TITLECASELETTER",
	1 << U_TITLECASE_LETTER);
	addValue(CATEGORY_MAP, "LU", "UPPERCASELETTER",
	1 << U_UPPERCASE_LETTER);

	addValue(CATEGORY_MAP, "M", "MARK",
	(1 << U_NON_SPACING_MARK) \|
	(1 << U_COMBINING_SPACING_MARK) \|
	(1 << U_ENCLOSING_MARK));

	addValue(CATEGORY_MAP, "MN", "NONSPACINGMARK",
	1 << U_NON_SPACING_MARK);
	addValue(CATEGORY_MAP, "MC", "SPACINGMARK",
	1 << U_COMBINING_SPACING_MARK);
	addValue(CATEGORY_MAP, "ME", "ENCLOSINGMARK",
	1 << U_ENCLOSING_MARK);

	addValue(CATEGORY_MAP, "N", "NUMBER",
	(1 << U_DECIMAL_DIGIT_NUMBER) \|
	(1 << U_LETTER_NUMBER) \|
	(1 << U_OTHER_NUMBER));

	addValue(CATEGORY_MAP, "ND", "DECIMALNUMBER",
	1 << U_DECIMAL_DIGIT_NUMBER);
	addValue(CATEGORY_MAP, "NL", "LETTERNUMBER",
	1 << U_LETTER_NUMBER);
	addValue(CATEGORY_MAP, "NO", "OTHERNUMBER",
	1 << U_OTHER_NUMBER);

	addValue(CATEGORY_MAP, "P", "PUNCTUATION",
	(1 << U_CONNECTOR_PUNCTUATION) \|
	(1 << U_DASH_PUNCTUATION) \|
	(1 << U_END_PUNCTUATION) \|
	(1 << U_FINAL_PUNCTUATION) \|
	(1 << U_INITIAL_PUNCTUATION) \|
	(1 << U_OTHER_PUNCTUATION) \|
	(1 << U_START_PUNCTUATION));

	addValue(CATEGORY_MAP, "PC", "CONNECTORPUNCTUATION",
	1 << U_CONNECTOR_PUNCTUATION);
	addValue(CATEGORY_MAP, "PD", "DASHPUNCTUATION",
	1 << U_DASH_PUNCTUATION);
	addValue(CATEGORY_MAP, "PE", "ENDPUNCTUATION",
	1 << U_END_PUNCTUATION);
	addValue(CATEGORY_MAP, "PF", "FINALPUNCTUATION",
	1 << U_FINAL_PUNCTUATION);
	addValue(CATEGORY_MAP, "PI", "INITIALPUNCTUATION",
	1 << U_INITIAL_PUNCTUATION);
	addValue(CATEGORY_MAP, "PO", "OTHERPUNCTUATION",
	1 << U_OTHER_PUNCTUATION);
	addValue(CATEGORY_MAP, "PS", "STARTPUNCTUATION",
	1 << U_START_PUNCTUATION);

	addValue(CATEGORY_MAP, "S", "SYMBOL",
	(1 << U_CURRENCY_SYMBOL) \|
	(1 << U_MODIFIER_SYMBOL) \|
	(1 << U_MATH_SYMBOL) \|
	(1 << U_OTHER_SYMBOL));

	addValue(CATEGORY_MAP, "SC", "CURRENCYSYMBOL",
	1 << U_CURRENCY_SYMBOL);
	addValue(CATEGORY_MAP, "SK", "MODIFIERSYMBOL",
	1 << U_MODIFIER_SYMBOL);
	addValue(CATEGORY_MAP, "SM", "MATHSYMBOL",
	1 << U_MATH_SYMBOL);
	addValue(CATEGORY_MAP, "SO", "OTHERSYMBOL",
	1 << U_OTHER_SYMBOL);

	addValue(CATEGORY_MAP, "Z", "SEPARATOR",
	(1 << U_LINE_SEPARATOR) \|
	(1 << U_PARAGRAPH_SEPARATOR) \|
	(1 << U_SPACE_SEPARATOR));

	addValue(CATEGORY_MAP, "ZL", "LINESEPARATOR",
	1 << U_LINE_SEPARATOR);
	addValue(CATEGORY_MAP, "ZP", "PARAGRAPHSEPARATOR",
	1 << U_PARAGRAPH_SEPARATOR);
	addValue(CATEGORY_MAP, "ZS", "SPACESEPARATOR",
	1 << U_SPACE_SEPARATOR);
	}

	/**
	 * Free the two hashtables and the two set arrays held in the static
	 * pointers NAME_MAP, CATEGORY_MAP, CATEGORY_CACHE and SCRIPT_CACHE,
	 * resetting each pointer to NULL.  Does nothing when NAME_MAP is
	 * already NULL.
	 */
	void UnicodePropertySet::cleanup() {
	    if (NAME_MAP == NULL) {
	        return;
	    }

	    delete NAME_MAP;
	    NAME_MAP = NULL;

	    delete CATEGORY_MAP;
	    CATEGORY_MAP = NULL;

	    // The caches are arrays of UnicodeSet, hence delete[].
	    delete[] CATEGORY_CACHE;
	    CATEGORY_CACHE = NULL;

	    delete[] SCRIPT_CACHE;
	    SCRIPT_CACHE = NULL;
	}

	U_NAMESPACE_END

	//eof