src/com/ibm/icu/lang/UCharacter.java - external/github.com/unicode-org/icu - Git at Google

 /**
 *******************************************************************************
 * Copyright (C) 1996-2001, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/lang/UCharacter.java,v $
 * $Date: 2002/04/05 01:38:15 $
 * $Revision: 1.39 $
 *
 *******************************************************************************
 */

 package com.ibm.icu.lang;

 import java.util.Locale;
 import com.ibm.icu.impl.UCharacterProperty;
 import com.ibm.icu.impl.Utility;
 import com.ibm.icu.util.RangeValueIterator;
 import com.ibm.icu.util.ValueIterator;
 import com.ibm.icu.util.VersionInfo;
 import com.ibm.icu.text.BreakIterator;
 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.impl.NormalizerImpl;

 /**
 * <p>
 * The UCharacter class provides extensions to the
 * <a href=http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.html>
 * java.lang.Character</a> class. These extensions provide support for
 * Unicode 3.1 properties and together with the <a href=UTF16.html>UTF16</a>
 * class, provide support for supplementary characters (those with code
 * points above U+FFFF).
 * </p>
 * <p>
 * Code points are represented in these API using ints. While it would be
 * more convenient in Java to have a separate primitive datatype for them,
 * ints suffice in the meantime.
 * </p>
 * <p>
 * To use this class please add the jar file name icu4j.jar to the
 * class path, since it contains data files which supply the information used
 * by this file.<br>
 * E.g. In Windows <br>
 * <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar</code>.<br>
 * Otherwise, another method would be to copy the files uprops.dat and
 * unames.dat from the icu4j source subdirectory
 * <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory
 * <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>.
 * </p>
 * <p>
 * Aside from the additions for UTF-16 support, and the updated Unicode 3.1
 * properties, the main differences between UCharacter and Character are:
 * <ul>
 * <li> UCharacter is not designed to be a char wrapper and does not have
 *      APIs to which involves management of that single char.<br>
 *      These include:
 *      <ul>
 *        <li> char charValue(),
 *        <li> int compareTo(java.lang.Character, java.lang.Character), etc.
 *      </ul>
 * <li> UCharacter does not include Character APIs that are deprecated, not
 *      does it include the Java-specific character information, such as
 *      boolean isJavaIdentifierPart(char ch).
 * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric
 *      values '10' - '35'. UCharacter also does this in digit and
 *      getNumericValue, to adhere to the java semantics of these
 *      methods.  New methods unicodeDigit, and
 *      getUnicodeNumericValue do not treat the above code points
 *      as having numeric values.  This is a semantic change from ICU4J 1.3.1.
 * </ul>
 * <p>
 * Further detail differences can be determined from the program
 *        <a href = http://oss.software.ibm.com/developerworks/opensource/cvs/icu4j/~checkout~/icu4j/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java>
 *        com.ibm.icu.dev.test.lang.UCharacterCompare</a>
 * </p>
 * @author Syn Wee Quek
 * @since oct 06 2000
 * @see com.ibm.icu.lang.UCharacterCategory
 * @see com.ibm.icu.lang.UCharacterDirection
 */

 public final class UCharacter
 {
     // public data members -----------------------------------------------

     /**
     * The lowest Unicode code point value.
     */
     public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;

     /**
     * The highest Unicode code point value (scalar value) according to the
     * Unicode Standard.
     * This is a 21-bit value (21 bits, rounded up).<br>
     * Up-to-date Unicode implementation of java.lang.Character.MIN_VALUE
     */
     public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;

     /**
     * The minimum value for Supplementary code points
     */
     public static final int SUPPLEMENTARY_MIN_VALUE =
                                           UTF16.SUPPLEMENTARY_MIN_VALUE;

     /**
     * Unicode value used when translating into Unicode encoding form and there
     * is no existing character.
     */
 	public static final int REPLACEMENT_CHAR = '\uFFFD';


     // public methods ----------------------------------------------------

     /**
     * Retrieves the numeric value of a decimal digit code point.
     * <br>This method observes the semantics of
     * <code>java.lang.Character.digit()</code>.  Note that this
     * will return positive values for code points for which isDigit
     * returns false, just like java.lang.Character.
     * <br><em>Semantic Change:</em> In release 1.3.1 and
     * prior, this did not treat the European letters as having a
     * digit value, and also treated numeric letters and other numbers as
     * digits.
     * This has been changed to conform to the java semantics.
     * <br>A code point is a valid digit if and only if:
     * <ul>
     *   <li>ch is a decimal digit or one of the european letters, and
     *   <li>the value of ch is less than the specified radix.
     * </ul>
     * @param ch the code point to query
     * @param radix the radix
     * @return the numeric value represented by the code point in the
     * specified radix, or -1 if the code point is not a decimal digit
     * or if its value is too large for the radix
     */
     public static int digit(int ch, int radix)
     {
         int props       = getProps(ch);
         int numericType = UCharacterProperty.getNumericType(props);

         int result = -1;
         if (numericType == UCharacterProperty.DECIMAL_DIGIT_NUMERIC_TYPE_) {
         	// if props == 0, it will just fall through and return -1
         	if (!UCharacterProperty.isExceptionIndicator(props)) {
             	// not contained in exception data
             	result = UCharacterProperty.getSignedValue(props);
             }
             else {
             	int index = UCharacterProperty.getExceptionIndex(props);
             	if (PROPERTY_.hasExceptionValue(index,
                                    UCharacterProperty.EXC_NUMERIC_VALUE_)) {
                 	return PROPERTY_.getException(index,
                                       UCharacterProperty.EXC_NUMERIC_VALUE_);
                 }
             }
         }

         if (result < 0 && radix > 10) {
             result = getEuropeanDigit(ch);
         }

         if (result < 0 || result >= radix) {
             return -1;
         }
         return result;
     }

     /**
     * Retrieves the numeric value of a decimal digit code point.
     * <br>This is a convenience overload of <code>digit(int, int)</code>
     * that provides a decimal radix.
     * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
     * treated numeric letters and other numbers as digits.  This has
     * been changed to conform to the java semantics.
     * @param ch the code point to query
     * @return the numeric value represented by the code point,
     * or -1 if the code point is not a decimal digit or if its
     * value is too large for a decimal radix
     */
     public static int digit(int ch)
     {
         return digit(ch, DECIMAL_RADIX_);
     }

    /**
     * Returns the numeric value of the code point as a nonnegative
     * integer.
     * <br>If the code point does not have a numeric value, then -1 is returned.
     * <br>
     * If the code point has a numeric value that cannot be represented as a
     * nonnegative integer (for example, a fractional value), then -2 is
     * returned.
     * <br><em>Semantic Change:</em> In release 1.3.1 and
     * prior, this returned -1 for ASCII letters and their
     * fullwidth counterparts.  This has been changed to
     * conform to the java semantics.
     * @param ch the code point to query
     * @return the numeric value of the code point, or -1 if it has no numeric
     * value, or -2 if it has a numeric value that cannot be represented as a
     * nonnegative integer
     */
     public static int getNumericValue(int ch)
     {
         return getNumericValueInternal(ch, true);
     }

    /**
     * Returns the Unicode numeric value of the code point as a nonnegative
     * integer.
     * <br>If the code point does not have a numeric value, then -1 is returned. <br>
     * If the code point has a numeric value that cannot be represented as a
     * nonnegative integer (for example, a fractional value), then -2 is
     * returned.
     * This returns values other than -1 for all and only those code points
     * whose type is a numeric type.
     * @param ch the code point to query
     * @return the numeric value of the code point, or -1 if it has no numeric
     * value, or -2 if it has a numeric value that cannot be represented as a
     * nonnegative integer
     */
     public static int getUnicodeNumericValue(int ch)
     {
         return getNumericValueInternal(ch, false);
     }

     /**
     * Returns a value indicating a code point's Unicode category.
     * Up-to-date Unicode implementation of java.lang.Character.getType() except
     * for the above mentioned code points that had their category changed.<br>
     * Return results are constants from the interface
     * <a href=UCharacterCategory.html>UCharacterCategory</a>
     * @param ch code point whose type is to be determined
     * @return category which is a value of UCharacterCategory
     */
     public static int getType(int ch)
     {
         return UCharacterProperty.getPropType(getProps(ch));
     }

     /**
     * Determines if a code point has a defined meaning in the up-to-date Unicode
     * standard.
     * E.g. supplementary code points though allocated space are not defined in
     * Unicode yet.<br>
     * Up-to-date Unicode implementation of java.lang.Character.isDefined()
     * @param ch code point to be determined if it is defined in the most current
     *        version of Unicode
     * @return true if this code point is defined in unicode
     */
     public static boolean isDefined(int ch)
     {
         return getProps(ch) != 0;
     }

    /**
     * Determines if a code point is a Java digit.
     * <br>This method observes the semantics of
     * <code>java.lang.Character.isDigit()</code>.  It returns true for
     * decimal digits only.
     * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
     * treated numeric letters and other numbers as digits.  This has
     * been changed to conform to the java semantics.
     * @param ch code point to query
     * @return true if this code point is a digit */
     public static boolean isDigit(int ch)
     {
         return getType(ch) == UCharacterCategory.DECIMAL_DIGIT_NUMBER;
     }

     /**
     * Determines if the specified code point is an ISO control character.
     * A code point is considered to be an ISO control character if it is in the
     * range &#92u0000 through &#92u001F or in the range &#92u007F through
     * &#92u009F.<br>
     * Up-to-date Unicode implementation of java.lang.Character.isISOControl()
     * @param ch code point to determine if it is an ISO control character
     * @return true if code point is a ISO control character
     */
     public static boolean isISOControl(int ch)
     {
         return ch >= 0 && ch <= APPLICATION_PROGRAM_COMMAND_ &&
             ((ch <= UNIT_SEPERATOR_) || (ch >= DELETE_));
     }

     /**
     * Determines if the specified code point is a letter.
     * Up-to-date Unicode implementation of java.lang.Character.isLetter()
     * @param ch code point to determine if it is a letter
     * @return true if code point is a letter
     */
     public static boolean isLetter(int ch)
     {
         int cat = getType(ch);
         // if props == 0, it will just fall through and return false
         return cat == UCharacterCategory.UPPERCASE_LETTER ||
             cat == UCharacterCategory.LOWERCASE_LETTER ||
             cat == UCharacterCategory.TITLECASE_LETTER ||
             cat == UCharacterCategory.MODIFIER_LETTER ||
             cat == UCharacterCategory.OTHER_LETTER;
     }

     /**
     * Determines if the specified code point is a letter or digit.
     * Note this method, unlike java.lang.Character does not regard the ascii
     * characters 'A' - 'Z' and 'a' - 'z' as digits.
     * @param ch code point to determine if it is a letter or a digit
     * @return true if code point is a letter or a digit
     */
     public static boolean isLetterOrDigit(int ch)
     {
         return isDigit(ch) || isLetter(ch);
     }

     /**
     * Determines if the specified code point is a lowercase character.
     * UnicodeData only contains case mappings for code points where they are
     * one-to-one mappings; it also omits information about context-sensitive
     * case mappings.<br> For more information about Unicode case mapping please
     * refer to the <a href=http://www.unicode.org/unicode/reports/tr21/>
     * Technical report #21</a>.<br>
     * Up-to-date Unicode implementation of java.lang.Character.isLowerCase()
     * @param ch code point to determine if it is in lowercase
     * @return true if code point is a lowercase character
     */
     public static boolean isLowerCase(int ch)
     {
         // if props == 0, it will just fall through and return false
         return getType(ch) == UCharacterCategory.LOWERCASE_LETTER;
     }

     /**
     * Determines if the specified code point is a white space character.
     * A code point is considered to be an whitespace character if and only
     * if it satisfies one of the following criteria:
     * <ul>
     * <li> It is a Unicode space separator (category "Zs"), but is not
     *      a no-break space (&#92u00A0 or &#92u202F or &#92uFEFF).
     * <li> It is a Unicode line separator (category "Zl").
     * <li> It is a Unicode paragraph separator (category "Zp").
     * <li> It is &#92u0009, HORIZONTAL TABULATION.
     * <li> It is &#92u000A, LINE FEED.
     * <li> It is &#92u000B, VERTICAL TABULATION.
     * <li> It is &#92u000C, FORM FEED.
     * <li> It is &#92u000D, CARRIAGE RETURN.
     * <li> It is &#92u001C, FILE SEPARATOR.
     * <li> It is &#92u001D, GROUP SEPARATOR.
     * <li> It is &#92u001E, RECORD SEPARATOR.
     * <li> It is &#92u001F, UNIT SEPARATOR.
     * </ul>
     *
     * Up-to-date Unicode implementation of java.lang.Character.isWhitespace().
     * @param ch code point to determine if it is a white space
     * @return true if the specified code point is a white space character
     */
     public static boolean isWhitespace(int ch)
     {
         int cat = getType(ch);
         // exclude no-break spaces
         // if props == 0, it will just fall through and return false
         return (cat == UCharacterCategory.SPACE_SEPARATOR ||
                 cat == UCharacterCategory.LINE_SEPARATOR ||
                 cat == UCharacterCategory.PARAGRAPH_SEPARATOR) &&
                 (ch != NO_BREAK_SPACE_) &&
                 (ch != NARROW_NO_BREAK_SPACE_) &&
                 (ch != ZERO_WIDTH_NO_BREAK_SPACE_) ||
                 // TAB VT LF FF CR FS GS RS US NL are all control characters
                 // that are white spaces.
                 (ch >= 0x9 && ch <= 0xd) || (ch >= 0x1c && ch <= 0x1f);
     }

     /**
     * Determines if the specified code point is a Unicode specified space
     * character, i.e. if code point is in the category Zs, Zl and Zp.
     * Up-to-date Unicode implementation of java.lang.Character.isSpaceChar().
     * @param ch code point to determine if it is a space
     * @return true if the specified code point is a space character
     */
     public static boolean isSpaceChar(int ch)
     {
         int cat = getType(ch);
         // if props == 0, it will just fall through and return false
         return cat == UCharacterCategory.SPACE_SEPARATOR ||
             cat == UCharacterCategory.LINE_SEPARATOR ||
             cat == UCharacterCategory.PARAGRAPH_SEPARATOR;
     }

     /**
     * Determines if the specified code point is a titlecase character.
     * UnicodeData only contains case mappings for code points where they are
     * one-to-one mappings; it also omits information about context-sensitive
     * case mappings.<br>
     * For more information about Unicode case mapping please refer to the
     * <a href=http://www.unicode.org/unicode/reports/tr21/>
     * Technical report #21</a>.<br>
     * Up-to-date Unicode implementation of java.lang.Character.isTitleCase().
     * @param ch code point to determine if it is in title case
     * @return true if the specified code point is a titlecase character
     */
     public static boolean isTitleCase(int ch)
     {
         int cat = getType(ch);
         // if props == 0, it will just fall through and return false
         return cat == UCharacterCategory.TITLECASE_LETTER;
     }

     /**
     * Determines if the specified code point may be any part of a Unicode
     * identifier other than the starting character.
     * A code point may be part of a Unicode identifier if and only if it is one
     * of the following:
     * <ul>
     * <li> Lu Uppercase letter
     * <li> Ll Lowercase letter
     * <li> Lt Titlecase letter
     * <li> Lm Modifier letter
     * <li> Lo Other letter
     * <li> Nl Letter number
     * <li> Pc Connecting punctuation character
     * <li> Nd decimal number
     * <li> Mc Spacing combining mark
     * <li> Mn Non-spacing mark
     * <li> Cf formatting code
     * </ul>
     * Up-to-date Unicode implementation of
     * java.lang.Character.isUnicodeIdentifierPart().<br>
     * See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>.
     * @param ch code point to determine if is can be part of a Unicode identifier
     * @return true if code point is any character belonging a unicode identifier
     *         suffix after the first character
     */
     public static boolean isUnicodeIdentifierPart(int ch)
     {
         int cat = getType(ch);
         // if props == 0, it will just fall through and return false
         return cat == UCharacterCategory.UPPERCASE_LETTER ||
             cat == UCharacterCategory.LOWERCASE_LETTER ||
             cat == UCharacterCategory.TITLECASE_LETTER ||
             cat == UCharacterCategory.MODIFIER_LETTER ||
             cat == UCharacterCategory.OTHER_LETTER ||
             cat == UCharacterCategory.LETTER_NUMBER ||
             cat == UCharacterCategory.CONNECTOR_PUNCTUATION ||
             cat == UCharacterCategory.DECIMAL_DIGIT_NUMBER ||
             cat == UCharacterCategory.COMBINING_SPACING_MARK ||
             cat == UCharacterCategory.NON_SPACING_MARK ||
             // cat == UCharacterCategory.FORMAT;
             isIdentifierIgnorable(ch);
     }

     /**
     * Determines if the specified code point is permissible as the first
     * character in a Unicode identifier.
     * A code point may start a Unicode identifier if it is of type either
     * <ul>
     * <li> Lu Uppercase letter
     * <li> Ll Lowercase letter
     * <li> Lt Titlecase letter
     * <li> Lm Modifier letter
     * <li> Lo Other letter
     * <li> Nl Letter number
     * </ul>
     * Up-to-date Unicode implementation of
     * java.lang.Character.isUnicodeIdentifierStart().<br>
     * See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>.
     * @param ch code point to determine if it can start a Unicode identifier
     * @return true if code point is the first character belonging a unicode
     *              identifier
     */
     public static boolean isUnicodeIdentifierStart(int ch)
     {
         int cat = getType(ch);
         // if props == 0, it will just fall through and return false
         return cat == UCharacterCategory.UPPERCASE_LETTER ||
             cat == UCharacterCategory.LOWERCASE_LETTER ||
             cat == UCharacterCategory.TITLECASE_LETTER ||
             cat == UCharacterCategory.MODIFIER_LETTER ||
             cat == UCharacterCategory.OTHER_LETTER ||
             cat == UCharacterCategory.LETTER_NUMBER;
     }

     /**
     * Determines if the specified code point should be regarded as an ignorable
     * character in a Unicode identifier.
     * A character is ignorable in the Unicode standard if it is of the type Cf,
     * Formatting code.<br>
     * Up-to-date Unicode implementation of
     * java.lang.Character.isIdentifierIgnorable().<br>
     * See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>.
     * @param ch code point to be determined if it can be ignored in a Unicode
     *        identifier.
     * @return true if the code point is ignorable
     */
     public static boolean isIdentifierIgnorable(int ch)
     {
         // see java.lang.Character.isIdentifierIgnorable() on range of
         // ignorable characters.
         return ch <= 8 || (ch >= 0xe && ch <= 0x1b) ||
                (ch >= 0x7f && ch <= 0x9f) ||
                getType(ch) == UCharacterCategory.FORMAT;
     }

     /**
     * Determines if the specified code point is an uppercase character.
     * UnicodeData only contains case mappings for code point where they are
     * one-to-one mappings; it also omits information about context-sensitive
     * case mappings.<br>
     * For language specific case conversion behavior, use
     * toUpperCase(locale, str). <br>
     * For example, the case conversion for dot-less i and dotted I in Turkish,
     * or for final sigma in Greek.
     * For more information about Unicode case mapping please refer to the
     * <a href=http://www.unicode.org/unicode/reports/tr21/>
     * Technical report #21</a>.<br>
     * Up-to-date Unicode implementation of java.lang.Character.isUpperCase().
     * @param ch code point to determine if it is in uppercase
     * @return true if the code point is an uppercase character
     */
     public static boolean isUpperCase(int ch)
     {
         int cat = getType(ch);
         // if props == 0, it will just fall through and return false
         return cat == UCharacterCategory.UPPERCASE_LETTER;
     }

     /**
     * The given code point is mapped to its lowercase equivalent; if the code
     * point has no lowercase equivalent, the code point itself is returned.
     * UnicodeData only contains case mappings for code point where they are
     * one-to-one mappings; it also omits information about context-sensitive
     * case mappings.<br>
     * For language specific case conversion behavior, use
     * toLowerCase(locale, str). <br>
     * For example, the case conversion for dot-less i and dotted I in Turkish,
     * or for final sigma in Greek.
     * For more information about Unicode case mapping please refer to the
     * <a href=http://www.unicode.org/unicode/reports/tr21/>
     * Technical report #21</a>.<br>
     * Up-to-date Unicode implementation of java.lang.Character.toLowerCase()
     * @param ch code point whose lowercase equivalent is to be retrieved
     * @return the lowercase equivalent code point
     */
     public static int toLowerCase(int ch)
     {
         int props = getProps(ch);
         // if props == 0, it will just fall through and return itself
         if(!UCharacterProperty.isExceptionIndicator(props)) {
             int cat = UCharacterProperty.getPropType(props);
             if (cat == UCharacterCategory.UPPERCASE_LETTER ||
                 cat == UCharacterCategory.TITLECASE_LETTER) {
                 return ch + UCharacterProperty.getSignedValue(props);
             }
         }
         else
         {
             int index = UCharacterProperty.getExceptionIndex(props);
             if (PROPERTY_.hasExceptionValue(index,
                                        UCharacterProperty.EXC_LOWERCASE_)) {
                 return PROPERTY_.getException(index,
                                          UCharacterProperty.EXC_LOWERCASE_);
             }
         }
         return ch;
     }

     /**
     * Converts argument code point and returns a String object representing the
     * code point's value in UTF16 format.
     * The result is a string whose length is 1 for non-supplementary code points,
     * 2 otherwise.<br>
     * com.ibm.ibm.icu.UTF16 can be used to parse Strings generated by this
     * function.<br>
     * Up-to-date Unicode implementation of java.lang.Character.toString()
     * @param ch code point
     * @return string representation of the code point, null if code point is not
     *         defined in unicode
     */
     public static String toString(int ch)
     {
         if (ch < MIN_VALUE || ch > MAX_VALUE) {
             return null;
         }

         if (ch < SUPPLEMENTARY_MIN_VALUE) {
             return String.valueOf((char)ch);
         }

         StringBuffer result = new StringBuffer();
         result.append(UTF16.getLeadSurrogate(ch));
         result.append(UTF16.getTrailSurrogate(ch));
         return result.toString();
     }

     /**
     * Converts the code point argument to titlecase.
     * UnicodeData only contains case mappings for code points where they are
     * one-to-one mappings; it also omits information about context-sensitive
     * case mappings.<br>
     * There are only four Unicode characters that are truly titlecase forms
     * that are distinct from uppercase forms.
     * For more information about Unicode case mapping please refer
     * to the <a href=http://www.unicode.org/unicode/reports/tr21/>
     * Technical report #21</a>.<br>
     * If no titlecase is available, the uppercase is returned. If no uppercase
     * is available, the code point itself is returned.<br>
     * Up-to-date Unicode implementation of java.lang.Character.toTitleCase()
     * @param ch code point  whose title case is to be retrieved
     * @return titlecase code point
     */
     public static int toTitleCase(int ch)
     {
         int props = getProps(ch);
         // if props == 0, it will just fall through and return itself
         if (!UCharacterProperty.isExceptionIndicator(props)) {
             if (UCharacterProperty.getPropType(props) ==
                 UCharacterCategory.LOWERCASE_LETTER) {
                 // here, titlecase is same as uppercase
                 return ch - UCharacterProperty.getSignedValue(props);
             }
         }
         else {
             int index = UCharacterProperty.getExceptionIndex(props);
             if (PROPERTY_.hasExceptionValue(index,
                                          UCharacterProperty.EXC_TITLECASE_)) {
                 return PROPERTY_.getException(index,
                                          UCharacterProperty.EXC_TITLECASE_);
             }
             else {
                 // here, titlecase is same as uppercase
                 if (PROPERTY_.hasExceptionValue(index,
                                        UCharacterProperty.EXC_UPPERCASE_)) {
                     return PROPERTY_.getException(index,
                                          UCharacterProperty.EXC_UPPERCASE_);
                 }
             }
         }
         return ch; // no mapping - return c itself
     }

     /**
     * Converts the character argument to uppercase.
     * UnicodeData only contains case mappings for characters where they are
     * one-to-one mappings; it also omits information about context-sensitive
     * case mappings.<br>
     * For more information about Unicode case mapping please refer
     * to the <a href=http://www.unicode.org/unicode/reports/tr21/>
     * Technical report #21</a>.<br>
     * If no uppercase is available, the character itself is returned.<br>
     * Up-to-date Unicode implementation of java.lang.Character.toUpperCase()
     * @param ch code point whose uppercase is to be retrieved
     * @return uppercase code point
     */
     public static int toUpperCase(int ch)
     {
         int props = getProps(ch);
         // if props == 0, it will just fall through and return itself
         if (!UCharacterProperty.isExceptionIndicator(props)) {
             if (UCharacterProperty.getPropType(props) ==
                 UCharacterCategory.LOWERCASE_LETTER) {
                 // here, titlecase is same as uppercase */
                 return ch - UCharacterProperty.getSignedValue(props);
             }
         }
         else
         {
             int index = UCharacterProperty.getExceptionIndex(props);
             if (PROPERTY_.hasExceptionValue(index,
                                          UCharacterProperty.EXC_UPPERCASE_)) {
                 return PROPERTY_.getException(index,
                                          UCharacterProperty.EXC_UPPERCASE_);
             }
         }
         return ch; // no mapping - return c itself
     }

     // extra methods not in java.lang.Character --------------------------

     /**
     * Determines if the code point is a supplementary character.
     * A code point is a supplementary character if and only if it is greater than
     * <a href=#SUPPLEMENTARY_MIN_VALUE>SUPPLEMENTARY_MIN_VALUE</a>
     * @param ch code point to be determined if it is in the supplementary plane
     * @return true if code point is a supplementary character
     */
     public static boolean isSupplementary(int ch)
     {
         return ch >= UCharacter.SUPPLEMENTARY_MIN_VALUE &&
             ch <= UCharacter.MAX_VALUE;
     }

     /**
     * Determines if the code point is in the BMP plane.
     * @param ch code point to be determined if it is not a supplementary
     *        character
     * @return true if code point is not a supplementary character
     */
     public static boolean isBMP(int ch)
     {
         return (ch >= 0 && ch <= LAST_CHAR_MASK_);
     }

     /**
     * Determines whether the specified code point is a printable character
     * according to the Unicode standard.
     * @param ch code point to be determined if it is printable
     * @return true if the code point is a printable character
     */
     public static boolean isPrintable(int ch)
     {
         int cat = getType(ch);
         // if props == 0, it will just fall through and return false
         return (cat != UCharacterCategory.UNASSIGNED &&
             cat != UCharacterCategory.CONTROL &&
             cat != UCharacterCategory.FORMAT &&
             cat != UCharacterCategory.PRIVATE_USE &&
             cat != UCharacterCategory.SURROGATE &&
             cat != UCharacterCategory.GENERAL_OTHER_TYPES);
     }

     /**
     * Determines whether the specified code point is of base form.
     * A code point of base form does not graphically combine with preceding
     * characters, and is neither a control nor a format character.
     * @param ch code point to be determined if it is of base form
     * @return true if the code point is of base form
     */
     public static boolean isBaseForm(int ch)
     {
         int cat = getType(ch);
         // if props == 0, it will just fall through and return false
         return cat == UCharacterCategory.DECIMAL_DIGIT_NUMBER ||
             cat == UCharacterCategory.OTHER_NUMBER ||
             cat == UCharacterCategory.LETTER_NUMBER ||
             cat == UCharacterCategory.UPPERCASE_LETTER ||
             cat == UCharacterCategory.LOWERCASE_LETTER ||
             cat == UCharacterCategory.TITLECASE_LETTER ||
             cat == UCharacterCategory.MODIFIER_LETTER ||
             cat == UCharacterCategory.OTHER_LETTER ||
             cat == UCharacterCategory.NON_SPACING_MARK ||
             cat == UCharacterCategory.ENCLOSING_MARK ||
             cat == UCharacterCategory.COMBINING_SPACING_MARK;
     }

     /**
     * Returns the Bidirection property of a code point.
     * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional
     * property.<br>
     * Result returned belongs to the interface
     * <a href=UCharacterDirection.html>UCharacterDirection</a>
     * @param ch the code point to be determined its direction
     * @return direction constant from UCharacterDirection. Otherwise is
     *         character is not defined, UCharacterDirection.BOUNDARY_NEUTRAL
     *         will be returned.
     */
     public static int getDirection(int ch)
     {
         int props = getProps(ch);
         if (props != 0) {
             return UCharacterProperty.getDirection(props);
         }
         return UCharacterDirection.BOUNDARY_NEUTRAL;
     }

     /**
     * Determines whether the code point has the "mirrored" property.
     * This property is set for characters that are commonly used in
     * Right-To-Left contexts and need to be displayed with a "mirrored"
     * glyph.
     * @param ch code point whose mirror is to be determined
     * @return true if the code point has the "mirrored" property
     */
     public static boolean isMirrored(int ch)
     {
         int props = getProps(ch);
         // if props == 0, it will just fall through and return false
         return UCharacterProperty.isMirrored(props);
     }

     /**
     * Maps the specified code point to a "mirror-image" code point.
     * For code points with the "mirrored" property, implementations sometimes
     * need a "poor man's" mapping to another code point such that the default
     * glyph may serve as the mirror-image of the default glyph of the specified
     * code point.<br>
     * This is useful for text conversion to and from codepages with visual
     * order, and for displays without glyph selection capabilities.
     * @param ch code point whose mirror is to be retrieved
     * @return another code point that may serve as a mirror-image substitute, or
     *         ch itself if there is no such mapping or ch does not have the
     *         "mirrored" property
     */
     public static int getMirror(int ch)
     {
         int props = getProps(ch);
         // mirrored - the value is a mirror offset
         // if props == 0, it will just fall through and return false
         if (UCharacterProperty.isMirrored(props)) {
             if(!UCharacterProperty.isExceptionIndicator(props)) {
                 return ch + UCharacterProperty.getSignedValue(props);
             }
             else
             {
                 int index = UCharacterProperty.getExceptionIndex(props);
                 if (PROPERTY_.hasExceptionValue(index,
                                     UCharacterProperty.EXC_MIRROR_MAPPING_))
                 return PROPERTY_.getException(index,
                                      UCharacterProperty.EXC_MIRROR_MAPPING_);
             }
         }
         return ch;
     }

     /**
     * Gets the combining class of the argument codepoint
     * @param ch code point whose combining is to be retrieved
     * @return the combining class of the codepoint
     */
     public static int getCombiningClass(int ch)
     {
     	if (ch < MIN_VALUE || ch > MAX_VALUE) {
     		throw new IllegalArgumentException("Codepoint out of bounds");
     	}
     	return NormalizerImpl.getCombiningClass(ch);
     }

     /**
     * A code point is illegal if and only if
     * <ul>
     * <li> Out of bounds, less than 0 or greater than UCharacter.MAX_VALUE
     * <li> A surrogate value, 0xD800 to 0xDFFF
     * <li> Not-a-character, having the form 0x xxFFFF or 0x xxFFFE
     * </ul>
     * Note: legal does not mean that it is assigned in this version of Unicode.
     * @param ch code point to determine if it is a legal code point by itself
     * @return true if and only if legal.
     */
     public static boolean isLegal(int ch)
     {
         if (ch < MIN_VALUE) {
             return false;
         }
         if (ch < UTF16.SURROGATE_MIN_VALUE) {
             return true;
         }
         if (ch <= UTF16.SURROGATE_MAX_VALUE) {
             return false;
         }
         if (isNonCharacter(ch)) {
             return false;
         }
         return (ch <= MAX_VALUE);
     }

     /**
     * A string is legal iff all its code points are legal.
     * A code point is illegal if and only if
     * <ul>
     * <li> Out of bounds, less than 0 or greater than UCharacter.MAX_VALUE
     * <li> A surrogate value, 0xD800 to 0xDFFF
     * <li> Not-a-character, having the form 0x xxFFFF or 0x xxFFFE
     * </ul>
     * Note: legal does not mean that it is assigned in this version of Unicode.
     * @param ch code point to determine if it is a legal code point by itself
     * @return true if and only if legal.
     */
     public static boolean isLegal(String str)
     {
         int size = str.length();
         int codepoint;
         for (int i = 0; i < size; i ++)
         {
             codepoint = UTF16.charAt(str, i);
             if (!isLegal(codepoint)) {
                 return false;
             }
             if (isSupplementary(codepoint)) {
                 i ++;
             }
         }
         return true;
     }

     /**
     * Gets the version of Unicode data used.
     * @return the unicode version number used
     */
     public static VersionInfo getUnicodeVersion()
     {
         return PROPERTY_.m_unicodeVersion_;
     }

     /**
     * Retrieve the most current Unicode name of the argument code point, or
     * null if the character is unassigned or outside the range
     * UCharacter.MIN_VALUE and UCharacter.MAX_VALUE or does not have a name.
     * <br>
     * Note calling any methods related to code point names, e.g. get*Name*()
     * incurs a one-time initialisation cost to construct the name tables.
     * @param ch the code point for which to get the name
     * @return most current Unicode name
     */
     public static String getName(int ch)
     {
         return NAME_.getName(ch, UCharacterNameChoice.U_UNICODE_CHAR_NAME);
     }

     /**
     * Retrieve the earlier version 1.0 Unicode name of the argument code point,
     * or null if the character is unassigned or outside the range
     * UCharacter.MIN_VALUE and UCharacter.MAX_VALUE or does not have a name.
     * <br>
     * Note calling any methods related to code point names, e.g. get*Name*()
     * incurs a one-time initialisation cost to construct the name tables.
     * @param ch the code point for which to get the name
     * @return version 1.0 Unicode name
     */
     public static String getName1_0(int ch)
     {
         return NAME_.getName(ch,
                              UCharacterNameChoice.U_UNICODE_10_CHAR_NAME);
     }

     /**
     * <p>Retrieves a name for a valid codepoint. Unlike, getName(int) and
     * getName1_0(int), this method will return a name even for codepoints that
     * are not assigned a name in UnicodeData.txt.
     * </p>
     * The names are returned in the following order.
     * <ul>
     * <li> Most current Unicode name if there is any
     * <li> Unicode 1.0 name if there is any
     * <li> Extended name in the form of "<codepoint_type-codepoint_hex_digits>".
     *      E.g. <noncharacter-fffe>
     * </ul>
     * Note calling any methods related to code point names, e.g. get*Name*()
     * incurs a one-time initialisation cost to construct the name tables.
     * @param ch the code point for which to get the name
     * @return a name for the argument codepoint
     * @draft 2.1
     */
     public static String getExtendedName(int ch)
     {
         return NAME_.getName(ch, UCharacterNameChoice.U_EXTENDED_CHAR_NAME);
     }

     /**
     * <p>Find a Unicode code point by its most current Unicode name and
     * return its code point value. All Unicode names are in uppercase.</p>
     * Note calling any methods related to code point names, e.g. get*Name*()
     * incurs a one-time initialisation cost to construct the name tables.
     * @param name most current Unicode character name whose code point is to be
     *        returned
     * @return code point or -1 if name is not found
     */
     public static int getCharFromName(String name)
     {
         return NAME_.getCharFromName(
                             UCharacterNameChoice.U_UNICODE_CHAR_NAME, name);
     }

     /**
     * <p>Find a Unicode character by its version 1.0 Unicode name and return
     * its code point value. All Unicode names are in uppercase.</p>
     * Note calling any methods related to code point names, e.g. get*Name*()
     * incurs a one-time initialisation cost to construct the name tables.
     * @param name Unicode 1.0 code point name whose code point is to
     *             returned
     * @return code point or -1 if name is not found
     */
     public static int getCharFromName1_0(String name)
     {
         return NAME_.getCharFromName(
                          UCharacterNameChoice.U_UNICODE_10_CHAR_NAME, name);
     }

     /**
     * <p>Find a Unicode character by either its name and return its code
     * point value. All Unicode names are in uppercase.
     * Extended names are all lowercase except for numbers and are contained
     * within angle brackets.</p>
     * The names are searched in the following order
     * <ul>
     * <li> Most current Unicode name if there is any
     * <li> Unicode 1.0 name if there is any
     * <li> Extended name in the form of "<codepoint_type-codepoint_hex_digits>".
     *      E.g. <noncharacter-FFFE>
     * </ul>
     * Note calling any methods related to code point names, e.g. get*Name*()
     * incurs a one-time initialisation cost to construct the name tables.
     * @param name codepoint name
     * @return code point associated with the name or -1 if the name is not
     *         found.
     * @draft 2.1
     */
     public static int getCharFromExtendedName(String name)
     {
         return NAME_.getCharFromName(
                             UCharacterNameChoice.U_EXTENDED_CHAR_NAME, name);
     }

     /**
     * Returns a code point corresponding to the two UTF16 characters.
     * @param lead the lead char
     * @param trail the trail char
     * @return code point if surrogate characters are valid.
     * @exception IllegalArgumentException thrown when argument characters do
     *            not form a valid codepoint
     */
     public static int getCodePoint(char lead, char trail)
     {
         if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE &&
 	        lead <= UTF16.LEAD_SURROGATE_MAX_VALUE &&
             trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE &&
 	        trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
             return UCharacterProperty.getRawSupplementary(lead, trail);
         }
         throw new IllegalArgumentException("Illegal surrogate characters");
     }

     /**
     * Returns the code point corresponding to the UTF16 character.
     * @param char16 the UTF16 character
     * @return code point if argument is a valid character.
     * @exception IllegalArgumentException thrown when char16 is not a valid
     *            codepoint
     */
     public static int getCodePoint(char char16)
     {
         if (UCharacter.isLegal(char16)) {
             return char16;
         }
         throw new IllegalArgumentException("Illegal codepoint");
     }

     /**
     * Gets uppercase version of the argument string.
     * Casing is dependent on the default locale and context-sensitive.
     * @param str source string to be performed on
     * @return uppercase version of the argument string
     */
     public static String toUpperCase(String str)
     {
         return toUpperCase(Locale.getDefault(), str);
     }

     /**
     * Gets lowercase version of the argument string.
     * Casing is dependent on the default locale and context-sensitive
     * @param str source string to be performed on
     * @return lowercase version of the argument string
     */
     public static String toLowerCase(String str)
     {
         return toLowerCase(Locale.getDefault(), str);
     }

     /**
     * <p>Gets the titlecase version of the argument string.</p>
     * <p>Position for titlecasing is determined by the argument break
     * iterator, hence the user can customized his break iterator for
     * a specialized titlecasing. In this case only the forward iteration
     * needs to be implemented.
     * If the break iterator passed in is null, the default Unicode algorithm
     * will be used to determine the titlecase positions.
     * </p>
     * <p>Only positions returned by the break iterator will be title cased,
     * character in between the positions will all be in lower case.</p>
     * <p>Casing is dependent on the default locale and context-sensitive</p>
     * @param str source string to be performed on
     * @param breakiter break iterator to determine the positions in which
     *        the character should be title cased.
     * @return lowercase version of the argument string
     * @draft 2.1
     */
     public static String toTitleCase(String str, BreakIterator breakiter)
     {
         return toTitleCase(Locale.getDefault(), str, breakiter);
     }

     /**
     * Gets uppercase version of the argument string.
     * Casing is dependent on the argument locale and context-sensitive.
     * @param locale which string is to be converted in
     * @param str source string to be performed on
     * @return uppercase version of the argument string
     */
     public static String toUpperCase(Locale locale, String str)
     {
     	if (locale == null) {
     		locale = Locale.getDefault();
     	}
         return PROPERTY_.toUpperCase(locale, str, 0, str.length());
     }

     /**
     * Gets lowercase version of the argument string.
     * Casing is dependent on the argument locale and context-sensitive
     * @param locale which string is to be converted in
     * @param str source string to be performed on
     * @return lowercase version of the argument string
     */
     public static String toLowerCase(Locale locale, String str)
     {
     	int length = str.length();
     	StringBuffer result = new StringBuffer(length);
     	if (locale == null) {
     		locale = Locale.getDefault();
     	}
         PROPERTY_.toLowerCase(locale, str, 0, length, result);
         return result.toString();
     }

     /**
     * <p>Gets the titlecase version of the argument string.</p>
     * <p>Position for titlecasing is determined by the argument break
     * iterator, hence the user can customized his break iterator for
     * a specialized titlecasing. In this case only the forward iteration
     * needs to be implemented.
     * If the break iterator passed in is null, the default Unicode algorithm
     * will be used to determine the titlecase positions.
     * </p>
     * <p>Only positions returned by the break iterator will be title cased,
     * character in between the positions will all be in lower case.</p>
     * <p>Casing is dependent on the argument locale and context-sensitive</p>
     * @param locale which string is to be converted in
     * @param str source string to be performed on
     * @param breakiter break iterator to determine the positions in which
     *        the character should be title cased.
     * @return lowercase version of the argument string
     * @draft 2.1
     */
     public static String toTitleCase(Locale locale, String str,
                                      BreakIterator breakiter)
     {
         if (breakiter == null) {
         	if (locale == null) {
         		locale = Locale.getDefault();
         	}
             breakiter = BreakIterator.getWordInstance(locale);
         }
         return PROPERTY_.toTitleCase(locale, str, breakiter);
     }

     /**
     * The given character is mapped to its case folding equivalent according to
     * UnicodeData.txt and CaseFolding.txt; if the character has no case folding
     * equivalent, the character itself is returned.
     * Only "simple", single-code point case folding mappings are used.
     * For "full", multiple-code point mappings use the API
     * foldCase(String str, boolean defaultmapping).
     * @param ch             the character to be converted
     * @param defaultmapping Indicates if all mappings defined in CaseFolding.txt
     *                       is to be used, otherwise the mappings for dotted I
     *                       and dotless i marked with 'I' in CaseFolding.txt will
     *                       be skipped.
     * @return               the case folding equivalent of the character, if any;
     *                       otherwise the character itself.
     * @see                  #foldCase(String, boolean)
     */
     public static int foldCase(int ch, boolean defaultmapping)
     {
         int props = PROPERTY_.getProperty(ch);
         if (!UCharacterProperty.isExceptionIndicator(props)) {
             int type = UCharacterProperty.getPropType(props);
             if (type == UCharacterCategory.UPPERCASE_LETTER ||
                 type == UCharacterCategory.TITLECASE_LETTER) {
                 return ch + UCharacterProperty.getSignedValue(props);
             }
         }
         else {
             int index = UCharacterProperty.getExceptionIndex(props);
             if (PROPERTY_.hasExceptionValue(index,
                                       UCharacterProperty.EXC_CASE_FOLDING_)) {
                 int exception = PROPERTY_.getException(index,
                                       UCharacterProperty.EXC_CASE_FOLDING_);
                 if (exception != 0) {
                     int foldedcasech =
                          PROPERTY_.getFoldCase(exception & LAST_CHAR_MASK_);
                     if (foldedcasech != 0){
                         return foldedcasech;
                     }
                 }
                 else {
                     // special case folding mappings, hardcoded
                     if (defaultmapping &&
                         (ch ==
                            UCharacterProperty.LATIN_SMALL_LETTER_DOTLESS_I_ ||
                          ch ==
                     UCharacterProperty.LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_))
                     {
                         // map dotted I and dotless i to U+0069 small i
                         return UCharacterProperty.LATIN_SMALL_LETTER_I_;
                     }
                     // return ch itself because it is excluded from case folding
                     return ch;
                 }
             }
             if (PROPERTY_.hasExceptionValue(index,
                                        UCharacterProperty.EXC_LOWERCASE_)) {
                 // not else! - allow to fall through from above
                 return PROPERTY_.getException(index,
                                          UCharacterProperty.EXC_LOWERCASE_);
             }
         }

         return ch; // no mapping - return the character itself
     }

     /**
     * The given string is mapped to its case folding equivalent according to
     * UnicodeData.txt and CaseFolding.txt; if any character has no case folding
     * equivalent, the character itself is returned.
     * "Full", multiple-code point case folding mappings are returned here.
     * For "simple" single-code point mappings use the API
     * foldCase(int ch, boolean defaultmapping).
     * @param str            the String to be converted
     * @param defaultmapping Indicates if all mappings defined in CaseFolding.txt
     *                       is to be used, otherwise the mappings for dotted I
     *                       and dotless i marked with 'I' in CaseFolding.txt will
     *                       be skipped.
     * @return               the case folding equivalent of the character, if any;
     *                       otherwise the character itself.
     * @see                  #foldCase(int, boolean)
     */
     public static String foldCase(String str, boolean defaultmapping)
     {
         int          size   = str.length();
         StringBuffer result = new StringBuffer(size);
         int          offset  = 0;
         int          ch;

         // case mapping loop
         while (offset < size) {
             ch = UTF16.charAt(str, offset);
             offset += UTF16.getCharCount(ch);
             int props = PROPERTY_.getProperty(ch);
             if (!UCharacterProperty.isExceptionIndicator(props)) {
                 int type = UCharacterProperty.getPropType(props);
                 if (type == UCharacterCategory.UPPERCASE_LETTER ||
                     type == UCharacterCategory.TITLECASE_LETTER) {
                     ch += UCharacterProperty.getSignedValue(props);
                 }
             }
             else {
                 int index = UCharacterProperty.getExceptionIndex(props);
                 if (PROPERTY_.hasExceptionValue(index,
                                     UCharacterProperty.EXC_CASE_FOLDING_)) {
                     int exception = PROPERTY_.getException(index,
                                       UCharacterProperty.EXC_CASE_FOLDING_);
                     if (exception != 0) {
                         PROPERTY_.getFoldCase(exception & LAST_CHAR_MASK_,
                                              exception >> SHIFT_24_, result);
                     }
                     else {
                         // special case folding mappings, hardcoded
                         if (defaultmapping &&
                             (ch ==
                             UCharacterProperty.LATIN_SMALL_LETTER_DOTLESS_I_ ||
                              ch ==
                     UCharacterProperty.LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_))
                         {
                             // map dotted I and dotless i to U+0069 small i
                             result.append(
                                     UCharacterProperty.LATIN_SMALL_LETTER_I_);
                         }
                         else {
                             // output c itself because it is excluded from
                             // case folding
                             UTF16.append(result, ch);
                         }
                     }
                     // do not fall through to the output of c
                     continue;
                 }
                 else {
                     if (PROPERTY_.hasExceptionValue(index,
                                          UCharacterProperty.EXC_LOWERCASE_)) {
                         ch = PROPERTY_.getException(index,
                                           UCharacterProperty.EXC_LOWERCASE_);
                     }
                 }

             }

             // handle 1:1 code point mappings from UnicodeData.txt
             UTF16.append(result, ch);
         }

         return result.toString();
     }

     /**
     * Return numeric value of Han code points.
     * <br> This returns the value of Han 'numeric' code points,
     * including those for zero, ten, hundred, thousand, ten thousand,
     * and hundred million.  Unicode does not consider these to be
     * numeric. This includes both the standard and 'checkwriting'
     * characters, the 'big circle' zero character, and the standard
     * zero character.
     * @draft
     * @param ch code point to query
     * @return value if it is a Han 'numeric character,' otherwise return -1.
     */
     public static int getHanNumericValue(int ch)
     {
         switch(ch)
         {
         case IDEOGRAPHIC_NUMBER_ZERO_ :
         case CJK_IDEOGRAPH_COMPLEX_ZERO_ :
             return 0; // Han Zero
         case CJK_IDEOGRAPH_FIRST_ :
         case CJK_IDEOGRAPH_COMPLEX_ONE_ :
             return 1; // Han One
         case CJK_IDEOGRAPH_SECOND_ :
         case CJK_IDEOGRAPH_COMPLEX_TWO_ :
             return 2; // Han Two
         case CJK_IDEOGRAPH_THIRD_ :
         case CJK_IDEOGRAPH_COMPLEX_THREE_ :
             return 3; // Han Three
         case CJK_IDEOGRAPH_FOURTH_ :
         case CJK_IDEOGRAPH_COMPLEX_FOUR_ :
             return 4; // Han Four
         case CJK_IDEOGRAPH_FIFTH_ :
         case CJK_IDEOGRAPH_COMPLEX_FIVE_ :
             return 5; // Han Five
         case CJK_IDEOGRAPH_SIXTH_ :
         case CJK_IDEOGRAPH_COMPLEX_SIX_ :
             return 6; // Han Six
         case CJK_IDEOGRAPH_SEVENTH_ :
         case CJK_IDEOGRAPH_COMPLEX_SEVEN_ :
             return 7; // Han Seven
         case CJK_IDEOGRAPH_EIGHTH_ :
         case CJK_IDEOGRAPH_COMPLEX_EIGHT_ :
             return 8; // Han Eight
         case CJK_IDEOGRAPH_NINETH_ :
         case CJK_IDEOGRAPH_COMPLEX_NINE_ :
             return 9; // Han Nine
         case CJK_IDEOGRAPH_TEN_ :
         case CJK_IDEOGRAPH_COMPLEX_TEN_ :
             return 10;
         case CJK_IDEOGRAPH_HUNDRED_ :
         case CJK_IDEOGRAPH_COMPLEX_HUNDRED_ :
             return 100;
         case CJK_IDEOGRAPH_THOUSAND_ :
         case CJK_IDEOGRAPH_COMPLEX_THOUSAND_ :
             return 1000;
         case CJK_IDEOGRAPH_TEN_THOUSAND_ :
             return 10000;
         case CJK_IDEOGRAPH_HUNDRED_MILLION_ :
             return 100000000;
         }
         return -1; // no value
     }

     /**
     * <p>Gets an iterator for character types, iterating over codepoints.</p>
     * Example of use:<br>
     * <pre>
     * RangeValueIterator iterator = UCharacter.getTypeIterator();
     * RangeValueIterator.Element element = new RangeValueIterator.Element();
     * while (iterator.next(element)) {
     *     System.out.println("Codepoint \\u" +
     *                        Integer.toHexString(element.start) +
     *                        " to codepoint \\u" +
     *                        Integer.toHexString(element.limit - 1) +
     *                        " has the character type " +
     *                        element.value);
     * }
     * </pre>
     * @return an iterator
     * @draft 2.1
     */
     public static RangeValueIterator getTypeIterator()
     {
         return new UCharacterTypeIterator(PROPERTY_);
     }

 	/**
     * <p>Gets an iterator for character names, iterating over codepoints.</p>
     * <p>This API only gets the iterator for the modern, most up-to-date
     * Unicode names. For older 1.0 Unicode names use get1_0NameIterator() or
     * for extended names use getExtendedNameIterator().</p>
     * Example of use:<br>
     * <pre>
     * ValueIterator iterator = UCharacter.getNameIterator();
     * ValueIterator.Element element = new ValueIterator.Element();
     * while (iterator.next(element)) {
     *     System.out.println("Codepoint \\u" +
     *                        Integer.toHexString(element.codepoint) +
     *                        " has the name " + (String)element.value);
     * }
     * </pre>
     * <p>The maximal range which the name iterator iterates is from
     * UCharacter.MIN_VALUE to UCharacter.MAX_VALUE.</p>
     * @return an iterator
     * @draft 2.1
     */
     public static ValueIterator getNameIterator()
     {
         return new UCharacterNameIterator(NAME_,
                                    UCharacterNameChoice.U_UNICODE_CHAR_NAME);
     }

     /**
     * <p>Gets an iterator for character names, iterating over codepoints.</p>
     * <p>This API only gets the iterator for the older 1.0 Unicode names.
     * For modern, most up-to-date Unicode names use getNameIterator() or
     * for extended names use getExtendedNameIterator().</p>
     * Example of use:<br>
     * <pre>
     * ValueIterator iterator = UCharacter.get1_0NameIterator();
     * ValueIterator.Element element = new ValueIterator.Element();
     * while (iterator.next(element)) {
     *     System.out.println("Codepoint \\u" +
     *                        Integer.toHexString(element.codepoint) +
     *                        " has the name " + (String)element.value);
     * }
     * </pre>
     * <p>The maximal range which the name iterator iterates is from
     * @return an iterator
     * @draft 2.1
     */
     public static ValueIterator getName1_0Iterator()
     {
         return new UCharacterNameIterator(NAME_,
                                  UCharacterNameChoice.U_UNICODE_10_CHAR_NAME);
     }

     /**
     * <p>Gets an iterator for character names, iterating over codepoints.</p>
     * <p>This API only gets the iterator for the extended names.
     * For modern, most up-to-date Unicode names use getNameIterator() or
     * for older 1.0 Unicode names use get1_0NameIterator().</p>
     * Example of use:<br>
     * <pre>
     * ValueIterator iterator = UCharacter.getExtendedNameIterator();
     * ValueIterator.Element element = new ValueIterator.Element();
     * while (iterator.next(element)) {
     *     System.out.println("Codepoint \\u" +
     *                        Integer.toHexString(element.codepoint) +
     *                        " has the name " + (String)element.value);
     * }
     * </pre>
     * <p>The maximal range which the name iterator iterates is from
     * @return an iterator
     * @draft 2.1
     */
     public static ValueIterator getExtendedNameIterator()
     {
         return new UCharacterNameIterator(NAME_,
                                  UCharacterNameChoice.U_EXTENDED_CHAR_NAME);
     }

     /**
      * <p>Get the "age" of the code point.</p>
      * <p>The "age" is the Unicode version when the code point was first
      * designated (as a non-character or for Private Use) or assigned a
      * character.
      * <p>This can be useful to avoid emitting code points to receiving
      * processes that do not accept newer characters.</p>
      * <p>The data is from the UCD file DerivedAge.txt.</p>
      * @param ch The code point.
      * @return the Unicode version number
      * @draft ICU 2.1
      */
     public static VersionInfo getAge(int ch)
     {
     	if (ch < MIN_VALUE || ch > MAX_VALUE) {
     		throw new IllegalArgumentException("Codepoint out of bounds");
     	}
     	return PROPERTY_.getAge(ch);
     }

     /**
 	 * <p>Check a binary Unicode property for a code point.</p>
 	 * <p>Unicode, especially in version 3.2, defines many more properties
 	 * than the original set in UnicodeData.txt.</p>
 	 * <p>This API is intended to reflect Unicode properties as defined in
 	 * the Unicode Character Database (UCD) and Unicode Technical Reports
 	 * (UTR).</p>
 	 * <p>For details about the properties see
 	 * <a href=http://www.unicode.org/>http://www.unicode.org/</a>.</p>
 	 * <p>For names of Unicode properties see the UCD file
 	 * PropertyAliases.txt.</p>
 	 * <p>This API does not check the validity of the codepoint.</p>
 	 * <p>Important: If ICU is built with UCD files from Unicode versions
 	 * below 3.2, then properties marked with "new" are not or
 	 * not fully available.</p>
 	 * @param codepoint Code point to test.
 	 * @param property selector constant from com.ibm.icu.lang.UProperty,
 	 *        identifies which binary property to check.
 	 * @return true or false according to the binary Unicode property value
 	 *         for ch. Also false if property is out of bounds or if the
 	 *         Unicode version does not have data for the property at all, or
 	 *         not for this code point.
 	 * @see com.ibm.icu.lang.UProperty
 	 * @draft ICU 2.1
 	 */
 	public static boolean hasBinaryProperty(int ch, int property)
 	{
 		if (ch < MIN_VALUE || ch > MAX_VALUE) {
     		throw new IllegalArgumentException("Codepoint out of bounds");
     	}
     	return PROPERTY_.hasBinaryProperty(ch, property);
 	}

 	/**
 	 * <p>Check if a code point has the Alphabetic Unicode property.</p>
 	 * <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.ALPHABETIC).</p>
 	 * <p>Different from UCharacter.isLetter(ch)!</p>
 	 * @draft ICU 2.1
 	 * @param ch codepoint to be tested
 	 */
 	public static boolean isUAlphabetic(int ch)
 	{
 		return hasBinaryProperty(ch, UProperty.ALPHABETIC);
 	}

 	/**
 	 * <p>Check if a code point has the Lowercase Unicode property.</p>
 	 * <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.LOWERCASE).</p>
 	 * <p>This is different from UCharacter.isLowerCase(ch)!</p>
 	 * @param ch codepoint to be tested
 	 * @draft ICU 2.1
 	 */
 	public static boolean isULowercase(int ch)
 	{
 		return hasBinaryProperty(ch, UProperty.LOWERCASE);
 	}

 	/**
 	 * <p>Check if a code point has the Uppercase Unicode property.</p>
 	 * <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.UPPERCASE).</p>
 	 * <p>This is different from UCharacter.isUpperCase(ch)!</p>
 	 * @param ch codepoint to be tested
 	 * @draft ICU 2.1
 	 */
 	public static boolean isUUppercase(int ch)
 	{
 		return hasBinaryProperty(ch, UProperty.UPPERCASE);
 	}

 	/**
 	 * <p>Check if a code point has the White_Space Unicode property.</p>
 	 * <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.WHITE_SPACE).</p>
 	 * <p>This is different from both UCharacter.isSpace(ch) and
 	 * UCharacter.isWhiteSpace(ch)!</p>
 	 * @param ch codepoint to be tested
 	 * @draft ICU 2.1
 	 */
 	public static boolean isUWhiteSpace(int ch)
 	{
 		return hasBinaryProperty(ch, UProperty.WHITE_SPACE);
 	}

     // protected data members --------------------------------------------

     /**
     * Database storing the sets of character name
     */
     protected static final UCharacterName NAME_;

     // block to initialise name database and unicode 1.0 data
     static
     {
         try
         {
             NAME_ = new UCharacterName();
         }
         catch (Exception e)
         {
             throw new RuntimeException(e.getMessage());
         }
     }

     // protected methods -------------------------------------------------

     /**
     * Determines if codepoint is a non character
     * @param ch codepoint
     * @return true if codepoint is a non character false otherwise
     */
     static boolean isNonCharacter(int ch)
     {
         if ((ch & NON_CHARACTER_SUFFIX_MIN_3_0_) ==
                                             NON_CHARACTER_SUFFIX_MIN_3_0_) {
             return true;
         }

         return ch >= NON_CHARACTER_MIN_3_1_ && ch <=  NON_CHARACTER_MAX_3_1_;
     }

     // private variables -------------------------------------------------

     /**
     * Database storing the sets of character property
     */
     private static final UCharacterProperty PROPERTY_;

 	// block to initialise character property database
     static
     {
         try
         {
             PROPERTY_ = UCharacterProperty.getInstance();
         }
         catch (Exception e)
         {
             throw new RuntimeException(e.getMessage());
         }
     }

     /**
     * To get the last character out from a data type
     */
     private static final int LAST_CHAR_MASK_ = 0xFFFF;

     /**
     * To get the last byte out from a data type
     */
     private static final int LAST_BYTE_MASK_ = 0xFF;

     /**
     * Shift 16 bits
     */
     private static final int SHIFT_16_ = 16;

     /**
     * Shift 24 bits
     */
     private static final int SHIFT_24_ = 24;

     /**
     * Minimum suffix value that indicates if a character is non character.
     * Unicode 3.0 non characters
     */
     private static final int NON_CHARACTER_SUFFIX_MIN_3_0_ = 0xFFFE;

     /**
     * New minimum non character in Unicode 3.1
     */
     private static final int NON_CHARACTER_MIN_3_1_ = 0xFDD0;

     /**
     * New non character range in Unicode 3.1
     */
     private static final int NON_CHARACTER_MAX_3_1_ = 0xFDEF;

     /**
     * Decimal radix
     */
     private static final int DECIMAL_RADIX_ = 10;

     /**
     * No break space code point
     */
     private static final int NO_BREAK_SPACE_ = 0xA0;

     /**
     * Narrow no break space code point
     */
     private static final int NARROW_NO_BREAK_SPACE_ = 0x202F;

     /**
     * Zero width no break space code point
     */
     private static final int ZERO_WIDTH_NO_BREAK_SPACE_ = 0xFEFF;

     /**
     * Ideographic number zero code point
     */
     private static final int IDEOGRAPHIC_NUMBER_ZERO_ = 0x3007;

     /**
     * CJK Ideograph, First code point
     */
     private static final int CJK_IDEOGRAPH_FIRST_ = 0x4e00;

     /**
     * CJK Ideograph, Second code point
     */
     private static final int CJK_IDEOGRAPH_SECOND_ = 0x4e8c;

     /**
     * CJK Ideograph, Third code point
     */
     private static final int CJK_IDEOGRAPH_THIRD_ = 0x4e09;

     /**
     * CJK Ideograph, Fourth code point
     */
     private static final int CJK_IDEOGRAPH_FOURTH_ = 0x56d8;

     /**
     * CJK Ideograph, FIFTH code point
     */
     private static final int CJK_IDEOGRAPH_FIFTH_ = 0x4e94;

     /**
     * CJK Ideograph, Sixth code point
     */
     private static final int CJK_IDEOGRAPH_SIXTH_ = 0x516d;

     /**
     * CJK Ideograph, Seventh code point
     */
     private static final int CJK_IDEOGRAPH_SEVENTH_ = 0x4e03;

     /**
     * CJK Ideograph, Eighth code point
     */
     private static final int CJK_IDEOGRAPH_EIGHTH_ = 0x516b;

     /**
     * CJK Ideograph, Nineth code point
     */
     private static final int CJK_IDEOGRAPH_NINETH_ = 0x4e5d;

     /**
     * Application Program command code point
     */
     private static final int APPLICATION_PROGRAM_COMMAND_ = 0x009F;

     /**
     * Unit seperator code point
     */
     private static final int UNIT_SEPERATOR_ = 0x001F;

     /**
     * Delete code point
     */
     private static final int DELETE_ = 0x007F;

     /**
     * ISO control character first range upper limit 0x0 - 0x1F
     */
     private static final int ISO_CONTROL_FIRST_RANGE_MAX_ = 0x1F;

     /**
     * Han digit characters
     */
     private static final int CJK_IDEOGRAPH_COMPLEX_ZERO_     = 0x96f6;
     private static final int CJK_IDEOGRAPH_COMPLEX_ONE_      = 0x58f9;
     private static final int CJK_IDEOGRAPH_COMPLEX_TWO_      = 0x8cb3;
     private static final int CJK_IDEOGRAPH_COMPLEX_THREE_    = 0x53c3;
     private static final int CJK_IDEOGRAPH_COMPLEX_FOUR_     = 0x8086;
     private static final int CJK_IDEOGRAPH_COMPLEX_FIVE_     = 0x4f0d;
     private static final int CJK_IDEOGRAPH_COMPLEX_SIX_      = 0x9678;
     private static final int CJK_IDEOGRAPH_COMPLEX_SEVEN_    = 0x67d2;
     private static final int CJK_IDEOGRAPH_COMPLEX_EIGHT_    = 0x634c;
     private static final int CJK_IDEOGRAPH_COMPLEX_NINE_     = 0x7396;
     private static final int CJK_IDEOGRAPH_TEN_              = 0x5341;
     private static final int CJK_IDEOGRAPH_COMPLEX_TEN_      = 0x62fe;
     private static final int CJK_IDEOGRAPH_HUNDRED_          = 0x767e;
     private static final int CJK_IDEOGRAPH_COMPLEX_HUNDRED_  = 0x4f70;
     private static final int CJK_IDEOGRAPH_THOUSAND_         = 0x5343;
     private static final int CJK_IDEOGRAPH_COMPLEX_THOUSAND_ = 0x4edf;
     private static final int CJK_IDEOGRAPH_TEN_THOUSAND_     = 0x824c;
     private static final int CJK_IDEOGRAPH_HUNDRED_MILLION_  = 0x5104;

     // private constructor -----------------------------------------------

     /**
     * Private constructor to prevent instantiation
     */
     private UCharacter()
     {
     }

     // private methods ---------------------------------------------------

     /**
     * Gets the correct property information from UCharacterProperty
     * @param ch character whose information is to be retrieved
     * @return a 32 bit information, returns 0 if no data is found.
     */
     private static int getProps(int ch)
     {
         if (ch >= MIN_VALUE & ch <= MAX_VALUE) {
             return PROPERTY_.getProperty(ch);
         }
         return 0;
     }

     private static int getEuropeanDigit(int ch) {
         if (ch <= 0x7a) {
             if (ch >= 0x41 && ch <= 0x5a) {
                 return ch + 10 - 0x41;
             } else if (ch >= 0x61) {
                 return ch + 10 - 0x61;
             }
         } else if (ch >= 0xff21) {
             if (ch <= 0xff3a) {
                 return ch + 10 - 0xff21;
             } else if (ch >= 0xff41 && ch <= 0xff5a) {
                 return ch + 10 - 0xff41;
             }
         }
         return -1;
     }

     private static int getNumericValueInternal(int ch, boolean useEuropean)
     {
         int props       = getProps(ch);
         int numericType = UCharacterProperty.getNumericType(props);

         int result = -1;
         if (numericType == UCharacterProperty.NON_DIGIT_NUMERIC_TYPE_) {
             result = -2;
         }
         if (numericType != UCharacterProperty.NON_NUMERIC_TYPE_) {
         	// if props == 0, it will just fall through and return -1
         	if (!UCharacterProperty.isExceptionIndicator(props)) {
             	// not contained in exception data
             	return UCharacterProperty.getSignedValue(props);
             }

 			int index = UCharacterProperty.getExceptionIndex(props);
            	if (!PROPERTY_.hasExceptionValue(index,
                                UCharacterProperty.EXC_DENOMINATOR_VALUE_) &&
                 PROPERTY_.hasExceptionValue(index,
                                    UCharacterProperty.EXC_NUMERIC_VALUE_)) {
                	return PROPERTY_.getException(index,
                                      UCharacterProperty.EXC_NUMERIC_VALUE_);
             }
         }

         if (result < 0 && useEuropean) {
             int europeannumeric = getEuropeanDigit(ch);
             if (europeannumeric >= 0) {
             	return europeannumeric;
             }
         }

         return result;
     }
 }