| /** |
| ******************************************************************************* |
| * Copyright (C) 1996-2001, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| * |
| * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UCharacter.java,v $ |
| * $Date: 2001/12/04 20:09:07 $ |
| * $Revision: 1.20 $ |
| * |
| ******************************************************************************* |
| */ |
| |
| |
| package com.ibm.text; |
| |
| import java.util.Locale; |
| import com.ibm.util.Utility; |
| |
| /** |
| * <p> |
| * The UCharacter class provides extensions to the |
| * <a href=http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.html> |
| * java.lang.Character</a> class. These extensions provide support for |
| * Unicode 3.1 properties and together with the <a href=UTF16.html>UTF16</a> |
| * class, provide support for supplementary characters (those with code |
| * points above U+FFFF). |
| * </p> |
| * <p> |
| * Code points are represented in these API using ints. While it would be |
| * more convenient in Java to have a separate primitive datatype for them, |
| * ints suffice in the meantime. |
| * </p> |
| * <p> |
| * To use this class please add the jar file icu4j.jar to the |
| * class path, since it contains data files which supply the information used |
| * by this file.<br> |
| * E.g. In Windows <br> |
| * <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/icu4j.jar</code>.<br> |
| * Otherwise, another method would be to copy the files uprops.dat and |
| * unames.dat from the icu4j source subdirectory |
| * <i>$ICU4J_SRC/src/com/ibm/text/resources</i> to your class directory |
| * <i>$ICU4J_CLASS/com/ibm/text/resources</i>. |
| * </p> |
| * <p> |
| * For more information about the data file format, please refer to |
| * <a href=http://oss.software.ibm.com/icu4j/doc/com/ibm/text/ReadMe.html> |
| * Read Me</a>. |
| * </p> |
| * <p> |
| * Aside from the additions for UTF-16 support, and the updated Unicode 3.1 |
| * properties, the main differences between UCharacter and Character are: |
| * <ul> |
| * <li> UCharacter is not designed to be a char wrapper and does not have |
| * APIs that involve management of that single char.<br> |
| * These include: |
| * <ul> |
| * <li> char charValue(), |
| * <li> int compareTo(java.lang.Character, java.lang.Character), etc. |
| * </ul> |
| * <li> UCharacter does not include Character APIs that are deprecated, nor |
| * does it include the Java-specific character information, such as |
| * boolean isJavaIdentifierPart(char ch). |
| * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric |
| * values '10' - '35'. UCharacter also does this in digit and |
| * getNumericValue, to adhere to the java semantics of these |
| * methods. New methods unicodeDigit, and |
| * getUnicodeNumericValue do not treat the above code points |
| * as having numeric values. This is a semantic change from ICU4J 1.3.1. |
| * <li> For consistency with ICU4C's data, control code points below have their |
| * Unicode general category reset to the types below. |
| * <ul> |
| * <li> TAB 0x9 : U_SPACE_SEPARATOR |
| * <li> VT 0xb : U_SPACE_SEPARATOR |
| * <li> LF 0xa : U_PARAGRAPH_SEPARATOR |
| * <li> FF 0xc : U_LINE_SEPARATOR |
| * <li> CR 0xd : U_PARAGRAPH_SEPARATOR |
| * <li> FS 0x1c : U_PARAGRAPH_SEPARATOR |
| * <li> GS 0x1d : U_PARAGRAPH_SEPARATOR |
| * <li> RS 0x1e : U_PARAGRAPH_SEPARATOR |
| * <li> US 0x1f : U_SPACE_SEPARATOR |
| * <li> NL 0x85 : U_PARAGRAPH_SEPARATOR |
| * </ul> |
| * </ul> |
| * <p> |
| * Further detailed differences can be determined from the program |
| * <a href = http://oss.software.ibm.com/developerworks/opensource/cvs/icu4j/~checkout~/icu4j/src/com/ibm/icu/test/text/UCharacterCompare.java> |
| * com.ibm.icu.test.text.UCharacterCompare</a> |
| * </p> |
| * @author Syn Wee Quek |
| * @since oct 06 2000 |
| * @see com.ibm.text.UCharacterCategory |
| * @see com.ibm.text.UCharacterDirection |
| */ |
| |
| public final class UCharacter |
| { |
| // public variables ============================================== |
| |
| /** |
| * The lowest Unicode code point value. Code points are non-ne N_VALUE |
| */ |
| public static final int MIN_VALUE = 0; |
| |
| /** |
| * The highest Unicode code point value (scalar value) according to the |
| * Unicode Standard.<br> |
| * This is a 21-bit value (21 bits, rounded up).<br> |
| * Up-to-date Unicode implementation of java.lang.Character.MIN_VALUE |
| */ |
| public static final int MAX_VALUE = 0x10ffff; |
| |
| /** |
| * The minimum value for Supplementary code points |
| */ |
| public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000; |
| |
| /** |
| * Unicode value used when translating into Unicode encoding form and there |
| * is no existing character. |
| */ |
| public static final int REPLACEMENT_CHAR = '\uFFFD'; |
| |
| |
| // constructor ==================================================== |
| |
/**
 * Hidden constructor: this class only offers static members and is not
 * meant to be instantiated.
 */
private UCharacter() {
    // intentionally empty
}
| |
| // public methods =================================================== |
| |
| /** |
| * Retrieves the numeric value of a decimal digit code point. |
| * <br>This method observes the semantics of |
| * <code>java.lang.Character.digit()</code>. Note that this |
| * will return positive values for code points for which isDigit |
| * returns false, just like java.lang.Character. |
| * <br><em>Semantic Change:</em> In release 1.3.1 and |
| * prior, this did not treat the European letters as having a |
| * digit value, and also treated numeric letters and other numbers as |
| * digits. |
| * This has been changed to conform to the java semantics. |
| * <br>A code point is a valid digit if and only if: |
| * <ul> |
| * <li>ch is a decimal digit or one of the european letters, and |
| * <li>the value of ch is less than the specified radix. |
| * </ul> |
| * @param ch the code point to query |
| * @param radix the radix |
| * @return the numeric value represented by the code point in the |
| * specified radix, or -1 if the code point is not a decimal digit |
| * or if its value is too large for the radix |
| */ |
| public static int digit(int ch, int radix) |
| { |
| int props = getProps(ch); |
| int result = -1; |
| // if props == 0, it will just fall through and return -1 |
| if (!UCharacterPropertyDB.isExceptionIndicator(props)) { |
| // not contained in exception data |
| if (UCharacterPropertyDB.getPropType(props) == |
| UCharacterCategory.DECIMAL_DIGIT_NUMBER) { |
| result = UCharacterPropertyDB.getSignedValue(props); |
| } |
| } |
| |
| if (result < 0 && radix > 10) { |
| result = getEuropeanDigit(ch); |
| } |
| |
| if (result < 0 || result >= radix) { |
| return -1; |
| } |
| return result; |
| } |
| |
| private static boolean isEuropeanDigit(int ch) { |
| return (ch <= 0x7a && ((ch >= 0x41 && ch <= 0x5a) || ch >= 0x61)) || |
| (ch >= 0xff21 && (ch <= 0xff3a || (ch >= 0xff41 && ch <= 0xff5a))); |
| } |
| |
| private static int getEuropeanDigit(int ch) { |
| if (ch <= 0x7a) { |
| if (ch >= 0x41 && ch <= 0x5a) { |
| return ch + 10 - 0x41; |
| } else if (ch >= 0x61) { |
| return ch + 10 - 0x61; |
| } |
| } else if (ch >= 0xff21) { |
| if (ch <= 0xff3a) { |
| return ch + 10 - 0xff21; |
| } else if (ch >= 0xff41 && ch <= 0xff5a) { |
| return ch + 10 - 0xff41; |
| } |
| } |
| return -1; |
| } |
| |
| /** |
| * Retrieves the numeric value of a decimal digit code point. |
| * <br>This is a convenience overload of <code>digit(int, int)</code> |
| * that provides a decimal radix. |
| * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this |
| * treated numeric letters and other numbers as digits. This has |
| * been changed to conform to the java semantics. |
| * @param ch the code point to query |
| * @return the numeric value represented by the code point, |
| * or -1 if the code point is not a decimal digit or if its |
| * value is too large for a decimal radix |
| */ |
| public static int digit(int ch) |
| { |
| return digit(ch, DECIMAL_RADIX_); |
| } |
| |
| /** |
| * Returns the Unicode numeric value of the code point as a nonnegative |
| * integer. |
| * <br>If the code point does not have a numeric value, then -1 is returned. |
| * <br> |
| * If the code point has a numeric value that cannot be represented as a |
| * nonnegative integer (for example, a fractional value), then -2 is |
| * returned. |
| * <br><em>Semantic Change:</em> In release 1.3.1 and |
| * prior, this returned -1 for ASCII letters and their |
| * fullwidth counterparts. This has been changed to |
| * conform to the java semantics. |
| * @param ch the code point to query |
| * @return the numeric value of the code point, or -1 if it has no numeric |
| * value, or -2 if it has a numeric value that cannot be represented as a |
| * nonnegative integer |
| */ |
| public static int getNumericValue(int ch) |
| { |
| return getNumericValueInternal(ch, true); |
| } |
| |
| /** |
| * Returns the Unicode numeric value of the code point as a nonnegative |
| * integer. |
| * <br>If the code point does not have a numeric value, then -1 is returned. <br> |
| * If the code point has a numeric value that cannot be represented as a |
| * nonnegative integer (for example, a fractional value), then -2 is |
| * returned. |
| * This returns values other than -1 for all and only those code points |
| * whose type is a numeric type. |
| * @param ch the code point to query |
| * @return the numeric value of the code point, or -1 if it has no numeric |
| * value, or -2 if it has a numeric value that cannot be represented as a |
| * nonnegative integer |
| */ |
| public static int getUnicodeNumericValue(int ch) |
| { |
| return getNumericValueInternal(ch, false); |
| } |
| |
| private static int getNumericValueInternal(int ch, boolean useEuropean) |
| { |
| int props = getProps(ch); |
| int type = UCharacterPropertyDB.getPropType(props); |
| |
| // if props == 0, it will just fall through and return -1 |
| if (type != UCharacterCategory.DECIMAL_DIGIT_NUMBER && |
| type != UCharacterCategory.LETTER_NUMBER && |
| type != UCharacterCategory.OTHER_NUMBER) { |
| |
| return useEuropean ? getEuropeanDigit(ch) : -1; |
| } |
| |
| int result = -1; |
| if (!UCharacterPropertyDB.isExceptionIndicator(props)) { |
| // not contained in exception data |
| result = UCharacterPropertyDB.getSignedValue(props); |
| } |
| else { |
| // contained in exception data |
| int index = UCharacterPropertyDB.getExceptionIndex(props); |
| if (PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_DIGIT_VALUE_)) { |
| result = PROPERTY_DB_.getException(index, |
| UCharacterPropertyDB.EXC_DIGIT_VALUE_) & |
| LAST_CHAR_MASK_; |
| } |
| else { |
| if (PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_DENOMINATOR_VALUE_)) { |
| return -2; |
| } |
| if (PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_NUMERIC_VALUE_)) { |
| result = PROPERTY_DB_.getException(index, |
| UCharacterPropertyDB.EXC_NUMERIC_VALUE_); |
| } |
| } |
| } |
| |
| return result; |
| } |
| |
| /** |
| * Returns a value indicating a code point's Unicode category.<br> |
| * Up-to-date Unicode implementation of java.lang.Character.getType() except |
| * for the above mentioned code points that had their category changed.<br> |
| * Return results are constants from the interface |
| * <a href=UCharacterCategory.html>UCharacterCategory</a> |
| * @param ch code point whose type is to be determined |
| * @return category which is a value of UCharacterCategory |
| */ |
| public static int getType(int ch) |
| { |
| return UCharacterPropertyDB.getPropType(getProps(ch)); |
| } |
| |
| /** |
| * Determines if a code point has a defined meaning in the up-to-date Unicode |
| * standard.<br> |
| * E.g. supplementary code points though allocated space are not defined in |
| * Unicode yet.<br> |
| * Up-to-date Unicode implementation of java.lang.Character.isDefined() |
| * @param ch code point to be determined if it is defined in the most current |
| * version of Unicode |
| * @return true if this code point is defined in unicode |
| */ |
| public static boolean isDefined(int ch) |
| { |
| return getProps(ch) != 0; |
| } |
| |
| /** |
| * Determines if a code point is a Java digit. |
| * <br>This method observes the semantics of |
| * <code>java.lang.Character.isDigit()</code>. It returns true for |
| * decimal digits only. |
| * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this |
| * treated numeric letters and other numbers as digits. This has |
| * been changed to conform to the java semantics. |
| * @param ch code point to query |
| * @return true if this code point is a digit */ |
| public static boolean isDigit(int ch) |
| { |
| return getType(ch) == UCharacterCategory.DECIMAL_DIGIT_NUMBER; |
| } |
| |
| /** |
| * Determines if the specified code point is an ISO control character.<br> |
| * A code point is considered to be an ISO control character if it is in the |
| * range \u0000 through \u001F or in the range \u007F through |
| * \u009F.<br> |
| * Up-to-date Unicode implementation of java.lang.Character.isISOControl() |
| * @param ch code point to determine if it is an ISO control character |
| * @return true if code point is a ISO control character |
| */ |
| public static boolean isISOControl(int ch) |
| { |
| return ch >= 0 && ch <= APPLICATION_PROGRAM_COMMAND_ && |
| ((ch <= UNIT_SEPERATOR_) || (ch >= DELETE_)); |
| } |
| |
| /** |
| * Determines if the specified code point is a letter.<br> |
| * Up-to-date Unicode implementation of java.lang.Character.isLetter() |
| * @param ch code point to determine if it is a letter |
| * @return true if code point is a letter |
| */ |
| public static boolean isLetter(int ch) |
| { |
| int cat = getType(ch); |
| // if props == 0, it will just fall through and return false |
| return cat == UCharacterCategory.UPPERCASE_LETTER || |
| cat == UCharacterCategory.LOWERCASE_LETTER || |
| cat == UCharacterCategory.TITLECASE_LETTER || |
| cat == UCharacterCategory.MODIFIER_LETTER || |
| cat == UCharacterCategory.OTHER_LETTER; |
| } |
| |
| /** |
| * Determines if the specified code point is a letter or digit.<br> |
| * Note this method, unlike java.lang.Character does not regard the ascii |
| * characters 'A' - 'Z' and 'a' - 'z' as digits. |
| * @param ch code point to determine if it is a letter or a digit |
| * @return true if code point is a letter or a digit |
| */ |
| public static boolean isLetterOrDigit(int ch) |
| { |
| return isDigit(ch) || isLetter(ch); |
| } |
| |
| /** |
| * Determines if the specified code point is a lowercase character.<br> |
| * UnicodeData only contains case mappings for code points where they are |
| * one-to-one mappings; it also omits information about context-sensitive |
| * case mappings.<br> For more information about Unicode case mapping please |
| * refer to the <a href=http://www.unicode.org/unicode/reports/tr21/> |
| * Technical report #21</a>.<br> |
| * Up-to-date Unicode implementation of java.lang.Character.isLowerCase() |
| * @param ch code point to determine if it is in lowercase |
| * @return true if code point is a lowercase character |
| */ |
| public static boolean isLowerCase(int ch) |
| { |
| // if props == 0, it will just fall through and return false |
| return getType(ch) == UCharacterCategory.LOWERCASE_LETTER; |
| } |
| |
| /** |
| * Determines if the specified code point is a white space character.<br> |
| * A code point is considered to be an whitespace character if and only |
| * if it satisfies one of the following criteria: |
| * <ul> |
| * <li> It is a Unicode space separator (category "Zs"), but is not |
| * a no-break space (\u00A0 or \u202F or \uFEFF). |
| * <li> It is a Unicode line separator (category "Zl"). |
| * <li> It is a Unicode paragraph separator (category "Zp"). |
| * </ul> |
| * Up-to-date Unicode implementation of java.lang.Character.isWhitespace(). |
| * @param ch code point to determine if it is a white space |
| * @return true if the specified code point is a white space character |
| */ |
| public static boolean isWhitespace(int ch) |
| { |
| int cat = getType(ch); |
| // exclude no-break spaces |
| // if props == 0, it will just fall through and return false |
| return (cat == UCharacterCategory.SPACE_SEPARATOR || |
| cat == UCharacterCategory.LINE_SEPARATOR || |
| cat == UCharacterCategory.PARAGRAPH_SEPARATOR) && |
| (ch != NO_BREAK_SPACE_) && (ch != NARROW_NO_BREAK_SPACE_) && |
| (ch != ZERO_WIDTH_NO_BREAK_SPACE_); |
| } |
| |
| /** |
| * Determines if the specified code point is a Unicode specified space |
| * character, ie if code point is in the category Zs, Zl and Zp.<br> |
| * Up-to-date Unicode implementation of java.lang.Character.isSpaceChar(). |
| * @param ch code point to determine if it is a space |
| * @return true if the specified code point is a space character |
| */ |
| public static boolean isSpaceChar(int ch) |
| { |
| int cat = getType(ch); |
| // if props == 0, it will just fall through and return false |
| return cat == UCharacterCategory.SPACE_SEPARATOR || |
| cat == UCharacterCategory.LINE_SEPARATOR || |
| cat == UCharacterCategory.PARAGRAPH_SEPARATOR; |
| } |
| |
| /** |
| * Determines if the specified code point is a titlecase character.<br> |
| * UnicodeData only contains case mappings for code points where they are |
| * one-to-one mappings; it also omits information about context-sensitive |
| * case mappings.<br> |
| * For more information about Unicode case mapping please refer to the |
| * <a href=http://www.unicode.org/unicode/reports/tr21/> |
| * Technical report #21</a>.<br> |
| * Up-to-date Unicode implementation of java.lang.Character.isTitleCase(). |
| * @param ch code point to determine if it is in title case |
| * @return true if the specified code point is a titlecase character |
| */ |
| public static boolean isTitleCase(int ch) |
| { |
| int cat = getType(ch); |
| // if props == 0, it will just fall through and return false |
| return cat == UCharacterCategory.TITLECASE_LETTER; |
| } |
| |
| /** |
| * Determines if the specified code point may be any part of a Unicode |
| * identifier other than the starting character.<br> |
| * A code point may be part of a Unicode identifier if and only if it is one |
| * of the following: |
| * <ul> |
| * <li> Lu Uppercase letter |
| * <li> Ll Lowercase letter |
| * <li> Lt Titlecase letter |
| * <li> Lm Modifier letter |
| * <li> Lo Other letter |
| * <li> Nl Letter number |
| * <li> Pc Connecting punctuation character |
| * <li> Nd decimal number |
| * <li> Mc Spacing combining mark |
| * <li> Mn Non-spacing mark |
| * <li> Cf formatting code |
| * </ul> |
| * Up-to-date Unicode implementation of |
| * java.lang.Character.isUnicodeIdentifierPart().<br> |
| * See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>. |
| * @param ch code point to determine if is can be part of a Unicode identifier |
| * @return true if code point is any character belonging a unicode identifier |
| * suffix after the first character |
| */ |
| public static boolean isUnicodeIdentifierPart(int ch) |
| { |
| int cat = getType(ch); |
| // if props == 0, it will just fall through and return false |
| return cat == UCharacterCategory.UPPERCASE_LETTER || |
| cat == UCharacterCategory.LOWERCASE_LETTER || |
| cat == UCharacterCategory.TITLECASE_LETTER || |
| cat == UCharacterCategory.MODIFIER_LETTER || |
| cat == UCharacterCategory.OTHER_LETTER || |
| cat == UCharacterCategory.LETTER_NUMBER || |
| cat == UCharacterCategory.CONNECTOR_PUNCTUATION || |
| cat == UCharacterCategory.DECIMAL_DIGIT_NUMBER || |
| cat == UCharacterCategory.COMBINING_SPACING_MARK || |
| cat == UCharacterCategory.NON_SPACING_MARK || |
| // cat == UCharacterCategory.FORMAT; |
| isIdentifierIgnorable(ch); |
| } |
| |
| /** |
| * Determines if the specified code point is permissible as the first |
| * character in a Unicode identifier.<br> |
| * A code point may start a Unicode identifier if it is of type either |
| * <ul> |
| * <li> Lu Uppercase letter |
| * <li> Ll Lowercase letter |
| * <li> Lt Titlecase letter |
| * <li> Lm Modifier letter |
| * <li> Lo Other letter |
| * <li> Nl Letter number |
| * </ul> |
| * Up-to-date Unicode implementation of |
| * java.lang.Character.isUnicodeIdentifierStart().<br> |
| * See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>. |
| * @param ch code point to determine if it can start a Unicode identifier |
| * @return true if code point is the first character belonging a unicode |
| * identifier |
| */ |
| public static boolean isUnicodeIdentifierStart(int ch) |
| { |
| int cat = getType(ch); |
| // if props == 0, it will just fall through and return false |
| return cat == UCharacterCategory.UPPERCASE_LETTER || |
| cat == UCharacterCategory.LOWERCASE_LETTER || |
| cat == UCharacterCategory.TITLECASE_LETTER || |
| cat == UCharacterCategory.MODIFIER_LETTER || |
| cat == UCharacterCategory.OTHER_LETTER || |
| cat == UCharacterCategory.LETTER_NUMBER; |
| } |
| |
| /** |
| * Determines if the specified code point should be regarded as an ignorable |
| * character in a Unicode identifier.<br> |
| * A character is ignorable in the Unicode standard if it is of the type Cf, |
| * Formatting code.<br> |
| * Up-to-date Unicode implementation of |
| * java.lang.Character.isIdentifierIgnorable().<br> |
| * See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>. |
| * @param ch code point to be determined if it can be ignored in a Unicode |
| * identifier. |
| * @return true if the code point is ignorable |
| */ |
| public static boolean isIdentifierIgnorable(int ch) |
| { |
| /* |
| int cat = getType(ch); |
| // if props == 0, it will just fall through and return false |
| return cat == UCharacterCategory.FORMAT; |
| */ |
| // see java.lang.Character.isIdentifierIgnorable() on range of |
| // ignorable characters. |
| return ch <= 8 || (ch >= 0xe && ch <= 0x1b) || |
| (ch >= 0x7f && ch <= 0x9f) || |
| getType(ch) == UCharacterCategory.FORMAT; |
| } |
| |
| /** |
| * Determines if the specified code point is an uppercase character.<br> |
| * UnicodeData only contains case mappings for code point where they are |
| * one-to-one mappings; it also omits information about context-sensitive |
| * case mappings.<br> |
| * For language specific case conversion behavior, use |
| * toUpperCase(locale, str). <br> |
| * For example, the case conversion for dot-less i and dotted I in Turkish, |
| * or for final sigma in Greek. |
| * For more information about Unicode case mapping please refer to the |
| * <a href=http://www.unicode.org/unicode/reports/tr21/> |
| * Technical report #21</a>.<br> |
| * Up-to-date Unicode implementation of java.lang.Character.isUpperCase(). |
| * @param ch code point to determine if it is in uppercase |
| * @return true if the code point is an uppercase character |
| */ |
| public static boolean isUpperCase(int ch) |
| { |
| int cat = getType(ch); |
| // if props == 0, it will just fall through and return false |
| return cat == UCharacterCategory.UPPERCASE_LETTER; |
| } |
| |
| /** |
| * The given code point is mapped to its lowercase equivalent; if the code |
| * point has no lowercase equivalent, the code point itself is returned.<br> |
| * UnicodeData only contains case mappings for code point where they are |
| * one-to-one mappings; it also omits information about context-sensitive |
| * case mappings.<br> |
| * For language specific case conversion behavior, use |
| * toLowerCase(locale, str). <br> |
| * For example, the case conversion for dot-less i and dotted I in Turkish, |
| * or for final sigma in Greek. |
| * For more information about Unicode case mapping please refer to the |
| * <a href=http://www.unicode.org/unicode/reports/tr21/> |
| * Technical report #21</a>.<br> |
| * Up-to-date Unicode implementation of java.lang.Character.toLowerCase() |
| * @param ch code point whose lowercase equivalent is to be retrieved |
| * @return the lowercase equivalent code point |
| */ |
| public static int toLowerCase(int ch) |
| { |
| int props = getProps(ch); |
| // if props == 0, it will just fall through and return itself |
| if(!UCharacterPropertyDB.isExceptionIndicator(props)) { |
| int cat = UCharacterPropertyDB.getPropType(props); |
| if (cat == UCharacterCategory.UPPERCASE_LETTER || |
| cat == UCharacterCategory.TITLECASE_LETTER) { |
| return ch + UCharacterPropertyDB.getSignedValue(props); |
| } |
| } |
| else |
| { |
| int index = UCharacterPropertyDB.getExceptionIndex(props); |
| if (PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_LOWERCASE_)) { |
| return PROPERTY_DB_.getException(index, |
| UCharacterPropertyDB.EXC_LOWERCASE_); |
| } |
| } |
| return ch; |
| } |
| |
| /** |
| * Converts argument code point and returns a String object representing the |
| * code point's value in UTF16 format.<br> |
| * The result is a string whose length is 1 for non-supplementary code points, |
| * 2 otherwise.<br> |
| * com.ibm.ibm.icu.UTF16 can be used to parse Strings generated by this |
| * function.<br> |
| * Up-to-date Unicode implementation of java.lang.Character.toString() |
| * @param ch code point |
| * @return string representation of the code point, null if code point is not |
| * defined in unicode |
| */ |
| public static String toString(int ch) |
| { |
| if (ch < MIN_VALUE || ch > MAX_VALUE) { |
| return null; |
| } |
| |
| if (ch < UCharacter.SUPPLEMENTARY_MIN_VALUE) { |
| return String.valueOf((char)ch); |
| } |
| |
| char result[] = new char[2]; |
| result[0] = (char)UTF16.getLeadSurrogate(ch); |
| result[1] = (char)UTF16.getTrailSurrogate(ch); |
| return new String(result); |
| } |
| |
| /** |
| * Converts the code point argument to titlecase.<br> |
| * UnicodeData only contains case mappings for code points where they are |
| * one-to-one mappings; it also omits information about context-sensitive |
| * case mappings.<br> |
| * There are only four Unicode characters that are truly titlecase forms |
| * that are distinct from uppercase forms. |
| * For more information about Unicode case mapping please refer |
| * to the <a href=http://www.unicode.org/unicode/reports/tr21/> |
| * Technical report #21</a>.<br> |
| * If no titlecase is available, the uppercase is returned. If no uppercase |
| * is available, the code point itself is returned.<br> |
| * Up-to-date Unicode implementation of java.lang.Character.toTitleCase() |
| * @param ch code point whose title case is to be retrieved |
| * @return titlecase code point |
| */ |
| public static int toTitleCase(int ch) |
| { |
| int props = getProps(ch); |
| // if props == 0, it will just fall through and return itself |
| if (!UCharacterPropertyDB.isExceptionIndicator(props)) { |
| if (UCharacterPropertyDB.getPropType(props) == |
| UCharacterCategory.LOWERCASE_LETTER) { |
| // here, titlecase is same as uppercase |
| return ch - UCharacterPropertyDB.getSignedValue(props); |
| } |
| } |
| else { |
| int index = UCharacterPropertyDB.getExceptionIndex(props); |
| if (PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_TITLECASE_)) { |
| return PROPERTY_DB_.getException(index, |
| UCharacterPropertyDB.EXC_TITLECASE_); |
| } |
| else { |
| // here, titlecase is same as uppercase |
| if (PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_UPPERCASE_)) { |
| return PROPERTY_DB_.getException(index, |
| UCharacterPropertyDB.EXC_UPPERCASE_); |
| } |
| } |
| } |
| return ch; // no mapping - return c itself |
| } |
| |
| /** |
| * Converts the character argument to uppercase.<br> |
| * UnicodeData only contains case mappings for characters where they are |
| * one-to-one mappings; it also omits information about context-sensitive |
| * case mappings.<br> |
| * For more information about Unicode case mapping please refer |
| * to the <a href=http://www.unicode.org/unicode/reports/tr21/> |
| * Technical report #21</a>.<br> |
| * If no uppercase is available, the character itself is returned.<br> |
| * Up-to-date Unicode implementation of java.lang.Character.toUpperCase() |
| * @param ch code point whose uppercase is to be retrieved |
| * @return uppercase code point |
| */ |
| public static int toUpperCase(int ch) |
| { |
| int props = getProps(ch); |
| // if props == 0, it will just fall through and return itself |
| if (!UCharacterPropertyDB.isExceptionIndicator(props)) { |
| if (UCharacterPropertyDB.getPropType(props) == |
| UCharacterCategory.LOWERCASE_LETTER) { |
| // here, titlecase is same as uppercase */ |
| return ch - UCharacterPropertyDB.getSignedValue(props); |
| } |
| } |
| else |
| { |
| int index = UCharacterPropertyDB.getExceptionIndex(props); |
| if (PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_UPPERCASE_)) { |
| return PROPERTY_DB_.getException(index, |
| UCharacterPropertyDB.EXC_UPPERCASE_); |
| } |
| } |
| return ch; // no mapping - return c itself |
| } |
| |
| // extra methods not in java.lang.Character =========================== |
| |
| /** |
| * Determines if the code point is a supplementary character.<br> |
| * A code point is a supplementary character if and only if it is greater than |
| * <a href=#SUPPLEMENTARY_MIN_VALUE>SUPPLEMENTARY_MIN_VALUE</a> |
| * @param ch code point to be determined if it is in the supplementary plane |
| * @return true if code point is a supplementary character |
| */ |
| public static boolean isSupplementary(int ch) |
| { |
| return ch >= UCharacter.SUPPLEMENTARY_MIN_VALUE && |
| ch <= UCharacter.MAX_VALUE; |
| } |
| |
| /** |
| * Determines if the code point is in the BMP plane.<br> |
| * @param ch code point to be determined if it is not a supplementary |
| * character |
| * @return true if code point is not a supplementary character |
| */ |
| public static boolean isBMP(int ch) |
| { |
| return (ch >= 0 && ch < LAST_CHAR_MASK_); |
| } |
| |
| /** |
| * Determines whether the specified code point is a printable character |
| * according to the Unicode standard. |
| * @param ch code point to be determined if it is printable |
| * @return true if the code point is a printable character |
| */ |
| public static boolean isPrintable(int ch) |
| { |
| if (isISOControl(ch)) { |
| return false; |
| } |
| int cat = getType(ch); |
| // if props == 0, it will just fall through and return false |
| return (cat != UCharacterCategory.UNASSIGNED && |
| cat != UCharacterCategory.CONTROL && |
| cat != UCharacterCategory.FORMAT && |
| cat != UCharacterCategory.PRIVATE_USE && |
| cat != UCharacterCategory.SURROGATE && |
| cat != UCharacterCategory.GENERAL_OTHER_TYPES); |
| } |
| |
| /** |
| * Determines whether the specified code point is of base form.<br> |
| * A code point of base form does not graphically combine with preceding |
| * characters, and is neither a control nor a format character. |
| * @param ch code point to be determined if it is of base form |
| * @return true if the code point is of base form |
| */ |
| public static boolean isBaseForm(int ch) |
| { |
| int cat = getType(ch); |
| // if props == 0, it will just fall through and return false |
| return cat == UCharacterCategory.DECIMAL_DIGIT_NUMBER || |
| cat == UCharacterCategory.OTHER_NUMBER || |
| cat == UCharacterCategory.LETTER_NUMBER || |
| cat == UCharacterCategory.UPPERCASE_LETTER || |
| cat == UCharacterCategory.LOWERCASE_LETTER || |
| cat == UCharacterCategory.TITLECASE_LETTER || |
| cat == UCharacterCategory.MODIFIER_LETTER || |
| cat == UCharacterCategory.OTHER_LETTER || |
| cat == UCharacterCategory.NON_SPACING_MARK || |
| cat == UCharacterCategory.ENCLOSING_MARK || |
| cat == UCharacterCategory.COMBINING_SPACING_MARK; |
| } |
| |
| /** |
|  * Returns the Bidirection property of a code point.<br> |
|  * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional |
|  * property.<br> |
|  * Result returned belongs to the interface |
|  * <a href=UCharacterDirection.html>UCharacterDirection</a> |
|  * @param ch the code point to be determined its direction |
|  * @return direction constant from UCharacterDirection. When no property |
|  *         data is found for the code point (its property word is 0), |
|  *         UCharacterDirection.LEFT_TO_RIGHT is returned. |
|  */ |
| public static int getDirection(int ch) |
| { |
|     final int props = getProps(ch); |
|     // getProps reports 0 when it has no data for ch; nothing is decoded |
|     // in that case and the default direction is handed back |
|     return (props == 0) |
|         ? UCharacterDirection.LEFT_TO_RIGHT |
|         : UCharacterPropertyDB.getDirection(props); |
| } |
| |
| /** |
|  * Determines whether the code point has the "mirrored" property.<br> |
|  * This property is set for characters that are commonly used in |
|  * Right-To-Left contexts and need to be displayed with a "mirrored" |
|  * glyph.<br> |
|  * The answer is read from the property word that getProps(int) returns |
|  * for the code point. |
|  * @param ch code point whose mirror is to be determined |
|  * @return true if the code point has the "mirrored" property |
|  */ |
| public static boolean isMirrored(int ch) |
| { |
|     // getProps yields 0 when it has no data for ch; according to the |
|     // original note the mirrored test then simply reports false |
|     return UCharacterPropertyDB.isMirrored(getProps(ch)); |
| } |
| |
| /** |
| * Maps the specified code point to a "mirror-image" code point.<br> |
| * For code points with the "mirrored" property, implementations sometimes |
| * need a "poor man's" mapping to another code point such that the default |
| * glyph may serve as the mirror-image of the default glyph of the specified |
| * code point.<br> |
| * This is useful for text conversion to and from codepages with visual |
| * order, and for displays without glyph selection capabilities. |
| * @param ch code point whose mirror is to be retrieved |
| * @return another code point that may serve as a mirror-image substitute, or |
| * ch itself if there is no such mapping or ch does not have the |
| * "mirrored" property |
| */ |
| public static int getMirror(int ch) |
| { |
| int props = getProps(ch); |
| // mirrored - the value is a mirror offset |
| // if props == 0, it will just fall through and return false |
| if (UCharacterPropertyDB.isMirrored(props)) { |
| if(!UCharacterPropertyDB.isExceptionIndicator(props)) { |
| return ch + UCharacterPropertyDB.getSignedValue(props); |
| } |
| else |
| { |
| int index = UCharacterPropertyDB.getExceptionIndex(props); |
| if (PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_MIRROR_MAPPING_)) |
| return PROPERTY_DB_.getException(index, |
| UCharacterPropertyDB.EXC_MIRROR_MAPPING_); |
| } |
| } |
| return ch; |
| } |
| |
| /** |
| * Gets the combining class of the argument codepoint |
| * @param ch code point whose combining is to be retrieved |
| * @return the combining class of the codepoint |
| */ |
| public static int getCombiningClass(int ch) |
| { |
| int props = getProps(ch); |
| if(!UCharacterPropertyDB.isExceptionIndicator(props)) { |
| if (UCharacterPropertyDB.getPropType(props) == |
| UCharacterCategory.NON_SPACING_MARK) { |
| return PROPERTY_DB_.getUnsignedValue(props); |
| } |
| else { |
| return 0; |
| } |
| } |
| else { |
| // the combining class is in bits 23..16 of the first exception value |
| return (PROPERTY_DB_.getException( |
| PROPERTY_DB_.getExceptionIndex(props), |
| UCharacterPropertyDB.EXC_COMBINING_CLASS_) |
| >> SHIFT_16_) & LAST_BYTE_MASK_; |
| } |
| } |
| |
| /** |
|  * A code point is illegal if and only if |
|  * <ul> |
|  * <li> Out of bounds, less than UCharacter.MIN_VALUE or greater than |
|  *      UCharacter.MAX_VALUE |
|  * <li> A surrogate value, 0xD800 to 0xDFFF |
|  * <li> Not-a-character, as decided by isNonCharacter(int) |
|  * </ul> |
|  * Note: legal does not mean that it is assigned in this version of Unicode. |
|  * @param ch code point to determine if it is a legal code point by itself |
|  * @return true if and only if legal. |
|  */ |
| public static boolean isLegal(int ch) |
| { |
|     if (ch < MIN_VALUE) { |
|         return false; |
|     } |
|     // everything below the surrogate range is accepted straight away |
|     if (ch < SURROGATE_MIN_VALUE_) { |
|         return true; |
|     } |
|     final boolean surrogate = ch <= SURROGATE_MAX_VALUE_; |
|     return !surrogate && !isNonCharacter(ch) && ch <= MAX_VALUE; |
| } |
| |
| /** |
|  * A string is legal iff all its code points are legal. |
|  * A code point is illegal if and only if |
|  * <ul> |
|  * <li> Out of bounds, less than 0 or greater than UCharacter.MAX_VALUE |
|  * <li> A surrogate value, 0xD800 to 0xDFFF |
|  * <li> Not-a-character, having the form 0x xxFFFF or 0x xxFFFE |
|  * </ul> |
|  * Note: legal does not mean that it is assigned in this version of Unicode. |
|  * @param str string whose code points are to be checked |
|  * @return true if and only if every code point of str is legal; an empty |
|  *         string is therefore legal. |
|  */ |
| public static boolean isLegal(String str) |
| { |
|     final int length = str.length(); |
|     int index = 0; |
|     while (index < length) { |
|         final int current = UTF16.charAt(str, index); |
|         if (!isLegal(current)) { |
|             return false; |
|         } |
|         // a supplementary code point takes up two chars of the string |
|         index += isSupplementary(current) ? 2 : 1; |
|     } |
|     return true; |
| } |
| |
| /** |
|  * Gets the version of Unicode data used, as recorded by the character |
|  * property database. |
|  * @return the unicode version number used |
|  */ |
| public static String getUnicodeVersion() |
| { |
|     final String version = PROPERTY_DB_.m_unicodeversion_; |
|     return version; |
| } |
| |
| /** |
|  * Retrieve the most current Unicode name of the argument code point, or |
|  * null if the character is unassigned or outside the range |
|  * UCharacter.MIN_VALUE and UCharacter.MAX_VALUE.<br> |
|  * Note calling any methods related to code point names, e.g. get*Name*() |
|  * incurs a one-time initialisation cost to construct the name tables.<br> |
|  * The lookup is delegated to UCharacterName with the name choice |
|  * U_UNICODE_CHAR_NAME. |
|  * @param ch the code point for which to get the name |
|  * @return most current Unicode name |
|  */ |
| public static String getName(int ch) |
| { |
|     final String name = UCharacterName.getName(ch, |
|         UCharacterNameChoice.U_UNICODE_CHAR_NAME); |
|     return name; |
| } |
| |
| /** |
|  * Retrieve the earlier version 1.0 Unicode name of the argument code point, |
|  * or null if the character is unassigned or outside the range |
|  * UCharacter.MIN_VALUE and UCharacter.MAX_VALUE.<br> |
|  * <br> |
|  * Note calling any methods related to code point names, e.g. get*Name*() |
|  * incurs a one-time initialisation cost to construct the name tables.<br> |
|  * The lookup is delegated to UCharacterName with the name choice |
|  * U_UNICODE_10_CHAR_NAME. |
|  * @param ch the code point for which to get the name |
|  * @return version 1.0 Unicode name |
|  */ |
| public static String getName1_0(int ch) |
| { |
|     final String name = UCharacterName.getName(ch, |
|         UCharacterNameChoice.U_UNICODE_10_CHAR_NAME); |
|     return name; |
| } |
| |
| /** |
|  * Find a Unicode code point by its most current Unicode name and return its |
|  * code point value.<br> |
|  * Note calling any methods related to code point names, e.g. get*Name*() |
|  * incurs a one-time initialisation cost to construct the name tables.<br> |
|  * The search is delegated to UCharacterName with the name choice |
|  * U_UNICODE_CHAR_NAME. |
|  * @param name most current Unicode character name whose code point is to be |
|  *        returned |
|  * @return code point or -1 if name is not found |
|  */ |
| public static int getCharFromName(String name) |
| { |
|     final int codepoint = UCharacterName.getCharFromName( |
|         UCharacterNameChoice.U_UNICODE_CHAR_NAME, name); |
|     return codepoint; |
| } |
| |
| /** |
|  * Find a Unicode character by its version 1.0 Unicode name and return its |
|  * code point value.<br> |
|  * Note calling any methods related to code point names, e.g. get*Name*() |
|  * incurs a one-time initialisation cost to construct the name tables.<br> |
|  * The search is delegated to UCharacterName with the name choice |
|  * U_UNICODE_10_CHAR_NAME. |
|  * @param name Unicode 1.0 code point name whose code point is to be |
|  *        returned |
|  * @return code point or -1 if name is not found |
|  */ |
| public static int getCharFromName1_0(String name) |
| { |
|     final int codepoint = UCharacterName.getCharFromName( |
|         UCharacterNameChoice.U_UNICODE_10_CHAR_NAME, name); |
|     return codepoint; |
| } |
| |
| /** |
|  * Returns a code point corresponding to the two UTF16 characters.<br> |
|  * If the argument lead is not a high surrogate character or trail is not a |
|  * low surrogate character, UCharacter.REPLACEMENT_CHAR is returned. |
|  * @param lead the lead char |
|  * @param trail the trail char |
|  * @return code point or UCharacter.REPLACEMENT_CHAR if surrogate characters |
|  *         are invalid. |
|  */ |
| public static int getCodePoint(char lead, char trail) |
| { |
|     // anything but a well-formed lead/trail pair gets the replacement |
|     if (!UTF16.isLeadSurrogate(lead) || !UTF16.isTrailSurrogate(trail)) { |
|         return UCharacter.REPLACEMENT_CHAR; |
|     } |
|     return getRawSupplementary(lead, trail); |
| } |
| |
| /** |
|  * Returns the code point corresponding to the UTF16 character.<br> |
|  * A char16 that is not legal by itself according to isLegal(int), a |
|  * surrogate character for instance, is rejected with an exception; no |
|  * replacement value is returned. |
|  * @param char16 the UTF16 character |
|  * @return the code point, which has the same value as char16 |
|  * @exception IllegalArgumentException thrown when char16 is not a valid |
|  *            codepoint |
|  */ |
| public static int getCodePoint(char char16) |
| { |
|     if (!UCharacter.isLegal(char16)) { |
|         throw new IllegalArgumentException("Illegal codepoint"); |
|     } |
|     return char16; |
| } |
| |
| /** |
|  * Gets uppercase version of the argument string. |
|  * Casing is dependent on the default locale and context-sensitive.<br> |
|  * Equivalent to calling toUpperCase(Locale, String) with |
|  * Locale.getDefault(). |
|  * @param str source string to be performed on |
|  * @return uppercase version of the argument string |
|  */ |
| public static String toUpperCase(String str) |
| { |
|     final Locale defaultLocale = Locale.getDefault(); |
|     return toUpperCase(defaultLocale, str); |
| } |
| |
| /** |
|  * Gets lowercase version of the argument string. |
|  * Casing is dependent on the default locale and context-sensitive.<br> |
|  * Equivalent to calling toLowerCase(Locale, String) with |
|  * Locale.getDefault(). |
|  * @param str source string to be performed on |
|  * @return lowercase version of the argument string |
|  */ |
| public static String toLowerCase(String str) |
| { |
|     final Locale defaultLocale = Locale.getDefault(); |
|     return toLowerCase(defaultLocale, str); |
| } |
| |
| /** |
|  * Gets uppercase version of the argument string. |
|  * Casing is dependent on the argument locale and context-sensitive.<br> |
|  * The string is walked code point by code point. A code point whose |
|  * property word is not flagged as an exception is only changed when it is |
|  * a lowercase letter, by subtracting the signed value stored in the |
|  * property word. For an exception, a special casing entry is expanded by |
|  * getSpecialUpperCase; failing that, the uppercase exception value |
|  * replaces the code point when one exists. |
|  * @param locale which string is to be converted in |
|  * @param str source string to be performed on |
|  * @return uppercase version of the argument string |
|  */ |
| public static String toUpperCase(Locale locale, String str) |
| { |
|     final int length = str.length(); |
|     final StringBuffer buffer = new StringBuffer(length); // initial buffer |
| |
|     for (int index = 0; index < length; ) { |
|         final int source = UTF16.charAt(str, index); |
|         // number of chars the source code point occupies in str |
|         final int width = UTF16.getCharCount(source); |
|         final int props = PROPERTY_DB_.getProperty(source); |
|         if (UCharacterPropertyDB.isExceptionIndicator(props)) { |
|             final int exceptionIndex = |
|                 UCharacterPropertyDB.getExceptionIndex(props); |
|             if (PROPERTY_DB_.hasExceptionValue(exceptionIndex, |
|                     UCharacterPropertyDB.EXC_SPECIAL_CASING_)) { |
|                 // special casing writes its own output into the buffer |
|                 getSpecialUpperCase(source, exceptionIndex, buffer, str, |
|                     index, locale); |
|             } |
|             else { |
|                 int mapped = source; |
|                 if (PROPERTY_DB_.hasExceptionValue(exceptionIndex, |
|                         UCharacterPropertyDB.EXC_UPPERCASE_)) { |
|                     mapped = PROPERTY_DB_.getException(exceptionIndex, |
|                         UCharacterPropertyDB.EXC_UPPERCASE_); |
|                 } |
|                 UTF16.append(buffer, mapped); |
|             } |
|         } |
|         else { |
|             int mapped = source; |
|             if (UCharacterPropertyDB.getPropType(props) |
|                     == UCharacterCategory.LOWERCASE_LETTER) { |
|                 mapped -= UCharacterPropertyDB.getSignedValue(props); |
|             } |
|             UTF16.append(buffer, mapped); |
|         } |
|         index += width; |
|     } |
|     return buffer.toString(); |
| } |
| |
| /** |
|  * Gets lowercase version of the argument string. |
|  * Casing is dependent on the argument locale and context-sensitive.<br> |
|  * The string is walked code point by code point. A code point whose |
|  * property word is not flagged as an exception is only changed when it is |
|  * an uppercase or titlecase letter, by adding the signed value stored in |
|  * the property word. For an exception, a special casing entry is expanded |
|  * by getSpecialLowerCase; failing that, the lowercase exception value |
|  * replaces the code point when one exists. |
|  * @param locale which string is to be converted in |
|  * @param str source string to be performed on |
|  * @return lowercase version of the argument string |
|  */ |
| public static String toLowerCase(Locale locale, String str) |
| { |
|     final int length = str.length(); |
|     final StringBuffer buffer = new StringBuffer(length); |
| |
|     // case mapping loop |
|     for (int index = 0; index < length; ) { |
|         final int source = UTF16.charAt(str, index); |
|         // number of chars the source code point occupies in str |
|         final int width = UTF16.getCharCount(source); |
|         final int props = PROPERTY_DB_.getProperty(source); |
|         if (UCharacterPropertyDB.isExceptionIndicator(props)) { |
|             final int exceptionIndex = |
|                 UCharacterPropertyDB.getExceptionIndex(props); |
|             if (PROPERTY_DB_.hasExceptionValue(exceptionIndex, |
|                     UCharacterPropertyDB.EXC_SPECIAL_CASING_)) { |
|                 // special casing writes its own output into the buffer |
|                 getSpecialLowerCase(source, exceptionIndex, buffer, str, |
|                     index, locale); |
|             } |
|             else { |
|                 int mapped = source; |
|                 if (PROPERTY_DB_.hasExceptionValue(exceptionIndex, |
|                         UCharacterPropertyDB.EXC_LOWERCASE_)) { |
|                     mapped = PROPERTY_DB_.getException(exceptionIndex, |
|                         UCharacterPropertyDB.EXC_LOWERCASE_); |
|                 } |
|                 UTF16.append(buffer, mapped); |
|             } |
|         } |
|         else { |
|             int mapped = source; |
|             final int category = UCharacterPropertyDB.getPropType(props); |
|             if (category == UCharacterCategory.UPPERCASE_LETTER |
|                     || category == UCharacterCategory.TITLECASE_LETTER) { |
|                 mapped += UCharacterPropertyDB.getSignedValue(props); |
|             } |
|             UTF16.append(buffer, mapped); |
|         } |
|         index += width; |
|     } |
|     return buffer.toString(); |
| } |
| |
| // TODO: Make public API |
| /** |
| * returns the maximum amount that a single character will expand in |
| * upper, lower, title, or fold case operations |
| */ |
| static int getMaxCaseExpansion() { |
| return 10; |
| } |
| |
| // TODO: Make public API? |
| /** |
|  * Produces the lowercase mapping of the single code point (possibly a |
|  * surrogate pair) found at offset in str.<br> |
|  * The original string has to be passed because it supplies the context |
|  * for special casing: str, offset and locale are forwarded to |
|  * getSpecialLowerCase. |
|  * @param locale locale handed to the special casing routine |
|  * @param str string that contains the code point and its context |
|  * @param offset index in str of the code point to be converted |
|  * @param result buffer that receives the mapping, written from index 0. |
|  *        It must be large enough to hold the mapping; presumably |
|  *        getMaxCaseExpansion() chars are sufficient - TODO confirm |
|  * @return length of returned value IF there is a change. -1 otherwise. |
|  */ |
| static int toLowerCase(Locale locale, String str, int offset, char[] result) { |
| |
|     // NOTE: we have to keep the original string around, because it is used |
|     // for the context |
| |
|     int ch = UTF16.charAt(str, offset); |
|     int props = PROPERTY_DB_.getProperty(ch); |
|     if (!UCharacterPropertyDB.isExceptionIndicator(props)) { |
|         int type = UCharacterPropertyDB.getPropType(props); |
|         if (type == UCharacterCategory.UPPERCASE_LETTER || |
|             type == UCharacterCategory.TITLECASE_LETTER) { |
|             // the property word stores the distance to the lowercase form |
|             int chDelta = UCharacterPropertyDB.getSignedValue(props); |
|             if (chDelta == 0) { |
|                 return -1; |
|             } |
|             return UTF16.append(result, 0, ch + chDelta); |
|         } |
|     } else { |
|         int index = UCharacterPropertyDB.getExceptionIndex(props); |
|         if (PROPERTY_DB_.hasExceptionValue(index, |
|                 UCharacterPropertyDB.EXC_SPECIAL_CASING_)) { |
|             // TODO: avoid StringBuffer, put directly into array? |
|             StringBuffer buf = new StringBuffer(); |
|             getSpecialLowerCase(ch, index, buf, str, offset, |
|                 locale); |
|             Utility.getChars(buf, 0, buf.length(), result, 0); |
|             return buf.length(); |
|         } else if (PROPERTY_DB_.hasExceptionValue(index, |
|                 UCharacterPropertyDB.EXC_LOWERCASE_)) { |
|             return UTF16.append(result, 0, PROPERTY_DB_.getException(index, |
|                 UCharacterPropertyDB.EXC_LOWERCASE_)); |
|         } |
|     } |
|     // no mapping applies: report "unchanged" |
|     return -1; |
| } |
| |
| // TODO: Make public API? |
| /** |
|  * Produces the uppercase mapping of the single code point (possibly a |
|  * surrogate pair) found at offset in str.<br> |
|  * The original string has to be passed because it supplies the context |
|  * for special casing: str, offset and locale are forwarded to |
|  * getSpecialUpperCase. |
|  * @param locale locale handed to the special casing routine |
|  * @param str string that contains the code point and its context |
|  * @param offset index in str of the code point to be converted |
|  * @param result buffer that receives the mapping, written from index 0. |
|  *        It must be large enough to hold the mapping; presumably |
|  *        getMaxCaseExpansion() chars are sufficient - TODO confirm |
|  * @return length of returned value IF there is a change. -1 otherwise. |
|  */ |
| static int toUpperCase(Locale locale, String str, int offset, char[] result) { |
| |
|     // NOTE: we have to keep the original string around, because it is used |
|     // for the context |
| |
|     int ch = UTF16.charAt(str, offset); |
|     int props = PROPERTY_DB_.getProperty(ch); |
|     if (!UCharacterPropertyDB.isExceptionIndicator(props)) { |
|         // Only a lowercase letter stores its distance to the uppercase |
|         // form. For a titlecase letter the signed value is the distance |
|         // to the lowercase form (toLowerCase and foldCase add it), so it |
|         // must not be subtracted here; this matches |
|         // toUpperCase(Locale, String) and toTitleCase. |
|         if (UCharacterPropertyDB.getPropType(props) == |
|                 UCharacterCategory.LOWERCASE_LETTER) { |
|             int chDelta = UCharacterPropertyDB.getSignedValue(props); |
|             if (chDelta == 0) { |
|                 return -1; |
|             } |
|             return UTF16.append(result, 0, ch - chDelta); |
|         } |
|     } else { |
|         int index = UCharacterPropertyDB.getExceptionIndex(props); |
|         if (PROPERTY_DB_.hasExceptionValue(index, |
|                 UCharacterPropertyDB.EXC_SPECIAL_CASING_)) { |
|             // TODO: avoid StringBuffer, put directly into array? |
|             StringBuffer buf = new StringBuffer(); |
|             getSpecialUpperCase(ch, index, buf, str, offset, |
|                 locale); |
|             Utility.getChars(buf, 0, buf.length(), result, 0); |
|             return buf.length(); |
|         } else if (PROPERTY_DB_.hasExceptionValue(index, |
|                 UCharacterPropertyDB.EXC_UPPERCASE_)) { |
|             return UTF16.append(result, 0, PROPERTY_DB_.getException(index, |
|                 UCharacterPropertyDB.EXC_UPPERCASE_)); |
|         } |
|     } |
|     // no mapping applies: report "unchanged" |
|     return -1; |
| } |
| |
| // TODO: Make public API? |
| /** |
|  * Produces the titlecase mapping of the single code point (possibly a |
|  * surrogate pair) found at offset in str.<br> |
|  * The original string has to be passed because it supplies the context |
|  * for special casing: str, offset and locale are forwarded to |
|  * getSpecialUpperCase, which is what this method uses for special casing |
|  * entries. |
|  * @param locale locale handed to the special casing routine |
|  * @param str string that contains the code point and its context |
|  * @param offset index in str of the code point to be converted |
|  * @param result buffer that receives the mapping, written from index 0. |
|  *        It must be large enough to hold the mapping; presumably |
|  *        getMaxCaseExpansion() chars are sufficient - TODO confirm |
|  * @return length of returned value IF there is a change. -1 otherwise. |
|  */ |
| static int toTitleCase(Locale locale, String str, int offset, char[] result) { |
| |
|     // NOTE: we have to keep the original string around, because it is used |
|     // for the context |
| |
|     // TODO: simplify code by checking for the few special titlecases, |
|     // and just jump to uppercase for the rest. |
| |
|     int ch = UTF16.charAt(str, offset); |
|     int props = PROPERTY_DB_.getProperty(ch); |
|     if (!UCharacterPropertyDB.isExceptionIndicator(props)) { |
|         int type = UCharacterPropertyDB.getPropType(props); |
|         if (type == UCharacterCategory.LOWERCASE_LETTER) { |
|             // here, titlecase is same as uppercase |
|             int chDelta = UCharacterPropertyDB.getSignedValue(props); |
|             if (chDelta == 0) { |
|                 return -1; |
|             } |
|             return UTF16.append(result, 0, ch - chDelta); |
|         } |
|     } else { |
|         int index = UCharacterPropertyDB.getExceptionIndex(props); |
|         // an explicit titlecase value wins over special casing, which in |
|         // turn wins over the plain uppercase value |
|         if (PROPERTY_DB_.hasExceptionValue(index, |
|                 UCharacterPropertyDB.EXC_TITLECASE_)) { |
|             return UTF16.append(result, 0, PROPERTY_DB_.getException(index, |
|                 UCharacterPropertyDB.EXC_TITLECASE_)); |
|         } else if (PROPERTY_DB_.hasExceptionValue(index, |
|                 UCharacterPropertyDB.EXC_SPECIAL_CASING_)) { |
|             // TODO: avoid StringBuffer, put directly into array? |
|             StringBuffer buf = new StringBuffer(); |
|             getSpecialUpperCase(ch, index, buf, str, offset, |
|                 locale); |
|             Utility.getChars(buf, 0, buf.length(), result, 0); |
|             return buf.length(); |
|         } else if (PROPERTY_DB_.hasExceptionValue(index, |
|                 UCharacterPropertyDB.EXC_UPPERCASE_)) { |
|             return UTF16.append(result, 0, PROPERTY_DB_.getException(index, |
|                 UCharacterPropertyDB.EXC_UPPERCASE_)); |
|         } |
|     } |
|     // no mapping applies: report "unchanged" |
|     return -1; |
| } |
| |
| /** |
| * The given character is mapped to its case folding equivalent according to |
| * UnicodeData.txt and CaseFolding.txt; if the character has no case folding |
| * equivalent, the character itself is returned. |
| * Only "simple", single-code point case folding mappings are used. |
| * For "full", multiple-code point mappings use the API |
| * foldCase(String str, boolean defaultmapping). |
| * @param ch the character to be converted |
| * @param defaultmapping Indicates if all mappings defined in CaseFolding.txt |
| * is to be used, otherwise the mappings for dotted I |
| * and dotless i marked with 'I' in CaseFolding.txt will |
| * be skipped. |
| * @return the case folding equivalent of the character, if any; |
| * otherwise the character itself. |
| * @see #foldCase(String, boolean) |
| */ |
| public static int foldCase(int ch, boolean defaultmapping) |
| { |
| int props = PROPERTY_DB_.getProperty(ch); |
| if (!UCharacterPropertyDB.isExceptionIndicator(props)) { |
| int type = UCharacterPropertyDB.getPropType(props); |
| if (type == UCharacterCategory.UPPERCASE_LETTER || |
| type == UCharacterCategory.TITLECASE_LETTER) { |
| return ch + UCharacterPropertyDB.getSignedValue(props); |
| } |
| } |
| else { |
| int index = UCharacterPropertyDB.getExceptionIndex(props); |
| if (PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_CASE_FOLDING_)) { |
| int exception = PROPERTY_DB_.getException(index, |
| UCharacterPropertyDB.EXC_CASE_FOLDING_); |
| if (exception != 0) { |
| int foldedcasech = |
| PROPERTY_DB_.getFoldCase(exception & LAST_CHAR_MASK_); |
| if (foldedcasech != 0){ |
| return foldedcasech; |
| } |
| } |
| else { |
| // special case folding mappings, hardcoded |
| if (defaultmapping && |
| (ch == LATIN_SMALL_LETTER_DOTLESS_I_ || |
| ch == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_)) { |
| // map dotted I and dotless i to U+0069 small i |
| return LATIN_SMALL_LETTER_I_; |
| } |
| // return ch itself because it is excluded from case folding |
| return ch; |
| } |
| } |
| if (PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_LOWERCASE_)) { |
| // not else! - allow to fall through from above |
| return PROPERTY_DB_.getException(index, |
| UCharacterPropertyDB.EXC_LOWERCASE_); |
| } |
| } |
| |
| return ch; // no mapping - return the character itself |
| } |
| |
| /** |
|  * The given string is mapped to its case folding equivalent according to |
|  * UnicodeData.txt and CaseFolding.txt; a character that has no case folding |
|  * equivalent is copied to the result as it is. |
|  * "Full", multiple-code point case folding mappings are returned here. |
|  * For "simple" single-code point mappings use the API |
|  * foldCase(int ch, boolean defaultmapping). |
|  * @param str the String to be converted |
|  * @param defaultmapping Indicates if all mappings defined in CaseFolding.txt |
|  *        is to be used, otherwise the mappings for dotted I |
|  *        and dotless i marked with 'I' in CaseFolding.txt will |
|  *        be skipped. |
|  * @return the case folded version of the string |
|  * @see #foldCase(int, boolean) |
|  */ |
| public static String foldCase(String str, boolean defaultmapping) |
| { |
|     final int length = str.length(); |
|     final StringBuffer buffer = new StringBuffer(length); |
|     int index = 0; |
| |
|     // case mapping loop |
|     while (index < length) { |
|         final int source = UTF16.charAt(str, index); |
|         index += UTF16.getCharCount(source); |
|         final int props = PROPERTY_DB_.getProperty(source); |
| |
|         if (!UCharacterPropertyDB.isExceptionIndicator(props)) { |
|             // handle 1:1 code point mappings from UnicodeData.txt |
|             int folded = source; |
|             final int category = UCharacterPropertyDB.getPropType(props); |
|             if (category == UCharacterCategory.UPPERCASE_LETTER |
|                     || category == UCharacterCategory.TITLECASE_LETTER) { |
|                 folded += UCharacterPropertyDB.getSignedValue(props); |
|             } |
|             UTF16.append(buffer, folded); |
|         } |
|         else { |
|             final int exceptionIndex = |
|                 UCharacterPropertyDB.getExceptionIndex(props); |
|             if (!PROPERTY_DB_.hasExceptionValue(exceptionIndex, |
|                     UCharacterPropertyDB.EXC_CASE_FOLDING_)) { |
|                 // no folding entry: use the lowercase value when present |
|                 int folded = source; |
|                 if (PROPERTY_DB_.hasExceptionValue(exceptionIndex, |
|                         UCharacterPropertyDB.EXC_LOWERCASE_)) { |
|                     folded = PROPERTY_DB_.getException(exceptionIndex, |
|                         UCharacterPropertyDB.EXC_LOWERCASE_); |
|                 } |
|                 UTF16.append(buffer, folded); |
|             } |
|             else { |
|                 final int folding = PROPERTY_DB_.getException( |
|                     exceptionIndex, UCharacterPropertyDB.EXC_CASE_FOLDING_); |
|                 if (folding != 0) { |
|                     // the full mapping is appended by the database itself |
|                     PROPERTY_DB_.getFoldCase(folding & LAST_CHAR_MASK_, |
|                         folding >> SHIFT_24_, buffer); |
|                 } |
|                 else if (defaultmapping && |
|                          (source == LATIN_SMALL_LETTER_DOTLESS_I_ || |
|                           source == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_)) { |
|                     // special case folding mappings, hardcoded: |
|                     // map dotted I and dotless i to U+0069 small i |
|                     buffer.append(LATIN_SMALL_LETTER_I_); |
|                 } |
|                 else { |
|                     // output the code point itself because it is excluded |
|                     // from case folding |
|                     UTF16.append(buffer, source); |
|                 } |
|             } |
|         } |
|     } |
| |
|     return buffer.toString(); |
| } |
| |
| /** |
| * Return numeric value of Han code points. |
| * <br> This returns the value of Han 'numeric' code points, |
| * including those for zero, ten, hundred, thousand, ten thousand, |
| * and hundred million. Unicode does not consider these to be |
| * numeric. This includes both the standard and 'checkwriting' |
| * characters, the 'big circle' zero character, and the standard |
| * zero character. |
| * @draft |
| * @param ch code point to query |
| * @return value if it is a Han 'numeric character,' otherwise return -1. |
| */ |
| public static int getHanNumericValue(int ch) |
| { |
| switch(ch) |
| { |
| case IDEOGRAPHIC_NUMBER_ZERO_ : |
| case CJK_IDEOGRAPH_COMPLEX_ZERO_ : |
| return 0; // Han Zero |
| case CJK_IDEOGRAPH_FIRST_ : |
| case CJK_IDEOGRAPH_COMPLEX_ONE_ : |
| return 1; // Han One |
| case CJK_IDEOGRAPH_SECOND_ : |
| case CJK_IDEOGRAPH_COMPLEX_TWO_ : |
| return 2; // Han Two |
| case CJK_IDEOGRAPH_THIRD_ : |
| case CJK_IDEOGRAPH_COMPLEX_THREE_ : |
| return 3; // Han Three |
| case CJK_IDEOGRAPH_FOURTH_ : |
| case CJK_IDEOGRAPH_COMPLEX_FOUR_ : |
| return 4; // Han Four |
| case CJK_IDEOGRAPH_FIFTH_ : |
| case CJK_IDEOGRAPH_COMPLEX_FIVE_ : |
| return 5; // Han Five |
| case CJK_IDEOGRAPH_SIXTH_ : |
| case CJK_IDEOGRAPH_COMPLEX_SIX_ : |
| return 6; // Han Six |
| case CJK_IDEOGRAPH_SEVENTH_ : |
| case CJK_IDEOGRAPH_COMPLEX_SEVEN_ : |
| return 7; // Han Seven |
| case CJK_IDEOGRAPH_EIGHTH_ : |
| case CJK_IDEOGRAPH_COMPLEX_EIGHT_ : |
| return 8; // Han Eight |
| case CJK_IDEOGRAPH_NINETH_ : |
| case CJK_IDEOGRAPH_COMPLEX_NINE_ : |
| return 9; // Han Nine |
| case CJK_IDEOGRAPH_TEN_ : |
| case CJK_IDEOGRAPH_COMPLEX_TEN_ : |
| return 10; |
| case CJK_IDEOGRAPH_HUNDRED_ : |
| case CJK_IDEOGRAPH_COMPLEX_HUNDRED_ : |
| return 100; |
| case CJK_IDEOGRAPH_THOUSAND_ : |
| case CJK_IDEOGRAPH_COMPLEX_THOUSAND_ : |
| return 1000; |
| case CJK_IDEOGRAPH_TEN_THOUSAND_ : |
| return 10000; |
| case CJK_IDEOGRAPH_HUNDRED_MILLION_ : |
| return 100000000; |
| } |
| return -1; // no value |
| } |
| |
| // protected variables =================================== |
| |
| /** |
| * Shift and mask values for surrogates.<br> |
| * LEAD_SURROGATE_SHIFT_ is the number of bits by which |
| * getRawSupplementary(char, char) shifts the lead surrogate to the left |
| * before adding the trail surrogate.<br> |
| * TRAIL_SURROGATE_MASK_ (0x3FF) selects the low 10 bits of a value; |
| * presumably it is applied to trail surrogates by code that is not |
| * visible in this part of the file. |
| */ |
| protected static final int LEAD_SURROGATE_SHIFT_ = 10; |
| protected static final int TRAIL_SURROGATE_MASK_ = 0x3FF; |
| |
| // protected methods ==================================================== |
| |
| /** |
| * Forms a supplementary code point from the argument character<br> |
| * Note this is for internal use hence no checks for the validity of the |
| * surrogate characters are done |
| * @param lead lead surrogate character |
| * @param trail trailing surrogate character |
| * @return code point of the supplementary character |
| */ |
| protected static int getRawSupplementary(char lead, char trail) |
| { |
| return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_; |
| } |
| |
| // private variables ===================================== |
| |
| /** |
|  * Database storing the sets of character property. |
|  * It is created once, by the static initializer that follows. |
|  */ |
| private static final UCharacterPropertyDB PROPERTY_DB_; |
| |
| /** |
|  * Initialization of the UCharacterPropertyDB instance. |
|  * RuntimeException thrown when data is missing or data has been corrupted. |
|  */ |
| static |
| { |
|     UCharacterPropertyDB database; |
|     try { |
|         database = new UCharacterPropertyDB(); |
|     } |
|     catch (Exception e) { |
|         // NOTE(review): only the message of the original exception |
|         // survives; its type and stack trace are lost |
|         throw new RuntimeException(e.getMessage()); |
|     } |
|     PROPERTY_DB_ = database; |
| } |
| |
| /** |
| * Offset to add to combined surrogate pair to avoid masking.<br> |
| * It combines SUPPLEMENTARY_MIN_VALUE with the subtraction of the first |
| * lead surrogate 0xD800 (shifted by LEAD_SURROGATE_SHIFT_) and of 0xDC00, |
| * so that getRawSupplementary(char, char) needs a single addition. |
| */ |
| private static final int SURROGATE_OFFSET_ = |
| SUPPLEMENTARY_MIN_VALUE - (0xD800 << LEAD_SURROGATE_SHIFT_) - 0xDC00; |
| |
| /** |
| * Surrogate code point values: the first (0xD800) and the last (0xDFFF) |
| * value that isLegal(int) rejects as surrogates |
| */ |
| private static final int SURROGATE_MIN_VALUE_ = 0xD800; |
| private static final int SURROGATE_MAX_VALUE_ = 0xDFFF; |
| |
| /** |
| * To get the last character (the low 16 bits) out from a data type |
| */ |
| private static final int LAST_CHAR_MASK_ = 0xFFFF; |
| |
| /** |
| * To get the last byte (the low 8 bits) out from a data type |
| */ |
| private static final int LAST_BYTE_MASK_ = 0xFF; |
| |
| /** |
| * Shift 16 bits |
| */ |
| private static final int SHIFT_16_ = 16; |
| |
| /** |
| * Shift 24 bits |
| */ |
| private static final int SHIFT_24_ = 24; |
| |
| /** |
| * Minimum suffix value that indicates if a character is non character. |
| * Unicode 3.0 non characters |
| */ |
| private static final int NON_CHARACTER_SUFFIX_MIN_3_0_ = 0xFFFE; |
| |
| /** |
| * New minimum non character in Unicode 3.1 |
| */ |
| private static final int NON_CHARACTER_MIN_3_1_ = 0xFDD0; |
| |
| /** |
| * New non character range in Unicode 3.1, expressed as the distance |
| * between 0xFDEF and NON_CHARACTER_MIN_3_1_ |
| */ |
| private static final int NON_CHARACTER_RANGE_3_1_ = |
| 0xFDEF - NON_CHARACTER_MIN_3_1_; |
| |
| /** |
| * Decimal radix |
| */ |
| private static final int DECIMAL_RADIX_ = 10; |
| |
| /** |
| * No break space code point (U+00A0) |
| */ |
| private static final int NO_BREAK_SPACE_ = 0xA0; |
| |
| /** |
| * Narrow no break space code point (U+202F) |
| */ |
| private static final int NARROW_NO_BREAK_SPACE_ = 0x202F; |
| |
| /** |
| * Zero width no break space code point (U+FEFF) |
| */ |
| private static final int ZERO_WIDTH_NO_BREAK_SPACE_ = 0xFEFF; |
| |
| /** |
|  * Ideographic number zero code point (U+3007) |
|  */ |
| private static final int IDEOGRAPHIC_NUMBER_ZERO_ = 0x3007; |
| |
| /** |
|  * CJK Ideograph, First code point (U+4E00, Han digit one) |
|  */ |
| private static final int CJK_IDEOGRAPH_FIRST_ = 0x4e00; |
| |
| /** |
|  * CJK Ideograph, Second code point (U+4E8C, Han digit two) |
|  */ |
| private static final int CJK_IDEOGRAPH_SECOND_ = 0x4e8c; |
| |
| /** |
|  * CJK Ideograph, Third code point (U+4E09, Han digit three) |
|  */ |
| private static final int CJK_IDEOGRAPH_THIRD_ = 0x4e09; |
| |
| /** |
|  * CJK Ideograph, Fourth code point (U+56DB, Han digit four).<br> |
|  * The value used to be 0x56d8, which is a different ideograph, so that |
|  * getHanNumericValue(int) did not report 4 for U+56DB. |
|  */ |
| private static final int CJK_IDEOGRAPH_FOURTH_ = 0x56db; |
| |
| /** |
|  * CJK Ideograph, Fifth code point (U+4E94, Han digit five) |
|  */ |
| private static final int CJK_IDEOGRAPH_FIFTH_ = 0x4e94; |
| |
| /** |
|  * CJK Ideograph, Sixth code point (U+516D, Han digit six) |
|  */ |
| private static final int CJK_IDEOGRAPH_SIXTH_ = 0x516d; |
| |
| /** |
|  * CJK Ideograph, Seventh code point (U+4E03, Han digit seven) |
|  */ |
| private static final int CJK_IDEOGRAPH_SEVENTH_ = 0x4e03; |
| |
| /** |
|  * CJK Ideograph, Eighth code point (U+516B, Han digit eight) |
|  */ |
| private static final int CJK_IDEOGRAPH_EIGHTH_ = 0x516b; |
| |
| /** |
|  * CJK Ideograph, Ninth code point (U+4E5D, Han digit nine) |
|  */ |
| private static final int CJK_IDEOGRAPH_NINETH_ = 0x4e5d; |
| |
| /** |
| * Application Program command code point (U+009F) |
| */ |
| private static final int APPLICATION_PROGRAM_COMMAND_ = 0x009F; |
| |
| /** |
| * Unit separator code point (U+001F) |
| */ |
| private static final int UNIT_SEPERATOR_ = 0x001F; |
| |
| /** |
| * Delete code point (U+007F) |
| */ |
| private static final int DELETE_ = 0x007F; |
| |
| /** |
| * Turkish ISO 639 two-letter language code |
| */ |
| private static final String TURKISH_ = "tr"; |
| |
| /** |
| * Azerbaijani ISO 639 two-letter language code |
| */ |
| private static final String AZERBAIJANI_ = "az"; |
| |
| /** |
| * Lithuanian ISO 639 two-letter language code |
| */ |
| private static final String LITHUANIAN_ = "lt"; |
| |
| /** |
| * Latin lowercase i (U+0069) |
| */ |
| private static final char LATIN_SMALL_LETTER_I_ = 0x69; |
| |
| /** |
| * Latin uppercase I (U+0049) |
| */ |
| private static final char LATIN_CAPITAL_LETTER_I_ = 0x49; |
| |
| /** |
| * Latin capital letter i with dot above (U+0130) |
| */ |
| private static final char LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_ = 0x130; |
| |
| /** |
| * Latin small letter dotless i (U+0131) |
| */ |
| private static final char LATIN_SMALL_LETTER_DOTLESS_I_ = 0x131; |
| |
| /** |
| * Combining dot above (U+0307) |
| */ |
| private static final char COMBINING_DOT_ABOVE_ = 0x307; |
| |
| /** |
| * Greek capital letter sigma (U+03A3) |
| */ |
| private static final char GREEK_CAPITAL_LETTER_SIGMA_ = 0x3a3; |
| |
| /** |
| * Greek small letter sigma (U+03C3) |
| */ |
| private static final char GREEK_SMALL_LETTER_SIGMA_ = 0x3c3; |
| |
| /** |
| * Named after Greek small letter rho, but the value 0x3c2 is U+03C2, |
| * GREEK SMALL LETTER FINAL SIGMA (rho is U+03C1).<br> |
| * NOTE(review): the code using this constant is not visible here; |
| * presumably final sigma is what it needs - confirm before renaming or |
| * changing the value. |
| */ |
| private static final char GREEK_SMALL_LETTER_RHO_ = 0x3c2; |
| |
| /** |
| * ISO control character first range upper limit 0x0 - 0x1F |
| */ |
| private static final int ISO_CONTROL_FIRST_RANGE_MAX_ = 0x1F; |
| |
| /** |
|  * Han digit characters: the 'complex' forms of zero to nine, followed by |
|  * the standard and complex forms of ten, hundred and thousand, and the |
|  * ideographs for ten thousand and hundred million. These are the code |
|  * points that getHanNumericValue(int) recognises in addition to the |
|  * standard digits.<br> |
|  * CJK_IDEOGRAPH_TEN_THOUSAND_ is U+842C; it used to be 0x824c, a |
|  * transposition of the hex digits, so that getHanNumericValue(int) did |
|  * not report 10000 for U+842C. |
|  */ |
| private static final int CJK_IDEOGRAPH_COMPLEX_ZERO_ = 0x96f6; |
| private static final int CJK_IDEOGRAPH_COMPLEX_ONE_ = 0x58f9; |
| private static final int CJK_IDEOGRAPH_COMPLEX_TWO_ = 0x8cb3; |
| private static final int CJK_IDEOGRAPH_COMPLEX_THREE_ = 0x53c3; |
| private static final int CJK_IDEOGRAPH_COMPLEX_FOUR_ = 0x8086; |
| private static final int CJK_IDEOGRAPH_COMPLEX_FIVE_ = 0x4f0d; |
| private static final int CJK_IDEOGRAPH_COMPLEX_SIX_ = 0x9678; |
| private static final int CJK_IDEOGRAPH_COMPLEX_SEVEN_ = 0x67d2; |
| private static final int CJK_IDEOGRAPH_COMPLEX_EIGHT_ = 0x634c; |
| private static final int CJK_IDEOGRAPH_COMPLEX_NINE_ = 0x7396; |
| private static final int CJK_IDEOGRAPH_TEN_ = 0x5341; |
| private static final int CJK_IDEOGRAPH_COMPLEX_TEN_ = 0x62fe; |
| private static final int CJK_IDEOGRAPH_HUNDRED_ = 0x767e; |
| private static final int CJK_IDEOGRAPH_COMPLEX_HUNDRED_ = 0x4f70; |
| private static final int CJK_IDEOGRAPH_THOUSAND_ = 0x5343; |
| private static final int CJK_IDEOGRAPH_COMPLEX_THOUSAND_ = 0x4edf; |
| private static final int CJK_IDEOGRAPH_TEN_THOUSAND_ = 0x842c; |
| private static final int CJK_IDEOGRAPH_HUNDRED_MILLION_ = 0x5104; |
| |
| /** |
| * Hyphens: HYPHEN (U+2010) and SOFT HYPHEN (U+00AD) |
| */ |
| private static final int HYPHEN_ = 0x2010; |
| private static final int SOFT_HYPHEN_ = 0xAD; |
| |
| /** |
| * LATIN SMALL LETTER J (U+006A) |
| */ |
| private static final int LATIN_SMALL_LETTER_J_ = 0x6a; |
| |
| /** |
| * LATIN SMALL LETTER I WITH OGONEK (U+012F) |
| */ |
| private static final int LATIN_SMALL_LETTER_I_WITH_OGONEK_ = 0x12f; |
| |
| /** |
| * LATIN SMALL LETTER I WITH TILDE BELOW (U+1E2D) |
| */ |
| private static final int LATIN_SMALL_LETTER_I_WITH_TILDE_BELOW_ = 0x1e2d; |
| |
| /** |
| * LATIN SMALL LETTER I WITH DOT BELOW (U+1ECB) |
| */ |
| private static final int LATIN_SMALL_LETTER_I_WITH_DOT_BELOW_ = 0x1ecb; |
| |
| /** |
| * Combining class for combining mark above (value 230); presumably |
| * compared with the result of getCombiningClass(int) by code that is not |
| * visible in this part of the file |
| */ |
| private static final int COMBINING_MARK_ABOVE_CLASS_ = 230; |
| |
| /** |
| * LATIN CAPITAL LETTER J (U+004A) |
| */ |
| private static final int LATIN_CAPITAL_LETTER_J_ = 0x4a; |
| |
| /** |
| * LATIN CAPITAL LETTER I WITH OGONEK (U+012E) |
| */ |
| private static final int LATIN_CAPITAL_I_WITH_OGONEK_ = 0x12e; |
| |
| /** |
| * LATIN CAPITAL LETTER I WITH TILDE (U+0128) |
| */ |
| private static final int LATIN_CAPITAL_I_WITH_TILDE_ = 0x128; |
| |
| /** |
| * LATIN CAPITAL LETTER I WITH GRAVE (U+00CC) |
| */ |
| private static final int LATIN_CAPITAL_I_WITH_GRAVE_ = 0xcc; |
| |
| /** |
| * LATIN CAPITAL LETTER I WITH ACUTE (U+00CD) |
| */ |
| private static final int LATIN_CAPITAL_I_WITH_ACUTE_ = 0xcd; |
| |
| /** |
| * COMBINING GRAVE ACCENT (U+0300) |
| */ |
| private static final int COMBINING_GRAVE_ACCENT_ = 0x300; |
| |
| /** |
| * COMBINING ACUTE ACCENT (U+0301) |
| */ |
| private static final int COMBINING_ACUTE_ACCENT_ = 0x301; |
| |
| /** |
| * COMBINING TILDE (U+0303) |
| */ |
| private static final int COMBINING_TILDE_ = 0x303; |
| |
| // private methods ============================================== |
| |
| /** |
| * Looks up the property information of a code point in the property |
| * database PROPERTY_DB_. |
| * @param ch code point whose information is to be retrieved |
| * @return the 32 bit property information, or 0 if ch lies outside the |
| *         range UCharacter.MIN_VALUE to UCharacter.MAX_VALUE |
| */ |
| private static int getProps(int ch) |
| { |
|     // out-of-range code points have no data |
|     if (ch < UCharacter.MIN_VALUE || ch > UCharacter.MAX_VALUE) { |
|         return 0; |
|     } |
|     return PROPERTY_DB_.getProperty(ch); |
| } |
| |
| /** |
| * Maps a locale onto the locale that the case mapping actually works with. |
| * A two-letter language code equal to TURKISH_ or AZERBAIJANI_ yields the |
| * locale ("tr", "TR"), one equal to LITHUANIAN_ yields ("lt", "LT"); every |
| * other locale, including one without a two-letter language, is returned |
| * unchanged. |
| * @param locale locale to work with |
| * @return locale which the actual case mapping works with |
| */ |
| private static Locale getCaseLocale(Locale locale) |
| { |
|     String lang = locale.getLanguage(); |
| |
|     // only two-letter language codes are examined; the locale can have |
|     // no language at all |
|     if (lang.length() == 2) { |
|         if (lang.equals(TURKISH_) || lang.equals(AZERBAIJANI_)) { |
|             return new Locale("tr", "TR"); |
|         } |
|         if (lang.equals(LITHUANIAN_)) { |
|             return new Locale("lt", "LT"); |
|         } |
|     } |
|     return locale; |
| } |
| |
| /** |
| * Tests whether a code point is case ignorable. |
| * In Unicode 3.1.1, an ignorable sequence is a sequence of *zero* or more |
| * characters from the set {HYPHEN, SOFT HYPHEN, general category = Mn}. |
| * (Expected to change!) |
| * @param ch codepoint |
| * @param cat general category of the argument codepoint |
| * @return true if ch is HYPHEN or SOFT HYPHEN, or if cat is the non-spacing |
| *         mark category |
| */ |
| private static boolean isIgnorable(int ch, int cat) |
| { |
|     if (ch == HYPHEN_ || ch == SOFT_HYPHEN_) { |
|         return true; |
|     } |
|     return cat == UCharacterCategory.NON_SPACING_MARK; |
| } |
| |
| /** |
| * Determines if the code point at offset is not followed by a sequence |
| * consisting of an ignorable sequence and then a cased letter {Ll, Lu, Lt}. |
| * The scan starts at the code point after the one at offset and stops at |
| * the first character that is either cased or not ignorable. |
| * @param str string to examine |
| * @param offset offset in string to check |
| * @return false if a cased letter follows with only ignorable characters in |
| *         between, true otherwise (including when the end of the string is |
| *         reached) |
| * @see SpecialCasing.txt |
| */ |
| private static boolean isCFINAL(String str, int offset) |
| { |
|     int limit = str.length(); |
|     // step over the code point at offset itself |
|     int pos = offset + UTF16.getCharCount(UTF16.charAt(str, offset)); |
|     while (pos < limit) { |
|         int c = UTF16.charAt(str, pos); |
|         int type = getType(c); |
|         boolean cased = type == UCharacterCategory.LOWERCASE_LETTER |
|                         || type == UCharacterCategory.UPPERCASE_LETTER |
|                         || type == UCharacterCategory.TITLECASE_LETTER; |
|         if (cased) { |
|             return false; // followed by cased letter |
|         } |
|         if (!isIgnorable(c, type)) { |
|             return true; // sequence broken by a non ignorable character |
|         } |
|         pos += UTF16.getCharCount(c); |
|     } |
| |
|     // nothing but ignorable characters up to the end of the string |
|     return true; |
| } |
| |
| /** |
| * Determines if offset is preceded by a sequence consisting of a cased |
| * letter {Ll, Lu, Lt} and an ignorable sequence, i.e. the position is not |
| * an initial one. The scan walks backwards from the position just before |
| * offset and stops at the first character that is either cased or not |
| * ignorable. |
| * @param str string to examine |
| * @param offset offset in string to check |
| * @return true if a cased letter precedes offset with only ignorable |
| *         characters in between, false otherwise (including when the start |
| *         of the string is reached) |
| * @see SpecialCasing.txt |
| */ |
| private static boolean isNotCINITIAL(String str, int offset) |
| { |
|     int pos = offset - 1; |
|     while (pos >= 0) { |
|         int c = UTF16.charAt(str, pos); |
|         int type = getType(c); |
|         boolean cased = type == UCharacterCategory.LOWERCASE_LETTER |
|                         || type == UCharacterCategory.UPPERCASE_LETTER |
|                         || type == UCharacterCategory.TITLECASE_LETTER; |
|         if (cased) { |
|             return true; // preceded by cased letter |
|         } |
|         if (!isIgnorable(c, type)) { |
|             return false; // sequence broken by a non ignorable character |
|         } |
|         pos -= UTF16.getCharCount(c); |
|     } |
| |
|     // start of string reached without meeting a cased letter |
|     return false; |
| } |
| |
| /** |
| * Determines if a string at offset is preceded by any base character of |
| * the set { 'i', 'j', U+012f, U+1e2d, U+1ecb } with no intervening |
| * character of combining class = 230. |
| * The scan walks backwards from the position just before offset. |
| * @param str string to be examined |
| * @param offset offset in string to check |
| * @return true if such a base character is found before a character of |
| *         combining class 0 or 230 is met, false otherwise |
| * @see SpecialCasing.txt |
| */ |
| private static boolean isAFTER_i(String str, int offset) |
| { |
|     for (int pos = offset - 1; pos >= 0; ) { |
|         int c = UTF16.charAt(str, pos); |
|         boolean typeI = c == LATIN_SMALL_LETTER_I_ |
|                         || c == LATIN_SMALL_LETTER_J_ |
|                         || c == LATIN_SMALL_LETTER_I_WITH_OGONEK_ |
|                         || c == LATIN_SMALL_LETTER_I_WITH_TILDE_BELOW_ |
|                         || c == LATIN_SMALL_LETTER_I_WITH_DOT_BELOW_; |
|         if (typeI) { |
|             return true; // preceded by TYPE_i |
|         } |
| |
|         int combining = getCombiningClass(c); |
|         if (combining == 0 || combining == COMBINING_MARK_ABOVE_CLASS_) { |
|             // preceded by a different base character (not TYPE_i), or |
|             // intervening cc == 230 |
|             return false; |
|         } |
|         pos -= UTF16.getCharCount(c); |
|     } |
| |
|     return false; // not preceded by TYPE_i |
| } |
| |
| /** |
| * Determines if a string at offset is preceded by the base character 'I' |
| * with no intervening character of combining class = 230. |
| * The scan walks backwards from the position just before offset. |
| * @param str string to be examined |
| * @param offset offset in string to check |
| * @return true if 'I' is found before a character of combining class 0 or |
| *         230 is met, false otherwise |
| * @see SpecialCasing.txt |
| */ |
| private static boolean isAFTER_I(String str, int offset) |
| { |
|     for (int pos = offset - 1; pos >= 0; ) { |
|         int c = UTF16.charAt(str, pos); |
|         if (c == LATIN_CAPITAL_LETTER_I_) { |
|             return true; // preceded by I |
|         } |
| |
|         int combining = getCombiningClass(c); |
|         if (combining == 0 || combining == COMBINING_MARK_ABOVE_CLASS_) { |
|             // preceded by a different base character (not I), or |
|             // intervening cc == 230 |
|             return false; |
|         } |
|         pos -= UTF16.getCharCount(c); |
|     } |
| |
|     return false; // not preceded by I |
| } |
| |
| /** |
| * Determines if a string at offset is followed by one or more characters |
| * of combining class = 230. |
| * The scan starts at the code point after the one at offset and stops at |
| * the first character of combining class 230 or 0. |
| * @param str string to be examined |
| * @param offset offset in string to check |
| * @return true if a character of combining class 230 follows before the |
| *         next character of combining class 0, false otherwise |
| * @see SpecialCasing.txt |
| */ |
| private static boolean isFollowedByMOREABOVE(String str, int offset) |
| { |
|     int length = str.length(); |
|     // step over the code point at offset itself; its width must be taken |
|     // from the character at offset, not from the start of the string |
|     offset += UTF16.getCharCount(UTF16.charAt(str, offset)); |
|     while (offset < length) { |
|         int ch = UTF16.charAt(str, offset); |
|         int cc = getCombiningClass(ch); |
|         if (cc == COMBINING_MARK_ABOVE_CLASS_) { |
|             return true; // at least one cc==230 following |
|         } |
|         if (cc == 0) { |
|             return false; // next base character, no more cc==230 following |
|         } |
|         offset += UTF16.getCharCount(ch); |
|     } |
| |
|     return false; // no more cc == 230 following |
| } |
| |
| /** |
| * Determines if a string at offset is followed by a dot above |
| * with no characters of combining class == 230 in between. |
| * The scan starts at the code point after the one at offset. |
| * @param str string to be examined |
| * @param offset offset in string to check |
| * @return true if a string at offset is followed by a dot above |
| *         with no characters of combining class == 230 or 0 in between, |
| *         false otherwise |
| * @see SpecialCasing.txt |
| */ |
| private static boolean isFollowedByDotAbove(String str, int offset) |
| { |
|     int length = str.length(); |
|     // step over the code point at offset itself; its width must be taken |
|     // from the character at offset, not from the start of the string |
|     offset += UTF16.getCharCount(UTF16.charAt(str, offset)); |
|     while (offset < length) { |
|         int ch = UTF16.charAt(str, offset); |
|         if (ch == COMBINING_DOT_ABOVE_) { |
|             return true; |
|         } |
|         int cc = getCombiningClass(ch); |
|         if (cc == 0 || cc == COMBINING_MARK_ABOVE_CLASS_) { |
|             return false; // next base character or cc==230 in between |
|         } |
|         offset += UTF16.getCharCount(ch); |
|     } |
| |
|     return false; // no dot above following |
| } |
| |
| /** |
| * Special casing uppercase management. |
| * Appends the uppercase mapping of ch to buffer. When the exception data |
| * holds a non-negative special casing value, the mapping string is taken |
| * from the data file; otherwise the hardcoded Turkish/Azerbaijani and |
| * Lithuanian conditions are tried before falling back to the normal |
| * mapping. |
| * @param ch code point to convert |
| * @param index of exception containing special case information |
| * @param buffer to add uppercase |
| * @param str original string |
| * @param offset index of ch in str |
| * @param locale current locale; its language selects the Turkish, |
| *        Azerbaijani or Lithuanian specific mappings |
| */ |
| private static void getSpecialUpperCase(int ch, int index, |
|                                         StringBuffer buffer, String str, |
|                                         int offset, Locale locale) |
| { |
|     int special = PROPERTY_DB_.getException(index, |
|                             UCharacterPropertyDB.EXC_SPECIAL_CASING_); |
|     if (special >= 0) { |
|         // get the special case mapping string from the data file |
|         int stringIndex = special & LAST_CHAR_MASK_; |
|         PROPERTY_DB_.getUpperCase(stringIndex, buffer); |
|         return; |
|     } |
| |
|     // use hardcoded conditions and mappings |
|     String lang = locale.getLanguage(); |
|     boolean turkic = lang.equals(TURKISH_) || lang.equals(AZERBAIJANI_); |
|     if (turkic && ch == LATIN_SMALL_LETTER_I_) { |
|         // turkish: i maps to dotted I |
|         buffer.append(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_); |
|         return; |
|     } |
|     if (lang.equals(LITHUANIAN_) && ch == COMBINING_DOT_ABOVE_ |
|         && isAFTER_i(str, offset)) { |
|         // lithuanian: remove DOT ABOVE after U+0069 "i" with |
|         // upper or titlecase |
|         return; // remove the dot (continue without output) |
|     } |
| |
|     // no known conditional special case mapping, use a normal mapping |
|     if (PROPERTY_DB_.hasExceptionValue(index, |
|                                 UCharacterPropertyDB.EXC_UPPERCASE_)) { |
|         UTF16.append(buffer, PROPERTY_DB_.getException(index, |
|                                 UCharacterPropertyDB.EXC_UPPERCASE_)); |
|     } |
|     else { |
|         UTF16.append(buffer, ch); |
|     } |
| } |
| |
| /** |
| * Special casing lowercase management. |
| * Appends the lowercase mapping of ch to buffer. When the exception data |
| * holds a non-negative special casing value, the mapping string is taken |
| * from the data file; otherwise the hardcoded Lithuanian, |
| * Turkish/Azerbaijani, dot above and Greek sigma conditions are tried, in |
| * that order, before falling back to the normal mapping. |
| * @param ch code point to convert |
| * @param index of exception containing special case information |
| * @param buffer to add lowercase |
| * @param str original string |
| * @param offset index of ch in str |
| * @param locale current locale |
| */ |
| private static void getSpecialLowerCase(int ch, int index, |
|                                         StringBuffer buffer, String str, |
|                                         int offset, Locale locale) |
| { |
|     int special = PROPERTY_DB_.getException(index, |
|                             UCharacterPropertyDB.EXC_SPECIAL_CASING_); |
|     if (special >= 0) { |
|         // get the special case mapping string from the data file |
|         int stringIndex = special & LAST_CHAR_MASK_; |
|         PROPERTY_DB_.getLowerCase(stringIndex, buffer); |
|         return; |
|     } |
| |
|     // use hardcoded conditions and mappings |
|     String lang = locale.getLanguage(); |
| |
|     // base characters, an accent above has to be found after them |
|     boolean plainBase = ch == LATIN_CAPITAL_LETTER_I_ |
|                         || ch == LATIN_CAPITAL_LETTER_J_ |
|                         || ch == LATIN_CAPITAL_I_WITH_OGONEK_; |
|     // precomposed with accent above, no need to find one |
|     boolean precomposed = ch == LATIN_CAPITAL_I_WITH_GRAVE_ |
|                           || ch == LATIN_CAPITAL_I_WITH_ACUTE_ |
|                           || ch == LATIN_CAPITAL_I_WITH_TILDE_; |
|     if (lang.equals(LITHUANIAN_) |
|         && ((plainBase && isFollowedByMOREABOVE(str, offset)) |
|             || precomposed)) { |
|         // lithuanian: add a dot above if there are more accents |
|         // above (to always have the dot) |
|         // 1. the lowercase base letter |
|         switch (ch) { |
|         case LATIN_CAPITAL_LETTER_J_: |
|             buffer.append((char)LATIN_SMALL_LETTER_J_); |
|             break; |
|         case LATIN_CAPITAL_I_WITH_OGONEK_: |
|             buffer.append((char)LATIN_SMALL_LETTER_I_WITH_OGONEK_); |
|             break; |
|         default: |
|             // I and the precomposed I with grave, acute or tilde |
|             buffer.append((char)LATIN_SMALL_LETTER_I_); |
|             break; |
|         } |
|         // 2. the dot above |
|         buffer.append((char)COMBINING_DOT_ABOVE_); |
|         // 3. the accent of a precomposed letter, now decomposed |
|         switch (ch) { |
|         case LATIN_CAPITAL_I_WITH_GRAVE_: |
|             buffer.append((char)COMBINING_GRAVE_ACCENT_); |
|             break; |
|         case LATIN_CAPITAL_I_WITH_ACUTE_: |
|             buffer.append((char)COMBINING_ACUTE_ACCENT_); |
|             break; |
|         case LATIN_CAPITAL_I_WITH_TILDE_: |
|             buffer.append((char)COMBINING_TILDE_); |
|             break; |
|         } |
|         /* |
|         Note: This handling of I and of dot above differs from |
|         Unicode 3.1.1's SpecialCasing-5.txt because the AFTER_i |
|         condition there does not work for decomposed I+dot above. |
|         This fix is being proposed to the UTC. |
|         */ |
|         return; |
|     } |
| |
|     if ((lang.equals(TURKISH_) || lang.equals(AZERBAIJANI_)) |
|         && ch == LATIN_CAPITAL_LETTER_I_ |
|         && !isFollowedByDotAbove(str, offset)) { |
|         // turkish: I maps to dotless i |
|         // other languages or turkish with decomposed I+dot above: |
|         // I maps to i |
|         buffer.append(LATIN_SMALL_LETTER_DOTLESS_I_); |
|         return; |
|     } |
| |
|     if (ch == COMBINING_DOT_ABOVE_ && isAFTER_I(str, offset) |
|         && !isFollowedByMOREABOVE(str, offset)) { |
|         // decomposed I+dot above becomes i (see handling of |
|         // U+0049 for turkish) and removes the dot above |
|         return; // remove the dot (continue without output) |
|     } |
| |
|     if (ch == GREEK_CAPITAL_LETTER_SIGMA_ && isCFINAL(str, offset) |
|         && isNotCINITIAL(str, offset)) { |
|         // greek capital sigma maps depending on surrounding cased |
|         // letters; the constant appended here has the value 0x3c2 |
|         buffer.append(GREEK_SMALL_LETTER_RHO_); |
|         return; |
|     } |
| |
|     // no known conditional special case mapping, use a normal mapping |
|     if (PROPERTY_DB_.hasExceptionValue(index, |
|                                 UCharacterPropertyDB.EXC_LOWERCASE_)) { |
|         UTF16.append(buffer, PROPERTY_DB_.getException(index, |
|                                 UCharacterPropertyDB.EXC_LOWERCASE_)); |
|     } |
|     else { |
|         UTF16.append(buffer, ch); |
|     } |
| } |
| |
| /** |
| * Determines if codepoint is a non character. |
| * A code point qualifies when its bits under LAST_CHAR_MASK_ are at least |
| * NON_CHARACTER_SUFFIX_MIN_3_0_, or when it lies at most |
| * NON_CHARACTER_RANGE_3_1_ above NON_CHARACTER_MIN_3_1_. |
| * @param ch codepoint |
| * @return true if codepoint is a non character false otherwise |
| */ |
| private static boolean isNonCharacter(int ch) |
| { |
|     boolean suffixMatch = |
|                 (ch & LAST_CHAR_MASK_) >= NON_CHARACTER_SUFFIX_MIN_3_0_; |
|     if (!suffixMatch) { |
|         // distance from the start of the second non character range |
|         int distance = ch - NON_CHARACTER_MIN_3_1_; |
|         return distance >= 0 && distance <= NON_CHARACTER_RANGE_3_1_; |
|     } |
|     return true; |
| } |
| } |
| |