| /** |
| ******************************************************************************* |
| * Copyright (C) 1996-2001, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| * |
| * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UCharacter.java,v $ |
| * $Date: 2001/03/26 20:34:36 $ |
| * $Revision: 1.6 $ |
| * |
| ******************************************************************************* |
| */ |
| |
| |
| package com.ibm.text; |
| |
| import java.util.Locale; |
| |
| /** |
| * A static class designed to be a generic code point information source that |
| * handles surrogate pairs.<br> |
| * Data for code point information originates from Unicode 3.0 data files, |
| * UnicodeData.txt and Mirror.txt, downloadable from the Unicode Consortium site |
| * ftp://ftp.unicode.org/Public/<br> |
| * ICU's gennames and genprops programs are used to compact the information from |
| * the above mentioned files before being used by this package. The binary |
| * result files are named unames.dat and uprops.dat. <br> |
| * Both are jared with the package for release, hence to use this class please |
| * add the jar file name <code>ucharacter.jar</code> to your class path.<br> |
| * E.g. In Windows <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar |
| * </code><br> |
| * For more information about the data file format, please refer to |
| * <a href=http://oss.software.ibm.com/icu4j/icu4jhtml/com/ibm/text/ReadMe.html> |
| * Read Me</a>.<br> |
| * Each code point used here is expressed as a 32 bit int. This is so as to |
| * handle supplementary code points, which need up to 21 bits.<br> |
| * APIs provide up-to-date Unicode implementation of java.lang.Character, <br> |
| * hence |
| * <ul> |
| * <li> Deprecated APIs are not defined here |
| * <li> UCharacter is not designed to be a char wrapper and does not have APIs |
| * that involve management of that single char, e.g. char |
| * charValue(), int compareTo(java.lang.Character, java.lang.Character) |
| * etc. |
| * <li> To handle surrogates, int parameters APIs are provided |
| * <li> Java specific character information is not defined e.g. boolean |
| * isJavaIdentifierPart(char ch) |
| * <li> Has extra methods to fully utilize up-to-date Unicode data information |
| * <li> Provides methods to get code points from a UTF-16 char or surrogate |
| * pairs |
| * </ul> |
| * <p> |
| * Difference between UCharacter and java.lang.Character |
| * <ul> |
| * <li> UCharacter supports Unicode 3.0 Data while java.lang.Character |
| * supports only Unicode 2.0. <br> |
| * Note : UCharacter will support Unicode 3.1 when it officially releases |
| * <li> UCharacter provides the support for supplementary code points.<br> |
| * Hence the code point type used is an int (support for 21 bits) whereas |
| * java.lang.Character uses a char (16 bit) |
| * <li> The control code points below have their type overwritten by ICU with |
| * the type shown |
| * <ul> |
| * <li> TAB 0x9 : U_SPACE_SEPARATOR |
| * <li> VT 0xb : U_SPACE_SEPARATOR |
| * <li> LF 0xa : U_PARAGRAPH_SEPARATOR |
| * <li> FF 0xc : U_LINE_SEPARATOR |
| * <li> CR 0xd : U_PARAGRAPH_SEPARATOR |
| * <li> FS 0x1c : U_PARAGRAPH_SEPARATOR |
| * <li> GS 0x1d : U_PARAGRAPH_SEPARATOR |
| * <li> RS 0x1e : U_PARAGRAPH_SEPARATOR |
| * <li> US 0x1f : U_SPACE_SEPARATOR |
| * <li> NL 0x85 : U_PARAGRAPH_SEPARATOR |
| * </ul> |
| * Because of these type overwrites, some methods might be affected. |
| * <li> java.lang.Character maps characters 'A' - 'Z' and 'a' - 'z' to the |
| * numeric values '10' - '35'. UCharacter does not treat the above |
| * code points as having numeric values |
| * <li> Further detail differences can be determined from the program |
| * <a href = ../test/text/UCharacterCompare.html> |
| * com.ibm.icu.test.text.UCharacterCompare</a> |
| * </ul> |
| * </p> |
| * Examples of using this class are located in the test program |
| * <a href = ../test/text/UCharacterTest.html> |
| * com.ibm.icu.test.text.UCharacterTest</a> |
| * @author Syn Wee Quek |
| * @since oct 06 2000 |
| * @see com.ibm.text.UCharacterCategory |
| * @see com.ibm.text.UCharacterDirection |
| * @see com.ibm.icu.test.text.UCharacterCompare |
| * @see com.ibm.icu.test.text.UCharacterTest |
| */ |
| |
| public final class UCharacter |
| { |
| // public variables ============================================== |
| |
| /** |
| * The lowest Unicode code point value. Code points are non-ne N_VALUE |
| */ |
| public static final int MIN_VALUE = 0; |
| |
| /** |
| * The highest Unicode code point value (scalar value) according to the |
| * Unicode Standard.<br> |
| * This is a 21-bit value (21 bits, rounded up).<br> |
| * Up-to-date Unicode implementation of java.lang.Character.MIN_VALUE |
| */ |
| public static final int MAX_VALUE = 0x10ffff; |
| |
| /** |
| * The minimum value for Supplementary code points |
| */ |
| public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000; |
| |
| /** |
| * Unicode value used when translating into Unicode encoding form and there |
| * is no existing character. |
| */ |
| public static final int REPLACEMENT_CHAR = '\uFFFD'; |
| |
| // protected variables =================================== |
| |
| /** |
| * Shift and mask value for surrogates |
| */ |
| protected static final int LEAD_SURROGATE_SHIFT_ = 10; |
| protected static final int TRAIL_SURROGATE_MASK_ = 0x3FF; |
| |
| // private variables ===================================== |
| |
/**
 * Database storing the sets of character properties used by this class.
 */
private static final UCharacterPropertyDB PROPERTY_DB_;

/**
 * Creates the UCharacterPropertyDB instance when this class is initialized.
 * Any exception raised while constructing it (e.g. when data is missing or
 * has been corrupted) is rethrown as a RuntimeException carrying the same
 * message and the original exception as its cause.
 */
static
{
    try {
        PROPERTY_DB_ = new UCharacterPropertyDB();
    }
    catch (Exception e) {
        // keep the original exception as the cause: rethrowing only the
        // message would discard the stack trace of the real failure
        throw new RuntimeException(e.getMessage(), e);
    }
}
| |
| /** |
| * Offset to add to combined surrogate pair to avoid msking. |
| */ |
| private static final int SURROGATE_OFFSET_ = |
| SUPPLEMENTARY_MIN_VALUE - (0xD800 << LEAD_SURROGATE_SHIFT_) - 0xDC00; |
| |
| /** |
| * Surrogate code point values |
| */ |
| private static final int SURROGATE_MIN_VALUE_ = 0xD800; |
| private static final int SURROGATE_MAX_VALUE_ = 0xDFFF; |
| |
| /** |
| * To get the last character out from a data type |
| */ |
| private static final int LAST_CHAR_MASK_ = 0xFFFF; |
| |
| /** |
| * To get the last byte out from a data type |
| */ |
| private static final int LAST_BYTE_MASK_ = 0xFF; |
| |
| /** |
| * Shift 16 bits |
| */ |
| private static final int SHIFT_16_ = 16; |
| |
| /** |
| * Minimum value that indicates if a character is not-a-character |
| */ |
| private static final int NOT_A_CHAR_SUFFIX_MIN_ = 0xFFFE; |
| |
| /** |
| * Decimal radix |
| */ |
| private static final int DECIMAL_RADIX_ = 10; |
| |
| /** |
| * No break space code point |
| */ |
| private static final int NO_BREAK_SPACE_ = 0xA0; |
| |
| /** |
| * Narrow no break space code point |
| */ |
| private static final int NARROW_NO_BREAK_SPACE_ = 0x202F; |
| |
| /** |
| * Zero width no break space code point |
| */ |
| private static final int ZERO_WIDTH_NO_BREAK_SPACE_ = 0xFEFF; |
| |
| /** |
| * Ideographic number zero code point |
| */ |
| private static final int IDEOGRAPHIC_NUMBER_ZERO_ = 0x3007; |
| |
| /** |
| * CJK Ideograph, First code point |
| */ |
| private static final int CJK_IDEOGRAPH_FIRST_ = 0x4e00; |
| |
| /** |
| * CJK Ideograph, Second code point |
| */ |
| private static final int CJK_IDEOGRAPH_SECOND_ = 0x4e8c; |
| |
| /** |
| * CJK Ideograph, Third code point |
| */ |
| private static final int CJK_IDEOGRAPH_THIRD_ = 0x4e09; |
| |
| /** |
| * CJK Ideograph, Fourth code point |
| */ |
| private static final int CJK_IDEOGRAPH_FOURTH_ = 0x56d8; |
| |
| /** |
| * CJK Ideograph, FIFTH code point |
| */ |
| private static final int CJK_IDEOGRAPH_FIFTH_ = 0x4e94; |
| |
| /** |
| * CJK Ideograph, Sixth code point |
| */ |
| private static final int CJK_IDEOGRAPH_SIXTH_ = 0x516d; |
| |
| /** |
| * CJK Ideograph, Seventh code point |
| */ |
| private static final int CJK_IDEOGRAPH_SEVENTH_ = 0x4e03; |
| |
| /** |
| * CJK Ideograph, Eighth code point |
| */ |
| private static final int CJK_IDEOGRAPH_EIGHTH_ = 0x516b; |
| |
| /** |
| * CJK Ideograph, Nineth code point |
| */ |
| private static final int CJK_IDEOGRAPH_NINETH_ = 0x4e5d; |
| |
| /** |
| * Application Program command code point |
| */ |
| private static final int APPLICATION_PROGRAM_COMMAND_ = 0x009F; |
| |
| /** |
| * Unit seperator code point |
| */ |
| private static final int UNIT_SEPERATOR_ = 0x001F; |
| |
| /** |
| * Delete code point |
| */ |
| private static final int DELETE_ = 0x007F; |
| |
| /** |
| * Turkish ISO 639 2 character code |
| */ |
| private static final String TURKISH_ = "tr"; |
| |
| /** |
| * Azerbaijani ISO 639 2 character code |
| */ |
| private static final String AZERBAIJANI_ = "az"; |
| |
| /** |
| * Lithuanian ISO 639 2 character code |
| */ |
| private static final String LITHUANIAN_ = "lt"; |
| |
| /** |
| * Latin owercase i |
| */ |
| private static final char LATIN_SMALL_LETTER_I_ = 0x69; |
| |
| /** |
| * Latin uppercase I |
| */ |
| private static final char LATIN_CAPITAL_LETTER_I_ = 0x49; |
| |
| /** |
| * Latin capital letter i with dot above |
| */ |
| private static final char LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_ = 0x130; |
| |
| /** |
| * Latin small letter i with dot above |
| */ |
| private static final char LATIN_SMALL_LETTER_DOTLESS_I_ = 0x131; |
| |
| /** |
| * Combining dot above |
| */ |
| private static final char COMBINING_DOT_ABOVE_ = 0x307; |
| |
| /** |
| * Greek capital letter sigma |
| */ |
| private static final char GREEK_CAPITAL_LETTER_SIGMA_ = 0x3a3; |
| |
| /** |
| * Greek small letter sigma |
| */ |
| private static final char GREEK_SMALL_LETTER_SIGMA_ = 0x3c3; |
| |
| /** |
| * Greek small letter rho |
| */ |
| private static final char GREEK_SMALL_LETTER_RHO_ = 0x3c2; |
| |
| /** |
| * ISO control character first range upper limit 0x0 - 0x1F |
| */ |
| private static final int ISO_CONTROL_FIRST_RANGE_MAX_ = 0x1F; |
| |
| // constructor ==================================================== |
| |
/**
 * Not callable from outside: this class only offers static members and is
 * never meant to be instantiated.
 */
private UCharacter() {}
| |
| // public methods =================================================== |
| |
| /** |
| * Retrieves the decimal numeric value of a digit code point.<br> |
| * A code point is a valid digit if the following is true: |
| * <ul> |
| * <li> The method isDigit(ch) is true and the Unicode decimal digit value of |
| * ch is less than the specified radix. |
| * </ul> |
| * Note this method, unlike java.lang.Character.digit() does not regard the |
| * ascii characters 'A' - 'Z' and 'a' - 'z' as digits. |
| * @param ch the code point whose numeric value is to be determined |
| * @param radix the radix which the digit is to be converted to |
| * @return the numeric value of the code point ch in the argument radix, |
| * this method returns -1 if ch is not a valid digit code point or |
| * if its digit value exceeds the radix. |
| */ |
| public static int digit(int ch, int radix) |
| { |
| int props = getProps(ch); |
| int result = -1; |
| // if props == 0, it will just fall through and return -1 |
| if (!UCharacterPropertyDB.isExceptionIndicator(props)) { |
| // not contained in exception data |
| if (UCharacterPropertyDB.getPropType(props) == |
| UCharacterCategory.DECIMAL_DIGIT_NUMBER) { |
| result = UCharacterPropertyDB.getSignedValue(props); |
| } |
| } |
| else { |
| // contained in exception data |
| int index = UCharacterPropertyDB.getExceptionIndex(props); |
| if (PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_DIGIT_VALUE_)) { |
| result = PROPERTY_DB_.getException(index, |
| UCharacterPropertyDB.EXC_DIGIT_VALUE_) & |
| LAST_CHAR_MASK_; |
| } |
| else { |
| if (!PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_DENOMINATOR_VALUE_) |
| && PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_NUMERIC_VALUE_)) { |
| result = PROPERTY_DB_.getException(index, |
| UCharacterPropertyDB.EXC_NUMERIC_VALUE_); |
| } |
| } |
| } |
| |
| if (result < 0) { |
| result = getHanDigit(ch); |
| } |
| |
| if (result < 0 || result >= radix) { |
| return -1; |
| } |
| return result; |
| } |
| |
| /** |
| * Retrieves the decimal numeric value of a digit code point in radix 10<br> |
| * Note this method, unlike java.lang.Character.digit() does not regard the |
| * ascii characters 'A' - 'Z' and 'a' - 'z' as digits. |
| * @param ch the code point whose numeric value is to be determined |
| * @return the numeric value of the code point ch, this method returns -1 if |
| * ch is not a valid digit code point |
| */ |
| public static int digit(int ch) |
| { |
| return digit(ch, DECIMAL_RADIX_); |
| } |
| |
| /** |
| * Returns the Unicode numeric value of the code point as a nonnegative |
| * integer. <br> |
| * If the code point does not have a numeric value, then -1 is returned. <br> |
| * If the code point has a numeric value that cannot be represented as a |
| * nonnegative integer (for example, a fractional value), then -2 is returned. |
| * <br> |
| * Note this method, unlike java.lang.Character.digit() does not regard the |
| * ascii characters 'A' - 'Z' and 'a' - 'z' as numbers. |
| * @param ch Unicode code point |
| * @return numeric value of the code point as a nonnegative integer |
| */ |
| public static int getNumericValue(int ch) |
| { |
| int props = getProps(ch); |
| int type = UCharacterPropertyDB.getPropType(props); |
| |
| // if props == 0, it will just fall through and return -1 |
| if (type != UCharacterCategory.DECIMAL_DIGIT_NUMBER && |
| type != UCharacterCategory.LETTER_NUMBER && |
| type != UCharacterCategory.OTHER_NUMBER) { |
| return -1; |
| } |
| |
| int result = -1; |
| if (!UCharacterPropertyDB.isExceptionIndicator(props)) { |
| // not contained in exception data |
| result = UCharacterPropertyDB.getSignedValue(props); |
| } |
| else { |
| // contained in exception data |
| int index = UCharacterPropertyDB.getExceptionIndex(props); |
| if (PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_DIGIT_VALUE_)) { |
| result = PROPERTY_DB_.getException(index, |
| UCharacterPropertyDB.EXC_DIGIT_VALUE_); |
| } |
| else { |
| if (!PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_DENOMINATOR_VALUE_) |
| && PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_NUMERIC_VALUE_)) { |
| result = PROPERTY_DB_.getException(index, |
| UCharacterPropertyDB.EXC_NUMERIC_VALUE_); |
| } |
| } |
| } |
| |
| if (result < 0) { |
| result = getHanDigit(ch); |
| } |
| |
| if (result < 0) { |
| return -2; |
| } |
| return result; |
| } |
| |
| /** |
| * Returns a value indicating a code point's Unicode category.<br> |
| * Up-to-date Unicode implementation of java.lang.Character.getType() except |
| * for the above mentioned code points that had their category changed.<br> |
| * Return results are constants from the interface |
| * <a href=UCharacterCategory.html>UCharacterCategory</a> |
| * @param ch code point whose type is to be determined |
| * @return category which is a value of UCharacterCategory |
| */ |
| public static int getType(int ch) |
| { |
| return UCharacterPropertyDB.getPropType(getProps(ch)); |
| } |
| |
| /** |
| * Determines if a code point has a defined meaning in the up-to-date Unicode |
| * standard.<br> |
| * E.g. supplementary code points though allocated space are not defined in |
| * Unicode yet.<br> |
| * Up-to-date Unicode implementation of java.lang.Character.isDefined() |
| * @param ch code point to be determined if it is defined in the most current |
| * version of Unicode |
| * @return true if this code point is defined in unicode |
| */ |
| public static boolean isDefined(int ch) |
| { |
| return getProps(ch) != 0; |
| } |
| |
| /** |
| * Determines if a code point is a digit.<br> |
| * Note this method, unlike java.lang.Character.isDigit() does not regard the |
| * ascii characters 'A' - 'Z' and 'a' - 'z' as digits.<br> |
| * @param ch code point to determine if it is a digit |
| * @return true if this code point is a digit |
| */ |
| public static boolean isDigit(int ch) |
| { |
| int cat = getType(ch); |
| // if props == 0, it will just fall through and return false |
| return cat == UCharacterCategory.DECIMAL_DIGIT_NUMBER || |
| cat == UCharacterCategory.OTHER_NUMBER || |
| cat == UCharacterCategory.LETTER_NUMBER; |
| } |
| |
| /** |
| * Determines if the specified code point is an ISO control character.<br> |
| * A code point is considered to be an ISO control character if it is in the |
| * range \u0000 through \u001F or in the range \u007F through |
| * \u009F.<br> |
| * Up-to-date Unicode implementation of java.lang.Character.isISOControl() |
| * @param ch code point to determine if it is an ISO control character |
| * @return true if code point is a ISO control character |
| */ |
| public static boolean isISOControl(int ch) |
| { |
| return ch >= 0 && ch <= APPLICATION_PROGRAM_COMMAND_ && |
| ((ch <= UNIT_SEPERATOR_) || (ch >= DELETE_)); |
| } |
| |
| /** |
| * Determines if the specified code point is a letter.<br> |
| * Up-to-date Unicode implementation of java.lang.Character.isLetter() |
| * @param ch code point to determine if it is a letter |
| * @return true if code point is a letter |
| */ |
| public static boolean isLetter(int ch) |
| { |
| int cat = getType(ch); |
| // if props == 0, it will just fall through and return false |
| return cat == UCharacterCategory.UPPERCASE_LETTER || |
| cat == UCharacterCategory.LOWERCASE_LETTER || |
| cat == UCharacterCategory.TITLECASE_LETTER || |
| cat == UCharacterCategory.MODIFIER_LETTER || |
| cat == UCharacterCategory.OTHER_LETTER; |
| } |
| |
| /** |
| * Determines if the specified code point is a letter or digit.<br> |
| * Note this method, unlike java.lang.Character does not regard the ascii |
| * characters 'A' - 'Z' and 'a' - 'z' as digits. |
| * @param ch code point to determine if it is a letter or a digit |
| * @return true if code point is a letter or a digit |
| */ |
| public static boolean isLetterOrDigit(int ch) |
| { |
| return isDigit(ch) || isLetter(ch); |
| } |
| |
| /** |
| * Determines if the specified code point is a lowercase character.<br> |
| * UnicodeData only contains case mappings for code points where they are |
| * one-to-one mappings; it also omits information about context-sensitive |
| * case mappings.<br> For more information about Unicode case mapping please |
| * refer to the <a href=http://www.unicode.org/unicode/reports/tr21/> |
| * Technical report #21</a>.<br> |
| * Up-to-date Unicode implementation of java.lang.Character.isLowerCase() |
| * @param ch code point to determine if it is in lowercase |
| * @return true if code point is a lowercase character |
| */ |
| public static boolean isLowerCase(int ch) |
| { |
| // if props == 0, it will just fall through and return false |
| return getType(ch) == UCharacterCategory.LOWERCASE_LETTER; |
| } |
| |
| /** |
| * Determines if the specified code point is a white space character.<br> |
| * A code point is considered to be an whitespace character if and only |
| * if it satisfies one of the following criteria: |
| * <ul> |
| * <li> It is a Unicode space separator (category "Zs"), but is not |
| * a no-break space (\u00A0 or \u202F or \uFEFF). |
| * <li> It is a Unicode line separator (category "Zl"). |
| * <li> It is a Unicode paragraph separator (category "Zp"). |
| * </ul> |
| * Up-to-date Unicode implementation of java.lang.Character.isWhitespace(). |
| * @param ch code point to determine if it is a white space |
| * @return true if the specified code point is a white space character |
| */ |
| public static boolean isWhitespace(int ch) |
| { |
| int cat = getType(ch); |
| // exclude no-break spaces |
| // if props == 0, it will just fall through and return false |
| return (cat == UCharacterCategory.SPACE_SEPARATOR || |
| cat == UCharacterCategory.LINE_SEPARATOR || |
| cat == UCharacterCategory.PARAGRAPH_SEPARATOR) && |
| (ch != NO_BREAK_SPACE_) && (ch != NARROW_NO_BREAK_SPACE_) && |
| (ch != ZERO_WIDTH_NO_BREAK_SPACE_); |
| } |
| |
| /** |
| * Determines if the specified code point is a Unicode specified space |
| * character, ie if code point is in the category Zs, Zl and Zp.<br> |
| * Up-to-date Unicode implementation of java.lang.Character.isSpaceChar(). |
| * @param ch code point to determine if it is a space |
| * @return true if the specified code point is a space character |
| */ |
| public static boolean isSpaceChar(int ch) |
| { |
| int cat = getType(ch); |
| // if props == 0, it will just fall through and return false |
| return cat == UCharacterCategory.SPACE_SEPARATOR || |
| cat == UCharacterCategory.LINE_SEPARATOR || |
| cat == UCharacterCategory.PARAGRAPH_SEPARATOR; |
| } |
| |
| /** |
| * Determines if the specified code point is a titlecase character.<br> |
| * UnicodeData only contains case mappings for code points where they are |
| * one-to-one mappings; it also omits information about context-sensitive |
| * case mappings.<br> |
| * For more information about Unicode case mapping please refer to the |
| * <a href=http://www.unicode.org/unicode/reports/tr21/> |
| * Technical report #21</a>.<br> |
| * Up-to-date Unicode implementation of java.lang.Character.isTitleCase(). |
| * @param ch code point to determine if it is in title case |
| * @return true if the specified code point is a titlecase character |
| */ |
| public static boolean isTitleCase(int ch) |
| { |
| int cat = getType(ch); |
| // if props == 0, it will just fall through and return false |
| return cat == UCharacterCategory.TITLECASE_LETTER; |
| } |
| |
| /** |
| * Determines if the specified code point may be any part of a Unicode |
| * identifier other than the starting character.<br> |
| * A code point may be part of a Unicode identifier if and only if it is one |
| * of the following: |
| * <ul> |
| * <li> Lu Uppercase letter |
| * <li> Ll Lowercase letter |
| * <li> Lt Titlecase letter |
| * <li> Lm Modifier letter |
| * <li> Lo Other letter |
| * <li> Nl Letter number |
| * <li> Pc Connecting punctuation character |
| * <li> Nd decimal number |
| * <li> Mc Spacing combining mark |
| * <li> Mn Non-spacing mark |
| * <li> Cf formatting code |
| * </ul> |
| * Up-to-date Unicode implementation of |
| * java.lang.Character.isUnicodeIdentifierPart().<br> |
| * See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>. |
| * @param ch code point to determine if is can be part of a Unicode identifier |
| * @return true if code point is any character belonging a unicode identifier |
| * suffix after the first character |
| */ |
| public static boolean isUnicodeIdentifierPart(int ch) |
| { |
| int cat = getType(ch); |
| // if props == 0, it will just fall through and return false |
| return cat == UCharacterCategory.UPPERCASE_LETTER || |
| cat == UCharacterCategory.LOWERCASE_LETTER || |
| cat == UCharacterCategory.TITLECASE_LETTER || |
| cat == UCharacterCategory.MODIFIER_LETTER || |
| cat == UCharacterCategory.OTHER_LETTER || |
| cat == UCharacterCategory.LETTER_NUMBER || |
| cat == UCharacterCategory.CONNECTOR_PUNCTUATION || |
| cat == UCharacterCategory.DECIMAL_DIGIT_NUMBER || |
| cat == UCharacterCategory.COMBINING_SPACING_MARK || |
| cat == UCharacterCategory.NON_SPACING_MARK || |
| cat == UCharacterCategory.FORMAT; |
| } |
| |
| /** |
| * Determines if the specified code point is permissible as the first |
| * character in a Unicode identifier.<br> |
| * A code point may start a Unicode identifier if it is of type either |
| * <ul> |
| * <li> Lu Uppercase letter |
| * <li> Ll Lowercase letter |
| * <li> Lt Titlecase letter |
| * <li> Lm Modifier letter |
| * <li> Lo Other letter |
| * <li> Nl Letter number |
| * </ul> |
| * Up-to-date Unicode implementation of |
| * java.lang.Character.isUnicodeIdentifierStart().<br> |
| * See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>. |
| * @param ch code point to determine if it can start a Unicode identifier |
| * @return true if code point is the first character belonging a unicode |
| * identifier |
| */ |
| public static boolean isUnicodeIdentifierStart(int ch) |
| { |
| int cat = getType(ch); |
| // if props == 0, it will just fall through and return false |
| return cat == UCharacterCategory.UPPERCASE_LETTER || |
| cat == UCharacterCategory.LOWERCASE_LETTER || |
| cat == UCharacterCategory.TITLECASE_LETTER || |
| cat == UCharacterCategory.MODIFIER_LETTER || |
| cat == UCharacterCategory.OTHER_LETTER || |
| cat == UCharacterCategory.LETTER_NUMBER; |
| } |
| |
| /** |
| * Determines if the specified code point should be regarded as an ignorable |
| * character in a Unicode identifier.<br> |
| * A character is ignorable in the Unicode standard if it is of the type Cf, |
| * Formatting code.<br> |
| * Up-to-date Unicode implementation of |
| * java.lang.Character.isIdentifierIgnorable().<br> |
| * See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>. |
| * @param ch code point to be determined if it can be ignored in a Unicode |
| * identifier. |
| * @return true if the code point is ignorable |
| */ |
| public static boolean isIdentifierIgnorable(int ch) |
| { |
| int cat = getType(ch); |
| // if props == 0, it will just fall through and return false |
| return cat == UCharacterCategory.FORMAT; |
| } |
| |
| /** |
| * Determines if the specified code point is an uppercase character.<br> |
| * UnicodeData only contains case mappings for code point where they are |
| * one-to-one mappings; it also omits information about context-sensitive |
| * case mappings.<br> |
| * For language specific case conversion behavior, use |
| * toUpperCase(locale, str). <br> |
| * For example, the case conversion for dot-less i and dotted I in Turkish, |
| * or for final sigma in Greek. |
| * For more information about Unicode case mapping please refer to the |
| * <a href=http://www.unicode.org/unicode/reports/tr21/> |
| * Technical report #21</a>.<br> |
| * Up-to-date Unicode implementation of java.lang.Character.isUpperCase(). |
| * @param ch code point to determine if it is in uppercase |
| * @return true if the code point is an uppercase character |
| */ |
| public static boolean isUpperCase(int ch) |
| { |
| int cat = getType(ch); |
| // if props == 0, it will just fall through and return false |
| return cat == UCharacterCategory.UPPERCASE_LETTER; |
| } |
| |
| /** |
| * The given code point is mapped to its lowercase equivalent; if the code |
| * point has no lowercase equivalent, the code point itself is returned.<br> |
| * UnicodeData only contains case mappings for code point where they are |
| * one-to-one mappings; it also omits information about context-sensitive |
| * case mappings.<br> |
| * For language specific case conversion behavior, use |
| * toLowerCase(locale, str). <br> |
| * For example, the case conversion for dot-less i and dotted I in Turkish, |
| * or for final sigma in Greek. |
| * For more information about Unicode case mapping please refer to the |
| * <a href=http://www.unicode.org/unicode/reports/tr21/> |
| * Technical report #21</a>.<br> |
| * Up-to-date Unicode implementation of java.lang.Character.toLowerCase() |
| * @param ch code point whose lowercase equivalent is to be retrieved |
| * @return the lowercase equivalent code point |
| */ |
| public static int toLowerCase(int ch) |
| { |
| int props = getProps(ch); |
| // if props == 0, it will just fall through and return itself |
| if(!UCharacterPropertyDB.isExceptionIndicator(props)) { |
| int cat = UCharacterPropertyDB.getPropType(props); |
| if (cat == UCharacterCategory.UPPERCASE_LETTER || |
| cat == UCharacterCategory.TITLECASE_LETTER) { |
| return ch + UCharacterPropertyDB.getSignedValue(props); |
| } |
| } |
| else |
| { |
| int index = UCharacterPropertyDB.getExceptionIndex(props); |
| if (PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_LOWERCASE_)) { |
| return PROPERTY_DB_.getException(index, |
| UCharacterPropertyDB.EXC_LOWERCASE_); |
| } |
| } |
| return ch; |
| } |
| |
| /** |
| * Converts argument code point and returns a String object representing the |
| * code point's value in UTF16 format.<br> |
| * The result is a string whose length is 1 for non-supplementary code points, |
| * 2 otherwise.<br> |
| * com.ibm.ibm.icu.UTF16 can be used to parse Strings generated by this |
| * function.<br> |
| * Up-to-date Unicode implementation of java.lang.Character.toString() |
| * @param ch code point |
| * @return string representation of the code point, null if code point is not |
| * defined in unicode |
| */ |
| public static String toString(int ch) |
| { |
| if (ch < MIN_VALUE || ch > MAX_VALUE) { |
| return null; |
| } |
| |
| if (ch < UCharacter.SUPPLEMENTARY_MIN_VALUE) { |
| return String.valueOf((char)ch); |
| } |
| |
| char result[] = new char[2]; |
| result[0] = (char)UTF16.getLeadSurrogate(ch); |
| result[1] = (char)UTF16.getTrailSurrogate(ch); |
| return new String(result); |
| } |
| |
| /** |
| * Converts the code point argument to titlecase.<br> |
| * UnicodeData only contains case mappings for code points where they are |
| * one-to-one mappings; it also omits information about context-sensitive |
| * case mappings.<br> |
| * There are only four Unicode characters that are truly titlecase forms |
| * that are distinct from uppercase forms. |
| * For more information about Unicode case mapping please refer |
| * to the <a href=http://www.unicode.org/unicode/reports/tr21/> |
| * Technical report #21</a>.<br> |
| * If no titlecase is available, the uppercase is returned. If no uppercase |
| * is available, the code point itself is returned.<br> |
| * Up-to-date Unicode implementation of java.lang.Character.toTitleCase() |
| * @param ch code point whose title case is to be retrieved |
| * @return titlecase code point |
| */ |
| public static int toTitleCase(int ch) |
| { |
| int props = getProps(ch); |
| // if props == 0, it will just fall through and return itself |
| if (!UCharacterPropertyDB.isExceptionIndicator(props)) { |
| if (UCharacterPropertyDB.getPropType(props) == |
| UCharacterCategory.LOWERCASE_LETTER) { |
| // here, titlecase is same as uppercase |
| return ch - UCharacterPropertyDB.getSignedValue(props); |
| } |
| } |
| else { |
| int index = UCharacterPropertyDB.getExceptionIndex(props); |
| if (PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_TITLECASE_)) { |
| return PROPERTY_DB_.getException(index, |
| UCharacterPropertyDB.EXC_TITLECASE_); |
| } |
| else { |
| // here, titlecase is same as uppercase |
| if (PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_UPPERCASE_)) { |
| return PROPERTY_DB_.getException(index, |
| UCharacterPropertyDB.EXC_UPPERCASE_); |
| } |
| } |
| } |
| return ch; // no mapping - return c itself |
| } |
| |
| /** |
| * Converts the character argument to uppercase.<br> |
| * UnicodeData only contains case mappings for characters where they are |
| * one-to-one mappings; it also omits information about context-sensitive |
| * case mappings.<br> |
| * For more information about Unicode case mapping please refer |
| * to the <a href=http://www.unicode.org/unicode/reports/tr21/> |
| * Technical report #21</a>.<br> |
| * If no uppercase is available, the character itself is returned.<br> |
| * Up-to-date Unicode implementation of java.lang.Character.toUpperCase() |
| * @param ch code point whose uppercase is to be retrieved |
| * @return uppercase code point |
| */ |
| public static int toUpperCase(int ch) |
| { |
| int props = getProps(ch); |
| // if props == 0, it will just fall through and return itself |
| if (!UCharacterPropertyDB.isExceptionIndicator(props)) { |
| if (UCharacterPropertyDB.getPropType(props) == |
| UCharacterCategory.LOWERCASE_LETTER) { |
| // here, titlecase is same as uppercase */ |
| return ch - UCharacterPropertyDB.getSignedValue(props); |
| } |
| } |
| else |
| { |
| int index = UCharacterPropertyDB.getExceptionIndex(props); |
| if (PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_UPPERCASE_)) { |
| return PROPERTY_DB_.getException(index, |
| UCharacterPropertyDB.EXC_UPPERCASE_); |
| } |
| } |
| return ch; // no mapping - return c itself |
| } |
| |
| // extra methods not in java.lang.Character =========================== |
| |
| /** |
| * Determines if the code point is a supplementary character.<br> |
| * A code point is a supplementary character if and only if it is greater than |
| * <a href=#SUPPLEMENTARY_MIN_VALUE>SUPPLEMENTARY_MIN_VALUE</a> |
| * @param ch code point to be determined if it is in the supplementary plane |
| * @return true if code point is a supplementary character |
| */ |
| public static boolean isSupplementary(int ch) |
| { |
| return ch >= UCharacter.SUPPLEMENTARY_MIN_VALUE && |
| ch <= UCharacter.MAX_VALUE; |
| } |
| |
| /** |
| * Determines if the code point is in the BMP plane.<br> |
| * @param ch code point to be determined if it is not a supplementary |
| * character |
| * @return true if code point is not a supplementary character |
| */ |
| public static boolean isBMP(int ch) |
| { |
| return (ch >= 0 && ch < LAST_CHAR_MASK_); |
| } |
| |
| /** |
| * Determines whether the specified code point is a printable character |
| * according to the Unicode standard. |
| * @param ch code point to be determined if it is printable |
| * @return true if the code point is a printable character |
| */ |
| public static boolean isPrintable(int ch) |
| { |
| if (isISOControl(ch)) { |
| return false; |
| } |
| int cat = getType(ch); |
| // if props == 0, it will just fall through and return false |
| return (cat != UCharacterCategory.UNASSIGNED && |
| cat != UCharacterCategory.CONTROL && |
| cat != UCharacterCategory.FORMAT && |
| cat != UCharacterCategory.PRIVATE_USE && |
| cat != UCharacterCategory.SURROGATE && |
| cat != UCharacterCategory.GENERAL_OTHER_TYPES); |
| } |
| |
| /** |
| * Determines whether the specified code point is of base form.<br> |
| * A code point of base form does not graphically combine with preceding |
| * characters, and is neither a control nor a format character. |
| * @param ch code point to be determined if it is of base form |
| * @return true if the code point is of base form |
| */ |
| public static boolean isBaseForm(int ch) |
| { |
| int cat = getType(ch); |
| // if props == 0, it will just fall through and return false |
| return cat == UCharacterCategory.DECIMAL_DIGIT_NUMBER || |
| cat == UCharacterCategory.OTHER_NUMBER || |
| cat == UCharacterCategory.LETTER_NUMBER || |
| cat == UCharacterCategory.UPPERCASE_LETTER || |
| cat == UCharacterCategory.LOWERCASE_LETTER || |
| cat == UCharacterCategory.TITLECASE_LETTER || |
| cat == UCharacterCategory.MODIFIER_LETTER || |
| cat == UCharacterCategory.OTHER_LETTER || |
| cat == UCharacterCategory.NON_SPACING_MARK || |
| cat == UCharacterCategory.ENCLOSING_MARK || |
| cat == UCharacterCategory.COMBINING_SPACING_MARK; |
| } |
| |
| /** |
| * Returns the Bidirection property of a code point.<br> |
| * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional |
| * property.<br> |
| * Result returned belongs to the interface |
| * <a href=UCharacterDirection.html>UCharacterDirection</a> |
| * @param ch the code point to be determined its direction |
| * @return direction constant from UCharacterDirection. Otherwise is |
| * character is not defined, UCharacterDirection.BOUNDARY_NEUTRAL |
| * will be returned. |
| */ |
| public static int getDirection(int ch) |
| { |
| int props = getProps(ch); |
| if (props != 0) { |
| return UCharacterPropertyDB.getDirection(props); |
| } |
| return UCharacterDirection.LEFT_TO_RIGHT; |
| } |
| |
| /** |
| * Determines whether the code point has the "mirrored" property.<br> |
| * This property is set for characters that are commonly used in |
| * Right-To-Left contexts and need to be displayed with a "mirrored" |
| * glyph. |
| * @param ch code point whose mirror is to be determined |
| * @return true if the code point has the "mirrored" property |
| */ |
| public static boolean isMirrored(int ch) |
| { |
| int props = getProps(ch); |
| // if props == 0, it will just fall through and return false |
| return UCharacterPropertyDB.isMirrored(props); |
| } |
| |
| /** |
| * Maps the specified code point to a "mirror-image" code point.<br> |
| * For code points with the "mirrored" property, implementations sometimes |
| * need a "poor man's" mapping to another code point such that the default |
| * glyph may serve as the mirror-image of the default glyph of the specified |
| * code point.<br> |
| * This is useful for text conversion to and from codepages with visual |
| * order, and for displays without glyph selection capabilities. |
| * @param ch code point whose mirror is to be retrieved |
| * @return another code point that may serve as a mirror-image substitute, or |
| * ch itself if there is no such mapping or ch does not have the |
| * "mirrored" property |
| */ |
| public static int getMirror(int ch) |
| { |
| int props = getProps(ch); |
| // mirrored - the value is a mirror offset |
| // if props == 0, it will just fall through and return false |
| if (UCharacterPropertyDB.isMirrored(props)) { |
| if(!UCharacterPropertyDB.isExceptionIndicator(props)) { |
| return ch + UCharacterPropertyDB.getSignedValue(props); |
| } |
| else |
| { |
| int index = UCharacterPropertyDB.getExceptionIndex(props); |
| if (PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_MIRROR_MAPPING_)) |
| return PROPERTY_DB_.getException(index, |
| UCharacterPropertyDB.EXC_MIRROR_MAPPING_); |
| } |
| } |
| return ch; |
| } |
| |
| /** |
| * Gets the combining class of the argument codepoint |
| * @param ch code point whose combining is to be retrieved |
| * @return the combining class of the codepoint |
| */ |
| public static byte getCombiningClass(int ch) |
| { |
| int props = getProps(ch); |
| if(!UCharacterPropertyDB.isExceptionIndicator(props)) { |
| if (UCharacterPropertyDB.getPropType(props) == |
| UCharacterCategory.NON_SPACING_MARK) { |
| return (byte)(PROPERTY_DB_.getUnsignedValue(props)); |
| } |
| else { |
| return 0; |
| } |
| } |
| else { |
| // the combining class is in bits 23..16 of the first exception value |
| return (byte)((PROPERTY_DB_.getException(PROPERTY_DB_.getExceptionIndex( |
| props), UCharacterPropertyDB.EXC_COMBINING_CLASS_) |
| >> SHIFT_16_) & LAST_BYTE_MASK_); |
| } |
| } |
| |
| /** |
| * A code point is illegal if and only if |
| * <ul> |
| * <li> Out of bounds, less than 0 or greater than UCharacter.MAX_VALUE |
| * <li> A surrogate value, 0xD800 to 0xDFFF |
| * <li> Not-a-character, having the form 0x xxFFFF or 0x xxFFFE |
| * </ul> |
| * Note: legal does not mean that it is assigned in this version of Unicode. |
| * @param ch code point to determine if it is a legal code point by itself |
| * @return true if and only if legal. |
| */ |
| public static boolean isLegal(int ch) |
| { |
| if (ch < MIN_VALUE) { |
| return false; |
| } |
| if (ch < SURROGATE_MIN_VALUE_) { |
| return true; |
| } |
| if (ch <= SURROGATE_MAX_VALUE_) { |
| return false; |
| } |
| if ((ch & LAST_CHAR_MASK_) >= NOT_A_CHAR_SUFFIX_MIN_) { |
| return false; |
| } |
| return (ch <= MAX_VALUE); |
| } |
| |
| /** |
| * A string is legal iff all its code points are legal. |
| * A code point is illegal if and only if |
| * <ul> |
| * <li> Out of bounds, less than 0 or greater than UCharacter.MAX_VALUE |
| * <li> A surrogate value, 0xD800 to 0xDFFF |
| * <li> Not-a-character, having the form 0x xxFFFF or 0x xxFFFE |
| * </ul> |
| * Note: legal does not mean that it is assigned in this version of Unicode. |
| * @param ch code point to determine if it is a legal code point by itself |
| * @return true if and only if legal. |
| */ |
| public static boolean isLegal(String str) |
| { |
| int size = str.length(); |
| char lead, |
| trail; |
| int codepoint; |
| for (int i = 0; i < size; i ++) |
| { |
| codepoint = UTF16.charAt(str, i); |
| if (!isLegal(codepoint)) { |
| return false; |
| } |
| if (isSupplementary(codepoint)) { |
| i ++; |
| } |
| } |
| return true; |
| } |
| |
| /** |
| * Gets the version of Unicode data used. |
| * @return the unicode version number used |
| */ |
| public static String getUnicodeVersion() |
| { |
| return PROPERTY_DB_.m_unicodeversion_; |
| } |
| |
| /** |
| * Retrieve the most current Unicode name of the argument code point.<br> |
| * Note calling any methods related to code point names, e.g. get*Name*() |
| * incurs a one-time initialisation cost to construct the name tables. |
| * @param ch the code point for which to get the name |
| * @return most current Unicode name |
| */ |
| public static String getName(int ch) |
| { |
| return UCharacterName.getName(ch, |
| UCharacterNameChoice.U_UNICODE_CHAR_NAME); |
| } |
| |
| /** |
| * Retrieve the earlier version 1.0 Unicode name of the argument code point. |
| * <br> |
| * Note calling any methods related to code point names, e.g. get*Name*() |
| * incurs a one-time initialisation cost to construct the name tables. |
| * @param ch the code point for which to get the name |
| * @return version 1.0 Unicode name |
| */ |
| public static String getName1_0(int ch) |
| { |
| return UCharacterName.getName(ch, |
| UCharacterNameChoice.U_UNICODE_10_CHAR_NAME); |
| } |
| |
| /** |
| * Find a Unicode code point by its most current Unicode name and return its |
| * code point value.<br> |
| * Note calling any methods related to code point names, e.g. get*Name*() |
| * incurs a one-time initialisation cost to construct the name tables. |
| * @param name most current Unicode character name whose code point is to be |
| * returned |
| * @return code point |
| */ |
| public static int getCharFromName(String name) |
| { |
| return UCharacterName.getCharFromName( |
| UCharacterNameChoice.U_UNICODE_CHAR_NAME, name); |
| } |
| |
| /** |
| * Find a Unicode character by its version 1.0 Unicode name and return its |
| * code point value.<br> |
| * Note calling any methods related to code point names, e.g. get*Name*() |
| * incurs a one-time initialisation cost to construct the name tables. |
| * @param name Unicode 1.0 code point name whose code point is to |
| * returned |
| * @return code point |
| */ |
| public static int getCharFromName1_0(String name) |
| { |
| return UCharacterName.getCharFromName( |
| UCharacterNameChoice.U_UNICODE_10_CHAR_NAME, name); |
| } |
| |
| /** |
| * Returns a code pointcorresponding to the two UTF16 characters.<br> |
| * If the argument lead is not a high surrogate character or trail is not a |
| * low surrogate character, UCharacter.REPLACEMENT_CHAR is returned. |
| * @param lead the lead char |
| * @param trail the trail char |
| * @return code point or UCharacter.REPLACEMENT_CHAR if surrogate characters |
| * are invalid. |
| */ |
| public static int getCodePoint(char lead, char trail) |
| { |
| if (UTF16.isLeadSurrogate(lead) && UTF16.isTrailSurrogate(trail)) { |
| return getRawSupplementary(lead, trail); |
| } |
| return UCharacter.REPLACEMENT_CHAR; |
| } |
| |
| /** |
| * Returns the code point corresponding to the UTF16 character.<br> |
| * If argument char16 is a surrogate character, UCharacter.REPLACEMENT_CHAR |
| * is returned |
| * @param char16 the UTF16 character |
| * @return code point or UCharacter.REPLACEMENT_CHAR if argument is not a |
| * invalid character. |
| * @exception IllegalArgumentException thrown when char16 is not a valid |
| * codepoint |
| */ |
| public static int getCodePoint(char char16) |
| { |
| if (UCharacter.isLegal(char16)) { |
| return char16; |
| } |
| throw new IllegalArgumentException("Illegal codepoint"); |
| } |
| |
| /** |
| * Gets uppercase version of the argument string. |
| * Casing is dependent on the default locale and context-sensitive. |
| * @param str source string to be performed on |
| * @return uppercase version of the argument string |
| */ |
| public static String toUpperCase(String str) |
| { |
| return toUpperCase(Locale.getDefault(), str); |
| } |
| |
| /** |
| * Gets lowercase version of the argument string. |
| * Casing is dependent on the default locale and context-sensitive |
| * @param str source string to be performed on |
| * @return lowercase version of the argument string |
| */ |
| public static String toLowerCase(String str) |
| { |
| return toLowerCase(Locale.getDefault(), str); |
| } |
| |
| /** |
| * Gets uppercase version of the argument string. |
| * Casing is dependent on the argument locale and context-sensitive. |
| * @param locale which string is to be converted in |
| * @param str source string to be performed on |
| * @return uppercase version of the argument string |
| */ |
| public static String toUpperCase(Locale locale, String str) |
| { |
| int size = UTF16.countCodePoint(str); |
| StringBuffer result = new StringBuffer(size << 1); // initial buffer |
| int props; |
| int exception; |
| int ch; |
| int index; |
| String lang = locale.getLanguage(); |
| boolean tr_az = lang.equals(TURKISH_) || lang.equals(AZERBAIJANI_); |
| boolean lt = lang.equals(LITHUANIAN_); |
| |
| for (int i = 0; i < size; i ++) |
| { |
| ch = UTF16.charAtCodePointOffset(str, i); |
| props = PROPERTY_DB_.getProperty(ch); |
| if (!UCharacterPropertyDB.isExceptionIndicator(props)) |
| { |
| if (UCharacterPropertyDB.getPropType(props) == |
| UCharacterCategory.LOWERCASE_LETTER) { |
| ch -= UCharacterPropertyDB.getSignedValue(props); |
| } |
| UTF16.append(result, ch); |
| } |
| else |
| { |
| index = UCharacterPropertyDB.getExceptionIndex(props); |
| if (PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_SPECIAL_CASING_)) { |
| getSpecialUpperCase(ch, index, result, str, i, tr_az, lt); |
| } |
| else { |
| if (PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_UPPERCASE_)) { |
| UTF16.append(result, PROPERTY_DB_.getException(index, |
| UCharacterPropertyDB.EXC_UPPERCASE_)); |
| } |
| } |
| } |
| } |
| return result.toString(); |
| } |
| |
| /** |
| * Gets lowercase version of the argument string. |
| * Casing is dependent on the argument locale and context-sensitive |
| * @param locale which string is to be converted in |
| * @param str source string to be performed on |
| * @return lowercase version of the argument string |
| */ |
| public static String toLowerCase(Locale locale, String str) |
| { |
| int size = UTF16.countCodePoint(str); |
| StringBuffer result = new StringBuffer(size << 1); // initial buffer |
| int props; |
| int exception; |
| int ch; |
| int index; |
| String lang = locale.getLanguage(); |
| boolean tr_az = lang.equals(TURKISH_) || lang.equals(AZERBAIJANI_); |
| boolean lt = lang.equals(LITHUANIAN_); |
| int type; |
| |
| for (int i = 0; i < size; i ++) |
| { |
| ch = UTF16.charAtCodePointOffset(str, i); |
| props = PROPERTY_DB_.getProperty(ch); |
| if (!UCharacterPropertyDB.isExceptionIndicator(props)) { |
| type = UCharacterPropertyDB.getPropType(props); |
| if (type == UCharacterCategory.UPPERCASE_LETTER || |
| type == UCharacterCategory.TITLECASE_LETTER) { |
| ch += UCharacterPropertyDB.getSignedValue(props); |
| } |
| UTF16.append(result, ch); |
| } |
| else { |
| index = UCharacterPropertyDB.getExceptionIndex(props); |
| if (PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_SPECIAL_CASING_)) { |
| getSpecialLowerCase(ch, index, result, str, i, tr_az, lt); |
| } |
| else { |
| if (PROPERTY_DB_.hasExceptionValue(index, |
| UCharacterPropertyDB.EXC_LOWERCASE_)) { |
| UTF16.append(result, PROPERTY_DB_.getException(index, |
| UCharacterPropertyDB.EXC_LOWERCASE_)); |
| } |
| } |
| } |
| } |
| return result.toString(); |
| } |
| |
| // protected methods ==================================================== |
| |
| /** |
| * Forms a supplementary code point from the argument character<br> |
| * Note this is for internal use hence no checks for the validity of the |
| * surrogate characters are done |
| * @param lead lead surrogate character |
| * @param trail trailing surrogate character |
| * @return code point of the supplementary character |
| */ |
| protected static int getRawSupplementary(char lead, char trail) |
| { |
| return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_; |
| } |
| |
| // private methods ============================================== |
| |
| /** |
| * Gets the correct property information from UCharacterPropertyDB |
| * @param ch character whose information is to be retrieved |
| * @return a 32 bit information, returns 0 if no data is found. |
| */ |
| private static int getProps(int ch) |
| { |
| if (ch >= UCharacter.MIN_VALUE & ch <= UCharacter.MAX_VALUE) { |
| return PROPERTY_DB_.getProperty(ch); |
| } |
| return 0; |
| } |
| |
| /** |
| * Getting Han character digit values |
| * @param ch code point to test if it is a Han character |
| * @return Han digit value if ch is a Han digit character |
| */ |
| private static int getHanDigit(int ch) |
| { |
| switch(ch) |
| { |
| case IDEOGRAPHIC_NUMBER_ZERO_ : |
| return 0; // Han Zero |
| case CJK_IDEOGRAPH_FIRST_ : |
| return 1; // Han One |
| case CJK_IDEOGRAPH_SECOND_ : |
| return 2; // Han Two |
| case CJK_IDEOGRAPH_THIRD_ : |
| return 3; // Han Three |
| case CJK_IDEOGRAPH_FOURTH_ : |
| return 4; // Han Four |
| case CJK_IDEOGRAPH_FIFTH_ : |
| return 5; // Han Five |
| case CJK_IDEOGRAPH_SIXTH_ : |
| return 6; // Han Six |
| case CJK_IDEOGRAPH_SEVENTH_ : |
| return 7; // Han Seven |
| case CJK_IDEOGRAPH_EIGHTH_ : |
| return 8; // Han Eight |
| case CJK_IDEOGRAPH_NINETH_ : |
| return 9; // Han Nine |
| } |
| return -1; // no value |
| } |
| |
| /** |
| * Special casing uppercase management |
| * @param ch code point to convert |
| * @param index of exception containing special case information |
| * @param buffer to add uppercase |
| * @param str original string |
| * @param chindex index of ch in str |
| * @param tr_az if uppercase is to be made with TURKISH or AZERBAIJANI |
| * in mind |
| * @param lt if uppercase is to be made with LITHUANIAN in mind |
| */ |
| private static void getSpecialUpperCase(int ch, int index, |
| StringBuffer buffer, String str, |
| int chindex, boolean tr_az, |
| boolean lt) |
| { |
| int exception = PROPERTY_DB_.getException(index, |
| UCharacterPropertyDB.EXC_SPECIAL_CASING_); |
| if (exception < 0) { |
| // use hardcoded conditions and mappings |
| if (ch == LATIN_SMALL_LETTER_I_) { |
| if (tr_az) { |
| // turkish and azerbaijani : i maps to dotted I |
| buffer.append(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_); |
| } |
| else { |
| // other languages: i maps to I |
| buffer.append(LATIN_CAPITAL_LETTER_I_); |
| } |
| } |
| else { |
| if (ch == COMBINING_DOT_ABOVE_ && lt) { |
| // lithuanian: remove DOT ABOVE after U+0069 "i" with upper |
| // or titlecase |
| for (int j = chindex; j > 0; j ++) { |
| ch = UTF16.charAtCodePointOffset(str, j); |
| if (getType(ch) != UCharacterCategory.NON_SPACING_MARK) { |
| break; |
| } |
| } |
| |
| // if the base letter is not an 'i' (U+0069)? keep the dot |
| if (ch != LATIN_SMALL_LETTER_I_) { |
| buffer.append(COMBINING_DOT_ABOVE_); |
| } |
| } |
| else { |
| // no known conditional special case mapping, output the code |
| // point itself |
| UTF16.append(buffer, ch); |
| } |
| } |
| } |
| else { |
| // get the special case mapping string from the data file |
| index = exception & LAST_CHAR_MASK_; |
| PROPERTY_DB_.getUpperCase(index, buffer); |
| } |
| } |
| |
| /** |
| * Special casing lowercase management |
| * @param ch code point to convert |
| * @param index of exception containing special case information |
| * @param buffer to add lowercase |
| * @param str original string |
| * @param chindex index of ch in str |
| * @param tr_az if uppercase is to be made with TURKISH or AZERBAIJANI |
| * in mind |
| * @param lt if uppercase is to be made with LITHUANIAN in mind |
| */ |
| private static void getSpecialLowerCase(int ch, int index, |
| StringBuffer buffer, String str, |
| int chindex, boolean tr_az, |
| boolean lt) |
| { |
| int exception = PROPERTY_DB_.getException(index, |
| UCharacterPropertyDB.EXC_SPECIAL_CASING_); |
| if (exception < 0) { |
| // use hardcoded conditions and mappings |
| if (ch == LATIN_CAPITAL_LETTER_I_) { |
| if (tr_az) { |
| // turkish and azerbaijani : I maps to dotless i |
| buffer.append(LATIN_SMALL_LETTER_DOTLESS_I_); |
| } |
| else { |
| // other languages: I maps to i |
| buffer.append(LATIN_SMALL_LETTER_I_); |
| } |
| } |
| else { |
| if (ch == GREEK_CAPITAL_LETTER_SIGMA_) { |
| // greek capital sigma maps depending on whether the following |
| // character is a letter |
| chindex ++; |
| if (chindex != str.length() && |
| isLetter(UTF16.charAtCodePointOffset(str, chindex))) { |
| buffer.append(GREEK_SMALL_LETTER_SIGMA_); |
| } |
| else { |
| buffer.append(GREEK_SMALL_LETTER_RHO_); |
| } |
| } |
| else { |
| // no known conditional special case mapping, output the code |
| // point itself |
| UTF16.append(buffer, ch); |
| } |
| } |
| } |
| else |
| { |
| // get the special case mapping string from the data file |
| index = exception & LAST_CHAR_MASK_; |
| PROPERTY_DB_.getLowerCase(index, buffer); |
| } |
| } |
| } |
| |