blob: 54f10d5bdb9c2c7569c598d5e2f0450a2edf826c [file] [log] [blame]
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UCharacter.java,v $
* $Date: 2001/12/04 20:09:07 $
* $Revision: 1.20 $
*
*******************************************************************************
*/
package com.ibm.text;
import java.util.Locale;
import com.ibm.util.Utility;
/**
* <p>
* The UCharacter class provides extensions to the
* <a href=http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.html>
* java.lang.Character</a> class. These extensions provide support for
* Unicode 3.1 properties and together with the <a href=UTF16.html>UTF16</a>
* class, provide support for supplementary characters (those with code
* points above U+FFFF).
* </p>
* <p>
* Code points are represented in these API using ints. While it would be
* more convenient in Java to have a separate primitive datatype for them,
* ints suffice in the meantime.
* </p>
* <p>
* To use this class please add the jar file name icu4j.jar to the
* class path, since it contains data files which supply the information used
* by this file.<br>
* E.g. In Windows <br>
* <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/icu4j.jar</code>.<br>
* Otherwise, another method would be to copy the files uprops.dat and
* unames.dat from the icu4j source subdirectory
* <i>$ICU4J_SRC/src/com/ibm/text/resources</i> to your class directory
* <i>$ICU4J_CLASS/com/ibm/text/resources</i>.
* </p>
* <p>
* For more information about the data file format, please refer to
* <a href=http://oss.software.ibm.com/icu4j/doc/com/ibm/text/ReadMe.html>
* Read Me</a>.
* </p>
* <p>
* Aside from the additions for UTF-16 support, and the updated Unicode 3.1
* properties, the main differences between UCharacter and Character are:
* <ul>
* <li> UCharacter is not designed to be a char wrapper and does not have
* the APIs that manage that single wrapped char.<br>
* These include:
* <ul>
* <li> char charValue(),
* <li> int compareTo(java.lang.Character, java.lang.Character), etc.
* </ul>
* <li> UCharacter does not include Character APIs that are deprecated, nor
* does it include the Java-specific character information, such as
* boolean isJavaIdentifierPart(char ch).
* <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric
* values '10' - '35'. UCharacter also does this in digit and
* getNumericValue, to adhere to the java semantics of these
* methods. New methods unicodeDigit, and
* getUnicodeNumericValue do not treat the above code points
* as having numeric values. This is a semantic change from ICU4J 1.3.1.
* <li> For consistency with ICU4C's data, control code points below have their
* Unicode general category reset to the types below.
* <ul>
* <li> TAB 0x9 : U_SPACE_SEPARATOR
* <li> VT 0xb : U_SPACE_SEPARATOR
* <li> LF 0xa : U_PARAGRAPH_SEPARATOR
* <li> FF 0xc : U_LINE_SEPARATOR
* <li> CR 0xd : U_PARAGRAPH_SEPARATOR
* <li> FS 0x1c : U_PARAGRAPH_SEPARATOR
* <li> GS 0x1d : U_PARAGRAPH_SEPARATOR
* <li> RS 0x1e : U_PARAGRAPH_SEPARATOR
* <li> US 0x1f : U_SPACE_SEPARATOR
* <li> NL 0x85 : U_PARAGRAPH_SEPARATOR
* </ul>
* <p>
* Further detail differences can be determined from the program
* <a href = http://oss.software.ibm.com/developerworks/opensource/cvs/icu4j/~checkout~/icu4j/src/com/ibm/icu/test/text/UCharacterCompare.java>
* com.ibm.icu.test.text.UCharacterCompare</a>
* </p>
* @author Syn Wee Quek
* @since oct 06 2000
* @see com.ibm.text.UCharacterCategory
* @see com.ibm.text.UCharacterDirection
*/
public final class UCharacter
{
// public variables ==============================================
/**
* The lowest Unicode code point value. Code points are non-negative.
*/
public static final int MIN_VALUE = 0;
/**
* The highest Unicode code point value (scalar value) according to the
* Unicode Standard.<br>
* This is a 21-bit value (21 bits, rounded up).<br>
* Up-to-date Unicode implementation of java.lang.Character.MAX_VALUE
*/
public static final int MAX_VALUE = 0x10ffff;
/**
* The minimum value for supplementary code points. Code points below this
* value are represented by a single char; code points from this value up to
* MAX_VALUE need a surrogate pair in UTF-16.
*/
public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
/**
* Unicode value used when translating into Unicode encoding form and there
* is no existing character.
*/
public static final int REPLACEMENT_CHAR = '\uFFFD';
// constructor ====================================================
/**
* Private constructor to prevent instantiation
*/
private UCharacter()
{
// intentionally empty: the constructor is private so that no instance
// of this class can be created from outside
}
// public methods ===================================================
/**
* Retrieves the numeric value of a decimal digit code point.
* <br>This method observes the semantics of
* <code>java.lang.Character.digit()</code>. Note that this
* will return positive values for code points for which isDigit
* returns false, just like java.lang.Character.
* <br><em>Semantic Change:</em> In release 1.3.1 and
* prior, this did not treat the European letters as having a
* digit value, and also treated numeric letters and other numbers as
* digits.
* This has been changed to conform to the java semantics.
* <br>A code point is a valid digit if and only if:
* <ul>
* <li>ch is a decimal digit or one of the european letters, and
* <li>the value of ch is less than the specified radix.
* </ul>
* @param ch the code point to query
* @param radix the radix
* @return the numeric value represented by the code point in the
* specified radix, or -1 if the code point is not a decimal digit
* or if its value is too large for the radix
*/
public static int digit(int ch, int radix)
{
    final int props = getProps(ch);
    // Only a decimal digit whose value is stored inline in the property
    // word (i.e. not redirected to the exception data) supplies a value
    // here; everything else starts out as "no digit".
    final boolean inlineDecimalDigit =
        !UCharacterPropertyDB.isExceptionIndicator(props)
        && UCharacterPropertyDB.getPropType(props)
               == UCharacterCategory.DECIMAL_DIGIT_NUMBER;
    int value = -1;
    if (inlineDecimalDigit) {
        value = UCharacterPropertyDB.getSignedValue(props);
    }
    // Radixes above 10 additionally accept the Latin letters (ASCII and
    // fullwidth) as the digits 10..35.
    if (value < 0 && radix > 10) {
        value = getEuropeanDigit(ch);
    }
    // Reject anything that is not a digit or does not fit the radix.
    return (value >= 0 && value < radix) ? value : -1;
}
private static boolean isEuropeanDigit(int ch) {
    // The four letter ranges that getEuropeanDigit maps to 10..35.
    final boolean asciiUpper = ch >= 0x41 && ch <= 0x5a;         // 'A'..'Z'
    final boolean asciiLower = ch >= 0x61 && ch <= 0x7a;         // 'a'..'z'
    final boolean fullwidthUpper = ch >= 0xff21 && ch <= 0xff3a; // fullwidth A..Z
    final boolean fullwidthLower = ch >= 0xff41 && ch <= 0xff5a; // fullwidth a..z
    return asciiUpper || asciiLower || fullwidthUpper || fullwidthLower;
}
private static int getEuropeanDigit(int ch) {
    // Each Latin letter range maps onto 10..35, offset from the first
    // letter of its range; anything else has no digit value.
    if (ch >= 0x41 && ch <= 0x5a) {         // 'A'..'Z'
        return ch - 0x41 + 10;
    }
    if (ch >= 0x61 && ch <= 0x7a) {         // 'a'..'z'
        return ch - 0x61 + 10;
    }
    if (ch >= 0xff21 && ch <= 0xff3a) {     // fullwidth A..Z
        return ch - 0xff21 + 10;
    }
    if (ch >= 0xff41 && ch <= 0xff5a) {     // fullwidth a..z
        return ch - 0xff41 + 10;
    }
    return -1;
}
/**
* Retrieves the numeric value of a decimal digit code point.
* <br>This is a convenience overload of <code>digit(int, int)</code>
* that provides a decimal radix.
* <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
* treated numeric letters and other numbers as digits. This has
* been changed to conform to the java semantics.
* @param ch the code point to query
* @return the numeric value represented by the code point,
* or -1 if the code point is not a decimal digit or if its
* value is too large for a decimal radix
*/
public static int digit(int ch)
{
    // Same lookup as digit(int, int), pinned to the decimal radix constant.
    final int value = digit(ch, DECIMAL_RADIX_);
    return value;
}
/**
* Returns the Unicode numeric value of the code point as a nonnegative
* integer.
* <br>If the code point does not have a numeric value, then -1 is returned.
* <br>
* If the code point has a numeric value that cannot be represented as a
* nonnegative integer (for example, a fractional value), then -2 is
* returned.
* <br><em>Semantic Change:</em> In release 1.3.1 and
* prior, this returned -1 for ASCII letters and their
* fullwidth counterparts. This has been changed to
* conform to the java semantics.
* @param ch the code point to query
* @return the numeric value of the code point, or -1 if it has no numeric
* value, or -2 if it has a numeric value that cannot be represented as a
* nonnegative integer
*/
public static int getNumericValue(int ch)
{
    // Java semantics: code points without a numeric type fall back to the
    // Latin-letter digit values.
    final boolean useEuropean = true;
    return getNumericValueInternal(ch, useEuropean);
}
/**
* Returns the Unicode numeric value of the code point as a nonnegative
* integer.
* <br>If the code point does not have a numeric value, then -1 is returned. <br>
* If the code point has a numeric value that cannot be represented as a
* nonnegative integer (for example, a fractional value), then -2 is
* returned.
* This returns values other than -1 for all and only those code points
* whose type is a numeric type.
* @param ch the code point to query
* @return the numeric value of the code point, or -1 if it has no numeric
* value, or -2 if it has a numeric value that cannot be represented as a
* nonnegative integer
*/
public static int getUnicodeNumericValue(int ch)
{
    // Strict Unicode semantics: no Latin-letter fallback for code points
    // that lack a numeric type.
    final boolean useEuropean = false;
    return getNumericValueInternal(ch, useEuropean);
}
private static int getNumericValueInternal(int ch, boolean useEuropean)
{
    final int props = getProps(ch);
    final int type = UCharacterPropertyDB.getPropType(props);
    final boolean numericType =
        type == UCharacterCategory.DECIMAL_DIGIT_NUMBER
        || type == UCharacterCategory.LETTER_NUMBER
        || type == UCharacterCategory.OTHER_NUMBER;
    if (!numericType) {
        // No numeric category: optionally fall back to the Latin-letter
        // digit values, otherwise report "no numeric value".
        return useEuropean ? getEuropeanDigit(ch) : -1;
    }
    if (!UCharacterPropertyDB.isExceptionIndicator(props)) {
        // The value is stored inline in the property word.
        return UCharacterPropertyDB.getSignedValue(props);
    }
    // The value lives in the exception data; probe the slots in priority
    // order: digit value, then denominator, then plain numeric value.
    final int index = UCharacterPropertyDB.getExceptionIndex(props);
    if (PROPERTY_DB_.hasExceptionValue(index,
                                UCharacterPropertyDB.EXC_DIGIT_VALUE_)) {
        return PROPERTY_DB_.getException(index,
                                UCharacterPropertyDB.EXC_DIGIT_VALUE_)
               & LAST_CHAR_MASK_;
    }
    if (PROPERTY_DB_.hasExceptionValue(index,
                            UCharacterPropertyDB.EXC_DENOMINATOR_VALUE_)) {
        // A denominator is present, so -2 is returned instead of a value.
        return -2;
    }
    if (PROPERTY_DB_.hasExceptionValue(index,
                                UCharacterPropertyDB.EXC_NUMERIC_VALUE_)) {
        return PROPERTY_DB_.getException(index,
                                UCharacterPropertyDB.EXC_NUMERIC_VALUE_);
    }
    return -1;
}
/**
* Returns a value indicating a code point's Unicode category.<br>
* Up-to-date Unicode implementation of java.lang.Character.getType() except
* for the above mentioned code points that had their category changed.<br>
* Return results are constants from the interface
* <a href=UCharacterCategory.html>UCharacterCategory</a>
* @param ch code point whose type is to be determined
* @return category which is a value of UCharacterCategory
*/
public static int getType(int ch)
{
    // Fetch the packed property word, then extract its category field.
    final int props = getProps(ch);
    return UCharacterPropertyDB.getPropType(props);
}
/**
* Determines if a code point has a defined meaning in the up-to-date Unicode
* standard.<br>
* E.g. supplementary code points though allocated space are not defined in
* Unicode yet.<br>
* Up-to-date Unicode implementation of java.lang.Character.isDefined()
* @param ch code point to be determined if it is defined in the most current
* version of Unicode
* @return true if this code point is defined in unicode
*/
public static boolean isDefined(int ch)
{
    // A property word of 0 is what marks a code point as not defined.
    final int props = getProps(ch);
    return props != 0;
}
/**
* Determines if a code point is a Java digit.
* <br>This method observes the semantics of
* <code>java.lang.Character.isDigit()</code>. It returns true for
* decimal digits only.
* <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
* treated numeric letters and other numbers as digits. This has
* been changed to conform to the java semantics.
* @param ch code point to query
* @return true if this code point is a digit */
public static boolean isDigit(int ch)
{
    // Only the decimal digit category counts; letter numbers and other
    // numbers do not.
    final int type = getType(ch);
    return type == UCharacterCategory.DECIMAL_DIGIT_NUMBER;
}
/**
* Determines if the specified code point is an ISO control character.<br>
* A code point is considered to be an ISO control character if it is in the
* range &#92u0000 through &#92u001F or in the range &#92u007F through
* &#92u009F.<br>
* Up-to-date Unicode implementation of java.lang.Character.isISOControl()
* @param ch code point to determine if it is an ISO control character
* @return true if code point is a ISO control character
*/
public static boolean isISOControl(int ch)
{
    // Anything outside [0, APPLICATION_PROGRAM_COMMAND_] cannot be a control.
    if (ch < 0 || ch > APPLICATION_PROGRAM_COMMAND_) {
        return false;
    }
    // Inside that span, only the stretch strictly between UNIT_SEPERATOR_
    // and DELETE_ is excluded.
    return ch <= UNIT_SEPERATOR_ || ch >= DELETE_;
}
/**
* Determines if the specified code point is a letter.<br>
* Up-to-date Unicode implementation of java.lang.Character.isLetter()
* @param ch code point to determine if it is a letter
* @return true if code point is a letter
*/
public static boolean isLetter(int ch)
{
    final int type = getType(ch);
    // cased letters
    if (type == UCharacterCategory.UPPERCASE_LETTER
        || type == UCharacterCategory.LOWERCASE_LETTER
        || type == UCharacterCategory.TITLECASE_LETTER) {
        return true;
    }
    // caseless letters
    return type == UCharacterCategory.MODIFIER_LETTER
           || type == UCharacterCategory.OTHER_LETTER;
}
/**
* Determines if the specified code point is a letter or digit.<br>
* Note this method, unlike java.lang.Character does not regard the ascii
* characters 'A' - 'Z' and 'a' - 'z' as digits.
* @param ch code point to determine if it is a letter or a digit
* @return true if code point is a letter or a digit
*/
public static boolean isLetterOrDigit(int ch)
{
    // The digit test runs first; the letter test only runs when it fails.
    if (isDigit(ch)) {
        return true;
    }
    return isLetter(ch);
}
/**
* Determines if the specified code point is a lowercase character.<br>
* UnicodeData only contains case mappings for code points where they are
* one-to-one mappings; it also omits information about context-sensitive
* case mappings.<br> For more information about Unicode case mapping please
* refer to the <a href=http://www.unicode.org/unicode/reports/tr21/>
* Technical report #21</a>.<br>
* Up-to-date Unicode implementation of java.lang.Character.isLowerCase()
* @param ch code point to determine if it is in lowercase
* @return true if code point is a lowercase character
*/
public static boolean isLowerCase(int ch)
{
    // Decided purely by general category.
    final int type = getType(ch);
    return type == UCharacterCategory.LOWERCASE_LETTER;
}
/**
* Determines if the specified code point is a white space character.<br>
* A code point is considered to be a whitespace character if and only
* if it satisfies one of the following criteria:
* <ul>
* <li> It is a Unicode space separator (category "Zs"), but is not
* a no-break space (&#92u00A0 or &#92u202F or &#92uFEFF).
* <li> It is a Unicode line separator (category "Zl").
* <li> It is a Unicode paragraph separator (category "Zp").
* </ul>
* Up-to-date Unicode implementation of java.lang.Character.isWhitespace().
* @param ch code point to determine if it is a white space
* @return true if the specified code point is a white space character
*/
public static boolean isWhitespace(int ch)
{
    final int type = getType(ch);
    final boolean separator =
        type == UCharacterCategory.SPACE_SEPARATOR
        || type == UCharacterCategory.LINE_SEPARATOR
        || type == UCharacterCategory.PARAGRAPH_SEPARATOR;
    if (!separator) {
        return false;
    }
    // The no-break spaces are separators but are not treated as whitespace.
    final boolean noBreak = ch == NO_BREAK_SPACE_
                            || ch == NARROW_NO_BREAK_SPACE_
                            || ch == ZERO_WIDTH_NO_BREAK_SPACE_;
    return !noBreak;
}
/**
* Determines if the specified code point is a Unicode specified space
* character, ie if code point is in the category Zs, Zl and Zp.<br>
* Up-to-date Unicode implementation of java.lang.Character.isSpaceChar().
* @param ch code point to determine if it is a space
* @return true if the specified code point is a space character
*/
public static boolean isSpaceChar(int ch)
{
    final int type = getType(ch);
    // category Zs
    if (type == UCharacterCategory.SPACE_SEPARATOR) {
        return true;
    }
    // categories Zl and Zp
    return type == UCharacterCategory.LINE_SEPARATOR
           || type == UCharacterCategory.PARAGRAPH_SEPARATOR;
}
/**
* Determines if the specified code point is a titlecase character.<br>
* UnicodeData only contains case mappings for code points where they are
* one-to-one mappings; it also omits information about context-sensitive
* case mappings.<br>
* For more information about Unicode case mapping please refer to the
* <a href=http://www.unicode.org/unicode/reports/tr21/>
* Technical report #21</a>.<br>
* Up-to-date Unicode implementation of java.lang.Character.isTitleCase().
* @param ch code point to determine if it is in title case
* @return true if the specified code point is a titlecase character
*/
public static boolean isTitleCase(int ch)
{
    // Decided purely by general category.
    return getType(ch) == UCharacterCategory.TITLECASE_LETTER;
}
/**
* Determines if the specified code point may be any part of a Unicode
* identifier other than the starting character.<br>
* A code point may be part of a Unicode identifier if and only if it is one
* of the following:
* <ul>
* <li> Lu Uppercase letter
* <li> Ll Lowercase letter
* <li> Lt Titlecase letter
* <li> Lm Modifier letter
* <li> Lo Other letter
* <li> Nl Letter number
* <li> Pc Connecting punctuation character
* <li> Nd decimal number
* <li> Mc Spacing combining mark
* <li> Mn Non-spacing mark
* <li> Cf formatting code
* </ul>
* Up-to-date Unicode implementation of
* java.lang.Character.isUnicodeIdentifierPart().<br>
* See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>.
* @param ch code point to determine if it can be part of a Unicode identifier
* @return true if the code point may appear in a Unicode identifier after
* the first character
*/
public static boolean isUnicodeIdentifierPart(int ch)
{
    final int type = getType(ch);
    // categories that may also start an identifier
    final boolean letterOrLetterNumber =
        type == UCharacterCategory.UPPERCASE_LETTER
        || type == UCharacterCategory.LOWERCASE_LETTER
        || type == UCharacterCategory.TITLECASE_LETTER
        || type == UCharacterCategory.MODIFIER_LETTER
        || type == UCharacterCategory.OTHER_LETTER
        || type == UCharacterCategory.LETTER_NUMBER;
    // categories that are only allowed after the first character
    final boolean continuation =
        type == UCharacterCategory.CONNECTOR_PUNCTUATION
        || type == UCharacterCategory.DECIMAL_DIGIT_NUMBER
        || type == UCharacterCategory.COMBINING_SPACING_MARK
        || type == UCharacterCategory.NON_SPACING_MARK;
    if (letterOrLetterNumber || continuation) {
        return true;
    }
    // Format (Cf) characters are accepted through the ignorable test, which
    // covers that category along with some control ranges.
    return isIdentifierIgnorable(ch);
}
/**
* Determines if the specified code point is permissible as the first
* character in a Unicode identifier.<br>
* A code point may start a Unicode identifier if it is of type either
* <ul>
* <li> Lu Uppercase letter
* <li> Ll Lowercase letter
* <li> Lt Titlecase letter
* <li> Lm Modifier letter
* <li> Lo Other letter
* <li> Nl Letter number
* </ul>
* Up-to-date Unicode implementation of
* java.lang.Character.isUnicodeIdentifierStart().<br>
* See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>.
* @param ch code point to determine if it can start a Unicode identifier
* @return true if code point is the first character belonging a unicode
* identifier
*/
public static boolean isUnicodeIdentifierStart(int ch)
{
    final int type = getType(ch);
    // Lu, Ll, Lt
    final boolean casedLetter =
        type == UCharacterCategory.UPPERCASE_LETTER
        || type == UCharacterCategory.LOWERCASE_LETTER
        || type == UCharacterCategory.TITLECASE_LETTER;
    // Lm, Lo, Nl
    final boolean otherStarter =
        type == UCharacterCategory.MODIFIER_LETTER
        || type == UCharacterCategory.OTHER_LETTER
        || type == UCharacterCategory.LETTER_NUMBER;
    return casedLetter || otherStarter;
}
/**
* Determines if the specified code point should be regarded as an ignorable
* character in a Unicode identifier.<br>
* A code point is ignorable if it is one of the control characters
* U+0000..U+0008, U+000E..U+001B or U+007F..U+009F, or if it is of the type
* Cf, Formatting code.<br>
* Up-to-date Unicode implementation of
* java.lang.Character.isIdentifierIgnorable().<br>
* See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>.
* @param ch code point to be determined if it can be ignored in a Unicode
* identifier.
* @return true if the code point is ignorable
*/
public static boolean isIdentifierIgnorable(int ch)
{
    // Same ranges as java.lang.Character.isIdentifierIgnorable(): the
    // control characters 0x00..0x08, 0x0e..0x1b and 0x7f..0x9f.
    // The explicit lower bound matters: without it every negative (invalid)
    // value would satisfy "ch <= 8" and be reported as ignorable.
    final boolean ignorableControl = (ch >= 0 && ch <= 8)
                                     || (ch >= 0xe && ch <= 0x1b)
                                     || (ch >= 0x7f && ch <= 0x9f);
    if (ignorableControl) {
        return true;
    }
    // Otherwise only format (Cf) characters are ignorable.
    return getType(ch) == UCharacterCategory.FORMAT;
}
/**
* Determines if the specified code point is an uppercase character.<br>
* UnicodeData only contains case mappings for code point where they are
* one-to-one mappings; it also omits information about context-sensitive
* case mappings.<br>
* For language specific case conversion behavior, use
* toUpperCase(locale, str). <br>
* For example, the case conversion for dot-less i and dotted I in Turkish,
* or for final sigma in Greek.
* For more information about Unicode case mapping please refer to the
* <a href=http://www.unicode.org/unicode/reports/tr21/>
* Technical report #21</a>.<br>
* Up-to-date Unicode implementation of java.lang.Character.isUpperCase().
* @param ch code point to determine if it is in uppercase
* @return true if the code point is an uppercase character
*/
public static boolean isUpperCase(int ch)
{
    // Decided purely by general category.
    return getType(ch) == UCharacterCategory.UPPERCASE_LETTER;
}
/**
* The given code point is mapped to its lowercase equivalent; if the code
* point has no lowercase equivalent, the code point itself is returned.<br>
* UnicodeData only contains case mappings for code point where they are
* one-to-one mappings; it also omits information about context-sensitive
* case mappings.<br>
* For language specific case conversion behavior, use
* toLowerCase(locale, str). <br>
* For example, the case conversion for dot-less i and dotted I in Turkish,
* or for final sigma in Greek.
* For more information about Unicode case mapping please refer to the
* <a href=http://www.unicode.org/unicode/reports/tr21/>
* Technical report #21</a>.<br>
* Up-to-date Unicode implementation of java.lang.Character.toLowerCase()
* @param ch code point whose lowercase equivalent is to be retrieved
* @return the lowercase equivalent code point
*/
public static int toLowerCase(int ch)
{
    final int props = getProps(ch);
    if (UCharacterPropertyDB.isExceptionIndicator(props)) {
        // The mapping, if there is one, is stored in the exception data.
        final int index = UCharacterPropertyDB.getExceptionIndex(props);
        if (PROPERTY_DB_.hasExceptionValue(index,
                                    UCharacterPropertyDB.EXC_LOWERCASE_)) {
            return PROPERTY_DB_.getException(index,
                                    UCharacterPropertyDB.EXC_LOWERCASE_);
        }
        return ch;
    }
    // Inline data: uppercase and titlecase letters carry the signed offset
    // to their lowercase form; every other category maps to itself.
    final int type = UCharacterPropertyDB.getPropType(props);
    final boolean hasLowerOffset =
        type == UCharacterCategory.UPPERCASE_LETTER
        || type == UCharacterCategory.TITLECASE_LETTER;
    if (hasLowerOffset) {
        return ch + UCharacterPropertyDB.getSignedValue(props);
    }
    return ch;
}
/**
* Converts argument code point and returns a String object representing the
* code point's value in UTF16 format.<br>
* The result is a string whose length is 1 for non-supplementary code points,
* 2 otherwise.<br>
* com.ibm.text.UTF16 can be used to parse Strings generated by this
* function.<br>
* Up-to-date Unicode implementation of java.lang.Character.toString()
* @param ch code point
* @return string representation of the code point, null if code point is
* outside the range UCharacter.MIN_VALUE to UCharacter.MAX_VALUE
*/
public static String toString(int ch)
{
    final boolean inRange = ch >= MIN_VALUE && ch <= MAX_VALUE;
    if (!inRange) {
        return null;
    }
    if (ch >= UCharacter.SUPPLEMENTARY_MIN_VALUE) {
        // Supplementary code point: emit the lead/trail surrogate pair.
        final char[] pair = {
            (char)UTF16.getLeadSurrogate(ch),
            (char)UTF16.getTrailSurrogate(ch)
        };
        return new String(pair);
    }
    // Fits in a single char.
    return String.valueOf((char)ch);
}
/**
* Converts the code point argument to titlecase.<br>
* UnicodeData only contains case mappings for code points where they are
* one-to-one mappings; it also omits information about context-sensitive
* case mappings.<br>
* There are only four Unicode characters that are truly titlecase forms
* that are distinct from uppercase forms.
* For more information about Unicode case mapping please refer
* to the <a href=http://www.unicode.org/unicode/reports/tr21/>
* Technical report #21</a>.<br>
* If no titlecase is available, the uppercase is returned. If no uppercase
* is available, the code point itself is returned.<br>
* Up-to-date Unicode implementation of java.lang.Character.toTitleCase()
* @param ch code point whose title case is to be retrieved
* @return titlecase code point
*/
public static int toTitleCase(int ch)
{
    final int props = getProps(ch);
    if (UCharacterPropertyDB.isExceptionIndicator(props)) {
        final int index = UCharacterPropertyDB.getExceptionIndex(props);
        // Prefer a dedicated titlecase mapping ...
        if (PROPERTY_DB_.hasExceptionValue(index,
                                    UCharacterPropertyDB.EXC_TITLECASE_)) {
            return PROPERTY_DB_.getException(index,
                                    UCharacterPropertyDB.EXC_TITLECASE_);
        }
        // ... and otherwise use the uppercase mapping as the titlecase.
        if (PROPERTY_DB_.hasExceptionValue(index,
                                    UCharacterPropertyDB.EXC_UPPERCASE_)) {
            return PROPERTY_DB_.getException(index,
                                    UCharacterPropertyDB.EXC_UPPERCASE_);
        }
        return ch;
    }
    // Inline data: for a lowercase letter the titlecase equals the
    // uppercase, found by subtracting the stored signed value.
    final boolean lowercase = UCharacterPropertyDB.getPropType(props)
                              == UCharacterCategory.LOWERCASE_LETTER;
    if (lowercase) {
        return ch - UCharacterPropertyDB.getSignedValue(props);
    }
    return ch; // no mapping - the code point maps to itself
}
/**
* Converts the character argument to uppercase.<br>
* UnicodeData only contains case mappings for characters where they are
* one-to-one mappings; it also omits information about context-sensitive
* case mappings.<br>
* For more information about Unicode case mapping please refer
* to the <a href=http://www.unicode.org/unicode/reports/tr21/>
* Technical report #21</a>.<br>
* If no uppercase is available, the character itself is returned.<br>
* Up-to-date Unicode implementation of java.lang.Character.toUpperCase()
* @param ch code point whose uppercase is to be retrieved
* @return uppercase code point
*/
public static int toUpperCase(int ch)
{
    final int props = getProps(ch);
    if (UCharacterPropertyDB.isExceptionIndicator(props)) {
        // The mapping, if there is one, is stored in the exception data.
        final int index = UCharacterPropertyDB.getExceptionIndex(props);
        if (PROPERTY_DB_.hasExceptionValue(index,
                                    UCharacterPropertyDB.EXC_UPPERCASE_)) {
            return PROPERTY_DB_.getException(index,
                                    UCharacterPropertyDB.EXC_UPPERCASE_);
        }
        return ch;
    }
    // Inline data: a lowercase letter's uppercase form is found by
    // subtracting the stored signed value.
    final boolean lowercase = UCharacterPropertyDB.getPropType(props)
                              == UCharacterCategory.LOWERCASE_LETTER;
    if (lowercase) {
        return ch - UCharacterPropertyDB.getSignedValue(props);
    }
    return ch; // no mapping - the code point maps to itself
}
// extra methods not in java.lang.Character ===========================
/**
* Determines if the code point is a supplementary character.<br>
* A code point is a supplementary character if and only if it is greater than
* or equal to <a href=#SUPPLEMENTARY_MIN_VALUE>SUPPLEMENTARY_MIN_VALUE</a>
* and not greater than <a href=#MAX_VALUE>MAX_VALUE</a>
* @param ch code point to be determined if it is in the supplementary plane
* @return true if code point is a supplementary character
*/
public static boolean isSupplementary(int ch)
{
    // Below the first supplementary code point: not supplementary.
    if (ch < UCharacter.SUPPLEMENTARY_MIN_VALUE) {
        return false;
    }
    // Values above the Unicode range are not code points at all.
    return ch <= UCharacter.MAX_VALUE;
}
/**
* Determines if the code point is in the BMP plane.<br>
* @param ch code point to be determined if it is not a supplementary
* character
* @return true if code point is not a supplementary character
*/
public static boolean isBMP(int ch)
{
    // The BMP is exactly the code points that fit in 16 bits: 0x0000..0xFFFF
    // inclusive. The previous strict "less than" comparison against the
    // 16-bit mask wrongly excluded 0xFFFF, which is not supplementary.
    // The unsigned shift also rejects negative values, whose upper bits
    // are set.
    return (ch >>> 16) == 0;
}
/**
* Determines whether the specified code point is a printable character
* according to the Unicode standard.
* @param ch code point to be determined if it is printable
* @return true if the code point is a printable character
*/
public static boolean isPrintable(int ch)
{
    // ISO controls are rejected before any category lookup.
    if (isISOControl(ch)) {
        return false;
    }
    final int type = getType(ch);
    // Categories that never count as printable.
    final boolean nonPrinting =
        type == UCharacterCategory.UNASSIGNED
        || type == UCharacterCategory.CONTROL
        || type == UCharacterCategory.FORMAT
        || type == UCharacterCategory.PRIVATE_USE
        || type == UCharacterCategory.SURROGATE
        || type == UCharacterCategory.GENERAL_OTHER_TYPES;
    return !nonPrinting;
}
/**
* Determines whether the specified code point is of base form.<br>
* A code point of base form does not graphically combine with preceding
* characters, and is neither a control nor a format character.
* @param ch code point to be determined if it is of base form
* @return true if the code point is of base form
*/
public static boolean isBaseForm(int ch)
{
    final int type = getType(ch);
    // The accepted categories, grouped as numbers, letters and marks.
    final boolean number =
        type == UCharacterCategory.DECIMAL_DIGIT_NUMBER
        || type == UCharacterCategory.OTHER_NUMBER
        || type == UCharacterCategory.LETTER_NUMBER;
    final boolean letter =
        type == UCharacterCategory.UPPERCASE_LETTER
        || type == UCharacterCategory.LOWERCASE_LETTER
        || type == UCharacterCategory.TITLECASE_LETTER
        || type == UCharacterCategory.MODIFIER_LETTER
        || type == UCharacterCategory.OTHER_LETTER;
    final boolean mark =
        type == UCharacterCategory.NON_SPACING_MARK
        || type == UCharacterCategory.ENCLOSING_MARK
        || type == UCharacterCategory.COMBINING_SPACING_MARK;
    return number || letter || mark;
}
/**
* Returns the Bidirection property of a code point.<br>
* For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional
* property.<br>
* Result returned belongs to the interface
* <a href=UCharacterDirection.html>UCharacterDirection</a>
* @param ch the code point to be determined its direction
* @return direction constant from UCharacterDirection. If the code point
* is not defined (it has no property data),
* UCharacterDirection.LEFT_TO_RIGHT will be returned.
*/
public static int getDirection(int ch)
{
    final int props = getProps(ch);
    // A property word of 0 means the code point is not defined; such code
    // points default to LEFT_TO_RIGHT.
    return (props == 0) ? UCharacterDirection.LEFT_TO_RIGHT
                        : UCharacterPropertyDB.getDirection(props);
}
/**
* Determines whether the code point has the "mirrored" property.<br>
* This property is set for characters that are commonly used in
* Right-To-Left contexts and need to be displayed with a "mirrored"
* glyph.
* @param ch code point whose mirror is to be determined
* @return true if the code point has the "mirrored" property
*/
public static boolean isMirrored(int ch)
{
    // The mirrored flag is read straight from the packed property word.
    return UCharacterPropertyDB.isMirrored(getProps(ch));
}
/**
* Maps the specified code point to a "mirror-image" code point.<br>
* For code points with the "mirrored" property, implementations sometimes
* need a "poor man's" mapping to another code point such that the default
* glyph may serve as the mirror-image of the default glyph of the specified
* code point.<br>
* This is useful for text conversion to and from codepages with visual
* order, and for displays without glyph selection capabilities.
* @param ch code point whose mirror is to be retrieved
* @return another code point that may serve as a mirror-image substitute, or
* ch itself if there is no such mapping or ch does not have the
* "mirrored" property
*/
public static int getMirror(int ch)
{
    final int props = getProps(ch);
    // Without the mirrored flag there is nothing to map: return ch itself.
    if (!UCharacterPropertyDB.isMirrored(props)) {
        return ch;
    }
    if (!UCharacterPropertyDB.isExceptionIndicator(props)) {
        // Inline data: the stored signed value is the offset to the mirror.
        return ch + UCharacterPropertyDB.getSignedValue(props);
    }
    // Exception data: use the explicit mirror mapping when one is recorded.
    final int index = UCharacterPropertyDB.getExceptionIndex(props);
    if (PROPERTY_DB_.hasExceptionValue(index,
                            UCharacterPropertyDB.EXC_MIRROR_MAPPING_)) {
        return PROPERTY_DB_.getException(index,
                            UCharacterPropertyDB.EXC_MIRROR_MAPPING_);
    }
    return ch;
}
/**
* Gets the combining class of the argument codepoint
* @param ch code point whose combining is to be retrieved
* @return the combining class of the codepoint
*/
public static int getCombiningClass(int ch)
{
    final int props = getProps(ch);
    if (UCharacterPropertyDB.isExceptionIndicator(props)) {
        // The combining class is in bits 23..16 of the first exception
        // value: shift it down and keep the low byte.
        final int index = UCharacterPropertyDB.getExceptionIndex(props);
        final int packed = PROPERTY_DB_.getException(index,
                                UCharacterPropertyDB.EXC_COMBINING_CLASS_);
        return (packed >> SHIFT_16_) & LAST_BYTE_MASK_;
    }
    // Inline data: only non-spacing marks store a combining class; every
    // other category reports 0.
    if (UCharacterPropertyDB.getPropType(props)
            != UCharacterCategory.NON_SPACING_MARK) {
        return 0;
    }
    return PROPERTY_DB_.getUnsignedValue(props);
}
/**
* A code point is illegal if and only if
* <ul>
* <li> Out of bounds, less than 0 or greater than UCharacter.MAX_VALUE
* <li> A surrogate value, 0xD800 to 0xDFFF
* <li> Not-a-character, having the form 0x xxFFFF or 0x xxFFFE
* </ul>
* Note: legal does not mean that it is assigned in this version of Unicode.
* @param ch code point to determine if it is a legal code point by itself
* @return true if and only if legal.
*/
public static boolean isLegal(int ch)
{
    // below the valid range
    if (ch < MIN_VALUE) {
        return false;
    }
    // everything in range and below the surrogate block is legal
    if (ch < SURROGATE_MIN_VALUE_) {
        return true;
    }
    // from here on: must lie past the surrogates, must not be a
    // non-character, and must not exceed the maximum code point
    final boolean pastSurrogates = ch > SURROGATE_MAX_VALUE_;
    return pastSurrogates && !isNonCharacter(ch) && ch <= MAX_VALUE;
}
/**
* A string is legal iff all its code points are legal.
* A code point is illegal if and only if
* <ul>
* <li> Out of bounds, less than 0 or greater than UCharacter.MAX_VALUE
* <li> A surrogate value, 0xD800 to 0xDFFF
* <li> Not-a-character, having the form 0x xxFFFF or 0x xxFFFE
* </ul>
* Note: legal does not mean that it is assigned in this version of Unicode.
* @param str string whose code points are to be checked
* @return true if and only if every code point in str is legal.
*/
public static boolean isLegal(String str)
{
    final int length = str.length();
    int index = 0;
    while (index < length) {
        final int cp = UTF16.charAt(str, index);
        if (!isLegal(cp)) {
            return false;
        }
        // a supplementary code point takes up two chars in the string
        index += isSupplementary(cp) ? 2 : 1;
    }
    return true;
}
/**
* Gets the version of Unicode data used.
* @return the unicode version number used
*/
public static String getUnicodeVersion()
{
    // the version string is a field of the property database instance
    final String version = PROPERTY_DB_.m_unicodeversion_;
    return version;
}
/**
* Retrieve the most current Unicode name of the argument code point, or
* null if the character is unassigned or outside the range
* UCharacter.MIN_VALUE and UCharacter.MAX_VALUE.<br>
* Note calling any methods related to code point names, e.g. get*Name*()
* incurs a one-time initialisation cost to construct the name tables.
* @param ch the code point for which to get the name
* @return most current Unicode name
*/
public static String getName(int ch)
{
    // delegate to the name tables, asking for the current Unicode name
    final String currentName = UCharacterName.getName(
                            ch, UCharacterNameChoice.U_UNICODE_CHAR_NAME);
    return currentName;
}
/**
* Retrieve the earlier version 1.0 Unicode name of the argument code point,
* or null if the character is unassigned or outside the range
* UCharacter.MIN_VALUE and UCharacter.MAX_VALUE.<br>
* <br>
* Note calling any methods related to code point names, e.g. get*Name*()
* incurs a one-time initialisation cost to construct the name tables.
* @param ch the code point for which to get the name
* @return version 1.0 Unicode name
*/
public static String getName1_0(int ch)
{
    // delegate to the name tables, asking for the Unicode 1.0 name
    final String oldName = UCharacterName.getName(
                         ch, UCharacterNameChoice.U_UNICODE_10_CHAR_NAME);
    return oldName;
}
/**
* Find a Unicode code point by its most current Unicode name and return its
* code point value.<br>
* Note calling any methods related to code point names, e.g. get*Name*()
* incurs a one-time initialisation cost to construct the name tables.
* @param name most current Unicode character name whose code point is to be
* returned
* @return code point or -1 if name is not found
*/
public static int getCharFromName(String name)
{
    // look the name up among the current Unicode names
    final int codepoint = UCharacterName.getCharFromName(
                          UCharacterNameChoice.U_UNICODE_CHAR_NAME, name);
    return codepoint;
}
/**
* Find a Unicode character by its version 1.0 Unicode name and return its
* code point value.<br>
* Note calling any methods related to code point names, e.g. get*Name*()
* incurs a one-time initialisation cost to construct the name tables.
* @param name Unicode 1.0 code point name whose code point is to be
* returned
* @return code point or -1 if name is not found
*/
public static int getCharFromName1_0(String name)
{
    // look the name up among the Unicode 1.0 names
    final int codepoint = UCharacterName.getCharFromName(
                       UCharacterNameChoice.U_UNICODE_10_CHAR_NAME, name);
    return codepoint;
}
/**
* Returns a code point corresponding to the two UTF16 characters.<br>
* If the argument lead is not a high surrogate character or trail is not a
* low surrogate character, UCharacter.REPLACEMENT_CHAR is returned.
* @param lead the lead char
* @param trail the trail char
* @return code point or UCharacter.REPLACEMENT_CHAR if surrogate characters
* are invalid.
*/
public static int getCodePoint(char lead, char trail)
{
    final boolean wellFormedPair = UTF16.isLeadSurrogate(lead)
                                   && UTF16.isTrailSurrogate(trail);
    if (!wellFormedPair) {
        // not a lead surrogate followed by a trail surrogate
        return UCharacter.REPLACEMENT_CHAR;
    }
    return getRawSupplementary(lead, trail);
}
/**
* Returns the code point corresponding to the UTF16 character.<br>
* If argument char16 is not a legal code point by itself (see
* isLegal(int)), e.g. a surrogate character, an IllegalArgumentException
* is thrown.
* @param char16 the UTF16 character
* @return the code point value of char16
* @exception IllegalArgumentException thrown when char16 is not a valid
* codepoint
*/
public static int getCodePoint(char char16)
{
    // reject anything that is not a legal code point on its own
    if (!UCharacter.isLegal(char16)) {
        throw new IllegalArgumentException("Illegal codepoint");
    }
    return char16;
}
/**
* Gets uppercase version of the argument string.
* Casing is dependent on the default locale and context-sensitive.
* @param str source string to be performed on
* @return uppercase version of the argument string
*/
public static String toUpperCase(String str)
{
    // same as the locale-taking overload, using the JVM default locale
    final Locale defaultLocale = Locale.getDefault();
    return toUpperCase(defaultLocale, str);
}
/**
* Gets lowercase version of the argument string.
* Casing is dependent on the default locale and context-sensitive
* @param str source string to be performed on
* @return lowercase version of the argument string
*/
public static String toLowerCase(String str)
{
    // same as the locale-taking overload, using the JVM default locale
    final Locale defaultLocale = Locale.getDefault();
    return toLowerCase(defaultLocale, str);
}
/**
* Gets uppercase version of the argument string.
* Casing is dependent on the argument locale and context-sensitive.
* @param locale which string is to be converted in
* @param str source string to be performed on
* @return uppercase version of the argument string
*/
public static String toUpperCase(Locale locale, String str)
{
    final int length = str.length();
    final StringBuffer upper = new StringBuffer(length); // initial buffer
    for (int pos = 0; pos < length; ) {
        int cp = UTF16.charAt(str, pos);
        // width of the source code point; taken before cp is remapped
        final int width = UTF16.getCharCount(cp);
        final int props = PROPERTY_DB_.getProperty(cp);
        if (UCharacterPropertyDB.isExceptionIndicator(props)) {
            final int excIndex = UCharacterPropertyDB.getExceptionIndex(props);
            if (PROPERTY_DB_.hasExceptionValue(excIndex,
                                UCharacterPropertyDB.EXC_SPECIAL_CASING_)) {
                // special casing appends its own output, which depends on
                // the locale and on the context around pos
                getSpecialUpperCase(cp, excIndex, upper, str, pos, locale);
            }
            else {
                // simple 1:1 exception mapping, if there is one
                if (PROPERTY_DB_.hasExceptionValue(excIndex,
                                    UCharacterPropertyDB.EXC_UPPERCASE_)) {
                    cp = PROPERTY_DB_.getException(excIndex,
                                    UCharacterPropertyDB.EXC_UPPERCASE_);
                }
                UTF16.append(upper, cp);
            }
        }
        else {
            // for lowercase letters the property word holds the signed
            // distance to subtract to reach the uppercase form
            if (UCharacterPropertyDB.getPropType(props)
                                   == UCharacterCategory.LOWERCASE_LETTER) {
                cp -= UCharacterPropertyDB.getSignedValue(props);
            }
            UTF16.append(upper, cp);
        }
        pos += width;
    }
    return upper.toString();
}
/**
* Gets lowercase version of the argument string.
* Casing is dependent on the argument locale and context-sensitive
* @param locale which string is to be converted in
* @param str source string to be performed on
* @return lowercase version of the argument string
*/
public static String toLowerCase(Locale locale, String str)
{
    final int length = str.length();
    final StringBuffer lower = new StringBuffer(length);
    // case mapping loop
    for (int pos = 0; pos < length; ) {
        int cp = UTF16.charAt(str, pos);
        // width of the source code point; taken before cp is remapped
        final int width = UTF16.getCharCount(cp);
        final int props = PROPERTY_DB_.getProperty(cp);
        if (UCharacterPropertyDB.isExceptionIndicator(props)) {
            final int excIndex = UCharacterPropertyDB.getExceptionIndex(props);
            if (PROPERTY_DB_.hasExceptionValue(excIndex,
                                UCharacterPropertyDB.EXC_SPECIAL_CASING_)) {
                // special casing appends its own output, which depends on
                // the locale and on the context around pos
                getSpecialLowerCase(cp, excIndex, lower, str, pos, locale);
            }
            else {
                // simple 1:1 exception mapping, if there is one
                if (PROPERTY_DB_.hasExceptionValue(excIndex,
                                    UCharacterPropertyDB.EXC_LOWERCASE_)) {
                    cp = PROPERTY_DB_.getException(excIndex,
                                    UCharacterPropertyDB.EXC_LOWERCASE_);
                }
                UTF16.append(lower, cp);
            }
        }
        else {
            // for upper- and titlecase letters the property word holds
            // the signed distance to add to reach the lowercase form
            final int category = UCharacterPropertyDB.getPropType(props);
            final boolean upperOrTitle =
                           category == UCharacterCategory.UPPERCASE_LETTER
                        || category == UCharacterCategory.TITLECASE_LETTER;
            if (upperOrTitle) {
                cp += UCharacterPropertyDB.getSignedValue(props);
            }
            UTF16.append(lower, cp);
        }
        pos += width;
    }
    return lower.toString();
}
// TODO: Make public API
/**
* returns the maximum amount that a single character will expand in
* upper, lower, title, or fold case operations
*/
static int getMaxCaseExpansion() {
    // fixed upper bound, in chars, reported for any single-character
    // case operation
    final int maxExpansion = 10;
    return maxExpansion;
}
// TODO: Make public API?
/**
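* Produces the lowercase mapping of the single (possibly supplementary)
* code point found at offset in str.
* @param locale locale passed on to the special-casing rules
* @param str source string; kept whole because special casing looks at
* the context around offset
* @param offset index in str of the code point to convert
* @param result array that receives the mapping, written from index 0
* @return length of returned value IF there is a change. -1 otherwise.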
*/
static int toLowerCase(Locale locale, String str, int offset, char[] result) {
    // NOTE: we have to keep the original string around, because it is used
    // for the context
    int ch = UTF16.charAt(str, offset);
    int props = PROPERTY_DB_.getProperty(ch);
    if (!UCharacterPropertyDB.isExceptionIndicator(props)) {
        int type = UCharacterPropertyDB.getPropType(props);
        if (type == UCharacterCategory.UPPERCASE_LETTER ||
            type == UCharacterCategory.TITLECASE_LETTER) {
            // the property word holds the signed distance to the lowercase
            // form; a distance of 0 means the character maps to itself
            int chDelta = UCharacterPropertyDB.getSignedValue(props);
            if (chDelta == 0) {
                return -1;
            }
            return UTF16.append(result, 0, ch + chDelta);
        }
    } else {
        int index = UCharacterPropertyDB.getExceptionIndex(props);
        if (PROPERTY_DB_.hasExceptionValue(index,
                            UCharacterPropertyDB.EXC_SPECIAL_CASING_)) {
            // TODO: avoid StringBuffer, put directly into array?
            // special casing may yield several chars; collect them in a
            // buffer and copy them to the start of result
            StringBuffer buf = new StringBuffer();
            getSpecialLowerCase(ch, index, buf, str, offset,
                                locale);
            Utility.getChars(buf, 0, buf.length(), result, 0);
            return buf.length();
        } else if (PROPERTY_DB_.hasExceptionValue(index,
                            UCharacterPropertyDB.EXC_LOWERCASE_)) {
            // simple 1:1 lowercase mapping stored as an exception
            return UTF16.append(result, 0, PROPERTY_DB_.getException(index,
                            UCharacterPropertyDB.EXC_LOWERCASE_));
        }
    }
    // no mapping applies: signal "unchanged"
    return -1;
}
// TODO: Make public API?
/**
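* Produces the uppercase mapping of the single (possibly supplementary)
* code point found at offset in str.
* @param locale locale passed on to the special-casing rules
* @param str source string; kept whole because special casing looks at
* the context around offset
* @param offset index in str of the code point to convert
* @param result array that receives the mapping, written from index 0
* @return length of returned value IF there is a change. -1 otherwise.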
*/
static int toUpperCase(Locale locale, String str, int offset, char[] result) {
    // NOTE: we have to keep the original string around, because it is used
    // for the context
    int ch = UTF16.charAt(str, offset);
    int props = PROPERTY_DB_.getProperty(ch);
    if (!UCharacterPropertyDB.isExceptionIndicator(props)) {
        int type = UCharacterPropertyDB.getPropType(props);
        if (type == UCharacterCategory.LOWERCASE_LETTER ||
            type == UCharacterCategory.TITLECASE_LETTER) {
            // the property word holds the signed distance that is
            // subtracted to reach the uppercase form; a distance of 0
            // means the character maps to itself
            int chDelta = UCharacterPropertyDB.getSignedValue(props);
            if (chDelta == 0) {
                return -1;
            }
            return UTF16.append(result, 0, ch - chDelta);
        }
    } else {
        int index = UCharacterPropertyDB.getExceptionIndex(props);
        if (PROPERTY_DB_.hasExceptionValue(index,
                            UCharacterPropertyDB.EXC_SPECIAL_CASING_)) {
            // TODO: avoid StringBuffer, put directly into array?
            // special casing may yield several chars; collect them in a
            // buffer and copy them to the start of result
            StringBuffer buf = new StringBuffer();
            getSpecialUpperCase(ch, index, buf, str, offset,
                                locale);
            Utility.getChars(buf, 0, buf.length(), result, 0);
            return buf.length();
        } else if (PROPERTY_DB_.hasExceptionValue(index,
                            UCharacterPropertyDB.EXC_UPPERCASE_)) {
            // simple 1:1 uppercase mapping stored as an exception
            return UTF16.append(result, 0, PROPERTY_DB_.getException(index,
                            UCharacterPropertyDB.EXC_UPPERCASE_));
        }
    }
    // no mapping applies: signal "unchanged"
    return -1;
}
// TODO: Make public API?
/**
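* Produces the titlecase mapping of the single (possibly supplementary)
* code point found at offset in str.
* @param locale locale passed on to the special-casing rules
* @param str source string; kept whole because special casing looks at
* the context around offset
* @param offset index in str of the code point to convert
* @param result array that receives the mapping, written from index 0
* @return length of returned value IF there is a change. -1 otherwise.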
*/
static int toTitleCase(Locale locale, String str, int offset, char[] result) {
    // NOTE: we have to keep the original string around, because it is used
    // for the context
    // TODO: simplify code by checking for the few special titlecases,
    // and just jump to uppercase for the rest.
    int ch = UTF16.charAt(str, offset);
    int props = PROPERTY_DB_.getProperty(ch);
    if (!UCharacterPropertyDB.isExceptionIndicator(props)) {
        int type = UCharacterPropertyDB.getPropType(props);
        if (type == UCharacterCategory.LOWERCASE_LETTER) {
            // here, titlecase is same as uppercase; a distance of 0 means
            // the character maps to itself
            int chDelta = UCharacterPropertyDB.getSignedValue(props);
            if (chDelta == 0) {
                return -1;
            }
            return UTF16.append(result, 0, ch - chDelta);
        }
    } else {
        int index = UCharacterPropertyDB.getExceptionIndex(props);
        if (PROPERTY_DB_.hasExceptionValue(index,
                            UCharacterPropertyDB.EXC_TITLECASE_)) {
            // a dedicated titlecase mapping takes precedence
            return UTF16.append(result, 0, PROPERTY_DB_.getException(index,
                            UCharacterPropertyDB.EXC_TITLECASE_));
        } else if (PROPERTY_DB_.hasExceptionValue(index,
                            UCharacterPropertyDB.EXC_SPECIAL_CASING_)) {
            // TODO: avoid StringBuffer, put directly into array?
            // no titlecase entry: the special *uppercase* mapping is used
            StringBuffer buf = new StringBuffer();
            getSpecialUpperCase(ch, index, buf, str, offset,
                                locale);
            Utility.getChars(buf, 0, buf.length(), result, 0);
            return buf.length();
        } else if (PROPERTY_DB_.hasExceptionValue(index,
                            UCharacterPropertyDB.EXC_UPPERCASE_)) {
            // last resort: the simple 1:1 uppercase mapping
            return UTF16.append(result, 0, PROPERTY_DB_.getException(index,
                            UCharacterPropertyDB.EXC_UPPERCASE_));
        }
    }
    // no mapping applies: signal "unchanged"
    return -1;
}
/**
* The given character is mapped to its case folding equivalent according to
* UnicodeData.txt and CaseFolding.txt; if the character has no case folding
* equivalent, the character itself is returned.
* Only "simple", single-code point case folding mappings are used.
* For "full", multiple-code point mappings use the API
* foldCase(String str, boolean defaultmapping).
* @param ch the character to be converted
* @param defaultmapping Indicates if all mappings defined in CaseFolding.txt
* is to be used, otherwise the mappings for dotted I
* and dotless i marked with 'I' in CaseFolding.txt will
* be skipped.
* @return the case folding equivalent of the character, if any;
* otherwise the character itself.
* @see #foldCase(String, boolean)
*/
public static int foldCase(int ch, boolean defaultmapping)
{
    final int props = PROPERTY_DB_.getProperty(ch);
    if (!UCharacterPropertyDB.isExceptionIndicator(props)) {
        // plain entry: upper- and titlecase letters fold by adding the
        // signed distance held in the property word
        final int category = UCharacterPropertyDB.getPropType(props);
        final boolean upperOrTitle =
                       category == UCharacterCategory.UPPERCASE_LETTER
                    || category == UCharacterCategory.TITLECASE_LETTER;
        if (upperOrTitle) {
            return ch + UCharacterPropertyDB.getSignedValue(props);
        }
        return ch; // no mapping - return the character itself
    }

    final int excIndex = UCharacterPropertyDB.getExceptionIndex(props);
    if (PROPERTY_DB_.hasExceptionValue(excIndex,
                              UCharacterPropertyDB.EXC_CASE_FOLDING_)) {
        final int folding = PROPERTY_DB_.getException(excIndex,
                              UCharacterPropertyDB.EXC_CASE_FOLDING_);
        if (folding == 0) {
            // special case folding mappings, hardcoded
            final boolean dottedOrDotlessI =
                           ch == LATIN_SMALL_LETTER_DOTLESS_I_
                        || ch == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_;
            if (defaultmapping && dottedOrDotlessI) {
                // map dotted I and dotless i to U+0069 small i
                return LATIN_SMALL_LETTER_I_;
            }
            // return ch itself because it is excluded from case folding
            return ch;
        }
        final int simpleFold =
                   PROPERTY_DB_.getFoldCase(folding & LAST_CHAR_MASK_);
        if (simpleFold != 0) {
            return simpleFold;
        }
        // no simple folding available: deliberately continue with the
        // lowercase exception below
    }
    if (PROPERTY_DB_.hasExceptionValue(excIndex,
                              UCharacterPropertyDB.EXC_LOWERCASE_)) {
        return PROPERTY_DB_.getException(excIndex,
                              UCharacterPropertyDB.EXC_LOWERCASE_);
    }
    return ch; // no mapping - return the character itself
}
/**
* The given string is mapped to its case folding equivalent according to
* UnicodeData.txt and CaseFolding.txt; if any character has no case folding
* equivalent, the character itself is returned.
* "Full", multiple-code point case folding mappings are returned here.
* For "simple" single-code point mappings use the API
* foldCase(int ch, boolean defaultmapping).
* @param str the String to be converted
* @param defaultmapping Indicates if all mappings defined in CaseFolding.txt
* is to be used, otherwise the mappings for dotted I
* and dotless i marked with 'I' in CaseFolding.txt will
* be skipped.
* @return the case folding equivalent of the string; characters without a
* case folding equivalent are copied unchanged.
* @see #foldCase(int, boolean)
*/
public static String foldCase(String str, boolean defaultmapping)
{
    final int length = str.length();
    final StringBuffer folded = new StringBuffer(length);
    // case mapping loop
    for (int pos = 0; pos < length; ) {
        int cp = UTF16.charAt(str, pos);
        pos += UTF16.getCharCount(cp);
        final int props = PROPERTY_DB_.getProperty(cp);
        if (UCharacterPropertyDB.isExceptionIndicator(props)) {
            final int excIndex = UCharacterPropertyDB.getExceptionIndex(props);
            if (PROPERTY_DB_.hasExceptionValue(excIndex,
                                  UCharacterPropertyDB.EXC_CASE_FOLDING_)) {
                final int folding = PROPERTY_DB_.getException(excIndex,
                                  UCharacterPropertyDB.EXC_CASE_FOLDING_);
                if (folding != 0) {
                    // full mapping: low 16 bits and the bits above bit 23
                    // are handed to the database, which appends the result
                    PROPERTY_DB_.getFoldCase(folding & LAST_CHAR_MASK_,
                                             folding >> SHIFT_24_, folded);
                }
                else if (defaultmapping &&
                         (cp == LATIN_SMALL_LETTER_DOTLESS_I_ ||
                          cp == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_)) {
                    // special case folding mappings, hardcoded:
                    // map dotted I and dotless i to U+0069 small i
                    folded.append(LATIN_SMALL_LETTER_I_);
                }
                else {
                    // output the code point itself because it is excluded
                    // from case folding
                    UTF16.append(folded, cp);
                }
                // output already written; skip the 1:1 append below
                continue;
            }
            if (PROPERTY_DB_.hasExceptionValue(excIndex,
                                  UCharacterPropertyDB.EXC_LOWERCASE_)) {
                cp = PROPERTY_DB_.getException(excIndex,
                                  UCharacterPropertyDB.EXC_LOWERCASE_);
            }
        }
        else {
            final int category = UCharacterPropertyDB.getPropType(props);
            if (category == UCharacterCategory.UPPERCASE_LETTER ||
                category == UCharacterCategory.TITLECASE_LETTER) {
                cp += UCharacterPropertyDB.getSignedValue(props);
            }
        }
        // handle 1:1 code point mappings from UnicodeData.txt
        UTF16.append(folded, cp);
    }
    return folded.toString();
}
/**
* Return numeric value of Han code points.
* <br> This returns the value of Han 'numeric' code points,
* including those for zero, ten, hundred, thousand, ten thousand,
* and hundred million. Unicode does not consider these to be
* numeric. This includes both the standard and 'checkwriting'
* characters, the 'big circle' zero character, and the standard
* zero character.
* @draft
* @param ch code point to query
* @return value if it is a Han 'numeric character,' otherwise return -1.
*/
public static int getHanNumericValue(int ch)
{
    // each value is reachable through a standard form and, where one is
    // defined, a 'complex' (checkwriting) form
    final int value;
    switch (ch) {
    case IDEOGRAPHIC_NUMBER_ZERO_ :
    case CJK_IDEOGRAPH_COMPLEX_ZERO_ :
        value = 0; // Han Zero
        break;
    case CJK_IDEOGRAPH_FIRST_ :
    case CJK_IDEOGRAPH_COMPLEX_ONE_ :
        value = 1; // Han One
        break;
    case CJK_IDEOGRAPH_SECOND_ :
    case CJK_IDEOGRAPH_COMPLEX_TWO_ :
        value = 2; // Han Two
        break;
    case CJK_IDEOGRAPH_THIRD_ :
    case CJK_IDEOGRAPH_COMPLEX_THREE_ :
        value = 3; // Han Three
        break;
    case CJK_IDEOGRAPH_FOURTH_ :
    case CJK_IDEOGRAPH_COMPLEX_FOUR_ :
        value = 4; // Han Four
        break;
    case CJK_IDEOGRAPH_FIFTH_ :
    case CJK_IDEOGRAPH_COMPLEX_FIVE_ :
        value = 5; // Han Five
        break;
    case CJK_IDEOGRAPH_SIXTH_ :
    case CJK_IDEOGRAPH_COMPLEX_SIX_ :
        value = 6; // Han Six
        break;
    case CJK_IDEOGRAPH_SEVENTH_ :
    case CJK_IDEOGRAPH_COMPLEX_SEVEN_ :
        value = 7; // Han Seven
        break;
    case CJK_IDEOGRAPH_EIGHTH_ :
    case CJK_IDEOGRAPH_COMPLEX_EIGHT_ :
        value = 8; // Han Eight
        break;
    case CJK_IDEOGRAPH_NINETH_ :
    case CJK_IDEOGRAPH_COMPLEX_NINE_ :
        value = 9; // Han Nine
        break;
    case CJK_IDEOGRAPH_TEN_ :
    case CJK_IDEOGRAPH_COMPLEX_TEN_ :
        value = 10;
        break;
    case CJK_IDEOGRAPH_HUNDRED_ :
    case CJK_IDEOGRAPH_COMPLEX_HUNDRED_ :
        value = 100;
        break;
    case CJK_IDEOGRAPH_THOUSAND_ :
    case CJK_IDEOGRAPH_COMPLEX_THOUSAND_ :
        value = 1000;
        break;
    case CJK_IDEOGRAPH_TEN_THOUSAND_ :
        value = 10000;
        break;
    case CJK_IDEOGRAPH_HUNDRED_MILLION_ :
        value = 100000000;
        break;
    default :
        value = -1; // no value
        break;
    }
    return value;
}
// protected variables ===================================
/**
* Number of bits a lead surrogate is shifted left by when a surrogate pair
* is combined into a supplementary code point (see getRawSupplementary).
*/
protected static final int LEAD_SURROGATE_SHIFT_ = 10;
/**
* Mask that selects the low 10 bits of a surrogate value.
*/
protected static final int TRAIL_SURROGATE_MASK_ = 0x3FF;
// protected methods ====================================================
/**
* Forms a supplementary code point from the argument character<br>
* Note this is for internal use hence no checks for the validity of the
* surrogate characters are done
* @param lead lead surrogate character
* @param trail trailing surrogate character
* @return code point of the supplementary character
*/
protected static int getRawSupplementary(char lead, char trail)
{
    // move the lead surrogate above the bits contributed by the trail
    final int shiftedLead = lead << LEAD_SURROGATE_SHIFT_;
    // SURROGATE_OFFSET_ cancels the 0xD800/0xDC00 bases of both halves and
    // adds the supplementary minimum, so no masking is needed
    return shiftedLead + trail + SURROGATE_OFFSET_;
}
// private variables =====================================
/**
* Database storing the sets of character property
*/
private static final UCharacterPropertyDB PROPERTY_DB_;
/**
* Initialization of the UCharacterPropertyDB instance.
* RuntimeException thrown when data is missing or data has been corrupted;
* the exception that caused the failure is attached as its cause.
*/
static
{
    try
    {
        PROPERTY_DB_ = new UCharacterPropertyDB();
    }
    catch (Exception e)
    {
        // keep the message callers already see, but chain the original
        // exception so its type and stack trace are not lost
        throw new RuntimeException(e.getMessage(), e);
    }
}
/**
* Offset to add to a combined surrogate pair to avoid masking: it subtracts
* the shifted lead base (0xD800) and the trail base (0xDC00) and adds
* SUPPLEMENTARY_MIN_VALUE in a single step.
*/
private static final int SURROGATE_OFFSET_ =
SUPPLEMENTARY_MIN_VALUE - (0xD800 << LEAD_SURROGATE_SHIFT_) - 0xDC00;
/**
* Surrogate code point values: first and last value of the surrogate block
*/
private static final int SURROGATE_MIN_VALUE_ = 0xD800;
private static final int SURROGATE_MAX_VALUE_ = 0xDFFF;
/**
* To get the last character (low 16 bits) out from a data type
*/
private static final int LAST_CHAR_MASK_ = 0xFFFF;
/**
* To get the last byte (low 8 bits) out from a data type
*/
private static final int LAST_BYTE_MASK_ = 0xFF;
/**
* Shift 16 bits
*/
private static final int SHIFT_16_ = 16;
/**
* Shift 24 bits
*/
private static final int SHIFT_24_ = 24;
/**
* Minimum suffix value that indicates if a character is non character.
* Unicode 3.0 non characters
*/
private static final int NON_CHARACTER_SUFFIX_MIN_3_0_ = 0xFFFE;
/**
* New minimum non character in Unicode 3.1
*/
private static final int NON_CHARACTER_MIN_3_1_ = 0xFDD0;
/**
* New non character range in Unicode 3.1: distance from
* NON_CHARACTER_MIN_3_1_ to the last value 0xFDEF
*/
private static final int NON_CHARACTER_RANGE_3_1_ =
0xFDEF - NON_CHARACTER_MIN_3_1_;
/**
* Decimal radix
*/
private static final int DECIMAL_RADIX_ = 10;
/**
* No break space code point
*/
private static final int NO_BREAK_SPACE_ = 0xA0;
/**
* Narrow no break space code point
*/
private static final int NARROW_NO_BREAK_SPACE_ = 0x202F;
/**
* Zero width no break space code point
*/
private static final int ZERO_WIDTH_NO_BREAK_SPACE_ = 0xFEFF;
/**
* Ideographic number zero code point
*/
private static final int IDEOGRAPHIC_NUMBER_ZERO_ = 0x3007;
/**
* CJK Ideograph, First code point
*/
private static final int CJK_IDEOGRAPH_FIRST_ = 0x4e00;
/**
* CJK Ideograph, Second code point
*/
private static final int CJK_IDEOGRAPH_SECOND_ = 0x4e8c;
/**
* CJK Ideograph, Third code point
*/
private static final int CJK_IDEOGRAPH_THIRD_ = 0x4e09;
/**
* CJK Ideograph, Fourth code point.
* The ideograph for four is U+56DB; the previous value 0x56d8 is a
* different, non-numeric ideograph.
*/
private static final int CJK_IDEOGRAPH_FOURTH_ = 0x56db;
/**
* CJK Ideograph, Fifth code point
*/
private static final int CJK_IDEOGRAPH_FIFTH_ = 0x4e94;
/**
* CJK Ideograph, Sixth code point
*/
private static final int CJK_IDEOGRAPH_SIXTH_ = 0x516d;
/**
* CJK Ideograph, Seventh code point
*/
private static final int CJK_IDEOGRAPH_SEVENTH_ = 0x4e03;
/**
* CJK Ideograph, Eighth code point
*/
private static final int CJK_IDEOGRAPH_EIGHTH_ = 0x516b;
/**
* CJK Ideograph, Ninth code point
*/
private static final int CJK_IDEOGRAPH_NINETH_ = 0x4e5d;
/**
* Application Program command code point
*/
private static final int APPLICATION_PROGRAM_COMMAND_ = 0x009F;
/**
* Unit separator code point
*/
private static final int UNIT_SEPERATOR_ = 0x001F;
/**
* Delete code point
*/
private static final int DELETE_ = 0x007F;
/**
* Turkish ISO 639 two-letter language code
*/
private static final String TURKISH_ = "tr";
/**
* Azerbaijani ISO 639 two-letter language code
*/
private static final String AZERBAIJANI_ = "az";
/**
* Lithuanian ISO 639 two-letter language code
*/
private static final String LITHUANIAN_ = "lt";
/**
* Latin lowercase i
*/
private static final char LATIN_SMALL_LETTER_I_ = 0x69;
/**
* Latin uppercase I
*/
private static final char LATIN_CAPITAL_LETTER_I_ = 0x49;
/**
* Latin capital letter i with dot above
*/
private static final char LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_ = 0x130;
/**
* Latin small letter dotless i
*/
private static final char LATIN_SMALL_LETTER_DOTLESS_I_ = 0x131;
/**
* Combining dot above
*/
private static final char COMBINING_DOT_ABOVE_ = 0x307;
/**
* Greek capital letter sigma
*/
private static final char GREEK_CAPITAL_LETTER_SIGMA_ = 0x3a3;
/**
* Greek small letter sigma
*/
private static final char GREEK_SMALL_LETTER_SIGMA_ = 0x3c3;
/**
* Greek small letter final sigma.
* NOTE(review): the constant is named RHO, but U+03C2 is GREEK SMALL LETTER
* FINAL SIGMA in the Unicode code charts (rho is U+03C1); the name is kept
* because code elsewhere may refer to it.
*/
private static final char GREEK_SMALL_LETTER_RHO_ = 0x3c2;
/**
* ISO control character first range upper limit 0x0 - 0x1F
*/
private static final int ISO_CONTROL_FIRST_RANGE_MAX_ = 0x1F;
/**
* Han digit characters: the 'complex' (checkwriting) forms of zero to ten,
* hundred and thousand, plus the standard forms of ten, hundred, thousand,
* ten thousand and hundred million, as used by getHanNumericValue.
*/
private static final int CJK_IDEOGRAPH_COMPLEX_ZERO_ = 0x96f6;
private static final int CJK_IDEOGRAPH_COMPLEX_ONE_ = 0x58f9;
private static final int CJK_IDEOGRAPH_COMPLEX_TWO_ = 0x8cb3;
private static final int CJK_IDEOGRAPH_COMPLEX_THREE_ = 0x53c3;
private static final int CJK_IDEOGRAPH_COMPLEX_FOUR_ = 0x8086;
private static final int CJK_IDEOGRAPH_COMPLEX_FIVE_ = 0x4f0d;
private static final int CJK_IDEOGRAPH_COMPLEX_SIX_ = 0x9678;
private static final int CJK_IDEOGRAPH_COMPLEX_SEVEN_ = 0x67d2;
private static final int CJK_IDEOGRAPH_COMPLEX_EIGHT_ = 0x634c;
private static final int CJK_IDEOGRAPH_COMPLEX_NINE_ = 0x7396;
private static final int CJK_IDEOGRAPH_TEN_ = 0x5341;
private static final int CJK_IDEOGRAPH_COMPLEX_TEN_ = 0x62fe;
private static final int CJK_IDEOGRAPH_HUNDRED_ = 0x767e;
private static final int CJK_IDEOGRAPH_COMPLEX_HUNDRED_ = 0x4f70;
private static final int CJK_IDEOGRAPH_THOUSAND_ = 0x5343;
private static final int CJK_IDEOGRAPH_COMPLEX_THOUSAND_ = 0x4edf;
// The ideograph for ten thousand is U+842C; the previous value 0x824c had
// two digits transposed and named an unrelated ideograph.
private static final int CJK_IDEOGRAPH_TEN_THOUSAND_ = 0x842c;
private static final int CJK_IDEOGRAPH_HUNDRED_MILLION_ = 0x5104;
/**
* Hyphens: HYPHEN and SOFT HYPHEN code points, both treated as case
* ignorable by isIgnorable
*/
private static final int HYPHEN_ = 0x2010;
private static final int SOFT_HYPHEN_ = 0xAD;
/**
* LATIN SMALL LETTER J
*/
private static final int LATIN_SMALL_LETTER_J_ = 0x6a;
/**
* LATIN SMALL LETTER I WITH OGONEK
*/
private static final int LATIN_SMALL_LETTER_I_WITH_OGONEK_ = 0x12f;
/**
* LATIN SMALL LETTER I WITH TILDE BELOW
*/
private static final int LATIN_SMALL_LETTER_I_WITH_TILDE_BELOW_ = 0x1e2d;
/**
* LATIN SMALL LETTER I WITH DOT BELOW
*/
private static final int LATIN_SMALL_LETTER_I_WITH_DOT_BELOW_ = 0x1ecb;
/**
* Combining class for combining mark above; the special-casing context
* checks compare getCombiningClass results against this value
*/
private static final int COMBINING_MARK_ABOVE_CLASS_ = 230;
/**
* LATIN CAPITAL LETTER J
*/
private static final int LATIN_CAPITAL_LETTER_J_ = 0x4a;
/**
* LATIN CAPITAL LETTER I WITH OGONEK
*/
private static final int LATIN_CAPITAL_I_WITH_OGONEK_ = 0x12e;
/**
* LATIN CAPITAL LETTER I WITH TILDE
*/
private static final int LATIN_CAPITAL_I_WITH_TILDE_ = 0x128;
/**
* LATIN CAPITAL LETTER I WITH GRAVE
*/
private static final int LATIN_CAPITAL_I_WITH_GRAVE_ = 0xcc;
/**
* LATIN CAPITAL LETTER I WITH ACUTE
*/
private static final int LATIN_CAPITAL_I_WITH_ACUTE_ = 0xcd;
/**
* COMBINING GRAVE ACCENT
*/
private static final int COMBINING_GRAVE_ACCENT_ = 0x300;
/**
* COMBINING ACUTE ACCENT
*/
private static final int COMBINING_ACUTE_ACCENT_ = 0x301;
/**
* COMBINING TILDE
*/
private static final int COMBINING_TILDE_ = 0x303;
// private methods ==============================================
/**
* Gets the correct property information from UCharacterPropertyDB
* @param ch character whose information is to be retrieved
* @return a 32 bit information, returns 0 if no data is found.
*/
private static int getProps(int ch)
{
    // only consult the database for values inside [MIN_VALUE, MAX_VALUE];
    // short-circuit && replaces the non-short-circuit & used before
    if (ch >= UCharacter.MIN_VALUE && ch <= UCharacter.MAX_VALUE) {
        return PROPERTY_DB_.getProperty(ch);
    }
    // out of range: report "no data"
    return 0;
}
/**
* Getting the locales used for case mapping
* @param locale to work with
* @return locale which the actual case mapping works with
*/
private static Locale getCaseLocale(Locale locale)
{
    final String language = locale.getLanguage();
    // only two-letter language codes can match; the locale can also have
    // no language at all
    if (language.length() == 2) {
        // Turkish and Azerbaijani share the same case mapping locale
        if (language.equals(TURKISH_) || language.equals(AZERBAIJANI_)) {
            return new Locale("tr", "TR");
        }
        if (language.equals(LITHUANIAN_)) {
            return new Locale("lt", "LT");
        }
    }
    // every other locale is used as given
    return locale;
}
/**
* In Unicode 3.1.1, an ignorable sequence is a sequence of *zero* or more
* characters from the set {HYPHEN, SOFT HYPHEN, general category = Mn}.
* (Expected to change!)
* @param ch codepoint
* @param cat category of the argument codepoint
* @return true if ch is case ignorable.
*/
private static boolean isIgnorable(int ch, int cat)
{
    // any non-spacing mark is ignorable regardless of its code point
    if (cat == UCharacterCategory.NON_SPACING_MARK) {
        return true;
    }
    // otherwise only the two hyphen code points qualify
    return ch == HYPHEN_ || ch == SOFT_HYPHEN_;
}
/**
* Determines if offset is not followed by a sequence consisting of
* an ignorable sequence and then a cased letter {Ll, Lu, Lt}.
* @param str string to determine
* @param offset offset in string to check
* @return false if any character after index in src is a cased letter
* @see SpecialCasing.txt
*/
private static boolean isCFINAL(String str, int offset)
{
    final int end = str.length();
    // start scanning just past the code point at offset
    int next = offset + UTF16.getCharCount(UTF16.charAt(str, offset));
    while (next < end) {
        final int cp = UTF16.charAt(str, next);
        final int category = getType(cp);
        final boolean cased =
                       category == UCharacterCategory.LOWERCASE_LETTER
                    || category == UCharacterCategory.UPPERCASE_LETTER
                    || category == UCharacterCategory.TITLECASE_LETTER;
        if (cased || !isIgnorable(cp, category)) {
            // a cased letter follows: not final (false);
            // any other non-ignorable character ends the scan: final (true)
            return !cased;
        }
        next += UTF16.getCharCount(cp);
    }
    // only ignorable characters, or nothing at all, follow
    return true;
}
/**
* Determines if offset is preceded by a sequence consisting of a cased
* letter {Ll, Lu, Lt} and an ignorable sequence, i.e. whether the character
* at offset is not in initial position.
* @param str string to determine
* @param offset offset in string to check
* @return true if any character before index in src is a cased letter
* @see SpecialCasing.txt
*/
private static boolean isNotCINITIAL(String str, int offset)
{
    // walk backwards, starting with the char right before offset
    int prev = offset - 1;
    while (prev >= 0) {
        final int cp = UTF16.charAt(str, prev);
        final int category = getType(cp);
        final boolean cased =
                       category == UCharacterCategory.LOWERCASE_LETTER
                    || category == UCharacterCategory.UPPERCASE_LETTER
                    || category == UCharacterCategory.TITLECASE_LETTER;
        if (cased || !isIgnorable(cp, category)) {
            // a cased letter precedes: not initial (true);
            // any other non-ignorable character ends the scan (false)
            return cased;
        }
        prev -= UTF16.getCharCount(cp);
    }
    // only ignorable characters, or nothing at all, precede
    return false;
}
/**
* Determines if a string at offset is preceded by any base characters
* { 'i', 'j', U+012f, U+1e2d, U+1ecb } with no intervening character with
* combining class = 230
* @param str string to be determined
* @param offset offset in string to check
* @return true if some characters preceding the offset index belongs to
* the set { 'i', 'j', U+012f, U+1e2d, U+1ecb }
* @see SpecialCasing.txt
*/
private static boolean isAFTER_i(String str, int offset)
{
    // walk backwards, starting with the char right before offset
    int prev = offset - 1;
    while (prev >= 0) {
        final int cp = UTF16.charAt(str, prev);
        switch (cp) {
        case LATIN_SMALL_LETTER_I_ :
        case LATIN_SMALL_LETTER_J_ :
        case LATIN_SMALL_LETTER_I_WITH_OGONEK_ :
        case LATIN_SMALL_LETTER_I_WITH_TILDE_BELOW_ :
        case LATIN_SMALL_LETTER_I_WITH_DOT_BELOW_ :
            return true; // preceded by TYPE_i
        default :
            break;
        }
        final int combiningClass = getCombiningClass(cp);
        if (combiningClass == 0
            || combiningClass == COMBINING_MARK_ABOVE_CLASS_) {
            // preceded by a different base character (not TYPE_i), or
            // intervening cc == 230
            return false;
        }
        prev -= UTF16.getCharCount(cp);
    }
    return false; // not preceded by TYPE_i
}
/**
* Determines if a string at offset is preceded by base characters 'I' with
* no intervening combining class = 230
* @param str string to be determined
* @param offset offset in string to check
* @return true if some characters preceding the offset index is the
* character 'I' with no intervening combining class = 230
* @see SpecialCasing.txt
*/
private static boolean isAFTER_I(String str, int offset)
{
    // walk backwards, starting with the char right before offset
    for (int prev = offset - 1; prev >= 0; ) {
        final int cp = UTF16.charAt(str, prev);
        if (cp == LATIN_CAPITAL_LETTER_I_) {
            return true; // preceded by I
        }
        final int combiningClass = getCombiningClass(cp);
        final boolean blocksContext = combiningClass == 0
                       || combiningClass == COMBINING_MARK_ABOVE_CLASS_;
        if (blocksContext) {
            // preceded by different base character (not I), or
            // intervening cc == 230
            return false;
        }
        prev -= UTF16.getCharCount(cp);
    }
    return false; // not preceded by I
}
/**
* Determines if a string at offset is followed by one or more characters
* of combining class = 230.
* @param str string to be determined
* @param offset offset in string to check
* @return true if a string at offset is followed by one or more characters
* of combining class = 230.
* @see SpecialCasing.txt
*/
private static boolean isFollowedByMOREABOVE(String str, int offset)
{
    int length = str.length();
    // Skip the code point AT offset. The width must come from that code
    // point, not from the one at index 0: otherwise the scan starts one
    // char off whenever the two differ in width (BMP vs. supplementary).
    offset += UTF16.getCharCount(UTF16.charAt(str, offset));
    while (offset < length) {
        int ch = UTF16.charAt(str, offset);
        int cc = getCombiningClass(ch);
        if (cc == COMBINING_MARK_ABOVE_CLASS_) {
            return true; // at least one cc==230 following
        }
        if (cc == 0) {
            return false; // next base character, no more cc==230 following
        }
        offset += UTF16.getCharCount(ch);
    }
    return false; // no more cc == 230 following
}
/**
* Determines if a string at offset is followed by a dot above
* with no characters of combining class == 230 in between
* @param str string to be determined
* @param offset offset in string to check
* @return true if a string at offset is followed by a dot above
* with no characters of combining class == 230 in between
* @see SpecialCasing.txt
*/
private static boolean isFollowedByDotAbove(String str, int offset)
{
int length = str.length();
offset += UTF16.getCharCount(UTF16.charAt(str, 0));
while (offset < length) {
int ch = UTF16.charAt(str, offset);
if (ch == COMBINING_DOT_ABOVE_) {
return true;
}
int cc = getCombiningClass(ch);
if (cc == 0 || cc == COMBINING_MARK_ABOVE_CLASS_) {
return false; // next base character or cc==230 in between
}
offset += UTF16.getCharCount(ch);
}
return false; // no dot above following
}
/**
* Special casing uppercase management.
* Appends the uppercase form of ch to buffer. When the special casing
* value of the exception is not negative, the mapping string is read from
* the data file. Otherwise the hardcoded, language dependent conditions
* below are tried in order before falling back to the normal mapping.
* Nothing is appended when a lithuanian dot above is removed.
* @param ch code point to convert
* @param index of exception containing special case information
* @param buffer to add uppercase
* @param str original string
* @param offset index of ch in str
* @param locale current locale; its language selects the turkish,
*        azerbaijani and lithuanian rules
*/
private static void getSpecialUpperCase(int ch, int index,
                                        StringBuffer buffer, String str,
                                        int offset, Locale locale)
{
    int exception = PROPERTY_DB_.getException(index,
                                UCharacterPropertyDB.EXC_SPECIAL_CASING_);
    if (exception >= 0) {
        // get the special case mapping string from the data file
        PROPERTY_DB_.getUpperCase(exception & LAST_CHAR_MASK_, buffer);
        return;
    }

    // negative value: use hardcoded conditions and mappings
    String language = locale.getLanguage();

    if (ch == LATIN_SMALL_LETTER_I_
        && (language.equals(TURKISH_) || language.equals(AZERBAIJANI_))) {
        // turkish: i maps to dotted I
        buffer.append(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_);
        return;
    }

    if (ch == COMBINING_DOT_ABOVE_ && language.equals(LITHUANIAN_)
        && isAFTER_i(str, offset)) {
        // lithuanian: remove DOT ABOVE after U+0069 "i" with upper or
        // titlecase, i.e. continue without any output
        return;
    }

    // no known conditional special case mapping, use a normal mapping:
    // the exception's uppercase value if there is one, else ch itself
    int uppercase = ch;
    if (PROPERTY_DB_.hasExceptionValue(index,
                                    UCharacterPropertyDB.EXC_UPPERCASE_)) {
        uppercase = PROPERTY_DB_.getException(index,
                                    UCharacterPropertyDB.EXC_UPPERCASE_);
    }
    UTF16.append(buffer, uppercase);
}
/**
* Special casing lowercase management.
* Appends the lowercase form of ch to buffer. When the special casing
* value of the exception is not negative, the mapping string is read from
* the data file. Otherwise the hardcoded conditions below are tried in
* order (lithuanian, turkish/azerbaijani, decomposed I+dot above, greek
* capital sigma) before falling back to the normal mapping. Nothing is
* appended when a dot above is removed.
* @param ch code point to convert
* @param index of exception containing special case information
* @param buffer to add lowercase
* @param str original string
* @param offset index of ch in str
* @param locale current locale
*/
private static void getSpecialLowerCase(int ch, int index,
                                        StringBuffer buffer, String str,
                                        int offset, Locale locale)
{
    int exception = PROPERTY_DB_.getException(index,
                                UCharacterPropertyDB.EXC_SPECIAL_CASING_);
    if (exception >= 0) {
        // get the special case mapping string from the data file
        PROPERTY_DB_.getLowerCase(exception & LAST_CHAR_MASK_, buffer);
        return;
    }

    // negative value: use hardcoded conditions and mappings
    String language = locale.getLanguage();

    if (language.equals(LITHUANIAN_)) {
        // base characters, find accents above
        boolean basewithaccent = (ch == LATIN_CAPITAL_LETTER_I_
                                  || ch == LATIN_CAPITAL_LETTER_J_
                                  || ch == LATIN_CAPITAL_I_WITH_OGONEK_)
                                 && isFollowedByMOREABOVE(str, offset);
        // precomposed with accent above, no need to find one
        boolean precomposed = ch == LATIN_CAPITAL_I_WITH_GRAVE_
                              || ch == LATIN_CAPITAL_I_WITH_ACUTE_
                              || ch == LATIN_CAPITAL_I_WITH_TILDE_;
        if (basewithaccent || precomposed) {
            // lithuanian: add a dot above if there are more accents
            // above (to always have the dot)
            // 1. the lowercase base letter
            switch (ch) {
            case LATIN_CAPITAL_LETTER_J_:
                buffer.append((char)LATIN_SMALL_LETTER_J_);
                break;
            case LATIN_CAPITAL_I_WITH_OGONEK_:
                buffer.append((char)LATIN_SMALL_LETTER_I_WITH_OGONEK_);
                break;
            default:
                // plain I and I with grave, acute or tilde; no other
                // value of ch can get here
                buffer.append((char)LATIN_SMALL_LETTER_I_);
                break;
            }
            // 2. the dot above, for all of them
            buffer.append((char)COMBINING_DOT_ABOVE_);
            // 3. the accent that was part of a precomposed character
            switch (ch) {
            case LATIN_CAPITAL_I_WITH_GRAVE_:
                buffer.append((char)COMBINING_GRAVE_ACCENT_);
                break;
            case LATIN_CAPITAL_I_WITH_ACUTE_:
                buffer.append((char)COMBINING_ACUTE_ACCENT_);
                break;
            case LATIN_CAPITAL_I_WITH_TILDE_:
                buffer.append((char)COMBINING_TILDE_);
                break;
            }
            /*
            Note: This handling of I and of dot above differs from
            Unicode 3.1.1's SpecialCasing-5.txt because the AFTER_i
            condition there does not work for decomposed I+dot above.
            This fix is being proposed to the UTC.
            */
            return;
        }
    }

    if ((language.equals(TURKISH_) || language.equals(AZERBAIJANI_))
        && ch == LATIN_CAPITAL_LETTER_I_
        && !isFollowedByDotAbove(str, offset)) {
        // turkish: I maps to dotless i
        // other languages or turkish with decomposed I+dot above:
        // I maps to i, which is left to the normal mapping at the end
        buffer.append(LATIN_SMALL_LETTER_DOTLESS_I_);
        return;
    }

    if (ch == COMBINING_DOT_ABOVE_ && isAFTER_I(str, offset)
        && !isFollowedByMOREABOVE(str, offset)) {
        // decomposed I+dot above becomes i (see handling of U+0049 for
        // turkish) and removes the dot above, i.e. continue without any
        // output
        return;
    }

    if (ch == GREEK_CAPITAL_LETTER_SIGMA_ && isCFINAL(str, offset)
        && isNotCINITIAL(str, offset)) {
        // greek capital sigma maps depending on surrounding cased letters
        // NOTE(review): the constant is named after rho although a form
        // of small sigma is expected here - confirm its value where it
        // is declared
        buffer.append(GREEK_SMALL_LETTER_RHO_);
        return;
    }

    // no known conditional special case mapping, use a normal mapping:
    // the exception's lowercase value if there is one, else ch itself
    int lowercase = ch;
    if (PROPERTY_DB_.hasExceptionValue(index,
                                    UCharacterPropertyDB.EXC_LOWERCASE_)) {
        lowercase = PROPERTY_DB_.getException(index,
                                    UCharacterPropertyDB.EXC_LOWERCASE_);
    }
    UTF16.append(buffer, lowercase);
}
/**
* Determines if codepoint is a non character.
* Two tests are made: the bits of the codepoint selected by
* LAST_CHAR_MASK_ are compared against NON_CHARACTER_SUFFIX_MIN_3_0_, and
* the codepoint is checked against the block of NON_CHARACTER_RANGE_3_1_
* code points that starts at NON_CHARACTER_MIN_3_1_.
* @param ch codepoint
* @return true if codepoint is a non character false otherwise
*/
private static boolean isNonCharacter(int ch)
{
    if ((ch & LAST_CHAR_MASK_) < NON_CHARACTER_SUFFIX_MIN_3_0_) {
        // not one of the suffix non characters, so only the 3.1 block
        // remains to be checked
        int distance = ch - NON_CHARACTER_MIN_3_1_;
        return 0 <= distance && distance <= NON_CHARACTER_RANGE_3_1_;
    }
    return true;
}
}