| /* |
| ******************************************************************************* |
| * Copyright (C) 1996-1999, International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| */ |
| // FILE NAME : unicode.cpp |
| // |
| // CREATED |
| // Wednesday, December 11, 1996 |
| // |
| // CHANGES |
| // Wednesday, February 4, 1998 |
// Changed the logic in toUpperCase and toLowerCase to
// avoid returning 0xFFFF when asked to lowercase or
// uppercase a confusing UChar
// (e.g. Letterlike symbols)
| // |
| // CHANGES BY |
| // Bertramd A. DAMIBA |
| // |
| // CREATED BY |
| // Helena Shih |
| // |
| // CHANGES |
| // Thursday, April 15, 1999 |
| // Modified the definitions of all the functions |
| // C++ Wrappers for Unicode |
| // CHANGES BY |
| // Madhu Katragadda |
| // 5/20/99 Madhu Added the function u_getVersion() |
| // 07/09/99 stephen Added definition for {MIN,MAX}_VALUE |
| // 11/22/99 aliu Added MIN_RADIX, MAX_RADIX, digit, forDigit |
| //******************************************************************************************** |
| |
| #include "unicode/unicode.h" |
| |
| #include "unicode/uchar.h" |
| |
| |
| const UChar Unicode::MIN_VALUE = 0x0000; |
| const UChar Unicode::MAX_VALUE = 0xFFFF; |
| const int8_t Unicode::MIN_RADIX = 2; |
| const int8_t Unicode::MAX_RADIX = 36; |
| |
| Unicode::Unicode() |
| { |
| } |
| |
| Unicode::Unicode(const Unicode& other) |
| { |
| } |
| |
| Unicode::~Unicode() |
| { |
| } |
| |
| const Unicode& |
| Unicode::operator=(const Unicode& other) |
| { |
| return *this; |
| } |
| |
| // Checks if ch is a lower case letter. |
| bool_t |
| Unicode::isLowerCase(UChar ch) |
| { |
| return (u_islower(ch) ); |
| } |
| |
| // Checks if ch is a upper case letter. |
| bool_t |
| Unicode::isUpperCase(UChar ch) |
| { |
| return (u_isupper(ch) ); |
| } |
| |
| // Checks if ch is a title case letter; usually upper case letters. |
| bool_t |
| Unicode::isTitleCase(UChar ch) |
| { |
| return (u_istitle(ch) ); |
| } |
| |
| // Checks if ch is a decimal digit. |
| bool_t |
| Unicode::isDigit(UChar ch) |
| { |
| return (u_isdigit(ch) ); |
| } |
| |
| // Checks if ch is a unicode character with assigned character type. |
| bool_t |
| Unicode::isDefined(UChar ch) |
| { |
| return (u_isdefined(ch) ); |
| } |
| |
| |
| // Gets the character's linguistic directionality. |
| Unicode::EDirectionProperty |
| Unicode::characterDirection( UChar ch ) |
| { |
| |
| return ((EDirectionProperty)u_charDirection(ch) ); |
| } |
| |
| // Get the script associated with the character |
| Unicode::EUnicodeScript |
| Unicode::getScript(UChar ch) |
| { |
| |
| |
| return ((EUnicodeScript) u_charScript(ch) ); |
| } |
| |
| // Checks if the Unicode character is a base form character that can take a diacritic. |
| bool_t |
| Unicode::isBaseForm(UChar ch) |
| { |
| return (u_isbase(ch) ); |
| |
| } |
| |
| // Checks if the Unicode character is a control character. |
| bool_t |
| Unicode::isControl(UChar ch) |
| { |
| return( u_iscntrl(ch) ); |
| } |
| |
| // Checks if the Unicode character is printable. |
| bool_t |
| Unicode::isPrintable(UChar ch) |
| { |
| return( u_isprint(ch) ); |
| } |
| |
| // Checks if the Unicode character is a letter. |
| bool_t |
| Unicode::isLetter(UChar ch) |
| { |
| return(u_isalpha(ch) ); |
| } |
| |
| // Checks if the Unicode character can start a Java identifier. |
| bool_t |
| Unicode::isJavaIdentifierStart(UChar ch) |
| { |
| return( u_isJavaIDStart(ch) ); |
| } |
| |
| // Checks if the Unicode character can be a Java identifier part other than starting the |
| // identifier. |
| bool_t |
| Unicode::isJavaIdentifierPart(UChar ch) |
| { |
| return (u_isJavaIDPart(ch) ); |
| } |
| |
| // Checks if the Unicode character can start a Unicode identifier. |
| bool_t |
| Unicode::isUnicodeIdentifierStart(UChar ch) |
| { |
| return(u_isIDStart(ch)); |
| } |
| |
| // Checks if the Unicode character can be a Unicode identifier part other than starting the |
| // identifier. |
| bool_t |
| Unicode::isUnicodeIdentifierPart(UChar ch) |
| { |
| return (u_isIDPart(ch) ); |
| } |
| |
| // Checks if the Unicode character can be ignorable in a Java or Unicode identifier. |
| bool_t |
| Unicode::isIdentifierIgnorable(UChar ch) |
| { |
| return( u_isIDIgnorable(ch) ); |
| } |
| |
| // Transforms the Unicode character to its lower case equivalent. |
| UChar |
| Unicode::toLowerCase(UChar ch) |
| { |
| return (u_tolower(ch) ); |
| |
| } |
| |
| // Transforms the Unicode character to its upper case equivalent. |
| UChar |
| Unicode::toUpperCase(UChar ch) |
| { |
| return(u_toupper(ch) ); |
| } |
| |
| // Transforms the Unicode character to its title case equivalent. |
| UChar |
| Unicode::toTitleCase(UChar ch) |
| { |
| return(u_totitle(ch) ); |
| } |
| |
| // Checks if the Unicode character is a space character. |
| bool_t |
| Unicode::isSpaceChar(UChar ch) |
| { |
| return(u_isspace(ch) ); |
| } |
| |
| // Determines if the specified character is white space according to ICU. |
| bool_t |
| Unicode::isWhitespace(UChar ch) { |
| // TODO Move this implementation to C, and make this call the C |
| // implementation. |
| // TODO Optional -- reimplement in terms of modified category |
| // code -- see Mark Davis's note (below). If this is done, |
| // the implementation still must conform to the specified |
| // semantics. That is, U+00A0 and U+FEFF must return false, |
| // and the ranges U+0009 - U+000D and U+001C - U+001F must |
| // return true. Characters other than these in Zs, Zl, or Zp |
| // must return true. |
| |
| int8_t cat = Unicode::getType(ch); |
| return |
| (cat == SPACE_SEPARATOR && ch != 0x00A0 && ch != 0xFEFF) || |
| (((((int32_t(1) << LINE_SEPARATOR) | |
| (int32_t(1) << PARAGRAPH_SEPARATOR)) >> cat) & int32_t(1)) != 0) || |
| (ch <= 0x1F && ((((int32_t(1) << 0x0009) | |
| (int32_t(1) << 0x000A) | |
| (int32_t(1) << 0x000B) | |
| (int32_t(1) << 0x000C) | |
| (int32_t(1) << 0x000D) | |
| (int32_t(1) << 0x001C) | |
| (int32_t(1) << 0x001D) | |
| (int32_t(1) << 0x001E) | |
| (int32_t(1) << 0x001F)) >> ch) & int32_t(1)) != 0); |
| |
| // From Mark Davis: |
| //| What we should do is to make sure that the special Cc characters like CR |
| //| have either Zs, Zl, or Zp in the property database. We can then just call |
| //| the equivalent of: |
| //| |
| //| public static boolean isWhileSpace(char ch) { |
| //| return ((1 << Character.getType(c)) & WHITESPACE_MASK) != 0; } |
| //| |
| //| where WHITESPACE_MASK = (1 << Zs) | (1 << Zl) | (1 << Zp); |
| //| |
| //| This is much faster code, since it just looksup the property value and does |
| //| a couple of arithmetics to get the right answer. |
| // |
| // (We still have to make sure U+00A0 and U+FEFF are excluded, so the code |
| // might not be as simple as this. - aliu) |
| } |
| |
| // Gets if the Unicode character's character property. |
| int8_t |
| Unicode::getType(UChar ch) |
| { |
| return(u_charType(ch) ); |
| } |
| |
| |
| |
| // Gets table cell width of the Unicode character. |
| uint16_t |
| Unicode::getCellWidth(UChar ch) |
| { |
| return (u_charCellWidth(ch) ); |
| } |
| |
| int32_t |
| Unicode::digitValue(UChar ch) |
| { |
| return (u_charDigitValue(ch) ); |
| } |
| |
| int8_t |
| Unicode::digit(UChar ch, int8_t radix) { |
| int8_t value = -1; |
| if (radix >= MIN_RADIX && radix <= MAX_RADIX) { |
| value = (int8_t) u_charDigitValue(ch); |
| if (value < 0) { |
| if (ch >= (UChar)'A' && ch <= (UChar)'Z') { |
| value = ch - ((UChar)'A' - 10); |
| } else if (ch >= (UChar)'a' && ch <= (UChar)'z') { |
| value = ch - ((UChar)'a' - 10); |
| } |
| } |
| } |
| return (value < radix) ? value : -1; |
| } |
| |
| UChar |
| Unicode::forDigit(int32_t digit, int8_t radix) { |
| if ((radix < MIN_RADIX) || (radix > MAX_RADIX) || |
| (digit < 0) || (digit >= radix)) { |
| return (UChar)0; |
| } |
| return (UChar)(((digit < 10) ? (UChar)'0' : ((UChar)'a' - 10)) |
| + digit); |
| } |
| |
| void |
| Unicode::getUnicodeVersion(UVersionInfo versionArray) |
| { |
| u_getUnicodeVersion(versionArray); |
| } |
| |
| |