| /* |
| ****************************************************************************** |
| * Copyright (C) 1996-2001, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ****************************************************************************** |
| */ |
| // FILE NAME : unicode.h |
| // |
| // CREATED |
| // Wednesday, December 11, 1996 |
| // |
| // CREATED BY |
| // Helena Shih |
| // |
| // CHANGES |
| // Thursday, April 15, 1999 |
| // Modified the definitions of all the functions |
| // C++ Wrappers for Unicode |
| // CHANGES BY |
| // Madhu Katragadda |
| // 5/20/99 Madhu Added the function getVersion() |
| // 11/22/99 aliu Added MIN_RADIX, MAX_RADIX, digit, forDigit |
| //***************************************************************************** |
| |
| |
| |
| #ifndef UNICODE_H |
| #define UNICODE_H |
| |
| #include "unicode/utypes.h" |
| #include "unicode/uchar.h" |
| |
| U_NAMESPACE_BEGIN |
| /** |
| * This class is deprecated and will be removed. |
| * Use the C API, see uchar.h and utf.h. |
| * The Unicode class is a pure 1:1 wrapper for the functions and macros there. |
| * |
| * Old documentation: |
| * |
| * The Unicode class allows you to query the properties associated with |
| * individual Unicode character values. |
| * <p> |
| * The Unicode character information, provided implicitly by the |
| * <a href="http://www.unicode.org/">Unicode Standard</a>, |
| * includes information about the sript |
| * (for example, symbols or control characters) to which the character belongs, |
| * as well as semantic information such as whether a character is a digit or |
| * uppercase, lowercase, or uncased. |
| * <P> |
| * |
| * @subclassing Do not subclass. |
| * @deprecated To be removed after 2002-sep-30; use the C API, see uchar.h and utf.h. |
| */ |
| class U_COMMON_API Unicode |
| { |
| public: |
| /* |
| * In C++, static const members actually take up memory and need to be accessed. |
| * enum values are more like C #define's. |
| * The following is a collection of constants, not an enumeration type. |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| enum { |
| /** The lowest Unicode code point value. Code points are non-negative. */ |
| MIN_VALUE=0, |
| |
| /** |
| * The highest Unicode code point value (scalar value) according to |
| * The Unicode Standard. This is a 21-bit value (20.1 bits, rounded up). |
| * For a single character, UChar32 is a simple type that can hold any code point value. |
| */ |
| MAX_VALUE=0x10ffff, |
| |
| /** |
| * The maximum number of code units (UChar's) per code point (UChar32). |
| * This depends on the default UTF that the ICU library is compiled for. |
| * Currently, the only natively supported UTF is UTF-16, which means that |
| * UChar is 16 bits wide and this value is 2 (for surrogate pairs). |
| * This may change in the future. |
| */ |
| MAX_CHAR_LENGTH=UTF_MAX_CHAR_LENGTH, |
| |
| /** |
| * The minimum radix available for conversion to and from Strings. |
| * The constant value of this field is the smallest value permitted |
| * for the radix argument in radix-conversion methods such as the |
| * <code>digit</code> method and the <code>forDigit</code> |
| * method. |
| * |
| * @see Unicode#digit |
| * @see Unicode#forDigit |
| */ |
| MIN_RADIX=2, |
| |
| /** |
| * The maximum radix available for conversion to and from Strings. |
| * The constant value of this field is the largest value permitted |
| * for the radix argument in radix-conversion methods such as the |
| * <code>digit</code> method and the <code>forDigit</code> |
| * method. |
| * |
| * @see Unicode#digit |
| * @see Unicode#forDigit |
| */ |
| MAX_RADIX=36 |
| }; |
| |
| /** |
| * Public data for enumerated Unicode general category types |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| |
| enum EUnicodeGeneralTypes |
| { |
| UNASSIGNED = 0, |
| UPPERCASE_LETTER = 1, |
| LOWERCASE_LETTER = 2, |
| TITLECASE_LETTER = 3, |
| MODIFIER_LETTER = 4, |
| OTHER_LETTER = 5, |
| NON_SPACING_MARK = 6, |
| ENCLOSING_MARK = 7, |
| COMBINING_SPACING_MARK = 8, |
| DECIMAL_DIGIT_NUMBER = 9, |
| LETTER_NUMBER = 10, |
| OTHER_NUMBER = 11, |
| SPACE_SEPARATOR = 12, |
| LINE_SEPARATOR = 13, |
| PARAGRAPH_SEPARATOR = 14, |
| CONTROL = 15, |
| FORMAT = 16, |
| PRIVATE_USE = 17, |
| SURROGATE = 18, |
| DASH_PUNCTUATION = 19, |
| START_PUNCTUATION = 20, |
| END_PUNCTUATION = 21, |
| CONNECTOR_PUNCTUATION = 22, |
| OTHER_PUNCTUATION = 23, |
| MATH_SYMBOL = 24, |
| CURRENCY_SYMBOL = 25, |
| MODIFIER_SYMBOL = 26, |
| OTHER_SYMBOL = 27, |
| INITIAL_PUNCTUATION = 28, |
| FINAL_PUNCTUATION = 29, |
| GENERAL_TYPES_COUNT = 30 |
| }; |
| |
| /* Please keep these values in sync with UCharScript */ |
| /** |
| * These are the same values as uchar.h::UCharScript |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| enum EUnicodeScript |
| { |
| kBasicLatin=UBLOCK_BASIC_LATIN, |
| kLatin1Supplement, |
| kLatinExtendedA, |
| kLatinExtendedB, |
| kIPAExtension, |
| kSpacingModifier, |
| kCombiningDiacritical, |
| kGreek, |
| kCyrillic, |
| kArmenian, |
| kHebrew, |
| kArabic, |
| kSyriac, |
| kThaana, |
| kDevanagari, |
| kBengali, |
| kGurmukhi, |
| kGujarati, |
| kOriya, |
| kTamil, |
| kTelugu, |
| kKannada, |
| kMalayalam, |
| kSinhala, |
| kThai, |
| kLao, |
| kTibetan, |
| kMyanmar, |
| kGeorgian, |
| kHangulJamo, |
| kEthiopic, |
| kCherokee, |
| kUnifiedCanadianAboriginalSyllabics, |
| kogham, |
| kRunic, |
| kKhmer, |
| kMongolian, |
| kLatinExtendedAdditional, |
| kGreekExtended, |
| kGeneralPunctuation, |
| kSuperSubScript, |
| kCurrencySymbolScript, |
| kSymbolCombiningMark, |
| kLetterlikeSymbol, |
| kNumberForm, |
| kArrow, |
| kMathOperator, |
| kMiscTechnical, |
| kControlPicture, |
| kOpticalCharacter, |
| kEnclosedAlphanumeric, |
| kBoxDrawing, |
| kBlockElement, |
| kGeometricShape, |
| kMiscSymbol, |
| kDingbat, |
| kBraillePatterns, |
| kCJKRadicalsSupplement, |
| kKangxiRadicals, |
| kIdeographicDescriptionCharacters, |
| kCJKSymbolPunctuation, |
| kHiragana, |
| kKatakana, |
| kBopomofo, |
| kHangulCompatibilityJamo, |
| kKanbun, |
| kBopomofoExtended, |
| kEnclosedCJKLetterMonth, |
| kCJKCompatibility, |
| kCJKUnifiedIdeographExtensionA, |
| kCJKUnifiedIdeograph, |
| kYiSyllables, |
| kYiRadicals, |
| kHangulSyllable, |
| kHighSurrogate, |
| kHighPrivateUseSurrogate, |
| kLowSurrogate, |
| kPrivateUse, |
| kCJKCompatibilityIdeograph, |
| kAlphabeticPresentation, |
| kArabicPresentationA, |
| kCombiningHalfMark, |
| kCJKCompatibilityForm, |
| kSmallFormVariant, |
| kArabicPresentationB, |
| kNoScript, |
| kHalfwidthFullwidthForm, |
| kScriptCount=UBLOCK_COUNT |
| }; |
| |
| /** |
| * This specifies the language directional property of a character set. |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| enum EDirectionProperty { |
| LEFT_TO_RIGHT = 0, |
| RIGHT_TO_LEFT = 1, |
| EUROPEAN_NUMBER = 2, |
| EUROPEAN_NUMBER_SEPARATOR = 3, |
| EUROPEAN_NUMBER_TERMINATOR = 4, |
| ARABIC_NUMBER = 5, |
| COMMON_NUMBER_SEPARATOR = 6, |
| BLOCK_SEPARATOR = 7, |
| SEGMENT_SEPARATOR = 8, |
| WHITE_SPACE_NEUTRAL = 9, |
| OTHER_NEUTRAL = 10, |
| LEFT_TO_RIGHT_EMBEDDING = 11, |
| LEFT_TO_RIGHT_OVERRIDE = 12, |
| RIGHT_TO_LEFT_ARABIC = 13, |
| RIGHT_TO_LEFT_EMBEDDING = 14, |
| RIGHT_TO_LEFT_OVERRIDE = 15, |
| POP_DIRECTIONAL_FORMAT = 16, |
| DIR_NON_SPACING_MARK = 17, |
| BOUNDARY_NEUTRAL = 18 |
| }; |
| |
| /** |
| * Values returned by the getCellWidth() function. |
| * @see Unicode#getCellWidth |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| enum ECellWidths |
| { |
| ZERO_WIDTH = 0, |
| HALF_WIDTH = 1, |
| FULL_WIDTH = 2, |
| NEUTRAL = 3 |
| }; |
| |
| /** |
| * Does this code unit alone represent a Unicode code point? |
| * If so, then the code point value is the same as the code unit value, |
| * or <code>(UChar32)c</code>. |
| * Being a single, lead, or trail unit are mutually exclusive properties. |
| * |
| * @param c The code unit to be tested. |
| * @return Boolean value. |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool isSingle(UChar c); |
| |
| /** |
| * Is this code unit the first of a multiple-unit sequence? |
| * Being a single, lead, or trail unit are mutually exclusive properties. |
| * |
| * @param c The code unit to be tested. |
| * @return Boolean value. |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool isLead(UChar c); |
| |
| /** |
| * Is this code unit one of, but not the first, of a multiple-unit sequence? |
| * Being a single, lead, or trail unit are mutually exclusive properties. |
| * |
| * @param c The code unit to be tested. |
| * @return Boolean value. |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool isTrail(UChar c); |
| |
| /** |
| * Is this code point a surrogate character? |
| * Surrogates are not characters; they are reserved for |
| * use in UTF-16 strings as leading and trailing code units |
| * of multiple-unit sequences for single code points. |
| * |
| * @param c The code point to be tested. |
| * @return Boolean value. |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool isSurrogate(UChar32 c); |
| |
| /** |
| * Is this code point a Unicode character? |
| * The value range for Unicode characters is limited to |
| * 0x10ffff==MAX_VALUE, and some values within this |
| * range are reserved and not characters, too. |
| * Those are the surrogate values and all values where the least |
| * significant 16 bits are either 0xfffe or 0xffff. |
| * |
| * @param c The code point to be tested. |
| * @return Boolean value. |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool isUnicodeChar(UChar32 c); |
| |
| /** |
| * Is this code point an error value? |
| * In ICU, code point access with macros or functions does not result |
| * in a UErrorCode to be set if a code unit sequence is illegal |
| * or irregular, but instead the resulting code point will be |
| * one of few special error values. This function tests for one of those. |
| * |
| * @param c The code point to be tested. |
| * @return Boolean value. |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool isError(UChar32 c); |
| |
| /** |
| * Is this code point a Unicode character, and not an error value? |
| * This is an efficient combination of |
| * <code>isUnicodeChar(c) && !isError(c)</code>. |
| * |
| * @param c The code point to be tested. |
| * @return Boolean value. |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool isValid(UChar32 c); |
| |
| /** |
| * When writing code units for a given code point, is more than one |
| * code unit necessary? |
| * If not, then a single UChar value of <code>(UChar)c</code> can |
| * be written to a UChar array. Otherwise, multiple code units need to be |
| * calculated and written. |
| * |
| * @param c The code point to be tested. |
| * @return Boolean value. |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool needMultipleUChar(UChar32 c); |
| |
| /** |
| * When writing code units for a given code point, how many |
| * code units are necessary? |
| * |
| * @param c The code point to be tested. |
| * @return Boolean value. |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline int32_t charLength(UChar32 c); |
| |
| /** |
| * This function returns an average size of a UChar array compared to the |
| * size that it would need to hold similar text if UTF-16 were used. |
| * With UTF-16, this always returns its argument. |
| * With UTF-8, the number returned will be larger, with UTF-32, smaller. |
| * It will typically be less than <code>size*MAX_CHAR_LENGTH</code>. |
| * |
| * @param size The size of the array if UTF-16 were used. |
| * @return An average size necessary for the UTF that ICU was compiled for. |
| * (Only UTF-16 is supported right now, therefore, |
| * this will always be <code>size</code> itself. This may change in the future.) |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline int32_t arraySize(int32_t size); |
| |
| /** |
| * Determines whether the specified UChar is a lowercase character |
| * according to Unicode 2.1.2. |
| * |
| * @param ch the character to be tested |
| * @return true if the character is lowercase; false otherwise. |
| * |
| * @see Unicode#isUpperCase |
| * @see Unicode#isTitleCase |
| * @see Unicode#toLowerCase |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool isLowerCase(UChar32 ch); |
| |
| /** |
| * Determines whether the specified character is an uppercase character |
| * according to Unicode 2.1.2. |
| * |
| * @param ch the character to be tested |
| * @return true if the character is uppercase; false otherwise. |
| * @see Unicode#isLowerCase |
| * @see Unicode#isTitleCase |
| * @see Unicode#toUpperCase |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool isUpperCase(UChar32 ch); |
| |
| /** |
| * Determines whether the specified character is a titlecase character |
| * according to Unicode 2.1.2. |
| * |
| * @param ch the character to be tested |
| * @return true if the character is titlecase; false otherwise. |
| * @see Unicode#isUpperCase |
| * @see Unicode#isLowerCase |
| * @see Unicode#toTitleCase |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool isTitleCase(UChar32 ch); |
| |
| /** |
| * Determines whether the specified character is a digit according to Unicode |
| * 2.1.2. |
| * |
| * @param ch the character to be tested |
| * @return true if the character is a digit; false otherwise. |
| * @see Unicode#digit |
| * @see Unicode#forDigit |
| * @see Unicode#digitValue |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool isDigit(UChar32 ch); |
| |
| /** |
| * Determines whether the specified numeric value is actually a defined character |
| * according to Unicode 2.1.2. |
| * |
| * @param ch the character to be tested |
| * @return true if the character has a defined Unicode meaning; false otherwise. |
| * |
| * @see Unicode#isDigit |
| * @see Unicode#isLetter |
| * @see Unicode#isLetterOrDigit |
| * @see Unicode#isUpperCase |
| * @see Unicode#isLowerCase |
| * @see Unicode#isTitleCase |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool isDefined(UChar32 ch); |
| |
| /** |
| * Determines whether the specified character is a control character according |
| * to Unicode 2.1.2. |
| * |
| * @param ch the character to be tested |
| * @return true if the Unicode character is a control character; false otherwise. |
| * |
| * @see Unicode#isPrintable |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool isControl(UChar32 ch); |
| |
| /** |
| * Determines whether the specified character is a printable character according |
| * to Unicode 2.1.2. |
| * |
| * @param ch the character to be tested |
| * @return true if the Unicode character is a printable character; false otherwise. |
| * |
| * @see Unicode#isControl |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool isPrintable(UChar32 ch); |
| |
| /** |
| * Determines whether the specified character is of the base form according |
| * to Unicode 2.1.2. |
| * |
| * @param ch the character to be tested |
| * @return true if the Unicode character is of the base form; false otherwise. |
| * |
| * @see Unicode#isLetter |
| * @see Unicode#isDigit |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool isBaseForm(UChar32 ch); |
| |
| /** |
| * Determines whether the specified character is a letter |
| * according to Unicode 2.1.2. |
| * |
| * @param ch the character to be tested |
| * @return true if the character is a letter; false otherwise. |
| * |
| * |
| * @see Unicode#isDigit |
| * @see Unicode#isLetterOrDigit |
| * @see Unicode#isUpperCase |
| * @see Unicode#isLowerCase |
| * @see Unicode#isTitleCase |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool isLetter(UChar32 ch); |
| |
| /** |
| * A convenience method for determining if a Unicode character |
| * is allowed as the first character in a Java identifier. |
| * <P> |
| * A character may start a Java identifier if and only if |
| * it is one of the following: |
| * <ul> |
| * <li> a letter |
| * <li> a currency symbol (such as "$") |
| * <li> a connecting punctuation symbol (such as "_"). |
| * </ul> |
| * |
| * @param ch the Unicode character. |
| * @return TRUE if the character may start a Java identifier; |
| * FALSE otherwise. |
| * @see isJavaIdentifierPart |
| * @see isLetter |
| * @see isUnicodeIdentifierStart |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool isJavaIdentifierStart(UChar32 ch); |
| |
| /** |
| * A convenience method for determining if a Unicode character |
| * may be part of a Java identifier other than the starting |
| * character. |
| * <P> |
| * A character may be part of a Java identifier if and only if |
| * it is one of the following: |
| * <ul> |
| * <li> a letter |
| * <li> a currency symbol (such as "$") |
| * <li> a connecting punctuation character (such as "_"). |
| * <li> a digit |
| * <li> a numeric letter (such as a Roman numeral character) |
| * <li> a combining mark |
| * <li> a non-spacing mark |
| * <li> an ignorable control character |
| * </ul> |
| * |
| * @param ch the Unicode character. |
| * @return TRUE if the character may be part of a Unicode identifier; |
| * FALSE otherwise. |
| * @see isIdentifierIgnorable |
| * @see isJavaIdentifierStart |
| * @see isLetter |
| * @see isDigit |
| * @see isUnicodeIdentifierPart |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool isJavaIdentifierPart(UChar32 ch); |
| |
| /** |
| * A convenience method for determining if a Unicode character |
| * is allowed to start in a Unicode identifier. |
| * A character may start a Unicode identifier if and only if |
| * it is a letter. |
| * |
| * @param ch the Unicode character. |
| * @return TRUE if the character may start a Unicode identifier; |
| * FALSE otherwise. |
| * @see isJavaIdentifierStart |
| * @see isLetter |
| * @see isUnicodeIdentifierPart |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool isUnicodeIdentifierStart(UChar32 ch); |
| |
| /** |
| * A convenience method for determining if a Unicode character |
| * may be part of a Unicode identifier other than the starting |
| * character. |
| * <P> |
| * A character may be part of a Unicode identifier if and only if |
| * it is one of the following: |
| * <ul> |
| * <li> a letter |
| * <li> a connecting punctuation character (such as "_"). |
| * <li> a digit |
| * <li> a numeric letter (such as a Roman numeral character) |
| * <li> a combining mark |
| * <li> a non-spacing mark |
| * <li> an ignorable control character |
| * </ul> |
| * |
| * @param ch the Unicode character. |
| * @return TRUE if the character may be part of a Unicode identifier; |
| * FALSE otherwise. |
| * @see isIdentifierIgnorable |
| * @see isJavaIdentifierPart |
| * @see isLetterOrDigit |
| * @see isUnicodeIdentifierStart |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool isUnicodeIdentifierPart(UChar32 ch); |
| |
| /** |
| * A convenience method for determining if a Unicode character |
| * should be regarded as an ignorable character in a Java |
| * identifier or a Unicode identifier. |
| * <P> |
| * The following Unicode characters are ignorable in a Java identifier |
| * or a Unicode identifier: |
| * <table> |
| * <tr><td>0x0000 through 0x0008,</td> |
| * <td>ISO control characters that</td></tr> |
| * <tr><td>0x000E through 0x001B,</td> <td>are not whitespace</td></tr> |
| * <tr><td>and 0x007F through 0x009F</td></tr> |
| * <tr><td>0x200C through 0x200F</td> <td>join controls</td></tr> |
| * <tr><td>0x200A through 0x200E</td> <td>bidirectional controls</td></tr> |
| * <tr><td>0x206A through 0x206F</td> <td>format controls</td></tr> |
| * <tr><td>0xFEFF</td> <td>zero-width no-break space</td></tr> |
| * </table> |
| * |
| * @param ch the Unicode character. |
| * @return TRUE if the character may be part of a Unicode identifier; |
| * FALSE otherwise. |
| * @see isJavaIdentifierPart |
| * @see isUnicodeIdentifierPart |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool isIdentifierIgnorable(UChar32 ch); |
| |
| /** |
| * The given character is mapped to its lowercase equivalent according to |
| * Unicode 2.1.2; if the character has no lowercase equivalent, the character |
| * itself is returned. |
| * <P> |
| * A character has a lowercase equivalent if and only if a lowercase mapping |
| * is specified for the character in the Unicode 2.0 attribute table. |
| * <P> |
| * Unicode::toLowerCase() only deals with the general letter case conversion. |
| * For language specific case conversion behavior, use UnicodeString::toLower(). |
| * For example, the case conversion for dot-less i and dotted I in Turkish, |
| * or for final sigma in Greek. |
| * |
| * @param ch the character to be converted |
| * @return the lowercase equivalent of the character, if any; |
| * otherwise the character itself. |
| * |
| * @see UnicodeString#toLower |
| * @see Unicode#isLowerCase |
| * @see Unicode#isUpperCase |
| * @see Unicode#toUpperCase |
| * @see Unicode#toTitleCase |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UChar32 toLowerCase(UChar32 ch); |
| |
| /** |
| * The given character is mapped to its uppercase equivalent according to Unicode |
| * 2.1.2; if the character has no uppercase equivalent, the character itself is |
| * returned. |
| * <P> |
| * Unicode::toUpperCase() only deals with the general letter case conversion. |
| * For language specific case conversion behavior, use UnicodeString::toUpper(). |
| * For example, the case conversion for dot-less i and dotted I in Turkish, |
| * or ess-zed (i.e., "sharp S") in German. |
| * |
| * @param ch the character to be converted |
| * @return the uppercase equivalent of the character, if any; |
| * otherwise the character itself. |
| * |
| * @see UnicodeString#toUpper |
| * @see Unicode#isUpperCase |
| * @see Unicode#isLowerCase |
| * @see Unicode#toLowerCase |
| * @see Unicode#toTitleCase |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UChar32 toUpperCase(UChar32 ch); |
| |
| /** |
| * The given character is mapped to its titlecase equivalent according to Unicode |
| * 2.1.2. There are only four Unicode characters that are truly titlecase forms |
| * that are distinct from uppercase forms. As a rule, if a character has no |
| * true titlecase equivalent, its uppercase equivalent is returned. |
| * <P> |
| * A character has a titlecase equivalent if and only if a titlecase mapping |
| * is specified for the character in the Unicode 2.1.2 data. |
| * |
| * @param ch the character to be converted |
| * @return the titlecase equivalent of the character, if any; |
| * otherwise the character itself. |
| * @see Unicode#isTitleCase |
| * @see Unicode#toUpperCase |
| * @see Unicode#toLowerCase |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UChar32 toTitleCase(UChar32 ch); |
| |
| /** |
| * The given character is mapped to its case folding equivalent according to |
| * UnicodeData.txt and CaseFolding.txt; if the character has no case folding equivalent, the character |
| * itself is returned. |
| * Only "simple", single-code point case folding mappings are used. |
| * "Full" mappings are used by UnicodeString::foldCase(). |
| * |
| * @param c the character to be converted |
| * @param options Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I |
| * @return the case folding equivalent of the character, if any; |
| * otherwise the character itself. |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UChar32 |
| foldCase(UChar32 c, uint32_t options); |
| |
| /** |
| * Determines if the specified character is a Unicode space character |
| * according to Unicode 2.1.2. |
| * |
| * @param ch the character to be tested |
| * @return true if the character is a space character; false otherwise. |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool isSpaceChar(UChar32 ch); |
| |
| /** |
| * Determines if the specified character is white space according to ICU. |
| * A character is considered to be an ICU whitespace character if and only |
| * if it satisfies one of the following criteria: |
| * <ul> |
| * <li> It is a Unicode space separator (category "Zs"), but is not |
| * a no-break space (\u00A0 or \uFEFF). |
| * <li> It is a Unicode line separator (category "Zl"). |
| * <li> It is a Unicode paragraph separator (category "Zp"). |
| * <li> It is \u0009, HORIZONTAL TABULATION. |
| * <li> It is \u000A, LINE FEED. |
| * <li> It is \u000B, VERTICAL TABULATION. |
| * <li> It is \u000C, FORM FEED. |
| * <li> It is \u000D, CARRIAGE RETURN. |
| * <li> It is \u001C, FILE SEPARATOR. |
| * <li> It is \u001D, GROUP SEPARATOR. |
| * <li> It is \u001E, RECORD SEPARATOR. |
| * <li> It is \u001F, UNIT SEPARATOR. |
| * </ul> |
| * Note: This method corresponds to the Java method |
| * <tt>java.lang.Character.isWhitespace()</tt>. |
| * |
| * @param ch the character to be tested. |
| * @return true if the character is an ICU whitespace character; |
| * false otherwise. |
| * @see #isSpaceChar |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool isWhitespace(UChar32 ch); |
| |
| /** |
| * Returns a value indicating a character category according to Unicode |
| * 2.1.2. |
| * @param ch the character to be tested |
| * @return a value of type int, the character category. |
| * @see Unicode#UNASSIGNED |
| * @see Unicode#UPPERCASE_LETTER |
| * @see Unicode#LOWERCASE_LETTER |
| * @see Unicode#TITLECASE_LETTER |
| * @see Unicode#MODIFIER_LETTER |
| * @see Unicode#OTHER_LETTER |
| * @see Unicode#NON_SPACING_MARK |
| * @see Unicode#ENCLOSING_MARK |
| * @see Unicode#COMBINING_SPACING_MARK |
| * @see Unicode#DECIMAL_DIGIT_NUMBER |
| * @see Unicode#OTHER_NUMBER |
| * @see Unicode#SPACE_SEPARATOR |
| * @see Unicode#LINE_SEPARATOR |
| * @see Unicode#PARAGRAPH_SEPARATOR |
| * @see Unicode#CONTROL |
| * @see Unicode#PRIVATE_USE |
| * @see Unicode#SURROGATE |
| * @see Unicode#DASH_PUNCTUATION |
| * @see Unicode#OPEN_PUNCTUATION |
| * @see Unicode#CLOSE_PUNCTUATION |
| * @see Unicode#CONNECTOR_PUNCTUATION |
| * @see Unicode#OTHER_PUNCTUATION |
| * @see Unicode#LETTER_NUMBER |
| * @see Unicode#MATH_SYMBOL |
| * @see Unicode#CURRENCY_SYMBOL |
| * @see Unicode#MODIFIER_SYMBOL |
| * @see Unicode#OTHER_SYMBOL |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline int8_t getType(UChar32 ch); |
| |
| /** |
| * Returns the combining class of the code point as specified in UnicodeData.txt. |
| * |
| * @param c the code point of the character |
| * @return the combining class of the character |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline uint8_t getCombiningClass(UChar32 c); |
| |
| /** |
| * Returns the linguistic direction property of a character. |
| * <P> |
| * Returns the linguistic direction property of a character. |
| * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional |
| * property. |
| * @see #EDirectionProperty |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline EDirectionProperty characterDirection(UChar32 ch); |
| |
| /** |
| * Determines whether the character has the "mirrored" property. |
| * This property is set for characters that are commonly used in |
| * Right-To-Left contexts and need to be displayed with a "mirrored" |
| * glyph. |
| * |
| * @param c the character (code point, Unicode scalar value) to be tested |
| * @return TRUE if the character has the "mirrored" property |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UBool isMirrored(UChar32 c); |
| |
| /** |
| * Maps the specified character to a "mirror-image" character. |
| * For characters with the "mirrored" property, implementations |
| * sometimes need a "poor man's" mapping to another Unicode |
| * character (code point) such that the default glyph may serve |
| * as the mirror-image of the default glyph of the specified |
| * character. This is useful for text conversion to and from |
| * codepages with visual order, and for displays without glyph |
| * selecetion capabilities. |
| * |
| * @param c the character (code point, Unicode scalar value) to be mapped |
| * @return another Unicode code point that may serve as a mirror-image |
| * substitute, or c itself if there is no such mapping or c |
| * does not have the "mirrored" property |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UChar32 charMirror(UChar32 c); |
| |
| /** |
| * Returns the script associated with a character. |
| * @see #EUnicodeScript |
| * @draft |
| */ |
| static inline EUnicodeScript getScript(UChar32 ch); |
| |
| /** |
| * Returns a value indicating the display-cell width of the character |
| * when used in Asian text, according to the Unicode standard (see p. 6-130 |
| * of The Unicode Standard, Version 2.0). The results for various characters |
| * are as follows: |
| * <P> |
| * ZERO_WIDTH: Characters which are considered to take up no display-cell space: |
| * control characters |
| * format characters |
| * line and paragraph separators |
| * non-spacing marks |
| * combining Hangul jungseong |
| * combining Hangul jongseong |
| * unassigned Unicode values |
| * <P> |
| * HALF_WIDTH: Characters which take up half a cell in standard Asian text: |
| * all characters in the General Scripts Area except combining Hangul choseong |
| * and the characters called out specifically above as ZERO_WIDTH |
| * alphabetic and Arabic presentation forms |
| * halfwidth CJK punctuation |
| * halfwidth Katakana |
| * halfwidth Hangul Jamo |
| * halfwidth forms, arrows, and shapes |
| * <P> |
| * FULL_WIDTH: Characters which take up a full cell in standard Asian text: |
| * combining Hangul choseong |
| * all characters in the CJK Phonetics and Symbols Area |
| * all characters in the CJK Ideographs Area |
| * all characters in the Hangul Syllables Area |
| * CJK compatibility ideographs |
| * CJK compatibility forms |
| * small form variants |
| * fullwidth ASCII |
| * fullwidth punctuation and currency signs |
| * <P> |
| * NEUTRAL: Characters whose cell width is context-dependent: |
| * all characters in the Symbols Area, except those specifically called out above |
| * all characters in the Surrogates Area |
| * all charcaters in the Private Use Area |
| * <P> |
| * For Korean text, this algorithm should work properly with properly normalized Korean |
| * text. Precomposed Hangul syllables and non-combining jamo are all considered full- |
| * width characters. For combining jamo, we treat we treat choseong (initial consonants) |
| * as double-width characters and junseong (vowels) and jongseong (final consonants) |
| * as non-spacing marks. This will work right in text that uses the precomposed |
| * choseong characters instead of teo choseong characters in a row, and which uses the |
| * choseong filler character at the beginning of syllables that don't have an initial |
| * consonant. The results may be slightly off with Korean text following different |
| * conventions. |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline uint16_t getCellWidth(UChar32 ch); |
| |
| /** |
| * Retrieve the name of a Unicode character. |
| * Depending on <code>nameChoice</code>, the character name written |
| * into the buffer is the "modern" name or the name that was defined |
| * in Unicode version 1.0. |
| * The name contains only "invariant" characters |
| * like A-Z, 0-9, space, and '-'. |
| * |
| * @param code The character (code point) for which to get the name. |
| * It must be <code>0<=code<0x10ffff</code>. |
| * @param buffer Destination address for copying the name. |
| * @param bufferLength <code>==sizeof(buffer)</code> |
| * @param nameChoice Selector for which name to get. |
| * |
| * @see UCharNameChoice |
| * |
| * Example: |
| * <pre> |
| *   char buffer[100]; |
| *   UTextOffset length=Unicode::getCharName( |
| *   0x284, buffer, sizeof(buffer)); |
| *   |
| *   // use invariant-character conversion to Unicode |
| *   UnicodeString name(buffer, length, ""); |
| * </pre> |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UTextOffset |
| getCharName(uint32_t code, |
| char *buffer, UTextOffset bufferLength, |
| UCharNameChoice nameChoice=U_UNICODE_CHAR_NAME); |
| |
| /** |
| * Retrives the decimal numeric value of a digit character. |
| * @param ch the digit character for which to get the numeric value |
| * @return the numeric value of ch in decimal radix. This method returns |
| * -1 if ch is not a valid digit character. |
| * @see Unicode#digit |
| * @see Unicode#forDigit |
| * @see Unicode#isDigit |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline int32_t digitValue(UChar32 ch); |
| |
| /** |
| * Returns the numeric value of the character <code>ch</code> in the |
| * specified radix. |
| * <p> |
| * If the radix is not in the range <code>MIN_RADIX <= |
| * radix <= MAX_RADIX</code> or if the |
| * value of <code>ch</code> is not a valid digit in the specified |
| * radix, <code>-1</code> is returned. A character is a valid digit |
| * if at least one of the following is true: |
| * <ul> |
| * <li>The method <code>isDigit</code> is true of the character |
| * and the Unicode decimal digit value of the character (or its |
| * single-character decomposition) is less than the specified radix. |
| * In this case the decimal digit value is returned. |
| * <li>The character is one of the uppercase Latin letters |
| * <code>'A'</code> through <code>'Z'</code> and its code is less than |
| * <code>radix + 'A' - 10</code>. |
| * In this case, <code>ch - 'A' + 10</code> |
| * is returned. |
| * <li>The character is one of the lowercase Latin letters |
| * <code>'a'</code> through <code>'z'</code> and its code is less than |
| * <code>radix + 'a' - 10</code>. |
| * In this case, <code>ch - 'a' + 10</code> |
| * is returned. |
| * </ul> |
| * |
| * @param ch the character to be converted. |
| * @param radix the radix. |
| * @return the numeric value represented by the character in the |
| * specified radix. |
| * @see Unicode#MIN_RADIX |
| * @see Unicode#MAX_RADIX |
| * @see Unicode#forDigit |
| * @see Unicode#digitValue |
| * @see Unicode#isDigit |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline int32_t digit(UChar32 ch, int8_t radix); |
| |
| /** |
| * Determines the character representation for a specific digit in |
| * the specified radix. If the value of <code>radix</code> is not a |
| * valid radix, or the value of <code>digit</code> is not a valid |
| * digit in the specified radix, the null character |
| * (<code>U+0000</code>) is returned. |
| * <p> |
| * The <code>radix</code> argument is valid if it is greater than or |
| * equal to <code>MIN_RADIX</code> and less than or equal to |
| * <code>MAX_RADIX</code>. The <code>digit</code> argument is valid if |
| * <code>0 <= digit <= radix</code>. |
| * <p> |
| * If the digit is less than 10, then |
| * <code>'0' + digit</code> is returned. Otherwise, the value |
| * <code>'a' + digit - 10</code> is returned. |
| * |
| * @param digit the number to convert to a character. |
| * @param radix the radix. |
| * @return the <code>char</code> representation of the specified digit |
| * in the specified radix. |
| * @see Unicode#MIN_RADIX |
| * @see Unicode#MAX_RADIX |
| * @see Unicode#digit |
| * @see Unicode#digitValue |
| * @see Unicode#isDigit |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static inline UChar32 forDigit(int32_t digit, int8_t radix); |
| |
| /** |
| * Retrieves the Unicode Standard Version number that is used |
| * @param info the version # information, the result will be filled in |
| * |
| * @deprecated See the Unicode class description. |
| */ |
| static void getUnicodeVersion(UVersionInfo info); |
| |
| protected: |
| // These constructors, destructor, and assignment operator must |
| // be protected (not private, as they semantically are) to make |
| // various UNIX compilers happy. [LIU] |
| // They should be private to prevent anyone from instantiating or |
| // subclassing Unicode. |
| Unicode(); |
| Unicode(const Unicode &other); |
| ~Unicode(); |
| const Unicode &operator=(const Unicode &other); |
| }; |
| |
| /* inline implementations --------------------------------------------------- */ |
| |
| inline UBool |
| Unicode::isSingle(UChar c) { |
| return UTF_IS_SINGLE(c); |
| } |
| |
| inline UBool |
| Unicode::isLead(UChar c) { |
| return UTF_IS_LEAD(c); |
| } |
| |
| inline UBool |
| Unicode::isTrail(UChar c) { |
| return UTF_IS_TRAIL(c); |
| } |
| |
| inline UBool |
| Unicode::isSurrogate(UChar32 c) { |
| return UTF_IS_SURROGATE(c); |
| } |
| |
| inline UBool |
| Unicode::isUnicodeChar(UChar32 c) { |
| return UTF_IS_UNICODE_CHAR(c); |
| } |
| |
| inline UBool |
| Unicode::isError(UChar32 c) { |
| return UTF_IS_ERROR(c); |
| } |
| |
| inline UBool |
| Unicode::isValid(UChar32 c) { |
| return UTF_IS_VALID(c); |
| } |
| |
| inline UBool |
| Unicode::needMultipleUChar(UChar32 c) { |
| return UTF_NEED_MULTIPLE_UCHAR(c); |
| } |
| |
| inline int32_t |
| Unicode::charLength(UChar32 c) { |
| return UTF_CHAR_LENGTH(c); |
| } |
| |
| inline int32_t |
| Unicode::arraySize(int32_t size) { |
| return UTF_ARRAY_SIZE(size); |
| } |
| |
| // Checks if ch is a lower case letter. |
| inline UBool |
| Unicode::isLowerCase(UChar32 ch) { |
| return u_islower(ch); |
| } |
| |
| // Checks if ch is a upper case letter. |
| inline UBool |
| Unicode::isUpperCase(UChar32 ch) { |
| return u_isupper(ch); |
| } |
| |
| // Checks if ch is a title case letter; usually upper case letters. |
| inline UBool |
| Unicode::isTitleCase(UChar32 ch) { |
| return u_istitle(ch); |
| } |
| |
| // Checks if ch is a decimal digit. |
| inline UBool |
| Unicode::isDigit(UChar32 ch) { |
| return u_isdigit(ch); |
| } |
| |
| // Checks if ch is a unicode character with assigned character type. |
| inline UBool |
| Unicode::isDefined(UChar32 ch) { |
| return u_isdefined(ch); |
| } |
| |
| // Checks if the Unicode character is a control character. |
| inline UBool |
| Unicode::isControl(UChar32 ch) { |
| return u_iscntrl(ch); |
| } |
| |
| // Checks if the Unicode character is printable. |
| inline UBool |
| Unicode::isPrintable(UChar32 ch) { |
| return u_isprint(ch); |
| } |
| |
| // Checks if the Unicode character is a base form character that can take a diacritic. |
| inline UBool |
| Unicode::isBaseForm(UChar32 ch) { |
| return u_isbase(ch); |
| } |
| |
| // Checks if the Unicode character is a letter. |
| inline UBool |
| Unicode::isLetter(UChar32 ch) { |
| return u_isalpha(ch); |
| } |
| |
| // Checks if the Unicode character can start a Java identifier. |
| inline UBool |
| Unicode::isJavaIdentifierStart(UChar32 ch) { |
| return u_isJavaIDStart(ch); |
| } |
| |
| // Checks if the Unicode character can be a Java identifier part other than starting the |
| // identifier. |
| inline UBool |
| Unicode::isJavaIdentifierPart(UChar32 ch) { |
| return u_isJavaIDPart(ch); |
| } |
| |
| // Checks if the Unicode character can start a Unicode identifier. |
| inline UBool |
| Unicode::isUnicodeIdentifierStart(UChar32 ch) { |
| return u_isIDStart(ch); |
| } |
| |
| // Checks if the Unicode character can be a Unicode identifier part other than starting the |
| // identifier. |
| inline UBool |
| Unicode::isUnicodeIdentifierPart(UChar32 ch) { |
| return u_isIDPart(ch); |
| } |
| |
| // Checks if the Unicode character can be ignorable in a Java or Unicode identifier. |
| inline UBool |
| Unicode::isIdentifierIgnorable(UChar32 ch) { |
| return u_isIDIgnorable(ch); |
| } |
| |
| // Transforms the Unicode character to its lower case equivalent. |
| inline UChar32 |
| Unicode::toLowerCase(UChar32 ch) { |
| return u_tolower(ch); |
| } |
| |
| // Transforms the Unicode character to its upper case equivalent. |
| inline UChar32 |
| Unicode::toUpperCase(UChar32 ch) { |
| return u_toupper(ch); |
| } |
| |
| // Transforms the Unicode character to its title case equivalent. |
| inline UChar32 |
| Unicode::toTitleCase(UChar32 ch) { |
| return u_totitle(ch); |
| } |
| |
| // Transforms the Unicode character to its case folded equivalent. |
| inline UChar32 |
| Unicode::foldCase(UChar32 ch, uint32_t options) { |
| return u_foldCase(ch, options); |
| } |
| |
| // Checks if the Unicode character is a space character. |
| inline UBool |
| Unicode::isSpaceChar(UChar32 ch) { |
| return u_isspace(ch); |
| } |
| |
| // Determines if the specified character is white space according to ICU. |
| inline UBool |
| Unicode::isWhitespace(UChar32 ch) { |
| return u_isWhitespace(ch); |
| } |
| |
| // Gets if the Unicode character's character property. |
| inline int8_t |
| Unicode::getType(UChar32 ch) { |
| return u_charType(ch); |
| } |
| |
| inline uint8_t |
| Unicode::getCombiningClass(UChar32 c) { |
| return u_getCombiningClass(c); |
| } |
| |
| // Gets the character's linguistic directionality. |
| inline Unicode::EDirectionProperty |
| Unicode::characterDirection(UChar32 ch) { |
| return (EDirectionProperty)u_charDirection(ch); |
| } |
| |
| // Determines if the character has the "mirrored" property. |
| inline UBool |
| Unicode::isMirrored(UChar32 ch) { |
| return u_isMirrored(ch); |
| } |
| |
| // Maps the character to a "mirror-image" character, or to itself. |
| inline UChar32 |
| Unicode::charMirror(UChar32 ch) { |
| return u_charMirror(ch); |
| } |
| |
| // Get the script associated with the character |
| inline Unicode::EUnicodeScript |
| Unicode::getScript(UChar32 ch) { |
| return (EUnicodeScript) u_charScript(ch); |
| } |
| |
| // Gets table cell width of the Unicode character. |
| inline uint16_t |
| Unicode::getCellWidth(UChar32 ch) { |
| return u_charCellWidth(ch); |
| } |
| |
| inline UTextOffset |
| Unicode::getCharName(uint32_t code, |
| char *buffer, UTextOffset bufferLength, |
| UCharNameChoice nameChoice) { |
| UErrorCode errorCode=U_ZERO_ERROR; |
| UTextOffset length=u_charName(code, nameChoice, buffer, bufferLength, &errorCode); |
| return U_SUCCESS(errorCode) ? length : 0; |
| } |
| |
| inline int32_t |
| Unicode::digitValue(UChar32 ch) { |
| return u_charDigitValue(ch); |
| } |
| |
| inline int32_t |
| Unicode::digit(UChar32 ch, int8_t radix) { |
| return u_digit(ch, radix); |
| } |
| |
| inline UChar32 |
| Unicode::forDigit(int32_t digit, int8_t radix) { |
| return u_forDigit(digit, radix); |
| } |
| |
| inline void |
| Unicode::getUnicodeVersion(UVersionInfo versionArray) { |
| u_getUnicodeVersion(versionArray); |
| } |
| U_NAMESPACE_END |
| |
| #endif |