blob: a38690a147ed9a08e7cd265364776dcd50505336 [file] [log] [blame]
/*
********************************************************************************
* *
* COPYRIGHT: *
* (C) Copyright Taligent, Inc., 1997 *
* (C) Copyright International Business Machines Corporation, 1997-1998 *
* Licensed Material - Program-Property of IBM - All Rights Reserved. *
* US Government Users Restricted Rights - Use, duplication, or disclosure *
* restricted by GSA ADP Schedule Contract with IBM Corp. *
* *
********************************************************************************
*
* File UCHAR.H
*
* Modification History:
*
* Date Name Description
* 04/02/97 aliu Creation.
* 03/29/99 helena Updated for C APIs.
* 4/15/99 Madhu Updated for C Implementation and Javadoc
* 5/20/99 Madhu Added the function u_getVersion()
* 8/19/1999 srl Upgraded scripts to Unicode 3.0
********************************************************************************
*/
#ifndef UCHAR_H
#define UCHAR_H
#include "utypes.h"
/*===========================================================================*/
/* Unicode version number */
/*===========================================================================*/
/*
* Version string of the Unicode standard this header advertises.
* NOTE(review): many function comments in this header still say
* "according to Unicode 2.1.2" while this macro says "3.0.0.beta" --
* confirm which data version the implementation actually uses.
*/
#define UNICODE_VERSION "3.0.0.beta"
/**
* The Unicode C API allows you to query the properties associated with individual
* Unicode character values.
* <p>
* The Unicode character information, provided implicitly by the
* Unicode character encoding standard, includes information about the script
* (for example, symbols or control characters) to which the character belongs,
* as well as semantic information such as whether a character is a digit or
* uppercase, lowercase, or uncased.
* <P>
*/
/*
* Pairs a 16-bit code unit (fUnicode) with a small signed value (fValue).
* NOTE(review): presumably maps a digit character to its numeric value --
* confirm against the implementation that uses it.
*/
typedef struct UCharDigitPair {
    uint16_t fUnicode;  /* 16-bit code unit */
    int8_t   fValue;    /* signed 8-bit value paired with fUnicode */
} UCharDigitPair;
/*
* Holds the first and last UChar of a code range (fFirstCode..fLastCode).
* NOTE(review): presumably one entry per UCharScript value -- confirm
* against the implementation that uses it.
*/
typedef struct BlockScriptMap {
    UChar fFirstCode;   /* first code of the range */
    UChar fLastCode;    /* last code of the range */
} BlockScriptMap;
/*
* File-local state flags and table-building helpers.
* NOTE(review): these are declared `static` in a header, so every translation
* unit that includes this file gets its own private copy of the three flags
* and its own declaration of the three functions. Presumably only the
* implementation file defines and uses them -- consider moving them there.
*/
/* All three flags start out FALSE. */
static bool_t tablesCreated=FALSE;
static bool_t ulTablesCreated=FALSE;
static bool_t dirTablesCreated=FALSE;
/* Declarations only; no definitions are visible in this header. */
static void createTables(void);
static void createUlTables(void);
static void createDirTables(void);
/**
* The Unicode C API allows you to query the properties associated with individual
* Unicode character values.
* <p>
* The Unicode character information, provided implicitly by the
* Unicode character encoding standard, includes information about the script
* (for example, symbols or control characters) to which the character belongs,
* as well as semantic information such as whether a character is a digit or
* uppercase, lowercase, or uncased.
* <P>
*/
/**
* Constants.
*/
/**
* The minimum value a UChar can have. The lowest value a
* UChar can have is 0x0000.
* NOTE(review): this is a static declaration without an initializer, so each
* including translation unit gets its own copy, zero-initialized unless that
* translation unit also provides an initialized definition.
*/
static UChar UCHAR_MIN_VALUE;
/**
* The maximum value a UChar can have. The greatest value a
* UChar can have is 0xffff.
* NOTE(review): declared static with no initializer here; unless the
* including translation unit also supplies an initialized definition, the
* object is zero-initialized and will NOT hold 0xffff -- confirm where
* (if anywhere) 0xffff is assigned.
*/
static UChar UCHAR_MAX_VALUE;
/**
* Enumerated Unicode general category types.
* The numeric values are fixed and run from 0 to 30.
* @see u_charType
*/
typedef enum UCharCategory
{
    UNASSIGNED             = 0,

    /* letters */
    UPPERCASE_LETTER       = 1,
    LOWERCASE_LETTER       = 2,
    TITLECASE_LETTER       = 3,
    MODIFIER_LETTER        = 4,
    OTHER_LETTER           = 5,

    /* marks */
    NON_SPACING_MARK       = 6,
    ENCLOSING_MARK         = 7,
    COMBINING_SPACING_MARK = 8,

    /* numbers */
    DECIMAL_DIGIT_NUMBER   = 9,
    LETTER_NUMBER          = 10,
    OTHER_NUMBER           = 11,

    /* separators */
    SPACE_SEPARATOR        = 12,
    LINE_SEPARATOR         = 13,
    PARAGRAPH_SEPARATOR    = 14,

    /* control, format, private use, surrogate */
    CONTROL                = 15,
    FORMAT                 = 16,
    PRIVATE_USE            = 17,
    SURROGATE              = 18,

    /* punctuation */
    DASH_PUNCTUATION       = 19,
    START_PUNCTUATION      = 20,
    END_PUNCTUATION        = 21,
    CONNECTOR_PUNCTUATION  = 22,
    OTHER_PUNCTUATION      = 23,

    /* symbols */
    MATH_SYMBOL            = 24,
    CURRENCY_SYMBOL        = 25,
    MODIFIER_SYMBOL        = 26,
    OTHER_SYMBOL           = 27,

    /* quote-style punctuation */
    INITIAL_PUNCTUATION    = 28,
    FINAL_PUNCTUATION      = 29,

    GENERAL_OTHER_TYPES    = 30
} UCharCategory;
/**
* Directional property values of a character.
* @see u_charDirection
*/
typedef enum UCharDirection {
    LEFT_TO_RIGHT              = 0,
    RIGHT_TO_LEFT              = 1,

    /* numbers and their separators/terminators */
    EUROPEAN_NUMBER            = 2,
    EUROPEAN_NUMBER_SEPARATOR  = 3,
    EUROPEAN_NUMBER_TERMINATOR = 4,
    ARABIC_NUMBER              = 5,
    COMMON_NUMBER_SEPARATOR    = 6,

    /* separators and neutrals */
    BLOCK_SEPARATOR            = 7,
    SEGMENT_SEPARATOR          = 8,
    WHITE_SPACE_NEUTRAL        = 9,
    OTHER_NEUTRAL              = 10,

    /* embeddings, overrides and the remaining values */
    LEFT_TO_RIGHT_EMBEDDING    = 11,
    LEFT_TO_RIGHT_OVERRIDE     = 12,
    RIGHT_TO_LEFT_ARABIC       = 13,
    RIGHT_TO_LEFT_EMBEDDING    = 14,
    RIGHT_TO_LEFT_OVERRIDE     = 15,
    POP_DIRECTIONAL_FORMAT     = 16,
    DIR_NON_SPACING_MARK       = 17,
    BOUNDARY_NEUTRAL           = 18
} UCharDirection;
/**
* Script ranges as defined in the Unicode standard.
* The values are assigned implicitly, starting at 0 in the order listed.
* SCRIPT_COUNT follows the last real entry and therefore equals the number
* of entries before it; NO_SCRIPT follows SCRIPT_COUNT.
* @see u_charScript
*/
/* Generated from Unicode Data files */
typedef enum UCharScript {
    /* Script names */
    BASIC_LATIN,
    LATIN_1_SUPPLEMENT,
    LATIN_EXTENDED_A,
    LATIN_EXTENDED_B,
    IPA_EXTENSIONS,
    SPACING_MODIFIER_LETTERS,
    COMBINING_DIACRITICAL_MARKS,
    GREEK,
    CYRILLIC,
    ARMENIAN,
    HEBREW,
    ARABIC,
    SYRIAC,
    THAANA,
    DEVANAGARI,
    BENGALI,
    GURMUKHI,
    GUJARATI,
    ORIYA,
    TAMIL,
    TELUGU,
    KANNADA,
    MALAYALAM,
    SINHALA,
    THAI,
    LAO,
    TIBETAN,
    MYANMAR,
    GEORGIAN,
    HANGUL_JAMO,
    ETHIOPIC,
    CHEROKEE,
    UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS,
    OGHAM,
    RUNIC,
    KHMER,
    MONGOLIAN,
    LATIN_EXTENDED_ADDITIONAL,
    GREEK_EXTENDED,
    GENERAL_PUNCTUATION,
    SUPERSCRIPTS_AND_SUBSCRIPTS,
    CURRENCY_SYMBOLS,
    COMBINING_MARKS_FOR_SYMBOLS,
    LETTERLIKE_SYMBOLS,
    NUMBER_FORMS,
    ARROWS,
    MATHEMATICAL_OPERATORS,
    MISCELLANEOUS_TECHNICAL,
    CONTROL_PICTURES,
    OPTICAL_CHARACTER_RECOGNITION,
    ENCLOSED_ALPHANUMERICS,
    BOX_DRAWING,
    BLOCK_ELEMENTS,
    GEOMETRIC_SHAPES,
    MISCELLANEOUS_SYMBOLS,
    DINGBATS,
    BRAILLE_PATTERNS,
    CJK_RADICALS_SUPPLEMENT,
    KANGXI_RADICALS,
    IDEOGRAPHIC_DESCRIPTION_CHARACTERS,
    CJK_SYMBOLS_AND_PUNCTUATION,
    HIRAGANA,
    KATAKANA,
    BOPOMOFO,
    HANGUL_COMPATIBILITY_JAMO,
    KANBUN,
    BOPOMOFO_EXTENDED,
    ENCLOSED_CJK_LETTERS_AND_MONTHS,
    CJK_COMPATIBILITY,
    CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
    CJK_UNIFIED_IDEOGRAPHS,
    YI_SYLLABLES,
    YI_RADICALS,
    HANGUL_SYLLABLES,
    HIGH_SURROGATES,
    HIGH_PRIVATE_USE_SURROGATES,
    LOW_SURROGATES,
    PRIVATE_USE_AREA, /* PRIVATE_USE */
    CJK_COMPATIBILITY_IDEOGRAPHS,
    ALPHABETIC_PRESENTATION_FORMS,
    ARABIC_PRESENTATION_FORMS_A,
    COMBINING_HALF_MARKS,
    CJK_COMPATIBILITY_FORMS,
    SMALL_FORM_VARIANTS,
    ARABIC_PRESENTATION_FORMS_B,
    SPECIALS,
    HALFWIDTH_AND_FULLWIDTH_FORMS,

    /* Bookkeeping entries that follow the real script names */
    SCRIPT_COUNT,
    NO_SCRIPT,

    /* Enums for compatibility with ICU 1.2.4 and previous */
    LATIN1_SUPPLEMENT           = LATIN_1_SUPPLEMENT,
    IPA_EXTENSION               = IPA_EXTENSIONS,
    SPACING_MODIFIER            = SPACING_MODIFIER_LETTERS,
    COMBINING_DIACRITICAL       = COMBINING_DIACRITICAL_MARKS,
    SUPER_SUBSCRIPT             = SUPERSCRIPTS_AND_SUBSCRIPTS,
    CURRENCY_SYMBOL_SCRIPT      = CURRENCY_SYMBOLS,
    SYMBOL_COMBINING_MARK       = COMBINING_MARKS_FOR_SYMBOLS,
    LETTERLIKE_SYMBOL           = LETTERLIKE_SYMBOLS,
    NUMBER_FORM                 = NUMBER_FORMS,
    ARROW                       = ARROWS,
    MATH_OPERATOR               = MATHEMATICAL_OPERATORS,
    MISC_TECHNICAL              = MISCELLANEOUS_TECHNICAL,
    CONTROL_PICTURE             = CONTROL_PICTURES,
    OPTICAL_CHARACTER           = OPTICAL_CHARACTER_RECOGNITION,
    ENCLOSED_ALPHANUMERIC       = ENCLOSED_ALPHANUMERICS,
    BOXDRAWING                  = BOX_DRAWING,
    BLOCK_ELEMENT               = BLOCK_ELEMENTS,
    GEOMETRIC_SHAPE             = GEOMETRIC_SHAPES,
    MISC_SYMBOL                 = MISCELLANEOUS_SYMBOLS,
    DINGBAT                     = DINGBATS,
    CJK_SYMBOL_PUNCTUATION      = CJK_SYMBOLS_AND_PUNCTUATION,
    ENCLOSED_CJK_LETTER_MONTH   = ENCLOSED_CJK_LETTERS_AND_MONTHS,
    CJK_UNIFIED_IDEOGRAPH       = CJK_UNIFIED_IDEOGRAPHS,
    HANGUL_SYLLABLE             = HANGUL_SYLLABLES,
    HIGH_SURROGATE              = HIGH_SURROGATES,
    HIGH_PRIVATE_USE_SURROGATE  = HIGH_PRIVATE_USE_SURROGATES,
    LOW_SURROGATE               = LOW_SURROGATES,
    PRIVATE_USE_CHARACTERS      = PRIVATE_USE_AREA,
    CJK_COMPATIBILITY_IDEOGRAPH = CJK_COMPATIBILITY_IDEOGRAPHS,
    ALPHABETIC_PRESENTATION     = ALPHABETIC_PRESENTATION_FORMS,
    ARABIC_PRESENTATION_A       = ARABIC_PRESENTATION_FORMS_A,
    COMBINING_HALFMARK          = COMBINING_HALF_MARKS,
    CJK_COMPATIBILITY_FORM      = CJK_COMPATIBILITY_FORMS,
    SMALL_FORM_VARIANT          = SMALL_FORM_VARIANTS,
    ARABIC_PRESENTATION_B       = ARABIC_PRESENTATION_FORMS_B,
    HALFWIDTH_FULLWIDTH_FORM    = HALFWIDTH_AND_FULLWIDTH_FORMS
} UCharScript;
/**
* Display-cell width classes described by u_charCellWidth().
* @see u_charCellWidth
*/
typedef enum UCellWidth
{
    ZERO_WIDTH = 0,
    HALF_WIDTH = 1,
    FULL_WIDTH = 2,
    NEUTRAL    = 3
} UCellWidth;
/**
* Functions to classify characters.
*/
/**
* Determines whether the specified UChar is a lowercase character
* according to Unicode 2.1.2.
*
* @param c the character to be tested
* @return TRUE if the character is lowercase; FALSE otherwise.
* @see UNICODE_VERSION
* @see u_isupper
* @see u_istitle
* @see u_tolower
*/
CAPI bool_t U_EXPORT2
u_islower(UChar c);
/**
* Determines whether the specified character is an uppercase character
* according to Unicode 2.1.2.
*
* @param c the character to be tested
* @return TRUE if the character is uppercase; FALSE otherwise.
* @see u_islower
* @see u_istitle
* @see u_tolower
*/
CAPI bool_t U_EXPORT2
u_isupper(UChar c);
/**
* Determines whether the specified character is a titlecase character
* according to Unicode 2.1.2.
*
* @param c the character to be tested
* @return TRUE if the character is titlecase; FALSE otherwise.
* @see u_isupper
* @see u_islower
* @see u_totitle
*/
CAPI bool_t U_EXPORT2
u_istitle(UChar c);
/**
* Determines whether the specified character is a digit according to Unicode
* 2.1.2.
*
* @param c the character to be tested
* @return TRUE if the character is a digit; FALSE otherwise.
* @see u_charDigitValue
*/
CAPI bool_t U_EXPORT2
u_isdigit(UChar c);
/**
* Determines whether the specified numeric value is actually a defined character
* according to Unicode 2.1.2.
*
* @param c the character to be tested
* @return TRUE if the character has a defined Unicode meaning; FALSE otherwise.
*
* @see u_isdigit
* @see u_isalpha
* @see u_isupper
* @see u_islower
* @see u_istitle
*/
CAPI bool_t U_EXPORT2
u_isdefined(UChar c);
/**
* Determines whether the specified character is a letter
* according to Unicode 2.1.2.
*
* @param c the character to be tested
* @return TRUE if the character is a letter; FALSE otherwise.
*
* @see u_isdigit
*/
CAPI bool_t U_EXPORT2
u_isalpha(UChar c);
/**
* Determines if the specified character is a space character or not.
*
* @param c the character to be tested
* @return TRUE if the character is a space character; FALSE otherwise.
*/
CAPI bool_t U_EXPORT2
u_isspace(UChar c);
/**
* Determines whether the specified character is a control character or not.
*
* @param c the character to be tested
* @return TRUE if the Unicode character is a control character; FALSE otherwise.
*
* @see u_isprint
*/
CAPI bool_t U_EXPORT2
u_iscntrl(UChar c);
/**
* Determines whether the specified character is a printable character according
* to Unicode 2.1.2.
*
* @param c the character to be tested
* @return TRUE if the Unicode character is a printable character; FALSE otherwise.
*
* @see u_iscntrl
*/
CAPI bool_t U_EXPORT2
u_isprint(UChar c);
/**
* Determines whether the specified character is of the base form according
* to Unicode 2.1.2.
*
* @param c the character to be tested
* @return TRUE if the Unicode character is of the base form; FALSE otherwise.
*
* @see u_isalpha
* @see u_isdigit
*/
CAPI bool_t U_EXPORT2
u_isbase(UChar c);
/**
* Returns the linguistic direction property of a character.
* <P>
* For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional
* property.
*
* @param c the character to be tested
* @return the directional property of the character, as a UCharDirection value
* @see UCharDirection
*/
CAPI UCharDirection U_EXPORT2
u_charDirection(UChar c);
/**
* Returns a value indicating the display-cell width of the character
* when used in Asian text, according to the Unicode standard (see p. 6-130
* of The Unicode Standard, Version 2.0). The results for various characters
* are as follows:
* <P>
* ZERO_WIDTH: Characters which are considered to take up no display-cell space:
* control characters
* format characters
* line and paragraph separators
* non-spacing marks
* combining Hangul jungseong
* combining Hangul jongseong
* unassigned Unicode values
* <P>
* HALF_WIDTH: Characters which take up half a cell in standard Asian text:
* all characters in the General Scripts Area except combining Hangul choseong
* and the characters called out specifically above as ZERO_WIDTH
* alphabetic and Arabic presentation forms
* halfwidth CJK punctuation
* halfwidth Katakana
* halfwidth Hangul Jamo
* halfwidth forms, arrows, and shapes
* <P>
* FULL_WIDTH: Characters which take up a full cell in standard Asian text:
* combining Hangul choseong
* all characters in the CJK Phonetics and Symbols Area
* all characters in the CJK Ideographs Area
* all characters in the Hangul Syllables Area
* CJK compatibility ideographs
* CJK compatibility forms
* small form variants
* fullwidth ASCII
* fullwidth punctuation and currency signs
* <P>
* NEUTRAL: Characters whose cell width is context-dependent:
* all characters in the Symbols Area, except those specifically called out above
* all characters in the Surrogates Area
* all characters in the Private Use Area
* <P>
* For Korean text, this algorithm should work properly with properly normalized Korean
* text. Precomposed Hangul syllables and non-combining jamo are all considered full-
* width characters. For combining jamo, we treat choseong (initial consonants)
* as double-width characters and jungseong (vowels) and jongseong (final consonants)
* as non-spacing marks. This will work right in text that uses the precomposed
* choseong characters instead of two choseong characters in a row, and which uses the
* choseong filler character at the beginning of syllables that don't have an initial
* consonant. The results may be slightly off with Korean text following different
* conventions.
*
* @param c the character to be tested
* @return one of ZERO_WIDTH, HALF_WIDTH, FULL_WIDTH or NEUTRAL, returned as a
* uint16_t rather than as the UCellWidth enum type
* @see UCellWidth
*/
CAPI uint16_t U_EXPORT2
u_charCellWidth(UChar c);
/**
* Returns a value indicating a character category according to Unicode
* 2.1.2.
* @param c the character to be tested
* @return the character category, returned as an int8_t.
* @see UCharCategory
*/
CAPI int8_t U_EXPORT2
u_charType(UChar c);
/**
* Retrieves the decimal numeric value of a digit character.
* @param c the digit character for which to get the numeric value
* @return the numeric value of c in decimal radix. This function returns
* -1 if c is not a valid digit character.
*/
CAPI int32_t U_EXPORT2
u_charDigitValue(UChar c);
/**
* Returns the script associated with a character.
* @param ch the character to be tested
* @return the script of the character, as a UCharScript value
* @see UCharScript
*/
CAPI UCharScript U_EXPORT2
u_charScript(UChar ch);
/**
* The following functions are Java-specific.
*/
/**
* A convenience function for determining if a Unicode character
* is allowed to start a Unicode identifier.
* A character may start a Unicode identifier if and only if
* it is a letter.
*
* @param c the Unicode character.
* @return TRUE if the character may start a Unicode identifier;
* FALSE otherwise.
* @see u_isalpha
* @see u_isIDPart
*/
CAPI bool_t U_EXPORT2
u_isIDStart(UChar c);
/**
* A convenience function for determining if a Unicode character
* may be part of a Unicode identifier other than the starting
* character.
* <P>
* A character may be part of a Unicode identifier if and only if
* it is one of the following:
* <ul>
* <li> a letter
* <li> a connecting punctuation character (such as "_").
* <li> a digit
* <li> a numeric letter (such as a Roman numeral character)
* <li> a combining mark
* <li> a non-spacing mark
* <li> an ignorable control character
* </ul>
*
* @param c the Unicode character.
* @return TRUE if the character may be part of a Unicode identifier;
* FALSE otherwise.
* @see u_isIDIgnorable
* @see u_isIDStart
*/
CAPI bool_t U_EXPORT2
u_isIDPart(UChar c);
/**
* A convenience function for determining if a Unicode character
* should be regarded as an ignorable character
* in a Unicode identifier.
* <P>
* The following Unicode characters are ignorable in a
* Unicode identifier:
* <table>
* <tr><td>0x0000 through 0x0008,</td>
* <td>ISO control characters that</td></tr>
* <tr><td>0x000E through 0x001B,</td> <td>are not whitespace</td></tr>
* <tr><td>and 0x007F through 0x009F</td></tr>
* <tr><td>0x200C through 0x200F</td> <td>join controls</td></tr>
* <tr><td>0x200A through 0x200E</td> <td>bidirectional controls</td></tr>
* <tr><td>0x206A through 0x206F</td> <td>format controls</td></tr>
* <tr><td>0xFEFF</td> <td>zero-width no-break space</td></tr>
* </table>
* <P>
* NOTE(review): the "bidirectional controls" row (0x200A through 0x200E)
* overlaps the "join controls" row above it; it looks like a typo for
* 0x202A through 0x202E -- confirm against the implementation.
*
* @param c the Unicode character.
* @return TRUE if the character is ignorable in a Unicode identifier;
* FALSE otherwise.
* @see u_isIDPart
*/
CAPI bool_t U_EXPORT2
u_isIDIgnorable(UChar c);
/**
* A convenience function for determining if a Unicode character
* is allowed as the first character in a Java identifier.
* <P>
* A character may start a Java identifier if and only if
* it is one of the following:
* <ul>
* <li> a letter
* <li> a currency symbol (such as "$")
* <li> a connecting punctuation symbol (such as "_").
* </ul>
*
* @param c the Unicode character.
* @return TRUE if the character may start a Java identifier;
* FALSE otherwise.
* @see u_isJavaIDPart
* @see u_isalpha
* @see u_isIDStart
*/
CAPI bool_t U_EXPORT2
u_isJavaIDStart(UChar c);
/**
* A convenience function for determining if a Unicode character
* may be part of a Java identifier other than the starting
* character.
* <P>
* A character may be part of a Java identifier if and only if
* it is one of the following:
* <ul>
* <li> a letter
* <li> a currency symbol (such as "$")
* <li> a connecting punctuation character (such as "_").
* <li> a digit
* <li> a numeric letter (such as a Roman numeral character)
* <li> a combining mark
* <li> a non-spacing mark
* <li> an ignorable control character
* </ul>
*
* @param c the Unicode character.
* @return TRUE if the character may be part of a Java identifier;
* FALSE otherwise.
* @see u_isIDIgnorable
* @see u_isJavaIDStart
* @see u_isalpha
* @see u_isdigit
* @see u_isIDPart
*/
CAPI bool_t U_EXPORT2
u_isJavaIDPart(UChar c);
/**
* Functions to change character case.
*/
/**
* The given character is mapped to its lowercase equivalent according to
* Unicode 2.1.2; if the character has no lowercase equivalent, the character
* itself is returned.
* <P>
* A character has a lowercase equivalent if and only if a lowercase mapping
* is specified for the character in the Unicode 2.1.2 attribute table.
* <P>
* u_tolower() only deals with the general letter case conversion.
* For language specific case conversion behavior, use ustrToUpper().
* For example, the case conversion for dot-less i and dotted I in Turkish,
* or for final sigma in Greek.
* NOTE(review): the pointer to ustrToUpper() reads as if copied from
* u_toupper(); confirm which string-level function is intended here.
*
* @param c the character to be converted
* @return the lowercase equivalent of the character, if any;
* otherwise the character itself.
* @see u_toupper
* @see u_totitle
*/
CAPI UChar U_EXPORT2
u_tolower(UChar c);
/**
* The given character is mapped to its uppercase equivalent according to Unicode
* 2.1.2; if the character has no uppercase equivalent, the character itself is
* returned.
* <P>
* u_toupper() only deals with the general letter case conversion.
* For language specific case conversion behavior, use ustrToUpper().
* For example, the case conversion for dot-less i and dotted I in Turkish,
* or ess-zed (i.e., "sharp S") in German.
*
* @param c the character to be converted
* @return the uppercase equivalent of the character, if any;
* otherwise the character itself.
* @see u_tolower
* @see u_totitle
*/
CAPI UChar U_EXPORT2
u_toupper(UChar c);
/**
* The given character is mapped to its titlecase equivalent according to Unicode
* 2.1.2. There are only four Unicode characters that are truly titlecase forms
* that are distinct from uppercase forms. As a rule, if a character has no
* true titlecase equivalent, its uppercase equivalent is returned.
* <P>
* A character has a titlecase equivalent if and only if a titlecase mapping
* is specified for the character in the Unicode 2.1.2 data.
*
* @param c the character to be converted
* @return the titlecase equivalent of the character, if any;
* otherwise the character itself.
* @see u_toupper
* @see u_tolower
*/
CAPI UChar U_EXPORT2
u_totitle(UChar c);
/**
* Returns the version of the Unicode standard that is used.
*
* @return the Unicode standard version number, as a C string
* (presumably the UNICODE_VERSION string -- TODO confirm).
* @see UNICODE_VERSION
*/
CAPI const char* U_EXPORT2
u_getVersion(void);
#endif /* UCHAR_H */
/*eof*/