/*
* Copyright © {1997-1999}, International Business Machines Corporation and others. All Rights Reserved.
*****************************************************************************************
*
* File TXTBDAT.H
*
* Modification History:
*
*   Date        Name        Description
*   02/18/97    aliu        Converted from OpenClass.
*                           Made static data members const where appropriate.
*   03/25/97    aliu        Removed subclasses, and merged their static data into this
*                           class.  Instantiated four static instances for character,
*                           word, sentence, and line.  Made forward(), backward(), and
*                           map() methods inline.
*   04/15/97    aliu        Worked around bug in AIX xlC compiler which occurs if static
*                           arrays contain const elements.
*   05/06/97    aliu        Made kSI, kStop, and kSI_Stop into #defines to help out
*                           non-compliant compilers.
*****************************************************************************************
*/

#ifndef TXTBDAT_H
#define TXTBDAT_H

#include "unicode/utypes.h"
// Forward declarations.  This header only uses these types through pointers
// or as the element type of static-member array declarations, so their full
// definitions are not required here.
class WordBreakTable;
class UnicodeClassMapping;
class SpecialMapping;

| /** |
| * This class wraps up the data tables needed for SimpleTextBoundary. |
| * It is statically instantiated for each type of text boundary. This |
| * class is not designed to be subclassed. |
| */ |
| class TextBoundaryData { |
| public: |
| ~TextBoundaryData() {} // Do not subclass |
| |
| // Fast inline accessors |
| const WordBreakTable* forward(void) const; |
| const WordBreakTable* backward(void) const; |
| const UnicodeClassMapping* map(void) const; |
| |
| static const TextBoundaryData kCharacterBreakData; |
| static const TextBoundaryData kWordBreakData; |
| static const TextBoundaryData kLineBreakData; |
| static const TextBoundaryData kSentenceBreakData; |
| |
| typedef uint8_t Node; |
| typedef uint8_t Type; |
| |
| private: |
| static const UChar ASCII_END_OF_TEXT; |
| static const UChar ASCII_HORIZONTAL_TABULATION; |
| static const UChar ASCII_LINEFEED; |
| static const UChar ASCII_VERTICAL_TABULATION; |
| static const UChar ASCII_FORM_FEED; |
| static const UChar ASCII_CARRIAGE_RETURN; |
| static const UChar ASCII_SPACE; |
| static const UChar ASCII_EXCLAMATION_MARK; |
| static const UChar ASCII_QUOTATION_MARK; |
| static const UChar ASCII_NUMBER_SIGN; |
| static const UChar ASCII_DOLLAR_SIGN; |
| static const UChar ASCII_PERCENT; |
| static const UChar ASCII_AMPERSAND; |
| static const UChar ASCII_APOSTROPHE; |
| static const UChar ASCII_COMMA; |
| static const UChar ASCII_FULL_STOP; |
| static const UChar ASCII_COLON; |
| static const UChar ASCII_SEMICOLON; |
| static const UChar ASCII_QUESTION_MARK; |
| static const UChar ASCII_NONBREAKING_SPACE; |
| static const UChar ASCII_CENT_SIGN; |
| static const UChar ASCII_POUND_SIGN; |
| static const UChar ASCII_YEN_SIGN; |
| static const UChar LATIN1_SOFTHYPHEN; |
| static const UChar LATIN1_DEGREE_SIGN; |
| static const UChar ARABIC_PERCENT_SIGN; |
| static const UChar ARABIC_DECIMAL_SEPARATOR; |
| static const UChar HANGUL_CHOSEONG_LOW; |
| static const UChar HANGUL_CHOSEONG_HIGH; |
| static const UChar HANGUL_JUNGSEONG_LOW; |
| static const UChar HANGUL_JUNGSEONG_HIGH; |
| static const UChar HANGUL_JONGSEONG_LOW; |
| static const UChar HANGUL_JONGSEONG_HIGH; |
| static const UChar FIGURE_SPACE; |
| static const UChar NONBREAKING_HYPHEN; |
| static const UChar PUNCTUATION_HYPHENATION_POINT; |
| static const UChar PUNCTUATION_LINE_SEPARATOR; |
| static const UChar PUNCTUATION_PARAGRAPH_SEPARATOR; |
| static const UChar PER_MILLE_SIGN; |
| static const UChar PER_TEN_THOUSAND_SIGN; |
| static const UChar PRIME; |
| static const UChar DOUBLE_PRIME; |
| static const UChar TRIPLE_PRIME; |
| static const UChar DEGREE_CELSIUS; |
| static const UChar DEGREE_FAHRENHEIT; |
| static const UChar PUNCTUATION_IDEOGRAPHIC_COMMA; |
| static const UChar PUNCTUATION_IDEOGRAPHIC_FULL_STOP; |
| static const UChar IDEOGRAPHIC_ITERATION_MARK; |
| static const UChar HIRAGANA_LETTER_SMALL_A; |
| static const UChar HIRAGANA_LETTER_A; |
| static const UChar HIRAGANA_LETTER_SMALL_I; |
| static const UChar HIRAGANA_LETTER_I; |
| static const UChar HIRAGANA_LETTER_SMALL_U; |
| static const UChar HIRAGANA_LETTER_U; |
| static const UChar HIRAGANA_LETTER_SMALL_E; |
| static const UChar HIRAGANA_LETTER_E; |
| static const UChar HIRAGANA_LETTER_SMALL_O; |
| static const UChar HIRAGANA_LETTER_O; |
| static const UChar HIRAGANA_LETTER_DI; |
| static const UChar HIRAGANA_LETTER_SMALL_TU; |
| static const UChar HIRAGANA_LETTER_TU; |
| static const UChar HIRAGANA_LETTER_MO; |
| static const UChar HIRAGANA_LETTER_SMALL_YA; |
| static const UChar HIRAGANA_LETTER_YA; |
| static const UChar HIRAGANA_LETTER_SMALL_YU; |
| static const UChar HIRAGANA_LETTER_YU; |
| static const UChar HIRAGANA_LETTER_SMALL_YO; |
| static const UChar HIRAGANA_LETTER_YO; |
| static const UChar HIRAGANA_LETTER_RO; |
| static const UChar HIRAGANA_LETTER_SMALL_WA; |
| static const UChar HIRAGANA_LETTER_WA; |
| static const UChar HIRAGANA_LETTER_VU; |
| static const UChar COMBINING_KATAKANA_HIRAGANA_VOICED_SOUND_MARK; |
| static const UChar HIRAGANA_SEMIVOICED_SOUND_MARK; |
| static const UChar HIRAGANA_ITERATION_MARK; |
| static const UChar HIRAGANA_VOICED_ITERATION_MARK; |
| static const UChar KATAKANA_LETTER_SMALL_A; |
| static const UChar KATAKANA_LETTER_A; |
| static const UChar KATAKANA_LETTER_SMALL_I; |
| static const UChar KATAKANA_LETTER_I; |
| static const UChar KATAKANA_LETTER_SMALL_U; |
| static const UChar KATAKANA_LETTER_U; |
| static const UChar KATAKANA_LETTER_SMALL_E; |
| static const UChar KATAKANA_LETTER_E; |
| static const UChar KATAKANA_LETTER_SMALL_O; |
| static const UChar KATAKANA_LETTER_O; |
| static const UChar KATAKANA_LETTER_DI; |
| static const UChar KATAKANA_LETTER_SMALL_TU; |
| static const UChar KATAKANA_LETTER_TU; |
| static const UChar KATAKANA_LETTER_MO; |
| static const UChar KATAKANA_LETTER_SMALL_YA; |
| static const UChar KATAKANA_LETTER_YA; |
| static const UChar KATAKANA_LETTER_SMALL_YU; |
| static const UChar KATAKANA_LETTER_YU; |
| static const UChar KATAKANA_LETTER_SMALL_YO; |
| static const UChar KATAKANA_LETTER_YO; |
| static const UChar KATAKANA_LETTER_RO; |
| static const UChar KATAKANA_LETTER_SMALL_WA; |
| static const UChar KATAKANA_LETTER_WA; |
| static const UChar KATAKANA_LETTER_VU; |
| static const UChar KATAKANA_LETTER_SMALL_KA; |
| static const UChar KATAKANA_LETTER_SMALL_KE; |
| static const UChar KATAKANA_LETTER_VA; |
| static const UChar KATAKANA_LETTER_VO; |
| static const UChar KATAKANA_HIRAGANA_PROLONGED_SOUND_MARK; |
| static const UChar KATAKANA_ITERATION_MARK; |
| static const UChar KATAKANA_VOICED_ITERATION_MARK; |
| static const UChar UNICODE_LOW_BOUND_HAN; |
| static const UChar UNICODE_HIGH_BOUND_HAN; |
| static const UChar HANGUL_SYL_LOW; |
| static const UChar HANGUL_SYL_HIGH; |
| static const UChar CJK_COMPATIBILITY_F900; |
| static const UChar CJK_COMPATIBILITY_FA2D; |
| static const UChar UNICODE_ZERO_WIDTH_NON_BREAKING_SPACE; |
| static const UChar FULLWIDTH_EXCLAMATION_MARK; |
| static const UChar FULLWIDTH_FULL_STOP; |
| static const UChar FULLWIDTH_QUESTION_MARK; |
| static const UChar END_OF_STRING; |
| |
| private: |
| // Character data |
| enum CharacterMapping |
| { |
| // These enum values must occur in this order; do not |
| // modify unless you know what you are doing! The forward |
| // and backward data tables are indexed by these enums. |
| kAccent_diacritic = 0, |
| kBaseForm = 1, |
| kBaseCR = 2, |
| kBaseLF = 3, |
| kChoseong = 4, // Korean initial consonant |
| kJungseong = 5, // Korean vowel |
| kJongseong = 6, // Korean final consonant |
| kEOS = 7, |
| kCharacterCol_count = 8 |
| }; |
| |
| static Node kCharacterForwardData[]; |
| static const int32_t kCharacterForwardData_length; |
| static WordBreakTable* kCharacterForward; |
| static Node kCharacterBackwardData[]; |
| static const int32_t kCharacterBackwardData_length; |
| static WordBreakTable* kCharacterBackward; |
| static Type kCharacterRawMapping[]; |
| static const int32_t kCharacterRawMapping_length; |
| static SpecialMapping kCharacterExceptionChar[]; |
| static const int32_t kCharacterExceptionChar_length; |
| static const UBool kCharacterExceptionFlags[]; |
| static UnicodeClassMapping* kCharacterMap; |
| static Type kCharacterAsciiValues[]; |
| |
| private: |
| // Word data |
| enum WordMapping |
| { |
| // These enum values must occur in this order; do not |
| // modify unless you know what you are doing! The forward |
| // and backward data tables are indexed by these enums. |
| kBreak = 0, |
| kLetter = 1, |
| kNumber = 2, |
| kMidLetter = 3, |
| kMidLetNum = 4, |
| kPreNum = 5, |
| kPostNum = 6, |
| kMidNum = 7, |
| kPreMidNum = 8, |
| kBlank = 9, |
| kLF = 10, |
| kKata = 11, |
| kHira = 12, |
| kKanji = 13, |
| kDiacrit = 14, |
| kCR = 15, |
| kNsm = 16, |
| kwEOS = 17, |
| kWordCol_count = 18 |
| }; |
| |
| static Node kWordForwardData[]; |
| static const int32_t kWordForwardData_length; |
| static WordBreakTable* kWordForward; |
| static Node kWordBackwardData[]; |
| static const int32_t kWordBackwardData_length; |
| static WordBreakTable* kWordBackward; |
| static Type kWordRawMapping[]; |
| static const int32_t kWordRawMapping_length; |
| static SpecialMapping kWordExceptionChar[]; |
| static const int32_t kWordExceptionChar_length; |
| static UnicodeClassMapping* kWordMap; |
| static Type kWordAsciiValues[]; |
| static const UBool kWordExceptionFlags[]; |
| |
| private: |
| // Sentence data |
| enum SentenceMapping |
| { |
| // These enum values must occur in this order; do not |
| // modify unless you know what you are doing! The forward |
| // and backward data tables are indexed by these enums. |
| kOther = 0, |
| kSpace = 1, |
| kTerminator = 2, |
| kAmbiguousTerm = 3, |
| kOpenBracket = 4, |
| kCloseBracket = 5, |
| kCJK = 6, |
| kParagraphBreak = 7, |
| kLowerCase = 8, |
| kUpperCase = 9, |
| ksNumber = 10, |
| kQuote = 11, |
| //ksCR, |
| ksNsm = 12, |
| ksEOS = 13, |
| kSentenceCol_count = 14 |
| }; |
| |
| static Node kSentenceForwardData[]; |
| static const int32_t kSentenceForwardData_length; |
| static WordBreakTable* kSentenceForward; |
| static Node kSentenceBackwardData[]; |
| static const int32_t kSentenceBackwardData_length; |
| static WordBreakTable* kSentenceBackward; |
| static Type kSentenceRawMapping[]; |
| static const int32_t kSentenceRawMapping_length; |
| static SpecialMapping kSentenceExceptionChar[]; |
| static const int32_t kSentenceExceptionChar_length; |
| static UnicodeClassMapping* kSentenceMap; |
| static Type kSentenceAsciiValues[]; |
| static const UBool kSentenceExceptionFlags[]; |
| |
| private: |
| // Line data |
| enum LineMapping |
| { |
| // These enum values must occur in this order; do not |
| // modify unless you know what you are doing! The forward |
| // and backward data tables are indexed by these enums. |
| kLineBreak, |
| //always breaks (must be present as first item) |
| kLineBlank, |
| //spaces, tabs, nulls. |
| kLineCR, |
| //carriage return |
| kLineNonBlank, |
| //everything not included elsewhere |
| kLineOp, |
| //hyphens.... |
| kLineJwrd, |
| //hiragana, katakana, and kanji |
| kLinePreJwrd, |
| //characters that bind to the beginning of a Japanese word |
| kLinePostJwrd, |
| //characters that bind to the end of a Japanese word |
| kLineDigit, |
| //digits |
| kLineNumPunct, |
| //punctuation that can appear within a number |
| kLineCurrency, |
| //currency symbols that can precede a number |
| kLineNsm, |
| // non-spacing marks |
| kLineNbsp, |
| // non-breaking characters |
| kLineEOS, |
| kLineCol_count |
| }; |
| |
| static Node kLineForwardData[]; |
| static const int32_t kLineForwardData_length; |
| static WordBreakTable* kLineForward; |
| static Node kLineBackwardData[]; |
| static const int32_t kLineBackwardData_length; |
| static WordBreakTable* kLineBackward; |
| static Type kLineRawMapping[]; |
| static const int32_t kLineRawMapping_length; |
| static SpecialMapping kLineExceptionChar[]; |
| static const int32_t kLineExceptionChar_length; |
| static const UBool kLineExceptionFlags[]; |
| static UnicodeClassMapping* kLineMap; |
| static Type kLineAsciiValues[]; |
| |
| protected: |
| /** |
| * Copy constructor and assignment operator provided to make |
| * compiler happy only. DO NOT CALL. |
| */ |
| TextBoundaryData(const TextBoundaryData&) {} |
| TextBoundaryData& operator=(const TextBoundaryData&) { return *this; } |
| TextBoundaryData() {} // Do not subclass |
| TextBoundaryData(const WordBreakTable* forward, |
| const WordBreakTable* backward, |
| const UnicodeClassMapping* map) |
| : fForward(forward), fBackward(backward), fMap(map) {} |
| |
| private: |
| const WordBreakTable* fForward; |
| const WordBreakTable* fBackward; |
| const UnicodeClassMapping* fMap; |
| }; |
| |
| inline const WordBreakTable* TextBoundaryData::forward() const |
| { |
| return fForward; |
| } |
| |
| inline const WordBreakTable* TextBoundaryData::backward() const |
| { |
| return fBackward; |
| } |
| |
| inline const UnicodeClassMapping* TextBoundaryData::map() const |
| { |
| return fMap; |
| } |
| |
// These used to be static consts in the class, but some compilers didn't like that.
#define kStop       (0)
#define kSI         (0x80)
#define kSI_Stop    (kSI+kStop)

// Shorthand for kSI plus a small offset: kSI_<n> == kSI + <n>.
#define kSI_1       (kSI+1)
#define kSI_2       (kSI+2)
#define kSI_3       (kSI+3)
#define kSI_4       (kSI+4)
#define kSI_5       (kSI+5)
#define kSI_6       (kSI+6)
#define kSI_7       (kSI+7)
#define kSI_8       (kSI+8)
#define kSI_9       (kSI+9)
#define kSI_10      (kSI+10)
#define kSI_11      (kSI+11)
#define kSI_12      (kSI+12)
#define kSI_13      (kSI+13)
#define kSI_14      (kSI+14)

#endif // TXTBDAT_H
//eof