/*
*****************************************************************************************
* Copyright (C) 1997-1999, International Business Machines
* Corporation and others. All Rights Reserved.
*****************************************************************************************
*
* File WDBKDAT.CPP
*
* Modification History:
*
* Date Name Description
* 02/18/97 aliu Converted from OpenClass.
* Recoded kRawMapping table for Unicode::getType() type codes.
* Made static data members const where appropriate.
* 03/25/97 aliu Moved into TextBoundaryData; no longer a subclass.
* 04/15/97 aliu Worked around bug in AIX xlC compiler which occurs if static
* arrays contain const elements.
* 05/06/97 aliu Made SpecialMapping an array of objects instead of pointers,
* to help out non-compliant compilers.
* 08/14/98 helena Sync-up JDK1.2.
*****************************************************************************************
*/
// *****************************************************************************
// This file was generated from the java source file WordBreakData.java
// *****************************************************************************
#include "txtbdat.h"
#include "wdbktbl.h"
#include "unicdcm.h"
// *****************************************************************************
// class WordBreakData
// This class contains the following transition state data for word break.
// For more detailed explanation on the boundary break state machine, please
// see the internal documentation of wdbktbl.cpp.
// *****************************************************************************
// The forward transition states of word boundary data.
//
// Layout: a flat, row-major table. Each "// N" comment below opens the row
// for state N (0 through 14) and every row holds 18 entries, one per
// character-class column in the order given by the four-line legend that
// follows. The array is passed, together with kWordCol_count and
// kWordForwardData_length, to the WordBreakTable constructor below; see
// wdbktbl.cpp for how individual entries are interpreted.
//
// In the nsm column most rows store their own row number as a bare integer
// (presumably "stay in this state" -- confirm against wdbktbl.cpp).
//
// NOTE(review): the array is deliberately not const, presumably because of
// the AIX xlC workaround recorded in the modification history above.
TextBoundaryData::Node TextBoundaryData::kWordForwardData[] = {
// Column legend (18 columns, read left to right, top to bottom):
// brk let num mLe mLN
// prN poN mNu pMN blk
// lf kat hir kan dia
// cr nsm EOS
// 0
kStop, kStop, kStop, kStop, kStop,
kStop, kStop, kStop, kStop, kStop,
kStop, kStop, kStop, kStop, kStop,
kStop, kStop, kStop,
// 1
kSI_14, kSI_2, kSI_3, kSI_14, kSI_14,
kSI_5, kSI_14, kSI_14, kSI_5, kSI_6,
kSI_4, kSI_10, kSI_11, kSI_12, kSI_9,
kSI_13, 1, kSI_Stop,
// 2
kSI_Stop, kSI_2, kSI_3, kSI_7, kSI_7,
kSI_Stop, kSI_Stop, kSI_Stop, kSI_7, kSI_Stop,
kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop,
kSI_Stop, 2, kSI_Stop,
// 3
kSI_Stop, kSI_2, kSI_3, kSI_Stop, kSI_8,
kSI_Stop, kSI_14, kSI_8, kSI_8, kSI_Stop,
kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop,
kSI_Stop, 3, kSI_Stop,
// 4
kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop,
kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop,
kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop,
kSI_Stop, kSI_Stop, kSI_Stop,
// 5
kSI_Stop, kSI_Stop, kSI_3, kSI_Stop, kSI_Stop,
kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop,
kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop,
kSI_Stop, 5, kSI_Stop,
// 6
kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop,
kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop, kSI_6,
kSI_4, kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop,
kSI_13, 6, kSI_Stop,
// 7
kStop, kSI_2, kStop, kStop, kStop,
kStop, kStop, kStop, kStop, kStop,
kStop, kStop, kStop, kStop, kStop,
kStop, 7, kStop,
// 8
kStop, kStop, kSI_3, kStop, kStop,
kStop, kStop, kStop, kStop, kStop,
kStop, kStop, kStop, kStop, kStop,
kStop, 8, kStop,
// 9
kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop,
kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop,
kSI_Stop, kSI_10, kSI_11, kSI_Stop, kSI_9,
kSI_Stop, 9, kSI_Stop,
// 10
kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop,
kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop,
kSI_Stop, kSI_10, kSI_Stop, kSI_Stop, kSI_10,
kSI_Stop, 10, kSI_Stop,
// 11
kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop,
kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop,
kSI_Stop, kSI_Stop, kSI_11, kSI_Stop, kSI_11,
kSI_Stop, 11, kSI_Stop,
// 12
kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop,
kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop,
kSI_Stop, kSI_Stop, kSI_Stop, kSI_12, kSI_Stop,
kSI_Stop, 12, kSI_Stop,
// 13
kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop,
kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop,
kSI_4, kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop,
kSI_Stop, kSI_Stop, kSI_Stop,
// 14
kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop,
kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop,
kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop, kSI_Stop,
kSI_Stop, 14, kSI_Stop
};
const int32_t TextBoundaryData::kWordForwardData_length =
sizeof(TextBoundaryData::kWordForwardData) / sizeof(TextBoundaryData::kWordForwardData[0]);
// Forward word-break table object, created during static initialization from
// the column count, the flat forward data array and its entry count.
WordBreakTable* TextBoundaryData::kWordForward =
    new WordBreakTable(kWordCol_count,
                       kWordForwardData,
                       kWordForwardData_length);
// The backward transition states of word boundary data.
//
// Layout: a flat, row-major table. Each "// N" comment below opens the row
// for state N (0 through 12) and every row holds 18 entries, one per
// character-class column in the order given by the four-line legend that
// follows. The array is passed, together with kWordCol_count and
// kWordBackwardData_length, to the WordBreakTable constructor below; see
// wdbktbl.cpp for how individual entries are interpreted.
//
// Besides the kStop / kSI_* constants, some entries are bare state numbers
// (for example the nsm column of rows 1 through 12 holds the row's own
// number).
//
// NOTE(review): row 3 contains a single kSI_Stop (prN column) among
// otherwise kStop entries -- confirm against WordBreakData.java that this is
// intended.
TextBoundaryData::Node TextBoundaryData::kWordBackwardData[] = {
// Column legend (18 columns, read left to right, top to bottom):
// brk let num mLe mLN
// prN poN mNu pMN blk
// lf kat hir kan dia
// cr nsm EOS
// 0
kStop, kStop, kStop, kStop, kStop,
kStop, kStop, kStop, kStop, kStop,
kStop, kStop, kStop, kStop, kStop,
kStop, kStop, kStop,
// 1
kSI_6, kSI_2, kSI_3, kSI_4, kSI_5,
kSI_6, kSI_7, kSI_7, kSI_5, kSI_8,
kSI_8, kSI_9, kSI_10, kSI_12, kSI_11,
kSI_8, 1, kStop,
// 2
kStop, kSI_2, kSI_3, 4, 4,
kStop, kStop, kStop, 4, kStop,
kStop, kStop, kStop, kStop, kStop,
kStop, 2, kStop,
// 3
kStop, kSI_2, kSI_3, kStop, 7,
kSI_Stop, kStop, 7, kSI_7, kStop,
kStop, kStop, kStop, kStop, kStop,
kStop, 3, kStop,
// 4
kStop, kSI_2, kStop, kStop, kStop,
kStop, kStop, kStop, kStop, kStop,
kStop, kStop, kStop, kStop, kStop,
kStop, 4, kStop,
// 5
kStop, kSI_2, kSI_3, kStop, kStop,
kStop, kStop, kStop, kStop, kStop,
kStop, kStop, kStop, kStop, kStop,
kStop, 5, kStop,
// 6
kStop, kStop, kStop, kStop, kStop,
kStop, kStop, kStop, kStop, kStop,
kStop, kStop, kStop, kStop, kStop,
kStop, 6, kStop,
// 7
kStop, kStop, kSI_3, kStop, kStop,
kStop, kStop, kStop, kStop, kStop,
kStop, kStop, kStop, kStop, kStop,
kStop, 7, kStop,
// 8
kStop, kStop, kStop, kStop, kStop,
kStop, kStop, kStop, kStop, kSI_8,
kSI_8, kStop, kStop, kStop, kStop,
kSI_8, 8, kStop,
// 9
kStop, kStop, kStop, kStop, kStop,
kStop, kStop, kStop, kStop, kStop,
kStop, kSI_9, kStop, kStop, 9,
kStop, 9, kStop,
// 10
kStop, kStop, kStop, kStop, kStop,
kStop, kStop, kStop, kStop, kStop,
kStop, kStop, kSI_10, kStop, 10,
kStop, 10, kStop,
// 11
kStop, kStop, kStop, kStop, kStop,
kStop, kStop, kStop, kStop, kStop,
kStop, kSI_9, kSI_10, kStop, kSI_11,
kStop, 11, kStop,
// 12
kStop, kStop, kStop, kStop, kStop,
kStop, kStop, kStop, kStop, kStop,
kStop, kStop, kStop, kSI_12, kStop,
kStop, 12, kStop
};
const int32_t TextBoundaryData::kWordBackwardData_length =
sizeof(TextBoundaryData::kWordBackwardData) / sizeof(TextBoundaryData::kWordBackwardData[0]);
// Backward word-break table object, created during static initialization from
// the column count, the flat backward data array and its entry count.
WordBreakTable* TextBoundaryData::kWordBackward =
    new WordBreakTable(kWordCol_count,
                       kWordBackwardData,
                       kWordBackwardData_length);
// The word type mapping of the break table.
//
// One entry per Unicode general-category code (the Unicode::getType() type
// codes mentioned in the modification history), 0 through 28. The entry at
// index N is the word-break character class assigned to category N; the
// trailing comment on each line names the category and its index, so the
// order of the entries must not change.
TextBoundaryData::Type TextBoundaryData::kWordRawMapping[] = {
// Re-coded to match Unicode 2 types [LIU]
kBreak, // UNASSIGNED = 0,
kLetter, // UPPERCASE_LETTER = 1,
kLetter, // LOWERCASE_LETTER = 2,
kLetter, // TITLECASE_LETTER = 3,
kLetter, // MODIFIER_LETTER = 4,
kLetter, // OTHER_LETTER = 5,
kNsm, // NON_SPACING_MARK = 6,
kNsm, // ENCLOSING_MARK = 7,
kBreak, // COMBINING_SPACING_MARK = 8,
kNumber, // DECIMAL_DIGIT_NUMBER = 9,
kLetter, // LETTER_NUMBER = 10,
kNumber, // OTHER_NUMBER = 11,
kBlank, // SPACE_SEPARATOR = 12,
kBreak, // LINE_SEPARATOR = 13,
kBreak, // PARAGRAPH_SEPARATOR = 14,
kBreak, // CONTROL = 15,
kBreak, // FORMAT = 16,
kBreak, // PRIVATE_USE = 17,
kBreak, // SURROGATE = 18,
kMidLetter, // DASH_PUNCTUATION = 19,
kBreak, // START_PUNCTUATION = 20,
kBreak, // END_PUNCTUATION = 21,
kBreak, // CONNECTOR_PUNCTUATION = 22,
kBreak, // OTHER_PUNCTUATION = 23,
kBreak, // MATH_SYMBOL = 24,
kPreNum, // CURRENCY_SYMBOL = 25,
kBreak, // MODIFIER_SYMBOL = 26,
kBreak, // OTHER_SYMBOL = 27,
kBreak // UNDEFINED = 28
};
const int32_t TextBoundaryData::kWordRawMapping_length =
sizeof(TextBoundaryData::kWordRawMapping) / sizeof(TextBoundaryData::kWordRawMapping[0]);
// The exceptions of the word break data.
//
// Each SpecialMapping assigns a word-break character class to one character
// (two-argument form) or to a pair of bounding characters (three-argument
// form, presumably an inclusive first/last range -- confirm against the
// SpecialMapping declaration). The table is handed to the
// UnicodeClassMapping constructor together with kWordExceptionChar_length.
SpecialMapping TextBoundaryData::kWordExceptionChar[] = {
//note: the ranges in this table must be sorted in ascending order
//as required by the UnicodeClassMapping class.
// Control characters and ASCII punctuation.
SpecialMapping(TextBoundaryData::ASCII_HORIZONTAL_TABULATION, TextBoundaryData::kBlank),
SpecialMapping(TextBoundaryData::ASCII_LINEFEED, TextBoundaryData::kLF),
SpecialMapping(TextBoundaryData::ASCII_FORM_FEED, TextBoundaryData::kLF),
SpecialMapping(TextBoundaryData::ASCII_CARRIAGE_RETURN, TextBoundaryData::kCR),
SpecialMapping(TextBoundaryData::ASCII_QUOTATION_MARK, TextBoundaryData::kMidLetNum),
SpecialMapping(TextBoundaryData::ASCII_NUMBER_SIGN, TextBoundaryData::kPreNum),
SpecialMapping(TextBoundaryData::ASCII_PERCENT, TextBoundaryData::kPostNum),
SpecialMapping(TextBoundaryData::ASCII_AMPERSAND, TextBoundaryData::kPostNum),
SpecialMapping(TextBoundaryData::ASCII_APOSTROPHE, TextBoundaryData::kMidLetNum),
SpecialMapping(TextBoundaryData::ASCII_COMMA, TextBoundaryData::kMidNum),
SpecialMapping(TextBoundaryData::ASCII_FULL_STOP, TextBoundaryData::kPreMidNum),
SpecialMapping(TextBoundaryData::ASCII_CENT_SIGN, TextBoundaryData::kPostNum),
SpecialMapping(TextBoundaryData::LATIN1_SOFTHYPHEN, TextBoundaryData::kMidLetter),
// Arabic and general punctuation.
SpecialMapping(TextBoundaryData::ARABIC_PERCENT_SIGN, TextBoundaryData::kPostNum),
SpecialMapping(TextBoundaryData::ARABIC_DECIMAL_SEPARATOR, TextBoundaryData::kMidNum),
SpecialMapping(TextBoundaryData::PUNCTUATION_HYPHENATION_POINT, TextBoundaryData::kMidLetter),
SpecialMapping(TextBoundaryData::PUNCTUATION_LINE_SEPARATOR,
TextBoundaryData::PUNCTUATION_PARAGRAPH_SEPARATOR, TextBoundaryData::kLF),
SpecialMapping(TextBoundaryData::PER_MILLE_SIGN, TextBoundaryData::kPostNum),
SpecialMapping(TextBoundaryData::PER_TEN_THOUSAND_SIGN, TextBoundaryData::kPostNum),
// CJK: iteration mark, kana, Han ideographs, Hangul syllables.
SpecialMapping(TextBoundaryData::IDEOGRAPHIC_ITERATION_MARK, TextBoundaryData::kKanji),
SpecialMapping(TextBoundaryData::HIRAGANA_LETTER_SMALL_A,
TextBoundaryData::HIRAGANA_LETTER_VU, TextBoundaryData::kHira),
SpecialMapping(TextBoundaryData::COMBINING_KATAKANA_HIRAGANA_VOICED_SOUND_MARK,
TextBoundaryData::HIRAGANA_SEMIVOICED_SOUND_MARK, TextBoundaryData::kDiacrit),
SpecialMapping(TextBoundaryData::KATAKANA_LETTER_SMALL_A,
TextBoundaryData::KATAKANA_LETTER_SMALL_KE, TextBoundaryData::kKata),
SpecialMapping(TextBoundaryData::UNICODE_LOW_BOUND_HAN,
TextBoundaryData::UNICODE_HIGH_BOUND_HAN, TextBoundaryData::kKanji),
SpecialMapping(TextBoundaryData::HANGUL_SYL_LOW,
TextBoundaryData::HANGUL_SYL_HIGH, TextBoundaryData::kLetter),
SpecialMapping(TextBoundaryData::CJK_COMPATIBILITY_F900,
TextBoundaryData::CJK_COMPATIBILITY_FA2D, TextBoundaryData::kKanji),
// End-of-string sentinel gets its own class.
SpecialMapping(TextBoundaryData::END_OF_STRING, TextBoundaryData::kwEOS)
};
// One boolean per Unicode general-category code, 0 through 28, in the same
// index order as kWordRawMapping. The array is passed to the
// UnicodeClassMapping constructor alongside kWordExceptionChar.
// NOTE(review): presumably TRUE marks the categories that contain at least
// one character listed in kWordExceptionChar, so that the exception table is
// only searched for those categories -- confirm in unicdcm.cpp. If so, also
// confirm how END_OF_STRING is resolved, since index 0 is FALSE.
const UBool TextBoundaryData::kWordExceptionFlags[] = {
FALSE, // kNonCharacter = 0,
FALSE, // kUppercaseLetter = 1,
FALSE, // kLowercaseLetter = 2,
FALSE, // kTitlecaseLetter = 3,
TRUE, // kModifierLetter = 4,
TRUE, // kOtherLetter = 5,
TRUE, // kNonSpacingMark = 6,
FALSE, // kEnclosingMark = 7,
FALSE, // kCombiningSpacingMark = 8,
FALSE, // kDecimalNumber = 9,
FALSE, // kLetterNumber = 10,
FALSE, // kOtherNumber = 11,
FALSE, // kSpaceSeparator = 12,
TRUE, // kLineSeparator = 13,
TRUE, // kParagraphSeparator = 14,
TRUE, // kControlCharacter = 15,
FALSE, // kFormatCharacter = 16,
FALSE, // kPrivateUseCharacter = 17,
FALSE, // kSurrogate = 18,
TRUE, // kDashPunctuation = 19,
FALSE, // kOpenPunctuation = 20,
FALSE, // kClosePunctuation = 21,
FALSE, // kConnectorPunctuation = 22,
TRUE, // kOtherPunctuation = 23,
FALSE, // kMathSymbol = 24,
TRUE, // kCurrencySymbol = 25,
FALSE, // kModifierSymbol = 26,
FALSE, // kOtherSymbol = 27,
FALSE // UNDEFINED = 28
};
const int32_t TextBoundaryData::kWordExceptionChar_length =
sizeof(TextBoundaryData::kWordExceptionChar) / sizeof(TextBoundaryData::kWordExceptionChar[0]);
// Direct lookup table of word-break character classes for the code points
// 0x00 through 0xFF (256 entries, 8 per row). Despite the name it covers all
// of Latin-1, not just ASCII. The comment above each row names the eight
// characters of that row. The array is passed to the UnicodeClassMapping
// constructor below.
//
// Every entry is expected to equal what kWordRawMapping gives for the
// character's general category, overridden by kWordExceptionChar where the
// character is listed there; keep the three tables in agreement when editing.
TextBoundaryData::Type TextBoundaryData::kWordAsciiValues[] = {
// null soh stx etx eot enq ack bell
kBreak, kBreak, kBreak, kBreak, kBreak, kBreak, kBreak, kBreak,
// bs ht lf vt ff cr so si
kBreak, kBlank, kLF, kBreak, kLF, kCR, kBreak, kBreak,
// dle dc1 dc2 dc3 dc4 nak syn etb
kBreak, kBreak, kBreak, kBreak, kBreak, kBreak, kBreak, kBreak,
// can em sub esc fs gs rs us
kBreak, kBreak, kBreak, kBreak, kBreak, kBreak, kBreak, kBreak,
// sp ! " # $ % & '
kBlank, kBreak, kMidLetNum, kPreNum, kPreNum, kPostNum, kPostNum, kMidLetNum,
// ( ) * + , - . /
kBreak, kBreak, kBreak, kBreak, kMidNum, kMidLetter, kPreMidNum, kBreak,
// 0 1 2 3 4 5 6 7
kNumber, kNumber, kNumber, kNumber, kNumber, kNumber, kNumber, kNumber,
// 8 9 : ; < = > ?
kNumber, kNumber, kBreak, kBreak, kBreak, kBreak, kBreak, kBreak,
// @ A B C D E F G
kBreak, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter,
// H I J K L M N O
kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter,
// P Q R S T U V W
kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter,
// X Y Z [ \ ] ^ _
kLetter, kLetter, kLetter, kBreak, kBreak, kBreak, kBreak, kBreak,
// ` a b c d e f g
kBreak, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter,
// h i j k l m n o
kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter,
// p q r s t u v w
kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter,
// x y z { | } ~ del
kLetter, kLetter, kLetter, kBreak, kBreak, kBreak, kBreak, kBreak,
// ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
kBreak, kBreak, kBreak, kBreak, kBreak, kBreak, kBreak, kBreak,
// ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
kBreak, kBreak, kBreak, kBreak, kBreak, kBreak, kBreak, kBreak,
// ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
kBreak, kBreak, kBreak, kBreak, kBreak, kBreak, kBreak, kBreak,
// ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
kBreak, kBreak, kBreak, kBreak, kBreak, kBreak, kBreak, kBreak,
// nbsp inv-! cents pounds currency yen broken-bar section
kBlank, kBreak, kPostNum, kPreNum, kPreNum, kPreNum, kBreak, kBreak,
// umlaut copyright super-a guil-left not soft-hyph registered macron
kBreak, kBreak, kLetter, kBreak, kBreak, kMidLetter, kBreak, kBreak,
// degree +/- super-2 super-3 acute micro paragraph bullet
kBreak, kBreak, kNumber, kNumber, kBreak, kLetter, kBreak, kBreak,
// cedilla super-1 super-o guil-right 1/4 1/2 3/4 inv-?
// super-1 (0xB9) is a number like super-2/super-3 above, and super-o (0xBA)
// is a letter like super-a above, matching what kWordRawMapping yields for
// their general categories (OTHER_NUMBER and LOWERCASE_LETTER).
kBreak, kNumber, kLetter, kBreak, kNumber, kNumber, kNumber, kBreak,
// A-grave A-acute A-hat A-tilde A-umlaut A-ring AE C-cedilla
kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter,
// E-grave E-acute E-hat E-umlaut I-grave I-acute I-hat I-umlaut
kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter,
// Edh N-tilde O-grave O-acute O-hat O-tilde O-umlaut times
kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kBreak,
// O-slash U-grave U-acute U-hat U-umlaut Y-acute Thorn ess-zed
kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter,
// a-grave a-acute a-hat a-tilde a-umlaut a-ring ae c-cedilla
kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter,
// e-grave e-acute e-hat e-umlaut i-grave i-acute i-hat i-umlaut
kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter,
// edh n-tilde o-grave o-acute o-hat o-tilde o-umlaut over
kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kBreak,
// o-slash u-grave u-acute u-hat u-umlaut y-acute thorn y-umlaut
kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter, kLetter
};
// Character-to-class mapping for word breaking, created during static
// initialization from the four tables defined in this file: the per-category
// mapping, the exception list (with its per-category flags) and the
// 0x00-0xFF direct lookup table.
UnicodeClassMapping* TextBoundaryData::kWordMap = new UnicodeClassMapping(
    kWordRawMapping, kWordRawMapping_length,
    kWordExceptionChar, kWordExceptionChar_length,
    kWordExceptionFlags,
    kWordAsciiValues);
/**
 * The one shared TextBoundaryData instance that holds the word break
 * data: the forward table, the backward table and the character class map.
 *
 * This definition must stay after kWordForward, kWordBackward and kWordMap:
 * within one translation unit they are initialized in definition order, so
 * all three pointers are already set when this constructor runs.
 */
const TextBoundaryData TextBoundaryData::kWordBreakData(kWordForward,
                                                        kWordBackward,
                                                        kWordMap);
//eof