| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ******************************************************************************* |
| * |
| * Copyright (C) 1999-2014 International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| ******************************************************************************* |
| * file name: rbbidata.h |
| * encoding: UTF-8 |
| * tab size: 8 (not used) |
| * indentation:4 |
| * |
| * RBBI data formats Includes |
| * |
| * Structs that describe the format of the Binary RBBI data, |
| * as it is stored in ICU's data file. |
| * |
| * RBBIDataWrapper - Instances of this class sit between the |
| * raw data structs and the RuleBasedBreakIterator objects |
| * that are created by applications. The wrapper class |
| * provides reference counting for the underlying data, |
| * and direct pointers to data that would not otherwise |
| * be accessible without ugly pointer arithmetic. The |
| * wrapper does not attempt to provide any higher level |
| * abstractions for the data itself. |
| * |
| * There will be only one instance of RBBIDataWrapper for any |
| * set of RBBI run time data being shared by instances |
| * (clones) of RuleBasedBreakIterator. |
| */ |
| |
| #ifndef __RBBIDATA_H__ |
| #define __RBBIDATA_H__ |
| |
| #include "unicode/utypes.h" |
| #include "unicode/udata.h" |
| #include "udataswp.h" |
| |
| /** |
| * Swap RBBI data. See udataswp.h. |
| * |
| * Declared ahead of the __cplusplus guard below, so the prototype is visible |
| * to C as well as C++ translation units. |
| * |
| * NOTE(review): the parameter and return conventions (how length is |
| * interpreted, whether outData may be null, what the int32_t result reports, |
| * how failures are signalled through pErrorCode) are not stated in this |
| * header -- confirm them against udataswp.h before relying on them. |
| * @internal |
| */ |
| U_CAPI int32_t U_EXPORT2 |
| ubrk_swap(const UDataSwapper *ds, |
| const void *inData, int32_t length, void *outData, |
| UErrorCode *pErrorCode); |
| |
| #ifdef __cplusplus |
| |
| #include "unicode/ucptrie.h" |
| #include "unicode/uobject.h" |
| #include "unicode/unistr.h" |
| #include "unicode/uversion.h" |
| #include "umutex.h" |
| |
| |
| U_NAMESPACE_BEGIN |
| |
| // The current RBBI data format version, held as four uint8_t components. NOTE(review): presumably the value RBBIDataHeader::fFormatVersion is compared against by RBBIDataWrapper::isDataVersionAcceptable() -- confirm in the implementation file. |
| static const uint8_t RBBI_DATA_FORMAT_VERSION[] = {6, 0, 0, 0}; |
| |
| /* |
| * The following structs map exactly onto the raw data from the ICU common data file. |
| */ |
| struct RBBIDataHeader { /* Header of a block of binary RBBI data; the section offsets below are relative to the start of this struct. */ |
| uint32_t fMagic; /* == 0xb1a0 (hex digits b-1-a-0) */ |
| UVersionInfo fFormatVersion; /* Data Format. Same as the value in struct UDataInfo */ |
| /* if there is one associated with this data. */ |
| /* (version originates in rbbi, is copied to UDataInfo) */ |
| uint32_t fLength; /* Total length in bytes of this RBBI Data, */ |
| /* including all sections, not just the header. */ |
| uint32_t fCatCount; /* Number of character categories. */ |
| |
| /* */ |
| /* Offsets and sizes of each of the subsections within the RBBI data. */ |
| /* All offsets are bytes from the start of the RBBIDataHeader. */ |
| /* All sizes are in bytes. */ |
| /* */ |
| uint32_t fFTable; /* Offset to the forward state transition table. */ |
| uint32_t fFTableLen; /* Size of the forward state transition table. */ |
| uint32_t fRTable; /* Offset to the reverse state transition table. */ |
| uint32_t fRTableLen; /* Size of the reverse state transition table. */ |
| uint32_t fTrie; /* Offset to Trie data for character categories */ |
| uint32_t fTrieLen; /* Size of the Trie data. */ |
| uint32_t fRuleSource; /* Offset to the source for the break */ |
| uint32_t fRuleSourceLen; /* rules. Stored UChar *. NOTE(review): RBBIDataWrapper::fRuleSource is declared const char * -- confirm the stored encoding. */ |
| uint32_t fStatusTable; /* Offset to the table of rule status values */ |
| uint32_t fStatusTableLen; /* Size of the table of rule status values. */ |
| |
| uint32_t fReserved[6]; /* Reserved for expansion */ |
| |
| }; |
| |
| |
| |
| template <typename T> /* T is the integer type of every cell in the row; instantiated below for uint8_t and uint16_t. */ |
| struct RBBIStateTableRowT { /* One row of a state table: three fixed cells followed by the per-category next-state cells. */ |
| T fAccepting; // Non-zero if this row is for an accepting state. |
| // Value 0: not an accepting state. |
| // 1: (ACCEPTING_UNCONDITIONAL) Unconditional Accepting state. |
| // >1: Look-ahead match has completed. |
| // Actual boundary position happened earlier. |
| // Value here == fLookAhead in earlier |
| // state, at actual boundary pos. |
| T fLookAhead; // Non-zero if this row is for a state that |
| // corresponds to a '/' in the rule source. |
| // Value is the same as the fAccepting |
| // value for the rule (which will appear |
| // in a different state). |
| T fTagsIdx; // Non-zero if this row covers a {tagged} position |
| // from a rule. Value is the index in the |
| // StatusTable of the set of matching |
| // tags (rule status values). |
| T fNextState[1]; // Next State, indexed by char category. |
| // Variable-length array declared with length 1 |
| // to disable bounds checkers. |
| // Array Size is actually fData->fHeader->fCatCount |
| // (i.e. RBBIDataHeader::fCatCount). |
| // CAUTION: see RBBITableBuilder::getTableSize() |
| // before changing anything here. |
| }; |
| |
| typedef RBBIStateTableRowT<uint8_t> RBBIStateTableRow8; /* Row layout whose cells are uint8_t. */ |
| typedef RBBIStateTableRowT<uint16_t> RBBIStateTableRow16; /* Row layout whose cells are uint16_t. */ |
| |
| constexpr uint16_t ACCEPTING_UNCONDITIONAL = 1; // Value constant for RBBIStateTableRow::fAccepting: marks an unconditional accepting state (see the fAccepting comment in RBBIStateTableRowT). |
| |
| union RBBIStateTableRow { /* Lets one row be read through either cell-width layout. NOTE(review): presumably RBBI_8BITS_ROWS in RBBIStateTable::fFlags decides which member is valid -- confirm at the use sites. */ |
| RBBIStateTableRow16 r16; /* The row viewed with uint16_t cells. */ |
| RBBIStateTableRow8 r8; /* The row viewed with uint8_t cells. */ |
| }; |
| |
| struct RBBIStateTable { /* Fixed fields describing one state transition table, followed by the row data itself. */ |
| uint32_t fNumStates; // Number of states. |
| uint32_t fRowLen; // Length of a state table row, in bytes. |
| uint32_t fDictCategoriesStart; // Char category number of the first dictionary |
| // char class, or the largest category number + 1 |
| // if there are no dictionary categories. |
| uint32_t fLookAheadResultsSize; // Size of run-time array required for holding |
| // look-ahead results. Indexed by row.fLookAhead. |
| uint32_t fFlags; // Option Flags for this state table. |
| char fTableData[1]; // First RBBIStateTableRow begins here. |
| // Variable-length array declared with length 1 |
| // to disable bounds checkers. |
| // (making it char[] simplifies ugly address |
| // arithmetic for indexing variable length rows.) |
| // NOTE(review): row n presumably starts at |
| // fTableData + n * fRowLen -- confirm at the use sites. |
| }; |
| |
| constexpr uint32_t RBBI_LOOKAHEAD_HARD_BREAK = 1; /* Distinct single-bit values (1, 2, 4). NOTE(review): presumably the option bits stored in RBBIStateTable::fFlags -- confirm at the use sites. */ |
| constexpr uint32_t RBBI_BOF_REQUIRED = 2; /* NOTE(review): meaning not stated in this header; name suggests a beginning-of-input requirement -- confirm. */ |
| constexpr uint32_t RBBI_8BITS_ROWS = 4; /* NOTE(review): presumably set when the table rows use the RBBIStateTableRow8 layout -- confirm. */ |
| |
| |
| /* */ |
| /* The reference counting wrapper class: keeps a reference count for one set of RBBI data and exposes direct pointers to the items inside it. */ |
| /* */ |
| class RBBIDataWrapper : public UMemory { |
| public: |
| enum EDontAdopt { /* Tag type that selects the three-argument constructor below. */ |
| kDontAdopt /* NOTE(review): presumably means the wrapper must not free the data it is given (cf. fDontFreeData) -- confirm in the implementation. */ |
| }; |
| RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status); /* Construct from a raw header pointer; errors are reported through status. */ |
| RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt dontAdopt, UErrorCode &status); /* As above, with the EDontAdopt tag. */ |
| RBBIDataWrapper(UDataMemory* udm, UErrorCode &status); /* Construct from a UDataMemory. NOTE(review): whether udm is adopted is not visible here -- confirm. */ |
| ~RBBIDataWrapper(); |
| |
| static UBool isDataVersionAcceptable(const UVersionInfo version); /* NOTE(review): presumably checks version against RBBI_DATA_FORMAT_VERSION -- confirm. */ |
| |
| void init0(); /* NOTE(review): init0()/init() look like shared constructor helpers, yet they are public -- confirm intended callers. */ |
| void init(const RBBIDataHeader *data, UErrorCode &status); |
| RBBIDataWrapper *addReference(); /* Reference counting entry points; the count itself is the private fRefCount. */ |
| void removeReference(); /* NOTE(review): presumably destroys the wrapper when the last reference is dropped -- confirm. */ |
| UBool operator ==(const RBBIDataWrapper &other) const; |
| int32_t hashCode(); /* NOTE(review): not declared const, unlike operator== and getRuleSourceString(). */ |
| const UnicodeString &getRuleSourceString() const; /* Returns a reference, so the result is only valid while the referenced string lives. NOTE(review): presumably refers to fRuleString -- confirm. */ |
| void printData(); /* NOTE(review): printData()/printTable() are presumably debugging aids -- confirm. */ |
| void printTable(const char *heading, const RBBIStateTable *table); |
| |
| /* */ |
| /* Pointers to items within the data */ |
| /* */ |
| const RBBIDataHeader *fHeader; |
| const RBBIStateTable *fForwardTable; |
| const RBBIStateTable *fReverseTable; |
| const char *fRuleSource; /* NOTE(review): declared const char *, while the RBBIDataHeader::fRuleSource comment says "Stored UChar *" -- confirm the encoding. */ |
| const int32_t *fRuleStatusTable; |
| |
| /* number of int32_t values in the rule status table. Used to sanity check indexing */ |
| int32_t fStatusMaxIdx; |
| |
| UCPTrie *fTrie; /* Non-const, unlike the pointers above. NOTE(review): ownership is not visible in this header -- confirm. */ |
| |
| private: |
| u_atomic_int32_t fRefCount; /* The reference count, held in an atomic integer type. */ |
| UDataMemory *fUDataMem; |
| UnicodeString fRuleString; |
| UBool fDontFreeData; /* NOTE(review): presumably set by the kDontAdopt constructor -- confirm. */ |
| |
| RBBIDataWrapper(const RBBIDataWrapper &other) = delete; /* forbid copying of this class */ |
| RBBIDataWrapper &operator=(const RBBIDataWrapper &other) = delete; /* forbid copying of this class */ |
| }; |
| |
| |
| |
| U_NAMESPACE_END |
| |
| U_CFUNC UBool rbbi_cleanup(void); /* Declared after U_NAMESPACE_END but still inside the __cplusplus guard. NOTE(review): what it cleans up and what the UBool result means are not stated in this header -- confirm in the implementation. */ |
| |
| #endif /* C++ */ |
| |
| #endif |