| /* |
| * Copyright © {1999}, International Business Machines Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| * Date Name Description |
| * 12/15/99 rgillam Port from Java. |
| ********************************************************************** |
| */ |
| |
| #ifndef RBBI_BLD_H |
| #define RBBI_BLD_H |
| |
| #include "rbbi.h" |
| #include "rbbi_tbl.h" |
| #include "unicode/uniset.h" |
| #include "uvector.h" |
| |
| class ExpressionList; |
| |
| //======================================================================= |
| // RuleBasedBreakIterator.Builder |
| //======================================================================= |
| /** |
| * The Builder class has the job of constructing a RuleBasedBreakIterator from a |
| * textual description. A Builder is constructed by RuleBasedBreakIterator's |
| * constructor, which uses it to construct the iterator itself and then throws it |
| * away. |
| * <p>The construction logic is separated out into its own class for two primary |
| * reasons: |
| * <ul><li>The construction logic is quite complicated and large. Separating it |
| * out into its own class means the code must only be loaded into memory while a |
| * RuleBasedBreakIterator is being constructed, and can be purged after that. |
| * <li>There is a fair amount of state that must be maintained throughout the |
| * construction process that is not needed by the iterator after construction. |
| * Separating this state out into another class prevents all of the functions that |
| * construct the iterator from having to have really long parameter lists, |
| * (hopefully) contributing to readability and maintainability.</ul> |
| * <p>It'd be really nice if this could be an independent class rather than an |
| * inner class, because that would shorten the source file considerably, but |
| * making Builder an inner class of RuleBasedBreakIterator allows it direct access |
| * to RuleBasedBreakIterator's private members, which saves us from having to |
| * provide some kind of "back door" to the Builder class that could then also be |
| * used by other classes. |
| */ |
| class RuleBasedBreakIteratorBuilder { |
| |
| protected: |
| /** |
| * The iterator we're constructing. |
| */ |
| RuleBasedBreakIterator& iterator; |
| |
| /** |
| * The tables object for the iterator we're constructing. |
| */ |
| RuleBasedBreakIteratorTables* tables; |
| |
| /** |
| * A temporary place to hold the rules as they're being processed. |
| */ |
| UVector tempRuleList; |
| |
| /** |
| * A temporary holding place used for calculating the character categories. |
| * This object contains UnicodeSet objects. |
| */ |
| UVector categories; |
| |
| /** |
| * The number of categories (and thus the number of columns in the finished state tables) |
| */ |
| int32_t numCategories; |
| |
| /** |
| * A table used to map parts of regexp text to lists of character categories, |
| * rather than having to figure them out from scratch each time |
| */ |
| ExpressionList* expressions; |
| |
| /** |
| * A temporary holding place for the list of ignore characters |
| */ |
| UnicodeSet ignoreChars; |
| |
| /** |
| * A temporary holding place where the forward state table is built |
| */ |
| UVector tempStateTable; |
| |
| /** |
| * A list of all the states that have to be filled in with transitions to the |
| * next state that is created. Used when building the state table from the |
| * regular expressions. |
| */ |
| UVector decisionPointList; |
| |
| /** |
| * A UStack for holding decision point lists. This is used to handle nested |
| * parentheses and braces in regexps. |
| */ |
| UStack decisionPointStack; |
| |
| /** |
| * A list of states that loop back on themselves. Used to handle .*? |
| */ |
| UVector loopingStates; |
| |
| /** |
| * Looping states actually have to be backfilled later in the process |
| * than everything else. This is where a the list of states to backfill |
| * is accumulated. This is also used to handle .*? |
| */ |
| UVector statesToBackfill; |
| |
| /** |
| * A list mapping pairs of state numbers for states that are to be combined |
| * to the state number of the state representing their combination. Used |
| * in the process of making the state table deterministic to prevent |
| * infinite recursion. |
| */ |
| UVector mergeList; |
| |
| /** |
| * A flag that is used to indicate when the list of looping states can |
| * be reset. |
| */ |
| bool_t clearLoopingStates; |
| |
| /** |
| * A place where an error message can be stored if we get a parse error. |
| * The error message is never displayed anywhere, so this is useful pretty |
| * much only in conjunction with a debugger. |
| */ |
| UnicodeString errorMessage; |
| |
| /** |
| * A bit mask used to indicate a bit in the table's flags column that marks a |
| * state as an accepting state. |
| */ |
| static const int32_t END_STATE_FLAG /*= 0x8000*/; |
| |
| /** |
| * A bit mask used to indicate a bit in the table's flags column that marks a |
| * state as one the builder shouldn't loop to any looping states |
| */ |
| static const int32_t DONT_LOOP_FLAG /*= 0x4000*/; |
| |
| /** |
| * A bit mask used to indicate a bit in the table's flags column that marks a |
| * state as a lookahead state. |
| */ |
| static const int32_t LOOKAHEAD_STATE_FLAG /*= 0x2000*/; |
| |
| /** |
| * A bit mask representing the union of the mask values listed above. |
| * Used for clearing or masking off the flag bits. |
| */ |
| static const int32_t ALL_FLAGS /*= END_STATE_FLAG | LOOKAHEAD_STATE_FLAG |
| | DONT_LOOP_FLAG*/; |
| |
| public: |
| |
| /** |
| * The Builder class contains a reference to the iterator it's supposed to build. |
| */ |
| RuleBasedBreakIteratorBuilder(RuleBasedBreakIterator& iteratorToBuild); |
| |
| /** |
| * Destructor. |
| */ |
| ~RuleBasedBreakIteratorBuilder(); |
| |
| /** |
| * This is the main function for setting up the BreakIterator's tables. It |
| * just vectors different parts of the job off to other functions. |
| */ |
| virtual void buildBreakIterator(const UnicodeString& description, |
| UErrorCode& err); |
| |
| private: |
| |
| /** |
| * Thus function has three main purposes: |
| * <ul><li>Perform general syntax checking on the description, so the rest of the |
| * build code can assume that it's parsing a legal description. |
| * <li>Split the description into separate rules |
| * <li>Perform variable-name substitutions (so that no one else sees variable names) |
| * </ul> |
| */ |
| virtual void buildRuleList(UnicodeString& description, |
| UErrorCode& err); |
| |
| protected: |
| |
| /** |
| * This function performs variable-name substitutions. First it does syntax |
| * checking on the variable-name definition. If it's syntactically valid, it |
| * then goes through the remainder of the description and does a simple |
| * find-and-replace of the variable name with its text. (The variable text |
| * must be enclosed in either [] or () for this to work.) |
| */ |
| virtual void processSubstitution(UnicodeString& description, |
| UTextOffset ruleStart, |
| UTextOffset ruleEnd, |
| UTextOffset startPos, |
| UErrorCode& err); |
| |
| /** |
| * This function defines a protocol for handling substitution names that |
| * are "special," i.e., that have some property beyond just being |
| * substitutions. At the RuleBasedBreakIterator level, we have one |
| * special substitution name, "<ignore>". Subclasses can override this |
| * function to add more. Any special processing that has to go on beyond |
| * that which is done by the normal substitution-processing code is done |
| * here. |
| */ |
| virtual void handleSpecialSubstitution(const UnicodeString& replace, |
| const UnicodeString& replaceWith, |
| int32_t startPos, |
| const UnicodeString& description, |
| UErrorCode& err); |
| |
| /** |
| * This function provides a hook for subclasses to mess with the character |
| * category table. |
| */ |
| virtual void mungeExpressionList(); |
| |
| /** |
| * This function builds the character category table. On entry, |
| * tempRuleList is a UVector of break rules that has had variable names substituted. |
| * On exit, the charCategoryTable data member has been initialized to hold the |
| * character category table, and tempRuleList's rules have been munged to contain |
| * character category numbers everywhere a literal character or a [] expression |
| * originally occurred. |
| */ |
| virtual void buildCharCategories(UErrorCode& err); |
| |
| private: |
| |
| /** |
| * This is the function that builds the forward state table. Most of the real |
| * work is done in parseRule(), which is called once for each rule in the |
| * description. |
| */ |
| virtual void buildStateTable(UErrorCode& err); |
| |
| /** |
| * This is where most of the work really happens. This routine parses a single |
| * rule in the rule description, adding and modifying states in the state |
| * table according to the new expression. The state table is kept deterministic |
| * throughout the whole operation, although some ugly postprocessing is needed |
| * to handle the *? token. |
| */ |
| virtual void parseRule(const UnicodeString& rule, |
| bool_t forward); |
| |
| /** |
| * Update entries in the state table, and merge states when necessary to keep |
| * the table deterministic. |
| * @param rows The list of rows that need updating (the decision point list) |
| * @param pendingChars A character category list, encoded in a String. This is the |
| * list of the columns that need updating. |
| * @param newValue Update the cells specfied above to contain this value |
| */ |
| virtual void updateStateTable(const UVector& rows, |
| const UnicodeString& pendingChars, |
| int16_t newValue); |
| |
| /** |
| * The real work of making the state table deterministic happens here. This function |
| * merges a state in the state table (specified by rowNum) with a state that is |
| * passed in (newValues). The basic process is to copy the nonzero cells in newStates |
| * into the state in the state table (we'll call that oldValues). If there's a |
| * collision (i.e., if the same cell has a nonzero value in both states, and it's |
| * not the SAME value), then we have to reconcile the collision. We do this by |
| * creating a new state, adding it to the end of the state table, and using this |
| * function recursively to merge the original two states into a single, combined |
| * state. This process may happen recursively (i.e., each successive level may |
| * involve collisions). To prevent infinite recursion, we keep a log of merge |
| * operations. Any time we're merging two states we've merged before, we can just |
| * supply the row number for the result of that merge operation rather than creating |
| * a new state just like it. |
| * @param rowNum The row number in the state table of the state to be updated |
| * @param newValues The state to merge it with. |
| * @param rowsBeingUpdated A copy of the list of rows passed to updateStateTable() |
| * (itself a copy of the decision point list from parseRule()). Newly-created |
| * states get added to the decision point list if their "parents" were on it. |
| */ |
| virtual void mergeStates(int32_t rowNum, |
| int16_t* newValues, |
| const UVector& rowsBeingUpdated); |
| |
| /** |
| * The merge list is a list of pairs of rows that have been merged somewhere in |
| * the process of building this state table, along with the row number of the |
| * row containing the merged state. This function looks up a pair of row numbers |
| * and returns the row number of the row they combine into. (It returns 0 if |
| * this pair of rows isn't in the merge list.) |
| */ |
| virtual int32_t searchMergeList(int32_t a, int32_t b); |
| |
| /** |
| * This function is used to update the list of current loooping states (i.e., |
| * states that are controlled by a *? construct). It backfills values from |
| * the looping states into unpopulated cells of the states that are currently |
| * marked for backfilling, and then updates the list of looping states to be |
| * the new list |
| * @param newLoopingStates The list of new looping states |
| * @param endStates The list of states to treat as end states (states that |
| * can exit the loop). |
| */ |
| virtual void setLoopingStates(const UVector* newLoopingStates, |
| const UVector& endStates); |
| |
| /** |
| * This removes "ending states" and states reachable from them from the |
| * list of states to backfill. |
| * @param The row number of the state to remove from the backfill list |
| */ |
| virtual void eliminateBackfillStates(int32_t baseState); |
| |
| /** |
| * This function completes the backfilling process by actually doing the |
| * backfilling on the states that are marked for it |
| */ |
| virtual void backfillLoopingStates(void); |
| |
| /** |
| * This function completes the state-table-building process by doing several |
| * postprocessing steps and copying everything into its final resting place |
| * in the iterator itself |
| * @param forward True if we're working on the forward state table |
| */ |
| virtual void finishBuildingStateTable(bool_t forward); |
| |
| /** |
| * This function builds the backward state table from the forward state |
| * table and any additional rules (identified by the ! on the front) |
| * supplied in the description |
| */ |
| virtual void buildBackwardsStateTable(UErrorCode& err); |
| |
| protected: |
| |
| /** |
| * Throws an IllegalArgumentException representing a syntax error in the rule |
| * description. The exception's message contains some debugging information. |
| * @param message A message describing the problem |
| * @param position The position in the description where the problem was |
| * discovered |
| * @param context The string containing the error |
| */ |
| virtual void setUpErrorMessage(const UnicodeString& message, |
| int32_t position, |
| const UnicodeString& context); |
| }; |
| |
| #endif |