| /* | 
 | ********************************************************************** | 
 | *   Copyright (C) 1999 Alan Liu and others. All rights reserved. | 
 | ********************************************************************** | 
 | *   Date        Name        Description | 
 | *   10/22/99    alan        Creation.  This is an internal header; it | 
 | *                           shall not be exported. | 
 | ********************************************************************** | 
 | */ | 
 |  | 
 | #ifndef RBBI_BLD_H | 
 | #define RBBI_BLD_H | 
 |  | 
 | #include "rbbi.h" | 
 | #include "uniset.h" | 
 | #include "uvector.h" | 
 |  | 
 | //======================================================================= | 
 | // RuleBasedBreakIterator.Builder | 
 | //======================================================================= | 
 | /** | 
 |  * The Builder class has the job of constructing a RuleBasedBreakIterator from a | 
 |  * textual description.  A Builder is constructed by RuleBasedBreakIterator's | 
 |  * constructor, which uses it to construct the iterator itself and then throws it | 
 |  * away. | 
 |  * <p>The construction logic is separated out into its own class for two primary | 
 |  * reasons: | 
 |  * <ul><li>The construction logic is quite complicated and large.  Separating it | 
 |  * out into its own class means the code must only be loaded into memory while a | 
 |  * RuleBasedBreakIterator is being constructed, and can be purged after that. | 
 |  * <li>There is a fair amount of state that must be maintained throughout the | 
 |  * construction process that is not needed by the iterator after construction. | 
 |  * Separating this state out into another class prevents all of the functions that | 
 |  * construct the iterator from having to have really long parameter lists, | 
 |  * (hopefully) contributing to readability and maintainability.</ul> | 
 |  * <p>It'd be really nice if this could be an independent class rather than an | 
 |  * inner class, because that would shorten the source file considerably, but | 
 |  * making Builder an inner class of RuleBasedBreakIterator allows it direct access | 
 |  * to RuleBasedBreakIterator's private members, which saves us from having to | 
 |  * provide some kind of "back door" to the Builder class that could then also be | 
 |  * used by other classes. | 
 |  */ | 
 | class RuleBasedBreakIteratorBuilder { | 
 |  | 
 | protected: | 
 |  | 
 |     /** | 
 |      * A temporary holding place used for calculating the character categories. | 
 |      * This object contains UnicodeSet objects. | 
 |      */ | 
 |     UVector categories; | 
 |  | 
 |     /** | 
 |      * A table used to map parts of regexp text to lists of character categories, | 
 |      * rather than having to figure them out from scratch each time | 
 |      */ | 
 |     Hashtable expressions; | 
 |  | 
 |     /** | 
 |      * A temporary holding place for the list of ignore characters | 
 |      */ | 
 |     UnicodeSet ignoreChars; | 
 |  | 
 |     /** | 
 |      * A temporary holding place where the forward state table is built | 
 |      */ | 
 |     UVector tempStateTable; | 
 |  | 
 |     /** | 
 |      * A list of all the states that have to be filled in with transitions to the | 
 |      * next state that is created.  Used when building the state table from the | 
 |      * regular expressions. | 
 |      */ | 
 |     UVector decisionPointList; | 
 |  | 
 |     /** | 
 |      * A UStack for holding decision point lists.  This is used to handle nested | 
 |      * parentheses and braces in regexps. | 
 |      */ | 
 |     UStack decisionPointStack; | 
 |  | 
 |     /** | 
 |      * A list of states that loop back on themselves.  Used to handle .*? | 
 |      */ | 
 |     UVector loopingStates; | 
 |  | 
 |     /** | 
 |      * Looping states actually have to be backfilled later in the process | 
 |      * than everything else.  This is where a the list of states to backfill | 
 |      * is accumulated.  This is also used to handle .*? | 
 |      */ | 
 |     UVector statesToBackfill; | 
 |  | 
 |     /** | 
 |      * A list mapping pairs of state numbers for states that are to be combined | 
 |      * to the state number of the state representing their combination.  Used | 
 |      * in the process of making the state table deterministic to prevent | 
 |      * infinite recursion. | 
 |      */ | 
 |     UVector mergeList; | 
 |  | 
 |     /** | 
 |      * A flag that is used to indicate when the list of looping states can | 
 |      * be reset. | 
 |      */ | 
 |     bool_t clearLoopingStates; | 
 |  | 
 | public: | 
 |  | 
 |     /** | 
 |      * No special construction is required for the Builder. | 
 |      */ | 
 |     RuleBasedBreakIteratorBuilder(); | 
 |  | 
 |     /** | 
 |      * This is the main function for setting up the BreakIterator's tables.  It | 
 |      * just UVectors different parts of the job off to other functions. | 
 |      */ | 
 |     virtual void buildBreakIterator(); | 
 |  | 
 | private: | 
 |  | 
 |     /** | 
 |      * Thus function has three main purposes: | 
 |      * <ul><li>Perform general syntax checking on the description, so the rest of the | 
 |      * build code can assume that it's parsing a legal description. | 
 |      * <li>Split the description into separate rules | 
 |      * <li>Perform variable-name substitutions (so that no one else sees variable names) | 
 |      * </ul> | 
 |      */ | 
 |     virtual UVector buildRuleList(UnicodeString description); | 
 |  | 
 | protected: | 
 |  | 
 |     /** | 
 |      * This function performs variable-name substitutions.  First it does syntax | 
 |      * checking on the variable-name definition.  If it's syntactically valid, it | 
 |      * then goes through the remainder of the description and does a simple | 
 |      * find-and-replace of the variable name with its text.  (The variable text | 
 |      * must be enclosed in either [] or () for this to work.) | 
 |      */ | 
 |     virtual UnicodeString processSubstitution(UnicodeString substitutionRule, UnicodeString description, | 
 |                     int32_t startPos); | 
 |  | 
 |     /** | 
 |      * This function defines a protocol for handling substitution names that | 
 |      * are "special," i.e., that have some property beyond just being | 
 |      * substitutions.  At the RuleBasedBreakIterator level, we have one | 
 |      * special substitution name, "<ignore>".  Subclasses can override this | 
 |      * function to add more.  Any special processing that has to go on beyond | 
 |      * that which is done by the normal substitution-processing code is done | 
 |      * here. | 
 |      */ | 
 |     virtual void handleSpecialSubstitution(UnicodeString replace, UnicodeString replaceWith, | 
 |                 int32_t startPos, UnicodeString description); | 
 |  | 
 |     /** | 
 |      * This function builds the character category table.  On entry, | 
 |      * tempRuleList is a UVector of break rules that has had variable names substituted. | 
 |      * On exit, the charCategoryTable data member has been initialized to hold the | 
 |      * character category table, and tempRuleList's rules have been munged to contain | 
 |      * character category numbers everywhere a literal character or a [] expression | 
 |      * originally occurred. | 
 |      */ | 
 |     virtual void buildCharCategories(UVector tempRuleList); | 
 |  | 
 | private: | 
 |  | 
 |     /** | 
 |      * This is the function that builds the forward state table.  Most of the real | 
 |      * work is done in parseRule(), which is called once for each rule in the | 
 |      * description. | 
 |      */ | 
 |     virtual void buildStateTable(UVector tempRuleList); | 
 |  | 
 |     /** | 
 |      * This is where most of the work really happens.  This routine parses a single | 
 |      * rule in the rule description, adding and modifying states in the state | 
 |      * table according to the new expression.  The state table is kept deterministic | 
 |      * throughout the whole operation, although some ugly postprocessing is needed | 
 |      * to handle the *? token. | 
 |      */ | 
 |     virtual void parseRule(UnicodeString rule, bool_t forward); | 
 |  | 
 |     /** | 
 |      * Update entries in the state table, and merge states when necessary to keep | 
 |      * the table deterministic. | 
 |      * @param rows The list of rows that need updating (the decision point list) | 
 |      * @param pendingChars A character category list, encoded in a String.  This is the | 
 |      * list of the columns that need updating. | 
 |      * @param newValue Update the cells specfied above to contain this value | 
 |      */ | 
 |     virtual void updateStateTable(UVector rows, | 
 |                                   UnicodeString pendingChars, | 
 |                                   int16_t newValue); | 
 |  | 
 |     /** | 
 |      * The real work of making the state table deterministic happens here.  This function | 
 |      * merges a state in the state table (specified by rowNum) with a state that is | 
 |      * passed in (newValues).  The basic process is to copy the nonzero cells in newStates | 
 |      * into the state in the state table (we'll call that oldValues).  If there's a | 
 |      * collision (i.e., if the same cell has a nonzero value in both states, and it's | 
 |      * not the SAME value), then we have to reconcile the collision.  We do this by | 
 |      * creating a new state, adding it to the end of the state table, and using this | 
 |      * function recursively to merge the original two states into a single, combined | 
 |      * state.  This process may happen recursively (i.e., each successive level may | 
 |      * involve collisions).  To prevent infinite recursion, we keep a log of merge | 
 |      * operations.  Any time we're merging two states we've merged before, we can just | 
 |      * supply the row number for the result of that merge operation rather than creating | 
 |      * a new state just like it. | 
 |      * @param rowNum The row number in the state table of the state to be updated | 
 |      * @param newValues The state to merge it with. | 
 |      * @param rowsBeingUpdated A copy of the list of rows passed to updateStateTable() | 
 |      * (itself a copy of the decision point list from parseRule()).  Newly-created | 
 |      * states get added to the decision point list if their "parents" were on it. | 
 |      */ | 
 |     virtual void mergeStates(int32_t rowNum, | 
 |                              int16_t* newValues, | 
 |                              UVector rowsBeingUpdated); | 
 |  | 
 |     /** | 
 |      * The merge list is a list of pairs of rows that have been merged somewhere in | 
 |      * the process of building this state table, along with the row number of the | 
 |      * row containing the merged state.  This function looks up a pair of row numbers | 
 |      * and returns the row number of the row they combine into.  (It returns 0 if | 
 |      * this pair of rows isn't in the merge list.) | 
 |      */ | 
 |     virtual int32_t searchMergeList(int32_t a, int32_t b); | 
 |  | 
 |     /** | 
 |      * This function is used to update the list of current loooping states (i.e., | 
 |      * states that are controlled by a *? construct).  It backfills values from | 
 |      * the looping states into unpopulated cells of the states that are currently | 
 |      * marked for backfilling, and then updates the list of looping states to be | 
 |      * the new list | 
 |      * @param newLoopingStates The list of new looping states | 
 |      * @param endStates The list of states to treat as end states (states that | 
 |      * can exit the loop). | 
 |      */ | 
 |     virtual void setLoopingStates(UVector newLoopingStates, UVector endStates); | 
 |  | 
 |     /** | 
 |      * This removes "ending states" and states reachable from them from the | 
 |      * list of states to backfill. | 
 |      * @param The row number of the state to remove from the backfill list | 
 |      */ | 
 |     virtual void eliminateBackfillStates(int32_t baseState); | 
 |  | 
 |     /** | 
 |      * This function completes the backfilling process by actually doing the | 
 |      * backfilling on the states that are marked for it | 
 |      */ | 
 |     virtual void backfillLoopingStates(); | 
 |  | 
 |     /** | 
 |      * This function completes the state-table-building process by doing several | 
 |      * postprocessing steps and copying everything into its final resting place | 
 |      * in the iterator itself | 
 |      * @param forward True if we're working on the forward state table | 
 |      */ | 
 |     virtual void finishBuildingStateTable(bool_t forward); | 
 |  | 
 |     /** | 
 |      * This function builds the backward state table from the forward state | 
 |      * table and any additional rules (identified by the ! on the front) | 
 |      * supplied in the description | 
 |      */ | 
 |     virtual void buildBackwardsStateTable(UVector tempRuleList); | 
 |  | 
 | protected: | 
 |  | 
 |     /** | 
 |      * Throws an IllegalArgumentException representing a syntax error in the rule | 
 |      * description.  The exception's message contains some debugging information. | 
 |      * @param message A message describing the problem | 
 |      * @param position The position in the description where the problem was | 
 |      * discovered | 
 |      * @param context The string containing the error | 
 |      */ | 
 |     virtual void error(UnicodeString message, int32_t position, UnicodeString context); | 
 | }; | 
 |  | 
 | #endif |