blob: a1db9262ec9a98716a77753555d2b064f127f774 [file] [log] [blame]
/*
* Copyright (C) {1999}, International Business Machines Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 12/15/99 rgillam Port from Java.
**********************************************************************
*/
#ifndef RBBI_BLD_H
#define RBBI_BLD_H
#include "rbbi.h"
#include "rbbi_tbl.h"
#include "unicode/uniset.h"
#include "uvector.h"
class ExpressionList;
//=======================================================================
// RuleBasedBreakIterator.Builder
//=======================================================================
/**
* The Builder class has the job of constructing a RuleBasedBreakIterator from a
* textual description. A Builder is constructed by RuleBasedBreakIterator's
* constructor, which uses it to construct the iterator itself and then throws it
* away.
* <p>The construction logic is separated out into its own class for two primary
* reasons:
* <ul><li>The construction logic is quite complicated and large. Separating it
* out into its own class means the code must only be loaded into memory while a
* RuleBasedBreakIterator is being constructed, and can be purged after that.
* <li>There is a fair amount of state that must be maintained throughout the
* construction process that is not needed by the iterator after construction.
* Separating this state out into another class prevents all of the functions that
* construct the iterator from having to have really long parameter lists,
* (hopefully) contributing to readability and maintainability.</ul>
* <p>It'd be really nice if this could be an independent class rather than an
* inner class, because that would shorten the source file considerably, but
* making Builder an inner class of RuleBasedBreakIterator allows it direct access
* to RuleBasedBreakIterator's private members, which saves us from having to
* provide some kind of "back door" to the Builder class that could then also be
* used by other classes.
*/
class RuleBasedBreakIteratorBuilder {
protected:
/**
* The iterator we're constructing.
*/
RuleBasedBreakIterator& iterator;
/**
* The tables object for the iterator we're constructing.
*/
RuleBasedBreakIteratorTables* tables;
/**
* A temporary place to hold the rules as they're being processed.
*/
UVector tempRuleList;
/**
* A temporary holding place used for calculating the character categories.
* This object contains UnicodeSet objects.
*/
UVector categories;
/**
* The number of categories (and thus the number of columns in the finished state tables)
*/
int32_t numCategories;
/**
* A table used to map parts of regexp text to lists of character categories,
* rather than having to figure them out from scratch each time
*/
ExpressionList* expressions;
/**
* A temporary holding place for the list of ignore characters
*/
UnicodeSet ignoreChars;
/**
* A temporary holding place where the forward state table is built
*/
UVector tempStateTable;
/**
* A list of all the states that have to be filled in with transitions to the
* next state that is created. Used when building the state table from the
* regular expressions.
*/
UVector decisionPointList;
/**
* A UStack for holding decision point lists. This is used to handle nested
* parentheses and braces in regexps.
*/
UStack decisionPointStack;
/**
* A list of states that loop back on themselves. Used to handle .*?
*/
UVector loopingStates;
/**
* Looping states actually have to be backfilled later in the process
* than everything else. This is where a the list of states to backfill
* is accumulated. This is also used to handle .*?
*/
UVector statesToBackfill;
/**
* A list mapping pairs of state numbers for states that are to be combined
* to the state number of the state representing their combination. Used
* in the process of making the state table deterministic to prevent
* infinite recursion.
*/
UVector mergeList;
/**
* A flag that is used to indicate when the list of looping states can
* be reset.
*/
UBool clearLoopingStates;
/**
* A place where an error message can be stored if we get a parse error.
* The error message is never displayed anywhere, so this is useful pretty
* much only in conjunction with a debugger.
*/
UnicodeString errorMessage;
/**
* A bit mask used to indicate a bit in the table's flags column that marks a
* state as an accepting state.
*/
static const int32_t END_STATE_FLAG /*= 0x8000*/;
/**
* A bit mask used to indicate a bit in the table's flags column that marks a
* state as one the builder shouldn't loop to any looping states
*/
static const int32_t DONT_LOOP_FLAG /*= 0x4000*/;
/**
* A bit mask used to indicate a bit in the table's flags column that marks a
* state as a lookahead state.
*/
static const int32_t LOOKAHEAD_STATE_FLAG /*= 0x2000*/;
/**
* A bit mask representing the union of the mask values listed above.
* Used for clearing or masking off the flag bits.
*/
static const int32_t ALL_FLAGS /*= END_STATE_FLAG | LOOKAHEAD_STATE_FLAG
| DONT_LOOP_FLAG*/;
public:
/**
* The Builder class contains a reference to the iterator it's supposed to build.
*/
RuleBasedBreakIteratorBuilder(RuleBasedBreakIterator& iteratorToBuild);
/**
* Destructor.
*/
~RuleBasedBreakIteratorBuilder();
/**
* This is the main function for setting up the BreakIterator's tables. It
* just vectors different parts of the job off to other functions.
*/
virtual void buildBreakIterator(const UnicodeString& description,
UErrorCode& err);
private:
/**
* Thus function has three main purposes:
* <ul><li>Perform general syntax checking on the description, so the rest of the
* build code can assume that it's parsing a legal description.
* <li>Split the description into separate rules
* <li>Perform variable-name substitutions (so that no one else sees variable names)
* </ul>
*/
virtual void buildRuleList(UnicodeString& description,
UErrorCode& err);
protected:
/**
* This function performs variable-name substitutions. First it does syntax
* checking on the variable-name definition. If it's syntactically valid, it
* then goes through the remainder of the description and does a simple
* find-and-replace of the variable name with its text. (The variable text
* must be enclosed in either [] or () for this to work.)
*/
virtual void processSubstitution(UnicodeString& description,
UTextOffset ruleStart,
UTextOffset ruleEnd,
UTextOffset startPos,
UErrorCode& err);
/**
* This function defines a protocol for handling substitution names that
* are "special," i.e., that have some property beyond just being
* substitutions. At the RuleBasedBreakIterator level, we have one
* special substitution name, "<ignore>". Subclasses can override this
* function to add more. Any special processing that has to go on beyond
* that which is done by the normal substitution-processing code is done
* here.
*/
virtual void handleSpecialSubstitution(const UnicodeString& replace,
const UnicodeString& replaceWith,
int32_t startPos,
const UnicodeString& description,
UErrorCode& err);
/**
* This function provides a hook for subclasses to mess with the character
* category table.
*/
virtual void mungeExpressionList();
/**
* This function builds the character category table. On entry,
* tempRuleList is a UVector of break rules that has had variable names substituted.
* On exit, the charCategoryTable data member has been initialized to hold the
* character category table, and tempRuleList's rules have been munged to contain
* character category numbers everywhere a literal character or a [] expression
* originally occurred.
*/
virtual void buildCharCategories(UErrorCode& err);
private:
/**
* This is the function that builds the forward state table. Most of the real
* work is done in parseRule(), which is called once for each rule in the
* description.
*/
virtual void buildStateTable(UErrorCode& err);
/**
* This is where most of the work really happens. This routine parses a single
* rule in the rule description, adding and modifying states in the state
* table according to the new expression. The state table is kept deterministic
* throughout the whole operation, although some ugly postprocessing is needed
* to handle the *? token.
*/
virtual void parseRule(const UnicodeString& rule,
UBool forward);
/**
* Update entries in the state table, and merge states when necessary to keep
* the table deterministic.
* @param rows The list of rows that need updating (the decision point list)
* @param pendingChars A character category list, encoded in a String. This is the
* list of the columns that need updating.
* @param newValue Update the cells specfied above to contain this value
*/
virtual void updateStateTable(const UVector& rows,
const UnicodeString& pendingChars,
int16_t newValue);
/**
* The real work of making the state table deterministic happens here. This function
* merges a state in the state table (specified by rowNum) with a state that is
* passed in (newValues). The basic process is to copy the nonzero cells in newStates
* into the state in the state table (we'll call that oldValues). If there's a
* collision (i.e., if the same cell has a nonzero value in both states, and it's
* not the SAME value), then we have to reconcile the collision. We do this by
* creating a new state, adding it to the end of the state table, and using this
* function recursively to merge the original two states into a single, combined
* state. This process may happen recursively (i.e., each successive level may
* involve collisions). To prevent infinite recursion, we keep a log of merge
* operations. Any time we're merging two states we've merged before, we can just
* supply the row number for the result of that merge operation rather than creating
* a new state just like it.
* @param rowNum The row number in the state table of the state to be updated
* @param newValues The state to merge it with.
* @param rowsBeingUpdated A copy of the list of rows passed to updateStateTable()
* (itself a copy of the decision point list from parseRule()). Newly-created
* states get added to the decision point list if their "parents" were on it.
*/
virtual void mergeStates(int32_t rowNum,
int16_t* newValues,
const UVector& rowsBeingUpdated);
/**
* The merge list is a list of pairs of rows that have been merged somewhere in
* the process of building this state table, along with the row number of the
* row containing the merged state. This function looks up a pair of row numbers
* and returns the row number of the row they combine into. (It returns 0 if
* this pair of rows isn't in the merge list.)
*/
virtual int32_t searchMergeList(int32_t a, int32_t b);
/**
* This function is used to update the list of current loooping states (i.e.,
* states that are controlled by a *? construct). It backfills values from
* the looping states into unpopulated cells of the states that are currently
* marked for backfilling, and then updates the list of looping states to be
* the new list
* @param newLoopingStates The list of new looping states
* @param endStates The list of states to treat as end states (states that
* can exit the loop).
*/
virtual void setLoopingStates(const UVector* newLoopingStates,
const UVector& endStates);
/**
* This removes "ending states" and states reachable from them from the
* list of states to backfill.
* @param The row number of the state to remove from the backfill list
*/
virtual void eliminateBackfillStates(int32_t baseState);
/**
* This function completes the backfilling process by actually doing the
* backfilling on the states that are marked for it
*/
virtual void backfillLoopingStates(void);
/**
* This function completes the state-table-building process by doing several
* postprocessing steps and copying everything into its final resting place
* in the iterator itself
* @param forward True if we're working on the forward state table
*/
virtual void finishBuildingStateTable(UBool forward);
/**
* This function builds the backward state table from the forward state
* table and any additional rules (identified by the ! on the front)
* supplied in the description
*/
virtual void buildBackwardsStateTable(UErrorCode& err);
protected:
/**
* Throws an IllegalArgumentException representing a syntax error in the rule
* description. The exception's message contains some debugging information.
* @param message A message describing the problem
* @param position The position in the description where the problem was
* discovered
* @param context The string containing the error
*/
virtual void setUpErrorMessage(const UnicodeString& message,
int32_t position,
const UnicodeString& context);
};
#endif