blob: f83fb115e5f5d819c6af63dd4a06966849913d5a [file] [log] [blame]
/*
**********************************************************************
* Copyright (C) 1999 Alan Liu and others. All rights reserved.
**********************************************************************
* Date Name Description
* 10/22/99 alan Creation. This is an internal header; it
* shall not be exported.
**********************************************************************
*/
#ifndef RBBI_BLD_H
#define RBBI_BLD_H
#include "rbbi.h"
#include "uniset.h"
#include "uvector.h"
//=======================================================================
// RuleBasedBreakIterator.Builder
//=======================================================================
/**
* The Builder class has the job of constructing a RuleBasedBreakIterator from a
* textual description. A Builder is constructed by RuleBasedBreakIterator's
* constructor, which uses it to construct the iterator itself and then throws it
* away.
* <p>The construction logic is separated out into its own class for two primary
* reasons:
* <ul><li>The construction logic is quite complicated and large. Separating it
* out into its own class means the code must only be loaded into memory while a
* RuleBasedBreakIterator is being constructed, and can be purged after that.
* <li>There is a fair amount of state that must be maintained throughout the
* construction process that is not needed by the iterator after construction.
* Separating this state out into another class prevents all of the functions that
* construct the iterator from having to have really long parameter lists,
* (hopefully) contributing to readability and maintainability.</ul>
* <p>It'd be really nice if this could be an independent class rather than an
* inner class, because that would shorten the source file considerably, but
* making Builder an inner class of RuleBasedBreakIterator allows it direct access
* to RuleBasedBreakIterator's private members, which saves us from having to
* provide some kind of "back door" to the Builder class that could then also be
* used by other classes.
*/
class RuleBasedBreakIteratorBuilder {
protected:
/**
* A temporary holding place used for calculating the character categories.
* This object contains UnicodeSet objects.
*/
UVector categories;
/**
* A table used to map parts of regexp text to lists of character categories,
* rather than having to figure them out from scratch each time
*/
Hashtable expressions;
/**
* A temporary holding place for the list of ignore characters
*/
UnicodeSet ignoreChars;
/**
* A temporary holding place where the forward state table is built
*/
UVector tempStateTable;
/**
* A list of all the states that have to be filled in with transitions to the
* next state that is created. Used when building the state table from the
* regular expressions.
*/
UVector decisionPointList;
/**
* A UStack for holding decision point lists. This is used to handle nested
* parentheses and braces in regexps.
*/
UStack decisionPointStack;
/**
* A list of states that loop back on themselves. Used to handle .*?
*/
UVector loopingStates;
/**
* Looping states actually have to be backfilled later in the process
* than everything else. This is where a the list of states to backfill
* is accumulated. This is also used to handle .*?
*/
UVector statesToBackfill;
/**
* A list mapping pairs of state numbers for states that are to be combined
* to the state number of the state representing their combination. Used
* in the process of making the state table deterministic to prevent
* infinite recursion.
*/
UVector mergeList;
/**
* A flag that is used to indicate when the list of looping states can
* be reset.
*/
bool_t clearLoopingStates;
public:
/**
* No special construction is required for the Builder.
*/
RuleBasedBreakIteratorBuilder();
/**
* This is the main function for setting up the BreakIterator's tables. It
* just UVectors different parts of the job off to other functions.
*/
virtual void buildBreakIterator();
private:
/**
* Thus function has three main purposes:
* <ul><li>Perform general syntax checking on the description, so the rest of the
* build code can assume that it's parsing a legal description.
* <li>Split the description into separate rules
* <li>Perform variable-name substitutions (so that no one else sees variable names)
* </ul>
*/
virtual UVector buildRuleList(UnicodeString description);
protected:
/**
* This function performs variable-name substitutions. First it does syntax
* checking on the variable-name definition. If it's syntactically valid, it
* then goes through the remainder of the description and does a simple
* find-and-replace of the variable name with its text. (The variable text
* must be enclosed in either [] or () for this to work.)
*/
virtual UnicodeString processSubstitution(UnicodeString substitutionRule, UnicodeString description,
int32_t startPos);
/**
* This function defines a protocol for handling substitution names that
* are "special," i.e., that have some property beyond just being
* substitutions. At the RuleBasedBreakIterator level, we have one
* special substitution name, "<ignore>". Subclasses can override this
* function to add more. Any special processing that has to go on beyond
* that which is done by the normal substitution-processing code is done
* here.
*/
virtual void handleSpecialSubstitution(UnicodeString replace, UnicodeString replaceWith,
int32_t startPos, UnicodeString description);
/**
* This function builds the character category table. On entry,
* tempRuleList is a UVector of break rules that has had variable names substituted.
* On exit, the charCategoryTable data member has been initialized to hold the
* character category table, and tempRuleList's rules have been munged to contain
* character category numbers everywhere a literal character or a [] expression
* originally occurred.
*/
virtual void buildCharCategories(UVector tempRuleList);
private:
/**
* This is the function that builds the forward state table. Most of the real
* work is done in parseRule(), which is called once for each rule in the
* description.
*/
virtual void buildStateTable(UVector tempRuleList);
/**
* This is where most of the work really happens. This routine parses a single
* rule in the rule description, adding and modifying states in the state
* table according to the new expression. The state table is kept deterministic
* throughout the whole operation, although some ugly postprocessing is needed
* to handle the *? token.
*/
virtual void parseRule(UnicodeString rule, bool_t forward);
/**
* Update entries in the state table, and merge states when necessary to keep
* the table deterministic.
* @param rows The list of rows that need updating (the decision point list)
* @param pendingChars A character category list, encoded in a String. This is the
* list of the columns that need updating.
* @param newValue Update the cells specfied above to contain this value
*/
virtual void updateStateTable(UVector rows,
UnicodeString pendingChars,
int16_t newValue);
/**
* The real work of making the state table deterministic happens here. This function
* merges a state in the state table (specified by rowNum) with a state that is
* passed in (newValues). The basic process is to copy the nonzero cells in newStates
* into the state in the state table (we'll call that oldValues). If there's a
* collision (i.e., if the same cell has a nonzero value in both states, and it's
* not the SAME value), then we have to reconcile the collision. We do this by
* creating a new state, adding it to the end of the state table, and using this
* function recursively to merge the original two states into a single, combined
* state. This process may happen recursively (i.e., each successive level may
* involve collisions). To prevent infinite recursion, we keep a log of merge
* operations. Any time we're merging two states we've merged before, we can just
* supply the row number for the result of that merge operation rather than creating
* a new state just like it.
* @param rowNum The row number in the state table of the state to be updated
* @param newValues The state to merge it with.
* @param rowsBeingUpdated A copy of the list of rows passed to updateStateTable()
* (itself a copy of the decision point list from parseRule()). Newly-created
* states get added to the decision point list if their "parents" were on it.
*/
virtual void mergeStates(int32_t rowNum,
int16_t* newValues,
UVector rowsBeingUpdated);
/**
* The merge list is a list of pairs of rows that have been merged somewhere in
* the process of building this state table, along with the row number of the
* row containing the merged state. This function looks up a pair of row numbers
* and returns the row number of the row they combine into. (It returns 0 if
* this pair of rows isn't in the merge list.)
*/
virtual int32_t searchMergeList(int32_t a, int32_t b);
/**
* This function is used to update the list of current loooping states (i.e.,
* states that are controlled by a *? construct). It backfills values from
* the looping states into unpopulated cells of the states that are currently
* marked for backfilling, and then updates the list of looping states to be
* the new list
* @param newLoopingStates The list of new looping states
* @param endStates The list of states to treat as end states (states that
* can exit the loop).
*/
virtual void setLoopingStates(UVector newLoopingStates, UVector endStates);
/**
* This removes "ending states" and states reachable from them from the
* list of states to backfill.
* @param The row number of the state to remove from the backfill list
*/
virtual void eliminateBackfillStates(int32_t baseState);
/**
* This function completes the backfilling process by actually doing the
* backfilling on the states that are marked for it
*/
virtual void backfillLoopingStates();
/**
* This function completes the state-table-building process by doing several
* postprocessing steps and copying everything into its final resting place
* in the iterator itself
* @param forward True if we're working on the forward state table
*/
virtual void finishBuildingStateTable(bool_t forward);
/**
* This function builds the backward state table from the forward state
* table and any additional rules (identified by the ! on the front)
* supplied in the description
*/
virtual void buildBackwardsStateTable(UVector tempRuleList);
protected:
/**
* Throws an IllegalArgumentException representing a syntax error in the rule
* description. The exception's message contains some debugging information.
* @param message A message describing the problem
* @param position The position in the description where the problem was
* discovered
* @param context The string containing the error
*/
virtual void error(UnicodeString message, int32_t position, UnicodeString context);
};
#endif