|  |  | 
|  | // | 
|  | //  file:  rbbiscan.cpp | 
|  | // | 
|  | //  Copyright (C) 2002-2008, International Business Machines Corporation and others. | 
|  | //  All Rights Reserved. | 
|  | // | 
|  | //  This file contains the Rule Based Break Iterator Rule Builder functions for | 
|  | //   scanning the rules and assembling a parse tree.  This is the first phase | 
|  | //   of compiling the rules. | 
|  | // | 
|  | //  The overall of the rules is managed by class RBBIRuleBuilder, which will | 
|  | //  create and use an instance of this class as part of the process. | 
|  | // | 
|  |  | 
|  | #include "unicode/utypes.h" | 
|  |  | 
|  | #if !UCONFIG_NO_BREAK_ITERATION | 
|  |  | 
|  | #include "unicode/unistr.h" | 
|  | #include "unicode/uniset.h" | 
|  | #include "unicode/uchar.h" | 
|  | #include "unicode/uchriter.h" | 
|  | #include "unicode/parsepos.h" | 
|  | #include "unicode/parseerr.h" | 
|  | #include "util.h" | 
|  | #include "cmemory.h" | 
|  | #include "cstring.h" | 
|  |  | 
|  | #include "rbbirpt.h"   // Contains state table for the rbbi rules parser. | 
|  | //   generated by a Perl script. | 
|  | #include "rbbirb.h" | 
|  | #include "rbbinode.h" | 
|  | #include "rbbiscan.h" | 
|  | #include "rbbitblb.h" | 
|  |  | 
|  | #include "uassert.h" | 
|  |  | 
|  |  | 
|  | //------------------------------------------------------------------------------ | 
|  | // | 
|  | // Unicode Set init strings for each of the character classes needed for parsing a rule file. | 
|  | //               (Initialized with hex values for portability to EBCDIC based machines. | 
|  | //                Really ugly, but there's no good way to avoid it.) | 
|  | // | 
|  | //              The sets are referred to by name in the rbbirpt.txt, which is the | 
|  | //              source form of the state transition table for the RBBI rule parser. | 
|  | // | 
|  | //------------------------------------------------------------------------------ | 
|  | static const UChar gRuleSet_rule_char_pattern[]       = { | 
|  | //   [    ^      [    \     p     {      Z     }     \     u    0      0    2      0 | 
|  | 0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30, | 
|  | //   -    \      u    0     0     7      f     ]     -     [    \      p | 
|  | 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, | 
|  | //   {     L     }    ]     -     [      \     p     {     N    }      ]     ] | 
|  | 0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d, 0}; | 
|  |  | 
|  | static const UChar gRuleSet_name_char_pattern[]       = { | 
|  | //    [    _      \    p     {     L      }     \     p     {    N      }     ] | 
|  | 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0}; | 
|  |  | 
|  | static const UChar gRuleSet_digit_char_pattern[] = { | 
|  | //    [    0      -    9     ] | 
|  | 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0}; | 
|  |  | 
|  | static const UChar gRuleSet_name_start_char_pattern[] = { | 
|  | //    [    _      \    p     {     L      }     ] | 
|  | 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 }; | 
|  |  | 
|  | static const UChar kAny[] = {0x61, 0x6e, 0x79, 0x00};  // "any" | 
|  |  | 
|  |  | 
|  | U_CDECL_BEGIN | 
|  | static void U_CALLCONV RBBISetTable_deleter(void *p) { | 
|  | U_NAMESPACE_QUALIFIER RBBISetTableEl *px = (U_NAMESPACE_QUALIFIER RBBISetTableEl *)p; | 
|  | delete px->key; | 
|  | // Note:  px->val is owned by the linked list "fSetsListHead" in scanner. | 
|  | //        Don't delete the value nodes here. | 
|  | uprv_free(px); | 
|  | } | 
|  | U_CDECL_END | 
|  |  | 
|  | U_NAMESPACE_BEGIN | 
|  |  | 
|  | //------------------------------------------------------------------------------ | 
|  | // | 
|  | //  Constructor. | 
|  | // | 
|  | //------------------------------------------------------------------------------ | 
|  | RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb) | 
|  | { | 
|  | fRB                 = rb; | 
|  | fStackPtr           = 0; | 
|  | fStack[fStackPtr]   = 0; | 
|  | fNodeStackPtr       = 0; | 
|  | fRuleNum            = 0; | 
|  | fNodeStack[0]       = NULL; | 
|  |  | 
|  | fSymbolTable                            = NULL; | 
|  | fSetTable                               = NULL; | 
|  |  | 
|  | fScanIndex = 0; | 
|  | fNextIndex = 0; | 
|  |  | 
|  | fReverseRule        = FALSE; | 
|  | fLookAheadRule      = FALSE; | 
|  |  | 
|  | fLineNum    = 1; | 
|  | fCharNum    = 0; | 
|  | fQuoteMode  = FALSE; | 
|  |  | 
|  | // Do not check status until after all critical fields are sufficiently initialized | 
|  | //   that the destructor can run cleanly. | 
|  | if (U_FAILURE(*rb->fStatus)) { | 
|  | return; | 
|  | } | 
|  |  | 
|  | // | 
|  | //  Set up the constant Unicode Sets. | 
|  | //     Note:  These could be made static, lazily initialized, and shared among | 
|  | //            all instances of RBBIRuleScanners.  BUT this is quite a bit simpler, | 
|  | //            and the time to build these few sets should be small compared to a | 
|  | //            full break iterator build. | 
|  | fRuleSets[kRuleSet_rule_char-128]       = UnicodeSet(gRuleSet_rule_char_pattern,       *rb->fStatus); | 
|  | UnicodeSet *whitespaceSet = uprv_openRuleWhiteSpaceSet(rb->fStatus); | 
|  | if (U_FAILURE(*rb->fStatus)) { | 
|  | return; | 
|  | } | 
|  | fRuleSets[kRuleSet_white_space-128]     = *whitespaceSet; | 
|  | delete whitespaceSet; | 
|  | fRuleSets[kRuleSet_name_char-128]       = UnicodeSet(gRuleSet_name_char_pattern,       *rb->fStatus); | 
|  | fRuleSets[kRuleSet_name_start_char-128] = UnicodeSet(gRuleSet_name_start_char_pattern, *rb->fStatus); | 
|  | fRuleSets[kRuleSet_digit_char-128]      = UnicodeSet(gRuleSet_digit_char_pattern,      *rb->fStatus); | 
|  | if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) { | 
|  | // This case happens if ICU's data is missing.  UnicodeSet tries to look up property | 
|  | //   names from the init string, can't find them, and claims an illegal arguement. | 
|  | //   Change the error so that the actual problem will be clearer to users. | 
|  | *rb->fStatus = U_BRK_INIT_ERROR; | 
|  | } | 
|  | if (U_FAILURE(*rb->fStatus)) { | 
|  | return; | 
|  | } | 
|  |  | 
|  | fSymbolTable = new RBBISymbolTable(this, rb->fRules, *rb->fStatus); | 
|  | if (fSymbolTable == NULL) { | 
|  | *rb->fStatus = U_MEMORY_ALLOCATION_ERROR; | 
|  | return; | 
|  | } | 
|  | fSetTable    = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, NULL, rb->fStatus); | 
|  | if (U_FAILURE(*rb->fStatus)) { | 
|  | return; | 
|  | } | 
|  | uhash_setValueDeleter(fSetTable, RBBISetTable_deleter); | 
|  | } | 
|  |  | 
|  |  | 
|  |  | 
|  | //------------------------------------------------------------------------------ | 
|  | // | 
|  | //  Destructor | 
|  | // | 
|  | //------------------------------------------------------------------------------ | 
|  | RBBIRuleScanner::~RBBIRuleScanner() { | 
|  | delete fSymbolTable; | 
|  | if (fSetTable != NULL) { | 
|  | uhash_close(fSetTable); | 
|  | fSetTable = NULL; | 
|  |  | 
|  | } | 
|  |  | 
|  |  | 
|  | // Node Stack. | 
|  | //   Normally has one entry, which is the entire parse tree for the rules. | 
|  | //   If errors occured, there may be additional subtrees left on the stack. | 
|  | while (fNodeStackPtr > 0) { | 
|  | delete fNodeStack[fNodeStackPtr]; | 
|  | fNodeStackPtr--; | 
|  | } | 
|  |  | 
|  | } | 
|  |  | 
|  | //------------------------------------------------------------------------------ | 
|  | // | 
|  | //  doParseAction        Do some action during rule parsing. | 
|  | //                       Called by the parse state machine. | 
|  | //                       Actions build the parse tree and Unicode Sets, | 
|  | //                       and maintain the parse stack for nested expressions. | 
|  | // | 
|  | //                       TODO:  unify EParseAction and RBBI_RuleParseAction enum types. | 
|  | //                              They represent exactly the same thing.  They're separate | 
|  | //                              only to work around enum forward declaration restrictions | 
|  | //                              in some compilers, while at the same time avoiding multiple | 
|  | //                              definitions problems.  I'm sure that there's a better way. | 
|  | // | 
|  | //------------------------------------------------------------------------------ | 
|  | UBool RBBIRuleScanner::doParseActions(int32_t action) | 
|  | { | 
|  | RBBINode *n       = NULL; | 
|  |  | 
|  | UBool   returnVal = TRUE; | 
|  |  | 
|  | switch (action) { | 
|  |  | 
|  | case doExprStart: | 
|  | pushNewNode(RBBINode::opStart); | 
|  | fRuleNum++; | 
|  | break; | 
|  |  | 
|  |  | 
|  | case doExprOrOperator: | 
|  | { | 
|  | fixOpStack(RBBINode::precOpCat); | 
|  | RBBINode  *operandNode = fNodeStack[fNodeStackPtr--]; | 
|  | RBBINode  *orNode      = pushNewNode(RBBINode::opOr); | 
|  | orNode->fLeftChild     = operandNode; | 
|  | operandNode->fParent   = orNode; | 
|  | } | 
|  | break; | 
|  |  | 
|  | case doExprCatOperator: | 
|  | // concatenation operator. | 
|  | // For the implicit concatenation of adjacent terms in an expression that are | 
|  | //   not separated by any other operator.  Action is invoked between the | 
|  | //   actions for the two terms. | 
|  | { | 
|  | fixOpStack(RBBINode::precOpCat); | 
|  | RBBINode  *operandNode = fNodeStack[fNodeStackPtr--]; | 
|  | RBBINode  *catNode     = pushNewNode(RBBINode::opCat); | 
|  | catNode->fLeftChild    = operandNode; | 
|  | operandNode->fParent   = catNode; | 
|  | } | 
|  | break; | 
|  |  | 
|  | case doLParen: | 
|  | // Open Paren. | 
|  | //   The openParen node is a dummy operation type with a low precedence, | 
|  | //     which has the affect of ensuring that any real binary op that | 
|  | //     follows within the parens binds more tightly to the operands than | 
|  | //     stuff outside of the parens. | 
|  | pushNewNode(RBBINode::opLParen); | 
|  | break; | 
|  |  | 
|  | case doExprRParen: | 
|  | fixOpStack(RBBINode::precLParen); | 
|  | break; | 
|  |  | 
|  | case doNOP: | 
|  | break; | 
|  |  | 
|  | case doStartAssign: | 
|  | // We've just scanned "$variable = " | 
|  | // The top of the node stack has the $variable ref node. | 
|  |  | 
|  | // Save the start position of the RHS text in the StartExpression node | 
|  | //   that precedes the $variableReference node on the stack. | 
|  | //   This will eventually be used when saving the full $variable replacement | 
|  | //   text as a string. | 
|  | n = fNodeStack[fNodeStackPtr-1]; | 
|  | n->fFirstPos = fNextIndex;              // move past the '=' | 
|  |  | 
|  | // Push a new start-of-expression node; needed to keep parse of the | 
|  | //   RHS expression happy. | 
|  | pushNewNode(RBBINode::opStart); | 
|  | break; | 
|  |  | 
|  |  | 
|  |  | 
|  |  | 
|  | case doEndAssign: | 
|  | { | 
|  | // We have reached the end of an assignement statement. | 
|  | //   Current scan char is the ';' that terminates the assignment. | 
|  |  | 
|  | // Terminate expression, leaves expression parse tree rooted in TOS node. | 
|  | fixOpStack(RBBINode::precStart); | 
|  |  | 
|  | RBBINode *startExprNode  = fNodeStack[fNodeStackPtr-2]; | 
|  | RBBINode *varRefNode     = fNodeStack[fNodeStackPtr-1]; | 
|  | RBBINode *RHSExprNode    = fNodeStack[fNodeStackPtr]; | 
|  |  | 
|  | // Save original text of right side of assignment, excluding the terminating ';' | 
|  | //  in the root of the node for the right-hand-side expression. | 
|  | RHSExprNode->fFirstPos = startExprNode->fFirstPos; | 
|  | RHSExprNode->fLastPos  = fScanIndex; | 
|  | fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText); | 
|  |  | 
|  | // Expression parse tree becomes l. child of the $variable reference node. | 
|  | varRefNode->fLeftChild = RHSExprNode; | 
|  | RHSExprNode->fParent   = varRefNode; | 
|  |  | 
|  | // Make a symbol table entry for the $variableRef node. | 
|  | fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus); | 
|  | if (U_FAILURE(*fRB->fStatus)) { | 
|  | // This is a round-about way to get the parse position set | 
|  | //  so that duplicate symbols error messages include a line number. | 
|  | UErrorCode t = *fRB->fStatus; | 
|  | *fRB->fStatus = U_ZERO_ERROR; | 
|  | error(t); | 
|  | } | 
|  |  | 
|  | // Clean up the stack. | 
|  | delete startExprNode; | 
|  | fNodeStackPtr-=3; | 
|  | break; | 
|  | } | 
|  |  | 
|  | case doEndOfRule: | 
|  | { | 
|  | fixOpStack(RBBINode::precStart);      // Terminate expression, leaves expression | 
|  | if (U_FAILURE(*fRB->fStatus)) {       //   parse tree rooted in TOS node. | 
|  | break; | 
|  | } | 
|  | #ifdef RBBI_DEBUG | 
|  | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");} | 
|  | #endif | 
|  | U_ASSERT(fNodeStackPtr == 1); | 
|  |  | 
|  | // If this rule includes a look-ahead '/', add a endMark node to the | 
|  | //   expression tree. | 
|  | if (fLookAheadRule) { | 
|  | RBBINode  *thisRule       = fNodeStack[fNodeStackPtr]; | 
|  | RBBINode  *endNode        = pushNewNode(RBBINode::endMark); | 
|  | RBBINode  *catNode        = pushNewNode(RBBINode::opCat); | 
|  | fNodeStackPtr -= 2; | 
|  | catNode->fLeftChild       = thisRule; | 
|  | catNode->fRightChild      = endNode; | 
|  | fNodeStack[fNodeStackPtr] = catNode; | 
|  | endNode->fVal             = fRuleNum; | 
|  | endNode->fLookAheadEnd    = TRUE; | 
|  | } | 
|  |  | 
|  | // All rule expressions are ORed together. | 
|  | // The ';' that terminates an expression really just functions as a '|' with | 
|  | //   a low operator prededence. | 
|  | // | 
|  | // Each of the four sets of rules are collected separately. | 
|  | //  (forward, reverse, safe_forward, safe_reverse) | 
|  | //  OR this rule into the appropriate group of them. | 
|  | // | 
|  | RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : fRB->fDefaultTree); | 
|  |  | 
|  | if (*destRules != NULL) { | 
|  | // This is not the first rule encounted. | 
|  | // OR previous stuff  (from *destRules) | 
|  | // with the current rule expression (on the Node Stack) | 
|  | //  with the resulting OR expression going to *destRules | 
|  | // | 
|  | RBBINode  *thisRule    = fNodeStack[fNodeStackPtr]; | 
|  | RBBINode  *prevRules   = *destRules; | 
|  | RBBINode  *orNode      = pushNewNode(RBBINode::opOr); | 
|  | orNode->fLeftChild     = prevRules; | 
|  | prevRules->fParent     = orNode; | 
|  | orNode->fRightChild    = thisRule; | 
|  | thisRule->fParent      = orNode; | 
|  | *destRules             = orNode; | 
|  | } | 
|  | else | 
|  | { | 
|  | // This is the first rule encountered (for this direction). | 
|  | // Just move its parse tree from the stack to *destRules. | 
|  | *destRules = fNodeStack[fNodeStackPtr]; | 
|  | } | 
|  | fReverseRule   = FALSE;   // in preparation for the next rule. | 
|  | fLookAheadRule = FALSE; | 
|  | fNodeStackPtr  = 0; | 
|  | } | 
|  | break; | 
|  |  | 
|  |  | 
|  | case doRuleError: | 
|  | error(U_BRK_RULE_SYNTAX); | 
|  | returnVal = FALSE; | 
|  | break; | 
|  |  | 
|  |  | 
|  | case doVariableNameExpectedErr: | 
|  | error(U_BRK_RULE_SYNTAX); | 
|  | break; | 
|  |  | 
|  |  | 
|  | // | 
|  | //  Unary operands  + ? * | 
|  | //    These all appear after the operand to which they apply. | 
|  | //    When we hit one, the operand (may be a whole sub expression) | 
|  | //    will be on the top of the stack. | 
|  | //    Unary Operator becomes TOS, with the old TOS as its one child. | 
|  | case doUnaryOpPlus: | 
|  | { | 
|  | RBBINode  *operandNode = fNodeStack[fNodeStackPtr--]; | 
|  | RBBINode  *plusNode    = pushNewNode(RBBINode::opPlus); | 
|  | plusNode->fLeftChild   = operandNode; | 
|  | operandNode->fParent   = plusNode; | 
|  | } | 
|  | break; | 
|  |  | 
|  | case doUnaryOpQuestion: | 
|  | { | 
|  | RBBINode  *operandNode = fNodeStack[fNodeStackPtr--]; | 
|  | RBBINode  *qNode       = pushNewNode(RBBINode::opQuestion); | 
|  | qNode->fLeftChild      = operandNode; | 
|  | operandNode->fParent   = qNode; | 
|  | } | 
|  | break; | 
|  |  | 
|  | case doUnaryOpStar: | 
|  | { | 
|  | RBBINode  *operandNode = fNodeStack[fNodeStackPtr--]; | 
|  | RBBINode  *starNode    = pushNewNode(RBBINode::opStar); | 
|  | starNode->fLeftChild   = operandNode; | 
|  | operandNode->fParent   = starNode; | 
|  | } | 
|  | break; | 
|  |  | 
|  | case doRuleChar: | 
|  | // A "Rule Character" is any single character that is a literal part | 
|  | // of the regular expression.  Like a, b and c in the expression "(abc*) | [:L:]" | 
|  | // These are pretty uncommon in break rules; the terms are more commonly | 
|  | //  sets.  To keep things uniform, treat these characters like as | 
|  | // sets that just happen to contain only one character. | 
|  | { | 
|  | n = pushNewNode(RBBINode::setRef); | 
|  | findSetFor(fC.fChar, n); | 
|  | n->fFirstPos = fScanIndex; | 
|  | n->fLastPos  = fNextIndex; | 
|  | fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); | 
|  | break; | 
|  | } | 
|  |  | 
|  | case doDotAny: | 
|  | // scanned a ".", meaning match any single character. | 
|  | { | 
|  | n = pushNewNode(RBBINode::setRef); | 
|  | findSetFor(kAny, n); | 
|  | n->fFirstPos = fScanIndex; | 
|  | n->fLastPos  = fNextIndex; | 
|  | fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); | 
|  | break; | 
|  | } | 
|  |  | 
|  | case doSlash: | 
|  | // Scanned a '/', which identifies a look-ahead break position in a rule. | 
|  | n = pushNewNode(RBBINode::lookAhead); | 
|  | n->fVal      = fRuleNum; | 
|  | n->fFirstPos = fScanIndex; | 
|  | n->fLastPos  = fNextIndex; | 
|  | fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); | 
|  | fLookAheadRule = TRUE; | 
|  | break; | 
|  |  | 
|  |  | 
|  | case doStartTagValue: | 
|  | // Scanned a '{', the opening delimiter for a tag value within a rule. | 
|  | n = pushNewNode(RBBINode::tag); | 
|  | n->fVal      = 0; | 
|  | n->fFirstPos = fScanIndex; | 
|  | n->fLastPos  = fNextIndex; | 
|  | break; | 
|  |  | 
|  | case doTagDigit: | 
|  | // Just scanned a decimal digit that's part of a tag value | 
|  | { | 
|  | n = fNodeStack[fNodeStackPtr]; | 
|  | uint32_t v = u_charDigitValue(fC.fChar); | 
|  | U_ASSERT(v < 10); | 
|  | n->fVal = n->fVal*10 + v; | 
|  | break; | 
|  | } | 
|  |  | 
|  | case doTagValue: | 
|  | n = fNodeStack[fNodeStackPtr]; | 
|  | n->fLastPos = fNextIndex; | 
|  | fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); | 
|  | break; | 
|  |  | 
|  | case doTagExpectedError: | 
|  | error(U_BRK_MALFORMED_RULE_TAG); | 
|  | returnVal = FALSE; | 
|  | break; | 
|  |  | 
|  | case doOptionStart: | 
|  | // Scanning a !!option.   At the start of string. | 
|  | fOptionStart = fScanIndex; | 
|  | break; | 
|  |  | 
|  | case doOptionEnd: | 
|  | { | 
|  | UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart); | 
|  | if (opt == UNICODE_STRING("chain", 5)) { | 
|  | fRB->fChainRules = TRUE; | 
|  | } else if (opt == UNICODE_STRING("LBCMNoChain", 11)) { | 
|  | fRB->fLBCMNoChain = TRUE; | 
|  | } else if (opt == UNICODE_STRING("forward", 7)) { | 
|  | fRB->fDefaultTree   = &fRB->fForwardTree; | 
|  | } else if (opt == UNICODE_STRING("reverse", 7)) { | 
|  | fRB->fDefaultTree   = &fRB->fReverseTree; | 
|  | } else if (opt == UNICODE_STRING("safe_forward", 12)) { | 
|  | fRB->fDefaultTree   = &fRB->fSafeFwdTree; | 
|  | } else if (opt == UNICODE_STRING("safe_reverse", 12)) { | 
|  | fRB->fDefaultTree   = &fRB->fSafeRevTree; | 
|  | } else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) { | 
|  | fRB->fLookAheadHardBreak = TRUE; | 
|  | } else { | 
|  | error(U_BRK_UNRECOGNIZED_OPTION); | 
|  | } | 
|  | } | 
|  | break; | 
|  |  | 
|  | case doReverseDir: | 
|  | fReverseRule = TRUE; | 
|  | break; | 
|  |  | 
|  | case doStartVariableName: | 
|  | n = pushNewNode(RBBINode::varRef); | 
|  | if (U_FAILURE(*fRB->fStatus)) { | 
|  | break; | 
|  | } | 
|  | n->fFirstPos = fScanIndex; | 
|  | break; | 
|  |  | 
|  | case doEndVariableName: | 
|  | n = fNodeStack[fNodeStackPtr]; | 
|  | if (n==NULL || n->fType != RBBINode::varRef) { | 
|  | error(U_BRK_INTERNAL_ERROR); | 
|  | break; | 
|  | } | 
|  | n->fLastPos = fScanIndex; | 
|  | fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText); | 
|  | // Look the newly scanned name up in the symbol table | 
|  | //   If there's an entry, set the l. child of the var ref to the replacement expression. | 
|  | //   (We also pass through here when scanning assignments, but no harm is done, other | 
|  | //    than a slight wasted effort that seems hard to avoid.  Lookup will be null) | 
|  | n->fLeftChild = fSymbolTable->lookupNode(n->fText); | 
|  | break; | 
|  |  | 
|  | case doCheckVarDef: | 
|  | n = fNodeStack[fNodeStackPtr]; | 
|  | if (n->fLeftChild == NULL) { | 
|  | error(U_BRK_UNDEFINED_VARIABLE); | 
|  | returnVal = FALSE; | 
|  | } | 
|  | break; | 
|  |  | 
|  | case doExprFinished: | 
|  | break; | 
|  |  | 
|  | case doRuleErrorAssignExpr: | 
|  | error(U_BRK_ASSIGN_ERROR); | 
|  | returnVal = FALSE; | 
|  | break; | 
|  |  | 
|  | case doExit: | 
|  | returnVal = FALSE; | 
|  | break; | 
|  |  | 
|  | case doScanUnicodeSet: | 
|  | scanSet(); | 
|  | break; | 
|  |  | 
|  | default: | 
|  | error(U_BRK_INTERNAL_ERROR); | 
|  | returnVal = FALSE; | 
|  | break; | 
|  | } | 
|  | return returnVal; | 
|  | } | 
|  |  | 
|  |  | 
|  |  | 
|  |  | 
|  | //------------------------------------------------------------------------------ | 
|  | // | 
|  | //  Error         Report a rule parse error. | 
|  | //                Only report it if no previous error has been recorded. | 
|  | // | 
|  | //------------------------------------------------------------------------------ | 
|  | void RBBIRuleScanner::error(UErrorCode e) { | 
|  | if (U_SUCCESS(*fRB->fStatus)) { | 
|  | *fRB->fStatus = e; | 
|  | if (fRB->fParseError) { | 
|  | fRB->fParseError->line  = fLineNum; | 
|  | fRB->fParseError->offset = fCharNum; | 
|  | fRB->fParseError->preContext[0] = 0; | 
|  | fRB->fParseError->preContext[0] = 0; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  |  | 
|  |  | 
|  |  | 
|  | //------------------------------------------------------------------------------ | 
|  | // | 
|  | //  fixOpStack   The parse stack holds partially assembled chunks of the parse tree. | 
|  | //               An entry on the stack may be as small as a single setRef node, | 
|  | //               or as large as the parse tree | 
|  | //               for an entire expression (this will be the one item left on the stack | 
|  | //               when the parsing of an RBBI rule completes. | 
|  | // | 
|  | //               This function is called when a binary operator is encountered. | 
|  | //               It looks back up the stack for operators that are not yet associated | 
|  | //               with a right operand, and if the precedence of the stacked operator >= | 
|  | //               the precedence of the current operator, binds the operand left, | 
|  | //               to the previously encountered operator. | 
|  | // | 
|  | //------------------------------------------------------------------------------ | 
|  | void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) { | 
|  | RBBINode *n; | 
|  | // printNodeStack("entering fixOpStack()"); | 
|  | for (;;) { | 
|  | n = fNodeStack[fNodeStackPtr-1];   // an operator node | 
|  | if (n->fPrecedence == 0) { | 
|  | RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node"); | 
|  | error(U_BRK_INTERNAL_ERROR); | 
|  | return; | 
|  | } | 
|  |  | 
|  | if (n->fPrecedence < p || n->fPrecedence <= RBBINode::precLParen) { | 
|  | // The most recent operand goes with the current operator, | 
|  | //   not with the previously stacked one. | 
|  | break; | 
|  | } | 
|  | // Stack operator is a binary op  ( '|' or concatenation) | 
|  | //   TOS operand becomes right child of this operator. | 
|  | //   Resulting subexpression becomes the TOS operand. | 
|  | n->fRightChild = fNodeStack[fNodeStackPtr]; | 
|  | fNodeStack[fNodeStackPtr]->fParent = n; | 
|  | fNodeStackPtr--; | 
|  | // printNodeStack("looping in fixOpStack()   "); | 
|  | } | 
|  |  | 
|  | if (p <= RBBINode::precLParen) { | 
|  | // Scan is at a right paren or end of expression. | 
|  | //  The scanned item must match the stack, or else there was an error. | 
|  | //  Discard the left paren (or start expr) node from the stack, | 
|  | //  leaving the completed (sub)expression as TOS. | 
|  | if (n->fPrecedence != p) { | 
|  | // Right paren encountered matched start of expression node, or | 
|  | // end of expression matched with a left paren node. | 
|  | error(U_BRK_MISMATCHED_PAREN); | 
|  | } | 
|  | fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr]; | 
|  | fNodeStackPtr--; | 
|  | // Delete the now-discarded LParen or Start node. | 
|  | delete n; | 
|  | } | 
|  | // printNodeStack("leaving fixOpStack()"); | 
|  | } | 
|  |  | 
|  |  | 
|  |  | 
|  |  | 
|  | //------------------------------------------------------------------------------ | 
|  | // | 
|  | //   findSetFor    given a UnicodeString, | 
|  | //                  - find the corresponding Unicode Set  (uset node) | 
|  | //                         (create one if necessary) | 
|  | //                  - Set fLeftChild of the caller's node (should be a setRef node) | 
|  | //                         to the uset node | 
|  | //                 Maintain a hash table of uset nodes, so the same one is always used | 
|  | //                    for the same string. | 
|  | //                 If a "to adopt" set is provided and we haven't seen this key before, | 
|  | //                    add the provided set to the hash table. | 
|  | //                 If the string is one (32 bit) char in length, the set contains | 
|  | //                    just one element which is the char in question. | 
|  | //                 If the string is "any", return a set containing all chars. | 
|  | // | 
|  | //------------------------------------------------------------------------------ | 
|  | void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) { | 
|  |  | 
|  | RBBISetTableEl   *el; | 
|  |  | 
|  | // First check whether we've already cached a set for this string. | 
|  | // If so, just use the cached set in the new node. | 
|  | //   delete any set provided by the caller, since we own it. | 
|  | el = (RBBISetTableEl *)uhash_get(fSetTable, &s); | 
|  | if (el != NULL) { | 
|  | delete setToAdopt; | 
|  | node->fLeftChild = el->val; | 
|  | U_ASSERT(node->fLeftChild->fType == RBBINode::uset); | 
|  | return; | 
|  | } | 
|  |  | 
|  | // Haven't seen this set before. | 
|  | // If the caller didn't provide us with a prebuilt set, | 
|  | //   create a new UnicodeSet now. | 
|  | if (setToAdopt == NULL) { | 
|  | if (s.compare(kAny, -1) == 0) { | 
|  | setToAdopt = new UnicodeSet(0x000000, 0x10ffff); | 
|  | } else { | 
|  | UChar32 c; | 
|  | c = s.char32At(0); | 
|  | setToAdopt = new UnicodeSet(c, c); | 
|  | } | 
|  | } | 
|  |  | 
|  | // | 
|  | // Make a new uset node to refer to this UnicodeSet | 
|  | // This new uset node becomes the child of the caller's setReference node. | 
|  | // | 
|  | RBBINode *usetNode    = new RBBINode(RBBINode::uset); | 
|  | if (usetNode == NULL) { | 
|  | error(U_MEMORY_ALLOCATION_ERROR); | 
|  | return; | 
|  | } | 
|  | usetNode->fInputSet   = setToAdopt; | 
|  | usetNode->fParent     = node; | 
|  | node->fLeftChild      = usetNode; | 
|  | usetNode->fText = s; | 
|  |  | 
|  |  | 
|  | // | 
|  | // Add the new uset node to the list of all uset nodes. | 
|  | // | 
|  | fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus); | 
|  |  | 
|  |  | 
|  | // | 
|  | // Add the new set to the set hash table. | 
|  | // | 
|  | el      = (RBBISetTableEl *)uprv_malloc(sizeof(RBBISetTableEl)); | 
|  | UnicodeString *tkey = new UnicodeString(s); | 
|  | if (tkey == NULL || el == NULL || setToAdopt == NULL) { | 
|  | // Delete to avoid memory leak | 
|  | delete tkey; | 
|  | tkey = NULL; | 
|  | uprv_free(el); | 
|  | el = NULL; | 
|  | delete setToAdopt; | 
|  | setToAdopt = NULL; | 
|  |  | 
|  | error(U_MEMORY_ALLOCATION_ERROR); | 
|  | return; | 
|  | } | 
|  | el->key = tkey; | 
|  | el->val = usetNode; | 
|  | uhash_put(fSetTable, el->key, el, fRB->fStatus); | 
|  |  | 
|  | return; | 
|  | } | 
|  |  | 
|  |  | 
|  |  | 
|  | // | 
|  | //  Assorted Unicode character constants. | 
|  | //     Numeric because there is no portable way to enter them as literals. | 
|  | //     (Think EBCDIC). | 
|  | // | 
|  | static const UChar      chCR        = 0x0d;      // New lines, for terminating comments. | 
|  | static const UChar      chLF        = 0x0a; | 
|  | static const UChar      chNEL       = 0x85;      //    NEL newline variant | 
|  | static const UChar      chLS        = 0x2028;    //    Unicode Line Separator | 
|  | static const UChar      chApos      = 0x27;      //  single quote, for quoted chars. | 
|  | static const UChar      chPound     = 0x23;      // '#', introduces a comment. | 
|  | static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape | 
|  | static const UChar      chLParen    = 0x28; | 
|  | static const UChar      chRParen    = 0x29; | 
|  |  | 
|  |  | 
|  | //------------------------------------------------------------------------------ | 
|  | // | 
|  | //  stripRules    Return a rules string without unnecessary | 
|  | //                characters. | 
|  | // | 
|  | //------------------------------------------------------------------------------ | 
|  | UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) { | 
|  | UnicodeString strippedRules; | 
|  | int rulesLength = rules.length(); | 
|  | for (int idx = 0; idx < rulesLength; ) { | 
|  | UChar ch = rules[idx++]; | 
|  | if (ch == chPound) { | 
|  | while (idx < rulesLength | 
|  | && ch != chCR && ch != chLF && ch != chNEL) | 
|  | { | 
|  | ch = rules[idx++]; | 
|  | } | 
|  | } | 
|  | if (!u_isISOControl(ch)) { | 
|  | strippedRules.append(ch); | 
|  | } | 
|  | } | 
|  | // strippedRules = strippedRules.unescape(); | 
|  | return strippedRules; | 
|  | } | 
|  |  | 
|  |  | 
|  | //------------------------------------------------------------------------------ | 
|  | // | 
|  | //  nextCharLL    Low Level Next Char from rule input source. | 
|  | //                Get a char from the input character iterator, | 
|  | //                keep track of input position for error reporting. | 
|  | // | 
|  | //------------------------------------------------------------------------------ | 
|  | UChar32  RBBIRuleScanner::nextCharLL() { | 
|  | UChar32  ch; | 
|  |  | 
|  | if (fNextIndex >= fRB->fRules.length()) { | 
|  | return (UChar32)-1; | 
|  | } | 
|  | ch         = fRB->fRules.char32At(fNextIndex); | 
|  | fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1); | 
|  |  | 
|  | if (ch == chCR || | 
|  | ch == chNEL || | 
|  | ch == chLS   || | 
|  | ch == chLF && fLastChar != chCR) { | 
|  | // Character is starting a new line.  Bump up the line number, and | 
|  | //  reset the column to 0. | 
|  | fLineNum++; | 
|  | fCharNum=0; | 
|  | if (fQuoteMode) { | 
|  | error(U_BRK_NEW_LINE_IN_QUOTED_STRING); | 
|  | fQuoteMode = FALSE; | 
|  | } | 
|  | } | 
|  | else { | 
|  | // Character is not starting a new line.  Except in the case of a | 
|  | //   LF following a CR, increment the column position. | 
|  | if (ch != chLF) { | 
|  | fCharNum++; | 
|  | } | 
|  | } | 
|  | fLastChar = ch; | 
|  | return ch; | 
|  | } | 
|  |  | 
|  |  | 
|  | //------------------------------------------------------------------------------ | 
|  | // | 
|  | //   nextChar     for rules scanning.  At this level, we handle stripping | 
|  | //                out comments and processing backslash character escapes. | 
|  | //                The rest of the rules grammar is handled at the next level up. | 
|  | // | 
|  | //------------------------------------------------------------------------------ | 
|  | void RBBIRuleScanner::nextChar(RBBIRuleChar &c) { | 
|  |  | 
|  | // Unicode Character constants needed for the processing done by nextChar(), | 
|  | //   in hex because literals wont work on EBCDIC machines. | 
|  |  | 
|  | fScanIndex = fNextIndex; | 
|  | c.fChar    = nextCharLL(); | 
|  | c.fEscaped = FALSE; | 
|  |  | 
|  | // | 
|  | //  check for '' sequence. | 
|  | //  These are recognized in all contexts, whether in quoted text or not. | 
|  | // | 
|  | if (c.fChar == chApos) { | 
|  | if (fRB->fRules.char32At(fNextIndex) == chApos) { | 
|  | c.fChar    = nextCharLL();        // get nextChar officially so character counts | 
|  | c.fEscaped = TRUE;                //   stay correct. | 
|  | } | 
|  | else | 
|  | { | 
|  | // Single quote, by itself. | 
|  | //   Toggle quoting mode. | 
|  | //   Return either '('  or ')', because quotes cause a grouping of the quoted text. | 
|  | fQuoteMode = !fQuoteMode; | 
|  | if (fQuoteMode == TRUE) { | 
|  | c.fChar = chLParen; | 
|  | } else { | 
|  | c.fChar = chRParen; | 
|  | } | 
|  | c.fEscaped = FALSE;      // The paren that we return is not escaped. | 
|  | return; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (fQuoteMode) { | 
|  | c.fEscaped = TRUE; | 
|  | } | 
|  | else | 
|  | { | 
|  | // We are not in a 'quoted region' of the source. | 
|  | // | 
|  | if (c.fChar == chPound) { | 
|  | // Start of a comment.  Consume the rest of it. | 
|  | //  The new-line char that terminates the comment is always returned. | 
|  | //  It will be treated as white-space, and serves to break up anything | 
|  | //    that might otherwise incorrectly clump together with a comment in | 
|  | //    the middle (a variable name, for example.) | 
|  | for (;;) { | 
|  | c.fChar = nextCharLL(); | 
|  | if (c.fChar == (UChar32)-1 ||  // EOF | 
|  | c.fChar == chCR     || | 
|  | c.fChar == chLF     || | 
|  | c.fChar == chNEL    || | 
|  | c.fChar == chLS)       {break;} | 
|  | } | 
|  | } | 
|  | if (c.fChar == (UChar32)-1) { | 
|  | return; | 
|  | } | 
|  |  | 
|  | // | 
|  | //  check for backslash escaped characters. | 
|  | //  Use UnicodeString::unescapeAt() to handle them. | 
|  | // | 
|  | if (c.fChar == chBackSlash) { | 
|  | c.fEscaped = TRUE; | 
|  | int32_t startX = fNextIndex; | 
|  | c.fChar = fRB->fRules.unescapeAt(fNextIndex); | 
|  | if (fNextIndex == startX) { | 
|  | error(U_BRK_HEX_DIGITS_EXPECTED); | 
|  | } | 
|  | fCharNum += fNextIndex-startX; | 
|  | } | 
|  | } | 
|  | // putc(c.fChar, stdout); | 
|  | } | 
|  |  | 
|  | //------------------------------------------------------------------------------ | 
|  | // | 
|  | //  Parse RBBI rules.   The state machine for rules parsing is here. | 
|  | //                      The state tables are hand-written in the file rbbirpt.txt, | 
|  | //                      and converted to the form used here by a perl | 
|  | //                      script rbbicst.pl | 
|  | // | 
|  | //------------------------------------------------------------------------------ | 
|  | void RBBIRuleScanner::parse() { | 
|  | uint16_t                state; | 
|  | const RBBIRuleTableEl  *tableEl; | 
|  |  | 
|  | if (U_FAILURE(*fRB->fStatus)) { | 
|  | return; | 
|  | } | 
|  |  | 
|  | state = 1; | 
|  | nextChar(fC); | 
|  | // | 
|  | // Main loop for the rule parsing state machine. | 
|  | //   Runs once per state transition. | 
|  | //   Each time through optionally performs, depending on the state table, | 
|  | //      - an advance to the the next input char | 
|  | //      - an action to be performed. | 
|  | //      - pushing or popping a state to/from the local state return stack. | 
|  | // | 
|  | for (;;) { | 
|  | //  Bail out if anything has gone wrong. | 
|  | //  RBBI rule file parsing stops on the first error encountered. | 
|  | if (U_FAILURE(*fRB->fStatus)) { | 
|  | break; | 
|  | } | 
|  |  | 
|  | // Quit if state == 0.  This is the normal way to exit the state machine. | 
|  | // | 
|  | if (state == 0) { | 
|  | break; | 
|  | } | 
|  |  | 
|  | // Find the state table element that matches the input char from the rule, or the | 
|  | //    class of the input character.  Start with the first table row for this | 
|  | //    state, then linearly scan forward until we find a row that matches the | 
|  | //    character.  The last row for each state always matches all characters, so | 
|  | //    the search will stop there, if not before. | 
|  | // | 
|  | tableEl = &gRuleParseStateTable[state]; | 
|  | #ifdef RBBI_DEBUG | 
|  | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { | 
|  | RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d)    state=%s ", | 
|  | fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]); | 
|  | } | 
|  | #endif | 
|  |  | 
|  | for (;;) { | 
|  | #ifdef RBBI_DEBUG | 
|  | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf(".");} | 
|  | #endif | 
|  | if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE &&   tableEl->fCharClass == fC.fChar) { | 
|  | // Table row specified an individual character, not a set, and | 
|  | //   the input character is not escaped, and | 
|  | //   the input character matched it. | 
|  | break; | 
|  | } | 
|  | if (tableEl->fCharClass == 255) { | 
|  | // Table row specified default, match anything character class. | 
|  | break; | 
|  | } | 
|  | if (tableEl->fCharClass == 254 && fC.fEscaped)  { | 
|  | // Table row specified "escaped" and the char was escaped. | 
|  | break; | 
|  | } | 
|  | if (tableEl->fCharClass == 253 && fC.fEscaped && | 
|  | (fC.fChar == 0x50 || fC.fChar == 0x70 ))  { | 
|  | // Table row specified "escaped P" and the char is either 'p' or 'P'. | 
|  | break; | 
|  | } | 
|  | if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1)  { | 
|  | // Table row specified eof and we hit eof on the input. | 
|  | break; | 
|  | } | 
|  |  | 
|  | if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class && | 
|  | fC.fEscaped == FALSE &&                                      //   char is not escaped && | 
|  | fC.fChar != (UChar32)-1) {                                   //   char is not EOF | 
|  | if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) { | 
|  | // Table row specified a character class, or set of characters, | 
|  | //   and the current char matches it. | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | // No match on this row, advance to the next  row for this state, | 
|  | tableEl++; | 
|  | } | 
|  | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");} | 
|  |  | 
|  | // | 
|  | // We've found the row of the state table that matches the current input | 
|  | //   character from the rules string. | 
|  | // Perform any action specified  by this row in the state table. | 
|  | if (doParseActions((int32_t)tableEl->fAction) == FALSE) { | 
|  | // Break out of the state machine loop if the | 
|  | //   the action signalled some kind of error, or | 
|  | //   the action was to exit, occurs on normal end-of-rules-input. | 
|  | break; | 
|  | } | 
|  |  | 
|  | if (tableEl->fPushState != 0) { | 
|  | fStackPtr++; | 
|  | if (fStackPtr >= kStackSize) { | 
|  | error(U_BRK_INTERNAL_ERROR); | 
|  | RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow."); | 
|  | fStackPtr--; | 
|  | } | 
|  | fStack[fStackPtr] = tableEl->fPushState; | 
|  | } | 
|  |  | 
|  | if (tableEl->fNextChar) { | 
|  | nextChar(fC); | 
|  | } | 
|  |  | 
|  | // Get the next state from the table entry, or from the | 
|  | //   state stack if the next state was specified as "pop". | 
|  | if (tableEl->fNextState != 255) { | 
|  | state = tableEl->fNextState; | 
|  | } else { | 
|  | state = fStack[fStackPtr]; | 
|  | fStackPtr--; | 
|  | if (fStackPtr < 0) { | 
|  | error(U_BRK_INTERNAL_ERROR); | 
|  | RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow."); | 
|  | fStackPtr++; | 
|  | } | 
|  | } | 
|  |  | 
|  | } | 
|  |  | 
|  | // | 
|  | // If there were NO user specified reverse rules, set up the equivalent of ".*;" | 
|  | // | 
|  | if (fRB->fReverseTree == NULL) { | 
|  | fRB->fReverseTree  = pushNewNode(RBBINode::opStar); | 
|  | RBBINode  *operand = pushNewNode(RBBINode::setRef); | 
|  | findSetFor(kAny, operand); | 
|  | fRB->fReverseTree->fLeftChild = operand; | 
|  | operand->fParent              = fRB->fReverseTree; | 
|  | fNodeStackPtr -= 2; | 
|  | } | 
|  |  | 
|  |  | 
|  | // | 
|  | // Parsing of the input RBBI rules is complete. | 
|  | // We now have a parse tree for the rule expressions | 
|  | // and a list of all UnicodeSets that are referenced. | 
|  | // | 
|  | #ifdef RBBI_DEBUG | 
|  | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();} | 
|  | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree")) | 
|  | { | 
|  | RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n"); | 
|  | fRB->fForwardTree->printTree(TRUE); | 
|  | RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n"); | 
|  | fRB->fReverseTree->printTree(TRUE); | 
|  | RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n"); | 
|  | fRB->fSafeFwdTree->printTree(TRUE); | 
|  | RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n"); | 
|  | fRB->fSafeRevTree->printTree(TRUE); | 
|  | } | 
|  | #endif | 
|  | } | 
|  |  | 
|  |  | 
|  | //------------------------------------------------------------------------------ | 
|  | // | 
|  | //  printNodeStack     for debugging... | 
|  | // | 
|  | //------------------------------------------------------------------------------ | 
|  | #ifdef RBBI_DEBUG | 
|  | void RBBIRuleScanner::printNodeStack(const char *title) { | 
|  | int i; | 
|  | RBBIDebugPrintf("%s.  Dumping node stack...\n", title); | 
|  | for (i=fNodeStackPtr; i>0; i--) {fNodeStack[i]->printTree(TRUE);} | 
|  | } | 
|  | #endif | 
|  |  | 
|  |  | 
|  |  | 
|  |  | 
|  | //------------------------------------------------------------------------------ | 
|  | // | 
|  | //  pushNewNode   create a new RBBINode of the specified type and push it | 
|  | //                onto the stack of nodes. | 
|  | // | 
|  | //------------------------------------------------------------------------------ | 
|  | RBBINode  *RBBIRuleScanner::pushNewNode(RBBINode::NodeType  t) { | 
|  | fNodeStackPtr++; | 
|  | if (fNodeStackPtr >= kStackSize) { | 
|  | error(U_BRK_INTERNAL_ERROR); | 
|  | RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow."); | 
|  | *fRB->fStatus = U_BRK_INTERNAL_ERROR; | 
|  | return NULL; | 
|  | } | 
|  | fNodeStack[fNodeStackPtr] = new RBBINode(t); | 
|  | if (fNodeStack[fNodeStackPtr] == NULL) { | 
|  | *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR; | 
|  | } | 
|  | return fNodeStack[fNodeStackPtr]; | 
|  | } | 
|  |  | 
|  |  | 
|  |  | 
|  | //------------------------------------------------------------------------------ | 
|  | // | 
|  | //  scanSet    Construct a UnicodeSet from the text at the current scan | 
|  | //             position.  Advance the scan position to the first character | 
|  | //             after the set. | 
|  | // | 
|  | //             A new RBBI setref node referring to the set is pushed onto the node | 
|  | //             stack. | 
|  | // | 
|  | //             The scan position is normally under the control of the state machine | 
|  | //             that controls rule parsing.  UnicodeSets, however, are parsed by | 
|  | //             the UnicodeSet constructor, not by the RBBI rule parser. | 
|  | // | 
|  | //------------------------------------------------------------------------------ | 
|  | void RBBIRuleScanner::scanSet() { | 
|  | UnicodeSet    *uset; | 
|  | ParsePosition  pos; | 
|  | int            startPos; | 
|  | int            i; | 
|  |  | 
|  | if (U_FAILURE(*fRB->fStatus)) { | 
|  | return; | 
|  | } | 
|  |  | 
|  | pos.setIndex(fScanIndex); | 
|  | startPos = fScanIndex; | 
|  | UErrorCode localStatus = U_ZERO_ERROR; | 
|  | uset = new UnicodeSet(fRB->fRules, pos, USET_IGNORE_SPACE, | 
|  | fSymbolTable, | 
|  | localStatus); | 
|  | if (uset == NULL) { | 
|  | localStatus = U_MEMORY_ALLOCATION_ERROR; | 
|  | } | 
|  | if (U_FAILURE(localStatus)) { | 
|  | //  TODO:  Get more accurate position of the error from UnicodeSet's return info. | 
|  | //         UnicodeSet appears to not be reporting correctly at this time. | 
|  | #ifdef RBBI_DEBUG | 
|  | RBBIDebugPrintf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex()); | 
|  | #endif | 
|  | error(localStatus); | 
|  | delete uset; | 
|  | return; | 
|  | } | 
|  |  | 
|  | // Verify that the set contains at least one code point. | 
|  | // | 
|  | if (uset->isEmpty()) { | 
|  | // This set is empty. | 
|  | //  Make it an error, because it almost certainly is not what the user wanted. | 
|  | //  Also, avoids having to think about corner cases in the tree manipulation code | 
|  | //   that occurs later on. | 
|  | error(U_BRK_RULE_EMPTY_SET); | 
|  | delete uset; | 
|  | return; | 
|  | } | 
|  |  | 
|  |  | 
|  | // Advance the RBBI parse postion over the UnicodeSet pattern. | 
|  | //   Don't just set fScanIndex because the line/char positions maintained | 
|  | //   for error reporting would be thrown off. | 
|  | i = pos.getIndex(); | 
|  | for (;;) { | 
|  | if (fNextIndex >= i) { | 
|  | break; | 
|  | } | 
|  | nextCharLL(); | 
|  | } | 
|  |  | 
|  | if (U_SUCCESS(*fRB->fStatus)) { | 
|  | RBBINode         *n; | 
|  |  | 
|  | n = pushNewNode(RBBINode::setRef); | 
|  | n->fFirstPos = startPos; | 
|  | n->fLastPos  = fNextIndex; | 
|  | fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); | 
|  | //  findSetFor() serves several purposes here: | 
|  | //     - Adopts storage for the UnicodeSet, will be responsible for deleting. | 
|  | //     - Mantains collection of all sets in use, needed later for establishing | 
|  | //          character categories for run time engine. | 
|  | //     - Eliminates mulitiple instances of the same set. | 
|  | //     - Creates a new uset node if necessary (if this isn't a duplicate.) | 
|  | findSetFor(n->fText, n, uset); | 
|  | } | 
|  |  | 
|  | } | 
|  |  | 
|  | U_NAMESPACE_END | 
|  |  | 
|  | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |