| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ********************************************************************** |
| * Copyright (c) 2002-2016, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| */ |
| // |
| // rbbitblb.cpp |
| // |
| |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_BREAK_ITERATION |
| |
| #include "unicode/unistr.h" |
| #include "rbbitblb.h" |
| #include "rbbirb.h" |
| #include "rbbiscan.h" |
| #include "rbbisetb.h" |
| #include "rbbidata.h" |
| #include "cstring.h" |
| #include "uassert.h" |
| #include "uvectr32.h" |
| #include "cmemory.h" |
| |
| U_NAMESPACE_BEGIN |
| |
| const int32_t kMaxStateFor8BitsTable = 255; |
| |
| RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) : |
| fRB(rb), |
| fTree(*rootNode), |
| fStatus(&status), |
| fDStates(nullptr), |
| fSafeTable(nullptr) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| // fDStates is UVector<RBBIStateDescriptor *> |
| fDStates = new UVector(status); |
| if (U_SUCCESS(status) && fDStates == nullptr ) { |
| status = U_MEMORY_ALLOCATION_ERROR; |
| } |
| } |
| |
| |
| |
| RBBITableBuilder::~RBBITableBuilder() { |
| int i; |
| for (i=0; i<fDStates->size(); i++) { |
| delete (RBBIStateDescriptor *)fDStates->elementAt(i); |
| } |
| delete fDStates; |
| delete fSafeTable; |
| delete fLookAheadRuleMap; |
| } |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // RBBITableBuilder::buildForwardTable - This is the main function for building |
| // the DFA state transition table from the RBBI rules parse tree. |
| // |
| //----------------------------------------------------------------------------- |
| void RBBITableBuilder::buildForwardTable() { |
| |
| if (U_FAILURE(*fStatus)) { |
| return; |
| } |
| |
| // If there were no rules, just return. This situation can easily arise |
| // for the reverse rules. |
| if (fTree==NULL) { |
| return; |
| } |
| |
| // |
| // Walk through the tree, replacing any references to $variables with a copy of the |
| // parse tree for the substition expression. |
| // |
| fTree = fTree->flattenVariables(); |
| #ifdef RBBI_DEBUG |
| if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) { |
| RBBIDebugPuts("\nParse tree after flattening variable references."); |
| RBBINode::printTree(fTree, TRUE); |
| } |
| #endif |
| |
| // |
| // If the rules contained any references to {bof} |
| // add a {bof} <cat> <former root of tree> to the |
| // tree. Means that all matches must start out with the |
| // {bof} fake character. |
| // |
| if (fRB->fSetBuilder->sawBOF()) { |
| RBBINode *bofTop = new RBBINode(RBBINode::opCat); |
| RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar); |
| // Delete and exit if memory allocation failed. |
| if (bofTop == NULL || bofLeaf == NULL) { |
| *fStatus = U_MEMORY_ALLOCATION_ERROR; |
| delete bofTop; |
| delete bofLeaf; |
| return; |
| } |
| bofTop->fLeftChild = bofLeaf; |
| bofTop->fRightChild = fTree; |
| bofLeaf->fParent = bofTop; |
| bofLeaf->fVal = 2; // Reserved value for {bof}. |
| fTree = bofTop; |
| } |
| |
| // |
| // Add a unique right-end marker to the expression. |
| // Appears as a cat-node, left child being the original tree, |
| // right child being the end marker. |
| // |
| RBBINode *cn = new RBBINode(RBBINode::opCat); |
| // Exit if memory allocation failed. |
| if (cn == NULL) { |
| *fStatus = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| cn->fLeftChild = fTree; |
| fTree->fParent = cn; |
| RBBINode *endMarkerNode = cn->fRightChild = new RBBINode(RBBINode::endMark); |
| // Delete and exit if memory allocation failed. |
| if (cn->fRightChild == NULL) { |
| *fStatus = U_MEMORY_ALLOCATION_ERROR; |
| delete cn; |
| return; |
| } |
| cn->fRightChild->fParent = cn; |
| fTree = cn; |
| |
| // |
| // Replace all references to UnicodeSets with the tree for the equivalent |
| // expression. |
| // |
| fTree->flattenSets(); |
| #ifdef RBBI_DEBUG |
| if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) { |
| RBBIDebugPuts("\nParse tree after flattening Unicode Set references."); |
| RBBINode::printTree(fTree, TRUE); |
| } |
| #endif |
| |
| |
| // |
| // calculate the functions nullable, firstpos, lastpos and followpos on |
| // nodes in the parse tree. |
| // See the algorithm description in Aho. |
| // Understanding how this works by looking at the code alone will be |
| // nearly impossible. |
| // |
| calcNullable(fTree); |
| calcFirstPos(fTree); |
| calcLastPos(fTree); |
| calcFollowPos(fTree); |
| if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) { |
| RBBIDebugPuts("\n"); |
| printPosSets(fTree); |
| } |
| |
| // |
| // For "chained" rules, modify the followPos sets |
| // |
| if (fRB->fChainRules) { |
| calcChainedFollowPos(fTree, endMarkerNode); |
| } |
| |
| // |
| // BOF (start of input) test fixup. |
| // |
| if (fRB->fSetBuilder->sawBOF()) { |
| bofFixup(); |
| } |
| |
| // |
| // Build the DFA state transition tables. |
| // |
| buildStateTable(); |
| mapLookAheadRules(); |
| flagAcceptingStates(); |
| flagLookAheadStates(); |
| flagTaggedStates(); |
| |
| // |
| // Update the global table of rule status {tag} values |
| // The rule builder has a global vector of status values that are common |
| // for all tables. Merge the ones from this table into the global set. |
| // |
| mergeRuleStatusVals(); |
| } |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // calcNullable. Impossible to explain succinctly. See Aho, section 3.9 |
| // |
| //----------------------------------------------------------------------------- |
| void RBBITableBuilder::calcNullable(RBBINode *n) { |
| if (n == NULL) { |
| return; |
| } |
| if (n->fType == RBBINode::setRef || |
| n->fType == RBBINode::endMark ) { |
| // These are non-empty leaf node types. |
| n->fNullable = FALSE; |
| return; |
| } |
| |
| if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) { |
| // Lookahead marker node. It's a leaf, so no recursion on children. |
| // It's nullable because it does not match any literal text from the input stream. |
| n->fNullable = TRUE; |
| return; |
| } |
| |
| |
| // The node is not a leaf. |
| // Calculate nullable on its children. |
| calcNullable(n->fLeftChild); |
| calcNullable(n->fRightChild); |
| |
| // Apply functions from table 3.40 in Aho |
| if (n->fType == RBBINode::opOr) { |
| n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable; |
| } |
| else if (n->fType == RBBINode::opCat) { |
| n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable; |
| } |
| else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) { |
| n->fNullable = TRUE; |
| } |
| else { |
| n->fNullable = FALSE; |
| } |
| } |
| |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9 |
| // |
| //----------------------------------------------------------------------------- |
| void RBBITableBuilder::calcFirstPos(RBBINode *n) { |
| if (n == NULL) { |
| return; |
| } |
| if (n->fType == RBBINode::leafChar || |
| n->fType == RBBINode::endMark || |
| n->fType == RBBINode::lookAhead || |
| n->fType == RBBINode::tag) { |
| // These are non-empty leaf node types. |
| // Note: In order to maintain the sort invariant on the set, |
| // this function should only be called on a node whose set is |
| // empty to start with. |
| n->fFirstPosSet->addElement(n, *fStatus); |
| return; |
| } |
| |
| // The node is not a leaf. |
| // Calculate firstPos on its children. |
| calcFirstPos(n->fLeftChild); |
| calcFirstPos(n->fRightChild); |
| |
| // Apply functions from table 3.40 in Aho |
| if (n->fType == RBBINode::opOr) { |
| setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet); |
| setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet); |
| } |
| else if (n->fType == RBBINode::opCat) { |
| setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet); |
| if (n->fLeftChild->fNullable) { |
| setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet); |
| } |
| } |
| else if (n->fType == RBBINode::opStar || |
| n->fType == RBBINode::opQuestion || |
| n->fType == RBBINode::opPlus) { |
| setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet); |
| } |
| } |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9 |
| // |
| //----------------------------------------------------------------------------- |
| void RBBITableBuilder::calcLastPos(RBBINode *n) { |
| if (n == NULL) { |
| return; |
| } |
| if (n->fType == RBBINode::leafChar || |
| n->fType == RBBINode::endMark || |
| n->fType == RBBINode::lookAhead || |
| n->fType == RBBINode::tag) { |
| // These are non-empty leaf node types. |
| // Note: In order to maintain the sort invariant on the set, |
| // this function should only be called on a node whose set is |
| // empty to start with. |
| n->fLastPosSet->addElement(n, *fStatus); |
| return; |
| } |
| |
| // The node is not a leaf. |
| // Calculate lastPos on its children. |
| calcLastPos(n->fLeftChild); |
| calcLastPos(n->fRightChild); |
| |
| // Apply functions from table 3.40 in Aho |
| if (n->fType == RBBINode::opOr) { |
| setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet); |
| setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet); |
| } |
| else if (n->fType == RBBINode::opCat) { |
| setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet); |
| if (n->fRightChild->fNullable) { |
| setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet); |
| } |
| } |
| else if (n->fType == RBBINode::opStar || |
| n->fType == RBBINode::opQuestion || |
| n->fType == RBBINode::opPlus) { |
| setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet); |
| } |
| } |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9 |
| // |
| //----------------------------------------------------------------------------- |
| void RBBITableBuilder::calcFollowPos(RBBINode *n) { |
| if (n == NULL || |
| n->fType == RBBINode::leafChar || |
| n->fType == RBBINode::endMark) { |
| return; |
| } |
| |
| calcFollowPos(n->fLeftChild); |
| calcFollowPos(n->fRightChild); |
| |
| // Aho rule #1 |
| if (n->fType == RBBINode::opCat) { |
| RBBINode *i; // is 'i' in Aho's description |
| uint32_t ix; |
| |
| UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet; |
| |
| for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) { |
| i = (RBBINode *)LastPosOfLeftChild->elementAt(ix); |
| setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet); |
| } |
| } |
| |
| // Aho rule #2 |
| if (n->fType == RBBINode::opStar || |
| n->fType == RBBINode::opPlus) { |
| RBBINode *i; // again, n and i are the names from Aho's description. |
| uint32_t ix; |
| |
| for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) { |
| i = (RBBINode *)n->fLastPosSet->elementAt(ix); |
| setAdd(i->fFollowPos, n->fFirstPosSet); |
| } |
| } |
| |
| |
| |
| } |
| |
| //----------------------------------------------------------------------------- |
| // |
| // addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged |
| // as roots of a rule to a destination vector. |
| // |
| //----------------------------------------------------------------------------- |
| void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) { |
| if (node == NULL || U_FAILURE(*fStatus)) { |
| return; |
| } |
| if (node->fRuleRoot) { |
| dest->addElement(node, *fStatus); |
| // Note: rules cannot nest. If we found a rule start node, |
| // no child node can also be a start node. |
| return; |
| } |
| addRuleRootNodes(dest, node->fLeftChild); |
| addRuleRootNodes(dest, node->fRightChild); |
| } |
| |
| //----------------------------------------------------------------------------- |
| // |
| // calcChainedFollowPos. Modify the previously calculated followPos sets |
| // to implement rule chaining. NOT described by Aho |
| // |
| //----------------------------------------------------------------------------- |
| void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree, RBBINode *endMarkNode) { |
| |
| UVector leafNodes(*fStatus); |
| if (U_FAILURE(*fStatus)) { |
| return; |
| } |
| |
| // get a list all leaf nodes |
| tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus); |
| if (U_FAILURE(*fStatus)) { |
| return; |
| } |
| |
| // Collect all leaf nodes that can start matches for rules |
| // with inbound chaining enabled, which is the union of the |
| // firstPosition sets from each of the rule root nodes. |
| |
| UVector ruleRootNodes(*fStatus); |
| addRuleRootNodes(&ruleRootNodes, tree); |
| |
| UVector matchStartNodes(*fStatus); |
| for (int j=0; j<ruleRootNodes.size(); ++j) { |
| RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(j)); |
| if (node->fChainIn) { |
| setAdd(&matchStartNodes, node->fFirstPosSet); |
| } |
| } |
| if (U_FAILURE(*fStatus)) { |
| return; |
| } |
| |
| int32_t endNodeIx; |
| int32_t startNodeIx; |
| |
| for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) { |
| RBBINode *endNode = (RBBINode *)leafNodes.elementAt(endNodeIx); |
| |
| // Identify leaf nodes that correspond to overall rule match positions. |
| // These include the endMarkNode in their followPos sets. |
| // |
| // Note: do not consider other end marker nodes, those that are added to |
| // look-ahead rules. These can't chain; a match immediately stops |
| // further matching. This leaves exactly one end marker node, the one |
| // at the end of the complete tree. |
| |
| if (!endNode->fFollowPos->contains(endMarkNode)) { |
| continue; |
| } |
| |
| // We've got a node that can end a match. |
| |
| // !!LBCMNoChain implementation: If this node's val correspond to |
| // the Line Break $CM char class, don't chain from it. |
| // TODO: Remove this. !!LBCMNoChain is deprecated, and is not used |
| // by any of the standard ICU rules. |
| if (fRB->fLBCMNoChain) { |
| UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal); |
| if (c != -1) { |
| // c == -1 occurs with sets containing only the {eof} marker string. |
| ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK); |
| if (cLBProp == U_LB_COMBINING_MARK) { |
| continue; |
| } |
| } |
| } |
| |
| // Now iterate over the nodes that can start a match, looking for ones |
| // with the same char class as our ending node. |
| RBBINode *startNode; |
| for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) { |
| startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx); |
| if (startNode->fType != RBBINode::leafChar) { |
| continue; |
| } |
| |
| if (endNode->fVal == startNode->fVal) { |
| // The end val (character class) of one possible match is the |
| // same as the start of another. |
| |
| // Add all nodes from the followPos of the start node to the |
| // followPos set of the end node, which will have the effect of |
| // letting matches transition from a match state at endNode |
| // to the second char of a match starting with startNode. |
| setAdd(endNode->fFollowPos, startNode->fFollowPos); |
| } |
| } |
| } |
| } |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // bofFixup. Fixup for state tables that include {bof} beginning of input testing. |
| // Do an swizzle similar to chaining, modifying the followPos set of |
| // the bofNode to include the followPos nodes from other {bot} nodes |
| // scattered through the tree. |
| // |
| // This function has much in common with calcChainedFollowPos(). |
| // |
| //----------------------------------------------------------------------------- |
| void RBBITableBuilder::bofFixup() { |
| |
| if (U_FAILURE(*fStatus)) { |
| return; |
| } |
| |
| // The parse tree looks like this ... |
| // fTree root ---> <cat> |
| // / \ . |
| // <cat> <#end node> |
| // / \ . |
| // <bofNode> rest |
| // of tree |
| // |
| // We will be adding things to the followPos set of the <bofNode> |
| // |
| RBBINode *bofNode = fTree->fLeftChild->fLeftChild; |
| U_ASSERT(bofNode->fType == RBBINode::leafChar); |
| U_ASSERT(bofNode->fVal == 2); |
| |
| // Get all nodes that can be the start a match of the user-written rules |
| // (excluding the fake bofNode) |
| // We want the nodes that can start a match in the |
| // part labeled "rest of tree" |
| // |
| UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet; |
| |
| RBBINode *startNode; |
| int startNodeIx; |
| for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) { |
| startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx); |
| if (startNode->fType != RBBINode::leafChar) { |
| continue; |
| } |
| |
| if (startNode->fVal == bofNode->fVal) { |
| // We found a leaf node corresponding to a {bof} that was |
| // explicitly written into a rule. |
| // Add everything from the followPos set of this node to the |
| // followPos set of the fake bofNode at the start of the tree. |
| // |
| setAdd(bofNode->fFollowPos, startNode->fFollowPos); |
| } |
| } |
| } |
| |
| //----------------------------------------------------------------------------- |
| // |
| // buildStateTable() Determine the set of runtime DFA states and the |
| // transition tables for these states, by the algorithm |
| // of fig. 3.44 in Aho. |
| // |
| // Most of the comments are quotes of Aho's psuedo-code. |
| // |
| //----------------------------------------------------------------------------- |
| void RBBITableBuilder::buildStateTable() { |
| if (U_FAILURE(*fStatus)) { |
| return; |
| } |
| RBBIStateDescriptor *failState; |
| // Set it to NULL to avoid uninitialized warning |
| RBBIStateDescriptor *initialState = NULL; |
| // |
| // Add a dummy state 0 - the stop state. Not from Aho. |
| int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1; |
| failState = new RBBIStateDescriptor(lastInputSymbol, fStatus); |
| if (failState == NULL) { |
| *fStatus = U_MEMORY_ALLOCATION_ERROR; |
| goto ExitBuildSTdeleteall; |
| } |
| failState->fPositions = new UVector(*fStatus); |
| if (failState->fPositions == NULL) { |
| *fStatus = U_MEMORY_ALLOCATION_ERROR; |
| } |
| if (failState->fPositions == NULL || U_FAILURE(*fStatus)) { |
| goto ExitBuildSTdeleteall; |
| } |
| fDStates->addElement(failState, *fStatus); |
| if (U_FAILURE(*fStatus)) { |
| goto ExitBuildSTdeleteall; |
| } |
| |
| // initially, the only unmarked state in Dstates is firstpos(root), |
| // where toot is the root of the syntax tree for (r)#; |
| initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus); |
| if (initialState == NULL) { |
| *fStatus = U_MEMORY_ALLOCATION_ERROR; |
| } |
| if (U_FAILURE(*fStatus)) { |
| goto ExitBuildSTdeleteall; |
| } |
| initialState->fPositions = new UVector(*fStatus); |
| if (initialState->fPositions == NULL) { |
| *fStatus = U_MEMORY_ALLOCATION_ERROR; |
| } |
| if (U_FAILURE(*fStatus)) { |
| goto ExitBuildSTdeleteall; |
| } |
| setAdd(initialState->fPositions, fTree->fFirstPosSet); |
| fDStates->addElement(initialState, *fStatus); |
| if (U_FAILURE(*fStatus)) { |
| goto ExitBuildSTdeleteall; |
| } |
| |
| // while there is an unmarked state T in Dstates do begin |
| for (;;) { |
| RBBIStateDescriptor *T = NULL; |
| int32_t tx; |
| for (tx=1; tx<fDStates->size(); tx++) { |
| RBBIStateDescriptor *temp; |
| temp = (RBBIStateDescriptor *)fDStates->elementAt(tx); |
| if (temp->fMarked == FALSE) { |
| T = temp; |
| break; |
| } |
| } |
| if (T == NULL) { |
| break; |
| } |
| |
| // mark T; |
| T->fMarked = TRUE; |
| |
| // for each input symbol a do begin |
| int32_t a; |
| for (a = 1; a<=lastInputSymbol; a++) { |
| // let U be the set of positions that are in followpos(p) |
| // for some position p in T |
| // such that the symbol at position p is a; |
| UVector *U = NULL; |
| RBBINode *p; |
| int32_t px; |
| for (px=0; px<T->fPositions->size(); px++) { |
| p = (RBBINode *)T->fPositions->elementAt(px); |
| if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) { |
| if (U == NULL) { |
| U = new UVector(*fStatus); |
| if (U == NULL) { |
| *fStatus = U_MEMORY_ALLOCATION_ERROR; |
| goto ExitBuildSTdeleteall; |
| } |
| } |
| setAdd(U, p->fFollowPos); |
| } |
| } |
| |
| // if U is not empty and not in DStates then |
| int32_t ux = 0; |
| UBool UinDstates = FALSE; |
| if (U != NULL) { |
| U_ASSERT(U->size() > 0); |
| int ix; |
| for (ix=0; ix<fDStates->size(); ix++) { |
| RBBIStateDescriptor *temp2; |
| temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix); |
| if (setEquals(U, temp2->fPositions)) { |
| delete U; |
| U = temp2->fPositions; |
| ux = ix; |
| UinDstates = TRUE; |
| break; |
| } |
| } |
| |
| // Add U as an unmarked state to Dstates |
| if (!UinDstates) |
| { |
| RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus); |
| if (newState == NULL) { |
| *fStatus = U_MEMORY_ALLOCATION_ERROR; |
| } |
| if (U_FAILURE(*fStatus)) { |
| goto ExitBuildSTdeleteall; |
| } |
| newState->fPositions = U; |
| fDStates->addElement(newState, *fStatus); |
| if (U_FAILURE(*fStatus)) { |
| return; |
| } |
| ux = fDStates->size()-1; |
| } |
| |
| // Dtran[T, a] := U; |
| T->fDtran->setElementAt(ux, a); |
| } |
| } |
| } |
| return; |
| // delete local pointers only if error occured. |
| ExitBuildSTdeleteall: |
| delete initialState; |
| delete failState; |
| } |
| |
| |
| /** |
| * mapLookAheadRules |
| * |
| */ |
| void RBBITableBuilder::mapLookAheadRules() { |
| fLookAheadRuleMap = new UVector32(fRB->fScanner->numRules() + 1, *fStatus); |
| if (fLookAheadRuleMap == nullptr) { |
| *fStatus = U_MEMORY_ALLOCATION_ERROR; |
| } |
| if (U_FAILURE(*fStatus)) { |
| return; |
| } |
| fLookAheadRuleMap->setSize(fRB->fScanner->numRules() + 1); |
| |
| for (int32_t n=0; n<fDStates->size(); n++) { |
| RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); |
| int32_t laSlotForState = 0; |
| |
| // Establish the look-ahead slot for this state, if the state covers |
| // any look-ahead nodes - corresponding to the '/' in look-ahead rules. |
| |
| // If any of the look-ahead nodes already have a slot assigned, use it, |
| // otherwise assign a new one. |
| |
| bool sawLookAheadNode = false; |
| for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) { |
| RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos)); |
| if (node->fType != RBBINode::NodeType::lookAhead) { |
| continue; |
| } |
| sawLookAheadNode = true; |
| int32_t ruleNum = node->fVal; // Set when rule was originally parsed. |
| U_ASSERT(ruleNum < fLookAheadRuleMap->size()); |
| U_ASSERT(ruleNum > 0); |
| int32_t laSlot = fLookAheadRuleMap->elementAti(ruleNum); |
| if (laSlot != 0) { |
| if (laSlotForState == 0) { |
| laSlotForState = laSlot; |
| } else { |
| // TODO: figure out if this can fail, change to setting an error code if so. |
| U_ASSERT(laSlot == laSlotForState); |
| } |
| } |
| } |
| if (!sawLookAheadNode) { |
| continue; |
| } |
| |
| if (laSlotForState == 0) { |
| laSlotForState = ++fLASlotsInUse; |
| } |
| |
| // For each look ahead node covered by this state, |
| // set the mapping from the node's rule number to the look ahead slot. |
| // There can be multiple nodes/rule numbers going to the same la slot. |
| |
| for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) { |
| RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos)); |
| if (node->fType != RBBINode::NodeType::lookAhead) { |
| continue; |
| } |
| int32_t ruleNum = node->fVal; // Set when rule was originally parsed. |
| int32_t existingVal = fLookAheadRuleMap->elementAti(ruleNum); |
| (void)existingVal; |
| U_ASSERT(existingVal == 0 || existingVal == laSlotForState); |
| fLookAheadRuleMap->setElementAt(laSlotForState, ruleNum); |
| } |
| } |
| |
| } |
| |
| //----------------------------------------------------------------------------- |
| // |
| // flagAcceptingStates Identify accepting states. |
| // First get a list of all of the end marker nodes. |
| // Then, for each state s, |
| // if s contains one of the end marker nodes in its list of tree positions then |
| // s is an accepting state. |
| // |
| //----------------------------------------------------------------------------- |
| void RBBITableBuilder::flagAcceptingStates() { |
| if (U_FAILURE(*fStatus)) { |
| return; |
| } |
| UVector endMarkerNodes(*fStatus); |
| RBBINode *endMarker; |
| int32_t i; |
| int32_t n; |
| |
| if (U_FAILURE(*fStatus)) { |
| return; |
| } |
| |
| fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus); |
| if (U_FAILURE(*fStatus)) { |
| return; |
| } |
| |
| for (i=0; i<endMarkerNodes.size(); i++) { |
| endMarker = (RBBINode *)endMarkerNodes.elementAt(i); |
| for (n=0; n<fDStates->size(); n++) { |
| RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); |
| if (sd->fPositions->indexOf(endMarker) >= 0) { |
| // Any non-zero value for fAccepting means this is an accepting node. |
| // The value is what will be returned to the user as the break status. |
| // If no other value was specified, force it to ACCEPTING_UNCONDITIONAL (1). |
| |
| if (sd->fAccepting==0) { |
| // State hasn't been marked as accepting yet. Do it now. |
| sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal); |
| if (sd->fAccepting == 0) { |
| sd->fAccepting = ACCEPTING_UNCONDITIONAL; |
| } |
| } |
| if (sd->fAccepting==ACCEPTING_UNCONDITIONAL && endMarker->fVal != 0) { |
| // Both lookahead and non-lookahead accepting for this state. |
| // Favor the look-ahead, because a look-ahead match needs to |
| // immediately stop the run-time engine. First match, not longest. |
| sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal); |
| } |
| // implicit else: |
| // if sd->fAccepting already had a value other than 0 or 1, leave it be. |
| } |
| } |
| } |
| } |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // flagLookAheadStates Very similar to flagAcceptingStates, above. |
| // |
| //----------------------------------------------------------------------------- |
| void RBBITableBuilder::flagLookAheadStates() { |
| if (U_FAILURE(*fStatus)) { |
| return; |
| } |
| UVector lookAheadNodes(*fStatus); |
| RBBINode *lookAheadNode; |
| int32_t i; |
| int32_t n; |
| |
| fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus); |
| if (U_FAILURE(*fStatus)) { |
| return; |
| } |
| for (i=0; i<lookAheadNodes.size(); i++) { |
| lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i); |
| U_ASSERT(lookAheadNode->fType == RBBINode::NodeType::lookAhead); |
| |
| for (n=0; n<fDStates->size(); n++) { |
| RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); |
| int32_t positionsIdx = sd->fPositions->indexOf(lookAheadNode); |
| if (positionsIdx >= 0) { |
| U_ASSERT(lookAheadNode == sd->fPositions->elementAt(positionsIdx)); |
| uint32_t lookaheadSlot = fLookAheadRuleMap->elementAti(lookAheadNode->fVal); |
| U_ASSERT(sd->fLookAhead == 0 || sd->fLookAhead == lookaheadSlot); |
| // if (sd->fLookAhead != 0 && sd->fLookAhead != lookaheadSlot) { |
| // printf("%s:%d Bingo. sd->fLookAhead:%d lookaheadSlot:%d\n", |
| // __FILE__, __LINE__, sd->fLookAhead, lookaheadSlot); |
| // } |
| sd->fLookAhead = lookaheadSlot; |
| } |
| } |
| } |
| } |
| |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // flagTaggedStates |
| // |
| //----------------------------------------------------------------------------- |
| void RBBITableBuilder::flagTaggedStates() { |
| if (U_FAILURE(*fStatus)) { |
| return; |
| } |
| UVector tagNodes(*fStatus); |
| RBBINode *tagNode; |
| int32_t i; |
| int32_t n; |
| |
| if (U_FAILURE(*fStatus)) { |
| return; |
| } |
| fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus); |
| if (U_FAILURE(*fStatus)) { |
| return; |
| } |
| for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em) |
| tagNode = (RBBINode *)tagNodes.elementAt(i); |
| |
| for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table) |
| RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); |
| if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t |
| sortedAdd(&sd->fTagVals, tagNode->fVal); |
| } |
| } |
| } |
| } |
| |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // mergeRuleStatusVals |
| // |
| // Update the global table of rule status {tag} values |
| // The rule builder has a global vector of status values that are common |
| // for all tables. Merge the ones from this table into the global set. |
| // |
| //----------------------------------------------------------------------------- |
| void RBBITableBuilder::mergeRuleStatusVals() { |
| // |
| // The basic outline of what happens here is this... |
| // |
| // for each state in this state table |
| // if the status tag list for this state is in the global statuses list |
| // record where and |
| // continue with the next state |
| // else |
| // add the tag list for this state to the global list. |
| // |
| int i; |
| int n; |
| |
| // Pre-set a single tag of {0} into the table. |
| // We will need this as a default, for rule sets with no explicit tagging. |
| if (fRB->fRuleStatusVals->size() == 0) { |
| fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group |
| fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero |
| } |
| |
| // For each state |
| for (n=0; n<fDStates->size(); n++) { |
| RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); |
| UVector *thisStatesTagValues = sd->fTagVals; |
| if (thisStatesTagValues == NULL) { |
| // No tag values are explicitly associated with this state. |
| // Set the default tag value. |
| sd->fTagsIdx = 0; |
| continue; |
| } |
| |
| // There are tag(s) associated with this state. |
| // fTagsIdx will be the index into the global tag list for this state's tag values. |
| // Initial value of -1 flags that we haven't got it set yet. |
| sd->fTagsIdx = -1; |
| int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list |
| int32_t nextTagGroupStart = 0; |
| |
| // Loop runs once per group of tags in the global list |
| while (nextTagGroupStart < fRB->fRuleStatusVals->size()) { |
| thisTagGroupStart = nextTagGroupStart; |
| nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1; |
| if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) { |
| // The number of tags for this state is different from |
| // the number of tags in this group from the global list. |
| // Continue with the next group from the global list. |
| continue; |
| } |
| // The lengths match, go ahead and compare the actual tag values |
| // between this state and the group from the global list. |
| for (i=0; i<thisStatesTagValues->size(); i++) { |
| if (thisStatesTagValues->elementAti(i) != |
| fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) { |
| // Mismatch. |
| break; |
| } |
| } |
| |
| if (i == thisStatesTagValues->size()) { |
| // We found a set of tag values in the global list that match |
| // those for this state. Use them. |
| sd->fTagsIdx = thisTagGroupStart; |
| break; |
| } |
| } |
| |
| if (sd->fTagsIdx == -1) { |
| // No suitable entry in the global tag list already. Add one |
| sd->fTagsIdx = fRB->fRuleStatusVals->size(); |
| fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus); |
| for (i=0; i<thisStatesTagValues->size(); i++) { |
| fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus); |
| } |
| } |
| } |
| } |
| |
| |
| |
| |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // sortedAdd Add a value to a vector of sorted values (ints). |
| // Do not replicate entries; if the value is already there, do not |
| // add a second one. |
| // Lazily create the vector if it does not already exist. |
| // |
| //----------------------------------------------------------------------------- |
| void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) { |
| int32_t i; |
| |
| if (*vector == NULL) { |
| *vector = new UVector(*fStatus); |
| } |
| if (*vector == NULL || U_FAILURE(*fStatus)) { |
| return; |
| } |
| UVector *vec = *vector; |
| int32_t vSize = vec->size(); |
| for (i=0; i<vSize; i++) { |
| int32_t valAtI = vec->elementAti(i); |
| if (valAtI == val) { |
| // The value is already in the vector. Don't add it again. |
| return; |
| } |
| if (valAtI > val) { |
| break; |
| } |
| } |
| vec->insertElementAt(val, i, *fStatus); |
| } |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // setAdd Set operation on UVector |
| // dest = dest union source |
| // Elements may only appear once and must be sorted. |
| // |
| //----------------------------------------------------------------------------- |
| void RBBITableBuilder::setAdd(UVector *dest, UVector *source) { |
| int32_t destOriginalSize = dest->size(); |
| int32_t sourceSize = source->size(); |
| int32_t di = 0; |
| MaybeStackArray<void *, 16> destArray, sourceArray; // Handle small cases without malloc |
| void **destPtr, **sourcePtr; |
| void **destLim, **sourceLim; |
| |
| if (destOriginalSize > destArray.getCapacity()) { |
| if (destArray.resize(destOriginalSize) == NULL) { |
| return; |
| } |
| } |
| destPtr = destArray.getAlias(); |
| destLim = destPtr + destOriginalSize; // destArray.getArrayLimit()? |
| |
| if (sourceSize > sourceArray.getCapacity()) { |
| if (sourceArray.resize(sourceSize) == NULL) { |
| return; |
| } |
| } |
| sourcePtr = sourceArray.getAlias(); |
| sourceLim = sourcePtr + sourceSize; // sourceArray.getArrayLimit()? |
| |
| // Avoid multiple "get element" calls by getting the contents into arrays |
| (void) dest->toArray(destPtr); |
| (void) source->toArray(sourcePtr); |
| |
| dest->setSize(sourceSize+destOriginalSize, *fStatus); |
| |
| while (sourcePtr < sourceLim && destPtr < destLim) { |
| if (*destPtr == *sourcePtr) { |
| dest->setElementAt(*sourcePtr++, di++); |
| destPtr++; |
| } |
| // This check is required for machines with segmented memory, like i5/OS. |
| // Direct pointer comparison is not recommended. |
| else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) { |
| dest->setElementAt(*destPtr++, di++); |
| } |
| else { /* *sourcePtr < *destPtr */ |
| dest->setElementAt(*sourcePtr++, di++); |
| } |
| } |
| |
| // At most one of these two cleanup loops will execute |
| while (destPtr < destLim) { |
| dest->setElementAt(*destPtr++, di++); |
| } |
| while (sourcePtr < sourceLim) { |
| dest->setElementAt(*sourcePtr++, di++); |
| } |
| |
| dest->setSize(di, *fStatus); |
| } |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // setEqual Set operation on UVector. |
| // Compare for equality. |
| // Elements must be sorted. |
| // |
| //----------------------------------------------------------------------------- |
| UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) { |
| return a->equals(*b); |
| } |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos |
| // for each node in the tree. |
| // |
| //----------------------------------------------------------------------------- |
| #ifdef RBBI_DEBUG |
| void RBBITableBuilder::printPosSets(RBBINode *n) { |
| if (n==NULL) { |
| return; |
| } |
| printf("\n"); |
| RBBINode::printNodeHeader(); |
| RBBINode::printNode(n); |
| RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"TRUE":"FALSE"); |
| |
| RBBIDebugPrintf(" firstpos: "); |
| printSet(n->fFirstPosSet); |
| |
| RBBIDebugPrintf(" lastpos: "); |
| printSet(n->fLastPosSet); |
| |
| RBBIDebugPrintf(" followpos: "); |
| printSet(n->fFollowPos); |
| |
| printPosSets(n->fLeftChild); |
| printPosSets(n->fRightChild); |
| } |
| #endif |
| |
| // |
| // findDuplCharClassFrom() |
| // |
| bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) { |
| int32_t numStates = fDStates->size(); |
| int32_t numCols = fRB->fSetBuilder->getNumCharCategories(); |
| |
| for (; categories->first < numCols-1; categories->first++) { |
| // Note: dictionary & non-dictionary columns cannot be merged. |
| // The limitSecond value prevents considering mixed pairs. |
| // Dictionary categories are >= DictCategoriesStart. |
| // Non dict categories are < DictCategoriesStart. |
| int limitSecond = categories->first < fRB->fSetBuilder->getDictCategoriesStart() ? |
| fRB->fSetBuilder->getDictCategoriesStart() : numCols; |
| for (categories->second=categories->first+1; categories->second < limitSecond; categories->second++) { |
| // Initialized to different values to prevent returning true if numStates = 0 (implies no duplicates). |
| uint16_t table_base = 0; |
| uint16_t table_dupl = 1; |
| for (int32_t state=0; state<numStates; state++) { |
| RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state); |
| table_base = (uint16_t)sd->fDtran->elementAti(categories->first); |
| table_dupl = (uint16_t)sd->fDtran->elementAti(categories->second); |
| if (table_base != table_dupl) { |
| break; |
| } |
| } |
| if (table_base == table_dupl) { |
| return true; |
| } |
| } |
| } |
| return false; |
| } |
| |
| |
| // |
| // removeColumn() |
| // |
| void RBBITableBuilder::removeColumn(int32_t column) { |
| int32_t numStates = fDStates->size(); |
| for (int32_t state=0; state<numStates; state++) { |
| RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state); |
| U_ASSERT(column < sd->fDtran->size()); |
| sd->fDtran->removeElementAt(column); |
| } |
| } |
| |
| /* |
| * findDuplicateState |
| */ |
| bool RBBITableBuilder::findDuplicateState(IntPair *states) { |
| int32_t numStates = fDStates->size(); |
| int32_t numCols = fRB->fSetBuilder->getNumCharCategories(); |
| |
| for (; states->first<numStates-1; states->first++) { |
| RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(states->first); |
| for (states->second=states->first+1; states->second<numStates; states->second++) { |
| RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(states->second); |
| if (firstSD->fAccepting != duplSD->fAccepting || |
| firstSD->fLookAhead != duplSD->fLookAhead || |
| firstSD->fTagsIdx != duplSD->fTagsIdx) { |
| continue; |
| } |
| bool rowsMatch = true; |
| for (int32_t col=0; col < numCols; ++col) { |
| int32_t firstVal = firstSD->fDtran->elementAti(col); |
| int32_t duplVal = duplSD->fDtran->elementAti(col); |
| if (!((firstVal == duplVal) || |
| ((firstVal == states->first || firstVal == states->second) && |
| (duplVal == states->first || duplVal == states->second)))) { |
| rowsMatch = false; |
| break; |
| } |
| } |
| if (rowsMatch) { |
| return true; |
| } |
| } |
| } |
| return false; |
| } |
| |
| |
| bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) { |
| int32_t numStates = fSafeTable->size(); |
| |
| for (; states->first<numStates-1; states->first++) { |
| UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first)); |
| for (states->second=states->first+1; states->second<numStates; states->second++) { |
| UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second)); |
| bool rowsMatch = true; |
| int32_t numCols = firstRow->length(); |
| for (int32_t col=0; col < numCols; ++col) { |
| int32_t firstVal = firstRow->charAt(col); |
| int32_t duplVal = duplRow->charAt(col); |
| if (!((firstVal == duplVal) || |
| ((firstVal == states->first || firstVal == states->second) && |
| (duplVal == states->first || duplVal == states->second)))) { |
| rowsMatch = false; |
| break; |
| } |
| } |
| if (rowsMatch) { |
| return true; |
| } |
| } |
| } |
| return false; |
| } |
| |
| |
| void RBBITableBuilder::removeState(IntPair duplStates) { |
| const int32_t keepState = duplStates.first; |
| const int32_t duplState = duplStates.second; |
| U_ASSERT(keepState < duplState); |
| U_ASSERT(duplState < fDStates->size()); |
| |
| RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState); |
| fDStates->removeElementAt(duplState); |
| delete duplSD; |
| |
| int32_t numStates = fDStates->size(); |
| int32_t numCols = fRB->fSetBuilder->getNumCharCategories(); |
| for (int32_t state=0; state<numStates; ++state) { |
| RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state); |
| for (int32_t col=0; col<numCols; col++) { |
| int32_t existingVal = sd->fDtran->elementAti(col); |
| int32_t newVal = existingVal; |
| if (existingVal == duplState) { |
| newVal = keepState; |
| } else if (existingVal > duplState) { |
| newVal = existingVal - 1; |
| } |
| sd->fDtran->setElementAt(newVal, col); |
| } |
| } |
| } |
| |
| void RBBITableBuilder::removeSafeState(IntPair duplStates) { |
| const int32_t keepState = duplStates.first; |
| const int32_t duplState = duplStates.second; |
| U_ASSERT(keepState < duplState); |
| U_ASSERT(duplState < fSafeTable->size()); |
| |
| fSafeTable->removeElementAt(duplState); // Note that fSafeTable has a deleter function |
| // and will auto-delete the removed element. |
| int32_t numStates = fSafeTable->size(); |
| for (int32_t state=0; state<numStates; ++state) { |
| UnicodeString *sd = (UnicodeString *)fSafeTable->elementAt(state); |
| int32_t numCols = sd->length(); |
| for (int32_t col=0; col<numCols; col++) { |
| int32_t existingVal = sd->charAt(col); |
| int32_t newVal = existingVal; |
| if (existingVal == duplState) { |
| newVal = keepState; |
| } else if (existingVal > duplState) { |
| newVal = existingVal - 1; |
| } |
| sd->setCharAt(col, static_cast<char16_t>(newVal)); |
| } |
| } |
| } |
| |
| |
| /* |
| * RemoveDuplicateStates |
| */ |
| int32_t RBBITableBuilder::removeDuplicateStates() { |
| IntPair dupls = {3, 0}; |
| int32_t numStatesRemoved = 0; |
| |
| while (findDuplicateState(&dupls)) { |
| // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second); |
| removeState(dupls); |
| ++numStatesRemoved; |
| } |
| return numStatesRemoved; |
| } |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // getTableSize() Calculate the size of the runtime form of this |
| // state transition table. |
| // |
| //----------------------------------------------------------------------------- |
| int32_t RBBITableBuilder::getTableSize() const { |
| int32_t size = 0; |
| int32_t numRows; |
| int32_t numCols; |
| int32_t rowSize; |
| |
| if (fTree == NULL) { |
| return 0; |
| } |
| |
| size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table. |
| |
| numRows = fDStates->size(); |
| numCols = fRB->fSetBuilder->getNumCharCategories(); |
| |
| if (use8BitsForTable()) { |
| rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols; |
| } else { |
| rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols; |
| } |
| size += numRows * rowSize; |
| return size; |
| } |
| |
| bool RBBITableBuilder::use8BitsForTable() const { |
| return fDStates->size() <= kMaxStateFor8BitsTable; |
| } |
| |
| //----------------------------------------------------------------------------- |
| // |
| // exportTable() export the state transition table in the format required |
| // by the runtime engine. getTableSize() bytes of memory |
| // must be available at the output address "where". |
| // |
| //----------------------------------------------------------------------------- |
| void RBBITableBuilder::exportTable(void *where) { |
| RBBIStateTable *table = (RBBIStateTable *)where; |
| uint32_t state; |
| int col; |
| |
| if (U_FAILURE(*fStatus) || fTree == NULL) { |
| return; |
| } |
| |
| int32_t catCount = fRB->fSetBuilder->getNumCharCategories(); |
| if (catCount > 0x7fff || |
| fDStates->size() > 0x7fff) { |
| *fStatus = U_BRK_INTERNAL_ERROR; |
| return; |
| } |
| |
| table->fNumStates = fDStates->size(); |
| table->fDictCategoriesStart = fRB->fSetBuilder->getDictCategoriesStart(); |
| table->fLookAheadResultsSize = fLASlotsInUse == ACCEPTING_UNCONDITIONAL ? 0 : fLASlotsInUse + 1; |
| table->fFlags = 0; |
| if (use8BitsForTable()) { |
| table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount; |
| table->fFlags |= RBBI_8BITS_ROWS; |
| } else { |
| table->fRowLen = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount; |
| } |
| if (fRB->fLookAheadHardBreak) { |
| table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK; |
| } |
| if (fRB->fSetBuilder->sawBOF()) { |
| table->fFlags |= RBBI_BOF_REQUIRED; |
| } |
| |
| for (state=0; state<table->fNumStates; state++) { |
| RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state); |
| RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen); |
| if (use8BitsForTable()) { |
| U_ASSERT (sd->fAccepting <= 255); |
| U_ASSERT (sd->fLookAhead <= 255); |
| U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 255); |
| RBBIStateTableRow8 *r8 = (RBBIStateTableRow8*)row; |
| r8->fAccepting = sd->fAccepting; |
| r8->fLookAhead = sd->fLookAhead; |
| r8->fTagsIdx = sd->fTagsIdx; |
| for (col=0; col<catCount; col++) { |
| U_ASSERT (sd->fDtran->elementAti(col) <= kMaxStateFor8BitsTable); |
| r8->fNextState[col] = sd->fDtran->elementAti(col); |
| } |
| } else { |
| U_ASSERT (sd->fAccepting <= 0xffff); |
| U_ASSERT (sd->fLookAhead <= 0xffff); |
| U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 0xffff); |
| row->r16.fAccepting = sd->fAccepting; |
| row->r16.fLookAhead = sd->fLookAhead; |
| row->r16.fTagsIdx = sd->fTagsIdx; |
| for (col=0; col<catCount; col++) { |
| row->r16.fNextState[col] = sd->fDtran->elementAti(col); |
| } |
| } |
| } |
| } |
| |
| |
| /** |
| * Synthesize a safe state table from the main state table. |
| */ |
| void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) { |
| // The safe table creation has three steps: |
| |
| // 1. Identifiy pairs of character classes that are "safe." Safe means that boundaries |
| // following the pair do not depend on context or state before the pair. To test |
| // whether a pair is safe, run it through the main forward state table, starting |
| // from each state. If the the final state is the same, no matter what the starting state, |
| // the pair is safe. |
| // |
| // 2. Build a state table that recognizes the safe pairs. It's similar to their |
| // forward table, with a column for each input character [class], and a row for |
| // each state. Row 1 is the start state, and row 0 is the stop state. Initially |
| // create an additional state for each input character category; being in |
| // one of these states means that the character has been seen, and is potentially |
| // the first of a pair. In each of these rows, the entry for the second character |
| // of a safe pair is set to the stop state (0), indicating that a match was found. |
| // All other table entries are set to the state corresponding the current input |
| // character, allowing that charcter to be the of a start following pair. |
| // |
| // Because the safe rules are to be run in reverse, moving backwards in the text, |
| // the first and second pair categories are swapped when building the table. |
| // |
| // 3. Compress the table. There are typically many rows (states) that are |
| // equivalent - that have zeroes (match completed) in the same columns - |
| // and can be folded together. |
| |
| // Each safe pair is stored as two UChars in the safePair string. |
| UnicodeString safePairs; |
| |
| int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories(); |
| int32_t numStates = fDStates->size(); |
| |
| for (int32_t c1=0; c1<numCharClasses; ++c1) { |
| for (int32_t c2=0; c2 < numCharClasses; ++c2) { |
| int32_t wantedEndState = -1; |
| int32_t endState = 0; |
| for (int32_t startState = 1; startState < numStates; ++startState) { |
| RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState)); |
| int32_t s2 = startStateD->fDtran->elementAti(c1); |
| RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2)); |
| endState = s2StateD->fDtran->elementAti(c2); |
| if (wantedEndState < 0) { |
| wantedEndState = endState; |
| } else { |
| if (wantedEndState != endState) { |
| break; |
| } |
| } |
| } |
| if (wantedEndState == endState) { |
| safePairs.append((char16_t)c1); |
| safePairs.append((char16_t)c2); |
| // printf("(%d, %d) ", c1, c2); |
| } |
| } |
| // printf("\n"); |
| } |
| |
| // Populate the initial safe table. |
| // The table as a whole is UVector<UnicodeString> |
| // Each row is represented by a UnicodeString, being used as a Vector<int16>. |
| // Row 0 is the stop state. |
| // Row 1 is the start sate. |
| // Row 2 and beyond are other states, initially one per char class, but |
| // after initial construction, many of the states will be combined, compacting the table. |
| // The String holds the nextState data only. The four leading fields of a row, fAccepting, |
| // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building. |
| |
| U_ASSERT(fSafeTable == nullptr); |
| fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status); |
| for (int32_t row=0; row<numCharClasses + 2; ++row) { |
| fSafeTable->addElement(new UnicodeString(numCharClasses, 0, numCharClasses+4), status); |
| } |
| |
| // From the start state, each input char class transitions to the state for that input. |
| UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1)); |
| for (int32_t charClass=0; charClass < numCharClasses; ++charClass) { |
| // Note: +2 for the start & stop state. |
| startState.setCharAt(charClass, static_cast<char16_t>(charClass+2)); |
| } |
| |
| // Initially make every other state table row look like the start state row, |
| for (int32_t row=2; row<numCharClasses+2; ++row) { |
| UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row)); |
| rowState = startState; // UnicodeString assignment, copies contents. |
| } |
| |
| // Run through the safe pairs, set the next state to zero when pair has been seen. |
| // Zero being the stop state, meaning we found a safe point. |
| for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) { |
| int32_t c1 = safePairs.charAt(pairIdx); |
| int32_t c2 = safePairs.charAt(pairIdx + 1); |
| |
| UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2)); |
| rowState.setCharAt(c1, 0); |
| } |
| |
| // Remove duplicate or redundant rows from the table. |
| IntPair states = {1, 0}; |
| while (findDuplicateSafeState(&states)) { |
| // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second); |
| removeSafeState(states); |
| } |
| } |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // getSafeTableSize() Calculate the size of the runtime form of this |
| // safe state table. |
| // |
| //----------------------------------------------------------------------------- |
| int32_t RBBITableBuilder::getSafeTableSize() const { |
| int32_t size = 0; |
| int32_t numRows; |
| int32_t numCols; |
| int32_t rowSize; |
| |
| if (fSafeTable == nullptr) { |
| return 0; |
| } |
| |
| size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table. |
| |
| numRows = fSafeTable->size(); |
| numCols = fRB->fSetBuilder->getNumCharCategories(); |
| |
| if (use8BitsForSafeTable()) { |
| rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols; |
| } else { |
| rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols; |
| } |
| size += numRows * rowSize; |
| return size; |
| } |
| |
| bool RBBITableBuilder::use8BitsForSafeTable() const { |
| return fSafeTable->size() <= kMaxStateFor8BitsTable; |
| } |
| |
| //----------------------------------------------------------------------------- |
| // |
| // exportSafeTable() export the state transition table in the format required |
| // by the runtime engine. getTableSize() bytes of memory |
| // must be available at the output address "where". |
| // |
| //----------------------------------------------------------------------------- |
| void RBBITableBuilder::exportSafeTable(void *where) { |
| RBBIStateTable *table = (RBBIStateTable *)where; |
| uint32_t state; |
| int col; |
| |
| if (U_FAILURE(*fStatus) || fSafeTable == nullptr) { |
| return; |
| } |
| |
| int32_t catCount = fRB->fSetBuilder->getNumCharCategories(); |
| if (catCount > 0x7fff || |
| fSafeTable->size() > 0x7fff) { |
| *fStatus = U_BRK_INTERNAL_ERROR; |
| return; |
| } |
| |
| table->fNumStates = fSafeTable->size(); |
| table->fFlags = 0; |
| if (use8BitsForSafeTable()) { |
| table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount; |
| table->fFlags |= RBBI_8BITS_ROWS; |
| } else { |
| table->fRowLen = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount; |
| } |
| |
| for (state=0; state<table->fNumStates; state++) { |
| UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state); |
| RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen); |
| if (use8BitsForSafeTable()) { |
| RBBIStateTableRow8 *r8 = (RBBIStateTableRow8*)row; |
| r8->fAccepting = 0; |
| r8->fLookAhead = 0; |
| r8->fTagsIdx = 0; |
| for (col=0; col<catCount; col++) { |
| U_ASSERT(rowString->charAt(col) <= kMaxStateFor8BitsTable); |
| r8->fNextState[col] = static_cast<uint8_t>(rowString->charAt(col)); |
| } |
| } else { |
| row->r16.fAccepting = 0; |
| row->r16.fLookAhead = 0; |
| row->r16.fTagsIdx = 0; |
| for (col=0; col<catCount; col++) { |
| row->r16.fNextState[col] = rowString->charAt(col); |
| } |
| } |
| } |
| } |
| |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // printSet Debug function. Print the contents of a UVector |
| // |
| //----------------------------------------------------------------------------- |
| #ifdef RBBI_DEBUG |
| void RBBITableBuilder::printSet(UVector *s) { |
| int32_t i; |
| for (i=0; i<s->size(); i++) { |
| const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i)); |
| RBBIDebugPrintf("%5d", v==NULL? -1 : v->fSerialNum); |
| } |
| RBBIDebugPrintf("\n"); |
| } |
| #endif |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // printStates Debug Function. Dump the fully constructed state transition table. |
| // |
| //----------------------------------------------------------------------------- |
| #ifdef RBBI_DEBUG |
| void RBBITableBuilder::printStates() { |
| int c; // input "character" |
| int n; // state number |
| |
| RBBIDebugPrintf("state | i n p u t s y m b o l s \n"); |
| RBBIDebugPrintf(" | Acc LA Tag"); |
| for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { |
| RBBIDebugPrintf(" %3d", c); |
| } |
| RBBIDebugPrintf("\n"); |
| RBBIDebugPrintf(" |---------------"); |
| for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { |
| RBBIDebugPrintf("----"); |
| } |
| RBBIDebugPrintf("\n"); |
| |
| for (n=0; n<fDStates->size(); n++) { |
| RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); |
| RBBIDebugPrintf(" %3d | " , n); |
| RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx); |
| for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { |
| RBBIDebugPrintf(" %3d", sd->fDtran->elementAti(c)); |
| } |
| RBBIDebugPrintf("\n"); |
| } |
| RBBIDebugPrintf("\n\n"); |
| } |
| #endif |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // printSafeTable Debug Function. Dump the fully constructed safe table. |
| // |
| //----------------------------------------------------------------------------- |
| #ifdef RBBI_DEBUG |
| void RBBITableBuilder::printReverseTable() { |
| int c; // input "character" |
| int n; // state number |
| |
| RBBIDebugPrintf(" Safe Reverse Table \n"); |
| if (fSafeTable == nullptr) { |
| RBBIDebugPrintf(" --- nullptr ---\n"); |
| return; |
| } |
| RBBIDebugPrintf("state | i n p u t s y m b o l s \n"); |
| RBBIDebugPrintf(" | Acc LA Tag"); |
| for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { |
| RBBIDebugPrintf(" %2d", c); |
| } |
| RBBIDebugPrintf("\n"); |
| RBBIDebugPrintf(" |---------------"); |
| for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { |
| RBBIDebugPrintf("---"); |
| } |
| RBBIDebugPrintf("\n"); |
| |
| for (n=0; n<fSafeTable->size(); n++) { |
| UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n); |
| RBBIDebugPrintf(" %3d | " , n); |
| RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0); // Accepting, LookAhead, Tags |
| for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { |
| RBBIDebugPrintf(" %2d", rowString->charAt(c)); |
| } |
| RBBIDebugPrintf("\n"); |
| } |
| RBBIDebugPrintf("\n\n"); |
| } |
| #endif |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // printRuleStatusTable Debug Function. Dump the common rule status table |
| // |
| //----------------------------------------------------------------------------- |
| #ifdef RBBI_DEBUG |
| void RBBITableBuilder::printRuleStatusTable() { |
| int32_t thisRecord = 0; |
| int32_t nextRecord = 0; |
| int i; |
| UVector *tbl = fRB->fRuleStatusVals; |
| |
| RBBIDebugPrintf("index | tags \n"); |
| RBBIDebugPrintf("-------------------\n"); |
| |
| while (nextRecord < tbl->size()) { |
| thisRecord = nextRecord; |
| nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1; |
| RBBIDebugPrintf("%4d ", thisRecord); |
| for (i=thisRecord+1; i<nextRecord; i++) { |
| RBBIDebugPrintf(" %5d", tbl->elementAti(i)); |
| } |
| RBBIDebugPrintf("\n"); |
| } |
| RBBIDebugPrintf("\n\n"); |
| } |
| #endif |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // RBBIStateDescriptor Methods. This is a very struct-like class |
| // Most access is directly to the fields. |
| // |
| //----------------------------------------------------------------------------- |
| |
| RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) { |
| fMarked = FALSE; |
| fAccepting = 0; |
| fLookAhead = 0; |
| fTagsIdx = 0; |
| fTagVals = NULL; |
| fPositions = NULL; |
| fDtran = NULL; |
| |
| fDtran = new UVector32(lastInputSymbol+1, *fStatus); |
| if (U_FAILURE(*fStatus)) { |
| return; |
| } |
| if (fDtran == NULL) { |
| *fStatus = U_MEMORY_ALLOCATION_ERROR; |
| return; |
| } |
| fDtran->setSize(lastInputSymbol+1); // fDtran needs to be pre-sized. |
| // It is indexed by input symbols, and will |
| // hold the next state number for each |
| // symbol. |
| } |
| |
| |
| RBBIStateDescriptor::~RBBIStateDescriptor() { |
| delete fPositions; |
| delete fDtran; |
| delete fTagVals; |
| fPositions = NULL; |
| fDtran = NULL; |
| fTagVals = NULL; |
| } |
| |
| U_NAMESPACE_END |
| |
| #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |