| /* |
| ********************************************************************** |
| * Copyright (c) 2002-2009, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| */ |
| |
| package com.ibm.icu.text; |
| |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Set; |
| import java.util.SortedSet; |
| import java.util.TreeSet; |
| |
| import com.ibm.icu.impl.Assert; |
| import com.ibm.icu.lang.UCharacter; |
| import com.ibm.icu.lang.UProperty; |
| |
| // |
| // class RBBITableBuilder is part of the RBBI rule compiler. |
| // It builds the state transition table used by the RBBI runtime |
| // from the expression syntax tree generated by the rule scanner. |
| // |
| // This class is part of the RBBI implementation only. |
| // There is no user-visible public API here. |
| // |
| class RBBITableBuilder { |
| |
| |
| |
| // |
| // RBBIStateDescriptor - The DFA is initially constructed as a set of these descriptors, |
| // one for each state. |
| static class RBBIStateDescriptor { |
| boolean fMarked; |
| int fAccepting; |
| int fLookAhead; |
| SortedSet<Integer> fTagVals; |
| int fTagsIdx; |
| Set<RBBINode> fPositions; // Set of parse tree positions associated |
| // with this state. Unordered (it's a set). |
| // UVector contents are RBBINode * |
| |
| int[] fDtran; // Transitions out of this state. |
| // indexed by input character |
| // contents is int index of dest state |
| // in RBBITableBuilder.fDStates |
| |
| RBBIStateDescriptor(int maxInputSymbol) { |
| fTagVals = new TreeSet<Integer>(); |
| fPositions = new HashSet<RBBINode>(); |
| fDtran = new int[maxInputSymbol+1]; // fDtran needs to be pre-sized. |
| // It is indexed by input symbols, and will |
| // hold the next state number for each |
| // symbol. |
| } |
| } |
| |
| |
| private RBBIRuleBuilder fRB; |
| private int fRootIx; // The array index into RBBIRuleBuilder.fTreeRoots |
| // for the parse tree to operate on. |
| // Too bad Java can't do indirection more easily! |
| |
| private List<RBBIStateDescriptor> fDStates; // D states (Aho's terminology) |
| // Index is state number |
| // Contents are RBBIStateDescriptor pointers. |
| |
| //----------------------------------------------------------------------------- |
| // |
| // Constructor for RBBITableBuilder. |
| // |
| // rootNode is an index into the array of root nodes that is held by |
| // the overall RBBIRuleBuilder. |
| //----------------------------------------------------------------------------- |
| RBBITableBuilder(RBBIRuleBuilder rb, int rootNodeIx) { |
| fRootIx = rootNodeIx; |
| fRB = rb; |
| fDStates = new ArrayList<RBBIStateDescriptor>(); |
| } |
| |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // RBBITableBuilder::build - This is the main function for building the DFA state transtion |
| // table from the RBBI rules parse tree. |
| // |
| //----------------------------------------------------------------------------- |
| void build() { |
| // If there were no rules, just return. This situation can easily arise |
| // for the reverse rules. |
| if (fRB.fTreeRoots[fRootIx]==null) { |
| return; |
| } |
| |
| // |
| // Walk through the tree, replacing any references to $variables with a copy of the |
| // parse tree for the substition expression. |
| // |
| fRB.fTreeRoots[fRootIx] = fRB.fTreeRoots[fRootIx].flattenVariables(); |
| if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("ftree")>=0) { |
| System.out.println("Parse tree after flattening variable references."); |
| fRB.fTreeRoots[fRootIx].printTree(true); |
| } |
| |
| // |
| // If the rules contained any references to {bof} |
| // add a {bof} <cat> <former root of tree> to the |
| // tree. Means that all matches must start out with the |
| // {bof} fake character. |
| // |
| if (fRB.fSetBuilder.sawBOF()) { |
| RBBINode bofTop = new RBBINode(RBBINode.opCat); |
| RBBINode bofLeaf = new RBBINode(RBBINode.leafChar); |
| bofTop.fLeftChild = bofLeaf; |
| bofTop.fRightChild = fRB.fTreeRoots[fRootIx]; |
| bofLeaf.fParent = bofTop; |
| bofLeaf.fVal = 2; // Reserved value for {bof}. |
| fRB.fTreeRoots[fRootIx] = bofTop; |
| } |
| |
| // |
| // Add a unique right-end marker to the expression. |
| // Appears as a cat-node, left child being the original tree, |
| // right child being the end marker. |
| // |
| RBBINode cn = new RBBINode(RBBINode.opCat); |
| cn.fLeftChild = fRB.fTreeRoots[fRootIx]; |
| fRB.fTreeRoots[fRootIx].fParent = cn; |
| cn.fRightChild = new RBBINode(RBBINode.endMark); |
| cn.fRightChild.fParent = cn; |
| fRB.fTreeRoots[fRootIx] = cn; |
| |
| // |
| // Replace all references to UnicodeSets with the tree for the equivalent |
| // expression. |
| // |
| fRB.fTreeRoots[fRootIx].flattenSets(); |
| if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("stree")>=0) { |
| System.out.println("Parse tree after flattening Unicode Set references."); |
| fRB.fTreeRoots[fRootIx].printTree(true); |
| } |
| |
| |
| // |
| // calculate the functions nullable, firstpos, lastpos and followpos on |
| // nodes in the parse tree. |
| // See the alogrithm description in Aho. |
| // Understanding how this works by looking at the code alone will be |
| // nearly impossible. |
| // |
| calcNullable(fRB.fTreeRoots[fRootIx]); |
| calcFirstPos(fRB.fTreeRoots[fRootIx]); |
| calcLastPos(fRB.fTreeRoots[fRootIx]); |
| calcFollowPos(fRB.fTreeRoots[fRootIx]); |
| if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("pos")>=0) { |
| System.out.print("\n"); |
| printPosSets(fRB.fTreeRoots[fRootIx]); |
| } |
| |
| // |
| // For "chained" rules, modify the followPos sets |
| // |
| if (fRB.fChainRules) { |
| calcChainedFollowPos(fRB.fTreeRoots[fRootIx]); |
| } |
| |
| // |
| // BOF (start of input) test fixup. |
| // |
| if (fRB.fSetBuilder.sawBOF()) { |
| bofFixup(); |
| } |
| |
| // |
| // Build the DFA state transition tables. |
| // |
| buildStateTable(); |
| flagAcceptingStates(); |
| flagLookAheadStates(); |
| flagTaggedStates(); |
| |
| // |
| // Update the global table of rule status {tag} values |
| // The rule builder has a global vector of status values that are common |
| // for all tables. Merge the ones from this table into the global set. |
| // |
| mergeRuleStatusVals(); |
| |
| if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("states")>=0) {printStates();} |
| } |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // calcNullable. Impossible to explain succinctly. See Aho, section 3.9 |
| // |
| //----------------------------------------------------------------------------- |
| void calcNullable(RBBINode n) { |
| if (n == null) { |
| return; |
| } |
| if (n.fType == RBBINode.setRef || |
| n.fType == RBBINode.endMark ) { |
| // These are non-empty leaf node types. |
| n.fNullable = false; |
| return; |
| } |
| |
| if (n.fType == RBBINode.lookAhead || n.fType == RBBINode.tag) { |
| // Lookahead marker node. It's a leaf, so no recursion on children. |
| // It's nullable because it does not match any literal text from the input stream. |
| n.fNullable = true; |
| return; |
| } |
| |
| |
| // The node is not a leaf. |
| // Calculate nullable on its children. |
| calcNullable(n.fLeftChild); |
| calcNullable(n.fRightChild); |
| |
| // Apply functions from table 3.40 in Aho |
| if (n.fType == RBBINode.opOr) { |
| n.fNullable = n.fLeftChild.fNullable || n.fRightChild.fNullable; |
| } |
| else if (n.fType == RBBINode.opCat) { |
| n.fNullable = n.fLeftChild.fNullable && n.fRightChild.fNullable; |
| } |
| else if (n.fType == RBBINode.opStar || n.fType == RBBINode.opQuestion) { |
| n.fNullable = true; |
| } |
| else { |
| n.fNullable = false; |
| } |
| } |
| |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9 |
| // |
| //----------------------------------------------------------------------------- |
| void calcFirstPos(RBBINode n) { |
| if (n == null) { |
| return; |
| } |
| if (n.fType == RBBINode.leafChar || |
| n.fType == RBBINode.endMark || |
| n.fType == RBBINode.lookAhead || |
| n.fType == RBBINode.tag) { |
| // These are non-empty leaf node types. |
| n.fFirstPosSet.add(n); |
| return; |
| } |
| |
| // The node is not a leaf. |
| // Calculate firstPos on its children. |
| calcFirstPos(n.fLeftChild); |
| calcFirstPos(n.fRightChild); |
| |
| // Apply functions from table 3.40 in Aho |
| if (n.fType == RBBINode.opOr) { |
| n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet); |
| n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet); |
| } |
| else if (n.fType == RBBINode.opCat) { |
| n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet); |
| if (n.fLeftChild.fNullable) { |
| n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet); |
| } |
| } |
| else if (n.fType == RBBINode.opStar || |
| n.fType == RBBINode.opQuestion || |
| n.fType == RBBINode.opPlus) { |
| n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet); |
| } |
| } |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9 |
| // |
| //----------------------------------------------------------------------------- |
| void calcLastPos(RBBINode n) { |
| if (n == null) { |
| return; |
| } |
| if (n.fType == RBBINode.leafChar || |
| n.fType == RBBINode.endMark || |
| n.fType == RBBINode.lookAhead || |
| n.fType == RBBINode.tag) { |
| // These are non-empty leaf node types. |
| n.fLastPosSet.add(n); |
| return; |
| } |
| |
| // The node is not a leaf. |
| // Calculate lastPos on its children. |
| calcLastPos(n.fLeftChild); |
| calcLastPos(n.fRightChild); |
| |
| // Apply functions from table 3.40 in Aho |
| if (n.fType == RBBINode.opOr) { |
| n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet); |
| n.fLastPosSet.addAll(n.fRightChild.fLastPosSet); |
| } |
| else if (n.fType == RBBINode.opCat) { |
| n.fLastPosSet.addAll(n.fRightChild.fLastPosSet); |
| if (n.fRightChild.fNullable) { |
| n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet); |
| } |
| } |
| else if (n.fType == RBBINode.opStar || |
| n.fType == RBBINode.opQuestion || |
| n.fType == RBBINode.opPlus) { |
| n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet); |
| } |
| } |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9 |
| // |
| //----------------------------------------------------------------------------- |
| void calcFollowPos(RBBINode n) { |
| if (n == null || |
| n.fType == RBBINode.leafChar || |
| n.fType == RBBINode.endMark) { |
| return; |
| } |
| |
| calcFollowPos(n.fLeftChild); |
| calcFollowPos(n.fRightChild); |
| |
| // Aho rule #1 |
| if (n.fType == RBBINode.opCat) { |
| for (RBBINode i /* is 'i' in Aho's description */ : n.fLeftChild.fLastPosSet) { |
| i.fFollowPos.addAll(n.fRightChild.fFirstPosSet); |
| } |
| } |
| |
| // Aho rule #2 |
| if (n.fType == RBBINode.opStar || |
| n.fType == RBBINode.opPlus) { |
| for (RBBINode i /* again, n and i are the names from Aho's description */ : n.fLastPosSet) { |
| i.fFollowPos.addAll(n.fFirstPosSet); |
| } |
| } |
| } |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // calcChainedFollowPos. Modify the previously calculated followPos sets |
| // to implement rule chaining. NOT described by Aho |
| // |
| //----------------------------------------------------------------------------- |
| void calcChainedFollowPos(RBBINode tree) { |
| |
| List<RBBINode> endMarkerNodes = new ArrayList<RBBINode>(); |
| List<RBBINode> leafNodes = new ArrayList<RBBINode>(); |
| |
| // get a list of all endmarker nodes. |
| tree.findNodes(endMarkerNodes, RBBINode.endMark); |
| |
| // get a list all leaf nodes |
| tree.findNodes(leafNodes, RBBINode.leafChar); |
| |
| // Get all nodes that can be the start a match, which is FirstPosition() |
| // of the portion of the tree corresponding to user-written rules. |
| // See the tree description in bofFixup(). |
| RBBINode userRuleRoot = tree; |
| if (fRB.fSetBuilder.sawBOF()) { |
| userRuleRoot = tree.fLeftChild.fRightChild; |
| } |
| Assert.assrt(userRuleRoot != null); |
| Set<RBBINode> matchStartNodes = userRuleRoot.fFirstPosSet; |
| |
| // Iteratate over all leaf nodes, |
| // |
| for (RBBINode tNode : leafNodes) { |
| RBBINode endNode = null; |
| |
| // Identify leaf nodes that correspond to overall rule match positions. |
| // These include an endMarkerNode in their followPos sets. |
| for (RBBINode endMarkerNode : endMarkerNodes) { |
| if (tNode.fFollowPos.contains(endMarkerNode)) { |
| endNode = tNode; |
| break; |
| } |
| } |
| if (endNode == null) { |
| // node wasn't an end node. Try again with the next. |
| continue; |
| } |
| |
| // We've got a node that can end a match. |
| |
| // Line Break Specific hack: If this node's val correspond to the $CM char class, |
| // don't chain from it. |
| // TODO: Add rule syntax for this behavior, get specifics out of here and |
| // into the rule file. |
| if (fRB.fLBCMNoChain) { |
| int c = this.fRB.fSetBuilder.getFirstChar(endNode.fVal); |
| if (c != -1) { |
| // c == -1 occurs with sets containing only the {eof} marker string. |
| int cLBProp = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK); |
| if (cLBProp == UCharacter.LineBreak.COMBINING_MARK) { |
| continue; |
| } |
| } |
| } |
| |
| |
| // Now iterate over the nodes that can start a match, looking for ones |
| // with the same char class as our ending node. |
| for (RBBINode startNode : matchStartNodes) { |
| if (startNode.fType != RBBINode.leafChar) { |
| continue; |
| } |
| |
| if (endNode.fVal == startNode.fVal) { |
| // The end val (character class) of one possible match is the |
| // same as the start of another. |
| |
| // Add all nodes from the followPos of the start node to the |
| // followPos set of the end node, which will have the effect of |
| // letting matches transition from a match state at endNode |
| // to the second char of a match starting with startNode. |
| endNode.fFollowPos.addAll(startNode.fFollowPos); |
| } |
| } |
| } |
| } |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // bofFixup. Fixup for state tables that include {bof} beginning of input testing. |
| // Do an swizzle similar to chaining, modifying the followPos set of |
| // the bofNode to include the followPos nodes from other {bot} nodes |
| // scattered through the tree. |
| // |
| // This function has much in common with calcChainedFollowPos(). |
| // |
| //----------------------------------------------------------------------------- |
| void bofFixup() { |
| // |
| // The parse tree looks like this ... |
| // fTree root --. <cat> |
| // / \ |
| // <cat> <#end node> |
| // / \ |
| // <bofNode> rest |
| // of tree |
| // |
| // We will be adding things to the followPos set of the <bofNode> |
| // |
| RBBINode bofNode = fRB.fTreeRoots[fRootIx].fLeftChild.fLeftChild; |
| Assert.assrt(bofNode.fType == RBBINode.leafChar); |
| Assert.assrt(bofNode.fVal == 2); |
| |
| // Get all nodes that can be the start a match of the user-written rules |
| // (excluding the fake bofNode) |
| // We want the nodes that can start a match in the |
| // part labeled "rest of tree" |
| // |
| Set<RBBINode> matchStartNodes = fRB.fTreeRoots[fRootIx].fLeftChild.fRightChild.fFirstPosSet; |
| for (RBBINode startNode : matchStartNodes) { |
| if (startNode.fType != RBBINode.leafChar) { |
| continue; |
| } |
| |
| if (startNode.fVal == bofNode.fVal) { |
| // We found a leaf node corresponding to a {bof} that was |
| // explicitly written into a rule. |
| // Add everything from the followPos set of this node to the |
| // followPos set of the fake bofNode at the start of the tree. |
| // |
| bofNode.fFollowPos.addAll(startNode.fFollowPos); |
| } |
| } |
| } |
| |
| //----------------------------------------------------------------------------- |
| // |
| // buildStateTable() Determine the set of runtime DFA states and the |
| // transition tables for these states, by the algorithm |
| // of fig. 3.44 in Aho. |
| // |
| // Most of the comments are quotes of Aho's psuedo-code. |
| // |
| //----------------------------------------------------------------------------- |
| void buildStateTable() { |
| // |
| // Add a dummy state 0 - the stop state. Not from Aho. |
| int lastInputSymbol = fRB.fSetBuilder.getNumCharCategories() - 1; |
| RBBIStateDescriptor failState = new RBBIStateDescriptor(lastInputSymbol); |
| fDStates.add(failState); |
| |
| // initially, the only unmarked state in Dstates is firstpos(root), |
| // where toot is the root of the syntax tree for (r)#; |
| RBBIStateDescriptor initialState = new RBBIStateDescriptor(lastInputSymbol); |
| initialState.fPositions.addAll(fRB.fTreeRoots[fRootIx].fFirstPosSet); |
| fDStates.add(initialState); |
| |
| // while there is an unmarked state T in Dstates do begin |
| for (;;) { |
| RBBIStateDescriptor T = null; |
| int tx; |
| for (tx=1; tx<fDStates.size(); tx++) { |
| RBBIStateDescriptor temp = fDStates.get(tx); |
| if (temp.fMarked == false) { |
| T = temp; |
| break; |
| } |
| } |
| if (T == null) { |
| break; |
| } |
| |
| // mark T; |
| T.fMarked = true; |
| |
| // for each input symbol a do begin |
| int a; |
| for (a = 1; a<=lastInputSymbol; a++) { |
| // let U be the set of positions that are in followpos(p) |
| // for some position p in T |
| // such that the symbol at position p is a; |
| Set<RBBINode> U = null; |
| for (RBBINode p : T.fPositions) { |
| if ((p.fType == RBBINode.leafChar) && (p.fVal == a)) { |
| if (U == null) { |
| U = new HashSet<RBBINode>(); |
| } |
| U.addAll(p.fFollowPos); |
| } |
| } |
| |
| // if U is not empty and not in DStates then |
| int ux = 0; |
| boolean UinDstates = false; |
| if (U != null) { |
| Assert.assrt(U.size() > 0); |
| int ix; |
| for (ix=0; ix<fDStates.size(); ix++) { |
| RBBIStateDescriptor temp2; |
| temp2 = fDStates.get(ix); |
| if (U.equals(temp2.fPositions)) { |
| U = temp2.fPositions; |
| ux = ix; |
| UinDstates = true; |
| break; |
| } |
| } |
| |
| // Add U as an unmarked state to Dstates |
| if (!UinDstates) |
| { |
| RBBIStateDescriptor newState = new RBBIStateDescriptor(lastInputSymbol); |
| newState.fPositions = U; |
| fDStates.add(newState); |
| ux = fDStates.size()-1; |
| } |
| |
| // Dtran[T, a] := U; |
| T.fDtran[a] = ux; |
| } |
| } |
| } |
| } |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // flagAcceptingStates Identify accepting states. |
| // First get a list of all of the end marker nodes. |
| // Then, for each state s, |
| // if s contains one of the end marker nodes in its list of tree positions then |
| // s is an accepting state. |
| // |
| //----------------------------------------------------------------------------- |
| void flagAcceptingStates() { |
| List<RBBINode> endMarkerNodes = new ArrayList<RBBINode>(); |
| RBBINode endMarker; |
| int i; |
| int n; |
| |
| fRB.fTreeRoots[fRootIx].findNodes(endMarkerNodes, RBBINode.endMark); |
| |
| for (i=0; i<endMarkerNodes.size(); i++) { |
| endMarker = endMarkerNodes.get(i); |
| for (n=0; n<fDStates.size(); n++) { |
| RBBIStateDescriptor sd = fDStates.get(n); |
| //if (sd.fPositions.indexOf(endMarker) >= 0) { |
| if (sd.fPositions.contains(endMarker)) { |
| // Any non-zero value for fAccepting means this is an accepting node. |
| // The value is what will be returned to the user as the break status. |
| // If no other value was specified, force it to -1. |
| |
| if (sd.fAccepting==0) { |
| // State hasn't been marked as accepting yet. Do it now. |
| sd.fAccepting = endMarker.fVal; |
| if (sd.fAccepting == 0) { |
| sd.fAccepting = -1; |
| } |
| } |
| if (sd.fAccepting==-1 && endMarker.fVal != 0) { |
| // Both lookahead and non-lookahead accepting for this state. |
| // Favor the look-ahead. Expedient for line break. |
| // TODO: need a more elegant resolution for conflicting rules. |
| sd.fAccepting = endMarker.fVal; |
| } |
| // implicit else: |
| // if sd.fAccepting already had a value other than 0 or -1, leave it be. |
| |
| // If the end marker node is from a look-ahead rule, set |
| // the fLookAhead field or this state also. |
| if (endMarker.fLookAheadEnd) { |
| // TODO: don't change value if already set? |
| // TODO: allow for more than one active look-ahead rule in engine. |
| // Make value here an index to a side array in engine? |
| sd.fLookAhead = sd.fAccepting; |
| } |
| } |
| } |
| } |
| } |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // flagLookAheadStates Very similar to flagAcceptingStates, above. |
| // |
| //----------------------------------------------------------------------------- |
| void flagLookAheadStates() { |
| List<RBBINode> lookAheadNodes = new ArrayList<RBBINode>(); |
| RBBINode lookAheadNode; |
| int i; |
| int n; |
| |
| fRB.fTreeRoots[fRootIx].findNodes(lookAheadNodes, RBBINode.lookAhead); |
| for (i=0; i<lookAheadNodes.size(); i++) { |
| lookAheadNode = lookAheadNodes.get(i); |
| |
| for (n=0; n<fDStates.size(); n++) { |
| RBBIStateDescriptor sd = fDStates.get(n); |
| if (sd.fPositions.contains(lookAheadNode)) { |
| sd.fLookAhead = lookAheadNode.fVal; |
| } |
| } |
| } |
| } |
| |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // flagTaggedStates |
| // |
| //----------------------------------------------------------------------------- |
| void flagTaggedStates() { |
| List<RBBINode> tagNodes = new ArrayList<RBBINode>(); |
| RBBINode tagNode; |
| int i; |
| int n; |
| |
| fRB.fTreeRoots[fRootIx].findNodes(tagNodes, RBBINode.tag); |
| for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em) |
| tagNode = tagNodes.get(i); |
| |
| for (n=0; n<fDStates.size(); n++) { // For each state s (row in the state table) |
| RBBIStateDescriptor sd = fDStates.get(n); |
| if (sd.fPositions.contains(tagNode)) { // if s include the tag node t |
| sd.fTagVals.add(Integer.valueOf(tagNode.fVal)); |
| } |
| } |
| } |
| } |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // mergeRuleStatusVals |
| // |
| // Allocate positions in the global array of rule status {tag} values |
| // |
| // The RBBI runtime uses an array of {sets of status values} that can |
| // be returned for boundaries. Each accepting state that has non-zero |
| // status includes an index into this array. The format of the array |
| // is |
| // Num of status values in group 1 |
| // status val |
| // status val |
| // ... |
| // Num of status vals in group 2 |
| // status val |
| // status val |
| // ... |
| // etc. |
| // |
| // |
| //----------------------------------------------------------------------------- |
| |
| void mergeRuleStatusVals() { |
| // |
| // The basic outline of what happens here is this... |
| // |
| // for each state in this state table |
| // if the status tag list for this state is in the global statuses list |
| // record where and |
| // continue with the next state |
| // else |
| // add the tag list for this state to the global list. |
| // |
| int n; |
| |
| // Pre-load a single tag of {0} into the table. |
| // We will need this as a default, for rule sets with no explicit tagging, |
| // or with explicit tagging of {0}. |
| if (fRB.fRuleStatusVals.size() == 0) { |
| fRB.fRuleStatusVals.add(Integer.valueOf(1)); // Num of statuses in group |
| fRB.fRuleStatusVals.add(Integer.valueOf(0)); // and our single status of zero |
| |
| SortedSet<Integer> s0 = new TreeSet<Integer>(); |
| Integer izero = Integer.valueOf(0); |
| fRB.fStatusSets.put(s0, izero); |
| SortedSet<Integer> s1 = new TreeSet<Integer>(); |
| s1.add(izero); |
| fRB.fStatusSets.put(s0, izero); |
| } |
| |
| // For each state, check whether the state's status tag values are |
| // already entered into the status values array, and add them if not. |
| for (n=0; n<fDStates.size(); n++) { |
| RBBIStateDescriptor sd = fDStates.get(n); |
| Set<Integer> statusVals = sd.fTagVals; |
| Integer arrayIndexI = fRB.fStatusSets.get(statusVals); |
| if (arrayIndexI == null) { |
| // This is the first encounter of this set of status values. |
| // Add them to the statusSets map, This map associates |
| // the set of status values with an index in the runtime status |
| // values array. |
| arrayIndexI = Integer.valueOf(fRB.fRuleStatusVals.size()); |
| fRB.fStatusSets.put(statusVals, arrayIndexI); |
| |
| // Add the new set of status values to the vector of values that |
| // will eventually become the array used by the runtime engine. |
| fRB.fRuleStatusVals.add(Integer.valueOf(statusVals.size())); |
| fRB.fRuleStatusVals.addAll(statusVals); |
| } |
| |
| // Save the runtime array index back into the state descriptor. |
| sd.fTagsIdx = arrayIndexI.intValue(); |
| } |
| } |
| |
| |
| |
| |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos |
| // for each node in the tree. |
| // |
| //----------------------------------------------------------------------------- |
| |
| void printPosSets(RBBINode n) { |
| if (n==null) { |
| return; |
| } |
| RBBINode.printNode(n); |
| System.out.print(" Nullable: " + n.fNullable); |
| |
| System.out.print(" firstpos: "); |
| printSet(n.fFirstPosSet); |
| |
| System.out.print(" lastpos: "); |
| printSet(n.fLastPosSet); |
| |
| System.out.print(" followpos: "); |
| printSet(n.fFollowPos); |
| |
| printPosSets(n.fLeftChild); |
| printPosSets(n.fRightChild); |
| } |
| |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // getTableSize() Calculate the size in bytes of the runtime form of this |
| // state transition table. |
| // |
| // Note: Refer to common/rbbidata.h from ICU4C for the declarations |
| // of the structures being matched by this calculation. |
| // |
| //----------------------------------------------------------------------------- |
| int getTableSize() { |
| int size = 0; |
| int numRows; |
| int numCols; |
| int rowSize; |
| |
| if (fRB.fTreeRoots[fRootIx] == null) { |
| return 0; |
| } |
| |
| size = /*sizeof(RBBIStateTable) - 4 */ 16; // The header, with no rows to the table. |
| |
| numRows = fDStates.size(); |
| numCols = fRB.fSetBuilder.getNumCharCategories(); |
| |
| // Note The declaration of RBBIStateTableRow is for a table of two columns. |
| // Therefore we subtract two from numCols when determining |
| // how much storage to add to a row for the total columns. |
| // rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2); |
| rowSize = 8 + 2*numCols; |
| size += numRows * rowSize; |
| while (size % 8 > 0) { // Size must be multiple of 8 bytes in size. |
| size++; |
| } |
| |
| return size; |
| } |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // exportTable() export the state transition table in the ICU4C format. |
| // |
| // Most of the table is 16 bit shorts. This function exports |
| // the whole thing as an array of shorts. |
| // |
| // The size of the array must be rounded up to a multiple of |
| // 8 bytes. |
| // |
| // See struct RBBIStateTable in ICU4C, common/rbbidata.h |
| // |
| //----------------------------------------------------------------------------- |
| |
| short [] exportTable() { |
| int state; |
| int col; |
| |
| if (fRB.fTreeRoots[fRootIx] == null) { |
| return new short[0]; |
| } |
| |
| Assert.assrt(fRB.fSetBuilder.getNumCharCategories() < 0x7fff && |
| fDStates.size() < 0x7fff); |
| |
| int numStates = fDStates.size(); |
| |
| // Size of table size in shorts. |
| // the "4" is the size of struct RBBIStateTableRow, the row header part only. |
| int rowLen = 4 + fRB.fSetBuilder.getNumCharCategories(); |
| int tableSize = getTableSize() / 2; |
| |
| |
| short [] table = new short[tableSize]; |
| |
| // |
| // Fill in the header fields. |
| // Annoying because they really want to be ints, not shorts. |
| // |
| // RBBIStateTable.fNumStates |
| table[RBBIDataWrapper.NUMSTATES] = (short)(numStates >>> 16); |
| table[RBBIDataWrapper.NUMSTATES+1] = (short)(numStates & 0x0000ffff); |
| |
| // RBBIStateTable.fRowLen |
| table[RBBIDataWrapper.ROWLEN] = (short)(rowLen >>> 16); |
| table[RBBIDataWrapper.ROWLEN+1] = (short)(rowLen & 0x0000ffff); |
| |
| // RBBIStateTable.fFlags |
| int flags = 0; |
| if (fRB.fLookAheadHardBreak) { |
| flags |= RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK; |
| } |
| if (fRB.fSetBuilder.sawBOF()) { |
| flags |= RBBIDataWrapper.RBBI_BOF_REQUIRED; |
| } |
| table[RBBIDataWrapper.FLAGS] = (short)(flags >>> 16); |
| table[RBBIDataWrapper.FLAGS+1] = (short)(flags & 0x0000ffff); |
| |
| int numCharCategories = fRB.fSetBuilder.getNumCharCategories(); |
| for (state=0; state<numStates; state++) { |
| RBBIStateDescriptor sd = fDStates.get(state); |
| int row = 8 + state*rowLen; |
| Assert.assrt (-32768 < sd.fAccepting && sd.fAccepting <= 32767); |
| Assert.assrt (-32768 < sd.fLookAhead && sd.fLookAhead <= 32767); |
| table[row + RBBIDataWrapper.ACCEPTING] = (short)sd.fAccepting; |
| table[row + RBBIDataWrapper.LOOKAHEAD] = (short)sd.fLookAhead; |
| table[row + RBBIDataWrapper.TAGIDX] = (short)sd.fTagsIdx; |
| for (col=0; col<numCharCategories; col++) { |
| table[row + RBBIDataWrapper.NEXTSTATES + col] = (short)sd.fDtran[col]; |
| } |
| } |
| return table; |
| } |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // printSet Debug function. Print the contents of a set of Nodes |
| // |
| //----------------------------------------------------------------------------- |
| |
| void printSet(Collection<RBBINode> s) { |
| for (RBBINode n : s) { |
| RBBINode.printInt(n.fSerialNum, 8); |
| } |
| System.out.println(); |
| } |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // printStates Debug Function. Dump the fully constructed state transition table. |
| // |
| //----------------------------------------------------------------------------- |
| |
| void printStates() { |
| int c; // input "character" |
| int n; // state number |
| |
| System.out.print("state | i n p u t s y m b o l s \n"); |
| System.out.print(" | Acc LA Tag"); |
| for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) { |
| RBBINode.printInt(c, 3); |
| } |
| System.out.print("\n"); |
| System.out.print(" |---------------"); |
| for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) { |
| System.out.print("---"); |
| } |
| System.out.print("\n"); |
| |
| for (n=0; n<fDStates.size(); n++) { |
| RBBIStateDescriptor sd = fDStates.get(n); |
| RBBINode.printInt(n, 5); |
| System.out.print(" | "); |
| |
| RBBINode.printInt(sd.fAccepting, 3); |
| RBBINode.printInt(sd.fLookAhead, 4); |
| RBBINode.printInt(sd.fTagsIdx, 6); |
| System.out.print(" "); |
| for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) { |
| RBBINode.printInt(sd.fDtran[c], 3); |
| } |
| System.out.print("\n"); |
| } |
| System.out.print("\n\n"); |
| } |
| |
| |
| |
| |
| //----------------------------------------------------------------------------- |
| // |
| // printRuleStatusTable Debug Function. Dump the common rule status table |
| // |
| //----------------------------------------------------------------------------- |
| |
| void printRuleStatusTable() { |
| int thisRecord = 0; |
| int nextRecord = 0; |
| int i; |
| List<Integer> tbl = fRB.fRuleStatusVals; |
| |
| System.out.print("index | tags \n"); |
| System.out.print("-------------------\n"); |
| |
| while (nextRecord < tbl.size()) { |
| thisRecord = nextRecord; |
| nextRecord = thisRecord + tbl.get(thisRecord).intValue() + 1; |
| RBBINode.printInt(thisRecord, 7); |
| for (i=thisRecord+1; i<nextRecord; i++) { |
| int val = tbl.get(i).intValue(); |
| RBBINode.printInt(val, 7); |
| } |
| System.out.print("\n"); |
| } |
| System.out.print("\n\n"); |
| } |
| |
| |
| |
| } |