src/com/ibm/icu/text/RBBITableBuilder.java - external/github.com/unicode-org/icu - Git at Google

 /*
 **********************************************************************
 *   Copyright (c) 2002-2005, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */

 package com.ibm.icu.text;

 import java.util.List;
 import java.util.ArrayList;
 import java.util.Set;
 import java.util.HashSet;
 import java.util.SortedSet;
 import java.util.TreeSet;
 import java.util.Iterator;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Collection;

 import com.ibm.icu.impl.Assert;
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.lang.UProperty;

 //
 //  class RBBITableBuilder is part of the RBBI rule compiler.
 //                         It builds the state transition table used by the RBBI runtime
 //                         from the expression syntax tree generated by the rule scanner.
 //
 //                         This class is part of the RBBI implementation only.
 //                         There is no user-visible public API here.
 //
 class RBBITableBuilder {


     //
     //  RBBIStateDescriptor - The DFA is initially constructed as a set of these descriptors,
     //                        one for each state.
     static class RBBIStateDescriptor {
         boolean      fMarked;
         int          fAccepting;
         int          fLookAhead;
         SortedSet    fTagVals;
         int          fTagsIdx;
         Set          fPositions;                 // Set of parse tree positions associated
                                                   //   with this state.  Unordered (it's a set).
                                                   //   UVector contents are RBBINode *

         int[]        fDtran;                      // Transitions out of this state.
                                                    //   indexed by input character
                                                    //   contents is int index of dest state
                                                    //   in RBBITableBuilder.fDStates

         RBBIStateDescriptor(int maxInputSymbol) {
             fTagVals = new TreeSet();
             fPositions = new HashSet();
             fDtran = new int[maxInputSymbol+1];    // fDtran needs to be pre-sized.
                                                     //   It is indexed by input symbols, and will
                                                     //   hold  the next state number for each
                                                     //   symbol.
         }
     }


     private  RBBIRuleBuilder  fRB;
     private  int             fRootIx;             // The array index into RBBIRuleBuilder.fTreeRoots
                                                    //   for the parse tree to operate on.
                                                    //   Too bad Java can't do indirection more easily!

     private  List             fDStates;       //  D states (Aho's terminology)
                                                //  Index is state number
                                                //  Contents are RBBIStateDescriptor pointers.

     //-----------------------------------------------------------------------------
     //
     //  Constructor    for RBBITableBuilder.
     //
     //                 rootNode is an index into the array of root nodes that is held by
     //                          the overall RBBIRuleBuilder.
     //-----------------------------------------------------------------------------
     RBBITableBuilder(RBBIRuleBuilder rb,  int rootNodeIx)  {
            fRootIx     = rootNodeIx;
            fRB         = rb;
            fDStates    = new ArrayList();
         }


        //-----------------------------------------------------------------------------
        //
        //   RBBITableBuilder::build  -  This is the main function for building the DFA state transtion
        //                               table from the RBBI rules parse tree.
        //
        //-----------------------------------------------------------------------------
        void  build() {
            // If there were no rules, just return.  This situation can easily arise
            //   for the reverse rules.
            if (fRB.fTreeRoots[fRootIx]==null) {
                return;
            }

            //
            // Walk through the tree, replacing any references to $variables with a copy of the
            //   parse tree for the substition expression.
            //
            fRB.fTreeRoots[fRootIx] = fRB.fTreeRoots[fRootIx].flattenVariables();
            if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("ftree")>=0) {
                System.out.println("Parse tree after flattening variable references.");
                fRB.fTreeRoots[fRootIx].printTree(true);
            }

            //
            // If the rules contained any references to {bof}
            //   add a {bof} <cat> <former root of tree> to the
            //   tree.  Means that all matches must start out with the
            //   {bof} fake character.
            //
            if (fRB.fSetBuilder.sawBOF()) {
                RBBINode bofTop    = new RBBINode(RBBINode.opCat);
                RBBINode bofLeaf   = new RBBINode(RBBINode.leafChar);
                bofTop.fLeftChild  = bofLeaf;
                bofTop.fRightChild = fRB.fTreeRoots[fRootIx];
                bofLeaf.fParent    = bofTop;
                bofLeaf.fVal       = 2;      // Reserved value for {bof}.
                fRB.fTreeRoots[fRootIx] = bofTop;
            }

            //
            // Add a unique right-end marker to the expression.
            //   Appears as a cat-node, left child being the original tree,
            //   right child being the end marker.
            //
            RBBINode cn = new RBBINode(RBBINode.opCat);
            cn.fLeftChild = fRB.fTreeRoots[fRootIx];
            fRB.fTreeRoots[fRootIx].fParent = cn;
            cn.fRightChild = new RBBINode(RBBINode.endMark);
            cn.fRightChild.fParent = cn;
            fRB.fTreeRoots[fRootIx] = cn;

            //
            //  Replace all references to UnicodeSets with the tree for the equivalent
            //      expression.
            //
            fRB.fTreeRoots[fRootIx].flattenSets();
            if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("stree")>=0) {
                System.out.println("Parse tree after flattening Unicode Set references.");
                fRB.fTreeRoots[fRootIx].printTree(true);
            }


            //
            // calculate the functions nullable, firstpos, lastpos and followpos on
            // nodes in the parse tree.
            //    See the alogrithm description in Aho.
            //    Understanding how this works by looking at the code alone will be
            //       nearly impossible.
            //
            calcNullable(fRB.fTreeRoots[fRootIx]);
            calcFirstPos(fRB.fTreeRoots[fRootIx]);
            calcLastPos(fRB.fTreeRoots[fRootIx]);
            calcFollowPos(fRB.fTreeRoots[fRootIx]);
            if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("pos")>=0) {
                System.out.print("\n");
                printPosSets(fRB.fTreeRoots[fRootIx]);
            }

            //
            //  For "chained" rules, modify the followPos sets
            //
            if (fRB.fChainRules) {
                calcChainedFollowPos(fRB.fTreeRoots[fRootIx]);
            }

            //
            //  BOF (start of input) test fixup.
            //
            if (fRB.fSetBuilder.sawBOF()) {
                bofFixup();
            }

            //
            // Build the DFA state transition tables.
            //
            buildStateTable();
            flagAcceptingStates();
            flagLookAheadStates();
            flagTaggedStates();

            //
            // Update the global table of rule status {tag} values
            // The rule builder has a global vector of status values that are common
            //    for all tables.  Merge the ones from this table into the global set.
            //
            mergeRuleStatusVals();

            if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("states")>=0) {printStates();};
        }


        //-----------------------------------------------------------------------------
        //
        //   calcNullable.    Impossible to explain succinctly.  See Aho, section 3.9
        //
        //-----------------------------------------------------------------------------
        void calcNullable(RBBINode n) {
            if (n == null) {
                return;
            }
            if (n.fType == RBBINode.setRef ||
                n.fType == RBBINode.endMark ) {
                // These are non-empty leaf node types.
                n.fNullable = false;
                return;
            }

            if (n.fType == RBBINode.lookAhead || n.fType == RBBINode.tag) {
                // Lookahead marker node.  It's a leaf, so no recursion on children.
                // It's nullable because it does not match any literal text from the input stream.
                n.fNullable = true;
                return;
            }


            // The node is not a leaf.
            //  Calculate nullable on its children.
            calcNullable(n.fLeftChild);
            calcNullable(n.fRightChild);

            // Apply functions from table 3.40 in Aho
            if (n.fType == RBBINode.opOr) {
                n.fNullable = n.fLeftChild.fNullable || n.fRightChild.fNullable;
            }
            else if (n.fType == RBBINode.opCat) {
                n.fNullable = n.fLeftChild.fNullable && n.fRightChild.fNullable;
            }
            else if (n.fType == RBBINode.opStar || n.fType == RBBINode.opQuestion) {
                n.fNullable = true;
            }
            else {
                n.fNullable = false;
            }
        }


        //-----------------------------------------------------------------------------
        //
        //   calcFirstPos.    Impossible to explain succinctly.  See Aho, section 3.9
        //
        //-----------------------------------------------------------------------------
        void calcFirstPos(RBBINode n) {
            if (n == null) {
                return;
            }
            if (n.fType == RBBINode.leafChar  ||
                n.fType == RBBINode.endMark   ||
                n.fType == RBBINode.lookAhead ||
                n.fType == RBBINode.tag) {
                // These are non-empty leaf node types.
                n.fFirstPosSet.add(n);
                return;
            }

            // The node is not a leaf.
            //  Calculate firstPos on its children.
            calcFirstPos(n.fLeftChild);
            calcFirstPos(n.fRightChild);

            // Apply functions from table 3.40 in Aho
            if (n.fType == RBBINode.opOr) {
                n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
                n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet);
            }
            else if (n.fType == RBBINode.opCat) {
                n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
                if (n.fLeftChild.fNullable) {
                    n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet);
                }
            }
            else if (n.fType == RBBINode.opStar ||
                     n.fType == RBBINode.opQuestion ||
                     n.fType == RBBINode.opPlus) {
                n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
            }
        }


        //-----------------------------------------------------------------------------
        //
        //   calcLastPos.    Impossible to explain succinctly.  See Aho, section 3.9
        //
        //-----------------------------------------------------------------------------
        void calcLastPos(RBBINode n) {
            if (n == null) {
                return;
            }
            if (n.fType == RBBINode.leafChar  ||
                n.fType == RBBINode.endMark   ||
                n.fType == RBBINode.lookAhead ||
                n.fType == RBBINode.tag) {
                // These are non-empty leaf node types.
                n.fLastPosSet.add(n);
                return;
            }

            // The node is not a leaf.
            //  Calculate lastPos on its children.
            calcLastPos(n.fLeftChild);
            calcLastPos(n.fRightChild);

            // Apply functions from table 3.40 in Aho
            if (n.fType == RBBINode.opOr) {
                n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
                n.fLastPosSet.addAll(n.fRightChild.fLastPosSet);
            }
            else if (n.fType == RBBINode.opCat) {
                n.fLastPosSet.addAll(n.fRightChild.fLastPosSet);
                if (n.fRightChild.fNullable) {
                    n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
                }
            }
            else if (n.fType == RBBINode.opStar     ||
                     n.fType == RBBINode.opQuestion ||
                     n.fType == RBBINode.opPlus) {
                n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
            }
        }


        //-----------------------------------------------------------------------------
        //
        //   calcFollowPos.    Impossible to explain succinctly.  See Aho, section 3.9
        //
        //-----------------------------------------------------------------------------
        void calcFollowPos(RBBINode n) {
            if (n == null ||
                n.fType == RBBINode.leafChar ||
                n.fType == RBBINode.endMark) {
                return;
            }

            calcFollowPos(n.fLeftChild);
            calcFollowPos(n.fRightChild);

            // Aho rule #1
            if (n.fType == RBBINode.opCat) {
                RBBINode i;   // is 'i' in Aho's description

                Set LastPosOfLeftChild = n.fLeftChild.fLastPosSet;

                Iterator ix = LastPosOfLeftChild.iterator();
                while (ix.hasNext()) {
                    i = (RBBINode )ix.next();
                    i.fFollowPos.addAll(n.fRightChild.fFirstPosSet);
                }
            }

            // Aho rule #2
            if (n.fType == RBBINode.opStar ||
                n.fType == RBBINode.opPlus) {
                RBBINode   i;  // again, n and i are the names from Aho's description.
                Iterator   ix = n.fLastPosSet.iterator();
                while (ix.hasNext()) {
                    i = (RBBINode) ix.next();
                    i.fFollowPos.addAll(n.fFirstPosSet);
                }

            }


        }


        //-----------------------------------------------------------------------------
        //
        //   calcChainedFollowPos.    Modify the previously calculated followPos sets
        //                            to implement rule chaining.  NOT described by Aho
        //
        //-----------------------------------------------------------------------------
        void calcChainedFollowPos(RBBINode tree) {

            List        endMarkerNodes = new ArrayList();
            List        leafNodes      = new ArrayList();

             // get a list of all endmarker nodes.
            tree.findNodes(endMarkerNodes, RBBINode.endMark);

            // get a list all leaf nodes
            tree.findNodes(leafNodes, RBBINode.leafChar);

            // Get all nodes that can be the start a match, which is FirstPosition()
            // of the portion of the tree corresponding to user-written rules.
            // See the tree description in bofFixup().
            RBBINode userRuleRoot = tree;
            if (fRB.fSetBuilder.sawBOF()) {
                userRuleRoot = tree.fLeftChild.fRightChild;
            }
            Assert.assrt(userRuleRoot != null);
            Set matchStartNodes = userRuleRoot.fFirstPosSet;


            // Iteratate over all leaf nodes,
            //
            Iterator  endNodeIx = leafNodes.iterator();

            while (endNodeIx.hasNext()) {
                RBBINode tNode   = (RBBINode)endNodeIx.next();
                RBBINode endNode = null;

                // Identify leaf nodes that correspond to overall rule match positions.
                //   These include an endMarkerNode in their followPos sets.
                Iterator i = endMarkerNodes.iterator();
                while (i.hasNext()) {
                    RBBINode endMarkerNode = (RBBINode)i.next();
                    if (tNode.fFollowPos.contains(endMarkerNode)) {
                        endNode = tNode;
                        break;
                    }
                }
                if (endNode == null) {
                    // node wasn't an end node.  Try again with the next.
                    continue;
                }

                // We've got a node that can end a match.

                // Line Break Specific hack:  If this node's val correspond to the $CM char class,
                //                            don't chain from it.
                // TODO:  Add rule syntax for this behavior, get specifics out of here and
                //        into the rule file.
                if (fRB.fLBCMNoChain) {
                    int c = this.fRB.fSetBuilder.getFirstChar(endNode.fVal);
                    if (c != -1) {
                        // c == -1 occurs with sets containing only the {eof} marker string.
                        int cLBProp = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);
                        if (cLBProp == UCharacter.LineBreak.COMBINING_MARK) {
                            continue;
                        }
                    }
                }


                // Now iterate over the nodes that can start a match, looking for ones
                //   with the same char class as our ending node.
                RBBINode startNode;
                Iterator  startNodeIx = matchStartNodes.iterator();
                while (startNodeIx.hasNext()) {
                    startNode = (RBBINode )startNodeIx.next();
                    if (startNode.fType != RBBINode.leafChar) {
                        continue;
                    }

                    if (endNode.fVal == startNode.fVal) {
                        // The end val (character class) of one possible match is the
                        //   same as the start of another.

                        // Add all nodes from the followPos of the start node to the
                        //  followPos set of the end node, which will have the effect of
                        //  letting matches transition from a match state at endNode
                        //  to the second char of a match starting with startNode.
                        endNode.fFollowPos.addAll(startNode.fFollowPos);
                    }
                }
            }
        }


        //-----------------------------------------------------------------------------
        //
        //   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.
        //                Do an swizzle similar to chaining, modifying the followPos set of
        //                the bofNode to include the followPos nodes from other {bot} nodes
        //                scattered through the tree.
        //
        //                This function has much in common with calcChainedFollowPos().
        //
        //-----------------------------------------------------------------------------
        void bofFixup() {
            //
            //   The parse tree looks like this ...
            //         fTree root  --.       <cat>
            //                               /     \
            //                            <cat>   <#end node>
            //                           /     \
            //                     <bofNode>   rest
            //                               of tree
            //
            //    We will be adding things to the followPos set of the <bofNode>
            //
            RBBINode  bofNode = fRB.fTreeRoots[fRootIx].fLeftChild.fLeftChild;
            Assert.assrt(bofNode.fType == RBBINode.leafChar);
            Assert.assrt(bofNode.fVal == 2);

            // Get all nodes that can be the start a match of the user-written rules
            //  (excluding the fake bofNode)
            //  We want the nodes that can start a match in the
            //     part labeled "rest of tree"
            //
            Set matchStartNodes = fRB.fTreeRoots[fRootIx].fLeftChild.fRightChild.fFirstPosSet;
            Iterator startNodeIt = matchStartNodes.iterator();
            while (startNodeIt.hasNext()) {
                RBBINode startNode = (RBBINode)startNodeIt.next();
                if (startNode.fType != RBBINode.leafChar) {
                    continue;
                }

                if (startNode.fVal == bofNode.fVal) {
                    //  We found a leaf node corresponding to a {bof} that was
                    //    explicitly written into a rule.
                    //  Add everything from the followPos set of this node to the
                    //    followPos set of the fake bofNode at the start of the tree.
                    //
                    bofNode.fFollowPos.addAll(startNode.fFollowPos);
                }
            }
        }

        //-----------------------------------------------------------------------------
        //
        //   buildStateTable()    Determine the set of runtime DFA states and the
        //                        transition tables for these states, by the algorithm
        //                        of fig. 3.44 in Aho.
        //
        //                        Most of the comments are quotes of Aho's psuedo-code.
        //
        //-----------------------------------------------------------------------------
        void buildStateTable() {
            //
            // Add a dummy state 0 - the stop state.  Not from Aho.
            int      lastInputSymbol = fRB.fSetBuilder.getNumCharCategories() - 1;
            RBBIStateDescriptor failState = new RBBIStateDescriptor(lastInputSymbol);
            fDStates.add(failState);

            // initially, the only unmarked state in Dstates is firstpos(root),
            //       where toot is the root of the syntax tree for (r)#;
            RBBIStateDescriptor initialState = new RBBIStateDescriptor(lastInputSymbol);
            initialState.fPositions.addAll(fRB.fTreeRoots[fRootIx].fFirstPosSet);
            fDStates.add(initialState);

            // while there is an unmarked state T in Dstates do begin
            for (;;) {
                RBBIStateDescriptor T = null;
                int              tx;
                for (tx=1; tx<fDStates.size(); tx++) {
                    RBBIStateDescriptor temp  = (RBBIStateDescriptor )fDStates.get(tx);
                    if (temp.fMarked == false) {
                        T = temp;
                        break;
                    }
                }
                if (T == null) {
                    break;
                }

                // mark T;
                T.fMarked = true;

                // for each input symbol a do begin
                int  a;
                for (a = 1; a<=lastInputSymbol; a++) {
                    // let U be the set of positions that are in followpos(p)
                    //    for some position p in T
                    //    such that the symbol at position p is a;
                    Set        U = null;
                    RBBINode   p;
                    Iterator   pit = T.fPositions.iterator();
                    while (pit.hasNext()) {
                        p = (RBBINode )pit.next();
                        if ((p.fType == RBBINode.leafChar) &&  (p.fVal == a)) {
                            if (U == null) {
                                U = new HashSet();
                            }
                            U.addAll(p.fFollowPos);
                        }
                    }

                    // if U is not empty and not in DStates then
                    int  ux = 0;
                    boolean    UinDstates = false;
                    if (U != null) {
                        Assert.assrt(U.size() > 0);
                        int  ix;
                        for (ix=0; ix<fDStates.size(); ix++) {
                            RBBIStateDescriptor temp2;
                            temp2 = (RBBIStateDescriptor )fDStates.get(ix);
                            if (U.equals(temp2.fPositions)) {
                                U  = temp2.fPositions;
                                ux = ix;
                                UinDstates = true;
                                break;
                            }
                        }

                        // Add U as an unmarked state to Dstates
                        if (!UinDstates)
                        {
                            RBBIStateDescriptor newState = new RBBIStateDescriptor(lastInputSymbol);
                            newState.fPositions = U;
                            fDStates.add(newState);
                            ux = fDStates.size()-1;
                        }

                        // Dtran[T, a] := U;
                        T.fDtran[a] = ux;
                    }
                }
            }
        }


        //-----------------------------------------------------------------------------
        //
        //   flagAcceptingStates    Identify accepting states.
        //                          First get a list of all of the end marker nodes.
        //                          Then, for each state s,
        //                              if s contains one of the end marker nodes in its list of tree positions then
        //                                  s is an accepting state.
        //
        //-----------------------------------------------------------------------------
        void     flagAcceptingStates() {
            List        endMarkerNodes = new ArrayList();
            RBBINode    endMarker;
            int     i;
            int     n;

            fRB.fTreeRoots[fRootIx].findNodes(endMarkerNodes, RBBINode.endMark);

            for (i=0; i<endMarkerNodes.size(); i++) {
                endMarker = (RBBINode)endMarkerNodes.get(i);
                for (n=0; n<fDStates.size(); n++) {
                    RBBIStateDescriptor sd = (RBBIStateDescriptor )fDStates.get(n);
                    //if (sd.fPositions.indexOf(endMarker) >= 0) {
                    if (sd.fPositions.contains(endMarker)) {
                        // Any non-zero value for fAccepting means this is an accepting node.
                        // The value is what will be returned to the user as the break status.
                        // If no other value was specified, force it to -1.

                        if (sd.fAccepting==0) {
                         // State hasn't been marked as accepting yet.  Do it now.
                            sd.fAccepting = endMarker.fVal;
                            if (sd.fAccepting == 0) {
                                sd.fAccepting = -1;
                         }
                        }
                        if (sd.fAccepting==-1 && endMarker.fVal != 0) {
                         // Both lookahead and non-lookahead accepting for this state.
                         // Favor the look-ahead.  Expedient for line break.
                         // TODO:  need a more elegant resolution for conflicting rules.
                         sd.fAccepting = endMarker.fVal;
                     }
                         // implicit else:
                         // if sd.fAccepting already had a value other than 0 or -1, leave it be.

                        // If the end marker node is from a look-ahead rule, set
                        //   the fLookAhead field or this state also.
                        if (endMarker.fLookAheadEnd) {
                         // TODO:  don't change value if already set?
                         // TODO:  allow for more than one active look-ahead rule in engine.
                         //        Make value here an index to a side array in engine?
                            sd.fLookAhead = sd.fAccepting;
                        }
                    }
                }
            }
        }


        //-----------------------------------------------------------------------------
        //
        //    flagLookAheadStates   Very similar to flagAcceptingStates, above.
        //
        //-----------------------------------------------------------------------------
        void     flagLookAheadStates() {
            List        lookAheadNodes = new ArrayList();
            RBBINode    lookAheadNode;
            int     i;
            int     n;

            fRB.fTreeRoots[fRootIx].findNodes(lookAheadNodes, RBBINode.lookAhead);
            for (i=0; i<lookAheadNodes.size(); i++) {
                lookAheadNode = (RBBINode )lookAheadNodes.get(i);

                for (n=0; n<fDStates.size(); n++) {
                    RBBIStateDescriptor sd = (RBBIStateDescriptor )fDStates.get(n);
                    if (sd.fPositions.contains(lookAheadNode)) {
                        sd.fLookAhead = lookAheadNode.fVal;
                    }
                }
            }
        }


        //-----------------------------------------------------------------------------
        //
        //    flagTaggedStates
        //
        //-----------------------------------------------------------------------------
        void     flagTaggedStates() {
            List        tagNodes = new ArrayList();
            RBBINode    tagNode;
            int     i;
            int     n;

            fRB.fTreeRoots[fRootIx].findNodes(tagNodes, RBBINode.tag);
            for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)
                tagNode = (RBBINode )tagNodes.get(i);

                for (n=0; n<fDStates.size(); n++) {              //    For each state  s (row in the state table)
                    RBBIStateDescriptor sd = (RBBIStateDescriptor )fDStates.get(n);
                    if (sd.fPositions.contains(tagNode)) {       //       if  s include the tag node t
                        sd.fTagVals.add(new Integer(tagNode.fVal));
                    }
                }
            }
        }


        //-----------------------------------------------------------------------------
        //
        //  mergeRuleStatusVals
        //
        //      Allocate positions in the  global array of rule status {tag} values
        //
        //      The RBBI runtime uses an array of {sets of status values} that can
        //      be returned for boundaries.  Each accepting state that has non-zero
        //      status includes an index into this array.  The format of the array
        //      is
        //           Num of status values in group 1
        //              status val
        //              status val
        //              ...
        //           Num of status vals in group 2
        //              status val
        //              status val
        //              ...
        //           etc.
        //
        //
        //-----------------------------------------------------------------------------

        void  mergeRuleStatusVals() {
            //
            //  The basic outline of what happens here is this...
            //
            //    for each state in this state table
            //       if the status tag list for this state is in the global statuses list
            //           record where and
            //           continue with the next state
            //       else
            //           add the tag list for this state to the global list.
            //
            int i;
            int n;

            // Pre-load a single tag of {0} into the table.
            //   We will need this as a default, for rule sets with no explicit tagging,
            //   or with explicit tagging of {0}.
            if (fRB.fRuleStatusVals.size() == 0) {
                fRB.fRuleStatusVals.add(new Integer(1));    // Num of statuses in group
                fRB.fRuleStatusVals.add(new Integer(0));    //   and our single status of zero

                SortedSet s0 = new TreeSet();
                Integer izero = new Integer(0);
                fRB.fStatusSets.put(s0, izero);
                SortedSet s1 = new TreeSet();
                s1.add(izero);
                fRB.fStatusSets.put(s0, izero);
            }

            //    For each state, check whether the state's status tag values are
            //       already entered into the status values array, and add them if not.
            for (n=0; n<fDStates.size(); n++) {
                RBBIStateDescriptor sd = (RBBIStateDescriptor )fDStates.get(n);
                Set statusVals = sd.fTagVals;
                Integer arrayIndexI = (Integer)fRB.fStatusSets.get(statusVals);
                if (arrayIndexI == null) {
                    // This is the first encounter of this set of status values.
                    //   Add them to the statusSets map, This map associates
                    //   the set of status values with an index in the runtime status
                    //   values array.
                    arrayIndexI = new Integer(fRB.fRuleStatusVals.size());
                    fRB.fStatusSets.put(statusVals, arrayIndexI);

                    // Add the new set of status values to the vector of values that
                    //   will eventually become the array used by the runtime engine.
                    fRB.fRuleStatusVals.add(new Integer(statusVals.size()));
                    Iterator it = statusVals.iterator();
                    while (it.hasNext()) {
                        fRB.fRuleStatusVals.add(it.next());
                    }

                }

                // Save the runtime array index back into the state descriptor.
                sd.fTagsIdx = arrayIndexI.intValue();
            }
        }


        //-----------------------------------------------------------------------------
        //
        //  printPosSets   Debug function.  Dump Nullable, firstpos, lastpos and followpos
        //                 for each node in the tree.
        //
        //-----------------------------------------------------------------------------

        void printPosSets(RBBINode n) {
            if (n==null) {
                return;
            }
            RBBINode.printNode(n);
            System.out.print("         Nullable:  " + n.fNullable);

            System.out.print("         firstpos:  ");
            printSet(n.fFirstPosSet);

            System.out.print("         lastpos:   ");
            printSet(n.fLastPosSet);

            System.out.print("         followpos: ");
            printSet(n.fFollowPos);

            printPosSets(n.fLeftChild);
            printPosSets(n.fRightChild);
        }


        //-----------------------------------------------------------------------------
        //
        //   getTableSize()    Calculate the size in bytes of the runtime form of this
        //                     state transition table.
        //
        //          Note:  Refer to common/rbbidata.h from ICU4C for the declarations
        //                 of the structures being matched by this calculation.
        //
        //-----------------------------------------------------------------------------
        int  getTableSize()  {
            int    size = 0;
            int    numRows;
            int    numCols;
            int    rowSize;

            if (fRB.fTreeRoots[fRootIx] == null) {
                return 0;
            }

            size    = /*sizeof(RBBIStateTable) - 4 */ 16;    // The header, with no rows to the table.

            numRows = fDStates.size();
            numCols = fRB.fSetBuilder.getNumCharCategories();

            //  Note  The declaration of RBBIStateTableRow is for a table of two columns.
            //        Therefore we subtract two from numCols when determining
            //        how much storage to add to a row for the total columns.
            // rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
            rowSize = 8 + 2*numCols;
            size   += numRows * rowSize;
            while (size % 8 > 0) {    // Size must be multiple of 8 bytes in size.
                size++;
            }

            return size;
        }


        //-----------------------------------------------------------------------------
        //
        //   exportTable()    export the state transition table in the ICU4C format.
        //
        //                    Most of the table is 16 bit shorts.  This function exports
        //                    the whole thing as an array of shorts.
        //
        //                    The size of the array must be rounded up to a multiple of
        //                    8 bytes.
        //
        //                    See struct RBBIStateTable in ICU4C, common/rbbidata.h
        //
        //-----------------------------------------------------------------------------

        short [] exportTable() {
            int                state;
            int                col;

            if (fRB.fTreeRoots[fRootIx] == null) {
                return new short[0];
            }

            Assert.assrt(fRB.fSetBuilder.getNumCharCategories() < 0x7fff &&
                fDStates.size() < 0x7fff);

            int numStates = fDStates.size();

            // Size of table size in shorts.
            //  the "4" is the size of struct RBBIStateTableRow, the row header part only.
            int rowLen = 4 + fRB.fSetBuilder.getNumCharCategories();
            int tableSize = getTableSize() / 2;


            short [] table = new short[tableSize];

            //
            // Fill in the header fields.
            //      Annoying because they really want to be ints, not shorts.
            //
            // RBBIStateTable.fNumStates
            table[RBBIDataWrapper.NUMSTATES]   = (short)(numStates >>> 16);
            table[RBBIDataWrapper.NUMSTATES+1] = (short)(numStates & 0x0000ffff);

            // RBBIStateTable.fRowLen
            table[RBBIDataWrapper.ROWLEN]   = (short)(rowLen >>> 16);
            table[RBBIDataWrapper.ROWLEN+1] = (short)(rowLen & 0x0000ffff);

            // RBBIStateTable.fFlags
            int flags = 0;
            if (fRB.fLookAheadHardBreak) {
                flags  |= RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK;
            }
            if (fRB.fSetBuilder.sawBOF()) {
                flags  |= RBBIDataWrapper.RBBI_BOF_REQUIRED;
            }
            table[RBBIDataWrapper.FLAGS]   = (short)(flags >>> 16);
            table[RBBIDataWrapper.FLAGS+1] = (short)(flags & 0x0000ffff);

            int numCharCategories = fRB.fSetBuilder.getNumCharCategories();
            for (state=0; state<numStates; state++) {
                RBBIStateDescriptor sd = (RBBIStateDescriptor )fDStates.get(state);
                int                row = 8 + state*rowLen;
                Assert.assrt (-32768 < sd.fAccepting && sd.fAccepting <= 32767);
                Assert.assrt (-32768 < sd.fLookAhead && sd.fLookAhead <= 32767);
                table[row + RBBIDataWrapper.ACCEPTING] = (short)sd.fAccepting;
                table[row + RBBIDataWrapper.LOOKAHEAD] = (short)sd.fLookAhead;
                table[row + RBBIDataWrapper.TAGIDX]    = (short)sd.fTagsIdx;
                for (col=0; col<numCharCategories; col++) {
                    table[row + RBBIDataWrapper.NEXTSTATES + col] = (short)sd.fDtran[col];
                }
            }
            return table;
        }


        //-----------------------------------------------------------------------------
        //
        //   printSet    Debug function.   Print the contents of a set of Nodes
        //
        //-----------------------------------------------------------------------------

        void printSet(Collection s) {
            Iterator it = s.iterator();
            while (it.hasNext()) {
                RBBINode n = (RBBINode)it.next();
                RBBINode.printInt(n.fSerialNum, 8);
            }
            System.out.println();
        }


        //-----------------------------------------------------------------------------
        //
        //   printStates    Debug Function.  Dump the fully constructed state transition table.
        //
        //-----------------------------------------------------------------------------

        void printStates() {
            int     c;    // input "character"
            int     n;    // state number

            System.out.print("state |           i n p u t     s y m b o l s \n");
            System.out.print("      | Acc  LA    Tag");
            for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
                RBBINode.printInt((int)c, 3);
            }
            System.out.print("\n");
            System.out.print("      |---------------");
            for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
                System.out.print("---");
            }
            System.out.print("\n");

            for (n=0; n<fDStates.size(); n++) {
                RBBIStateDescriptor sd = (RBBIStateDescriptor)fDStates.get(n);
                RBBINode.printInt(n, 5);
                System.out.print(" | ");

                RBBINode.printInt(sd.fAccepting, 3);
                RBBINode.printInt(sd.fLookAhead, 4);
                RBBINode.printInt(sd.fTagsIdx, 6);
                System.out.print(" ");
                for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
                    RBBINode.printInt(sd.fDtran[c], 3);
                }
                System.out.print("\n");
            }
            System.out.print("\n\n");
        }


        //-----------------------------------------------------------------------------
        //
        //   printRuleStatusTable    Debug Function.  Dump the common rule status table
        //
        //-----------------------------------------------------------------------------

        void printRuleStatusTable() {
            int  thisRecord = 0;
            int  nextRecord = 0;
            int      i;
            List  tbl = fRB.fRuleStatusVals;

            System.out.print("index |  tags \n");
            System.out.print("-------------------\n");

            while (nextRecord < tbl.size()) {
                thisRecord = nextRecord;
                nextRecord = thisRecord + ((Integer)tbl.get(thisRecord)).intValue() + 1;
                RBBINode.printInt(thisRecord, 7);
                for (i=thisRecord+1; i<nextRecord; i++) {
                    int val = ((Integer)tbl.get(i)).intValue();
                    RBBINode.printInt(val, 7);
                }
                System.out.print("\n");
            }
            System.out.print("\n\n");
        }


 }