icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java - external/github.com/unicode-org/icu - Git at Google

 /*
  *******************************************************************************
  * Copyright (C) 2005-2014 International Business Machines Corporation and
  * others. All Rights Reserved.
  *******************************************************************************
  */

 package com.ibm.icu.text;

 import static com.ibm.icu.impl.CharacterIteration.DONE32;
 import static com.ibm.icu.impl.CharacterIteration.next32;
 import static com.ibm.icu.impl.CharacterIteration.nextTrail32;
 import static com.ibm.icu.impl.CharacterIteration.previous32;

 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.nio.ByteBuffer;
 import java.text.CharacterIterator;
 import java.util.concurrent.ConcurrentHashMap;

 import com.ibm.icu.impl.Assert;
 import com.ibm.icu.impl.CharTrie;
 import com.ibm.icu.impl.CharacterIteration;
 import com.ibm.icu.impl.ICUBinary;
 import com.ibm.icu.impl.ICUDebug;
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.lang.UProperty;
 import com.ibm.icu.lang.UScript;

 /**
  * Rule Based Break Iterator
  * This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
  *
  * @stable ICU 2.0
  */
 public class RuleBasedBreakIterator extends BreakIterator {
     //=======================================================================
     // Constructors & Factories
     //=======================================================================

     /**
      * private constructor
      */
     private RuleBasedBreakIterator() {
         fLastStatusIndexValid = true;
         fDictionaryCharCount  = 0;
         fBreakEngines.put(-1, fUnhandledBreakEngine);
     }

     /**
      * Create a break iterator from a precompiled set of break rules.
      *
      * Creating a break iterator from the binary rules is much faster than
      * creating one from source rules.
      *
      * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
      * Binary break iterator rules are not guaranteed to be compatible between
      * different versions of ICU.
      *
      * @param is an input stream supplying the compiled binary rules.
      * @throws IOException if there is an error while reading the rules from the InputStream.
      * @see    #compileRules(String, OutputStream)
      * @stable ICU 4.8
      */
     public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
         RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
         This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStream(is));
         return This;
     }

     /**
      * Create a break iterator from a precompiled set of break rules.
      *
      * Creating a break iterator from the binary rules is much faster than
      * creating one from source rules.
      *
      * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
      * Binary break iterator rules are not guaranteed to be compatible between
      * different versions of ICU.
      *
      * @param bytes a buffer supplying the compiled binary rules.
      * @throws IOException if there is an error while reading the rules from the buffer.
      * @see    #compileRules(String, OutputStream)
      * @internal
      * @deprecated This API is ICU internal only.
      */
     @Deprecated
     public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException {
         RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
         This.fRData = RBBIDataWrapper.get(bytes);
         return This;
     }

     /**
      * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
      * @param rules The break rules to be used.
      * @stable ICU 2.2
      */
     public RuleBasedBreakIterator(String rules)  {
         this();
         try {
             ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
             compileRules(rules, ruleOS);
             fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray()));
         } catch (IOException e) {
             ///CLOVER:OFF
             // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
             //  causing bogus compiled rules to be produced, but with no compile error raised.
             RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: "
                     + e.getMessage());
             throw rte;
             ///CLOVER:ON
         }
     }

     //=======================================================================
     // Boilerplate
     //=======================================================================

     /**
      * Clones this iterator.
      * @return A newly-constructed RuleBasedBreakIterator with the same
      * behavior as this one.
      * @stable ICU 2.0
      */
     public Object clone()
     {
         RuleBasedBreakIterator result = (RuleBasedBreakIterator)super.clone();
         if (fText != null) {
             result.fText = (CharacterIterator)(fText.clone());
         }
         return result;
     }

     /**
      * Returns true if both BreakIterators are of the same class, have the same
      * rules, and iterate over the same text.
      * @stable ICU 2.0
      */
     public boolean equals(Object that) {
         if (that == null) {
             return false;
         }
         if (this == that) {
             return true;
         }
         try {
             RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
             if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
                 return false;
             }
             if (fRData != null && other.fRData != null &&
                     (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
                 return false;
             }
             if (fText == null && other.fText == null) {
                 return true;
             }
             if (fText == null || other.fText == null) {
                 return false;
             }
             return fText.equals(other.fText);
         }
         catch(ClassCastException e) {
             return false;
         }
      }

     /**
      * Returns the description (rules) used to create this iterator.
      * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
      * @stable ICU 2.0
      */
     public String toString() {
         String retStr = "";
         if (fRData != null) {
             retStr =  fRData.fRuleSource;
         }
         return retStr;
     }

     /**
      * Compute a hashcode for this BreakIterator
      * @return A hash code
      * @stable ICU 2.0
      */
     public int hashCode()
     {
         return fRData.fRuleSource.hashCode();
     }


     private static final int  START_STATE = 1;     // The state number of the starting state
     private static final int  STOP_STATE  = 0;     // The state-transition value indicating "stop"

     // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
     //               of user text.  A variable with this enum type keeps track of where we
     //               are.  The state machine only fetches user text input while in RUN mode.
     private static final int  RBBI_START  = 0;
     private static final int  RBBI_RUN    = 1;
     private static final int  RBBI_END    = 2;

     /*
      * The character iterator through which this BreakIterator accesses the text.
      */
     private CharacterIterator   fText = new java.text.StringCharacterIterator("");

     /**
      * The rule data for this BreakIterator instance. Package private.
      */
     RBBIDataWrapper             fRData;

     /*
      * Index of the Rule {tag} values for the most recent match.
      */
     private int                 fLastRuleStatusIndex;

     /*
      * Rule tag value valid flag.
      * Some iterator operations don't intrinsically set the correct tag value.
      * This flag lets us lazily compute the value if we are ever asked for it.
      */
     private boolean             fLastStatusIndexValid;

     /**
      * Counter for the number of characters encountered with the "dictionary"
      *   flag set.  Normal RBBI iterators don't use it, although the code
      *   for updating it is live.  Dictionary Based break iterators (a subclass
      *   of us) access this field directly.
      * @internal
      */
     private int fDictionaryCharCount;

     /*
      * ICU debug argument name for RBBI
      */
     private static final String RBBI_DEBUG_ARG = "rbbi";

     /**
      * Debugging flag.  Trace operation of state machine when true.
      */
     private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG)
             && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;

     /**
      * What kind of break iterator this is. Set to KIND_LINE by default,
      * since this produces sensible output.
      */
     private int fBreakType = KIND_LINE;

     /**
      * The "default" break engine - just skips over ranges of dictionary words,
      * producing no breaks. Should only be used if characters need to be handled
      * by a dictionary but we have no dictionary implementation for them.
      */
     private final UnhandledBreakEngine fUnhandledBreakEngine = new UnhandledBreakEngine();

     /**
      * when a range of characters is divided up using the dictionary, the break
      * positions that are discovered are stored here, preventing us from having
      * to use either the dictionary or the state table again until the iterator
      * leaves this range of text
      */
     private int[] fCachedBreakPositions;

     /**
      * if fCachedBreakPositions is not null, this indicates which item in the
      * cache the current iteration position refers to
      */
     private int fPositionInCache;


     private final ConcurrentHashMap<Integer, LanguageBreakEngine> fBreakEngines =
             new ConcurrentHashMap<Integer, LanguageBreakEngine>();
     /**
      * Dumps caches and performs other actions associated with a complete change
      * in text or iteration position.
      */
     private void reset() {
         fCachedBreakPositions = null;
         // fNumCachedBreakPositions = 0;
         fDictionaryCharCount = 0;
         fPositionInCache = 0;

     }
     /**
      * Dump the contents of the state table and character classes for this break iterator.
      * For debugging only.
      * @internal
      * @deprecated This API is ICU internal only.
      */
     @Deprecated
     public void dump() {
         this.fRData.dump();
     }

     /**
      * Compile a set of source break rules into the binary state tables used
      * by the break iterator engine.  Creating a break iterator from precompiled
      * rules is much faster than creating one from source rules.
      *
      * Binary break rules are not guaranteed to be compatible between different
      * versions of ICU.
      *
      *
      * @param rules  The source form of the break rules
      * @param ruleBinary  An output stream to receive the compiled rules.
      * @throws IOException If there is an error writing the output.
      * @see #getInstanceFromCompiledRules(InputStream)
      * @stable ICU 4.8
      */
     public static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
         RBBIRuleBuilder.compileRules(rules, ruleBinary);
     }

     //=======================================================================
     // BreakIterator overrides
     //=======================================================================

     /**
      * Sets the current iteration position to the beginning of the text.
      * (i.e., the CharacterIterator's starting offset).
      * @return The offset of the beginning of the text.
      * @stable ICU 2.0
      */
     public int first() {
         fCachedBreakPositions = null;
         fDictionaryCharCount = 0;
         fPositionInCache = 0;
         fLastRuleStatusIndex  = 0;
         fLastStatusIndexValid = true;
         if (fText == null) {
             return BreakIterator.DONE;
         }
         fText.first();
         return fText.getIndex();
     }

     /**
      * Sets the current iteration position to the end of the text.
      * (i.e., the CharacterIterator's ending offset).
      * @return The text's past-the-end offset.
      * @stable ICU 2.0
      */
     public int last() {
         fCachedBreakPositions = null;
         fDictionaryCharCount = 0;
         fPositionInCache = 0;

         if (fText == null) {
             fLastRuleStatusIndex  = 0;
             fLastStatusIndexValid = true;
             return BreakIterator.DONE;
         }

         // t.last() returns the offset of the last character,
         // rather than the past-the-end offset
         // so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ...
         // will work correctly.
         fLastStatusIndexValid = false;
         int pos = fText.getEndIndex();
         fText.setIndex(pos);
         return pos;
     }

     /**
      * Advances the iterator either forward or backward the specified number of steps.
      * Negative values move backward, and positive values move forward.  This is
      * equivalent to repeatedly calling next() or previous().
      * @param n The number of steps to move.  The sign indicates the direction
      * (negative is backwards, and positive is forwards).
      * @return The character offset of the boundary position n boundaries away from
      * the current one.
      * @stable ICU 2.0
      */
     public int next(int n) {
         int result = current();
         while (n > 0) {
             result = next();
             --n;
         }
         while (n < 0) {
             result = previous();
             ++n;
         }
         return result;
     }

     /**
      * Advances the iterator to the next boundary position.
      * @return The position of the first boundary after this one.
      * @stable ICU 2.0
      */
     public int next() {
         // if we have cached break positions and we're still in the range
         // covered by them, just move one step forward in the cache
         if (fCachedBreakPositions != null) {
             if (fPositionInCache < fCachedBreakPositions.length - 1) {
                 ++fPositionInCache;
                 int pos = fCachedBreakPositions[fPositionInCache];
                 fText.setIndex(pos);
                 return pos;
             }
             else {
                 reset();
             }
         }

         int startPos = current();
         fDictionaryCharCount = 0;
         int result = handleNext(fRData.fFTable);
         if (fDictionaryCharCount > 0) {
             result = checkDictionary(startPos, result, false);
         }
         return result;
     }

     /**
       *  checkDictionary      This function handles all processing of characters in
       *                       the "dictionary" set. It will determine the appropriate
       *                       course of action, and possibly set up a cache in the
       *                       process.
       */
     private int checkDictionary(int startPos, int endPos, boolean reverse) {

         // Reset the old break cache first.
         reset();

         // note: code segment below assumes that dictionary chars are in the
         // startPos-endPos range
         // value returned should be next character in sequence
         if ((endPos - startPos) <= 1) {
             return (reverse ? startPos : endPos);
         }

         // Starting from the starting point, scan towards the proposed result,
         // looking for the first dictionary character (which may be the one
         // we're on, if we're starting in the middle of a range).
         fText.setIndex(reverse ? endPos : startPos);
         if (reverse) {
             CharacterIteration.previous32(fText);
         }

         int  rangeStart = startPos;
         int  rangeEnd = endPos;

         int    category;
         int    current;
         DictionaryBreakEngine.DequeI breaks = new DictionaryBreakEngine.DequeI();
         int     foundBreakCount = 0;
         int     c = CharacterIteration.current32(fText);
         category = (short)fRData.fTrie.getCodePointValue(c);

         // Is the character we're starting on a dictionary character? If so, we
         // need to back up to include the entire run; otherwise the results of
         // the break algorithm will differ depending on where we start. Since
         // the result is cached and there is typically a non-dictionary break
         // within a small number of words, there should be little performance impact.
         if ((category & 0x4000) != 0) {
             if (reverse) {
                 do {
                     CharacterIteration.next32(fText);
                     c = CharacterIteration.current32(fText);
                     category = (short)fRData.fTrie.getCodePointValue(c);
                 } while (c != CharacterIteration.DONE32 && ((category & 0x4000)) != 0);

                 // Back up to the last dictionary character
                 rangeEnd = fText.getIndex();
                 if (c == CharacterIteration.DONE32) {
                     // c = fText->last32();
                     //   TODO:  why was this if needed?
                     c = CharacterIteration.previous32(fText);
                 }
                 else {
                     c = CharacterIteration.previous32(fText);
                 }
             }
             else {
                 do {
                     c = CharacterIteration.previous32(fText);
                     category = (short)fRData.fTrie.getCodePointValue(c);
                 }
                 while (c != CharacterIteration.DONE32 && ((category & 0x4000) != 0));
                 // Back up to the last dictionary character
                 if (c == CharacterIteration.DONE32) {
                     // c = fText->first32();
                     c = CharacterIteration.current32(fText);
                 }
                 else {
                     CharacterIteration.next32(fText);
                     c = CharacterIteration.current32(fText);
                 }
                 rangeStart = fText.getIndex();
             }
             category = (short)fRData.fTrie.getCodePointValue(c);
         }


         // Loop through the text, looking for ranges of dictionary characters.
         // For each span, find the appropriate break engine, and ask it to find
         // any breaks within the span.
         // Note: we always do this in the forward direction, so that the break
         // cache is built in the right order.
         if (reverse) {
             fText.setIndex(rangeStart);
             c = CharacterIteration.current32(fText);
             category = (short)fRData.fTrie.getCodePointValue(c);
         }
         LanguageBreakEngine lbe = null;
         while(true) {
             while((current = fText.getIndex()) < rangeEnd && (category & 0x4000) == 0) {
                 CharacterIteration.next32(fText);
                 c = CharacterIteration.current32(fText);
                 category = (short)fRData.fTrie.getCodePointValue(c);
             }
             if (current >= rangeEnd) {
                 break;
             }

             // We now have a dictionary character. Get the appropriate language object
             // to deal with it.
             lbe = getLanguageBreakEngine(c);

             // Ask the language object if there are any breaks. It will leave the text
             // pointer on the other side of its range, ready to search for the next one.
             if (lbe != null) {
                 int startingIdx = fText.getIndex();
                 foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, false, fBreakType, breaks);
                 assert fText.getIndex() > startingIdx;
             }

             // Reload the loop variables for the next go-round
             c = CharacterIteration.current32(fText);
             category = (short)fRData.fTrie.getCodePointValue(c);
         }

         // If we found breaks, build a new break cache. The first and last entries must
         // be the original starting and ending position.
         if (foundBreakCount > 0) {
             if (foundBreakCount != breaks.size()) {
                 System.out.println("oops, foundBreakCount != breaks.size().  LBE = " + lbe.getClass());
             }
             assert foundBreakCount == breaks.size();
             if (startPos < breaks.peekLast()) {
                 breaks.offer(startPos);
             }
             if (endPos > breaks.peek()) {
                 breaks.push(endPos);
             }

             // TODO: get rid of this array, use results from the deque directly
             fCachedBreakPositions = new int[breaks.size()];

             int i = 0;
             while (breaks.size() > 0) {
                 fCachedBreakPositions[i++] = breaks.pollLast();
             }

             // If there are breaks, then by definition, we are replacing the original
             // proposed break by one of the breaks we found. Use following() and
             // preceding() to do the work. They should never recurse in this case.
             if (reverse) {
                 return preceding(endPos);
             }
             else {
                 return following(startPos);
             }
         }

         // If we get here, there were no language-based breaks. Set the text pointer
         // to the original proposed break.
         fText.setIndex(reverse ? startPos : endPos);
         return (reverse ? startPos : endPos);

         }


     /**
      * Moves the iterator backwards, to the last boundary preceding this one.
      * @return The position of the last boundary position preceding this one.
      * @stable ICU 2.0
      */
     public int previous() {
         int result;
         int startPos;

         CharacterIterator text = getText();

         fLastStatusIndexValid = false;

         // if we have cached break positions and we're still in the range
         // covered by them, just move one step backward in the cache
         if (fCachedBreakPositions != null) {
             if (fPositionInCache > 0) {
                 --fPositionInCache;
                 // If we're at the beginning of the cache, need to reevaluate the
                 // rule status
                 if (fPositionInCache <= 0) {
                     fLastStatusIndexValid = false;
                 }
                 int pos = fCachedBreakPositions[fPositionInCache];
                 text.setIndex(pos);
                 return pos;
             } else {
                 reset();
             }
         }

         // if we're already sitting at the beginning of the text, return DONE
         startPos = current();
         if (fText == null || startPos == fText.getBeginIndex()) {
             fLastRuleStatusIndex  = 0;
             fLastStatusIndexValid = true;
             return BreakIterator.DONE;
         }

         // Rules with an exact reverse table are handled here.
         if (fRData.fSRTable != null || fRData.fSFTable != null) {
             result =  handlePrevious(fRData.fRTable);
             if (fDictionaryCharCount > 0) {
                 result = checkDictionary(result, startPos, true);
             }
             return result;
         }

         // old rule syntax
         // set things up.  handlePrevious() will back us up to some valid
         // break position before the current position (we back our internal
         // iterator up one step to prevent handlePrevious() from returning
         // the current position), but not necessarily the last one before
         // where we started

         int       start = current();

         previous32(fText);
         int       lastResult    = handlePrevious(fRData.fRTable);
         if (lastResult == BreakIterator.DONE) {
             lastResult = fText.getBeginIndex();
             fText.setIndex(lastResult);
         }
         result = lastResult;
         int      lastTag       = 0;
         boolean  breakTagValid = false;

         // iterate forward from the known break position until we pass our
         // starting point.  The last break position before the starting
         // point is our return value

         for (;;) {
             result         = next();
             if (result == BreakIterator.DONE || result >= start) {
                 break;
             }
             lastResult     = result;
             lastTag        = fLastRuleStatusIndex;
             breakTagValid  = true;
         }

         // fLastBreakTag wants to have the value for section of text preceding
         // the result position that we are to return (in lastResult.)  If
         // the backwards rules overshot and the above loop had to do two or more
         // handleNext()s to move up to the desired return position, we will have a valid
         // tag value. But, if handlePrevious() took us to exactly the correct result position,
         // we wont have a tag value for that position, which is only set by handleNext().

         // Set the current iteration position to be the last break position
         // before where we started, and then return that value.
         fText.setIndex(lastResult);
         fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
         fLastStatusIndexValid = breakTagValid;
         return lastResult;
     }

     /**
      * Sets the iterator to refer to the first boundary position following
      * the specified position.
      * @param offset The position from which to begin searching for a break position.
      * @return The position of the first break after the current position.
      * @stable ICU 2.0
      */
     public int following(int offset) {
         CharacterIterator text = getText();

         // if we have no cached break positions, or if "offset" is outside the
         // range covered by the cache, then dump the cache and call our
         // inherited following() method.  This will call other methods in this
         // class that may refresh the cache.
         if (fCachedBreakPositions == null || offset < fCachedBreakPositions[0] ||
                 offset >= fCachedBreakPositions[fCachedBreakPositions.length - 1]) {
             fCachedBreakPositions = null;
             return rulesFollowing(offset);
         }

         // on the other hand, if "offset" is within the range covered by the
         // cache, then just search the cache for the first break position
         // after "offset"
         else {
             fPositionInCache = 0;
             while (fPositionInCache < fCachedBreakPositions.length
                    && offset >= fCachedBreakPositions[fPositionInCache])
                 ++fPositionInCache;
             text.setIndex(fCachedBreakPositions[fPositionInCache]);
             return text.getIndex();
         }
     }

     private int rulesFollowing(int offset) {
         // if the offset passed in is already past the end of the text,
         // just return DONE; if it's before the beginning, return the
         // text's starting offset
         fLastRuleStatusIndex  = 0;
         fLastStatusIndexValid = true;
         if (fText == null || offset >= fText.getEndIndex()) {
             last();
             return next();
         }
         else if (offset < fText.getBeginIndex()) {
             return first();
         }

         // otherwise, set our internal iteration position (temporarily)
         // to the position passed in.  If this is the _beginning_ position,
         // then we can just use next() to get our return value

         int result = 0;

         if (fRData.fSRTable != null) {
             // Safe Point Reverse rules exist.
             //   This allows us to use the optimum algorithm.
             fText.setIndex(offset);
             // move forward one codepoint to prepare for moving back to a
             // safe point.
             // this handles offset being between a supplementary character
             next32(fText);
             // handlePrevious will move most of the time to < 1 boundary away
             handlePrevious(fRData.fSRTable);
             result = next();
             while (result <= offset) {
                 result = next();
             }
             return result;
         }
         if (fRData.fSFTable != null) {
             // No Safe point reverse table, but there is a safe pt forward table.
             //
             fText.setIndex(offset);
             previous32(fText);
             // handle next will give result >= offset
             handleNext(fRData.fSFTable);
             // previous will give result 0 or 1 boundary away from offset,
             // most of the time
             // we have to
             int oldresult = previous();
             while (oldresult > offset) {
                 result = previous();
                 if (result <= offset) {
                     return oldresult;
                 }
                 oldresult = result;
             }
             result = next();
             if (result <= offset) {
                 return next();
             }
             return result;
         }
         // otherwise, we have to sync up first.  Use handlePrevious() to back
         // us up to a known break position before the specified position (if
         // we can determine that the specified position is a break position,
         // we don't back up at all).  This may or may not be the last break
         // position at or before our starting position.  Advance forward
         // from here until we've passed the starting position.  The position
         // we stop on will be the first break position after the specified one.
         // old rule syntax

         fText.setIndex(offset);
         if (offset == fText.getBeginIndex()) {
             return next();
         }
         result = previous();

         while (result != BreakIterator.DONE && result <= offset) {
             result = next();
         }

         return result;
     }
     /**
      * Sets the iterator to refer to the last boundary position before the
      * specified position.
      * @param offset The position to begin searching for a break from.
      * @return The position of the last boundary before the starting position.
      * @stable ICU 2.0
      */
     public int preceding(int offset) {
         CharacterIterator text = getText();

         // if we have no cached break positions, or "offset" is outside the
         // range covered by the cache, we can just call the inherited routine
         // (which will eventually call other routines in this class that may
         // refresh the cache)
         if (fCachedBreakPositions == null || offset <= fCachedBreakPositions[0] ||
                 offset > fCachedBreakPositions[fCachedBreakPositions.length - 1]) {
             fCachedBreakPositions = null;
             return rulesPreceding(offset);
         }

         // on the other hand, if "offset" is within the range covered by the cache,
         // then all we have to do is search the cache for the last break position
         // before "offset"
         else {
             fPositionInCache = 0;
             while (fPositionInCache < fCachedBreakPositions.length
                    && offset > fCachedBreakPositions[fPositionInCache])
                 ++fPositionInCache;
             --fPositionInCache;
             text.setIndex(fCachedBreakPositions[fPositionInCache]);
             return text.getIndex();
         }
     }

     private int rulesPreceding(int offset) {
         // if the offset passed in is already past the end of the text,
         // just return DONE; if it's before the beginning, return the

         // text's starting offset
         if (fText == null || offset > fText.getEndIndex()) {
             // return BreakIterator::DONE;
             return last();
         }
         else if (offset < fText.getBeginIndex()) {
             return first();
         }

         // if we start by updating the current iteration position to the
         // position specified by the caller, we can just use previous()
         // to carry out this operation

         int  result;
         if (fRData.fSFTable != null) {
             /// todo synwee
             // new rule syntax
             fText.setIndex(offset);
             // move backwards one codepoint to prepare for moving forwards to a
             // safe point.
             // this handles offset being between a supplementary character
             previous32(fText);
             handleNext(fRData.fSFTable);
             result = previous();
             while (result >= offset) {
                 result = previous();
             }
             return result;
         }
         if (fRData.fSRTable != null) {
             // backup plan if forward safe table is not available
             fText.setIndex(offset);
             next32(fText);
             // handle previous will give result <= offset
             handlePrevious(fRData.fSRTable);

             // next will give result 0 or 1 boundary away from offset,
             // most of the time
             // we have to
             int oldresult = next();
             while (oldresult < offset) {
                 result = next();
                 if (result >= offset) {
                     return oldresult;
                 }
                 oldresult = result;
             }
             result = previous();
             if (result >= offset) {
                 return previous();
             }
             return result;
         }

         // old rule syntax
         fText.setIndex(offset);
         return previous();
     }

     /**
      * Throw IllegalArgumentException unless begin <= offset < end.
      * @stable ICU 2.0
      */
     protected static final void checkOffset(int offset, CharacterIterator text) {
         if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
             throw new IllegalArgumentException("offset out of bounds");
         }
     }


     /**
      * Returns true if the specified position is a boundary position.  As a side
      * effect, leaves the iterator pointing to the first boundary position at
      * or after "offset".
      * @param offset the offset to check.
      * @return True if "offset" is a boundary position.
      * @stable ICU 2.0
      */
     public boolean isBoundary(int offset) {
         checkOffset(offset, fText);

         // the beginning index of the iterator is always a boundary position by definition
         if (offset == fText.getBeginIndex()) {
             first();       // For side effects on current position, tag values.
             return true;
         }

         if (offset == fText.getEndIndex()) {
             last();       // For side effects on current position, tag values.
             return true;
         }

         // otherwise, we can use following() on the position before the specified
         // one and return true if the position we get back is the one the user
         // specified

         // return following(offset - 1) == offset;
         // TODO:  check whether it is safe to revert to the simpler offset-1 code
         //         The safe rules may take care of unpaired surrogates ok.
         fText.setIndex(offset);
         previous32(fText);
         int  pos = fText.getIndex();
         boolean result = following(pos) == offset;
         return result;
     }

     /**
      * Returns the current iteration position.
      * @return The current iteration position.
      * @stable ICU 2.0
      */
     public int current() {
         return (fText != null) ? fText.getIndex() : BreakIterator.DONE;
     }

     private void makeRuleStatusValid() {
         if (fLastStatusIndexValid == false) {
             //  No cached status is available.
             int curr = current();
             if (curr == BreakIterator.DONE || curr == fText.getBeginIndex()) {
                 //  At start of text, or there is no text.  Status is always zero.
                 fLastRuleStatusIndex = 0;
                 fLastStatusIndexValid = true;
             } else {
                 //  Not at start of text.  Find status the tedious way.
                 int pa = fText.getIndex();
                 first();
                 int pb = current();
                 while (fText.getIndex() < pa) {
                     pb = next();
                 }
                 Assert.assrt(pa == pb);
             }
             Assert.assrt(fLastStatusIndexValid == true);
             Assert.assrt(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fRData.fStatusTable.length);
         }
     }

     /**
      * Return the status tag from the break rule that determined the most recently
      * returned break position.  The values appear in the rule source
      * within brackets, {123}, for example.  For rules that do not specify a
      * status, a default value of 0 is returned.  If more than one rule applies,
      * the numerically largest of the possible status values is returned.
      * <p>
      * Of the standard types of ICU break iterators, only the word break
      * iterator provides status values.  The values are defined in
      * class RuleBasedBreakIterator, and allow distinguishing between words
      * that contain alphabetic letters, "words" that appear to be numbers,
      * punctuation and spaces, words containing ideographic characters, and
      * more.  Call <code>getRuleStatus</code> after obtaining a boundary
      * position from <code>next()<code>, <code>previous()</code>, or
      * any other break iterator functions that returns a boundary position.
      * <p>
      * @return the status from the break rule that determined the most recently
      * returned break position.
      *
      * @draft ICU 3.0 (retain)
      * @provisional This is a draft API and might change in a future release of ICU.
      */

     public int  getRuleStatus() {
         makeRuleStatusValid();
         //   Status records have this form:
         //           Count N         <--  fLastRuleStatusIndex points here.
         //           Status val 0
         //           Status val 1
         //              ...
         //           Status val N-1  <--  the value we need to return
         //   The status values are sorted in ascending order.
         //   This function returns the last (largest) of the array of status values.
         int  idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex];
         int  tagVal = fRData.fStatusTable[idx];
         return tagVal;
     }

     /**
      * Get the status (tag) values from the break rule(s) that determined the most
      * recently returned break position.  The values appear in the rule source
      * within brackets, {123}, for example.  The default status value for rules
      * that do not explicitly provide one is zero.
      * <p>
      * The status values used by the standard ICU break rules are defined
      * as public constants in class RuleBasedBreakIterator.
      * <p>
      * If the size  of the output array is insufficient to hold the data,
      *  the output will be truncated to the available length.  No exception
      *  will be thrown.
      *
      * @param fillInArray an array to be filled in with the status values.
      * @return          The number of rule status values from rules that determined
      *                  the most recent boundary returned by the break iterator.
      *                  In the event that the array is too small, the return value
      *                  is the total number of status values that were available,
      *                  not the reduced number that were actually returned.
      * @draft ICU 3.0 (retain)
      * @provisional This is a draft API and might change in a future release of ICU.
      */
     public int getRuleStatusVec(int[] fillInArray) {
         makeRuleStatusValid();
         int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex];
         if (fillInArray != null) {
             int numToCopy = Math.min(numStatusVals, fillInArray.length);
             for (int i=0; i<numToCopy; i++) {
                 fillInArray[i] = fRData.fStatusTable[fLastRuleStatusIndex + i + 1];
             }
         }
         return numStatusVals;
     }

     /**
      * Return a CharacterIterator over the text being analyzed.  This version
      * of this method returns the actual CharacterIterator we're using internally.
      * Changing the state of this iterator can have undefined consequences.  If
      * you need to change it, clone it first.
      * @return An iterator over the text being analyzed.
      * @stable ICU 2.0
      */
     public CharacterIterator getText() {
         return fText;
     }

     /**
      * Set the iterator to analyze a new piece of text.  This function resets
      * the current iteration position to the beginning of the text.
      * @param newText An iterator over the text to analyze.
      * @stable ICU 2.0
      */
     public void setText(CharacterIterator newText) {
         fText = newText;
         // first() resets the caches
         this.first();
     }

     /**
      * package private
      */
     void setBreakType(int type) {
         fBreakType = type;
     }

     /**
      * package private
      */
     int getBreakType() {
         return fBreakType;
     }

     /**
      * Control debug, trace and dump options.
      * @internal
      */
     static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ?
                                         ICUDebug.value(RBBI_DEBUG_ARG) : null;


     private LanguageBreakEngine getLanguageBreakEngine(int c) {

         // We have a dictionary character.
         // Does an already instantiated break engine handle it?
         for (LanguageBreakEngine candidate : fBreakEngines.values()) {
             if (candidate.handles(c, fBreakType)) {
                 return candidate;
             }
         }

         // if we don't have an existing engine, build one.
         int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
         if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
             // Katakana, Hiragana and Han are handled by the same dictionary engine.
             // Fold them together for mapping from script -> engine.
             script = UScript.HAN;
         }

         LanguageBreakEngine eng = fBreakEngines.get(script);
         /*
         if (eng != null && !eng.handles(c, fBreakType)) {
             fUnhandledBreakEngine.handleChar(c, getBreakType());
             eng = fUnhandledBreakEngine;
         } else  */  {
             try {
                 switch (script) {
                 case UScript.THAI:
                     eng = new ThaiBreakEngine();
                     break;
                 case UScript.LAO:
                     eng = new LaoBreakEngine();
                     break;
                 case UScript.MYANMAR:
                     eng = new BurmeseBreakEngine();
                     break;
                 case UScript.KHMER:
                     eng = new KhmerBreakEngine();
                     break;
                 case UScript.HAN:
                     if (getBreakType() == KIND_WORD) {
                         eng = new CjkBreakEngine(false);
                     }
                     else {
                         fUnhandledBreakEngine.handleChar(c, getBreakType());
                         eng = fUnhandledBreakEngine;
                     }
                     break;
                 case UScript.HANGUL:
                     if (getBreakType() == KIND_WORD) {
                         eng = new CjkBreakEngine(true);
                     } else {
                         fUnhandledBreakEngine.handleChar(c, getBreakType());
                         eng = fUnhandledBreakEngine;
                     }
                     break;
                 default:
                     fUnhandledBreakEngine.handleChar(c, getBreakType());
                     eng = fUnhandledBreakEngine;
                     break;
                 }
             } catch (IOException e) {
                 eng = null;
             }
         }

         if (eng != null && eng != fUnhandledBreakEngine) {
             LanguageBreakEngine existingEngine = fBreakEngines.putIfAbsent(script, eng);
             if (existingEngine != null) {
                 // There was a race & another thread was first to register an engine for this script.
                 // Use theirs and discard the one we just created.
                 eng = existingEngine;
             }
             // assert eng.handles(c, fBreakType);
         }
         return eng;
     }


     /**
      * The State Machine Engine for moving forward is here.
      * This function is the heart of the RBBI run time engine.
      *
      * @param stateTable
      * @return the new iterator position
      *
      * A note on supplementary characters and the position of underlying
      * Java CharacterIterator:   Normally, a character iterator is positioned at
      * the char most recently returned by next().  Within this function, when
      * a supplementary char is being processed, the char iterator is left
      * sitting on the trail surrogate, in the middle of the code point.
      * This is different from everywhere else, where an iterator always
      * points at the lead surrogate of a supplementary.
      */
     private int handleNext(short stateTable[]) {
         if (TRACE) {
             System.out.println("Handle Next   pos      char  state category");
         }

         // No matter what, handleNext alway correctly sets the break tag value.
         fLastStatusIndexValid = true;
         fLastRuleStatusIndex  = 0;

         // caches for quicker access
         CharacterIterator text = fText;
         CharTrie trie = fRData.fTrie;

         // Set up the starting char
         int c               = text.current();
         if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
             c = nextTrail32(text, c);
             if (c == DONE32) {
                 return BreakIterator.DONE;
             }
         }
         int initialPosition = text.getIndex();
         int result          = initialPosition;

         // Set the initial state for the state machine
         int state           = START_STATE;
         int row             = fRData.getRowIndex(state);
         short category      = 3;
         int flagsState      = fRData.getStateTableFlags(stateTable);
         int mode            = RBBI_RUN;
         if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
             category = 2;
             mode     = RBBI_START;
             if (TRACE) {
                 System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
                 System.out.print(RBBIDataWrapper.intToHexString(c, 10));
                 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
             }
         }
         int lookaheadStatus = 0;
         int lookaheadTagIdx = 0;
         int lookaheadResult = 0;

         // loop until we reach the end of the text or transition to state 0
         while (state != STOP_STATE) {
             if (c == DONE32) {
                 // Reached end of input string.
                 if (mode == RBBI_END) {
                     // We have already run the loop one last time with the
                     // character set to the pseudo {eof} value. Now it is time
                     // to unconditionally bail out.

                     if (lookaheadResult > result) {
                         // We ran off the end of the string with a pending
                         // look-ahead match.
                         // Treat this as if the look-ahead condition had been
                         // met, and return
                         // the match at the / position from the look-ahead rule.
                         result = lookaheadResult;
                         fLastRuleStatusIndex = lookaheadTagIdx;
                     }
                     break;
                 }
                 // Run the loop one last time with the fake end-of-input character category
                 mode = RBBI_END;
                 category = 1;
             }
             else if (mode == RBBI_RUN) {
                 // Get the char category.  An incoming category of 1 or 2 mens that
                 //      we are preset for doing the beginning or end of input, and
                 //      that we shouldn't get a category from an actual text input character.
                 //

                 // look up the current character's character category, which tells us
                 // which column in the state table to look at.
                 //
                 category = (short) trie.getCodePointValue(c);

                 // Check the dictionary bit in the character's category.
                 //    Counter is only used by dictionary based iterators (subclasses).
                 //    Chars that need to be handled by a dictionary have a flag bit set
                 //    in their category values.
                 //
                 if ((category & 0x4000) != 0)  {
                     fDictionaryCharCount++;
                     //  And off the dictionary flag bit.
                     category &= ~0x4000;
                 }

                 if (TRACE) {
                     System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
                     System.out.print(RBBIDataWrapper.intToHexString(c, 10));
                     System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
                 }

                 // Advance to the next character.
                 // If this is a beginning-of-input loop iteration, don't advance.
                 //    The next iteration will be processing the first real input character.
                 c = (int)text.next();
                 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
                     c = nextTrail32(text, c);
                 }
             }
             else {
                 mode = RBBI_RUN;
             }

             // look up a state transition in the state table
             state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
             row   = fRData.getRowIndex(state);

             if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
                 // Match found, common case
                 result = text.getIndex();
                 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
                     // The iterator has been left in the middle of a surrogate pair.
                     // We want the start of it.
                     result--;
                 }

                 //  Remember the break status (tag) values.
                 fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
             }

             if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
                 if (lookaheadStatus != 0
                     && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
                     // Lookahead match is completed.  Set the result accordingly, but only
                     // if no other rule has matched further in the mean time.
                     result               = lookaheadResult;
                     fLastRuleStatusIndex = lookaheadTagIdx;
                     lookaheadStatus      = 0;
                     // TODO: make a standalone hard break in a rule work.
                     if ((flagsState & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0) {
                         text.setIndex(result);
                         return result;
                     }
                     // Look-ahead completed, but other rules may match further.  Continue on.
                     //   TODO:  junk this feature?  I don't think it's used anywhere.
                     continue;
                 }

                 lookaheadResult = text.getIndex();
                 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
                     // The iterator has been left in the middle of a surrogate pair.
                     // We want the beginning  of it.
                     lookaheadResult--;
                 }
                 lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
                 lookaheadTagIdx = stateTable[row + RBBIDataWrapper.TAGIDX];
                 continue;
             }

             if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
                 // Because this is an accepting state, any in-progress look-ahead match
                 //   is no longer relevant.  Clear out the pending lookahead status.
                 lookaheadStatus = 0;
             }
         }        // End of state machine main loop

         // The state machine is done.  Check whether it found a match...

         // If the iterator failed to advance in the match engine force it ahead by one.
         // This indicates a defect in the break rules, which should always match
         // at least one character.

         if (result == initialPosition) {
             if (TRACE) {
                 System.out.println("Iterator did not move. Advancing by 1.");
             }
             text.setIndex(initialPosition);
             next32(text);
             result = text.getIndex();
         }
         else {
             // Leave the iterator at our result position.
             //   (we may have advanced beyond the last accepting position chasing after
             //    longer matches that never completed.)
             text.setIndex(result);
         }
         if (TRACE) {
             System.out.println("result = " + result);
         }
         return result;
     }

     private int handlePrevious(short stateTable[]) {
         if (fText == null || stateTable == null) {
             return 0;
         }

         int            state;
         int            category           = 0;
         int            mode;
         int            row;
         int            c;
         int            lookaheadStatus    = 0;
         int            result             = 0;
         int            initialPosition    = 0;
         int            lookaheadResult    = 0;
         boolean        lookAheadHardBreak =
             (fRData.getStateTableFlags(stateTable) & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;

         // handlePrevious() never gets the rule status.
         // Flag the status as invalid; if the user ever asks for status, we will need
         // to back up, then re-find the break position using handleNext(), which does
         // get the status value.
         fLastStatusIndexValid = false;
         fLastRuleStatusIndex  = 0;

         // set up the starting char
         initialPosition = fText.getIndex();
         result          = initialPosition;
         c               = previous32(fText);

         // Set up the initial state for the state machine
         state = START_STATE;
         row = fRData.getRowIndex(state);
         category = 3;   // TODO:  obsolete?  from the old start/run mode scheme?
         mode     = RBBI_RUN;
         if ((fRData.getStateTableFlags(stateTable) & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
             category = 2;
             mode     = RBBI_START;
         }

         if (TRACE) {
             System.out.println("Handle Prev   pos   char  state category ");
         }

         // loop until we reach the beginning of the text or transition to state 0
         //
         mainLoop: for (;;) {
             innerBlock: {
                 if (c == DONE32) {
                     // Reached end of input string.
                     if (mode == RBBI_END || fRData.fHeader.fVersion == 1) {
                         // Either this is the old (ICU 3.2 and earlier) format data which
                         // does not support explicit support for matching {eof}, or
                         // we have already done the {eof} iteration.  Now is the time
                         // to unconditionally bail out.
                         if (lookaheadResult < result) {
                             // We ran off the end of the string with a pending look-ahead match.
                             // Treat this as if the look-ahead condition had been met, and return
                             //  the match at the / position from the look-ahead rule.
                             result = lookaheadResult;
                             lookaheadStatus = 0;
                         } else if (result == initialPosition) {
                             // Ran off start, no match found.
                             // Move one position (towards the start, since we are doing previous.)
                             fText.setIndex(initialPosition);
                             previous32(fText);
                         }
                         break mainLoop;
                     }
                     mode = RBBI_END;
                     category = 1;
                 }

                 if (mode == RBBI_RUN) {
                     // look up the current character's category, which tells us
                     // which column in the state table to look at.
                     //
                     category = (short) fRData.fTrie.getCodePointValue(c);

                     // Check the dictionary bit in the character's category.
                     //    Counter is only used by dictionary based iterators (subclasses).
                     //    Chars that need to be handled by a dictionary have a flag bit set
                     //    in their category values.
                     //
                     if ((category & 0x4000) != 0)  {
                         fDictionaryCharCount++;
                         //  And off the dictionary flag bit.
                         category &= ~0x4000;
                     }
                 }


                 if (TRACE) {
                     System.out.print("             " + fText.getIndex() + "   ");
                     if (0x20 <= c && c < 0x7f) {
                         System.out.print("  " + c + "  ");
                     } else {
                         System.out.print(" " + Integer.toHexString(c) + " ");
                     }
                     System.out.println(" " + state + "  " + category + " ");
                 }

                 // State Transition - move machine to its next state
                 //
                 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
                 row = fRData.getRowIndex(state);

                 if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
                     // Match found, common case, could have lookahead so we move
                     // on to check it
                     result = fText.getIndex();
                 }

                 if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
                     if (lookaheadStatus != 0
                             && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
                         // Lookahead match is completed. Set the result
                         // accordingly, but only
                         // if no other rule has matched further in the mean
                         // time.
                         result = lookaheadResult;
                         lookaheadStatus = 0;
                         // TODO: make a stand-alone hard break in a rule work.

                         if (lookAheadHardBreak) {
                             break mainLoop;
                         }
                         // Look-ahead completed, but other rules may match further.
                         // Continue on.
                         // TODO: junk this feature?  I don't think that it's used anywhere.
                         break innerBlock;
                     }
                     // Hit a possible look-ahead match. We are at the
                     // position of the '/'. Remember this position.
                     lookaheadResult = fText.getIndex();
                     lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
                     break innerBlock;
                 }

                 // not lookahead...
                 if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
                     // This is a plain (non-look-ahead) accepting state.
                     if (!lookAheadHardBreak) {
                         // Clear out any pending look-ahead matches,
                         // but only if not doing the lookAheadHardBreak option
                         // which needs to force a break no matter what is going
                         // on with the rest of the match, i.e. we can't abandon
                         // a partially completed look-ahead match because
                         // some other rule matched further than the '/' position
                         // in the look-ahead match.
                         lookaheadStatus = 0;
                     }
                 }

             } // end of innerBlock.  "break innerBlock" in above code comes out here.


             if (state == STOP_STATE) {
                 // Normal loop exit is here
                 break mainLoop;
             }

             // then move iterator position backwards one character
             //
             if (mode == RBBI_RUN) {
                 c = previous32(fText);
             } else {
                 if (mode == RBBI_START) {
                     mode = RBBI_RUN;
                 }
             }


         }   // End of the main loop.

         // The state machine is done.  Check whether it found a match...
         //
         // If the iterator failed to advance in the match engine, force it ahead by one.
         //   (This really indicates a defect in the break rules.  They should always match
         //    at least one character.)
         if (result == initialPosition) {
             result = fText.setIndex(initialPosition);
             previous32(fText);
             result = fText.getIndex();
         }

         fText.setIndex(result);
         if (TRACE) {
             System.out.println("Result = " + result);
         }

         return result;
     }
 }