| /* |
| ******************************************************************************* |
| * Copyright (C) 2005-2014 International Business Machines Corporation and |
| * others. All Rights Reserved. |
| ******************************************************************************* |
| */ |
| |
| package com.ibm.icu.text; |
| |
| import static com.ibm.icu.impl.CharacterIteration.DONE32; |
| import static com.ibm.icu.impl.CharacterIteration.next32; |
| import static com.ibm.icu.impl.CharacterIteration.nextTrail32; |
| import static com.ibm.icu.impl.CharacterIteration.previous32; |
| |
| import java.io.ByteArrayOutputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.OutputStream; |
| import java.nio.ByteBuffer; |
| import java.text.CharacterIterator; |
| import java.util.concurrent.ConcurrentHashMap; |
| |
| import com.ibm.icu.impl.Assert; |
| import com.ibm.icu.impl.CharTrie; |
| import com.ibm.icu.impl.CharacterIteration; |
| import com.ibm.icu.impl.ICUBinary; |
| import com.ibm.icu.impl.ICUDebug; |
| import com.ibm.icu.lang.UCharacter; |
| import com.ibm.icu.lang.UProperty; |
| import com.ibm.icu.lang.UScript; |
| |
| /** |
| * Rule Based Break Iterator |
| * This is a port of the C++ class RuleBasedBreakIterator from ICU4C. |
| * |
| * @stable ICU 2.0 |
| */ |
| public class RuleBasedBreakIterator extends BreakIterator { |
| //======================================================================= |
| // Constructors & Factories |
| //======================================================================= |
| |
| /** |
| * private constructor |
| */ |
| private RuleBasedBreakIterator() { |
| fLastStatusIndexValid = true; |
| fDictionaryCharCount = 0; |
| fBreakEngines.put(-1, fUnhandledBreakEngine); |
| } |
| |
| /** |
| * Create a break iterator from a precompiled set of break rules. |
| * |
| * Creating a break iterator from the binary rules is much faster than |
| * creating one from source rules. |
| * |
| * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. |
| * Binary break iterator rules are not guaranteed to be compatible between |
| * different versions of ICU. |
| * |
| * @param is an input stream supplying the compiled binary rules. |
| * @throws IOException if there is an error while reading the rules from the InputStream. |
| * @see #compileRules(String, OutputStream) |
| * @stable ICU 4.8 |
| */ |
| public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException { |
| RuleBasedBreakIterator This = new RuleBasedBreakIterator(); |
| This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStream(is)); |
| return This; |
| } |
| |
| /** |
| * Create a break iterator from a precompiled set of break rules. |
| * |
| * Creating a break iterator from the binary rules is much faster than |
| * creating one from source rules. |
| * |
| * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. |
| * Binary break iterator rules are not guaranteed to be compatible between |
| * different versions of ICU. |
| * |
| * @param bytes a buffer supplying the compiled binary rules. |
| * @throws IOException if there is an error while reading the rules from the buffer. |
| * @see #compileRules(String, OutputStream) |
| * @internal |
| * @deprecated This API is ICU internal only. |
| */ |
| @Deprecated |
| public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException { |
| RuleBasedBreakIterator This = new RuleBasedBreakIterator(); |
| This.fRData = RBBIDataWrapper.get(bytes); |
| return This; |
| } |
| |
| /** |
| * Construct a RuleBasedBreakIterator from a set of rules supplied as a string. |
| * @param rules The break rules to be used. |
| * @stable ICU 2.2 |
| */ |
| public RuleBasedBreakIterator(String rules) { |
| this(); |
| try { |
| ByteArrayOutputStream ruleOS = new ByteArrayOutputStream(); |
| compileRules(rules, ruleOS); |
| fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray())); |
| } catch (IOException e) { |
| ///CLOVER:OFF |
| // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler, |
| // causing bogus compiled rules to be produced, but with no compile error raised. |
| RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: " |
| + e.getMessage()); |
| throw rte; |
| ///CLOVER:ON |
| } |
| } |
| |
| //======================================================================= |
| // Boilerplate |
| //======================================================================= |
| |
| /** |
| * Clones this iterator. |
| * @return A newly-constructed RuleBasedBreakIterator with the same |
| * behavior as this one. |
| * @stable ICU 2.0 |
| */ |
| public Object clone() |
| { |
| RuleBasedBreakIterator result = (RuleBasedBreakIterator)super.clone(); |
| if (fText != null) { |
| result.fText = (CharacterIterator)(fText.clone()); |
| } |
| return result; |
| } |
| |
| /** |
| * Returns true if both BreakIterators are of the same class, have the same |
| * rules, and iterate over the same text. |
| * @stable ICU 2.0 |
| */ |
| public boolean equals(Object that) { |
| if (that == null) { |
| return false; |
| } |
| if (this == that) { |
| return true; |
| } |
| try { |
| RuleBasedBreakIterator other = (RuleBasedBreakIterator) that; |
| if (fRData != other.fRData && (fRData == null || other.fRData == null)) { |
| return false; |
| } |
| if (fRData != null && other.fRData != null && |
| (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) { |
| return false; |
| } |
| if (fText == null && other.fText == null) { |
| return true; |
| } |
| if (fText == null || other.fText == null) { |
| return false; |
| } |
| return fText.equals(other.fText); |
| } |
| catch(ClassCastException e) { |
| return false; |
| } |
| } |
| |
| /** |
| * Returns the description (rules) used to create this iterator. |
| * (In ICU4C, the same function is RuleBasedBreakIterator::getRules()) |
| * @stable ICU 2.0 |
| */ |
| public String toString() { |
| String retStr = ""; |
| if (fRData != null) { |
| retStr = fRData.fRuleSource; |
| } |
| return retStr; |
| } |
| |
| /** |
| * Compute a hashcode for this BreakIterator |
| * @return A hash code |
| * @stable ICU 2.0 |
| */ |
| public int hashCode() |
| { |
| return fRData.fRuleSource.hashCode(); |
| } |
| |
| |
| private static final int START_STATE = 1; // The state number of the starting state |
| private static final int STOP_STATE = 0; // The state-transition value indicating "stop" |
| |
| // RBBIRunMode - the state machine runs an extra iteration at the beginning and end |
| // of user text. A variable with this enum type keeps track of where we |
| // are. The state machine only fetches user text input while in RUN mode. |
| private static final int RBBI_START = 0; |
| private static final int RBBI_RUN = 1; |
| private static final int RBBI_END = 2; |
| |
| /* |
| * The character iterator through which this BreakIterator accesses the text. |
| */ |
| private CharacterIterator fText = new java.text.StringCharacterIterator(""); |
| |
| /** |
| * The rule data for this BreakIterator instance. Package private. |
| */ |
| RBBIDataWrapper fRData; |
| |
| /* |
| * Index of the Rule {tag} values for the most recent match. |
| */ |
| private int fLastRuleStatusIndex; |
| |
| /* |
| * Rule tag value valid flag. |
| * Some iterator operations don't intrinsically set the correct tag value. |
| * This flag lets us lazily compute the value if we are ever asked for it. |
| */ |
| private boolean fLastStatusIndexValid; |
| |
| /** |
| * Counter for the number of characters encountered with the "dictionary" |
| * flag set. Normal RBBI iterators don't use it, although the code |
| * for updating it is live. Dictionary Based break iterators (a subclass |
| * of us) access this field directly. |
| * @internal |
| */ |
| private int fDictionaryCharCount; |
| |
| /* |
| * ICU debug argument name for RBBI |
| */ |
| private static final String RBBI_DEBUG_ARG = "rbbi"; |
| |
| /** |
| * Debugging flag. Trace operation of state machine when true. |
| */ |
| private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG) |
| && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0; |
| |
| /** |
| * What kind of break iterator this is. Set to KIND_LINE by default, |
| * since this produces sensible output. |
| */ |
| private int fBreakType = KIND_LINE; |
| |
| /** |
| * The "default" break engine - just skips over ranges of dictionary words, |
| * producing no breaks. Should only be used if characters need to be handled |
| * by a dictionary but we have no dictionary implementation for them. |
| */ |
| private final UnhandledBreakEngine fUnhandledBreakEngine = new UnhandledBreakEngine(); |
| |
| /** |
| * when a range of characters is divided up using the dictionary, the break |
| * positions that are discovered are stored here, preventing us from having |
| * to use either the dictionary or the state table again until the iterator |
| * leaves this range of text |
| */ |
| private int[] fCachedBreakPositions; |
| |
| /** |
| * if fCachedBreakPositions is not null, this indicates which item in the |
| * cache the current iteration position refers to |
| */ |
| private int fPositionInCache; |
| |
| |
| private final ConcurrentHashMap<Integer, LanguageBreakEngine> fBreakEngines = |
| new ConcurrentHashMap<Integer, LanguageBreakEngine>(); |
| /** |
| * Dumps caches and performs other actions associated with a complete change |
| * in text or iteration position. |
| */ |
| private void reset() { |
| fCachedBreakPositions = null; |
| // fNumCachedBreakPositions = 0; |
| fDictionaryCharCount = 0; |
| fPositionInCache = 0; |
| |
| } |
| /** |
| * Dump the contents of the state table and character classes for this break iterator. |
| * For debugging only. |
| * @internal |
| * @deprecated This API is ICU internal only. |
| */ |
| @Deprecated |
| public void dump() { |
| this.fRData.dump(); |
| } |
| |
| /** |
| * Compile a set of source break rules into the binary state tables used |
| * by the break iterator engine. Creating a break iterator from precompiled |
| * rules is much faster than creating one from source rules. |
| * |
| * Binary break rules are not guaranteed to be compatible between different |
| * versions of ICU. |
| * |
| * |
| * @param rules The source form of the break rules |
| * @param ruleBinary An output stream to receive the compiled rules. |
| * @throws IOException If there is an error writing the output. |
| * @see #getInstanceFromCompiledRules(InputStream) |
| * @stable ICU 4.8 |
| */ |
| public static void compileRules(String rules, OutputStream ruleBinary) throws IOException { |
| RBBIRuleBuilder.compileRules(rules, ruleBinary); |
| } |
| |
| //======================================================================= |
| // BreakIterator overrides |
| //======================================================================= |
| |
| /** |
| * Sets the current iteration position to the beginning of the text. |
| * (i.e., the CharacterIterator's starting offset). |
| * @return The offset of the beginning of the text. |
| * @stable ICU 2.0 |
| */ |
| public int first() { |
| fCachedBreakPositions = null; |
| fDictionaryCharCount = 0; |
| fPositionInCache = 0; |
| fLastRuleStatusIndex = 0; |
| fLastStatusIndexValid = true; |
| if (fText == null) { |
| return BreakIterator.DONE; |
| } |
| fText.first(); |
| return fText.getIndex(); |
| } |
| |
| /** |
| * Sets the current iteration position to the end of the text. |
| * (i.e., the CharacterIterator's ending offset). |
| * @return The text's past-the-end offset. |
| * @stable ICU 2.0 |
| */ |
| public int last() { |
| fCachedBreakPositions = null; |
| fDictionaryCharCount = 0; |
| fPositionInCache = 0; |
| |
| if (fText == null) { |
| fLastRuleStatusIndex = 0; |
| fLastStatusIndexValid = true; |
| return BreakIterator.DONE; |
| } |
| |
| // t.last() returns the offset of the last character, |
| // rather than the past-the-end offset |
| // so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ... |
| // will work correctly. |
| fLastStatusIndexValid = false; |
| int pos = fText.getEndIndex(); |
| fText.setIndex(pos); |
| return pos; |
| } |
| |
| /** |
| * Advances the iterator either forward or backward the specified number of steps. |
| * Negative values move backward, and positive values move forward. This is |
| * equivalent to repeatedly calling next() or previous(). |
| * @param n The number of steps to move. The sign indicates the direction |
| * (negative is backwards, and positive is forwards). |
| * @return The character offset of the boundary position n boundaries away from |
| * the current one. |
| * @stable ICU 2.0 |
| */ |
| public int next(int n) { |
| int result = current(); |
| while (n > 0) { |
| result = next(); |
| --n; |
| } |
| while (n < 0) { |
| result = previous(); |
| ++n; |
| } |
| return result; |
| } |
| |
| /** |
| * Advances the iterator to the next boundary position. |
| * @return The position of the first boundary after this one. |
| * @stable ICU 2.0 |
| */ |
| public int next() { |
| // if we have cached break positions and we're still in the range |
| // covered by them, just move one step forward in the cache |
| if (fCachedBreakPositions != null) { |
| if (fPositionInCache < fCachedBreakPositions.length - 1) { |
| ++fPositionInCache; |
| int pos = fCachedBreakPositions[fPositionInCache]; |
| fText.setIndex(pos); |
| return pos; |
| } |
| else { |
| reset(); |
| } |
| } |
| |
| int startPos = current(); |
| fDictionaryCharCount = 0; |
| int result = handleNext(fRData.fFTable); |
| if (fDictionaryCharCount > 0) { |
| result = checkDictionary(startPos, result, false); |
| } |
| return result; |
| } |
| |
| /** |
| * checkDictionary This function handles all processing of characters in |
| * the "dictionary" set. It will determine the appropriate |
| * course of action, and possibly set up a cache in the |
| * process. |
| */ |
| private int checkDictionary(int startPos, int endPos, boolean reverse) { |
| |
| // Reset the old break cache first. |
| reset(); |
| |
| // note: code segment below assumes that dictionary chars are in the |
| // startPos-endPos range |
| // value returned should be next character in sequence |
| if ((endPos - startPos) <= 1) { |
| return (reverse ? startPos : endPos); |
| } |
| |
| // Starting from the starting point, scan towards the proposed result, |
| // looking for the first dictionary character (which may be the one |
| // we're on, if we're starting in the middle of a range). |
| fText.setIndex(reverse ? endPos : startPos); |
| if (reverse) { |
| CharacterIteration.previous32(fText); |
| } |
| |
| int rangeStart = startPos; |
| int rangeEnd = endPos; |
| |
| int category; |
| int current; |
| DictionaryBreakEngine.DequeI breaks = new DictionaryBreakEngine.DequeI(); |
| int foundBreakCount = 0; |
| int c = CharacterIteration.current32(fText); |
| category = (short)fRData.fTrie.getCodePointValue(c); |
| |
| // Is the character we're starting on a dictionary character? If so, we |
| // need to back up to include the entire run; otherwise the results of |
| // the break algorithm will differ depending on where we start. Since |
| // the result is cached and there is typically a non-dictionary break |
| // within a small number of words, there should be little performance impact. |
| if ((category & 0x4000) != 0) { |
| if (reverse) { |
| do { |
| CharacterIteration.next32(fText); |
| c = CharacterIteration.current32(fText); |
| category = (short)fRData.fTrie.getCodePointValue(c); |
| } while (c != CharacterIteration.DONE32 && ((category & 0x4000)) != 0); |
| |
| // Back up to the last dictionary character |
| rangeEnd = fText.getIndex(); |
| if (c == CharacterIteration.DONE32) { |
| // c = fText->last32(); |
| // TODO: why was this if needed? |
| c = CharacterIteration.previous32(fText); |
| } |
| else { |
| c = CharacterIteration.previous32(fText); |
| } |
| } |
| else { |
| do { |
| c = CharacterIteration.previous32(fText); |
| category = (short)fRData.fTrie.getCodePointValue(c); |
| } |
| while (c != CharacterIteration.DONE32 && ((category & 0x4000) != 0)); |
| // Back up to the last dictionary character |
| if (c == CharacterIteration.DONE32) { |
| // c = fText->first32(); |
| c = CharacterIteration.current32(fText); |
| } |
| else { |
| CharacterIteration.next32(fText); |
| c = CharacterIteration.current32(fText); |
| } |
| rangeStart = fText.getIndex(); |
| } |
| category = (short)fRData.fTrie.getCodePointValue(c); |
| } |
| |
| |
| // Loop through the text, looking for ranges of dictionary characters. |
| // For each span, find the appropriate break engine, and ask it to find |
| // any breaks within the span. |
| // Note: we always do this in the forward direction, so that the break |
| // cache is built in the right order. |
| if (reverse) { |
| fText.setIndex(rangeStart); |
| c = CharacterIteration.current32(fText); |
| category = (short)fRData.fTrie.getCodePointValue(c); |
| } |
| LanguageBreakEngine lbe = null; |
| while(true) { |
| while((current = fText.getIndex()) < rangeEnd && (category & 0x4000) == 0) { |
| CharacterIteration.next32(fText); |
| c = CharacterIteration.current32(fText); |
| category = (short)fRData.fTrie.getCodePointValue(c); |
| } |
| if (current >= rangeEnd) { |
| break; |
| } |
| |
| // We now have a dictionary character. Get the appropriate language object |
| // to deal with it. |
| lbe = getLanguageBreakEngine(c); |
| |
| // Ask the language object if there are any breaks. It will leave the text |
| // pointer on the other side of its range, ready to search for the next one. |
| if (lbe != null) { |
| int startingIdx = fText.getIndex(); |
| foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, false, fBreakType, breaks); |
| assert fText.getIndex() > startingIdx; |
| } |
| |
| // Reload the loop variables for the next go-round |
| c = CharacterIteration.current32(fText); |
| category = (short)fRData.fTrie.getCodePointValue(c); |
| } |
| |
| // If we found breaks, build a new break cache. The first and last entries must |
| // be the original starting and ending position. |
| if (foundBreakCount > 0) { |
| if (foundBreakCount != breaks.size()) { |
| System.out.println("oops, foundBreakCount != breaks.size(). LBE = " + lbe.getClass()); |
| } |
| assert foundBreakCount == breaks.size(); |
| if (startPos < breaks.peekLast()) { |
| breaks.offer(startPos); |
| } |
| if (endPos > breaks.peek()) { |
| breaks.push(endPos); |
| } |
| |
| // TODO: get rid of this array, use results from the deque directly |
| fCachedBreakPositions = new int[breaks.size()]; |
| |
| int i = 0; |
| while (breaks.size() > 0) { |
| fCachedBreakPositions[i++] = breaks.pollLast(); |
| } |
| |
| // If there are breaks, then by definition, we are replacing the original |
| // proposed break by one of the breaks we found. Use following() and |
| // preceding() to do the work. They should never recurse in this case. |
| if (reverse) { |
| return preceding(endPos); |
| } |
| else { |
| return following(startPos); |
| } |
| } |
| |
| // If we get here, there were no language-based breaks. Set the text pointer |
| // to the original proposed break. |
| fText.setIndex(reverse ? startPos : endPos); |
| return (reverse ? startPos : endPos); |
| |
| } |
| |
| |
| /** |
| * Moves the iterator backwards, to the last boundary preceding this one. |
| * @return The position of the last boundary position preceding this one. |
| * @stable ICU 2.0 |
| */ |
| public int previous() { |
| int result; |
| int startPos; |
| |
| CharacterIterator text = getText(); |
| |
| fLastStatusIndexValid = false; |
| |
| // if we have cached break positions and we're still in the range |
| // covered by them, just move one step backward in the cache |
| if (fCachedBreakPositions != null) { |
| if (fPositionInCache > 0) { |
| --fPositionInCache; |
| // If we're at the beginning of the cache, need to reevaluate the |
| // rule status |
| if (fPositionInCache <= 0) { |
| fLastStatusIndexValid = false; |
| } |
| int pos = fCachedBreakPositions[fPositionInCache]; |
| text.setIndex(pos); |
| return pos; |
| } else { |
| reset(); |
| } |
| } |
| |
| // if we're already sitting at the beginning of the text, return DONE |
| startPos = current(); |
| if (fText == null || startPos == fText.getBeginIndex()) { |
| fLastRuleStatusIndex = 0; |
| fLastStatusIndexValid = true; |
| return BreakIterator.DONE; |
| } |
| |
| // Rules with an exact reverse table are handled here. |
| if (fRData.fSRTable != null || fRData.fSFTable != null) { |
| result = handlePrevious(fRData.fRTable); |
| if (fDictionaryCharCount > 0) { |
| result = checkDictionary(result, startPos, true); |
| } |
| return result; |
| } |
| |
| // old rule syntax |
| // set things up. handlePrevious() will back us up to some valid |
| // break position before the current position (we back our internal |
| // iterator up one step to prevent handlePrevious() from returning |
| // the current position), but not necessarily the last one before |
| // where we started |
| |
| int start = current(); |
| |
| previous32(fText); |
| int lastResult = handlePrevious(fRData.fRTable); |
| if (lastResult == BreakIterator.DONE) { |
| lastResult = fText.getBeginIndex(); |
| fText.setIndex(lastResult); |
| } |
| result = lastResult; |
| int lastTag = 0; |
| boolean breakTagValid = false; |
| |
| // iterate forward from the known break position until we pass our |
| // starting point. The last break position before the starting |
| // point is our return value |
| |
| for (;;) { |
| result = next(); |
| if (result == BreakIterator.DONE || result >= start) { |
| break; |
| } |
| lastResult = result; |
| lastTag = fLastRuleStatusIndex; |
| breakTagValid = true; |
| } |
| |
| // fLastBreakTag wants to have the value for section of text preceding |
| // the result position that we are to return (in lastResult.) If |
| // the backwards rules overshot and the above loop had to do two or more |
| // handleNext()s to move up to the desired return position, we will have a valid |
| // tag value. But, if handlePrevious() took us to exactly the correct result position, |
| // we wont have a tag value for that position, which is only set by handleNext(). |
| |
| // Set the current iteration position to be the last break position |
| // before where we started, and then return that value. |
| fText.setIndex(lastResult); |
| fLastRuleStatusIndex = lastTag; // for use by getRuleStatus() |
| fLastStatusIndexValid = breakTagValid; |
| return lastResult; |
| } |
| |
| /** |
| * Sets the iterator to refer to the first boundary position following |
| * the specified position. |
| * @param offset The position from which to begin searching for a break position. |
| * @return The position of the first break after the current position. |
| * @stable ICU 2.0 |
| */ |
| public int following(int offset) { |
| CharacterIterator text = getText(); |
| |
| // if we have no cached break positions, or if "offset" is outside the |
| // range covered by the cache, then dump the cache and call our |
| // inherited following() method. This will call other methods in this |
| // class that may refresh the cache. |
| if (fCachedBreakPositions == null || offset < fCachedBreakPositions[0] || |
| offset >= fCachedBreakPositions[fCachedBreakPositions.length - 1]) { |
| fCachedBreakPositions = null; |
| return rulesFollowing(offset); |
| } |
| |
| // on the other hand, if "offset" is within the range covered by the |
| // cache, then just search the cache for the first break position |
| // after "offset" |
| else { |
| fPositionInCache = 0; |
| while (fPositionInCache < fCachedBreakPositions.length |
| && offset >= fCachedBreakPositions[fPositionInCache]) |
| ++fPositionInCache; |
| text.setIndex(fCachedBreakPositions[fPositionInCache]); |
| return text.getIndex(); |
| } |
| } |
| |
| private int rulesFollowing(int offset) { |
| // if the offset passed in is already past the end of the text, |
| // just return DONE; if it's before the beginning, return the |
| // text's starting offset |
| fLastRuleStatusIndex = 0; |
| fLastStatusIndexValid = true; |
| if (fText == null || offset >= fText.getEndIndex()) { |
| last(); |
| return next(); |
| } |
| else if (offset < fText.getBeginIndex()) { |
| return first(); |
| } |
| |
| // otherwise, set our internal iteration position (temporarily) |
| // to the position passed in. If this is the _beginning_ position, |
| // then we can just use next() to get our return value |
| |
| int result = 0; |
| |
| if (fRData.fSRTable != null) { |
| // Safe Point Reverse rules exist. |
| // This allows us to use the optimum algorithm. |
| fText.setIndex(offset); |
| // move forward one codepoint to prepare for moving back to a |
| // safe point. |
| // this handles offset being between a supplementary character |
| next32(fText); |
| // handlePrevious will move most of the time to < 1 boundary away |
| handlePrevious(fRData.fSRTable); |
| result = next(); |
| while (result <= offset) { |
| result = next(); |
| } |
| return result; |
| } |
| if (fRData.fSFTable != null) { |
| // No Safe point reverse table, but there is a safe pt forward table. |
| // |
| fText.setIndex(offset); |
| previous32(fText); |
| // handle next will give result >= offset |
| handleNext(fRData.fSFTable); |
| // previous will give result 0 or 1 boundary away from offset, |
| // most of the time |
| // we have to |
| int oldresult = previous(); |
| while (oldresult > offset) { |
| result = previous(); |
| if (result <= offset) { |
| return oldresult; |
| } |
| oldresult = result; |
| } |
| result = next(); |
| if (result <= offset) { |
| return next(); |
| } |
| return result; |
| } |
| // otherwise, we have to sync up first. Use handlePrevious() to back |
| // us up to a known break position before the specified position (if |
| // we can determine that the specified position is a break position, |
| // we don't back up at all). This may or may not be the last break |
| // position at or before our starting position. Advance forward |
| // from here until we've passed the starting position. The position |
| // we stop on will be the first break position after the specified one. |
| // old rule syntax |
| |
| fText.setIndex(offset); |
| if (offset == fText.getBeginIndex()) { |
| return next(); |
| } |
| result = previous(); |
| |
| while (result != BreakIterator.DONE && result <= offset) { |
| result = next(); |
| } |
| |
| return result; |
| } |
| /** |
| * Sets the iterator to refer to the last boundary position before the |
| * specified position. |
| * @param offset The position to begin searching for a break from. |
| * @return The position of the last boundary before the starting position. |
| * @stable ICU 2.0 |
| */ |
| public int preceding(int offset) { |
| CharacterIterator text = getText(); |
| |
| // if we have no cached break positions, or "offset" is outside the |
| // range covered by the cache, we can just call the inherited routine |
| // (which will eventually call other routines in this class that may |
| // refresh the cache) |
| if (fCachedBreakPositions == null || offset <= fCachedBreakPositions[0] || |
| offset > fCachedBreakPositions[fCachedBreakPositions.length - 1]) { |
| fCachedBreakPositions = null; |
| return rulesPreceding(offset); |
| } |
| |
| // on the other hand, if "offset" is within the range covered by the cache, |
| // then all we have to do is search the cache for the last break position |
| // before "offset" |
| else { |
| fPositionInCache = 0; |
| while (fPositionInCache < fCachedBreakPositions.length |
| && offset > fCachedBreakPositions[fPositionInCache]) |
| ++fPositionInCache; |
| --fPositionInCache; |
| text.setIndex(fCachedBreakPositions[fPositionInCache]); |
| return text.getIndex(); |
| } |
| } |
| |
| private int rulesPreceding(int offset) { |
| // if the offset passed in is already past the end of the text, |
| // just return DONE; if it's before the beginning, return the |
| |
| // text's starting offset |
| if (fText == null || offset > fText.getEndIndex()) { |
| // return BreakIterator::DONE; |
| return last(); |
| } |
| else if (offset < fText.getBeginIndex()) { |
| return first(); |
| } |
| |
| // if we start by updating the current iteration position to the |
| // position specified by the caller, we can just use previous() |
| // to carry out this operation |
| |
| int result; |
| if (fRData.fSFTable != null) { |
| /// todo synwee |
| // new rule syntax |
| fText.setIndex(offset); |
| // move backwards one codepoint to prepare for moving forwards to a |
| // safe point. |
| // this handles offset being between a supplementary character |
| previous32(fText); |
| handleNext(fRData.fSFTable); |
| result = previous(); |
| while (result >= offset) { |
| result = previous(); |
| } |
| return result; |
| } |
| if (fRData.fSRTable != null) { |
| // backup plan if forward safe table is not available |
| fText.setIndex(offset); |
| next32(fText); |
| // handle previous will give result <= offset |
| handlePrevious(fRData.fSRTable); |
| |
| // next will give result 0 or 1 boundary away from offset, |
| // most of the time |
| // we have to |
| int oldresult = next(); |
| while (oldresult < offset) { |
| result = next(); |
| if (result >= offset) { |
| return oldresult; |
| } |
| oldresult = result; |
| } |
| result = previous(); |
| if (result >= offset) { |
| return previous(); |
| } |
| return result; |
| } |
| |
| // old rule syntax |
| fText.setIndex(offset); |
| return previous(); |
| } |
| |
| /** |
| * Throw IllegalArgumentException unless begin <= offset < end. |
| * @stable ICU 2.0 |
| */ |
| protected static final void checkOffset(int offset, CharacterIterator text) { |
| if (offset < text.getBeginIndex() || offset > text.getEndIndex()) { |
| throw new IllegalArgumentException("offset out of bounds"); |
| } |
| } |
| |
| |
| /** |
| * Returns true if the specified position is a boundary position. As a side |
| * effect, leaves the iterator pointing to the first boundary position at |
| * or after "offset". |
| * @param offset the offset to check. |
| * @return True if "offset" is a boundary position. |
| * @stable ICU 2.0 |
| */ |
| public boolean isBoundary(int offset) { |
| checkOffset(offset, fText); |
| |
| // the beginning index of the iterator is always a boundary position by definition |
| if (offset == fText.getBeginIndex()) { |
| first(); // For side effects on current position, tag values. |
| return true; |
| } |
| |
| if (offset == fText.getEndIndex()) { |
| last(); // For side effects on current position, tag values. |
| return true; |
| } |
| |
| // otherwise, we can use following() on the position before the specified |
| // one and return true if the position we get back is the one the user |
| // specified |
| |
| // return following(offset - 1) == offset; |
| // TODO: check whether it is safe to revert to the simpler offset-1 code |
| // The safe rules may take care of unpaired surrogates ok. |
| fText.setIndex(offset); |
| previous32(fText); |
| int pos = fText.getIndex(); |
| boolean result = following(pos) == offset; |
| return result; |
| } |
| |
| /** |
| * Returns the current iteration position. |
| * @return The current iteration position. |
| * @stable ICU 2.0 |
| */ |
| public int current() { |
| return (fText != null) ? fText.getIndex() : BreakIterator.DONE; |
| } |
| |
| private void makeRuleStatusValid() { |
| if (fLastStatusIndexValid == false) { |
| // No cached status is available. |
| int curr = current(); |
| if (curr == BreakIterator.DONE || curr == fText.getBeginIndex()) { |
| // At start of text, or there is no text. Status is always zero. |
| fLastRuleStatusIndex = 0; |
| fLastStatusIndexValid = true; |
| } else { |
| // Not at start of text. Find status the tedious way. |
| int pa = fText.getIndex(); |
| first(); |
| int pb = current(); |
| while (fText.getIndex() < pa) { |
| pb = next(); |
| } |
| Assert.assrt(pa == pb); |
| } |
| Assert.assrt(fLastStatusIndexValid == true); |
| Assert.assrt(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fRData.fStatusTable.length); |
| } |
| } |
| |
| /** |
| * Return the status tag from the break rule that determined the most recently |
| * returned break position. The values appear in the rule source |
| * within brackets, {123}, for example. For rules that do not specify a |
| * status, a default value of 0 is returned. If more than one rule applies, |
| * the numerically largest of the possible status values is returned. |
| * <p> |
| * Of the standard types of ICU break iterators, only the word break |
| * iterator provides status values. The values are defined in |
| * class RuleBasedBreakIterator, and allow distinguishing between words |
| * that contain alphabetic letters, "words" that appear to be numbers, |
| * punctuation and spaces, words containing ideographic characters, and |
| * more. Call <code>getRuleStatus</code> after obtaining a boundary |
| * position from <code>next()<code>, <code>previous()</code>, or |
| * any other break iterator functions that returns a boundary position. |
| * <p> |
| * @return the status from the break rule that determined the most recently |
| * returned break position. |
| * |
| * @draft ICU 3.0 (retain) |
| * @provisional This is a draft API and might change in a future release of ICU. |
| */ |
| |
| public int getRuleStatus() { |
| makeRuleStatusValid(); |
| // Status records have this form: |
| // Count N <-- fLastRuleStatusIndex points here. |
| // Status val 0 |
| // Status val 1 |
| // ... |
| // Status val N-1 <-- the value we need to return |
| // The status values are sorted in ascending order. |
| // This function returns the last (largest) of the array of status values. |
| int idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex]; |
| int tagVal = fRData.fStatusTable[idx]; |
| return tagVal; |
| } |
| |
| /** |
| * Get the status (tag) values from the break rule(s) that determined the most |
| * recently returned break position. The values appear in the rule source |
| * within brackets, {123}, for example. The default status value for rules |
| * that do not explicitly provide one is zero. |
| * <p> |
| * The status values used by the standard ICU break rules are defined |
| * as public constants in class RuleBasedBreakIterator. |
| * <p> |
| * If the size of the output array is insufficient to hold the data, |
| * the output will be truncated to the available length. No exception |
| * will be thrown. |
| * |
| * @param fillInArray an array to be filled in with the status values. |
| * @return The number of rule status values from rules that determined |
| * the most recent boundary returned by the break iterator. |
| * In the event that the array is too small, the return value |
| * is the total number of status values that were available, |
| * not the reduced number that were actually returned. |
| * @draft ICU 3.0 (retain) |
| * @provisional This is a draft API and might change in a future release of ICU. |
| */ |
| public int getRuleStatusVec(int[] fillInArray) { |
| makeRuleStatusValid(); |
| int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex]; |
| if (fillInArray != null) { |
| int numToCopy = Math.min(numStatusVals, fillInArray.length); |
| for (int i=0; i<numToCopy; i++) { |
| fillInArray[i] = fRData.fStatusTable[fLastRuleStatusIndex + i + 1]; |
| } |
| } |
| return numStatusVals; |
| } |
| |
| /** |
| * Return a CharacterIterator over the text being analyzed. This version |
| * of this method returns the actual CharacterIterator we're using internally. |
| * Changing the state of this iterator can have undefined consequences. If |
| * you need to change it, clone it first. |
| * @return An iterator over the text being analyzed. |
| * @stable ICU 2.0 |
| */ |
| public CharacterIterator getText() { |
| return fText; |
| } |
| |
| /** |
| * Set the iterator to analyze a new piece of text. This function resets |
| * the current iteration position to the beginning of the text. |
| * @param newText An iterator over the text to analyze. |
| * @stable ICU 2.0 |
| */ |
| public void setText(CharacterIterator newText) { |
| fText = newText; |
| // first() resets the caches |
| this.first(); |
| } |
| |
| /** |
| * package private |
| */ |
| void setBreakType(int type) { |
| fBreakType = type; |
| } |
| |
| /** |
| * package private |
| */ |
| int getBreakType() { |
| return fBreakType; |
| } |
| |
| /** |
| * Control debug, trace and dump options. |
| * @internal |
| */ |
| static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ? |
| ICUDebug.value(RBBI_DEBUG_ARG) : null; |
| |
| |
| private LanguageBreakEngine getLanguageBreakEngine(int c) { |
| |
| // We have a dictionary character. |
| // Does an already instantiated break engine handle it? |
| for (LanguageBreakEngine candidate : fBreakEngines.values()) { |
| if (candidate.handles(c, fBreakType)) { |
| return candidate; |
| } |
| } |
| |
| // if we don't have an existing engine, build one. |
| int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); |
| if (script == UScript.KATAKANA || script == UScript.HIRAGANA) { |
| // Katakana, Hiragana and Han are handled by the same dictionary engine. |
| // Fold them together for mapping from script -> engine. |
| script = UScript.HAN; |
| } |
| |
| LanguageBreakEngine eng = fBreakEngines.get(script); |
| /* |
| if (eng != null && !eng.handles(c, fBreakType)) { |
| fUnhandledBreakEngine.handleChar(c, getBreakType()); |
| eng = fUnhandledBreakEngine; |
| } else */ { |
| try { |
| switch (script) { |
| case UScript.THAI: |
| eng = new ThaiBreakEngine(); |
| break; |
| case UScript.LAO: |
| eng = new LaoBreakEngine(); |
| break; |
| case UScript.MYANMAR: |
| eng = new BurmeseBreakEngine(); |
| break; |
| case UScript.KHMER: |
| eng = new KhmerBreakEngine(); |
| break; |
| case UScript.HAN: |
| if (getBreakType() == KIND_WORD) { |
| eng = new CjkBreakEngine(false); |
| } |
| else { |
| fUnhandledBreakEngine.handleChar(c, getBreakType()); |
| eng = fUnhandledBreakEngine; |
| } |
| break; |
| case UScript.HANGUL: |
| if (getBreakType() == KIND_WORD) { |
| eng = new CjkBreakEngine(true); |
| } else { |
| fUnhandledBreakEngine.handleChar(c, getBreakType()); |
| eng = fUnhandledBreakEngine; |
| } |
| break; |
| default: |
| fUnhandledBreakEngine.handleChar(c, getBreakType()); |
| eng = fUnhandledBreakEngine; |
| break; |
| } |
| } catch (IOException e) { |
| eng = null; |
| } |
| } |
| |
| if (eng != null && eng != fUnhandledBreakEngine) { |
| LanguageBreakEngine existingEngine = fBreakEngines.putIfAbsent(script, eng); |
| if (existingEngine != null) { |
| // There was a race & another thread was first to register an engine for this script. |
| // Use theirs and discard the one we just created. |
| eng = existingEngine; |
| } |
| // assert eng.handles(c, fBreakType); |
| } |
| return eng; |
| } |
| |
| |
| |
| /** |
| * The State Machine Engine for moving forward is here. |
| * This function is the heart of the RBBI run time engine. |
| * |
| * @param stateTable |
| * @return the new iterator position |
| * |
| * A note on supplementary characters and the position of underlying |
| * Java CharacterIterator: Normally, a character iterator is positioned at |
| * the char most recently returned by next(). Within this function, when |
| * a supplementary char is being processed, the char iterator is left |
| * sitting on the trail surrogate, in the middle of the code point. |
| * This is different from everywhere else, where an iterator always |
| * points at the lead surrogate of a supplementary. |
| */ |
| private int handleNext(short stateTable[]) { |
| if (TRACE) { |
| System.out.println("Handle Next pos char state category"); |
| } |
| |
| // No matter what, handleNext alway correctly sets the break tag value. |
| fLastStatusIndexValid = true; |
| fLastRuleStatusIndex = 0; |
| |
| // caches for quicker access |
| CharacterIterator text = fText; |
| CharTrie trie = fRData.fTrie; |
| |
| // Set up the starting char |
| int c = text.current(); |
| if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { |
| c = nextTrail32(text, c); |
| if (c == DONE32) { |
| return BreakIterator.DONE; |
| } |
| } |
| int initialPosition = text.getIndex(); |
| int result = initialPosition; |
| |
| // Set the initial state for the state machine |
| int state = START_STATE; |
| int row = fRData.getRowIndex(state); |
| short category = 3; |
| int flagsState = fRData.getStateTableFlags(stateTable); |
| int mode = RBBI_RUN; |
| if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) { |
| category = 2; |
| mode = RBBI_START; |
| if (TRACE) { |
| System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); |
| System.out.print(RBBIDataWrapper.intToHexString(c, 10)); |
| System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); |
| } |
| } |
| int lookaheadStatus = 0; |
| int lookaheadTagIdx = 0; |
| int lookaheadResult = 0; |
| |
| // loop until we reach the end of the text or transition to state 0 |
| while (state != STOP_STATE) { |
| if (c == DONE32) { |
| // Reached end of input string. |
| if (mode == RBBI_END) { |
| // We have already run the loop one last time with the |
| // character set to the pseudo {eof} value. Now it is time |
| // to unconditionally bail out. |
| |
| if (lookaheadResult > result) { |
| // We ran off the end of the string with a pending |
| // look-ahead match. |
| // Treat this as if the look-ahead condition had been |
| // met, and return |
| // the match at the / position from the look-ahead rule. |
| result = lookaheadResult; |
| fLastRuleStatusIndex = lookaheadTagIdx; |
| } |
| break; |
| } |
| // Run the loop one last time with the fake end-of-input character category |
| mode = RBBI_END; |
| category = 1; |
| } |
| else if (mode == RBBI_RUN) { |
| // Get the char category. An incoming category of 1 or 2 mens that |
| // we are preset for doing the beginning or end of input, and |
| // that we shouldn't get a category from an actual text input character. |
| // |
| |
| // look up the current character's character category, which tells us |
| // which column in the state table to look at. |
| // |
| category = (short) trie.getCodePointValue(c); |
| |
| // Check the dictionary bit in the character's category. |
| // Counter is only used by dictionary based iterators (subclasses). |
| // Chars that need to be handled by a dictionary have a flag bit set |
| // in their category values. |
| // |
| if ((category & 0x4000) != 0) { |
| fDictionaryCharCount++; |
| // And off the dictionary flag bit. |
| category &= ~0x4000; |
| } |
| |
| if (TRACE) { |
| System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); |
| System.out.print(RBBIDataWrapper.intToHexString(c, 10)); |
| System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); |
| } |
| |
| // Advance to the next character. |
| // If this is a beginning-of-input loop iteration, don't advance. |
| // The next iteration will be processing the first real input character. |
| c = (int)text.next(); |
| if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { |
| c = nextTrail32(text, c); |
| } |
| } |
| else { |
| mode = RBBI_RUN; |
| } |
| |
| // look up a state transition in the state table |
| state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; |
| row = fRData.getRowIndex(state); |
| |
| if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) { |
| // Match found, common case |
| result = text.getIndex(); |
| if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { |
| // The iterator has been left in the middle of a surrogate pair. |
| // We want the start of it. |
| result--; |
| } |
| |
| // Remember the break status (tag) values. |
| fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX]; |
| } |
| |
| if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) { |
| if (lookaheadStatus != 0 |
| && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) { |
| // Lookahead match is completed. Set the result accordingly, but only |
| // if no other rule has matched further in the mean time. |
| result = lookaheadResult; |
| fLastRuleStatusIndex = lookaheadTagIdx; |
| lookaheadStatus = 0; |
| // TODO: make a standalone hard break in a rule work. |
| if ((flagsState & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0) { |
| text.setIndex(result); |
| return result; |
| } |
| // Look-ahead completed, but other rules may match further. Continue on. |
| // TODO: junk this feature? I don't think it's used anywhere. |
| continue; |
| } |
| |
| lookaheadResult = text.getIndex(); |
| if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { |
| // The iterator has been left in the middle of a surrogate pair. |
| // We want the beginning of it. |
| lookaheadResult--; |
| } |
| lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD]; |
| lookaheadTagIdx = stateTable[row + RBBIDataWrapper.TAGIDX]; |
| continue; |
| } |
| |
| if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) { |
| // Because this is an accepting state, any in-progress look-ahead match |
| // is no longer relevant. Clear out the pending lookahead status. |
| lookaheadStatus = 0; |
| } |
| } // End of state machine main loop |
| |
| // The state machine is done. Check whether it found a match... |
| |
| // If the iterator failed to advance in the match engine force it ahead by one. |
| // This indicates a defect in the break rules, which should always match |
| // at least one character. |
| |
| if (result == initialPosition) { |
| if (TRACE) { |
| System.out.println("Iterator did not move. Advancing by 1."); |
| } |
| text.setIndex(initialPosition); |
| next32(text); |
| result = text.getIndex(); |
| } |
| else { |
| // Leave the iterator at our result position. |
| // (we may have advanced beyond the last accepting position chasing after |
| // longer matches that never completed.) |
| text.setIndex(result); |
| } |
| if (TRACE) { |
| System.out.println("result = " + result); |
| } |
| return result; |
| } |
| |
| private int handlePrevious(short stateTable[]) { |
| if (fText == null || stateTable == null) { |
| return 0; |
| } |
| |
| int state; |
| int category = 0; |
| int mode; |
| int row; |
| int c; |
| int lookaheadStatus = 0; |
| int result = 0; |
| int initialPosition = 0; |
| int lookaheadResult = 0; |
| boolean lookAheadHardBreak = |
| (fRData.getStateTableFlags(stateTable) & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0; |
| |
| // handlePrevious() never gets the rule status. |
| // Flag the status as invalid; if the user ever asks for status, we will need |
| // to back up, then re-find the break position using handleNext(), which does |
| // get the status value. |
| fLastStatusIndexValid = false; |
| fLastRuleStatusIndex = 0; |
| |
| // set up the starting char |
| initialPosition = fText.getIndex(); |
| result = initialPosition; |
| c = previous32(fText); |
| |
| // Set up the initial state for the state machine |
| state = START_STATE; |
| row = fRData.getRowIndex(state); |
| category = 3; // TODO: obsolete? from the old start/run mode scheme? |
| mode = RBBI_RUN; |
| if ((fRData.getStateTableFlags(stateTable) & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) { |
| category = 2; |
| mode = RBBI_START; |
| } |
| |
| if (TRACE) { |
| System.out.println("Handle Prev pos char state category "); |
| } |
| |
| // loop until we reach the beginning of the text or transition to state 0 |
| // |
| mainLoop: for (;;) { |
| innerBlock: { |
| if (c == DONE32) { |
| // Reached end of input string. |
| if (mode == RBBI_END || fRData.fHeader.fVersion == 1) { |
| // Either this is the old (ICU 3.2 and earlier) format data which |
| // does not support explicit support for matching {eof}, or |
| // we have already done the {eof} iteration. Now is the time |
| // to unconditionally bail out. |
| if (lookaheadResult < result) { |
| // We ran off the end of the string with a pending look-ahead match. |
| // Treat this as if the look-ahead condition had been met, and return |
| // the match at the / position from the look-ahead rule. |
| result = lookaheadResult; |
| lookaheadStatus = 0; |
| } else if (result == initialPosition) { |
| // Ran off start, no match found. |
| // Move one position (towards the start, since we are doing previous.) |
| fText.setIndex(initialPosition); |
| previous32(fText); |
| } |
| break mainLoop; |
| } |
| mode = RBBI_END; |
| category = 1; |
| } |
| |
| if (mode == RBBI_RUN) { |
| // look up the current character's category, which tells us |
| // which column in the state table to look at. |
| // |
| category = (short) fRData.fTrie.getCodePointValue(c); |
| |
| // Check the dictionary bit in the character's category. |
| // Counter is only used by dictionary based iterators (subclasses). |
| // Chars that need to be handled by a dictionary have a flag bit set |
| // in their category values. |
| // |
| if ((category & 0x4000) != 0) { |
| fDictionaryCharCount++; |
| // And off the dictionary flag bit. |
| category &= ~0x4000; |
| } |
| } |
| |
| |
| if (TRACE) { |
| System.out.print(" " + fText.getIndex() + " "); |
| if (0x20 <= c && c < 0x7f) { |
| System.out.print(" " + c + " "); |
| } else { |
| System.out.print(" " + Integer.toHexString(c) + " "); |
| } |
| System.out.println(" " + state + " " + category + " "); |
| } |
| |
| // State Transition - move machine to its next state |
| // |
| state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; |
| row = fRData.getRowIndex(state); |
| |
| if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) { |
| // Match found, common case, could have lookahead so we move |
| // on to check it |
| result = fText.getIndex(); |
| } |
| |
| if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) { |
| if (lookaheadStatus != 0 |
| && stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) { |
| // Lookahead match is completed. Set the result |
| // accordingly, but only |
| // if no other rule has matched further in the mean |
| // time. |
| result = lookaheadResult; |
| lookaheadStatus = 0; |
| // TODO: make a stand-alone hard break in a rule work. |
| |
| if (lookAheadHardBreak) { |
| break mainLoop; |
| } |
| // Look-ahead completed, but other rules may match further. |
| // Continue on. |
| // TODO: junk this feature? I don't think that it's used anywhere. |
| break innerBlock; |
| } |
| // Hit a possible look-ahead match. We are at the |
| // position of the '/'. Remember this position. |
| lookaheadResult = fText.getIndex(); |
| lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD]; |
| break innerBlock; |
| } |
| |
| // not lookahead... |
| if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) { |
| // This is a plain (non-look-ahead) accepting state. |
| if (!lookAheadHardBreak) { |
| // Clear out any pending look-ahead matches, |
| // but only if not doing the lookAheadHardBreak option |
| // which needs to force a break no matter what is going |
| // on with the rest of the match, i.e. we can't abandon |
| // a partially completed look-ahead match because |
| // some other rule matched further than the '/' position |
| // in the look-ahead match. |
| lookaheadStatus = 0; |
| } |
| } |
| |
| } // end of innerBlock. "break innerBlock" in above code comes out here. |
| |
| |
| if (state == STOP_STATE) { |
| // Normal loop exit is here |
| break mainLoop; |
| } |
| |
| // then move iterator position backwards one character |
| // |
| if (mode == RBBI_RUN) { |
| c = previous32(fText); |
| } else { |
| if (mode == RBBI_START) { |
| mode = RBBI_RUN; |
| } |
| } |
| |
| |
| } // End of the main loop. |
| |
| // The state machine is done. Check whether it found a match... |
| // |
| // If the iterator failed to advance in the match engine, force it ahead by one. |
| // (This really indicates a defect in the break rules. They should always match |
| // at least one character.) |
| if (result == initialPosition) { |
| result = fText.setIndex(initialPosition); |
| previous32(fText); |
| result = fText.getIndex(); |
| } |
| |
| fText.setIndex(result); |
| if (TRACE) { |
| System.out.println("Result = " + result); |
| } |
| |
| return result; |
| } |
| } |
| |