| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /* |
| ******************************************************************************* |
| * Copyright (C) 2005-2016 International Business Machines Corporation and |
| * others. All Rights Reserved. |
| ******************************************************************************* |
| */ |
| |
| package com.ibm.icu.text; |
| |
| import static com.ibm.icu.impl.CharacterIteration.DONE32; |
| import static com.ibm.icu.impl.CharacterIteration.next32; |
| import static com.ibm.icu.impl.CharacterIteration.nextTrail32; |
| |
| import java.io.ByteArrayOutputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.OutputStream; |
| import java.nio.ByteBuffer; |
| import java.text.CharacterIterator; |
| import java.util.ArrayList; |
| import java.util.List; |
| |
| import com.ibm.icu.impl.CharacterIteration; |
| import com.ibm.icu.impl.ICUBinary; |
| import com.ibm.icu.impl.ICUDebug; |
| import com.ibm.icu.impl.RBBIDataWrapper; |
| import com.ibm.icu.lang.UCharacter; |
| import com.ibm.icu.lang.UProperty; |
| import com.ibm.icu.lang.UScript; |
| import com.ibm.icu.util.CodePointTrie; |
| |
| /** |
| * Rule Based Break Iterator |
| * This is a port of the C++ class RuleBasedBreakIterator from ICU4C. |
| * |
| * @stable ICU 2.0 |
| */ |
| public class RuleBasedBreakIterator extends BreakIterator { |
| //======================================================================= |
| // Constructors & Factories |
| //======================================================================= |
| |
| /** |
| * private constructor |
| */ |
| private RuleBasedBreakIterator() { |
| fDictionaryCharCount = 0; |
| synchronized(gAllBreakEngines) { |
| fBreakEngines = new ArrayList<>(gAllBreakEngines); |
| } |
| } |
| |
| /** |
| * Create a break iterator from a precompiled set of break rules. |
| * |
| * Creating a break iterator from the binary rules is much faster than |
| * creating one from source rules. |
| * |
| * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. |
| * Binary break iterator rules are not guaranteed to be compatible between |
| * different versions of ICU. |
| * |
| * @param is an input stream supplying the compiled binary rules. |
| * @throws IOException if there is an error while reading the rules from the InputStream. |
| * @see #compileRules(String, OutputStream) |
| * @stable ICU 4.8 |
| */ |
| public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException { |
| RuleBasedBreakIterator This = new RuleBasedBreakIterator(); |
| This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is)); |
| This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize]; |
| return This; |
| } |
| |
| /** |
| * Create a break iterator from a precompiled set of break rules. |
| * |
| * Creating a break iterator from the binary rules is much faster than |
| * creating one from source rules. |
| * |
| * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. |
| * Binary break iterator rules are not guaranteed to be compatible between |
| * different versions of ICU. |
| * |
| * @param bytes a buffer supplying the compiled binary rules. |
| * @throws IOException if there is an error while reading the rules from the buffer. |
| * @see #compileRules(String, OutputStream) |
| * @internal |
| * @deprecated This API is ICU internal only. |
| */ |
| @Deprecated |
| public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException { |
| RuleBasedBreakIterator This = new RuleBasedBreakIterator(); |
| This.fRData = RBBIDataWrapper.get(bytes); |
| This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize]; |
| return This; |
| } |
| |
| /** |
| * Construct a RuleBasedBreakIterator from a set of rules supplied as a string. |
| * @param rules The break rules to be used. |
| * @stable ICU 2.2 |
| */ |
| public RuleBasedBreakIterator(String rules) { |
| this(); |
| try { |
| ByteArrayOutputStream ruleOS = new ByteArrayOutputStream(); |
| compileRules(rules, ruleOS); |
| fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray())); |
| fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize]; |
| } catch (IOException e) { |
| ///CLOVER:OFF |
| // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler, |
| // causing bogus compiled rules to be produced, but with no compile error raised. |
| RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: " |
| + e.getMessage()); |
| throw rte; |
| ///CLOVER:ON |
| } |
| } |
| |
| //======================================================================= |
| // Boilerplate |
| //======================================================================= |
| |
| /** |
| * Clones this iterator. |
| * @return A newly-constructed RuleBasedBreakIterator with the same |
| * behavior as this one. |
| * @stable ICU 2.0 |
| */ |
| @Override |
| public Object clone() { |
| RuleBasedBreakIterator result; |
| result = (RuleBasedBreakIterator)super.clone(); |
| if (fText != null) { |
| result.fText = (CharacterIterator)(fText.clone()); |
| } |
| synchronized (gAllBreakEngines) { |
| result.fBreakEngines = new ArrayList<>(gAllBreakEngines); |
| } |
| result.fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize]; |
| result.fBreakCache = result.new BreakCache(fBreakCache); |
| result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache); |
| return result; |
| } |
| |
| |
| /** |
| * Returns true if both BreakIterators are of the same class, have the same |
| * rules, and iterate over the same text. |
| * @stable ICU 2.0 |
| */ |
| @Override |
| public boolean equals(Object that) { |
| if (that == null) { |
| return false; |
| } |
| if (this == that) { |
| return true; |
| } |
| try { |
| RuleBasedBreakIterator other = (RuleBasedBreakIterator) that; |
| if (fRData != other.fRData && (fRData == null || other.fRData == null)) { |
| return false; |
| } |
| if (fRData != null && other.fRData != null && |
| (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) { |
| return false; |
| } |
| if (fText == null && other.fText == null) { |
| return true; |
| } |
| if (fText == null || other.fText == null || !fText.equals(other.fText)) { |
| return false; |
| } |
| return fPosition == other.fPosition; |
| } |
| catch(ClassCastException e) { |
| return false; |
| } |
| } |
| |
| /** |
| * Returns the description (rules) used to create this iterator. |
| * (In ICU4C, the same function is RuleBasedBreakIterator::getRules()) |
| * @stable ICU 2.0 |
| */ |
| @Override |
| public String toString() { |
| String retStr = ""; |
| if (fRData != null) { |
| retStr = fRData.fRuleSource; |
| } |
| return retStr; |
| } |
| |
| /** |
| * Compute a hashcode for this BreakIterator |
| * @return A hash code |
| * @stable ICU 2.0 |
| */ |
| @Override |
| public int hashCode() |
| { |
| return fRData.fRuleSource.hashCode(); |
| } |
| |
| |
| private static final int START_STATE = 1; // The state number of the starting state |
| private static final int STOP_STATE = 0; // The state-transition value indicating "stop" |
| |
| // RBBIRunMode - the state machine runs an extra iteration at the beginning and end |
| // of user text. A variable with this enum type keeps track of where we |
| // are. The state machine only fetches user text input while in RUN mode. |
| private static final int RBBI_START = 0; |
| private static final int RBBI_RUN = 1; |
| private static final int RBBI_END = 2; |
| |
| /** |
| * The character iterator through which this BreakIterator accesses the text. |
| */ |
| private CharacterIterator fText = new java.text.StringCharacterIterator(""); |
| |
| /** |
| * The rule data for this BreakIterator instance. |
| * Not intended for public use. Declared public for testing purposes only. |
| * @internal |
| * @deprecated This API is ICU internal only. |
| */ |
| @Deprecated |
| public RBBIDataWrapper fRData; |
| |
| /** |
| * The iteration state - current position, rule status for the current position, |
| * and whether the iterator ran off the end, yielding UBRK_DONE. |
| * Current position is pinned to be 0 < position <= text.length. |
| * Current position is always set to a boundary. |
| * |
| * The current position of the iterator. Pinned, 0 < fPosition <= text.length. |
| * Never has the value UBRK_DONE (-1). |
| */ |
| private int fPosition; |
| |
| /** |
| * Index of the Rule {tag} values for the most recent match. |
| */ |
| private int fRuleStatusIndex; |
| |
| /** |
| * True when iteration has run off the end, and iterator functions should return UBRK_DONE. |
| */ |
| private boolean fDone; |
| |
| /** |
| * Array of look-ahead tentative results. |
| */ |
| private int[] fLookAheadMatches; |
| |
| /** |
| * Cache of previously determined boundary positions. |
| */ |
| private BreakCache fBreakCache = new BreakCache(); |
| |
| |
| /** |
| * Counter for the number of characters encountered with the "dictionary" |
| * flag set. Normal RBBI iterators don't use it, although the code |
| * for updating it is live. Dictionary Based break iterators (a subclass |
| * of us) access this field directly. |
| * @internal |
| */ |
| private int fDictionaryCharCount; |
| |
| private DictionaryCache fDictionaryCache = new DictionaryCache(); |
| |
| /** |
| * ICU debug argument name for RBBI |
| */ |
| private static final String RBBI_DEBUG_ARG = "rbbi"; |
| |
| /** |
| * Debugging flag. Trace operation of state machine when true. |
| */ |
| private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG) |
| && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0; |
| |
| /** |
| * The "default" break engine - just skips over ranges of dictionary words, |
| * producing no breaks. Should only be used if characters need to be handled |
| * by a dictionary but we have no dictionary implementation for them. |
| * |
| * Only one instance; shared by all break iterators. |
| */ |
| private static final UnhandledBreakEngine gUnhandledBreakEngine; |
| |
| /** |
| * List of all known break engines, common for all break iterators. |
| * Lazily updated as break engines are needed, because instantiation of |
| * break engines is expensive. |
| * |
| * Because gAllBreakEngines can be referenced concurrently from different |
| * BreakIterator instances, all access is synchronized. |
| */ |
| private static final List<LanguageBreakEngine> gAllBreakEngines; |
| |
| static { |
| gUnhandledBreakEngine = new UnhandledBreakEngine(); |
| gAllBreakEngines = new ArrayList<>(); |
| gAllBreakEngines.add(gUnhandledBreakEngine); |
| } |
| |
| /** |
| * List of all known break engines. Similar to gAllBreakEngines, but local to a |
| * break iterator, allowing it to be used without synchronization. |
| */ |
| private List<LanguageBreakEngine> fBreakEngines; |
| |
| /** |
| * Dump the contents of the state table and character classes for this break iterator. |
| * For debugging only. |
| * @internal |
| * @deprecated This API is ICU internal only. |
| */ |
| @Deprecated |
| public void dump(java.io.PrintStream out) { |
| if (out == null) { |
| out = System.out; |
| } |
| this.fRData.dump(out); |
| } |
| |
| /** |
| * Compile a set of source break rules into the binary state tables used |
| * by the break iterator engine. Creating a break iterator from precompiled |
| * rules is much faster than creating one from source rules. |
| * |
| * Binary break rules are not guaranteed to be compatible between different |
| * versions of ICU. |
| * |
| * |
| * @param rules The source form of the break rules |
| * @param ruleBinary An output stream to receive the compiled rules. |
| * @throws IOException If there is an error writing the output. |
| * @see #getInstanceFromCompiledRules(InputStream) |
| * @stable ICU 4.8 |
| */ |
| public static void compileRules(String rules, OutputStream ruleBinary) throws IOException { |
| RBBIRuleBuilder.compileRules(rules, ruleBinary); |
| } |
| |
| //======================================================================= |
| // BreakIterator overrides |
| //======================================================================= |
| |
| /** |
| * Sets the current iteration position to the beginning of the text. |
| * (i.e., the CharacterIterator's starting offset). |
| * @return The offset of the beginning of the text. |
| * @stable ICU 2.0 |
| */ |
| @Override |
| public int first() { |
| if (fText == null) { |
| return BreakIterator.DONE; |
| } |
| fText.first(); |
| int start = fText.getIndex(); |
| if (!fBreakCache.seek(start)) { |
| fBreakCache.populateNear(start); |
| } |
| fBreakCache.current(); |
| assert(fPosition == start); |
| return fPosition; |
| } |
| |
| /** |
| * Sets the current iteration position to the end of the text. |
| * (i.e., the CharacterIterator's ending offset). |
| * @return The text's past-the-end offset. |
| * @stable ICU 2.0 |
| */ |
| @Override |
| public int last() { |
| if (fText == null) { |
| return BreakIterator.DONE; |
| } |
| int endPos = fText.getEndIndex(); |
| boolean endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position. |
| assert(endShouldBeBoundary); |
| if (fPosition != endPos) { |
| assert(fPosition == endPos); |
| } |
| return endPos; |
| } |
| |
| /** |
| * Advances the iterator either forward or backward the specified number of steps. |
| * Negative values move backward, and positive values move forward. This is |
| * equivalent to repeatedly calling next() or previous(). |
| * @param n The number of steps to move. The sign indicates the direction |
| * (negative is backwards, and positive is forwards). |
| * @return The character offset of the boundary position n boundaries away from |
| * the current one. |
| * @stable ICU 2.0 |
| */ |
| @Override |
| public int next(int n) { |
| int result = 0; |
| if (n > 0) { |
| for (; n > 0 && result != DONE; --n) { |
| result = next(); |
| } |
| } else if (n < 0) { |
| for (; n < 0 && result != DONE; ++n) { |
| result = previous(); |
| } |
| } else { |
| result = current(); |
| } |
| return result; |
| } |
| |
| /** |
| * Advances the iterator to the next boundary position. |
| * @return The position of the first boundary after this one. |
| * @stable ICU 2.0 |
| */ |
| @Override |
| public int next() { |
| fBreakCache.next(); |
| return fDone ? DONE : fPosition; |
| } |
| |
| /** |
| * Moves the iterator backwards, to the boundary preceding the current one. |
| * @return The position of the boundary position immediately preceding the starting position. |
| * @stable ICU 2.0 |
| */ |
| @Override |
| public int previous() { |
| fBreakCache.previous(); |
| return fDone ? DONE : fPosition; |
| } |
| |
| /** |
| * Sets the iterator to refer to the first boundary position following |
| * the specified position. |
| * @param startPos The position from which to begin searching for a break position. |
| * @return The position of the first break after the current position. |
| * @stable ICU 2.0 |
| */ |
| @Override |
| public int following(int startPos) { |
| // if the supplied position is before the beginning, return the |
| // text's starting offset |
| if (startPos < fText.getBeginIndex()) { |
| return first(); |
| } |
| |
| // Move requested offset to a code point start. It might be between a lead and trail surrogate. |
| // Or it may be beyond the end of the text. |
| startPos = CISetIndex32(fText, startPos); |
| fBreakCache.following(startPos); |
| return fDone ? DONE : fPosition; |
| } |
| |
| |
| /** |
| * Sets the iterator to refer to the last boundary position before the |
| * specified position. |
| * @param offset The position to begin searching for a break from. |
| * @return The position of the last boundary before the starting position. |
| * @stable ICU 2.0 |
| */ |
| @Override |
| public int preceding(int offset) { |
| if (fText == null || offset > fText.getEndIndex()) { |
| return last(); |
| } else if (offset < fText.getBeginIndex()) { |
| return first(); |
| } |
| |
| // Move requested offset to a code point start. It might be between a lead and trail surrogate. |
| // int adjustedOffset = CISetIndex32(fText, offset); // TODO: restore to match ICU4C behavior. |
| int adjustedOffset = offset; |
| fBreakCache.preceding(adjustedOffset); |
| return fDone ? DONE : fPosition; |
| |
| } |
| |
| |
| /** |
| * Throw IllegalArgumentException unless begin <= offset < end. |
| * @stable ICU 2.0 |
| */ |
| protected static final void checkOffset(int offset, CharacterIterator text) { |
| if (offset < text.getBeginIndex() || offset > text.getEndIndex()) { |
| throw new IllegalArgumentException("offset out of bounds"); |
| } |
| } |
| |
| |
| /** |
| * Returns true if the specified position is a boundary position. As a side |
| * effect, leaves the iterator pointing to the first boundary position at |
| * or after "offset". |
| * @param offset the offset to check. |
| * @return True if "offset" is a boundary position. |
| * @stable ICU 2.0 |
| */ |
| @Override |
| public boolean isBoundary(int offset) { |
| // TODO: behavior difference with ICU4C, which considers out-of-range offsets |
| // to not be boundaries, and to not be errors. |
| checkOffset(offset, fText); |
| |
| // Adjust offset to be on a code point boundary and not beyond the end of the text. |
| // Note that isBoundary() is always false for offsets that are not on code point boundaries. |
| // But we still need the side effect of leaving iteration at the following boundary. |
| int adjustedOffset = CISetIndex32(fText, offset); |
| |
| boolean result = false; |
| if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) { |
| result = (fBreakCache.current() == offset); |
| } |
| |
| if (!result) { |
| // Not on a boundary. isBoundary() must leave iterator on the following boundary. |
| // fBreakCache.seek(), above, left us on the preceding boundary, so advance one. |
| next(); |
| } |
| return result; |
| |
| } |
| |
| /** |
| * Returns the current iteration position. Note that DONE is never |
| * returned from this function; if iteration has run to the end of a |
| * string, current() will return the length of the string while |
| * next() will return BreakIterator.DONE). |
| * @return The current iteration position. |
| * @stable ICU 2.0 |
| */ |
| @Override |
| public int current() { |
| return (fText != null) ? fPosition : BreakIterator.DONE; |
| } |
| |
| |
| /** |
| * Return the status tag from the break rule that determined the boundary at |
| * the current iteration position. The values appear in the rule source |
| * within brackets, {123}, for example. For rules that do not specify a |
| * status, a default value of 0 is returned. If more than one rule applies, |
| * the numerically largest of the possible status values is returned. |
| * <p> |
| * Of the standard types of ICU break iterators, only the word and line break |
| * iterator provides status values. The values are defined in |
| * class RuleBasedBreakIterator, and allow distinguishing between words |
| * that contain alphabetic letters, "words" that appear to be numbers, |
| * punctuation and spaces, words containing ideographic characters, and |
| * more. Call <code>getRuleStatus</code> after obtaining a boundary |
| * position from <code>next()</code>, <code>previous()</code>, or |
| * any other break iterator functions that returns a boundary position. |
| * <p> |
| * Note that <code>getRuleStatus()</code> returns the value corresponding to |
| * <code>current()</code> index even after <code>next()</code> has returned DONE. |
| * <p> |
| |
| * @return the status from the break rule that determined the boundary |
| * at the current iteration position. |
| * |
| * @stable ICU 60 |
| */ |
| |
| @Override |
| public int getRuleStatus() { |
| // Status records have this form: |
| // Count N <-- fLastRuleStatusIndex points here. |
| // Status val 0 |
| // Status val 1 |
| // ... |
| // Status val N-1 <-- the value we need to return |
| // The status values are sorted in ascending order. |
| // This function returns the last (largest) of the array of status values. |
| int idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex]; |
| int tagVal = fRData.fStatusTable[idx]; |
| return tagVal; |
| } |
| |
| /** |
| * Get the status (tag) values from the break rule(s) that determined the boundary |
| * at the current iteration position. The values appear in the rule source |
| * within brackets, {123}, for example. The default status value for rules |
| * that do not explicitly provide one is zero. |
| * <p> |
| * The status values used by the standard ICU break rules are defined |
| * as public constants in class RuleBasedBreakIterator. |
| * <p> |
| * If the size of the output array is insufficient to hold the data, |
| * the output will be truncated to the available length. No exception |
| * will be thrown. |
| * |
| * @param fillInArray an array to be filled in with the status values. |
| * @return The number of rule status values from the rules that determined |
| * the boundary at the current iteration position. |
| * In the event that the array is too small, the return value |
| * is the total number of status values that were available, |
| * not the reduced number that were actually returned. |
| * @stable ICU 60 |
| */ |
| @Override |
| public int getRuleStatusVec(int[] fillInArray) { |
| int numStatusVals = fRData.fStatusTable[fRuleStatusIndex]; |
| if (fillInArray != null) { |
| int numToCopy = Math.min(numStatusVals, fillInArray.length); |
| for (int i=0; i<numToCopy; i++) { |
| fillInArray[i] = fRData.fStatusTable[fRuleStatusIndex + i + 1]; |
| } |
| } |
| return numStatusVals; |
| } |
| |
| /** |
| * Returns a CharacterIterator over the text being analyzed. |
| * <p> |
| * <b><i>Caution:</i></b>The state of the returned CharacterIterator |
| * must not be modified in any way while the BreakIterator is still in use. |
| * Doing so will lead to undefined behavior of the BreakIterator. |
| * Clone the returned CharacterIterator first and work with that. |
| * <p> |
| * The returned CharacterIterator is a reference |
| * to the <b>actual iterator being used</b> by the BreakIterator. |
| * No guarantees are made about the current position |
| * of this iterator when it is returned; it may differ from the |
| * BreakIterators current position. If you need to move that |
| * position to examine the text, clone this function's return value first. |
| * @return An iterator over the text being analyzed. |
| * @stable ICU 2.0 |
| */ |
| @Override |
| public CharacterIterator getText() { |
| return fText; |
| } |
| |
| /** |
| * Set the iterator to analyze a new piece of text. This function resets |
| * the current iteration position to the beginning of the text. |
| * (The old iterator is dropped.) |
| * <p> |
| * <b><i>Caution:</i></b> The supplied CharacterIterator is used |
| * directly by the BreakIterator, and must not be altered in any |
| * way by code outside of the BreakIterator. |
| * Doing so will lead to undefined behavior of the BreakIterator. |
| * |
| * @param newText An iterator over the text to analyze. |
| * @stable ICU 2.0 |
| */ |
| @Override |
| public void setText(CharacterIterator newText) { |
| if (newText != null) { |
| fBreakCache.reset(newText.getBeginIndex(), 0); |
| } else { |
| fBreakCache.reset(); |
| } |
| fDictionaryCache.reset(); |
| fText = newText; |
| this.first(); |
| } |
| |
| /** |
| * Control debug, trace and dump options. |
| * @internal |
| * @deprecated This API is ICU internal only. |
| */ |
| @Deprecated |
| public static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ? |
| ICUDebug.value(RBBI_DEBUG_ARG) : null; |
| |
| |
| private LanguageBreakEngine getLanguageBreakEngine(int c) { |
| |
| // We have a dictionary character. |
| // Does an already instantiated break engine handle it? |
| for (LanguageBreakEngine candidate : fBreakEngines) { |
| if (candidate.handles(c)) { |
| return candidate; |
| } |
| } |
| |
| synchronized (gAllBreakEngines) { |
| // This break iterator's list of break engines didn't handle the character. |
| // Check the global list, another break iterator may have instantiated the |
| // desired engine. |
| for (LanguageBreakEngine candidate : gAllBreakEngines) { |
| if (candidate.handles(c)) { |
| fBreakEngines.add(candidate); |
| return candidate; |
| } |
| } |
| |
| // The global list doesn't have an existing engine, build one. |
| int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); |
| if (script == UScript.KATAKANA || script == UScript.HIRAGANA) { |
| // Katakana, Hiragana and Han are handled by the same dictionary engine. |
| // Fold them together for mapping from script -> engine. |
| script = UScript.HAN; |
| } |
| |
| LanguageBreakEngine eng; |
| try { |
| switch (script) { |
| case UScript.THAI: |
| eng = new ThaiBreakEngine(); |
| break; |
| case UScript.LAO: |
| eng = new LaoBreakEngine(); |
| break; |
| case UScript.MYANMAR: |
| eng = new BurmeseBreakEngine(); |
| break; |
| case UScript.KHMER: |
| eng = new KhmerBreakEngine(); |
| break; |
| case UScript.HAN: |
| eng = new CjkBreakEngine(false); |
| break; |
| case UScript.HANGUL: |
| eng = new CjkBreakEngine(true); |
| break; |
| default: |
| gUnhandledBreakEngine.handleChar(c); |
| eng = gUnhandledBreakEngine; |
| break; |
| } |
| } catch (IOException e) { |
| eng = null; |
| } |
| |
| if (eng != null && eng != gUnhandledBreakEngine) { |
| gAllBreakEngines.add(eng); |
| fBreakEngines.add(eng); |
| } |
| return eng; |
| } // end synchronized(gAllBreakEngines) |
| } |
| |
| /** |
| * The State Machine Engine for moving forward is here. |
| * This function is the heart of the RBBI run time engine. |
| * |
| * Input |
| * fPosition, the position in the text to begin from. |
| * Output |
| * fPosition: the boundary following the starting position. |
| * fDictionaryCharCount the number of dictionary characters encountered. |
| * If > 0, the segment will be further subdivided |
| * fRuleStatusIndex Info from the state table indicating which rules caused the boundary. |
| * |
| * @return the new iterator position |
| * |
| * A note on supplementary characters and the position of underlying |
| * Java CharacterIterator: Normally, a character iterator is positioned at |
| * the char most recently returned by next(). Within this function, when |
| * a supplementary char is being processed, the char iterator is left |
| * sitting on the trail surrogate, in the middle of the code point. |
| * This is different from everywhere else, where an iterator always |
| * points at the lead surrogate of a supplementary. |
| */ |
| private int handleNext() { |
| if (TRACE) { |
| System.out.println("Handle Next pos char state category"); |
| } |
| |
| // handleNext always sets the break tag value. |
| // Set the default for it. |
| fRuleStatusIndex = 0; |
| fDictionaryCharCount = 0; |
| |
| // caches for quicker access |
| CharacterIterator text = fText; |
| CodePointTrie trie = fRData.fTrie; |
| |
| char[] stateTable = fRData.fFTable.fTable; |
| int initialPosition = fPosition; |
| text.setIndex(initialPosition); |
| int result = initialPosition; |
| |
| // Set up the starting char |
| int c = text.current(); |
| if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { |
| c = nextTrail32(text, c); |
| if (c == DONE32) { |
| fDone = true; |
| return BreakIterator.DONE; |
| } |
| } |
| |
| // Set the initial state for the state machine |
| int state = START_STATE; |
| int row = fRData.getRowIndex(state); |
| short category = 3; |
| int flagsState = fRData.fFTable.fFlags; |
| int dictStart = fRData.fFTable.fDictCategoriesStart; |
| int mode = RBBI_RUN; |
| if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) { |
| category = 2; |
| mode = RBBI_START; |
| if (TRACE) { |
| System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); |
| System.out.print(RBBIDataWrapper.intToHexString(c, 10)); |
| System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); |
| } |
| } |
| |
| // loop until we reach the end of the text or transition to state 0 |
| while (state != STOP_STATE) { |
| if (c == DONE32) { |
| // Reached end of input string. |
| if (mode == RBBI_END) { |
| // We have already run the loop one last time with the |
| // character set to the pseudo {eof} value. Now it is time |
| // to unconditionally bail out. |
| break; |
| } |
| // Run the loop one last time with the fake end-of-input character category |
| mode = RBBI_END; |
| category = 1; |
| } |
| else if (mode == RBBI_RUN) { |
| // Get the char category. An incoming category of 1 or 2 mens that |
| // we are preset for doing the beginning or end of input, and |
| // that we shouldn't get a category from an actual text input character. |
| // |
| |
| // look up the current character's character category, which tells us |
| // which column in the state table to look at. |
| // |
| category = (short) trie.get(c); |
| |
| // Check for categories that require word dictionary handling. |
| if (category >= dictStart) { |
| fDictionaryCharCount++; |
| } |
| |
| if (TRACE) { |
| System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); |
| System.out.print(RBBIDataWrapper.intToHexString(c, 10)); |
| System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); |
| } |
| |
| // Advance to the next character. |
| // If this is a beginning-of-input loop iteration, don't advance. |
| // The next iteration will be processing the first real input character. |
| c = text.next(); |
| if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { |
| c = nextTrail32(text, c); |
| } |
| } |
| else { |
| mode = RBBI_RUN; |
| } |
| |
| // look up a state transition in the state table |
| state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; |
| row = fRData.getRowIndex(state); |
| int accepting = stateTable[row + RBBIDataWrapper.ACCEPTING]; |
| if (accepting == RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) { |
| // Match found, common case |
| result = text.getIndex(); |
| if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { |
| // The iterator has been left in the middle of a surrogate pair. |
| // We want the start of it. |
| result--; |
| } |
| |
| // Remember the break status (tag) values. |
| fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX]; |
| } else if (accepting > RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) { |
| // Lookahead match is completed |
| int lookaheadResult = fLookAheadMatches[accepting]; |
| if (lookaheadResult >= 0) { |
| fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX]; |
| fPosition = lookaheadResult; |
| return lookaheadResult; |
| } |
| } |
| |
| |
| // If we are at the position of the '/' in a look-ahead (hard break) rule; |
| // record the current position, to be returned later, if the full rule matches. |
| // TODO: Move this check before the previous check of fAccepting. |
| // This would enable hard-break rules with no following context. |
| // But there are line break test failures when trying this. Investigate. |
| // Issue ICU-20837 |
| int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD]; |
| if (rule != 0) { |
| int pos = text.getIndex(); |
| if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { |
| // The iterator has been left in the middle of a surrogate pair. |
| // We want the beginning of it. |
| pos--; |
| } |
| fLookAheadMatches[rule] = pos; |
| } |
| |
| |
| } // End of state machine main loop |
| |
| // The state machine is done. Check whether it found a match... |
| |
| // If the iterator failed to advance in the match engine force it ahead by one. |
| // This indicates a defect in the break rules, which should always match |
| // at least one character. |
| |
| if (result == initialPosition) { |
| if (TRACE) { |
| System.out.println("Iterator did not move. Advancing by 1."); |
| } |
| text.setIndex(initialPosition); |
| next32(text); |
| result = text.getIndex(); |
| fRuleStatusIndex = 0; |
| } |
| |
| // Leave the iterator at our result position. |
| // (we may have advanced beyond the last accepting position chasing after |
| // longer matches that never completed.) |
| fPosition = result; |
| |
| if (TRACE) { |
| System.out.println("result = " + result); |
| } |
| return result; |
| } |
| |
| /** |
| * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules. |
| * This locates a "Safe Position" from which the forward break rules |
| * will operate correctly. A Safe Position is not necessarily a boundary itself. |
| * |
| * The logic of this function is very similar to handleNext(), above, but simpler |
| * because the safe table does not require as many options. |
| * |
| * @param fromPosition the position in the input text to begin the iteration. |
| * @internal |
| */ |
| private int handleSafePrevious(int fromPosition) { |
| char state; |
| short category = 0; |
| int result = 0; |
| |
| // caches for quicker access |
| CharacterIterator text = fText; |
| CodePointTrie trie = fRData.fTrie; |
| char[] stateTable = fRData.fRTable.fTable; |
| |
| CISetIndex32(text, fromPosition); |
| if (TRACE) { |
| System.out.print("Handle Previous pos char state category"); |
| } |
| |
| // if we're already at the start of the text, return DONE. |
| if (text.getIndex() == text.getBeginIndex()) { |
| return BreakIterator.DONE; |
| } |
| |
| // Set the initial state for the state machine |
| int c = CharacterIteration.previous32(text); |
| state = START_STATE; |
| int row = fRData.getRowIndex(state); |
| |
| // loop until we reach the start of the text or transition to state 0 |
| // |
| for (; c != DONE32; c = CharacterIteration.previous32(text)) { |
| |
| // look up the current character's character category, which tells us |
| // which column in the state table to look at. |
| // |
| // And off the dictionary flag bit. For reverse iteration it is not used. |
| category = (short) trie.get(c); |
| if (TRACE) { |
| System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); |
| System.out.print(RBBIDataWrapper.intToHexString(c, 10)); |
| System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); |
| } |
| |
| // State Transition - move machine to its next state |
| // |
| assert(category < fRData.fHeader.fCatCount); |
| state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; |
| row = fRData.getRowIndex(state); |
| |
| if (state == STOP_STATE) { |
| // This is the normal exit from the lookup state machine. |
| // Transition to state zero means we have found a safe point. |
| break; |
| } |
| } |
| |
| // The state machine is done. |
| result = text.getIndex(); |
| if (TRACE) { |
| System.out.println("result = " + result); |
| } |
| return result; |
| } |
| |
| /** |
| * Set the index of a CharacterIterator. |
| * Pin the index to the valid range range of BeginIndex <= index <= EndIndex. |
| * If the index points to a trail surrogate of a supplementary character, adjust it |
| * to the start (lead surrogate) index. |
| * |
| * @param ci A CharacterIterator to set |
| * @param index the index to set |
| * @return the resulting index, possibly pinned or adjusted. |
| */ |
| private static int CISetIndex32(CharacterIterator ci, int index) { |
| if (index <= ci.getBeginIndex()) { |
| ci.first(); |
| } else if (index >= ci.getEndIndex()) { |
| ci.setIndex(ci.getEndIndex()); |
| } else if (Character.isLowSurrogate(ci.setIndex(index))) { |
| if (!Character.isHighSurrogate(ci.previous())) { |
| ci.next(); |
| } |
| } |
| return ci.getIndex(); |
| } |
| |
| /** DictionaryCache stores the boundaries obtained from a run of dictionary characters. |
| * Dictionary boundaries are moved first to this cache, then from here |
| * to the main BreakCache, where they may inter-leave with non-dictionary |
| * boundaries. The public BreakIterator API always fetches directly |
| * from the main BreakCache, not from here. |
| * |
| * In common situations, the number of boundaries in a single dictionary run |
| * should be quite small, it will be terminated by punctuation, spaces, |
| * or any other non-dictionary characters. The main BreakCache may end |
| * up with boundaries from multiple dictionary based runs. |
| * |
| * The boundaries are stored in a simple ArrayList (vector), with the |
| * assumption that they will be accessed sequentially. |
| */ |
| class DictionaryCache { |
| |
| void reset() { |
| fPositionInCache = -1; |
| fStart = 0; |
| fLimit = 0; |
| fFirstRuleStatusIndex = 0; |
| fOtherRuleStatusIndex = 0; |
| fBreaks.removeAllElements(); |
| }; |
| |
| boolean following(int fromPos) { |
| if (fromPos >= fLimit || fromPos < fStart) { |
| fPositionInCache = -1; |
| return false; |
| } |
| |
| // Sequential iteration, move from previous boundary to the following |
| |
| int r = 0; |
| if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { |
| ++fPositionInCache; |
| if (fPositionInCache >= fBreaks.size()) { |
| fPositionInCache = -1; |
| return false; |
| } |
| r = fBreaks.elementAt(fPositionInCache); |
| assert(r > fromPos); |
| fBoundary = r; |
| fStatusIndex = fOtherRuleStatusIndex; |
| return true; |
| } |
| |
| // Random indexing. Linear search for the boundary following the given position. |
| |
| for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) { |
| r= fBreaks.elementAt(fPositionInCache); |
| if (r > fromPos) { |
| fBoundary = r; |
| fStatusIndex = fOtherRuleStatusIndex; |
| return true; |
| } |
| } |
| |
| // Internal error. fStart <= fromPos < fLimit, but no cached boundary. |
| assert(false); |
| fPositionInCache = -1; |
| return false; |
| }; |
| |
| boolean preceding(int fromPos) { |
| if (fromPos <= fStart || fromPos > fLimit) { |
| fPositionInCache = -1; |
| return false; |
| } |
| |
| if (fromPos == fLimit) { |
| fPositionInCache = fBreaks.size() - 1; |
| if (fPositionInCache >= 0) { |
| assert(fBreaks.elementAt(fPositionInCache) == fromPos); |
| } |
| } |
| |
| int r; |
| if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { |
| --fPositionInCache; |
| r = fBreaks.elementAt(fPositionInCache); |
| assert(r < fromPos); |
| fBoundary = r; |
| fStatusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; |
| return true; |
| } |
| |
| if (fPositionInCache == 0) { |
| fPositionInCache = -1; |
| return false; |
| } |
| |
| for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) { |
| r = fBreaks.elementAt(fPositionInCache); |
| if (r < fromPos) { |
| fBoundary = r; |
| fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; |
| return true; |
| } |
| } |
| assert(false); |
| fPositionInCache = -1; |
| return false; |
| }; |
| |
| /** |
| * Populate the cache with the dictionary based boundaries within a region of text. |
| * @param startPos The start position of a range of text |
| * @param endPos The end position of a range of text |
| * @param firstRuleStatus The rule status index that applies to the break at startPos |
| * @param otherRuleStatus The rule status index that applies to boundaries other than startPos |
| * @internal |
| */ |
| void populateDictionary(int startPos, int endPos, |
| int firstRuleStatus, int otherRuleStatus) { |
| if ((endPos - startPos) <= 1) { |
| return; |
| } |
| |
| reset(); |
| fFirstRuleStatusIndex = firstRuleStatus; |
| fOtherRuleStatusIndex = otherRuleStatus; |
| |
| int rangeStart = startPos; |
| int rangeEnd = endPos; |
| |
| int category; |
| int current; |
| int foundBreakCount = 0; |
| |
| // Loop through the text, looking for ranges of dictionary characters. |
| // For each span, find the appropriate break engine, and ask it to find |
| // any breaks within the span. |
| |
| fText.setIndex(rangeStart); |
| int c = CharacterIteration.current32(fText); |
| category = (short)fRData.fTrie.get(c); |
| int dictStart = fRData.fFTable.fDictCategoriesStart; |
| |
| while(true) { |
| while((current = fText.getIndex()) < rangeEnd && (category < dictStart)) { |
| c = CharacterIteration.next32(fText); // pre-increment |
| category = (short)fRData.fTrie.get(c); |
| } |
| if (current >= rangeEnd) { |
| break; |
| } |
| |
| // We now have a dictionary character. Get the appropriate language object |
| // to deal with it. |
| LanguageBreakEngine lbe = getLanguageBreakEngine(c); |
| |
| // Ask the language object if there are any breaks. It will add them to the cache and |
| // leave the text pointer on the other side of its range, ready to search for the next one. |
| if (lbe != null) { |
| foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks); |
| } |
| |
| // Reload the loop variables for the next go-round |
| c = CharacterIteration.current32(fText); |
| category = (short)fRData.fTrie.get(c); |
| } |
| |
| // If we found breaks, ensure that the first and last entries are |
| // the original starting and ending position. And initialize the |
| // cache iteration position to the first entry. |
| |
| // System.out.printf("foundBreakCount = %d%n", foundBreakCount); |
| if (foundBreakCount > 0) { |
| assert(foundBreakCount == fBreaks.size()); |
| if (startPos < fBreaks.elementAt(0)) { |
| // The dictionary did not place a boundary at the start of the segment of text. |
| // Add one now. This should not commonly happen, but it would be easy for interactions |
| // of the rules for dictionary segments and the break engine implementations to |
| // inadvertently cause it. Cover it here, just in case. |
| fBreaks.offer(startPos); |
| } |
| if (endPos > fBreaks.peek()) { |
| fBreaks.push(endPos); |
| } |
| fPositionInCache = 0; |
| // Note: Dictionary matching may extend beyond the original limit. |
| fStart = fBreaks.elementAt(0); |
| fLimit = fBreaks.peek(); |
| } else { |
| // there were no language-based breaks, even though the segment contained |
| // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache |
| // for this range will fail, and the calling code will fall back to the rule based boundaries. |
| } |
| |
| }; |
| |
| |
| DictionaryCache() { |
| fPositionInCache = -1; |
| fBreaks = new DictionaryBreakEngine.DequeI(); |
| } |
| |
| /** |
| * copy constructor. Used by RuleBasedBreakIterator.clone(). |
| * |
| * @param src the source object to be copied. |
| */ |
| DictionaryCache(DictionaryCache src) { |
| try { |
| fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone(); |
| } |
| catch (CloneNotSupportedException e) { |
| throw new RuntimeException(e); |
| } |
| fPositionInCache = src.fPositionInCache; |
| fStart = src.fStart; |
| fLimit = src.fLimit; |
| fFirstRuleStatusIndex = src.fFirstRuleStatusIndex; |
| fOtherRuleStatusIndex = src.fOtherRuleStatusIndex; |
| fBoundary = src.fBoundary; |
| fStatusIndex = src.fStatusIndex; |
| } |
| |
| // A data structure containing the boundaries themselves. Essentially a vector of raw ints. |
| DictionaryBreakEngine.DequeI fBreaks; |
| int fPositionInCache; // Index in fBreaks of last boundary returned by following() |
| // // or preceding(). Optimizes sequential access. |
| int fStart; // Text position of first boundary in cache. |
| int fLimit; // Last boundary in cache. Which is the limit of the |
| // // text segment being handled by the dictionary. |
| int fFirstRuleStatusIndex; // Rule status info for first boundary. |
| int fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries. |
| int fBoundary; // Current boundary. Set by preceding(), following(). |
| int fStatusIndex; // Current rule status index. Set by preceding, following(). |
| }; |
| |
| |
| |
| |
| /* |
| * class BreakCache |
| * |
| * Cache of break boundary positions and rule status values. |
| * Break iterator API functions, next(), previous(), etc., will use cached results |
| * when possible, and otherwise cache new results as they are obtained. |
| * |
| * Uniformly caches both dictionary and rule based (non-dictionary) boundaries. |
| * |
| * The cache is implemented as a single circular buffer. |
| */ |
| |
| /* |
| * size of the circular cache buffer. |
| */ |
| |
| class BreakCache { |
| |
| BreakCache() { |
| reset(); |
| }; |
| |
| void reset(int pos, int ruleStatus) { |
| fStartBufIdx = 0; |
| fEndBufIdx = 0; |
| fTextIdx = pos; |
| fBufIdx = 0; |
| fBoundaries[0] = pos; |
| fStatuses[0] = (short)ruleStatus; |
| } |
| |
| void reset() {reset(0, 0); }; |
| |
| void next() { |
| if (fBufIdx == fEndBufIdx) { |
| fDone = !populateFollowing(); |
| fPosition = fTextIdx; |
| fRuleStatusIndex = fStatuses[fBufIdx]; |
| } else { |
| fBufIdx = modChunkSize(fBufIdx + 1); |
| fTextIdx = fPosition = fBoundaries[fBufIdx]; |
| fRuleStatusIndex = fStatuses[fBufIdx]; |
| } |
| }; |
| |
| void previous() { |
| int initialBufIdx = fBufIdx; |
| if (fBufIdx == fStartBufIdx) { |
| // At start of cache. Prepend to it. |
| populatePreceding(); |
| } else { |
| // Cache already holds the next boundary |
| fBufIdx = modChunkSize(fBufIdx - 1); |
| fTextIdx = fBoundaries[fBufIdx]; |
| } |
| fDone = (fBufIdx == initialBufIdx); |
| fPosition = fTextIdx; |
| fRuleStatusIndex = fStatuses[fBufIdx]; |
| return; |
| }; |
| |
| // Move the iteration state to the position following the startPosition. |
| // Input position must be pinned to the input length. |
| void following(int startPos) { |
| if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { |
| // startPos is in the cache. Do a next() from that position. |
| // TODO: an awkward set of interactions with bi->fDone |
| // seek() does not clear it; it can't because of interactions with populateNear(). |
| // next() does not clear it in the fast-path case, where everything matters. Maybe it should. |
| // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end. |
| fDone = false; |
| next(); |
| } |
| |
| }; |
| |
| void preceding(int startPos) { |
| if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { |
| if (startPos == fTextIdx) { |
| previous(); |
| } else { |
| // seek() leaves the BreakCache positioned at the preceding boundary |
| // if the requested position is between two bounaries. |
| // current() pushes the BreakCache position out to the BreakIterator itself. |
| assert(startPos > fTextIdx); |
| current(); |
| } |
| } |
| return; |
| }; |
| |
| /** |
| * Update the state of the public BreakIterator (fBI) to reflect the |
| * current state of the break iterator cache (this). |
| */ |
| int current() { |
| fPosition = fTextIdx; |
| fRuleStatusIndex = fStatuses[fBufIdx]; |
| fDone = false; |
| return fTextIdx; |
| }; |
| |
| /** |
| * Add boundaries to the cache near the specified position. |
| * The given position need not be a boundary itself. |
| * The input position must be within the range of the text, and |
| * on a code point boundary. |
| * If the requested position is a break boundary, leave the iteration |
| * position on it. |
| * If the requested position is not a boundary, leave the iteration |
| * position on the preceding boundary and include both the the |
| * preceding and following boundaries in the cache. |
| * Additional boundaries, either preceding or following, may be added |
| * to the cache as a side effect. |
| * |
| * Return false if the operation failed. |
| */ |
| boolean populateNear(int position) { |
| assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]); |
| |
| // Find a boundary somewhere in the vicinity of the requested position. |
| // Depending on the safe rules and the text data, it could be either before, at, or after |
| // the requested position. |
| |
| |
| // If the requested position is not near already cached positions, clear the existing cache, |
| // find a near-by boundary and begin new cache contents there. |
| |
| if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) { |
| int aBoundary = fText.getBeginIndex(); |
| int ruleStatusIndex = 0; |
| if (position > aBoundary + 20) { |
| int backupPos = handleSafePrevious(position); |
| if (backupPos > aBoundary) { |
| // Advance to the boundary following the backup position. |
| // There is a complication: the safe reverse rules identify pairs of code points |
| // that are safe. If advancing from the safe point moves forwards by less than |
| // two code points, we need to advance one more time to ensure that the boundary |
| // is good, including a correct rules status value. |
| // |
| fPosition = backupPos; |
| aBoundary = handleNext(); |
| if (aBoundary == backupPos + 1 || |
| (aBoundary == backupPos + 2 && |
| Character.isHighSurrogate(fText.setIndex(backupPos)) && |
| Character.isLowSurrogate(fText.next()))) { |
| // The initial handleNext() only advanced by a single code point. Go again. |
| // Safe rules identify safe pairs. |
| aBoundary = handleNext(); |
| } |
| } |
| ruleStatusIndex = fRuleStatusIndex; |
| } |
| reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. |
| } |
| |
| // Fill in boundaries between existing cache content and the new requested position. |
| |
| if (fBoundaries[fEndBufIdx] < position) { |
| // The last position in the cache precedes the requested position. |
| // Add following position(s) to the cache. |
| while (fBoundaries[fEndBufIdx] < position) { |
| if (!populateFollowing()) { |
| assert false; |
| return false; |
| } |
| } |
| fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer. |
| fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries. |
| while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos. |
| previous(); |
| } |
| return true; |
| } |
| |
| if (fBoundaries[fStartBufIdx] > position) { |
| // The first position in the cache is beyond the requested position. |
| // back up more until we get a boundary <= the requested position. |
| while (fBoundaries[fStartBufIdx] > position) { |
| populatePreceding(); |
| } |
| fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer. |
| fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries. |
| while (fTextIdx < position) { // Move forwards to a position at or following the requested pos. |
| next(); |
| } |
| if (fTextIdx > position) { |
| // If position is not itself a boundary, the next() loop above will overshoot. |
| // Back up one, leaving cache position at the boundary preceding the requested position. |
| previous(); |
| } |
| return true; |
| } |
| |
| assert fTextIdx == position; |
| return true; |
| |
| }; |
| |
| /** |
| * Add boundary(s) to the cache following the current last boundary. |
| * Return false if at the end of the text, and no more boundaries can be added. |
| * Leave iteration position at the first newly added boundary, or unchanged if no boundary was added. |
| */ |
| boolean populateFollowing() { |
| int fromPosition = fBoundaries[fEndBufIdx]; |
| int fromRuleStatusIdx = fStatuses[fEndBufIdx]; |
| int pos = 0; |
| int ruleStatusIdx = 0; |
| |
| if (fDictionaryCache.following(fromPosition)) { |
| addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); |
| return true; |
| } |
| |
| fPosition = fromPosition; |
| pos = handleNext(); |
| if (pos == BreakIterator.DONE) { |
| return false; |
| } |
| |
| ruleStatusIdx = fRuleStatusIndex; |
| if (fDictionaryCharCount > 0) { |
| // The text segment obtained from the rules includes dictionary characters. |
| // Subdivide it, with subdivided results going into the dictionary cache. |
| fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx); |
| if (fDictionaryCache.following(fromPosition)) { |
| addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); |
| return true; |
| // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point. |
| // But be careful with interactions with populateNear(). |
| } |
| } |
| |
| // Rule based segment did not include dictionary characters. |
| // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them, |
| // meaning that we didn't take the return, above. |
| // Add its end point to the cache. |
| addFollowing(pos, ruleStatusIdx, UpdateCachePosition); |
| |
| // Add several non-dictionary boundaries at this point, to optimize straight forward iteration. |
| // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results. |
| // |
| for (int count=0; count<6; ++count) { |
| pos = handleNext(); |
| if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) { |
| break; |
| } |
| addFollowing(pos, fRuleStatusIndex, RetainCachePosition); |
| } |
| return true; |
| }; |
| |
| /** |
| * Add one or more boundaries to the cache preceding the first currently cached boundary. |
| * Leave the iteration position on the first added boundary. |
| * Return false if no boundaries could be added (if at the start of the text.) |
| */ |
| boolean populatePreceding() { |
| int textBegin = fText.getBeginIndex(); |
| int fromPosition = fBoundaries[fStartBufIdx]; |
| if (fromPosition == textBegin) { |
| return false; |
| } |
| |
| int position = textBegin; |
| int positionStatusIdx = 0; |
| |
| if (fDictionaryCache.preceding(fromPosition)) { |
| addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); |
| return true; |
| } |
| |
| int backupPosition = fromPosition; |
| |
| // Find a boundary somewhere preceding the first already-cached boundary |
| do { |
| backupPosition = backupPosition - 30; |
| if (backupPosition <= textBegin) { |
| backupPosition = textBegin; |
| } else { |
| backupPosition = handleSafePrevious(backupPosition); |
| } |
| if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) { |
| position = textBegin; |
| positionStatusIdx = 0; |
| } else { |
| // Advance to the boundary following the backup position. |
| // There is a complication: the safe reverse rules identify pairs of code points |
| // that are safe. If advancing from the safe point moves forwards by less than |
| // two code points, we need to advance one more time to ensure that the boundary |
| // is good, including a correct rules status value. |
| // |
| fPosition = backupPosition; // TODO: pass starting position in a clearer way. |
| position = handleNext(); |
| if (position == backupPosition + 1 || |
| (position == backupPosition + 2 && |
| Character.isHighSurrogate(fText.setIndex(backupPosition)) && |
| Character.isLowSurrogate(fText.next()))) { |
| // The initial handleNext() only advanced by a single code point. Go again. |
| // Safe rules identify safe pairs. |
| position = handleNext(); |
| } |
| positionStatusIdx = fRuleStatusIndex; |
| } |
| } while (position >= fromPosition); |
| |
| // Find boundaries between the one we just located and the first already-cached boundary |
| // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer. |
| |
| fSideBuffer.removeAllElements(); |
| fSideBuffer.push(position); |
| fSideBuffer.push(positionStatusIdx); |
| |
| do { |
| int prevPosition = fPosition = position; |
| int prevStatusIdx = positionStatusIdx; |
| position = handleNext(); |
| positionStatusIdx = fRuleStatusIndex; |
| if (position == BreakIterator.DONE) { |
| break; |
| } |
| |
| boolean segmentHandledByDictionary = false; |
| if (fDictionaryCharCount != 0) { |
| // Segment from the rules includes dictionary characters. |
| // Subdivide it, with subdivided results going into the dictionary cache. |
| int dictSegEndPosition = position; |
| fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx); |
| while (fDictionaryCache.following(prevPosition)) { |
| position = fDictionaryCache.fBoundary; |
| positionStatusIdx = fDictionaryCache.fStatusIndex; |
| segmentHandledByDictionary = true; |
| assert(position > prevPosition); |
| if (position >= fromPosition) { |
| break; |
| } |
| assert(position <= dictSegEndPosition); |
| fSideBuffer.push(position); |
| fSideBuffer.push(positionStatusIdx); |
| prevPosition = position; |
| } |
| assert(position==dictSegEndPosition || position>=fromPosition); |
| } |
| |
| if (!segmentHandledByDictionary && position < fromPosition) { |
| fSideBuffer.push(position); |
| fSideBuffer.push(positionStatusIdx); |
| } |
| } while (position < fromPosition); |
| |
| // Move boundaries from the side buffer to the main circular buffer. |
| boolean success = false; |
| if (!fSideBuffer.isEmpty()) { |
| positionStatusIdx = fSideBuffer.pop(); |
| position = fSideBuffer.pop(); |
| addPreceding(position, positionStatusIdx, UpdateCachePosition); |
| success = true; |
| } |
| |
| while (!fSideBuffer.isEmpty()) { |
| positionStatusIdx = fSideBuffer.pop(); |
| position = fSideBuffer.pop(); |
| if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) { |
| // No space in circular buffer to hold a new preceding result while |
| // also retaining the current cache (iteration) position. |
| // Bailing out is safe; the cache will refill again if needed. |
| break; |
| } |
| } |
| return success; |
| }; |
| |
| |
| static final boolean RetainCachePosition = false; |
| static final boolean UpdateCachePosition = true; |
| |
| /** |
| * Add the boundary following the current position. |
| * The current position can be left as it was, or changed to the newly added boundary, |
| * as specified by the update parameter. |
| */ |
| void addFollowing(int position, int ruleStatusIdx, boolean update) { |
| assert(position > fBoundaries[fEndBufIdx]); |
| assert(ruleStatusIdx <= Short.MAX_VALUE); |
| int nextIdx = modChunkSize(fEndBufIdx + 1); |
| if (nextIdx == fStartBufIdx) { |
| fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1. |
| } |
| fBoundaries[nextIdx] = position; |
| fStatuses[nextIdx] = (short)ruleStatusIdx; |
| fEndBufIdx = nextIdx; |
| if (update == UpdateCachePosition) { |
| // Set current position to the newly added boundary. |
| fBufIdx = nextIdx; |
| fTextIdx = position; |
| } else { |
| // Retaining the original cache position. |
| // Check if the added boundary wraps around the buffer, and would over-write the original position. |
| // It's the responsibility of callers of this function to not add too many. |
| assert(nextIdx != fBufIdx); |
| } |
| |
| }; |
| |
| |
| /** |
| * Add the boundary preceding the current position. |
| * The current position can be left as it was, or changed to the newly added boundary, |
| * as specified by the update parameter. |
| */ |
| boolean addPreceding(int position, int ruleStatusIdx, boolean update) { |
| assert(position < fBoundaries[fStartBufIdx]); |
| assert(ruleStatusIdx <= Short.MAX_VALUE); |
| int nextIdx = modChunkSize(fStartBufIdx - 1); |
| if (nextIdx == fEndBufIdx) { |
| if (fBufIdx == fEndBufIdx && update == RetainCachePosition) { |
| // Failure. The insertion of the new boundary would claim the buffer position that is the |
| // current iteration position. And we also want to retain the current iteration position. |
| // (The buffer is already completely full of entries that precede the iteration position.) |
| return false; |
| } |
| fEndBufIdx = modChunkSize(fEndBufIdx - 1); |
| } |
| fBoundaries[nextIdx] = position; |
| fStatuses[nextIdx] = (short)ruleStatusIdx; |
| fStartBufIdx = nextIdx; |
| if (update == UpdateCachePosition) { |
| fBufIdx = nextIdx; |
| fTextIdx = position; |
| } |
| return true; |
| }; |
| |
| /** |
| * Set the cache position to the specified position, or, if the position |
| * falls between to cached boundaries, to the preceding boundary. |
| * Fails if the requested position is outside of the range of boundaries currently held by the cache. |
| * The startPosition must be on a code point boundary. |
| * |
| * Return true if successful, false if the specified position is after |
| * the last cached boundary or before the first. |
| */ |
| boolean seek(int pos) { |
| if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) { |
| return false; |
| } |
| if (pos == fBoundaries[fStartBufIdx]) { |
| // Common case: seek(0), from BreakIterator::first() |
| fBufIdx = fStartBufIdx; |
| fTextIdx = fBoundaries[fBufIdx]; |
| return true; |
| } |
| if (pos == fBoundaries[fEndBufIdx]) { |
| fBufIdx = fEndBufIdx; |
| fTextIdx = fBoundaries[fBufIdx]; |
| return true; |
| } |
| |
| int min = fStartBufIdx; |
| int max = fEndBufIdx; |
| while (min != max) { |
| int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2; |
| probe = modChunkSize(probe); |
| if (fBoundaries[probe] > pos) { |
| max = probe; |
| } else { |
| min = modChunkSize(probe + 1); |
| } |
| } |
| assert(fBoundaries[max] > pos); |
| fBufIdx = modChunkSize(max - 1); |
| fTextIdx = fBoundaries[fBufIdx]; |
| assert(fTextIdx <= pos); |
| return true; |
| |
| }; |
| |
| |
| /** |
| * copy constructor, used from RuleBasedBreakIterator.clone(). |
| * |
| * @param src |
| */ |
| BreakCache(BreakCache src) { |
| fStartBufIdx = src.fStartBufIdx; |
| fEndBufIdx = src.fEndBufIdx; |
| fTextIdx = src.fTextIdx; |
| fBufIdx = src.fBufIdx; |
| fBoundaries = src.fBoundaries.clone(); |
| fStatuses = src.fStatuses.clone(); |
| fSideBuffer = new DictionaryBreakEngine.DequeI(); // Transient, no need to clone contents. |
| } |
| |
| void dumpCache() { |
| System.out.printf("fTextIdx:%d fBufIdx:%d%n", fTextIdx, fBufIdx); |
| for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) { |
| System.out.printf("%d %d%n", i, fBoundaries[i]); |
| if (i == fEndBufIdx) { |
| break; |
| } |
| } |
| }; |
| |
| private final int modChunkSize(int index) { return index & (CACHE_SIZE - 1); }; |
| |
| static final int CACHE_SIZE = 128; |
| // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two."); |
| |
| int fStartBufIdx; |
| int fEndBufIdx; // inclusive |
| |
| int fTextIdx; |
| int fBufIdx; |
| |
| int[] fBoundaries = new int[CACHE_SIZE]; |
| short[] fStatuses = new short[CACHE_SIZE]; |
| |
| DictionaryBreakEngine.DequeI fSideBuffer = new DictionaryBreakEngine.DequeI(); |
| }; |
| |
| |
| |
| |
| } |
| |