blob: dd64108be3669d2d2851765eab45ef96bf707f1d [file] [log] [blame]
/*
*******************************************************************************
* Copyright (C) 2005-2012 International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import static com.ibm.icu.impl.CharacterIteration.DONE32;
import static com.ibm.icu.impl.CharacterIteration.current32;
import static com.ibm.icu.impl.CharacterIteration.next32;
import static com.ibm.icu.impl.CharacterIteration.nextTrail32;
import static com.ibm.icu.impl.CharacterIteration.previous32;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.text.CharacterIterator;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.Stack;
import com.ibm.icu.impl.Assert;
import com.ibm.icu.impl.ICUDebug;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
/**
* Rule Based Break Iterator
* This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
*
* @stable ICU 2.0
*/
public class RuleBasedBreakIterator extends BreakIterator {
//=======================================================================
// Constructors & Factories
//=======================================================================
/**
* @internal
* @deprecated This API is ICU internal only.
*/
public RuleBasedBreakIterator() {
}
/**
* Create a break iterator from a precompiled set of break rules.
*
* Creating a break iterator from the binary rules is much faster than
* creating one from source rules.
*
* The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
* Binary break iterator rules are not guaranteed to be compatible between
* different versions of ICU.
*
* @param is an input stream supplying the compiled binary rules.
* @throws IOException if there is an error while reading the rules from the InputStream.
* @see #compileRules(String, OutputStream)
* @stable ICU 4.8
*/
public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
RuleBasedBreakIterator This = new RuleBasedBreakIterator();
This.fRData = RBBIDataWrapper.get(is);
return This;
}
/**
* Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
* @param rules The break rules to be used.
* @stable ICU 2.2
*/
public RuleBasedBreakIterator(String rules) {
init();
try {
ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
compileRules(rules, ruleOS);
byte [] ruleBA = ruleOS.toByteArray();
InputStream ruleIS = new ByteArrayInputStream(ruleBA);
fRData = RBBIDataWrapper.get(ruleIS);
} catch (IOException e) {
///CLOVER:OFF
// An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
// causing bogus compiled rules to be produced, but with no compile error raised.
RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: "
+ e.getMessage());
throw rte;
///CLOVER:ON
}
}
//=======================================================================
// Boilerplate
//=======================================================================
/**
* Clones this iterator.
* @return A newly-constructed RuleBasedBreakIterator with the same
* behavior as this one.
* @stable ICU 2.0
*/
public Object clone()
{
RuleBasedBreakIterator result = (RuleBasedBreakIterator)super.clone();
if (fText != null) {
result.fText = (CharacterIterator)(fText.clone());
}
return result;
}
/**
* Returns true if both BreakIterators are of the same class, have the same
* rules, and iterate over the same text.
* @stable ICU 2.0
*/
public boolean equals(Object that) {
if (that == null) {
return false;
}
if (this == that) {
return true;
}
try {
RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
return false;
}
if (fRData != null && other.fRData != null &&
(!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
return false;
}
if (fText == null && other.fText == null) {
return true;
}
if (fText == null || other.fText == null) {
return false;
}
return fText.equals(other.fText);
}
catch(ClassCastException e) {
return false;
}
}
/**
* Returns the description (rules) used to create this iterator.
* (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
* @stable ICU 2.0
*/
public String toString() {
String retStr = "";
if (fRData != null) {
retStr = fRData.fRuleSource;
}
return retStr;
}
/**
* Compute a hashcode for this BreakIterator
* @return A hash code
* @stable ICU 2.0
*/
public int hashCode()
{
return fRData.fRuleSource.hashCode();
}
/**
* Tag value for "words" that do not fit into any of other categories.
* Includes spaces and most punctuation.
* @draft ICU 3.0
* @provisional This is a draft API and might change in a future release of ICU.
*/
public static final int WORD_NONE = 0;
/**
* Upper bound for tags for uncategorized words.
* @draft ICU 3.0
* @provisional This is a draft API and might change in a future release of ICU.
*/
public static final int WORD_NONE_LIMIT = 100;
/**
* Tag value for words that appear to be numbers, lower limit.
* @draft ICU 3.0
* @provisional This is a draft API and might change in a future release of ICU.
*/
public static final int WORD_NUMBER = 100;
/**
* Tag value for words that appear to be numbers, upper limit.
* @draft ICU 3.0
* @provisional This is a draft API and might change in a future release of ICU.
*/
public static final int WORD_NUMBER_LIMIT = 200;
/**
* Tag value for words that contain letters, excluding
* hiragana, katakana or ideographic characters, lower limit.
* @draft ICU 3.0
* @provisional This is a draft API and might change in a future release of ICU.
*/
public static final int WORD_LETTER = 200;
/**
* Tag value for words containing letters, upper limit
* @draft ICU 3.0
* @provisional This is a draft API and might change in a future release of ICU.
*/
public static final int WORD_LETTER_LIMIT = 300;
/**
* Tag value for words containing kana characters, lower limit
* @draft ICU 3.0
* @provisional This is a draft API and might change in a future release of ICU.
*/
public static final int WORD_KANA = 300;
/**
* Tag value for words containing kana characters, upper limit
* @draft ICU 3.0
* @provisional This is a draft API and might change in a future release of ICU.
*/
public static final int WORD_KANA_LIMIT = 400;
/**
* Tag value for words containing ideographic characters, lower limit
* @draft ICU 3.0
* @provisional This is a draft API and might change in a future release of ICU.
*/
public static final int WORD_IDEO = 400;
/**
* Tag value for words containing ideographic characters, upper limit
* @draft ICU 3.0
* @provisional This is a draft API and might change in a future release of ICU.
*/
public static final int WORD_IDEO_LIMIT = 500;
private static final int START_STATE = 1; // The state number of the starting state
private static final int STOP_STATE = 0; // The state-transition value indicating "stop"
// RBBIRunMode - the state machine runs an extra iteration at the beginning and end
// of user text. A variable with this enum type keeps track of where we
// are. The state machine only fetches user text input while in RUN mode.
private static final int RBBI_START = 0;
private static final int RBBI_RUN = 1;
private static final int RBBI_END = 2;
/*
* The character iterator through which this BreakIterator accesses the text.
*/
private CharacterIterator fText = new java.text.StringCharacterIterator("");
/**
* The rule data for this BreakIterator instance
* @internal
* @deprecated This API is ICU internal only.
*/
protected RBBIDataWrapper fRData;
/*
* Index of the Rule {tag} values for the most recent match.
*/
private int fLastRuleStatusIndex;
/*
* Rule tag value valid flag.
* Some iterator operations don't intrinsically set the correct tag value.
* This flag lets us lazily compute the value if we are ever asked for it.
*/
private boolean fLastStatusIndexValid;
/**
* Counter for the number of characters encountered with the "dictionary"
* flag set. Normal RBBI iterators don't use it, although the code
* for updating it is live. Dictionary Based break iterators (a subclass
* of us) access this field directly.
* @internal
*/
private int fDictionaryCharCount;
/**
* Debugging flag. Trace operation of state machine when true.
* @internal
* @deprecated This API is ICU internal only.
*/
public static boolean fTrace;
/**
* What kind of break iterator this is. Set to KIND_LINE by default,
* since this produces sensible output.
*/
private int fBreakType = KIND_LINE;
/**
* The "default" break engine - just skips over ranges of dictionary words,
* producing no breaks. Should only be used if characters need to be handled
* by a dictionary but we have no dictionary implementation for them.
*/
private final UnhandledBreakEngine fUnhandledBreakEngine = new UnhandledBreakEngine();
/**
* when a range of characters is divided up using the dictionary, the break
* positions that are discovered are stored here, preventing us from having
* to use either the dictionary or the state table again until the iterator
* leaves this range of text
*/
private int[] fCachedBreakPositions;
/**
* if fCachedBreakPositions is not null, this indicates which item in the
* cache the current iteration position refers to
*/
private int fPositionInCache;
/**
* Whether or not we should be using the dictionary. Set to true by
* default - only set to false if we get an empty string as input or
* if our "kind" is not KIND_WORD or KIND_LINE.
*
* If this is set to false, no dictionary handling is done.
*/
private boolean fUseDictionary = true;
private final Set<LanguageBreakEngine> fBreakEngines = Collections.synchronizedSet(new HashSet<LanguageBreakEngine>());
/*
* ICU debug argument name for RBBI
*/
private static final String RBBI_DEBUG_ARG = "rbbi";
/**
* Dump the contents of the state table and character classes for this break iterator.
* For debugging only.
* @internal
* @deprecated This API is ICU internal only.
*/
public void dump() {
this.fRData.dump();
}
private static boolean debugInitDone = false;
private void init() {
fLastStatusIndexValid = true;
fDictionaryCharCount = 0;
fBreakEngines.add(fUnhandledBreakEngine);
if (debugInitDone == false) {
fTrace = ICUDebug.enabled(RBBI_DEBUG_ARG)
&& ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;
debugInitDone = true;
}
}
/**
* Compile a set of source break rules into the binary state tables used
* by the break iterator engine. Creating a break iterator from precompiled
* rules is much faster than creating one from source rules.
*
* Binary break rules are not guaranteed to be compatible between different
* versions of ICU.
*
*
* @param rules The source form of the break rules
* @param ruleBinary An output stream to receive the compiled rules.
* @throws IOException If there is an error writing the output.
* @see #getInstanceFromCompiledRules(InputStream)
* @stable ICU 4.8
*/
public static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
RBBIRuleBuilder.compileRules(rules, ruleBinary);
}
//=======================================================================
// BreakIterator overrides
//=======================================================================
/**
* Sets the current iteration position to the beginning of the text.
* (i.e., the CharacterIterator's starting offset).
* @return The offset of the beginning of the text.
* @stable ICU 2.0
*/
public int first() {
fCachedBreakPositions = null;
fDictionaryCharCount = 0;
fPositionInCache = 0;
fLastRuleStatusIndex = 0;
fLastStatusIndexValid = true;
if (fText == null) {
return BreakIterator.DONE;
}
fText.first();
return fText.getIndex();
}
/**
* Sets the current iteration position to the end of the text.
* (i.e., the CharacterIterator's ending offset).
* @return The text's past-the-end offset.
* @stable ICU 2.0
*/
public int last() {
fCachedBreakPositions = null;
fDictionaryCharCount = 0;
fPositionInCache = 0;
if (fText == null) {
fLastRuleStatusIndex = 0;
fLastStatusIndexValid = true;
return BreakIterator.DONE;
}
// t.last() returns the offset of the last character,
// rather than the past-the-end offset
// so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ...
// will work correctly.
fLastStatusIndexValid = false;
int pos = fText.getEndIndex();
fText.setIndex(pos);
return pos;
}
/**
* Advances the iterator either forward or backward the specified number of steps.
* Negative values move backward, and positive values move forward. This is
* equivalent to repeatedly calling next() or previous().
* @param n The number of steps to move. The sign indicates the direction
* (negative is backwards, and positive is forwards).
* @return The character offset of the boundary position n boundaries away from
* the current one.
* @stable ICU 2.0
*/
public int next(int n) {
int result = current();
while (n > 0) {
result = handleNext();
--n;
}
while (n < 0) {
result = previous();
++n;
}
return result;
}
/**
* Advances the iterator to the next boundary position.
* @return The position of the first boundary after this one.
* @stable ICU 2.0
*/
public int next() {
return handleNext();
}
/**
* Moves the iterator backwards, to the last boundary preceding this one.
* @return The position of the last boundary position preceding this one.
* @stable ICU 2.0
*/
public int previous() {
CharacterIterator text = getText();
fLastStatusIndexValid = false;
// if we have cached break positions and we're still in the range
// covered by them, just move one step backward in the cache
if (fCachedBreakPositions != null && fPositionInCache > 0) {
--fPositionInCache;
text.setIndex(fCachedBreakPositions[fPositionInCache]);
return fCachedBreakPositions[fPositionInCache];
}
// otherwise, dump the cache and use the inherited previous() method to move
// backward. This may fill up the cache with new break positions, in which
// case we have to mark our position in the cache. If it doesn't, use next()
// to move forward until we hit or pass the current position. This *will* fill
// the cache.
else {
// TODO: Try to reuse the array rather than reallocating it all the time
fCachedBreakPositions = null;
int offset = current();
int result = rulesPrevious();
if (result == BreakIterator.DONE) {
return result;
}
if (fDictionaryCharCount == 0) {
return result;
}
if (fCachedBreakPositions != null) {
fPositionInCache = fCachedBreakPositions.length - 2;
return result;
}
while (result < offset) {
int nextResult = handleNext();
if (nextResult >= offset) {
break;
}
result = nextResult;
}
if (fCachedBreakPositions != null) {
for (fPositionInCache = 0; fPositionInCache < fCachedBreakPositions.length; fPositionInCache++) {
if (fCachedBreakPositions[fPositionInCache] >= offset) {
fPositionInCache--;
break;
}
}
}
// prepare for the user asking for our status
// our status will have been marked as valid by the next()
// calls but isn't at the right place, so mark it as invalid
// and recompute it when the user asks
fLastStatusIndexValid = false;
text.setIndex(result);
return result;
}
}
private int rulesPrevious() {
// if we're already sitting at the beginning of the text, return DONE
if (fText == null || current() == fText.getBeginIndex()) {
fLastRuleStatusIndex = 0;
fLastStatusIndexValid = true;
return BreakIterator.DONE;
}
if (fRData.fSRTable != null || fRData.fSFTable != null) {
return handlePrevious(fRData.fRTable);
}
// old rule syntax
// set things up. handlePrevious() will back us up to some valid
// break position before the current position (we back our internal
// iterator up one step to prevent handlePrevious() from returning
// the current position), but not necessarily the last one before
// where we started
int start = current();
previous32(fText);
int lastResult = handlePrevious(fRData.fRTable);
if (lastResult == BreakIterator.DONE) {
lastResult = fText.getBeginIndex();
fText.setIndex(lastResult);
}
int result = lastResult;
int lastTag = 0;
boolean breakTagValid = false;
// iterate forward from the known break position until we pass our
// starting point. The last break position before the starting
// point is our return value
for (;;) {
result = handleNext();
if (result == BreakIterator.DONE || result >= start) {
break;
}
lastResult = result;
lastTag = fLastRuleStatusIndex;
breakTagValid = true;
}
// fLastBreakTag wants to have the value for section of text preceding
// the result position that we are to return (in lastResult.) If
// the backwards rules overshot and the above loop had to do two or more
// handleNext()s to move up to the desired return position, we will have a valid
// tag value. But, if handlePrevious() took us to exactly the correct result positon,
// we wont have a tag value for that position, which is only set by handleNext().
// set the current iteration position to be the last break position
// before where we started, and then return that value
fText.setIndex(lastResult);
fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()
fLastStatusIndexValid = breakTagValid;
return lastResult;
}
/**
* Sets the iterator to refer to the first boundary position following
* the specified position.
* @param offset The position from which to begin searching for a break position.
* @return The position of the first break after the current position.
* @stable ICU 2.0
*/
public int following(int offset) {
CharacterIterator text = getText();
// if we have no cached break positions, or if "offset" is outside the
// range covered by the cache, then dump the cache and call our
// inherited following() method. This will call other methods in this
// class that may refresh the cache.
if (fCachedBreakPositions == null || offset < fCachedBreakPositions[0] ||
offset >= fCachedBreakPositions[fCachedBreakPositions.length - 1]) {
fCachedBreakPositions = null;
return rulesFollowing(offset);
}
// on the other hand, if "offset" is within the range covered by the
// cache, then just search the cache for the first break position
// after "offset"
else {
fPositionInCache = 0;
while (fPositionInCache < fCachedBreakPositions.length
&& offset >= fCachedBreakPositions[fPositionInCache])
++fPositionInCache;
text.setIndex(fCachedBreakPositions[fPositionInCache]);
return text.getIndex();
}
}
private int rulesFollowing(int offset) {
// if the offset passed in is already past the end of the text,
// just return DONE; if it's before the beginning, return the
// text's starting offset
fLastRuleStatusIndex = 0;
fLastStatusIndexValid = true;
if (fText == null || offset >= fText.getEndIndex()) {
last();
return next();
}
else if (offset < fText.getBeginIndex()) {
return first();
}
// otherwise, set our internal iteration position (temporarily)
// to the position passed in. If this is the _beginning_ position,
// then we can just use next() to get our return value
int result = 0;
if (fRData.fSRTable != null) {
// Safe Point Reverse rules exist.
// This allows us to use the optimum algorithm.
fText.setIndex(offset);
// move forward one codepoint to prepare for moving back to a
// safe point.
// this handles offset being between a supplementary character
next32(fText);
// handlePrevious will move most of the time to < 1 boundary away
handlePrevious(fRData.fSRTable);
result = next();
while (result <= offset) {
result = next();
}
return result;
}
if (fRData.fSFTable != null) {
// No Safe point reverse table, but there is a safe pt forward table.
//
fText.setIndex(offset);
previous32(fText);
// handle next will give result >= offset
handleNext(fRData.fSFTable);
// previous will give result 0 or 1 boundary away from offset,
// most of the time
// we have to
int oldresult = previous();
while (oldresult > offset) {
result = previous();
if (result <= offset) {
return oldresult;
}
oldresult = result;
}
result = next();
if (result <= offset) {
return next();
}
return result;
}
// otherwise, we have to sync up first. Use handlePrevious() to back
// us up to a known break position before the specified position (if
// we can determine that the specified position is a break position,
// we don't back up at all). This may or may not be the last break
// position at or before our starting position. Advance forward
// from here until we've passed the starting position. The position
// we stop on will be the first break position after the specified one.
// old rule syntax
fText.setIndex(offset);
if (offset == fText.getBeginIndex()) {
return handleNext();
}
result = previous();
while (result != BreakIterator.DONE && result <= offset) {
result = next();
}
return result;
}
/**
* Sets the iterator to refer to the last boundary position before the
* specified position.
* @param offset The position to begin searching for a break from.
* @return The position of the last boundary before the starting position.
* @stable ICU 2.0
*/
public int preceding(int offset) {
CharacterIterator text = getText();
// if we have no cached break positions, or "offset" is outside the
// range covered by the cache, we can just call the inherited routine
// (which will eventually call other routines in this class that may
// refresh the cache)
if (fCachedBreakPositions == null || offset <= fCachedBreakPositions[0] ||
offset > fCachedBreakPositions[fCachedBreakPositions.length - 1]) {
fCachedBreakPositions = null;
return rulesPreceding(offset);
}
// on the other hand, if "offset" is within the range covered by the cache,
// then all we have to do is search the cache for the last break position
// before "offset"
else {
fPositionInCache = 0;
while (fPositionInCache < fCachedBreakPositions.length
&& offset > fCachedBreakPositions[fPositionInCache])
++fPositionInCache;
--fPositionInCache;
text.setIndex(fCachedBreakPositions[fPositionInCache]);
return text.getIndex();
}
}
private int rulesPreceding(int offset) {
// if the offset passed in is already past the end of the text,
// just return DONE; if it's before the beginning, return the
// text's starting offset
if (fText == null || offset > fText.getEndIndex()) {
// return BreakIterator::DONE;
return last();
}
else if (offset < fText.getBeginIndex()) {
return first();
}
// if we start by updating the current iteration position to the
// position specified by the caller, we can just use previous()
// to carry out this operation
int result;
if (fRData.fSFTable != null) {
/// todo synwee
// new rule syntax
fText.setIndex(offset);
// move backwards one codepoint to prepare for moving forwards to a
// safe point.
// this handles offset being between a supplementary character
previous32(fText);
handleNext(fRData.fSFTable);
result = previous();
while (result >= offset) {
result = previous();
}
return result;
}
if (fRData.fSRTable != null) {
// backup plan if forward safe table is not available
fText.setIndex(offset);
next32(fText);
// handle previous will give result <= offset
handlePrevious(fRData.fSRTable);
// next will give result 0 or 1 boundary away from offset,
// most of the time
// we have to
int oldresult = next();
while (oldresult < offset) {
result = next();
if (result >= offset) {
return oldresult;
}
oldresult = result;
}
result = previous();
if (result >= offset) {
return previous();
}
return result;
}
// old rule syntax
fText.setIndex(offset);
return previous();
}
/**
* Throw IllegalArgumentException unless begin <= offset < end.
* @stable ICU 2.0
*/
protected static final void checkOffset(int offset, CharacterIterator text) {
if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
throw new IllegalArgumentException("offset out of bounds");
}
}
/**
* Returns true if the specified position is a boundary position. As a side
* effect, leaves the iterator pointing to the first boundary position at
* or after "offset".
* @param offset the offset to check.
* @return True if "offset" is a boundary position.
* @stable ICU 2.0
*/
public boolean isBoundary(int offset) {
checkOffset(offset, fText);
// the beginning index of the iterator is always a boundary position by definition
if (offset == fText.getBeginIndex()) {
first(); // For side effects on current position, tag values.
return true;
}
if (offset == fText.getEndIndex()) {
last(); // For side effects on current position, tag values.
return true;
}
// otherwise, we can use following() on the position before the specified
// one and return true if the position we get back is the one the user
// specified
// return following(offset - 1) == offset;
// TODO: check whether it is safe to revert to the simpler offset-1 code
// The safe rules may take care of unpaired surrogates ok.
fText.setIndex(offset);
previous32(fText);
int pos = fText.getIndex();
boolean result = following(pos) == offset;
return result;
}
/**
* Returns the current iteration position.
* @return The current iteration position.
* @stable ICU 2.0
*/
public int current() {
return (fText != null) ? fText.getIndex() : BreakIterator.DONE;
}
private void makeRuleStatusValid() {
if (fLastStatusIndexValid == false) {
// No cached status is available.
int curr = current();
if (curr == BreakIterator.DONE || curr == fText.getBeginIndex()) {
// At start of text, or there is no text. Status is always zero.
fLastRuleStatusIndex = 0;
fLastStatusIndexValid = true;
} else {
// Not at start of text. Find status the tedious way.
int pa = fText.getIndex();
first();
int pb = current();
while (fText.getIndex() < pa) {
pb = next();
}
Assert.assrt(pa == pb);
}
Assert.assrt(fLastStatusIndexValid == true);
Assert.assrt(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fRData.fStatusTable.length);
}
}
/**
* Return the status tag from the break rule that determined the most recently
* returned break position. The values appear in the rule source
* within brackets, {123}, for example. For rules that do not specify a
* status, a default value of 0 is returned. If more than one rule applies,
* the numerically largest of the possible status values is returned.
* <p>
* Of the standard types of ICU break iterators, only the word break
* iterator provides status values. The values are defined in
* class RuleBasedBreakIterator, and allow distinguishing between words
* that contain alphabetic letters, "words" that appear to be numbers,
* punctuation and spaces, words containing ideographic characters, and
* more. Call <code>getRuleStatus</code> after obtaining a boundary
* position from <code>next()<code>, <code>previous()</code>, or
* any other break iterator functions that returns a boundary position.
* <p>
* @return the status from the break rule that determined the most recently
* returned break position.
*
* @draft ICU 3.0
* @provisional This is a draft API and might change in a future release of ICU.
*/
public int getRuleStatus() {
makeRuleStatusValid();
// Status records have this form:
// Count N <-- fLastRuleStatusIndex points here.
// Status val 0
// Status val 1
// ...
// Status val N-1 <-- the value we need to return
// The status values are sorted in ascending order.
// This function returns the last (largest) of the array of status values.
int idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex];
int tagVal = fRData.fStatusTable[idx];
return tagVal;
}
/**
* Get the status (tag) values from the break rule(s) that determined the most
* recently returned break position. The values appear in the rule source
* within brackets, {123}, for example. The default status value for rules
* that do not explicitly provide one is zero.
* <p>
* The status values used by the standard ICU break rules are defined
* as public constants in class RuleBasedBreakIterator.
* <p>
* If the size of the output array is insufficient to hold the data,
* the output will be truncated to the available length. No exception
* will be thrown.
*
* @param fillInArray an array to be filled in with the status values.
* @return The number of rule status values from rules that determined
* the most recent boundary returned by the break iterator.
* In the event that the array is too small, the return value
* is the total number of status values that were available,
* not the reduced number that were actually returned.
* @draft ICU 3.0
* @provisional This is a draft API and might change in a future release of ICU.
*/
public int getRuleStatusVec(int[] fillInArray) {
makeRuleStatusValid();
int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex];
if (fillInArray != null) {
int numToCopy = Math.min(numStatusVals, fillInArray.length);
for (int i=0; i<numToCopy; i++) {
fillInArray[i] = fRData.fStatusTable[fLastRuleStatusIndex + i + 1];
}
}
return numStatusVals;
}
/**
* Return a CharacterIterator over the text being analyzed. This version
* of this method returns the actual CharacterIterator we're using internally.
* Changing the state of this iterator can have undefined consequences. If
* you need to change it, clone it first.
* @return An iterator over the text being analyzed.
* @stable ICU 2.0
*/
public CharacterIterator getText() {
return fText;
}
/**
* Set the iterator to analyze a new piece of text. This function resets
* the current iteration position to the beginning of the text.
* @param newText An iterator over the text to analyze.
* @stable ICU 2.0
*/
public void setText(CharacterIterator newText) {
if (newText != null) {
if ((fBreakType == KIND_WORD || fBreakType == KIND_LINE)
&& newText.getEndIndex() != newText.getBeginIndex()) {
fUseDictionary = true;
} else {
fUseDictionary = false;
}
}
fText = newText;
this.first();
fCachedBreakPositions = null;
fDictionaryCharCount = 0;
fPositionInCache = 0;
}
/**
* @internal
* @deprecated This API is ICU internal only.
*/
void setBreakType(int type) {
fBreakType = type;
if (type != KIND_WORD && type != KIND_LINE) {
fUseDictionary = false;
}
}
/**
* @internal
* @deprecated This API is ICU internal only.
*/
int getBreakType() {
return fBreakType;
}
/**
* Control debug, trace and dump options.
* @internal
*/
static String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ?
ICUDebug.value(RBBI_DEBUG_ARG) : null;
/**
* Finds an appropriate LanguageBreakEngine for this character and
* break type.
* @internal
* @deprecated This API is ICU internal only.
*/
protected LanguageBreakEngine getEngineFor(int c) {
if (c == DONE32 || !fUseDictionary) {
return null;
}
for (LanguageBreakEngine candidate : fBreakEngines) {
if (candidate.handles(c, fBreakType)) {
return candidate;
}
}
// if we don't have an existing engine, build one.
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
LanguageBreakEngine eng = null;
try {
switch (script) {
case UScript.THAI:
eng = new ThaiBreakEngine();
break;
case UScript.KATAKANA:
case UScript.HIRAGANA:
case UScript.HAN:
if (getBreakType() == KIND_WORD) {
eng = new CjkBreakEngine(false);
}
else {
fUnhandledBreakEngine.handleChar(c, getBreakType());
eng = fUnhandledBreakEngine;
}
break;
case UScript.HANGUL:
if (getBreakType() == KIND_WORD) {
eng = new CjkBreakEngine(true);
} else {
fUnhandledBreakEngine.handleChar(c, getBreakType());
eng = fUnhandledBreakEngine;
}
break;
default:
fUnhandledBreakEngine.handleChar(c, getBreakType());
eng = fUnhandledBreakEngine;
break;
}
} catch (IOException e) {
eng = null;
}
if (eng != null) {
fBreakEngines.add(eng);
}
return eng;
}
//-----------------------------------------------------------------------------------
//
// handleNext(void) All forward iteration vectors through this function.
//
//-----------------------------------------------------------------------------------
int handleNext() {
CharacterIterator text = getText();
// if there are no cached break positions, or if we've just moved
// off the end of the range covered by the cache, we have to dump
// and possibly regenerate the cache
int startPos = text.getIndex();
if (fCachedBreakPositions == null || fPositionInCache == fCachedBreakPositions.length - 1) {
// start by using the rules handleNext() to find a tentative return
// value. dictionaryCharCount tells us how many dictionary characters
// we passed over on our way to the tentative return value
fDictionaryCharCount = 0;
int result = handleNext(fRData.fFTable);
// if we passed over more than one dictionary character, then we use
// divideUpDictionaryRange() to regenerate the cached break positions
// for the new range.
if (fDictionaryCharCount > 1 && result - startPos > 1) {
text.setIndex(startPos);
LanguageBreakEngine e = getEngineFor(current32(text));
if (e != null) {
// we have an engine! use it to produce breaks
Stack<Integer> breaks = new Stack<Integer>();
e.findBreaks(text, startPos, result, false, getBreakType(), breaks);
fCachedBreakPositions = new int[breaks.size() + 2];
fCachedBreakPositions[0] = startPos;
for (int i = 0; i < breaks.size(); i++) {
fCachedBreakPositions[i + 1] = breaks.elementAt(i).intValue();
}
fCachedBreakPositions[breaks.size() + 1] = result;
fPositionInCache = 0;
} else {
// we don't have an engine; just use the rules
text.setIndex(result);
return result;
}
}
else {
// otherwise, the value we got back from the inherited function
// is our return value, and we can dump the cache
fCachedBreakPositions = null;
return result;
}
}
// if the cache of break positions has been regenerated (or existed all
// along), then just advance to the next break position in the cache
// and return it
if (fCachedBreakPositions != null) {
++fPositionInCache;
text.setIndex(fCachedBreakPositions[fPositionInCache]);
return fCachedBreakPositions[fPositionInCache];
}
///CLOVER:OFF
Assert.assrt(false);
return -9999; // WE SHOULD NEVER GET HERE!
///CLOVER:ON
}
/**
* The State Machine Engine for moving forward is here.
* This function is the heart of the RBBI run time engine.
*
* @param stateTable
* @return the new iterator position
*
* A note on supplementary characters and the position of underlying
* Java CharacterIterator: Normally, a character iterator is positioned at
* the char most recently returned by next(). Within this function, when
* a supplementary char is being processed, the char iterator is left
* sitting on the trail surrogate, in the middle of the code point.
* This is different from everywhere else, where an iterator always
* points at the lead surrogate of a supplementary.
*/
private int handleNext(short stateTable[]) {
int state;
short category = 0;
int mode;
int row;
int c;
int lookaheadStatus = 0;
int lookaheadTagIdx = 0;
int result = 0;
int initialPosition = 0;
int lookaheadResult = 0;
boolean lookAheadHardBreak =
(stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
if (fTrace) {
System.out.println("Handle Next pos char state category");
}
// No matter what, handleNext alway correctly sets the break tag value.
fLastStatusIndexValid = true;
fLastRuleStatusIndex = 0;
// if we're already at the end of the text, return DONE.
if (fText == null) {
fLastRuleStatusIndex = 0;
return BreakIterator.DONE;
}
// Set up the starting char
initialPosition = fText.getIndex();
result = initialPosition;
c = fText.current();
if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
c = nextTrail32(fText, c);
if (c == DONE32) {
fLastRuleStatusIndex = 0;
return BreakIterator.DONE;
}
}
// Set the initial state for the state machine
state = START_STATE;
row = fRData.getRowIndex(state);
category = 3;
mode = RBBI_RUN;
if ((stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
category = 2;
mode = RBBI_START;
}
// loop until we reach the end of the text or transition to state 0
while (state != STOP_STATE) {
if (c == DONE32) {
// Reached end of input string.
if (mode == RBBI_END) {
// We have already run the loop one last time with the
// character set to the pseudo {eof} value. Now it is time
// to unconditionally bail out.
if (lookaheadResult > result) {
// We ran off the end of the string with a pending
// look-ahead match.
// Treat this as if the look-ahead condition had been
// met, and return
// the match at the / position from the look-ahead rule.
result = lookaheadResult;
fLastRuleStatusIndex = lookaheadTagIdx;
lookaheadStatus = 0;
} else if (result == initialPosition) {
// Ran off end, no match found.
// move forward one
fText.setIndex(initialPosition);
next32(fText);
}
break;
}
// Run the loop one last time with the fake end-of-input character category
mode = RBBI_END;
category = 1;
}
// Get the char category. An incoming category of 1 or 2 mens that
// we are preset for doing the beginning or end of input, and
// that we shouldn't get a category from an actual text input character.
//
if (mode == RBBI_RUN) {
// look up the current character's character category, which tells us
// which column in the state table to look at.
//
category = (short) fRData.fTrie.getCodePointValue(c);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iterators (subclasses).
// Chars that need to be handled by a dictionary have a flag bit set
// in their category values.
//
if ((category & 0x4000) != 0) {
fDictionaryCharCount++;
// And off the dictionary flag bit.
category &= ~0x4000;
}
}
if (fTrace) {
System.out.print(" " + RBBIDataWrapper.intToString(fText.getIndex(), 5));
System.out.print(RBBIDataWrapper.intToHexString(c, 10));
System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
}
// look up a state transition in the state table
// state = row->fNextState[category];
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
row = fRData.getRowIndex(state);
// Advance to the next character.
// If this is a beginning-of-input loop iteration, don't advance.
// The next iteration will be processing the first real input character.
if (mode == RBBI_RUN) {
c = (int)fText.next();
if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
c = nextTrail32(fText, c);
}
} else {
if (mode == RBBI_START) {
mode = RBBI_RUN;
}
}
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
// Match found, common case
result = fText.getIndex();
if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != DONE32) {
// The iterator has been left in the middle of a surrogate pair.
// We want the start of it.
result--;
}
// Remember the break status (tag) values.
fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
}
if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
if (lookaheadStatus != 0
&& stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
// Lookahead match is completed. Set the result accordingly, but only
// if no other rule has matched further in the mean time.
result = lookaheadResult;
fLastRuleStatusIndex = lookaheadTagIdx;
lookaheadStatus = 0;
// TODO: make a standalone hard break in a rule work.
if (lookAheadHardBreak) {
fText.setIndex(result);
return result;
}
// Look-ahead completed, but other rules may match further. Continue on.
// TODO: junk this feature? I don't think it's used anywhere.
continue;
}
lookaheadResult = fText.getIndex();
if (c>=UTF16.SUPPLEMENTARY_MIN_VALUE && c!=DONE32) {
// The iterator has been left in the middle of a surrogate pair.
// We want the beginning of it.
lookaheadResult--;
}
lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
lookaheadTagIdx = stateTable[row + RBBIDataWrapper.TAGIDX];
continue;
}
if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
// Because this is an accepting state, any in-progress look-ahead match
// is no longer relavant. Clear out the pending lookahead status.
lookaheadStatus = 0;
}
} // End of state machine main loop
// The state machine is done. Check whether it found a match...
// If the iterator failed to advance in the match engine, force it ahead by one.
// (This really indicates a defect in the break rules. They should always match
// at least one character.)
if (result == initialPosition) {
result = fText.setIndex(initialPosition);
next32(fText);
result = fText.getIndex();
}
// Leave the iterator at our result position.
// (we may have advanced beyond the last accepting position chasing after
// longer matches that never completed.)
fText.setIndex(result);
if (fTrace) {
System.out.println("result = " + result);
}
return result;
}
private int handlePrevious(short stateTable[]) {
if (fText == null || stateTable == null) {
return 0;
}
int state;
int category = 0;
int mode;
int row;
int c;
int lookaheadStatus = 0;
int result = 0;
int initialPosition = 0;
int lookaheadResult = 0;
boolean lookAheadHardBreak =
(stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK) != 0;
// handlePrevious() never gets the rule status.
// Flag the status as invalid; if the user ever asks for status, we will need
// to back up, then re-find the break position using handleNext(), which does
// get the status value.
fLastStatusIndexValid = false;
fLastRuleStatusIndex = 0;
// set up the starting char
initialPosition = fText.getIndex();
result = initialPosition;
c = previous32(fText);
// Set up the initial state for the state machine
state = START_STATE;
row = fRData.getRowIndex(state);
category = 3; // TODO: obsolete? from the old start/run mode scheme?
mode = RBBI_RUN;
if ((stateTable[RBBIDataWrapper.FLAGS+1] & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
category = 2;
mode = RBBI_START;
}
if (fTrace) {
System.out.println("Handle Prev pos char state category ");
}
// loop until we reach the beginning of the text or transition to state 0
//
mainLoop: for (;;) {
innerBlock: {
if (c == DONE32) {
// Reached end of input string.
if (mode == RBBI_END || fRData.fHeader.fVersion == 1) {
// Either this is the old (ICU 3.2 and earlier) format data which
// does not support explicit support for matching {eof}, or
// we have already done the {eof} iteration. Now is the time
// to unconditionally bail out.
if (lookaheadResult < result) {
// We ran off the end of the string with a pending look-ahead match.
// Treat this as if the look-ahead condition had been met, and return
// the match at the / position from the look-ahead rule.
result = lookaheadResult;
lookaheadStatus = 0;
} else if (result == initialPosition) {
// Ran off start, no match found.
// Move one position (towards the start, since we are doing previous.)
fText.setIndex(initialPosition);
previous32(fText);
}
break mainLoop;
}
mode = RBBI_END;
category = 1;
}
if (mode == RBBI_RUN) {
// look up the current character's category, which tells us
// which column in the state table to look at.
//
category = (short) fRData.fTrie.getCodePointValue(c);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iterators (subclasses).
// Chars that need to be handled by a dictionary have a flag bit set
// in their category values.
//
if ((category & 0x4000) != 0) {
fDictionaryCharCount++;
// And off the dictionary flag bit.
category &= ~0x4000;
}
}
if (fTrace) {
System.out.print(" " + fText.getIndex() + " ");
if (0x20 <= c && c < 0x7f) {
System.out.print(" " + c + " ");
} else {
System.out.print(" " + Integer.toHexString(c) + " ");
}
System.out.println(" " + state + " " + category + " ");
}
// State Transition - move machine to its next state
//
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
row = fRData.getRowIndex(state);
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
// Match found, common case, could have lookahead so we move
// on to check it
result = fText.getIndex();
}
if (stateTable[row + RBBIDataWrapper.LOOKAHEAD] != 0) {
if (lookaheadStatus != 0
&& stateTable[row + RBBIDataWrapper.ACCEPTING] == lookaheadStatus) {
// Lookahead match is completed. Set the result
// accordingly, but only
// if no other rule has matched further in the mean
// time.
result = lookaheadResult;
lookaheadStatus = 0;
// TODO: make a stand-alone hard break in a rule work.
if (lookAheadHardBreak) {
break mainLoop;
}
// Look-ahead completed, but other rules may match further.
// Continue on.
// TODO: junk this feature? I don't think that it's used anywhere.
break innerBlock;
}
// Hit a possible look-ahead match. We are at the
// position of the '/'. Remember this position.
lookaheadResult = fText.getIndex();
lookaheadStatus = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
break innerBlock;
}
// not lookahead...
if (stateTable[row + RBBIDataWrapper.ACCEPTING] != 0) {
// This is a plain (non-look-ahead) accepting state.
if (!lookAheadHardBreak) {
// Clear out any pending look-ahead matches,
// but only if not doing the lookAheadHardBreak option
// which needs to force a break no matter what is going
// on with the rest of the match, i.e. we can't abandon
// a partially completed look-ahead match because
// some other rule matched further than the '/' position
// in the look-ahead match.
lookaheadStatus = 0;
}
}
} // end of innerBlock. "break innerBlock" in above code comes out here.
if (state == STOP_STATE) {
// Normal loop exit is here
break mainLoop;
}
// then move iterator position backwards one character
//
if (mode == RBBI_RUN) {
c = previous32(fText);
} else {
if (mode == RBBI_START) {
mode = RBBI_RUN;
}
}
} // End of the main loop.
// The state machine is done. Check whether it found a match...
//
// If the iterator failed to advance in the match engine, force it ahead by one.
// (This really indicates a defect in the break rules. They should always match
// at least one character.)
if (result == initialPosition) {
result = fText.setIndex(initialPosition);
previous32(fText);
result = fText.getIndex();
}
fText.setIndex(result);
if (fTrace) {
System.out.println("Result = " + result);
}
return result;
}
}