/* | |
******************************************************************************* | |
* Copyright (C) 1996-2008, International Business Machines Corporation and * | |
* others. All Rights Reserved. * | |
******************************************************************************* | |
*/ | |
package com.ibm.icu.text; | |
import com.ibm.icu.impl.Assert; | |
import com.ibm.icu.text.UnicodeSet; | |
import java.util.Stack; | |
import java.io.InputStream; | |
import java.text.CharacterIterator; | |
import java.io.IOException; | |
class ThaiBreakIterator extends DictionaryBasedBreakIterator { | |
/* Helper class for improving readability of the Thai word break | |
* algorithm. | |
*/ | |
static class PossibleWord { | |
// List size, limited by the maximum number of words in the dictionary | |
// that form a nested sequence. | |
private final int POSSIBLE_WORD_LIST_MAX = 20; | |
//list of word candidate lengths, in increasing length order | |
private int lengths[]; | |
private int count[]; // Count of candidates | |
private int prefix; // The longeset match with a dictionary word | |
private int offset; // Offset in the text of these candidates | |
private int mark; // The preferred candidate's offset | |
private int current; // The candidate we're currently looking at | |
// Default constructor | |
public PossibleWord() { | |
lengths = new int[POSSIBLE_WORD_LIST_MAX]; | |
count = new int[1]; // count needs to be an array of 1 so that it can be pass as reference | |
offset = -1; | |
} | |
// Fill the list of candidates if needed, select the longest, and return the number found | |
public int candidates(CharacterIterator fIter, BreakCTDictionary dict, int rangeEnd) { | |
int start = fIter.getIndex(); | |
if (start != offset) { | |
offset = start; | |
prefix = dict.matches(fIter, rangeEnd - start, lengths, count, lengths.length); | |
// Dictionary leaves text after longest prefix, not longest word. Back up. | |
if (count[0] <= 0) { | |
fIter.setIndex(start); | |
} | |
} | |
if (count[0] > 0) { | |
fIter.setIndex(start + lengths[count[0]-1]); | |
} | |
current = count[0] - 1; | |
mark = current; | |
return count[0]; | |
} | |
// Select the currently marked candidate, point after it in the text, and invalidate self | |
public int acceptMarked(CharacterIterator fIter) { | |
fIter.setIndex(offset + lengths[mark]); | |
return lengths[mark]; | |
} | |
// Backup from the current candidate to the next shorter one; rreturn true if that exists | |
// and point the text after it | |
public boolean backUp(CharacterIterator fIter) { | |
if (current > 0) { | |
fIter.setIndex(offset + lengths[--current]); | |
return true; | |
} | |
return false; | |
} | |
// Return the longest prefix this candidate location shares with a dictionary word | |
public int longestPrefix() { | |
return prefix; | |
} | |
// Mark the current candidate as the one we like | |
public void markCurrent() { | |
mark = current; | |
} | |
} | |
private static UnicodeSet fThaiWordSet; | |
private static UnicodeSet fEndWordSet; | |
private static UnicodeSet fBeginWordSet; | |
private static UnicodeSet fSuffixSet; | |
private static UnicodeSet fMarkSet; | |
private BreakCTDictionary fDictionary; | |
// Constants for ThaiBreakIterator | |
// How many words in a row are "good enough"? | |
private static final byte THAI_LOOKAHEAD = 3; | |
// Will not combine a non-word with a preceding dictionary word longer than this | |
private static final byte THAI_ROOT_COMBINE_THRESHOLD = 3; | |
// Will not combine a non-word that shares at least this much prefix with a | |
// dictionary word with a preceding word | |
private static final byte THAI_PREFIX_COMBINE_THRESHOLD = 3; | |
// Ellision character | |
private static final char THAI_PAIYANNOI = 0x0E2F; | |
// Repeat character | |
private static final char THAI_MAIYAMOK = 0x0E46; | |
// Minimum word size | |
private static final byte THAI_MIN_WORD = 2; | |
// Minimum number of characters for two words | |
//private final int THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2; | |
static { | |
// Initialize UnicodeSets | |
fThaiWordSet = new UnicodeSet(); | |
fMarkSet = new UnicodeSet(); | |
fEndWordSet = new UnicodeSet(); | |
fBeginWordSet = new UnicodeSet(); | |
fSuffixSet = new UnicodeSet(); | |
fThaiWordSet.applyPattern(new String("[[:Thai:]&[:LineBreak=SA:]]")); | |
fThaiWordSet.compact(); | |
fMarkSet.applyPattern(new String("[[:Thai:]&[:LineBreak=SA:]&[:M:]]")); | |
fMarkSet.add(0x0020); | |
fEndWordSet = fThaiWordSet; | |
fEndWordSet.remove(0x0E31); // MAI HAN-AKAT | |
fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI | |
fBeginWordSet.add(0x0E01, 0x0E2E); //KO KAI through HO NOKHUK | |
fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI | |
fSuffixSet.add(THAI_PAIYANNOI); | |
fSuffixSet.add(THAI_MAIYAMOK); | |
// Compact for caching | |
fMarkSet.compact(); | |
fEndWordSet.compact(); | |
fBeginWordSet.compact(); | |
fSuffixSet.compact(); | |
// Freeze the static UnicodeSet | |
fThaiWordSet.freeze(); | |
fMarkSet.freeze(); | |
fEndWordSet.freeze(); | |
fBeginWordSet.freeze(); | |
fSuffixSet.freeze(); | |
} | |
public ThaiBreakIterator(InputStream ruleStream, InputStream dictionaryStream) throws IOException { | |
super(ruleStream); | |
// Initialize diciontary | |
fDictionary = new BreakCTDictionary(dictionaryStream); | |
} | |
/** | |
* This is the implementation function for next(). | |
* @internal | |
* @deprecated This API is ICU internal only. | |
*/ | |
protected int handleNext() { | |
CharacterIterator text = getText(); | |
// if there are no cached break positions, or if we've just moved | |
// off the end of the range covered by the cache, we have to dump | |
// and possibly regenerate the cache | |
if (cachedBreakPositions == null || positionInCache == cachedBreakPositions.length - 1) { | |
// start by using the inherited handleNext() to find a tentative return | |
// value. dictionaryCharCount tells us how many dictionary characters | |
// we passed over on our way to the tentative return value | |
int startPos = text.getIndex(); | |
fDictionaryCharCount = 0; | |
int result = super.handleNext(); | |
// if we passed over more than one dictionary character, then we use | |
// divideUpDictionaryRange() to regenerate the cached break positions | |
// for the new range | |
if (fDictionaryCharCount > 1 && result - startPos > 1) { | |
divideUpDictionaryRange(startPos, result); | |
} | |
// otherwise, the value we got back from the inherited fuction | |
// is our return value, and we can dump the cache | |
else { | |
cachedBreakPositions = null; | |
return result; | |
} | |
} | |
// if the cache of break positions has been regenerated (or existed all | |
// along), then just advance to the next break position in the cache | |
// and return it | |
if (cachedBreakPositions != null) { | |
++positionInCache; | |
text.setIndex(cachedBreakPositions[positionInCache]); | |
return cachedBreakPositions[positionInCache]; | |
} | |
Assert.assrt(false); | |
return -9999; // SHOULD NEVER GET HERE! | |
} | |
/** | |
* Divide up a range of known dictionary characters. | |
* | |
* @param rangeStart The start of the range of dictionary characters | |
* @param rangeEnd The end of the range of dictionary characters | |
* @return The number of breaks found | |
*/ | |
private int divideUpDictionaryRange(int rangeStart, int rangeEnd) { | |
if ((rangeEnd - rangeStart) < THAI_MIN_WORD) { | |
return 0; // Not enough chacters for word | |
} | |
CharacterIterator fIter = getText(); | |
int wordsFound = 0; | |
int wordLength; | |
int current; | |
Stack foundBreaks = new Stack(); | |
PossibleWord words[] = new PossibleWord[THAI_LOOKAHEAD]; | |
for (int i = 0; i < THAI_LOOKAHEAD; i++) { | |
words[i] = new PossibleWord(); | |
} | |
int uc; | |
fIter.setIndex(rangeStart); | |
while ((current = fIter.getIndex()) < rangeEnd) { | |
wordLength = 0; | |
//Look for candidate words at the current position | |
int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd); | |
// If we found exactly one, use that | |
if (candidates == 1) { | |
wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(fIter); | |
wordsFound += 1; | |
} | |
// If there was more than one, see which one can take use forward the most words | |
else if (candidates > 1) { | |
boolean foundBest = false; | |
// If we're already at the end of the range, we're done | |
if (fIter.getIndex() < rangeEnd) { | |
do { | |
int wordsMatched = 1; | |
if (words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) { | |
if (wordsMatched < 2) { | |
// Followed by another dictionary word; mark first word as a good candidate | |
words[wordsFound%THAI_LOOKAHEAD].markCurrent(); | |
wordsMatched = 2; | |
} | |
// If we're already at the end of the range, we're done | |
if (fIter.getIndex() >= rangeEnd) { | |
break; | |
} | |
// See if any of the possible second words is followed by a third word | |
do { | |
// If we find a third word, stop right away | |
if (words[(wordsFound+2)%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) { | |
words[wordsFound%THAI_LOOKAHEAD].markCurrent(); | |
foundBest = true; | |
break; | |
} | |
} while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(fIter)); | |
} | |
} while (words[wordsFound%THAI_LOOKAHEAD].backUp(fIter) && !foundBest); | |
} | |
/* foundBest: */wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(fIter); | |
wordsFound += 1; | |
} | |
// We come here after having either found a word or not. We look ahead to the | |
// next word. If it's not a dictionary word, we will combine it with the word we | |
// just found (if there is one), but only if the preceding word does not exceed | |
// the threshold. | |
// The text iterator should now be positioned at the end of the word we found. | |
if (fIter.getIndex() < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) { | |
// If it is a dictionary word, do nothing. If it isn't, then if there is | |
// no preceding word, or the non-word shares less than the minimum threshold | |
// of characters with a dictionary word, then scan to resynchronize | |
if (words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 && | |
(wordLength == 0 || | |
words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) { | |
// Look for a plausible word boundary | |
int remaining = rangeEnd - (current + wordLength); | |
int pc = fIter.current(); | |
int chars = 0; | |
for (;;) { | |
fIter.next(); | |
uc = fIter.current(); | |
chars += 1; | |
if (--remaining <= 0) { | |
break; | |
} | |
if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { | |
// Maybe. See if it's in the dictionary. | |
// Note: In the original Apple code, checked that the next | |
// two characters after uc were not 0x0E4C THANTHAKHAT before | |
// checking the dictionary. That is just a performance filter, | |
// but it's not clear it's faster than checking the trie | |
int candidate = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd); | |
fIter.setIndex(current+wordLength+chars); | |
if (candidate > 0) { | |
break; | |
} | |
} | |
pc = uc; | |
} | |
// Bump the word cound if there wasn't already one | |
if (wordLength <= 0) { | |
wordsFound += 1; | |
} | |
// Update the length with the passed-over characters | |
wordLength += chars; | |
} else { | |
// Backup to where we were for next iteration | |
fIter.setIndex(current+wordLength); | |
} | |
} | |
// Never stop before a combining mark. | |
int currPos; | |
while ((currPos = fIter.getIndex()) < rangeEnd && fMarkSet.contains(fIter.current())) { | |
fIter.next(); | |
wordLength += fIter.getIndex() - currPos; | |
} | |
// Look ahead for possible suffixes if a dictionary word does not follow. | |
// We do this in code rather than using a rule so that the heuristic | |
// resynch continues to function. For example, one of the suffix characters | |
// could be a typo in the middle of a word. | |
if (fIter.getIndex() < rangeEnd && wordLength > 0) { | |
if (words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 && | |
fSuffixSet.contains(uc = fIter.current())) { | |
if (uc == THAI_PAIYANNOI) { | |
if (!fSuffixSet.contains(fIter.previous())) { | |
// Skip over previous end and PAIYANNOI | |
fIter.next(); | |
fIter.next(); | |
wordLength += 1; | |
uc = fIter.current(); | |
} else { | |
// Restore prior position | |
fIter.next(); | |
} | |
} | |
if (uc == THAI_MAIYAMOK) { | |
if (fIter.previous() != THAI_MAIYAMOK) { | |
// Skip over previous end and MAIYAMOK | |
fIter.next(); | |
fIter.next(); | |
wordLength += 1; | |
} else { | |
// restore prior position | |
fIter.next(); | |
} | |
} | |
} else { | |
fIter.setIndex(current+wordLength); | |
} | |
} | |
// Did we find a word on this iteration? If so, push it on the break stack | |
if (wordLength > 0) { | |
foundBreaks.push(new Integer(current+wordLength)); | |
} | |
} | |
// Don't return a break for the end of the dictionary range if there is one there | |
if (((Integer)foundBreaks.peek()).intValue() >= rangeEnd) { | |
foundBreaks.pop(); | |
wordsFound -= 1; | |
} | |
// Store the break points in cachedBreakPositions. | |
cachedBreakPositions = new int[foundBreaks.size() + 2]; | |
cachedBreakPositions[0] = rangeStart; | |
int i; | |
for (i = 0; i < foundBreaks.size(); i++) { | |
cachedBreakPositions[i + 1] = ((Integer)foundBreaks.elementAt(i)).intValue(); | |
} | |
cachedBreakPositions[i + 1] = rangeEnd; | |
positionInCache = 0; | |
return wordsFound; | |
} | |
} |