blob: 7dd8872b92c49c9538f9625e7d23548aa893f87a [file] [log] [blame]
/*
*******************************************************************************
* Copyright (C) 2003-2004 International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.dev.test.rbbi;
// Monkey testing of RuleBasedBreakIterator
import com.ibm.icu.dev.test.*;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import java.util.List;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Locale;
/**
* Monkey tests for RBBI. These tests have independent implementations of
* the Unicode TR boundary rules, and compare results between these and ICU's
* implementation, using random data.
*
* Tests cover Grapheme Cluster (char), Word and Line breaks
*
* Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp
*
*/
public class RBBITestMonkey extends TestFmwk {
/**
 * Command-line entry point: creates the test object and passes the
 * arguments to its run() method.
 */
public static void main(String[] args) {
    RBBITestMonkey monkeyTest = new RBBITestMonkey();
    monkeyTest.run(args);
}
//
// class RBBIMonkeyKind
//
// Monkey Test for Break Iteration
// Abstract interface class. Concrete derived classes independently
// implement the break rules for different iterator types.
//
// The Monkey Test itself doesn't know which type of break iterator it is
// testing, but works purely in terms of the interface defined here.
//
abstract static class RBBIMonkeyKind {
// Return a List of UnicodeSets, representing the character classes used
// for this type of iterator.
abstract List charClasses();
// Set the test text on which subsequent calls to next() will operate.
abstract void setText(StringBuffer text);
// Find the next break position, starting from the specified position.
// Return -1 after reaching end of string.
abstract int next(int i);
}
/**
 * Monkey test subclass for testing Character (Grapheme Cluster) boundaries.
 *
 * The character classes built here are only used to generate random test
 * text; the expected boundaries themselves are computed by nextGC().
 */
static class RBBICharMonkey extends RBBIMonkeyKind {
    // Character classes handed out by charClasses(), in the order added below.
    List fSets;
    UnicodeSet fCRLFSet;
    UnicodeSet fControlSet;
    UnicodeSet fExtendSet;
    UnicodeSet fHangulSet;
    UnicodeSet fAnySet;
    // Text being analyzed; set by setText().
    StringBuffer fText;

    RBBICharMonkey() {
        fText = null;
        fCRLFSet = new UnicodeSet("[\\r\\n]");
        fControlSet = new UnicodeSet("[[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}]-[\\n]-[\\r]]");
        fExtendSet = new UnicodeSet("[\\p{Grapheme_Extend}]");
        // All five Hangul syllable types: L, V, T, LV and LVT.
        // (The V type was previously missing because L was listed twice.)
        fHangulSet = new UnicodeSet(
            "[\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=V}\\p{Hangul_Syllable_Type=T}" +
            "\\p{Hangul_Syllable_Type=LV}\\p{Hangul_Syllable_Type=LVT}]");
        fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]");
        fSets = new ArrayList();
        fSets.add(fCRLFSet);
        fSets.add(fControlSet);
        fSets.add(fExtendSet);
        fSets.add(fHangulSet);
        fSets.add(fAnySet);
    }

    // Set the text on which subsequent calls to next() will operate.
    void setText(StringBuffer s) {
        fText = s;
    }

    // Return the List of UnicodeSets used to generate random test data.
    List charClasses() {
        return fSets;
    }

    // Return the end of the grapheme cluster starting at index i,
    // as computed by the reference implementation nextGC().
    int next(int i) {
        return nextGC(fText, i);
    }
}
/**
 *
 * Word Monkey Test Class
 *
 * Reference implementation of word boundary detection, used to compute the
 * expected break positions that the monkey test compares against.
 *
 */
static class RBBIWordMonkey extends RBBIMonkeyKind {
// Character classes returned by charClasses().
List fSets;
// Text being analyzed; set by setText().
StringBuffer fText;
UnicodeSet fKatakanaSet;
UnicodeSet fALetterSet;
UnicodeSet fMidLetterSet;
UnicodeSet fMidNumLetSet;
UnicodeSet fMidNumSet;
UnicodeSet fNumericSet;
UnicodeSet fFormatSet;
UnicodeSet fExtendSet;
UnicodeSet fOtherSet;
RBBIWordMonkey() {
fSets = new ArrayList();
fKatakanaSet = new UnicodeSet("[\\p{script=KATAKANA}\\u30fc\\uff70\\uff9e\\uff9f]");
// ALetter: alphabetic characters, minus ideographs, Thai, Lao, Hiragana and the Katakana set.
String ALetterStr = "[[\\p{Alphabetic}\\u05f3]-[\\p{Ideographic}]-[\\p{Script=Thai}]" +
"-[\\p{Script=Lao}]-[\\p{Script=Hiragana}]-" +
"[\\p{script=KATAKANA}\\u30fc\\uff70\\uff9e\\uff9f]]";
fALetterSet = new UnicodeSet(ALetterStr);
fMidLetterSet = new UnicodeSet("[\\u0027\\u00b7\\u05f4\\u2019\\u2027]");
fMidNumLetSet = new UnicodeSet("[\\u002e\\u003a]");
fMidNumSet = new UnicodeSet("[\\p{Line_Break=Infix_Numeric}]");
fNumericSet = new UnicodeSet("[\\p{Line_Break=Numeric}]");
fFormatSet = new UnicodeSet("[\\p{Format}-\\p{Grapheme_Extend}]");
fExtendSet = new UnicodeSet("[\\p{Grapheme_Extend}]");
// Other: start from the set of all code points and remove the classes below.
// NOTE(review): fFormatSet and fExtendSet are not removed here, so fOtherSet
// overlaps them - confirm that this is intended.
fOtherSet = new UnicodeSet();
fOtherSet.complement();
fOtherSet.removeAll(fKatakanaSet);
fOtherSet.removeAll(fALetterSet);
fOtherSet.removeAll(fMidLetterSet);
fOtherSet.removeAll(fMidNumLetSet);
fOtherSet.removeAll(fMidNumSet);
fOtherSet.removeAll(fNumericSet);
// NOTE(review): fKatakanaSet and fExtendSet are not added to fSets as
// separate classes - confirm that this is intended.
fSets.add(fALetterSet);
fSets.add(fMidLetterSet);
fSets.add(fMidNumLetSet);
fSets.add(fMidNumSet);
fSets.add(fNumericSet);
fSets.add(fFormatSet);
fSets.add(fOtherSet);
}
// Return the List of UnicodeSets used to generate random test data.
List charClasses() {
return fSets;
}
// Set the text on which subsequent calls to next() will operate.
void setText(StringBuffer s) {
fText = s;
}
// Find the next word break position following prevPos.
// Returns -1 if prevPos is at or beyond the end of the text.
int next(int prevPos) {
int p0, p1, p2, p3; // Indices of the significant code points around the
// break position being tested. The candidate break
// location is before p2.
int breakPos = -1;
int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
// Prev break at end of string. return DONE.
if (prevPos >= fText.length()) {
return -1;
}
p0 = p1 = p2 = p3 = prevPos;
c3 = UTF16.charAt(fText, prevPos);
c0 = c1 = c2 = 0;
// Format char after prev break? Special case, see last Note for Word Boundaries TR.
// break immediately after the format char.
// NOTE(review): breakPos is still -1 at this point, so the condition below can
// never be true and this block never executes. Confirm whether prevPos was intended.
if (breakPos >= 0 && fFormatSet.contains(c3) && breakPos < (fText.length() -1)) {
breakPos = UTF16.moveCodePointOffset(fText, breakPos, 1);
return breakPos;
}
// Loop runs once per "significant" character position in the input text.
for (;;) {
// Move all of the positions forward in the input string.
p0 = p1; c0 = c1;
p1 = p2; c1 = c2;
p2 = p3; c2 = c3;
// Advance p3 by (GC Format*) Rules 3, 4
p3 = nextGC(fText, p3);
if (p3 == -1 || p3 >= fText.length()) {
// Ran off the end of the text; use 0 as the character past the end.
p3 = fText.length();
c3 = 0;
} else {
c3 = UTF16.charAt(fText, p3);
// Skip over any format characters following the grapheme cluster.
while (fFormatSet.contains(c3)) {
p3 = moveIndex32(fText, p3, 1);
c3 = 0;
if (p3 < fText.length()) {
c3 = UTF16.charAt(fText, p3);
}
}
}
if (p1 == p2) {
// Still warming up the loop. (won't work with zero length strings, but we don't care)
continue;
}
if (p2 == fText.length()) {
// Reached end of string. Always a break position.
break;
}
// Rule (5). ALetter x ALetter
if (fALetterSet.contains(c1) &&
fALetterSet.contains(c2)) {
continue;
}
// Rule (6) ALetter x (MidLetter | MidNumLet) ALetter
//
// Rule 7 is handled by its own test below, on the next pass through
// the loop, once the positions have shifted forward by one.
if ( fALetterSet.contains(c1) &&
(fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2)) &&
fALetterSet.contains(c3)) {
continue;
}
// Rule (7) ALetter (MidLetter | MidNumLet) x ALetter
if (fALetterSet.contains(c0) &&
(fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1) ) &&
fALetterSet.contains(c2)) {
continue;
}
// Rule (8) Numeric x Numeric
if (fNumericSet.contains(c1) &&
fNumericSet.contains(c2)) {
continue;
}
// Rule (9) ALetter x Numeric
if (fALetterSet.contains(c1) &&
fNumericSet.contains(c2)) {
continue;
}
// Rule (10) Numeric x ALetter
if (fNumericSet.contains(c1) &&
fALetterSet.contains(c2)) {
continue;
}
// Rule (11) Numeric (MidNum | MidNumLet) x Numeric
if ( fNumericSet.contains(c0) &&
(fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1)) &&
fNumericSet.contains(c2)) {
continue;
}
// Rule (12) Numeric x (MidNum | MidNumLet) Numeric
if (fNumericSet.contains(c1) &&
(fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2)) &&
fNumericSet.contains(c3)) {
continue;
}
// Rule (13) Katakana x Katakana
if (fKatakanaSet.contains(c1) &&
fKatakanaSet.contains(c2)) {
continue;
}
// Rule 14. Break found here.
break;
}
// Rule 4 fixup, back up before any trailing
// format characters at the end of the word.
// The break is placed at the end of the grapheme cluster that starts at p1,
// whenever nextGC() reports a position beyond p1; otherwise it stays at p2.
breakPos = p2;
int t = nextGC(fText, p1);
if (t > p1) {
breakPos = t;
}
return breakPos;
}
}
static class RBBILineMonkey extends RBBIMonkeyKind {
// Character classes handed out by charClasses(), in the order listed in the constructor.
List fSets;
// One set per Line_Break property value used by the line break rules.
UnicodeSet fBK, fCR, fLF, fCM, fNL;
UnicodeSet fSG;    // declared, but not populated by the constructor
UnicodeSet fWJ, fZW, fGL, fCB, fSP;
UnicodeSet fB2, fBA, fBB, fHY, fCL;
UnicodeSet fEX, fIN, fNS, fOP, fQU;
UnicodeSet fIS, fNU, fPO, fPR, fSY;
UnicodeSet fAI, fAL, fID, fSA, fXX;
// Character (grapheme cluster) break iterator over the same text as fText.
BreakIterator fCharBI;
// Text being analyzed; set by setText().
StringBuffer fText;
int fOrigPositions;

// Build the Line_Break character classes and the list used for generating test data.
RBBILineMonkey()
{
    fBK = new UnicodeSet("[\\p{Line_Break=BK}]");
    fCR = new UnicodeSet("[\\p{Line_break=CR}]");
    fLF = new UnicodeSet("[\\p{Line_break=LF}]");
    fCM = new UnicodeSet("[\\p{Line_break=CM}]");
    fNL = new UnicodeSet("[\\p{Line_break=NL}]");
    fWJ = new UnicodeSet("[\\p{Line_break=WJ}]");
    fZW = new UnicodeSet("[\\p{Line_break=ZW}]");
    fGL = new UnicodeSet("[\\p{Line_break=GL}]");
    fCB = new UnicodeSet("[\\p{Line_break=CB}]");
    fSP = new UnicodeSet("[\\p{Line_break=SP}]");
    fB2 = new UnicodeSet("[\\p{Line_break=B2}]");
    fBA = new UnicodeSet("[\\p{Line_break=BA}]");
    fBB = new UnicodeSet("[\\p{Line_break=BB}]");
    fHY = new UnicodeSet("[\\p{Line_break=HY}]");
    fCL = new UnicodeSet("[\\p{Line_break=CL}]");
    fEX = new UnicodeSet("[\\p{Line_break=EX}]");
    fIN = new UnicodeSet("[\\p{Line_break=IN}]");
    fNS = new UnicodeSet("[\\p{Line_break=NS}]");
    fOP = new UnicodeSet("[\\p{Line_break=OP}]");
    fQU = new UnicodeSet("[\\p{Line_break=QU}]");
    fIS = new UnicodeSet("[\\p{Line_break=IS}]");
    fNU = new UnicodeSet("[\\p{Line_break=NU}]");
    fPO = new UnicodeSet("[\\p{Line_break=PO}]");
    fPR = new UnicodeSet("[\\p{Line_break=PR}]");
    fSY = new UnicodeSet("[\\p{Line_break=SY}]");
    fAI = new UnicodeSet("[\\p{Line_break=AI}]");
    fAL = new UnicodeSet("[\\p{Line_break=AL}]");
    fID = new UnicodeSet("[\\p{Line_break=ID}]");
    fSA = new UnicodeSet("[\\p{Line_break=SA}]");
    fXX = new UnicodeSet("[\\p{Line_break=XX}]");

    // Classes whose default behavior is the same as AL are folded into fAL.
    fAL.addAll(fXX); // Default behavior for XX is identical to AL
    fAL.addAll(fAI); // Default behavior for AI is identical to AL
    fAL.addAll(fSA); // Default behavior for SA is XX, which defaults to AL

    // The order of this list, including the second occurrence of fWJ,
    // determines which class each random number selects.
    fSets = new ArrayList(Arrays.asList(new UnicodeSet[] {
        fBK, fCR, fLF, fCM, fNL, fWJ, fZW, fGL, fCB, fSP,
        fB2, fBA, fBB, fHY, fCL, fEX, fIN, fNS, fOP, fQU,
        fIS, fNU, fPO, fPR, fSY, fAI, fAL, fID, fWJ, fSA
    }));

    fCharBI = BreakIterator.getCharacterInstance(Locale.ENGLISH);
}
// Set the text on which subsequent calls to next() will operate, and give
// the character break iterator a String copy of the same text.
void setText(StringBuffer s) {
    fText = s;
    String textCopy = s.toString();
    fCharBI.setText(textCopy);
}
//
// rule67Adjust
// Implementation of Line Break TR rules 6 and 7.
// Handles combining marks, Hangul syllables and other sequences that have to
// be treated as if they were something other than what they actually are.
//
// Kept as a separate function because it is applied twice for each potential
// break: once to the chars before the position being checked, and once to
// the text following the possible break.
//
// pos, posChar       index of, and code point at, the char being adjusted.
// nextPos, nextChar  index of, and code point at, the following char.
// retVals            array to fill in and return; allocated here if null.
// Returns            [0] the (possibly substituted) posChar
//                    [1] the (possibly advanced) nextPos
//                    [2] the char at that nextPos, or 0 at end of text.
//
int[] rule67Adjust(int pos, int posChar, int nextPos, int nextChar, int[] retVals) {
    int[] result = (retVals != null) ? retVals : new int[3];
    result[0] = posChar;
    result[1] = nextPos;
    result[2] = nextChar;
    if (pos == -1) {
        // Invalid initial position, as seen during the warmup iteration of
        // the main loop in next(). Hand the inputs back unchanged.
        return result;
    }

    int scanPos = nextPos;

    // LB 6 Treat Korean Syllables as a single unit
    boolean isHangul =
        UCharacter.getIntPropertyValue(posChar, UProperty.HANGUL_SYLLABLE_TYPE)
            != UCharacter.HangulSyllableType.NOT_APPLICABLE;
    if (isHangul) {
        // Advance by grapheme cluster, which contains the logic to locate
        // Hangul syllables.
        scanPos = fCharBI.following(pos);
        // Some Grapheme_Extend chars that a grapheme cluster absorbs are NOT
        // Line Break CM (some are GL, for example), and must not be consumed.
        //   1. Back up until a char of class ID is found (the syllable itself),
        //      undoing the consumption of extend chars by the char iterator.
        //   2. Let the LB 7b logic below re-consume any Line Break CM chars.
        do {
            scanPos = moveIndex32(fText, scanPos, -1);
            if (fID.contains(UTF16.charAt(fText, scanPos))) {
                // Reached the Hangul syllable; step forward to just past it.
                scanPos = moveIndex32(fText, scanPos, +1);
                break;
            }
        } while (scanPos != 0);
    }

    // LB 7b Keep combining sequences together.
    // Advance over any CM class chars. Line Break CM differs from grapheme
    // cluster extend, so this is needed even after Hangul syllables.
    boolean hardBreakOrZW = fBK.contains(posChar) || fZW.contains(posChar)
        || posChar == 0x0a || posChar == 0x0d || posChar == 0x85;
    if (!hardBreakOrZW) {
        while (scanPos != fText.length()
                && fCM.contains(UTF16.charAt(fText, scanPos))) {
            scanPos = moveIndex32(fText, scanPos, 1);
        }
    }

    int adjustedChar = posChar;

    // LB 7a In a SP CM* sequence, treat the SP as an ID
    if (scanPos != nextPos && fSP.contains(adjustedChar)) {
        adjustedChar = 0x4e00;   // 0x4e00 is a CJK Ideograph, linebreak type is ID.
    }

    // LB 7b Treat X CM* as if it were X. No explicit action required.

    // LB 7c Treat any remaining combining mark as AL
    if (fCM.contains(adjustedChar)) {
        adjustedChar = 'A';
    }

    // Report the updated positions to the caller. They only differ from the
    // inputs if a combining sequence or Hangul syllable was consumed.
    result[0] = adjustedChar;
    result[1] = scanPos;
    result[2] = (scanPos < fText.length()) ? UTF16.charAt(fText, scanPos) : 0;
    return result;
}
// Find the next line break position following startPos, using this class's
// own implementation of the line break rules.
// Returns -1 if startPos is at or beyond the end of the text.
int next(int startPos) {
int pos; // Index of the char following a potential break position
int thisChar; // Character at above position "pos"
int prevPos; // Index of the char preceding a potential break position
int prevChar; // Character at above position. Note that prevChar
// and thisChar may not be adjacent because combining
// characters between them will be ignored.
int nextPos; // Index of the next character following pos.
// Usually skips over combining marks.
int nextCPPos; // Index of the code point following "pos."
// May point to a combining mark.
int tPos; // temp value.
int c;
int LB10match[] = null; // Regular expr match results for LB10.
                        // Never assigned, so LB10Check always receives null.
int matchVals[] = null; // Regular Expression Match Results
int rule67vals[] = null; // Return values from Rule 6 & 7 adjust function.
if (startPos >= fText.length()) {
return -1;
}
// Initial values for loop. Loop will run the first time without finding breaks,
// while the invalid values shift out and the "this" and
// "prev" positions are filled in with good values.
pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration.
thisChar = prevChar = 0;
nextPos = nextCPPos = startPos;
// Loop runs once per position in the test text, until a break position
// is found.
// Within the loop, "continue" means no break is allowed between prevChar and
// thisChar, and "break" means a boundary was found at pos.
for (;;) {
prevPos = pos;
prevChar = thisChar;
pos = nextPos;
// Break at end of text.
if (pos >= fText.length()) {
break;
}
thisChar = UTF16.charAt(fText, pos);
nextCPPos = moveIndex32(fText, pos, 1);
nextPos = nextCPPos;
// LB 3a Always break after hard line breaks,
if (fBK.contains(prevChar)) {
break;
}
// LB 3b Break after CR, LF, NL, but not inside CR LF
if (prevChar == 0x0d && thisChar == 0x0a) {
continue;
}
if (prevChar == 0x0d ||
prevChar == 0x0a ||
prevChar == 0x85) {
break;
}
// LB 3c Don't break before hard line breaks
if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
fBK.contains(thisChar)) {
continue;
}
// LB 10 QU SP* x OP
// On a match, jump ahead so that the OP becomes the current character.
if (prevPos >= 0) {
matchVals = LB10Check(fText, prevPos, LB10match); // Test for match of
if (matchVals[0] != -1) { // /QU CM* SP* (OP) CM*/
pos = matchVals[0];
nextPos = matchVals[1];
thisChar = UTF16.charAt(fText, pos);
continue;
}
}
// LB 11 CL SP* x NS
// On a match, jump ahead so that the NS becomes the current character.
if (prevPos >= 0) {
matchVals = LB11Check(fText, prevPos, matchVals);
if (matchVals[0] != -1) { // /CL CM* SP* (NS) CM*/
pos = matchVals[0];
nextPos = matchVals[1];
thisChar = UTF16.charAt(fText, pos);
continue;
}
}
// LB 4 Don't break before spaces or zero-width space.
if (fSP.contains(thisChar)) {
continue;
}
if (fZW.contains(thisChar)) {
continue;
}
// LB 5 Break after zero width space
if (fZW.contains(prevChar)) {
break;
}
// LB 6, LB 7
// First application: adjust the char before the candidate break (prevChar),
// which may move pos forward past a combining sequence or Hangul syllable.
/*int oldpos = pos;*/
int retVals[] = null;
retVals = rule67Adjust(prevPos, prevChar, pos, thisChar, retVals);
prevChar = retVals[0];
pos = retVals[1];
thisChar = retVals[2];
nextCPPos = moveIndex32(fText, pos, 1);
nextPos = nextCPPos;
c = 0;
if (nextPos < fText.length()) {
c = UTF16.charAt(fText, nextPos);
}
// another peculiarity of LB 4 - Don't break before space
if (fSP.contains(thisChar)) {
continue;
}
// Second application: adjust the char after the candidate break (thisChar).
rule67vals = rule67Adjust(pos, thisChar, nextPos, c, rule67vals);
thisChar = rule67vals[0];
nextPos = rule67vals[1];
c = rule67vals[2];
// If the loop is still warming up - if we haven't shifted the initial
// -1 positions out of prevPos yet - loop back to advance the
// position in the input without any further looking for breaks.
if (prevPos == -1) {
continue;
}
// Re-apply rules 3c, 4 because these could be affected by having
// a new thisChar from doing rule 6 or 7.
if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 || // 3c
fBK.contains(thisChar)) {
continue;
}
if (fSP.contains(thisChar)) { // LB 4
continue;
}
if (fZW.contains(thisChar)) { // LB 4
continue;
}
// LB 8 Don't break before closings.
// NU x CL and NU x IS are not matched here so that they will
// fall into LB 17 and the more general number regular expression.
//
if (!fNU.contains(prevChar) && fCL.contains(thisChar) ||
fEX.contains(thisChar) ||
!fNU.contains(prevChar) && fIS.contains(thisChar) ||
!fNU.contains(prevChar) && fSY.contains(thisChar)) {
continue;
}
// LB 9 Don't break after OP SP*
// Scan backwards, checking for this sequence.
// The OP char could include combining marks, so we actually check for
// OP CM* SP*
// Another Twist: The Rule 67 fixes may have changed a SP CM
// sequence into an ID char, so before scanning back through spaces,
// verify that prevChar is indeed a space. The prevChar variable
// may differ from fText[prevPos]
tPos = prevPos;
if (fSP.contains(prevChar)) {
while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
tPos=moveIndex32(fText, tPos, -1);
}
}
while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
tPos=moveIndex32(fText, tPos, -1);
}
if (fOP.contains(UTF16.charAt(fText, tPos))) {
continue;
}
// LB 11a B2 x B2
if (fB2.contains(thisChar) && fB2.contains(prevChar)) {
continue;
}
// LB 11b
// x GL
// GL x
if (fGL.contains(thisChar) || fGL.contains(prevChar)) {
continue;
}
// Same treatment for WJ:  x WJ,  WJ x
if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) {
continue;
}
// LB 12 break after space
if (fSP.contains(prevChar)) {
break;
}
// LB 14
// x QU
// QU x
if (fQU.contains(thisChar) || fQU.contains(prevChar)) {
continue;
}
// LB 14a Break around a CB
if (fCB.contains(thisChar) || fCB.contains(prevChar)) {
break;
}
// LB 15
//   x BA,  x HY,  x NS,  BB x
if (fBA.contains(thisChar) ||
fHY.contains(thisChar) ||
fNS.contains(thisChar) ||
fBB.contains(prevChar) ) {
continue;
}
// LB 16
//   (AL | ID | IN | NU) x IN
if (fAL.contains(prevChar) && fIN.contains(thisChar) ||
fID.contains(prevChar) && fIN.contains(thisChar) ||
fIN.contains(prevChar) && fIN.contains(thisChar) ||
fNU.contains(prevChar) && fIN.contains(thisChar) ) {
continue;
}
// LB 17 ID x PO (Note: Leading CM behaves like ID)
// AL x NU
// NU x AL
if (fID.contains(prevChar) && fPO.contains(thisChar) ||
fCM.contains(prevChar) && fPO.contains(thisChar) ||
fAL.contains(prevChar) && fNU.contains(thisChar) ||
fNU.contains(prevChar) && fAL.contains(thisChar) ) {
continue;
}
// LB 18 Numbers
matchVals = LBNumberCheck(fText, prevPos, matchVals);
if (matchVals[0] != -1) {
// Matched a number. But could have been just a single digit, which would
// not represent a "no break here" between prevChar and thisChar
int numEndIdx = matchVals[1]; // idx of first char following num
if (numEndIdx > pos) {
// Number match includes at least the two chars being checked
if (numEndIdx > nextPos) {
// Number match includes additional chars. Update pos and nextPos
// so that next loop iteration will continue at the end of the number,
// checking for breaks between last char in number & whatever follows.
nextPos = numEndIdx;
pos = fCharBI.preceding(numEndIdx);
thisChar = UTF16.charAt(fText, pos);
// Back up over any combining marks to reach the base char.
while (fCM.contains(thisChar)) {
pos = fCharBI.preceding(pos);
thisChar = UTF16.charAt(fText, pos);
}
}
continue;
}
}
// PR x AL
if (fPR.contains(prevChar) && fAL.contains(thisChar)) {
continue;
}
// PR x ID
if (fPR.contains(prevChar) && fID.contains(thisChar)) {
continue;
}
// LB 18b
//   Break after HY, break before BB
if (fHY.contains(prevChar) || fBB.contains(thisChar)) {
break;
}
// LB 19
//   AL x AL
if (fAL.contains(prevChar) && fAL.contains(thisChar)) {
continue;
}
// LB 19b
//   IS x AL
if (fIS.contains(prevChar) && fAL.contains(thisChar)) {
continue;
}
// LB 20 Break everywhere else
break;
}
return pos;
}
// Match the following regular expression in the input text.
//    QU CM* SP* (OP) CM*
//    0  1   2    4   4        (match state after consuming each item)
// Can't use Java regexp because supplementary chars must be handled,
// because line break properties are needed, and
// because Unicode Version must match ICU.
//
// s         the text to match against.
// startIdx  index at which the match must start.
// retVals   array to fill in and return; allocated here if null.
// Returns   [0] index of the OP in the match, or -1 if no match
//           [1] index of first char following the match.
//               Only updated when [0] is set by a successful match.
private int[] LB10Check(StringBuffer s, int startIdx, int[] retVals) {
    if (retVals == null) {
        retVals = new int[2];
    }
    retVals[0] = -1; // Indicates no match.
    int matchState = 0;
    matchLoop: for (int idx = startIdx; idx < s.length(); idx = moveIndex32(s, idx, 1)) {
        int c = UTF16.charAt(s, idx);
        int cLBType = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);
        switch (matchState) {
        case 0:
            if (cLBType == UCharacter.LineBreak.QUOTATION) {
                matchState = 1;
                break;
            }
            break matchLoop; /* No Match */
        case 1:
            if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
                break;
            }
            // Not a combining mark: intentionally fall through, so that the
            // first SP or OP after QU CM* is handled by the state 2 logic.
        case 2:
            if (cLBType == UCharacter.LineBreak.SPACE) {
                matchState = 2;
                break;
            }
            if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
                matchState = 4;
                retVals[0] = idx;
                retVals[1] = idx;
                break;
            }
            break matchLoop; /* No Match */
        case 4:
            if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
                // Track the last char that is part of the match.
                retVals[1] = idx;
                break;
            }
            break matchLoop; // Successful match.
        }
    }
    if (retVals[0] >= 0) {
        // retVals[1] holds the last matched char; convert it to the index
        // of the first char following the match. Uses the text that was
        // scanned (s), rather than the fText field.
        retVals[1] = moveIndex32(s, retVals[1], 1);
    }
    return retVals;
}
// Match the following regular expression in the input text.
//    CL CM* SP* (NS) CM*
//    0  1   2    4   4        (match state after consuming each item)
// Can't use Java regexp because supplementary chars must be handled,
// because line break properties are needed, and
// because Unicode Version must match ICU.
//
// s         the text to match against.
// startIdx  index at which the match must start.
// retVals   array to fill in and return; allocated here if null.
// Returns   [0] index of the NS in the match, or -1 if no match
//           [1] index of first char following the match.
//               Only updated when [0] is set by a successful match.
private int[] LB11Check(StringBuffer s, int startIdx, int[] retVals) {
    if (retVals == null) {
        retVals = new int[2];
    }
    retVals[0] = -1; // Indicates no match.
    int matchState = 0;
    matchLoop: for (int idx = startIdx; idx < s.length(); idx = moveIndex32(s, idx, 1)) {
        int c = UTF16.charAt(s, idx);
        int cLBType = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);
        switch (matchState) {
        case 0:
            if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) {
                matchState = 1;
                break;
            }
            break matchLoop; /* No Match */
        case 1:
            if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
                break;
            }
            // Not a combining mark: intentionally fall through, so that the
            // first SP or NS after CL CM* is handled by the state 2 logic.
        case 2:
            if (cLBType == UCharacter.LineBreak.SPACE) {
                matchState = 2;
                break;
            }
            if (cLBType == UCharacter.LineBreak.NONSTARTER) {
                matchState = 4;
                retVals[0] = idx;
                retVals[1] = idx;
                break;
            }
            break matchLoop; /* No Match */
        case 4:
            if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
                // Track the last char that is part of the match.
                retVals[1] = idx;
                break;
            }
            break matchLoop; // Successful match.
        }
    }
    if (retVals[0] >= 0) {
        // retVals[1] holds the last matched char; convert it to the index
        // of the first char following the match. Doing this once, after the
        // loop, gives the same result as adjusting on every iteration, and
        // matches the structure of LB10Check.
        retVals[1] = moveIndex32(s, retVals[1], 1);
    }
    return retVals;
}
// Match the following regular expression in the input text.
//    (PR CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)? (PO CM*)?
//     1  1       4         4    7  7     7              7      9  9      11 11
//                                           (match state after consuming each item)
//
// s         the text to match against.
// startIdx  index at which the match must start.
// retVals   array to fill in and return; allocated here if null.
// Returns   [0] index of the start of the match, or -1 if no match
//           [1] index of first char following the match.
//               Only updated when a match is found.
private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) {
    int[] result = (retVals == null) ? new int[2] : retVals;
    result[0] = -1; // Indicates no match.

    int state = 0;
    int idx = startIdx;
    scan: while (idx < s.length()) {
        int lb = UCharacter.getIntPropertyValue(UTF16.charAt(s, idx), UProperty.LINE_BREAK);
        boolean isCM = (lb == UCharacter.LineBreak.COMBINING_MARK);
        boolean isOpenOrHyphen = (lb == UCharacter.LineBreak.OPEN_PUNCTUATION
                || lb == UCharacter.LineBreak.HYPHEN);

        switch (state) {
        case 0:   // Start: optional PR, optional OP or HY, or the first NU.
            if (lb == UCharacter.LineBreak.PREFIX_NUMERIC) {
                state = 1;
            } else if (isOpenOrHyphen) {
                state = 4;
            } else if (lb == UCharacter.LineBreak.NUMERIC) {
                state = 7;
            } else {
                break scan;   // No match.
            }
            break;

        case 1:   // After PR CM*
            if (!isCM) {
                if (isOpenOrHyphen) {
                    state = 4;
                } else if (lb == UCharacter.LineBreak.NUMERIC) {
                    state = 7;
                } else {
                    break scan;   // No match.
                }
            }
            break;

        case 4:   // After (OP | HY) CM*
            if (!isCM) {
                if (lb == UCharacter.LineBreak.NUMERIC) {
                    state = 7;
                } else {
                    break scan;   // No match.
                }
            }
            break;

        case 7:   // Inside the body of the number.
            if (isCM
                    || lb == UCharacter.LineBreak.NUMERIC
                    || lb == UCharacter.LineBreak.INFIX_NUMERIC
                    || lb == UCharacter.LineBreak.BREAK_SYMBOLS) {
                // Still in the number body; stay in state 7.
            } else if (lb == UCharacter.LineBreak.CLOSE_PUNCTUATION) {
                state = 9;
            } else if (lb == UCharacter.LineBreak.POSTFIX_NUMERIC) {
                state = 11;
            } else {
                break scan;   // Match complete.
            }
            break;

        case 9:   // After CL CM*
            if (!isCM) {
                if (lb == UCharacter.LineBreak.POSTFIX_NUMERIC) {
                    state = 11;
                } else {
                    break scan;   // Match complete.
                }
            }
            break;

        case 11:  // After PO CM*
            if (!isCM) {
                break scan;   // Match complete.
            }
            break;
        }
        idx = moveIndex32(s, idx, 1);
    }

    // States above 4 are only reached once at least one NU has been consumed.
    if (state > 4) {
        result[0] = startIdx;
        result[1] = idx;
    }
    return result;
}
// Return the List of UnicodeSets built by the constructor.
List charClasses() {
    return this.fSets;
}
}
/**
 * Move an index into a string by n code points.
 * Similar to UTF16.moveCodePointOffset, but without the exceptions, which were
 * complicating usage.
 * @param s   a Text string
 * @param pos The starting code unit index into the text string
 * @param amt The number of code points to move by. Positive values move
 *            forward, negative values move backward, zero leaves pos unchanged.
 * @return The adjusted code unit index. Moving forward stops at the string's
 *         length; moving backward stops at 0.
 */
static int moveIndex32(StringBuffer s, int pos, int amt) {
    int i;
    char c;
    if (amt>0) {
        for (i=0; i<amt; i++) {
            if (pos >= s.length()) {
                return s.length();
            }
            c = s.charAt(pos);
            pos++;
            // Step over the trail surrogate too, if this is a surrogate pair.
            if (UTF16.isLeadSurrogate(c) && pos < s.length()) {
                c = s.charAt(pos);
                if (UTF16.isTrailSurrogate(c)) {
                    pos++;
                }
            }
        }
    } else {
        for (i=0; i>amt; i--) {
            if (pos <= 0) {
                return 0;
            }
            pos--;
            c = s.charAt(pos);
            // If we landed on the trail half of a surrogate pair, also step
            // back over the lead surrogate that precedes it. The lead
            // surrogate is at pos-1; the char at pos is the trail itself.
            if (UTF16.isTrailSurrogate(c) && pos > 0) {
                c = s.charAt(pos - 1);
                if (UTF16.isLeadSurrogate(c)) {
                    pos--;
                }
            }
        }
    }
    return pos;
}
/**
 * Return the index of the next code point in the input text.
 * @param s the text
 * @param i the index of the current code point, or -1
 * @return the index following the code point at i; or -1 if i is -1
 *         or if i + 1 is beyond the end of the text.
 * @internal
 */
static int nextCP(StringBuffer s, int i) {
    // -1 is the end of input indication; keep returning it once it is seen.
    if (i == -1 || i + 1 > s.length()) {
        return -1;
    }
    // Supplementary code points occupy two code units.
    boolean isSupplementary = UTF16.charAt(s, i) >= UTF16.SUPPLEMENTARY_MIN_VALUE;
    return isSupplementary ? i + 2 : i + 1;
}
//
// The following UnicodeSets are used in matching a Grapheme Cluster
//
// Control characters: always form a grapheme cluster on their own in nextGC().
private static UnicodeSet GC_Control = new UnicodeSet(
        "[[:Zl:][:Zp:][:Cc:][:Cf:]-[\\u000d\\u000a]-[:Grapheme_Extend:]]");

// Characters that extend the preceding grapheme cluster.
private static UnicodeSet GC_Extend = new UnicodeSet("[[:Grapheme_Extend:]]");

// Hangul syllable types, one set per type.
private static UnicodeSet GC_L   = new UnicodeSet("[[:Hangul_Syllable_Type=L:]]");
private static UnicodeSet GC_V   = new UnicodeSet("[[:Hangul_Syllable_Type=V:]]");
private static UnicodeSet GC_T   = new UnicodeSet("[[:Hangul_Syllable_Type=T:]]");
private static UnicodeSet GC_LV  = new UnicodeSet("[[:Hangul_Syllable_Type=LV:]]");
private static UnicodeSet GC_LVT = new UnicodeSet("[[:Hangul_Syllable_Type=LVT:]]");
/**
 * Find the end of the extent of a grapheme cluster.
 * This is the reference implementation used by the monkey test for comparison
 * with the RBBI results.
 * @param s The string containing the text to be analyzed
 * @param i The index of the start of the grapheme cluster.
 * @return The index of the first code point following the grapheme cluster,
 *         or -1 if i is -1 or is at or beyond the end of the string.
 * @internal
 */
private static int nextGC(StringBuffer s, int i) {
    if (i == -1 || i >= s.length()) {
        return -1;
    }

    int c = UTF16.charAt(s, i);
    int pos = i;

    // CR, optionally followed by LF, is a cluster.
    if (c == 0x0d) {
        pos = nextCP(s, i);
        if (pos >= s.length()) {
            return pos;
        }
        if (UTF16.charAt(s, pos) == 0x0a) {
            pos = nextCP(s, pos);
        }
        return pos;
    }

    // Controls and a lone LF are single code point clusters.
    if (GC_Control.contains(c) || c == 0x0a) {
        return nextCP(s, pos);
    }

    // Little state machine to consume Hangul Syllables.
    //   1: start    2: after L    3: after V or LV    4: after T or LVT    5: after Extend
    // A next state of 0 means that c does not continue the syllable.
    int hangulState = 1;
    for (;;) {
        int nextState;
        switch (hangulState) {
        case 1:
            nextState = GC_L.contains(c) ? 2
                      : (GC_V.contains(c) || GC_LV.contains(c)) ? 3
                      : (GC_T.contains(c) || GC_LVT.contains(c)) ? 4
                      : 0;
            break;
        case 2:
            nextState = GC_L.contains(c) ? 2
                      : (GC_V.contains(c) || GC_LV.contains(c)) ? 3
                      : GC_LVT.contains(c) ? 4
                      : GC_Extend.contains(c) ? 5
                      : 0;
            break;
        case 3:
            nextState = GC_V.contains(c) ? 3
                      : GC_T.contains(c) ? 4
                      : GC_Extend.contains(c) ? 5
                      : 0;
            break;
        case 4:
            nextState = GC_T.contains(c) ? 4
                      : GC_Extend.contains(c) ? 5
                      : 0;
            break;
        default:  // state 5
            nextState = GC_Extend.contains(c) ? 5 : 0;
            break;
        }
        if (nextState == 0) {
            break;
        }
        hangulState = nextState;

        // Still in a Hangul Syllable, advance to the next code point.
        pos = nextCP(s, pos);
        if (pos >= s.length()) {
            break;
        }
        c = UTF16.charAt(s, pos);
    }
    if (hangulState != 1) {
        // We found a Hangul. We're done.
        return pos;
    }

    // Ordinary characters. Consume one codepoint unconditionally, then any following Extends.
    do {
        pos = nextCP(s, pos);
        if (pos >= s.length()) {
            break;
        }
        c = UTF16.charAt(s, pos);
    } while (GC_Extend.contains(c));
    return pos;
}
/**
 * Random number generator. Java's built-in Randoms are not used, for two reasons:
 * 1. Using this code allows obtaining the same sequences as those from the ICU4C monkey test.
 * 2. The seed has to be read and restored from values occurring in the middle
 *    of a long sequence, to more easily reproduce failing cases.
 *
 * Each call advances m_seed and returns a value in the range 0 .. 32767.
 */
private static int m_seed = 1;
private static int m_rand()
{
    // Linear congruential step; int overflow wraps around.
    int nextSeed = m_seed * 1103515245 + 12345;
    m_seed = nextSeed;
    // The unsigned shift leaves 16 bits; the mask keeps the low 15 of them.
    return (nextSeed >>> 16) & 0x7fff;
}
/**
* Run a RBBI monkey test. Common routine, for all break iterator types.
* Parameters:
* bi - the break iterator to use
* mk - MonkeyKind, abstraction for obtaining expected results
* name - Name of test (char, word, etc.) for use in error messages
* seed - Seed for starting random number generator (parameter from user)
* numIterations
*/
void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int numIterations) {
    // Number of code points put into each random test string.  Left non-final so that
    // the debugging settings below can reduce it.
    int          TESTSTRINGLEN = 500;
    StringBuffer testText = new StringBuffer();
    int          numCharClasses;
    List         chClasses;

    // One flag per UTF-16 index of the test text, true where a boundary was seen.
    // A code point may occupy two UTF-16 code units, hence TESTSTRINGLEN*2, plus one
    // slot for the boundary at the end of the text.
    boolean[]    expectedBreaks   = new boolean[TESTSTRINGLEN*2 + 1];  // from mk (reference rules)
    boolean[]    forwardBreaks    = new boolean[TESTSTRINGLEN*2 + 1];  // from bi.first()/next()
    boolean[]    reverseBreaks    = new boolean[TESTSTRINGLEN*2 + 1];  // from bi.last()/previous()
    boolean[]    isBoundaryBreaks = new boolean[TESTSTRINGLEN*2 + 1];  // from bi.isBoundary()
    boolean[]    followingBreaks  = new boolean[TESTSTRINGLEN*2 + 1];  // from bi.following()
    boolean[]    precedingBreaks  = new boolean[TESTSTRINGLEN*2 + 1];  // from bi.preceding()
    int          i;
    int          loopCount = 0;
    boolean      printTestData = false;
    boolean      printBreaksFromBI = false;

    m_seed = seed;

    chClasses = mk.charClasses();
    numCharClasses = chClasses.size();

    // Random class selection below computes m_rand() % numCharClasses, so an empty
    // list has to be rejected up front.
    if (numCharClasses == 0) {
        errln(name + " break monkey test: no character classes.");
        return;
    }

    // Verify that the character classes all have at least one member.
    for (i=0; i<numCharClasses; i++) {
        UnicodeSet s = (UnicodeSet)chClasses.get(i);
        if (s == null || s.size() == 0) {
            errln("Character Class " + i + " is null or of zero size.");
            return;
        }
    }

    //--------------------------------------------------------------------------------------------
    //
    //  Debugging settings.  Comment out everything in the following block for normal operation
    //
    //--------------------------------------------------------------------------------------------
    // numIterations = 20;
    //RuleBasedBreakIterator_New.fTrace = true;
    //m_seed = -1324359431;
    // TESTSTRINGLEN = 50;
    // printTestData = true;
    // printBreaksFromBI = true;
    // ((RuleBasedBreakIterator_New)bi).dump();
    //--------------------------------------------------------------------------------------------
    //
    //  End of Debugging settings.
    //
    //--------------------------------------------------------------------------------------------

    int dotsOnLine = 0;
    while (loopCount < numIterations || numIterations == -1) {
        if (numIterations == -1 && loopCount % 10 == 0) {
            // If test is running in an infinite loop, display a periodic tic so
            //   we can tell that it is making progress.
            System.out.print(".");
            if (dotsOnLine++ >= 80) {
                System.out.println();
                dotsOnLine = 0;
            }
        }

        // Save current random number seed, so that we can recreate the random numbers
        //   for this loop iteration in event of an error.
        seed = m_seed;

        // Populate a test string with data: pick a random class, then a random member of it.
        testText.setLength(0);
        if (printTestData) {
            System.out.println("Test Data string ...");
        }
        for (i=0; i<TESTSTRINGLEN; i++) {
            int        aClassNum = m_rand() % numCharClasses;
            UnicodeSet classSet  = (UnicodeSet)chClasses.get(aClassNum);
            int        charIdx   = m_rand() % classSet.size();
            int        c         = classSet.charAt(charIdx);
            if (c < 0) {   // TODO:  deal with sets containing strings.
                errln("c < 0");
                // Report it, but do not hand a negative value to appendCodePoint().
                continue;
            }
            UTF16.appendCodePoint(testText, c);
            if (printTestData) {
                System.out.print(Integer.toHexString(c) + " ");
            }
        }
        if (printTestData) {
            System.out.println();
        }

        Arrays.fill(expectedBreaks, false);
        Arrays.fill(forwardBreaks, false);
        Arrays.fill(reverseBreaks, false);
        Arrays.fill(isBoundaryBreaks, false);
        Arrays.fill(followingBreaks, false);
        Arrays.fill(precedingBreaks, false);

        // Calculate the expected results for this test string.
        mk.setText(testText);
        expectedBreaks[0] = true;
        int breakPos = 0;
        int lastBreakPos;
        for (;;) {
            lastBreakPos = breakPos;
            breakPos = mk.next(breakPos);
            if (breakPos == -1) {
                break;
            }
            if (breakPos > testText.length()) {
                // Not usable as an index into expectedBreaks; stop collecting.
                errln("breakPos > testText.length()");
                break;
            }
            if (lastBreakPos >= breakPos) {
                // Feeding a non-advancing position back into next() could loop forever,
                // and a negative one is not a valid index; stop collecting.
                errln("Next() not increasing.");
                break;
            }
            expectedBreaks[breakPos] = true;
        }

        // Find the break positions using forward iteration
        if (printBreaksFromBI) {
            System.out.println("Breaks from BI...");
        }
        bi.setText(testText.toString());
        for (i=bi.first(); i != BreakIterator.DONE; i=bi.next()) {
            if (i < 0 || i > testText.length()) {
                errln(name + " break monkey test: Out of range value returned by breakIterator::next()");
                break;
            }
            if (printBreaksFromBI) {
                System.out.print(Integer.toHexString(i) + " ");
            }
            forwardBreaks[i] = true;
        }
        if (printBreaksFromBI) {
            System.out.println();
        }

        // Find the break positions using reverse iteration
        for (i=bi.last(); i != BreakIterator.DONE; i=bi.previous()) {
            if (i < 0 || i > testText.length()) {
                errln(name + " break monkey test: Out of range value returned by breakIterator::previous()");
                break;
            }
            reverseBreaks[i] = true;
        }

        // Find the break positions using isBoundary() tests.
        for (i=0; i<=testText.length(); i++) {
            isBoundaryBreaks[i] = bi.isBoundary(i);
        }

        // Find the break positions using the following() function.
        // A result is rejected when it does not move past i, moves behind the previously
        // returned break, runs off the end of the text, or skips over a previously
        // returned break that still lies ahead of i.
        lastBreakPos = 0;
        followingBreaks[0] = true;
        for (i=0; i<testText.length(); i++) {
            breakPos = bi.following(i);
            if (breakPos <= i ||
                breakPos < lastBreakPos ||
                breakPos > testText.length() ||
                (breakPos > lastBreakPos && lastBreakPos > i)) {
                errln(name + " break monkey test: " +
                      "Out of range value returned by BreakIterator::following().\n" +
                      "Random seed=" + seed + "  index=" + i + "; following returned " + breakPos +
                      ";  lastBreak=" + lastBreakPos);
                // Forces an error, attributed to following(), in the comparison below so
                // that the surrounding test data gets printed.
                followingBreaks[i] = !expectedBreaks[i];
            } else {
                followingBreaks[breakPos] = true;
                lastBreakPos = breakPos;
            }
        }

        // Find the break positions using the preceding() function.
        // Mirror image of the following() checks above.
        lastBreakPos = testText.length();
        precedingBreaks[testText.length()] = true;
        for (i=testText.length(); i>0; i--) {
            breakPos = bi.preceding(i);
            if (breakPos >= i ||
                breakPos > lastBreakPos ||
                breakPos < 0 ||
                (breakPos < lastBreakPos && lastBreakPos < i)) {
                errln(name + " break monkey test: " +
                      "Out of range value returned by BreakIterator::preceding().\n" +
                      "Random seed=" + seed + "  index=" + i + "; preceding returned " + breakPos +
                      ";  lastBreak=" + lastBreakPos);
                precedingBreaks[i] = !expectedBreaks[i];   // Forces an error.
            } else {
                precedingBreaks[breakPos] = true;
                lastBreakPos = breakPos;
            }
        }

        // Compare the expected and actual results.  Only the first mismatching index of
        // each test string is reported.
        for (i=0; i<=testText.length(); i++) {
            String errorType = null;
            if (forwardBreaks[i] != expectedBreaks[i]) {
                errorType = "next()";
            } else if (reverseBreaks[i] != forwardBreaks[i]) {
                errorType = "previous()";
            } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
                errorType = "isBoundary()";
            } else if (followingBreaks[i] != expectedBreaks[i]) {
                errorType = "following()";
            } else if (precedingBreaks[i] != expectedBreaks[i]) {
                errorType = "preceding()";
            }

            if (errorType != null) {
                // Format a range of the test text that includes the failure as
                //  a data item that can be included in the rbbi test data file.

                // Start of the range: scan backwards from the failure, stopping at the
                //  third expected break encountered or at the start of the text.
                int startContext = i;
                int count = 0;
                for (;;) {
                    if (startContext == 0) { break; }
                    startContext--;
                    if (expectedBreaks[startContext]) {
                        if (count == 2) { break; }
                        count++;
                    }
                }

                // End of the range: scan forwards from the failure.  The count left over
                //  from the backward scan limits how many expected breaks are passed.
                int endContext = i + 1;
                int ci;
                for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
                    for (;;) {
                        if (endContext >= testText.length()) { break; }
                        if (expectedBreaks[endContext-1]) {
                            if (count == 0) { break; }
                            count--;
                        }
                        endContext++;
                    }
                }

                // Format looks like   "<data><>\uabcd\uabcd<>\U0001abcd...</data>"
                StringBuffer errorText = new StringBuffer();
                errorText.append("<data>");
                String hexChars = "0123456789abcdef";
                int c;    // Char from test data
                int bn;   // Bit position of the hex digit being written
                for (ci = startContext; ci <= endContext && ci != -1; ci = nextCP(testText, ci)) {
                    if (ci == i) {
                        // This is the location of the error.
                        errorText.append("<?>");
                    } else if (expectedBreaks[ci]) {
                        // This a non-error expected break position.
                        errorText.append("<>");
                    }
                    if (ci < testText.length()) {
                        c = UTF16.charAt(testText, ci);
                        if (c < 0x10000) {
                            // Four hex digits.
                            errorText.append("\\u");
                            for (bn=12; bn>=0; bn-=4) {
                                errorText.append(hexChars.charAt((c >> bn) & 0xf));
                            }
                        } else {
                            // Eight hex digits.
                            errorText.append("\\U");
                            for (bn=28; bn>=0; bn-=4) {
                                errorText.append(hexChars.charAt((c >> bn) & 0xf));
                            }
                        }
                    }
                }
                if (ci == testText.length() && ci != -1) {
                    errorText.append("<>");
                }
                errorText.append("</data>\n");

                // Output the error
                errln(name + " break monkey test error.  " +
                      (expectedBreaks[i] ? "Break expected but not found." : "Break found but not expected.") +
                      "\nOperation = " + errorType + "; random seed = " + seed + ";  buf Idx = " + i + "\n" +
                      errorText);
                break;
            }
        }

        loopCount++;
    }
}
/**
 * Runs the monkey test on a character break iterator for Locale.US, checked
 * against RBBICharMonkey.  Uses 10000 iterations when params.inclusion is 9
 * or higher, otherwise 500; the random seed is always 1.
 */
public void TestCharMonkey() {
    final int seed = 1;
    final int iterations = (params.inclusion >= 9) ? 10000 : 500;

    RBBICharMonkey reference = new RBBICharMonkey();
    BreakIterator charBI = BreakIterator.getCharacterInstance(Locale.US);
    RunMonkey(charBI, reference, "char", seed, iterations);
}
/**
 * Runs the monkey test on a word break iterator for Locale.US, checked
 * against RBBIWordMonkey.  Uses 10000 iterations when params.inclusion is 9
 * or higher, otherwise 500; the random seed is always 1.
 */
public void TestWordMonkey() {
    final int seed = 1;
    final int iterations = (params.inclusion >= 9) ? 10000 : 500;

    logln("Word Break Monkey Test");
    RBBIWordMonkey reference = new RBBIWordMonkey();
    BreakIterator wordBI = BreakIterator.getWordInstance(Locale.US);
    RunMonkey(wordBI, reference, "word", seed, iterations);
}
/**
 * Runs the monkey test on a line break iterator for Locale.US, checked
 * against RBBILineMonkey.  The random seed is always 1.
 *
 * Iteration count:
 *    50     when params is null
 *    10000  when params.inclusion is 9 or higher
 *    500    otherwise
 */
public void TestLineMonkey() {
    int loopCount = 500;
    int seed = 1;

    // The null test has to come before any read of params.inclusion; otherwise the
    // reduced iteration count for a missing params object can never take effect.
    if (params == null) {
        loopCount = 50;
    } else if (params.inclusion >= 9) {
        loopCount = 10000;
    }

    // NOTE(review): whether logln() itself tolerates a null params is not visible
    // from this method - confirm against TestFmwk.
    logln("Line Break Monkey Test");
    RBBILineMonkey m = new RBBILineMonkey();
    BreakIterator bi = BreakIterator.getLineInstance(Locale.US);
    RunMonkey(bi, m, "line", seed, loopCount);
}
}