blob: 1931a58b9ea1b8a3acd7cf0a48c6bb70c36d21d2 [file] [log] [blame]
/*
*******************************************************************************
* Copyright (C) 2003-2011 International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.dev.test.rbbi;
// Monkey testing of RuleBasedBreakIterator
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
/**
* Monkey tests for RBBI. These tests have independent implementations of
* the Unicode TR boundary rules, and compare results between these and ICU's
* implementation, using random data.
*
* Tests cover Grapheme Cluster (char), Word, Line and Sentence breaks
*
* Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp
*
*/
public class RBBITestMonkey extends TestFmwk {
/**
 * Command-line entry point. Creates a fresh RBBITestMonkey and passes the
 * command-line arguments straight through to its run() method.
 *
 * @param args command-line arguments, forwarded unchanged
 */
public static void main(String[] args) {
    RBBITestMonkey monkey = new RBBITestMonkey();
    monkey.run(args);
}
//
// class RBBIMonkeyKind
//
// Monkey Test for Break Iteration
// Abstract interface class. Concrete derived classes independently
// implement the break rules for different iterator types.
//
// The Monkey Test itself doesn't know which type of break iterator it is
// testing, but works purely in terms of the interface defined here.
//
abstract static class RBBIMonkeyKind {
    /**
     * A character property, one of the constants defined in class UProperty.
     * The value of this property is displayed for the characters near any
     * test failure.
     */
    int fCharProperty;

    /**
     * Returns the character classes used for this type of iterator.
     *
     * @return a List of UnicodeSets
     */
    abstract List charClasses();

    /**
     * Sets the test text on which subsequent calls to next() will operate.
     *
     * @param text the text to be scanned for boundaries
     */
    abstract void setText(StringBuffer text);

    /**
     * Finds the next break position, starting from the specified position.
     *
     * @param i the position to start from
     * @return the next break position, or -1 after reaching the end of the string
     */
    abstract int next(int i);
}
/**
* Monkey test subclass for testing Character (Grapheme Cluster) boundaries.
* Note: As of Unicode 6.1, fPrependSet is empty, so don't add it to fSets
*/
static class RBBICharMonkey extends RBBIMonkeyKind {
    List         fSets;        // The character classes returned by charClasses()
    UnicodeSet   fCRLFSet;
    UnicodeSet   fControlSet;
    UnicodeSet   fExtendSet;
    UnicodeSet   fPrependSet;
    UnicodeSet   fSpacingSet;
    UnicodeSet   fLSet;
    UnicodeSet   fVSet;
    UnicodeSet   fTSet;
    UnicodeSet   fLVSet;
    UnicodeSet   fLVTSet;
    UnicodeSet   fHangulSet;   // Union of the L, V, T, LV and LVT sets
    UnicodeSet   fAnySet;
    StringBuffer fText;        // Text being scanned; installed by setText()

    /**
     * Builds the sets for each Grapheme_Cluster_Break class and the list of
     * character classes returned by charClasses().
     */
    RBBICharMonkey() {
        fText         = null;
        fCharProperty = UProperty.GRAPHEME_CLUSTER_BREAK;

        fCRLFSet    = new UnicodeSet("[\\r\\n]");
        fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]");
        fExtendSet  = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]");
        fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]");
        fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]");
        fLSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = L}]");
        fVSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = V}]");
        fTSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = T}]");
        fLVSet      = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LV}]");
        fLVTSet     = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LVT}]");

        // All of the Hangul syllable classes are lumped into a single class.
        fHangulSet = new UnicodeSet();
        UnicodeSet[] hangulParts = new UnicodeSet[] {fLSet, fVSet, fTSet, fLVSet, fLVTSet};
        for (int i = 0; i < hangulParts.length; i++) {
            fHangulSet.addAll(hangulParts[i]);
        }

        fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]");

        fSets = new ArrayList();
        fSets.add(fCRLFSet);
        fSets.add(fControlSet);
        fSets.add(fExtendSet);
        // The Prepend class is only listed when it actually contains characters.
        if (!fPrependSet.isEmpty()) {
            fSets.add(fPrependSet);
        }
        fSets.add(fSpacingSet);
        fSets.add(fHangulSet);
        fSets.add(fAnySet);
    }

    /** Installs the text that later calls to next() will scan. */
    void setText(StringBuffer s) {
        fText = s;
    }

    /** Returns the list of UnicodeSets built by the constructor. */
    List charClasses() {
        return fSets;
    }

    /**
     * Finds the first grapheme cluster boundary following prevPos.
     *
     * @param prevPos index of the previous boundary
     * @return index of the next boundary, or -1 if prevPos is already at or
     *         beyond the end of the text
     */
    int next(int prevPos) {
        // Previous break was at the end of the string: nothing more to find.
        if (prevPos >= fText.length()) {
            return -1;
        }

        // The candidate break position is always just before rightPos.
        // leftPos/leftCp describe the code point before the candidate,
        // aheadPos/aheadCp the code point after rightPos.
        int leftPos  = prevPos;
        int rightPos = prevPos;
        int aheadPos = prevPos;
        int leftCp   = 0;
        int rightCp  = 0;
        int aheadCp  = UTF16.charAt(fText, prevPos);

        // One pass per code point position in the input text.
        while (true) {
            // Slide the whole window forward by one code point.
            leftPos  = rightPos;
            leftCp   = rightCp;
            rightPos = aheadPos;
            rightCp  = aheadCp;
            aheadPos = moveIndex32(fText, aheadPos, 1);
            if (aheadPos >= fText.length()) {
                aheadCp = -1;
            } else {
                aheadCp = UTF16.charAt(fText, aheadPos);
            }

            if (leftPos == rightPos) {
                // Window not yet filled. (Does not work for zero length
                // strings, which is of no concern here.)
                continue;
            }
            if (rightPos == fText.length()) {
                // End of string is always a boundary.
                break;
            }

            // Rule GB3:  CR x LF, with the LF immediately following the CR.
            if (leftCp == 0x0D && rightCp == 0x0A && leftPos == (rightPos - 1)) {
                continue;
            }

            // Rule GB4:  ( Control | CR | LF ) <break>
            // Rule GB5:  <break> ( Control | CR | LF )
            if (isControlOrNewline(leftCp) || isControlOrNewline(rightCp)) {
                break;
            }

            // Rules GB6 through GB9b: no boundary here.
            if (joinsWithPrevious(leftCp, rightCp)) {
                continue;
            }

            // Rule GB10:  Any <break> Any
            break;
        }
        return rightPos;
    }

    /** True if cp is in the Control class or is a CR or LF. */
    private boolean isControlOrNewline(int cp) {
        return fControlSet.contains(cp) || cp == 0x0D || cp == 0x0A;
    }

    /**
     * True if one of the "no break" rules GB6 through GB9b applies between
     * the code points c1 (before the candidate position) and c2 (after it).
     */
    private boolean joinsWithPrevious(int c1, int c2) {
        // Rule GB6:  L x ( L | V | LV | LVT )
        if (fLSet.contains(c1)) {
            if (fLSet.contains(c2) || fVSet.contains(c2)
                    || fLVSet.contains(c2) || fLVTSet.contains(c2)) {
                return true;
            }
        }
        // Rule GB7:  ( LV | V ) x ( V | T )
        if (fLVSet.contains(c1) || fVSet.contains(c1)) {
            if (fVSet.contains(c2) || fTSet.contains(c2)) {
                return true;
            }
        }
        // Rule GB8:  ( LVT | T ) x T
        if ((fLVTSet.contains(c1) || fTSet.contains(c1)) && fTSet.contains(c2)) {
            return true;
        }
        // Rule GB9:  x Extend
        if (fExtendSet.contains(c2)) {
            return true;
        }
        // Rule GB9a:  x SpacingMark
        if (fSpacingSet.contains(c2)) {
            return true;
        }
        // Rule GB9b:  Prepend x
        return fPrependSet.contains(c1);
    }
}
/**
*
* Word Monkey Test Class
*
*
*
*/
static class RBBIWordMonkey extends RBBIMonkeyKind {
    List         fSets;     // The character classes returned by charClasses()
    StringBuffer fText;     // Text being scanned; installed by setText()
    UnicodeSet   fCRSet;
    UnicodeSet   fLFSet;
    UnicodeSet   fNewlineSet;
    UnicodeSet   fKatakanaSet;
    UnicodeSet   fALetterSet;
    UnicodeSet   fMidNumLetSet;
    UnicodeSet   fMidLetterSet;
    UnicodeSet   fMidNumSet;
    UnicodeSet   fNumericSet;
    UnicodeSet   fFormatSet;
    UnicodeSet   fExtendSet;
    UnicodeSet   fExtendNumLetSet;
    UnicodeSet   fOtherSet;  // Everything not in one of the named classes above

    /**
     * Builds the sets for each Word_Break class, the "Other" set of all
     * remaining characters, and the list of character classes returned by
     * charClasses().
     */
    RBBIWordMonkey() {
        fCharProperty    = UProperty.WORD_BREAK;
        fCRSet           = new UnicodeSet("[\\p{Word_Break = CR}]");
        fLFSet           = new UnicodeSet("[\\p{Word_Break = LF}]");
        fNewlineSet      = new UnicodeSet("[\\p{Word_Break = Newline}]");
        fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}]");
        fKatakanaSet     = new UnicodeSet("[\\p{Word_Break = Katakana}]");
        fMidNumLetSet    = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
        fMidLetterSet    = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
        fMidNumSet       = new UnicodeSet("[\\p{Word_Break = MidNum}]");
        fNumericSet      = new UnicodeSet("[\\p{Word_Break = Numeric}]");
        fFormatSet       = new UnicodeSet("[\\p{Word_Break = Format}]");
        fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]");
        fExtendSet       = new UnicodeSet("[\\p{Word_Break = Extend}]");

        // "Other" starts out as every code point and has each named class
        // removed from it, so that the classes do not overlap.
        fOtherSet = new UnicodeSet();
        fOtherSet.complement();
        fOtherSet.removeAll(fCRSet);
        fOtherSet.removeAll(fLFSet);
        fOtherSet.removeAll(fNewlineSet);
        fOtherSet.removeAll(fALetterSet);
        fOtherSet.removeAll(fKatakanaSet);
        fOtherSet.removeAll(fMidLetterSet);
        // MidNumLet is a named class too; without this its characters would
        // belong both to MidNumLet and to Other.
        fOtherSet.removeAll(fMidNumLetSet);
        fOtherSet.removeAll(fMidNumSet);
        fOtherSet.removeAll(fNumericSet);
        fOtherSet.removeAll(fFormatSet);
        fOtherSet.removeAll(fExtendSet);
        fOtherSet.removeAll(fExtendNumLetSet);
        // Inhibit dictionary characters from being tested at all.
        fOtherSet.removeAll(new UnicodeSet("[\\p{LineBreak = Complex_Context}]"));

        fSets = new ArrayList();
        fSets.add(fCRSet);
        fSets.add(fLFSet);
        fSets.add(fNewlineSet);
        fSets.add(fALetterSet);
        fSets.add(fKatakanaSet);
        fSets.add(fMidLetterSet);
        fSets.add(fMidNumLetSet);
        fSets.add(fMidNumSet);
        fSets.add(fNumericSet);
        fSets.add(fFormatSet);
        fSets.add(fExtendSet);
        fSets.add(fExtendNumLetSet);
        fSets.add(fOtherSet);
    }

    /** Returns the list of UnicodeSets built by the constructor. */
    List charClasses() {
        return fSets;
    }

    /** Installs the text that later calls to next() will scan. */
    void setText(StringBuffer s) {
        fText = s;
    }

    /**
     * Finds the first word boundary following prevPos.
     *
     * @param prevPos index of the previous boundary
     * @return index of the next boundary, or -1 if prevPos is already at or
     *         beyond the end of the text
     */
    int next(int prevPos) {
        int p1, p2, p3;         // Indices of the significant code points around the
                                //   break position being tested.  The candidate break
                                //   location is before p2.
        int breakPos = -1;
        int c0, c1, c2, c3;     // The code points at p1, p2 & p3, plus c0, the
                                //   significant code point preceding c1.

        // Previous break at end of string.  return DONE.
        if (prevPos >= fText.length()) {
            return -1;
        }
        p1 = p2 = p3 = prevPos;
        c3 = UTF16.charAt(fText, prevPos);
        c0 = c1 = c2 = 0;

        // Loop runs once per "significant" character position in the input text.
        for (;;) {
            // Move all of the positions forward in the input string.
            c0 = c1;
            p1 = p2;  c1 = c2;
            p2 = p3;  c2 = c3;

            // Advance p3 by  X(Extend | Format)*   Rule 4
            // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
            do {
                p3 = moveIndex32(fText, p3, 1);
                c3 = -1;
                if (p3 >= fText.length()) {
                    break;
                }
                c3 = UTF16.charAt(fText, p3);
                if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
                    break;
                }
            }
            while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3));

            if (p1 == p2) {
                // Still warming up the loop.  (won't work with zero length strings, but we don't care)
                continue;
            }
            if (p2 == fText.length()) {
                // Reached end of string.  Always a break position.
                break;
            }

            // Rule (3)   CR x LF
            //     No Extend or Format characters may appear between the CR and LF.
            //     No explicit adjacency check is made here: the advance loop above
            //     stops immediately when c2 is a CR, LF or Newline, so nothing is
            //     skipped after a CR.
            if (c1 == 0x0D && c2 == 0x0A) {
                continue;
            }

            // Rule (3a)  Break before and after newlines (including CR and LF)
            if (fCRSet.contains(c1) || fLFSet.contains(c1) || fNewlineSet.contains(c1)) {
                break;
            }
            if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
                break;
            }

            // Rule (5).   ALetter x ALetter
            if (fALetterSet.contains(c1) &&
                    fALetterSet.contains(c2)) {
                continue;
            }

            // Rule (6)  ALetter  x  (MidLetter | MidNumLet) ALetter
            //     c3 may be -1 at end of text, hence setContains().
            if (fALetterSet.contains(c1) &&
                    (fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2)) &&
                    setContains(fALetterSet, c3)) {
                continue;
            }

            // Rule (7)  ALetter (MidLetter | MidNumLet)  x  ALetter
            if (fALetterSet.contains(c0) &&
                    (fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1)) &&
                    fALetterSet.contains(c2)) {
                continue;
            }

            // Rule (8)    Numeric x Numeric
            if (fNumericSet.contains(c1) &&
                    fNumericSet.contains(c2)) {
                continue;
            }

            // Rule (9)    ALetter x Numeric
            if (fALetterSet.contains(c1) &&
                    fNumericSet.contains(c2)) {
                continue;
            }

            // Rule (10)    Numeric x ALetter
            if (fNumericSet.contains(c1) &&
                    fALetterSet.contains(c2)) {
                continue;
            }

            // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
            if (fNumericSet.contains(c0) &&
                    (fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1)) &&
                    fNumericSet.contains(c2)) {
                continue;
            }

            // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
            if (fNumericSet.contains(c1) &&
                    (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2)) &&
                    setContains(fNumericSet, c3)) {
                continue;
            }

            // Rule (13)  Katakana x Katakana
            if (fKatakanaSet.contains(c1) &&
                    fKatakanaSet.contains(c2)) {
                continue;
            }

            // Rule 13a    (ALetter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
            if ((fALetterSet.contains(c1) || fNumericSet.contains(c1) ||
                    fKatakanaSet.contains(c1) || fExtendNumLetSet.contains(c1)) &&
                    fExtendNumLetSet.contains(c2)) {
                continue;
            }

            // Rule 13b   ExtendNumLet x (ALetter | Numeric | Katakana | ExtendNumLet)
            if (fExtendNumLetSet.contains(c1) &&
                    (fALetterSet.contains(c2) || fNumericSet.contains(c2) ||
                    fKatakanaSet.contains(c2) || fExtendNumLetSet.contains(c2))) {
                continue;
            }

            // Rule 14.  Break found here.
            break;
        }
        breakPos = p2;
        return breakPos;
    }
}
/**
 * Line Monkey Test Class.
 *
 * Independent implementation of the line break rules (the "LB" rules named
 * in the comments of next()), built on sets for the Line_Break property values.
 */
static class RBBILineMonkey extends RBBIMonkeyKind {
    List fSets;          // The character classes returned by charClasses()
    UnicodeSet fBK;
    UnicodeSet fCR;
    UnicodeSet fLF;
    UnicodeSet fCM;
    UnicodeSet fNL;
    UnicodeSet fSG;
    UnicodeSet fWJ;
    UnicodeSet fZW;
    UnicodeSet fGL;
    UnicodeSet fCB;
    UnicodeSet fSP;
    UnicodeSet fB2;
    UnicodeSet fBA;
    UnicodeSet fBB;
    UnicodeSet fHY;
    UnicodeSet fCL;
    UnicodeSet fCP;
    UnicodeSet fEX;
    UnicodeSet fIN;
    UnicodeSet fNS;
    UnicodeSet fOP;
    UnicodeSet fQU;
    UnicodeSet fIS;
    UnicodeSet fNU;
    UnicodeSet fPO;
    UnicodeSet fPR;
    UnicodeSet fSY;
    UnicodeSet fAI;
    UnicodeSet fAL;
    UnicodeSet fID;
    UnicodeSet fSA;
    UnicodeSet fJL;
    UnicodeSet fJV;
    UnicodeSet fJT;
    UnicodeSet fH2;
    UnicodeSet fH3;
    UnicodeSet fXX;
    StringBuffer fText;  // Text being scanned; installed by setText()
    int fOrigPositions;  // Not referenced anywhere within this class.

    /**
     * Builds the sets for each Line_Break class and the list of character
     * classes returned by charClasses().
     */
    RBBILineMonkey()
    {
        fCharProperty = UProperty.LINE_BREAK;
        fSets = new ArrayList();

        fBK = new UnicodeSet("[\\p{Line_Break=BK}]");
        fCR = new UnicodeSet("[\\p{Line_break=CR}]");
        fLF = new UnicodeSet("[\\p{Line_break=LF}]");
        fCM = new UnicodeSet("[\\p{Line_break=CM}]");
        fNL = new UnicodeSet("[\\p{Line_break=NL}]");
        fWJ = new UnicodeSet("[\\p{Line_break=WJ}]");
        fZW = new UnicodeSet("[\\p{Line_break=ZW}]");
        fGL = new UnicodeSet("[\\p{Line_break=GL}]");
        fCB = new UnicodeSet("[\\p{Line_break=CB}]");
        fSP = new UnicodeSet("[\\p{Line_break=SP}]");
        fB2 = new UnicodeSet("[\\p{Line_break=B2}]");
        fBA = new UnicodeSet("[\\p{Line_break=BA}]");
        fBB = new UnicodeSet("[\\p{Line_break=BB}]");
        fHY = new UnicodeSet("[\\p{Line_break=HY}]");
        fCL = new UnicodeSet("[\\p{Line_break=CL}]");
        fCP = new UnicodeSet("[\\p{Line_break=CP}]");
        fEX = new UnicodeSet("[\\p{Line_break=EX}]");
        fIN = new UnicodeSet("[\\p{Line_break=IN}]");
        fNS = new UnicodeSet("[\\p{Line_break=NS}]");
        fOP = new UnicodeSet("[\\p{Line_break=OP}]");
        fQU = new UnicodeSet("[\\p{Line_break=QU}]");
        fIS = new UnicodeSet("[\\p{Line_break=IS}]");
        fNU = new UnicodeSet("[\\p{Line_break=NU}]");
        fPO = new UnicodeSet("[\\p{Line_break=PO}]");
        fPR = new UnicodeSet("[\\p{Line_break=PR}]");
        fSY = new UnicodeSet("[\\p{Line_break=SY}]");
        fAI = new UnicodeSet("[\\p{Line_break=AI}]");
        fAL = new UnicodeSet("[\\p{Line_break=AL}]");
        fID = new UnicodeSet("[\\p{Line_break=ID}]");
        fSA = new UnicodeSet("[\\p{Line_break=SA}]");
        fJL = new UnicodeSet("[\\p{Line_break=JL}]");
        fJV = new UnicodeSet("[\\p{Line_break=JV}]");
        fJT = new UnicodeSet("[\\p{Line_break=JT}]");
        fH2 = new UnicodeSet("[\\p{Line_break=H2}]");
        fH3 = new UnicodeSet("[\\p{Line_break=H3}]");
        fSG = new UnicodeSet("[\\ud800-\\udfff]");
        fXX = new UnicodeSet("[\\p{Line_break=XX}]");

        fAL.addAll(fXX);     // Default behavior for XX is identical to AL
        fAL.addAll(fAI);     // Default behavior for AI is identical to AL
        fAL.addAll(fSA);     // Default behavior for SA is XX, which defaults to AL
        fAL.addAll(fSG);     // Default behavior for SG (unpaired surrogates) is AL

        // Each class is listed exactly once.
        fSets.add(fBK);
        fSets.add(fCR);
        fSets.add(fLF);
        fSets.add(fCM);
        fSets.add(fNL);
        fSets.add(fWJ);
        fSets.add(fZW);
        fSets.add(fGL);
        fSets.add(fCB);
        fSets.add(fSP);
        fSets.add(fB2);
        fSets.add(fBA);
        fSets.add(fBB);
        fSets.add(fHY);
        fSets.add(fH2);
        fSets.add(fH3);
        fSets.add(fCL);
        fSets.add(fCP);
        fSets.add(fEX);
        fSets.add(fIN);
        fSets.add(fJL);
        fSets.add(fJT);
        fSets.add(fJV);
        fSets.add(fNS);
        fSets.add(fOP);
        fSets.add(fQU);
        fSets.add(fIS);
        fSets.add(fNU);
        fSets.add(fPO);
        fSets.add(fPR);
        fSets.add(fSY);
        fSets.add(fAI);
        fSets.add(fAL);
        fSets.add(fID);
        fSets.add(fSA);
        fSets.add(fSG);
    }

    /** Installs the text that later calls to next() will scan. */
    void setText(StringBuffer s) {
        fText = s;
    }

    /**
     * Finds the first line break opportunity following startPos.
     *
     * @param startPos index of the previous boundary
     * @return index of the next boundary, or -1 if startPos is already at or
     *         beyond the end of the text
     */
    int next(int startPos) {
        int pos;        // Index of the char following a potential break position
        int thisChar;   // Character at above position "pos"
        int prevPos;    // Index of the char preceding a potential break position
        int prevChar;   // Character at above position.  Note that prevChar
                        //   and thisChar may not be adjacent because combining
                        //   characters between them will be ignored.
        int nextPos;    // Index of the next character following pos.
                        //   Usually skips over combining marks.
        int tPos;       // temp value.
        int matchVals[] = null;   // Number Expression Match Results

        if (startPos >= fText.length()) {
            return -1;
        }

        // Initial values for loop.  Loop will run the first time without finding breaks,
        //   while the invalid values shift out and the "this" and
        //   "prev" positions are filled in with good values.
        pos = prevPos = -1;    // Invalid value, serves as flag for initial loop iteration.
        thisChar = prevChar = 0;
        nextPos = startPos;

        // Loop runs once per position in the test text, until a break position
        //   is found.  In each iteration, we are testing for a possible break
        //   just preceding the character at index "pos".  The character preceding
        //   this char is at position "prevPos"; because of combining sequences,
        //   "prevPos" can be arbitrarily far before "pos".
        for (;;) {
            // Advance to the next position to be tested.
            prevPos = pos;
            prevChar = thisChar;
            pos = nextPos;
            nextPos = moveIndex32(fText, pos, 1);

            // Rule LB2 - Break at end of text.
            if (pos >= fText.length()) {
                break;
            }

            // Rule LB 9 - adjust for combining sequences.
            //   We do this rule out-of-order because the adjustment does
            //   not affect the way that rules LB 3 through LB 6 match,
            //   and doing it here rather than after LB 6 is substantially
            //   simpler when combining sequences do occur.

            // LB 9   Keep combining sequences together.
            //   advance over any CM class chars at "pos",
            //   result is "nextPos" for the following loop iteration.
            thisChar = UTF16.charAt(fText, pos);
            if (!(fSP.contains(thisChar) || fBK.contains(thisChar) || thisChar==0x0d ||
                    thisChar==0x0a || fNL.contains(thisChar) || fZW.contains(thisChar) )) {
                for (;;) {
                    if (nextPos == fText.length()) {
                        break;
                    }
                    int nextChar = UTF16.charAt(fText, nextPos);
                    if (!fCM.contains(nextChar)) {
                        break;
                    }
                    nextPos = moveIndex32(fText, nextPos, 1);
                }
            }

            // LB 9  Treat X CM* as if it were X
            //       No explicit action required.

            // LB 10  Treat any remaining combining mark as AL
            if (fCM.contains(thisChar)) {
                thisChar = 'A';
            }

            // If the loop is still warming up - if we haven't shifted the initial
            //   -1 positions out of prevPos yet - loop back to advance the
            //   position in the input without any further looking for breaks.
            if (prevPos == -1) {
                continue;
            }

            // LB 4  Always break after hard line breaks,
            if (fBK.contains(prevChar)) {
                break;
            }

            // LB 5  Break after CR, LF, NL, but not inside CR LF
            if (fCR.contains(prevChar) && fLF.contains(thisChar)) {
                continue;
            }
            if (fCR.contains(prevChar) ||
                    fLF.contains(prevChar) ||
                    fNL.contains(prevChar)) {
                break;
            }

            // LB 6  Don't break before hard line breaks
            if (fBK.contains(thisChar) || fCR.contains(thisChar) ||
                    fLF.contains(thisChar) || fNL.contains(thisChar) ) {
                continue;
            }

            // LB 7  Don't break before spaces or zero-width space.
            if (fSP.contains(thisChar)) {
                continue;
            }
            if (fZW.contains(thisChar)) {
                continue;
            }

            // LB 8  Break after zero width space
            if (fZW.contains(prevChar)) {
                break;
            }

            // LB 9, 10  Already done, at top of loop.
            //

            // LB 11
            //    x  WJ
            //    WJ  x
            if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) {
                continue;
            }

            // LB 12
            //    GL x
            if (fGL.contains(prevChar)) {
                continue;
            }

            // LB 12a
            //    [^SP BA HY] x GL
            if (!(fSP.contains(prevChar) ||
                    fBA.contains(prevChar) ||
                    fHY.contains(prevChar) ) && fGL.contains(thisChar)) {
                continue;
            }

            // LB 13  Don't break before closings.
            //       NU x CL, NU x CP, NU x IS and NU x SY are not matched here so that
            //       they will fall into the more general number regular expression
            //       of LB 25 below.
            if (!fNU.contains(prevChar) && fCL.contains(thisChar) ||
                    !fNU.contains(prevChar) && fCP.contains(thisChar) ||
                    fEX.contains(thisChar) ||
                    !fNU.contains(prevChar) && fIS.contains(thisChar) ||
                    !fNU.contains(prevChar) && fSY.contains(thisChar)) {
                continue;
            }

            // LB 14  Don't break after OP SP*
            //       Scan backwards, checking for this sequence.
            //       The OP char could include combining marks, so we actually check for
            //           OP CM* SP* x
            tPos = prevPos;
            if (fSP.contains(prevChar)) {
                while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
                    tPos = moveIndex32(fText, tPos, -1);
                }
            }
            while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
                tPos = moveIndex32(fText, tPos, -1);
            }
            if (fOP.contains(UTF16.charAt(fText, tPos))) {
                continue;
            }

            // LB 15  Do not break within "[
            //       QU CM* SP* x OP
            if (fOP.contains(thisChar)) {
                // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
                tPos = prevPos;
                while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
                    tPos = moveIndex32(fText, tPos, -1);
                }
                while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
                    tPos = moveIndex32(fText, tPos, -1);
                }
                if (fQU.contains(UTF16.charAt(fText, tPos))) {
                    continue;
                }
            }

            // LB 16   (CL | CP) SP* x NS
            if (fNS.contains(thisChar)) {
                tPos = prevPos;
                while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
                    tPos = moveIndex32(fText, tPos, -1);
                }
                while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
                    tPos = moveIndex32(fText, tPos, -1);
                }
                if (fCL.contains(UTF16.charAt(fText, tPos)) || fCP.contains(UTF16.charAt(fText, tPos))) {
                    continue;
                }
            }

            // LB 17        B2 SP* x B2
            if (fB2.contains(thisChar)) {
                tPos = prevPos;
                while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
                    tPos = moveIndex32(fText, tPos, -1);
                }
                while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
                    tPos = moveIndex32(fText, tPos, -1);
                }
                if (fB2.contains(UTF16.charAt(fText, tPos))) {
                    continue;
                }
            }

            // LB 18    break after space
            if (fSP.contains(prevChar)) {
                break;
            }

            // LB 19
            //    x   QU
            //    QU  x
            if (fQU.contains(thisChar) || fQU.contains(prevChar)) {
                continue;
            }

            // LB 20  Break around a CB
            if (fCB.contains(thisChar) || fCB.contains(prevChar)) {
                break;
            }

            // LB 21
            if (fBA.contains(thisChar) ||
                    fHY.contains(thisChar) ||
                    fNS.contains(thisChar) ||
                    fBB.contains(prevChar) ) {
                continue;
            }

            // LB 22
            if (fAL.contains(prevChar) && fIN.contains(thisChar) ||
                    fID.contains(prevChar) && fIN.contains(thisChar) ||
                    fIN.contains(prevChar) && fIN.contains(thisChar) ||
                    fNU.contains(prevChar) && fIN.contains(thisChar) ) {
                continue;
            }

            // LB 23    ID x PO    (Note:  Leading CM behaves like ID)
            //          AL x NU
            //          NU x AL
            if (fID.contains(prevChar) && fPO.contains(thisChar) ||
                    fAL.contains(prevChar) && fNU.contains(thisChar) ||
                    fNU.contains(prevChar) && fAL.contains(thisChar) ) {
                continue;
            }

            // LB 24  Do not break between prefix and letters or ideographs.
            //         PR x ID
            //         PR x AL
            //         PO x AL
            if (fPR.contains(prevChar) && fID.contains(thisChar) ||
                    fPR.contains(prevChar) && fAL.contains(thisChar) ||
                    fPO.contains(prevChar) && fAL.contains(thisChar)) {
                continue;
            }

            // LB 25    Numbers
            matchVals = LBNumberCheck(fText, prevPos, matchVals);
            if (matchVals[0] != -1) {
                // Matched a number.  But could have been just a single digit, which would
                //    not represent a "no break here" between prevChar and thisChar
                int numEndIdx = matchVals[1];  // idx of first char following num
                if (numEndIdx > pos) {
                    // Number match includes at least the two chars being checked
                    if (numEndIdx > nextPos) {
                        // Number match includes additional chars.  Update pos and nextPos
                        //   so that next loop iteration will continue at the end of the number,
                        //   checking for breaks between last char in number & whatever follows.
                        nextPos = numEndIdx;
                        pos = numEndIdx;
                        do {
                            pos = moveIndex32(fText, pos, -1);
                            thisChar = UTF16.charAt(fText, pos);
                        }
                        while (fCM.contains(thisChar));
                    }
                    continue;
                }
            }

            // LB 26  Do not break Korean Syllables
            if (fJL.contains(prevChar) && (fJL.contains(thisChar) ||
                    fJV.contains(thisChar) ||
                    fH2.contains(thisChar) ||
                    fH3.contains(thisChar))) {
                continue;
            }
            if ((fJV.contains(prevChar) || fH2.contains(prevChar)) &&
                    (fJV.contains(thisChar) || fJT.contains(thisChar))) {
                continue;
            }
            if ((fJT.contains(prevChar) || fH3.contains(prevChar)) &&
                    fJT.contains(thisChar)) {
                continue;
            }

            // LB 27 Treat a Korean Syllable Block the same as ID
            if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
                    fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
                    fIN.contains(thisChar)) {
                continue;
            }
            if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
                    fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
                    fPO.contains(thisChar)) {
                continue;
            }
            if (fPR.contains(prevChar) && (fJL.contains(thisChar) || fJV.contains(thisChar) ||
                    fJT.contains(thisChar) || fH2.contains(thisChar) || fH3.contains(thisChar))) {
                continue;
            }

            // LB 28 Do not break between alphabetics
            if (fAL.contains(prevChar) && fAL.contains(thisChar)) {
                continue;
            }

            // LB 29  Do not break between numeric punctuation and alphabetics
            if (fIS.contains(prevChar) && fAL.contains(thisChar)) {
                continue;
            }

            // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
            //          (AL | NU) x OP
            //          CP x (AL | NU)
            if ((fAL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) {
                continue;
            }
            if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fNU.contains(thisChar))) {
                continue;
            }

            // LB 31    Break everywhere else
            break;
        }
        return pos;
    }

    // Match the following regular expression in the input text.
    //    ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*)* ((CL | CP) CM*)? ((PR | PO) CM*)?
    //      0    0   1        4    4   4    7  7     7    7    7   7       9    9   9       11   11  11   (match states)
    //  retVals array  [0]  index of the start of the match, or -1 if no match
    //                 [1]  index of first char following the match.
    //  Can not use Java regex because need supplementary character support,
    //     and because Unicode char properties version must be the same as in
    //     the version of ICU being tested.
    //
    //  s         the text to match against
    //  startIdx  index at which the match must begin
    //  retVals   array to receive the results; a new two element array is
    //            allocated when null is passed.  The array used is returned.
    private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) {
        if (retVals == null) {
            retVals = new int[2];
        }
        retVals[0] = -1;  // Indicates no match.
        int matchState = 0;
        int idx = startIdx;

        matchLoop: for (; idx<s.length(); idx = moveIndex32(s, idx, 1)){
            int c = UTF16.charAt(s, idx);
            int cLBType = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);
            switch (matchState) {
            case 0:
                // Start of the expression: optional prefix, optional opener, or first digit.
                if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC ||
                        cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
                    matchState = 1;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
                    matchState = 4;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.HYPHEN) {
                    matchState = 4;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.NUMERIC) {
                    matchState = 7;
                    break;
                }
                break matchLoop;   /* No Match  */

            case 1:
                // Seen (PR | PO) CM*
                if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
                    matchState = 1;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
                    matchState = 4;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.HYPHEN) {
                    matchState = 4;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.NUMERIC) {
                    matchState = 7;
                    break;
                }
                break matchLoop;   /* No Match  */

            case 4:
                // Seen (OP | HY) CM*
                if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
                    matchState = 4;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.NUMERIC) {
                    matchState = 7;
                    break;
                }
                break matchLoop;   /* No Match  */

            case 7:
                // Seen at least one NU; any stop from here on is a completed match.
                if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
                    matchState = 7;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.NUMERIC) {
                    matchState = 7;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {
                    matchState = 7;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) {
                    matchState = 7;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) {
                    matchState = 9;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.CLOSE_PARENTHESIS) {
                    matchState = 9;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
                    matchState = 11;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
                    matchState = 11;
                    break;
                }
                break matchLoop;    // Match Complete.

            case 9:
                // Seen (CL | CP) CM*
                if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
                    matchState = 9;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
                    matchState = 11;
                    break;
                }
                if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
                    matchState = 11;
                    break;
                }
                break matchLoop;    // Match Complete.

            case 11:
                // Seen trailing (PR | PO) CM*
                if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
                    matchState = 11;
                    break;
                }
                break matchLoop;    // Match Complete.
            }
        }

        // States 7, 9 and 11 are the ones reached after at least one NU was seen.
        if (matchState > 4) {
            retVals[0] = startIdx;
            retVals[1] = idx;
        }
        return retVals;
    }

    /** Returns the list of UnicodeSets built by the constructor. */
    List charClasses() {
        return fSets;
    }
}
/**
*
* Sentence Monkey Test Class
*
*
*
*/
static class RBBISentenceMonkey extends RBBIMonkeyKind {
    List         fSets;     // The character classes returned by charClasses()
    StringBuffer fText;     // Text being scanned; installed by setText()
    UnicodeSet   fSepSet;
    UnicodeSet   fFormatSet;
    UnicodeSet   fSpSet;
    UnicodeSet   fLowerSet;
    UnicodeSet   fUpperSet;
    UnicodeSet   fOLetterSet;
    UnicodeSet   fNumericSet;
    UnicodeSet   fATermSet;
    UnicodeSet   fSContinueSet;
    UnicodeSet   fSTermSet;
    UnicodeSet   fCloseSet;
    UnicodeSet   fOtherSet;  // Everything not in one of the other sets
    UnicodeSet   fExtendSet;

    /**
     * Builds the sets for each Sentence_Break class, the "Other" set of all
     * remaining characters, and the list of character classes returned by
     * charClasses().
     */
    RBBISentenceMonkey() {
        fCharProperty = UProperty.SENTENCE_BREAK;
        fSets = new ArrayList();

        // Separator Set.  Beginning with Unicode 5.1, CR and LF were removed from
        // the separator set and made into character classes of their own.  For the
        // monkey implementation they remain in Sep, since Sep always appears
        // together with CR and LF in the rules.
        fSepSet       = new UnicodeSet("[\\p{Sentence_Break = Sep} \\u000a \\u000d]");
        fFormatSet    = new UnicodeSet("[\\p{Sentence_Break = Format}]");
        fSpSet        = new UnicodeSet("[\\p{Sentence_Break = Sp}]");
        fLowerSet     = new UnicodeSet("[\\p{Sentence_Break = Lower}]");
        fUpperSet     = new UnicodeSet("[\\p{Sentence_Break = Upper}]");
        fOLetterSet   = new UnicodeSet("[\\p{Sentence_Break = OLetter}]");
        fNumericSet   = new UnicodeSet("[\\p{Sentence_Break = Numeric}]");
        fATermSet     = new UnicodeSet("[\\p{Sentence_Break = ATerm}]");
        fSContinueSet = new UnicodeSet("[\\p{Sentence_Break = SContinue}]");
        fSTermSet     = new UnicodeSet("[\\p{Sentence_Break = STerm}]");
        fCloseSet     = new UnicodeSet("[\\p{Sentence_Break = Close}]");
        fExtendSet    = new UnicodeSet("[\\p{Sentence_Break = Extend}]");

        // "Other" is every code point that is in none of the sets above.
        fOtherSet = new UnicodeSet();
        fOtherSet.complement();
        fOtherSet.removeAll(fSepSet);
        fOtherSet.removeAll(fFormatSet);
        fOtherSet.removeAll(fSpSet);
        fOtherSet.removeAll(fLowerSet);
        fOtherSet.removeAll(fUpperSet);
        fOtherSet.removeAll(fOLetterSet);
        fOtherSet.removeAll(fNumericSet);
        fOtherSet.removeAll(fATermSet);
        fOtherSet.removeAll(fSContinueSet);
        fOtherSet.removeAll(fSTermSet);
        fOtherSet.removeAll(fCloseSet);
        fOtherSet.removeAll(fExtendSet);

        fSets.add(fSepSet);
        fSets.add(fFormatSet);
        fSets.add(fSpSet);
        fSets.add(fLowerSet);
        fSets.add(fUpperSet);
        fSets.add(fOLetterSet);
        fSets.add(fNumericSet);
        fSets.add(fATermSet);
        fSets.add(fSContinueSet);
        fSets.add(fSTermSet);
        fSets.add(fCloseSet);
        fSets.add(fOtherSet);
        fSets.add(fExtendSet);
    }

    /** Returns the list of UnicodeSets built by the constructor. */
    List charClasses() {
        return fSets;
    }

    /** Installs the text that later calls to next() will scan. */
    void setText(StringBuffer s) {
        fText = s;
    }

    /**
     * Finds the "significant" code point preceding index i, skipping
     * backwards over ($Extend | $Format)*.
     *
     * @return the index found, or -1 when i is at or before the start of the text
     */
    private int moveBack(int i) {
        if (i <= 0) {
            return -1;
        }
        int idx = i;
        while (true) {
            idx = moveIndex32(fText, idx, -1);
            int cp = UTF16.charAt(fText, idx);
            // Stop at the start of the text or at the first char that is neither Format nor Extend.
            if (idx <= 0 || !(fFormatSet.contains(cp) || fExtendSet.contains(cp))) {
                return idx;
            }
        }
    }

    /**
     * Moves forward from index i by one code point and then over any following
     * Format or Extend characters.
     *
     * @return the resulting index; the text length when i is already at or
     *         beyond the end of the text
     */
    int moveForward(int i) {
        if (i >= fText.length()) {
            return fText.length();
        }
        int idx = i;
        while (true) {
            idx = moveIndex32(fText, idx, 1);
            int cp = cAt(idx);
            // Stop at the end of text (cp < 0) or at a char that is neither Format nor Extend.
            if (cp < 0 || !(fFormatSet.contains(cp) || fExtendSet.contains(cp))) {
                return idx;
            }
        }
    }

    /** Returns the code point at pos, or -1 when pos is outside of the text. */
    int cAt(int pos) {
        boolean inRange = pos >= 0 && pos < fText.length();
        if (!inRange) {
            return -1;
        }
        return UTF16.charAt(fText, pos);
    }

    /**
     * Starting from pos, steps backwards with moveBack() for as long as the
     * position is greater than zero and the code point there is in the set.
     */
    private int backOver(UnicodeSet set, int pos) {
        int p = pos;
        while (p > 0 && set.contains(cAt(p))) {
            p = moveBack(p);
        }
        return p;
    }

    /** True if c is in the STerm set or the ATerm set. */
    private boolean isTerminator(int c) {
        return fSTermSet.contains(c) || fATermSet.contains(c);
    }

    /**
     * Finds the first sentence boundary following prevPos.
     *
     * @param prevPos index of the previous boundary
     * @return index of the next boundary, or -1 if prevPos is already at or
     *         beyond the end of the text
     */
    int next(int prevPos) {
        // Previous break was at the end of the string: nothing more to find.
        if (prevPos >= fText.length()) {
            return -1;
        }

        // The candidate break position is always just before rightPos.
        // leftPos/leftCp describe the significant code point before the
        // candidate, farLeftCp the one before that, aheadPos/aheadCp the one
        // following rightPos.
        int leftPos   = prevPos;
        int rightPos  = prevPos;
        int aheadPos  = prevPos;
        int farLeftCp = 0;
        int leftCp    = 0;
        int rightCp   = 0;
        int aheadCp   = UTF16.charAt(fText, prevPos);
        int c;

        // One pass per "significant" character position in the input text.
        while (true) {
            // Slide the whole window forward.
            farLeftCp = leftCp;
            leftPos   = rightPos;
            leftCp    = rightCp;
            rightPos  = aheadPos;
            rightCp   = aheadCp;

            // Advance the look-ahead position by  X(Extend | Format)*   Rule 4
            aheadPos = moveForward(aheadPos);
            aheadCp  = cAt(aheadPos);

            // Rule (3)  CR x LF
            if (leftCp == 0x0d && rightCp == 0x0a && rightPos == (leftPos + 1)) {
                continue;
            }

            // Rule (4)  Sep <break>
            if (fSepSet.contains(leftCp)) {
                rightPos = leftPos + 1;   // Separators don't combine with Extend or Format
                break;
            }

            if (rightPos >= fText.length()) {
                // End of string is always a boundary.
                break;
            }
            if (rightPos == prevPos) {
                // Window not yet filled. (Does not work for zero length
                // strings, which is of no concern here.)
                continue;
            }

            // Rule (6)  ATerm x Numeric
            if (fATermSet.contains(leftCp) && fNumericSet.contains(rightCp)) {
                continue;
            }

            // Rule (7)  Upper ATerm x Upper
            if (fUpperSet.contains(farLeftCp) && fATermSet.contains(leftCp)
                    && fUpperSet.contains(rightCp)) {
                continue;
            }

            // Rule (8)  ATerm Close* Sp* x (not (OLetter | Upper | Lower | Sep))* Lower
            //           STerm and ATerm are also part of the negated set.
            int scan = backOver(fCloseSet, backOver(fSpSet, leftPos));
            if (fATermSet.contains(cAt(scan))) {
                // Look forward from the candidate position for the first
                // letter, separator or terminator (or the end of the text).
                scan = rightPos;
                while (true) {
                    c = cAt(scan);
                    if (c == -1 || fOLetterSet.contains(c) || fUpperSet.contains(c) ||
                            fLowerSet.contains(c) || fSepSet.contains(c) ||
                            fATermSet.contains(c) || fSTermSet.contains(c)) {
                        break;
                    }
                    scan = moveForward(scan);
                }
                if (scan < fText.length() && fLowerSet.contains(cAt(scan))) {
                    continue;
                }
            }

            // Rule (8a)  (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)
            if (fSContinueSet.contains(rightCp) || fSTermSet.contains(rightCp)
                    || fATermSet.contains(rightCp)) {
                // This backward scan may run off the start of the text, where cAt()
                // yields -1, so the tests go through setContains().
                scan = leftPos;
                while (setContains(fSpSet, cAt(scan))) {
                    scan = moveBack(scan);
                }
                while (setContains(fCloseSet, cAt(scan))) {
                    scan = moveBack(scan);
                }
                c = cAt(scan);
                if (setContains(fSTermSet, c) || setContains(fATermSet, c)) {
                    continue;
                }
            }

            // Rule (9)  (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
            c = cAt(backOver(fCloseSet, leftPos));
            if (isTerminator(c)
                    && (fCloseSet.contains(rightCp) || fSpSet.contains(rightCp)
                        || fSepSet.contains(rightCp))) {
                continue;
            }

            // Rule (10)  (STerm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
            int p10 = backOver(fCloseSet, backOver(fSpSet, leftPos));
            if (isTerminator(cAt(p10))
                    && (fSpSet.contains(rightCp) || fSepSet.contains(rightCp))) {
                continue;
            }

            // Rule (11)  (STerm | ATerm) Close* Sp* <break>
            int p11 = leftPos;
            if (p11 > 0 && fSepSet.contains(cAt(p11))) {
                p11 = moveBack(p11);
            }
            p11 = backOver(fCloseSet, backOver(fSpSet, p11));
            if (isTerminator(cAt(p11))) {
                break;
            }

            // Rule (12)  Any x Any  -- no boundary; go around again.
        }
        return rightPos;
    }
}
/**
* Move an index into a string by n code points.
* Similar to UTF16.moveCodePointOffset, but without the exceptions, which were
* complicating usage.
* @param s a Text string
* @param pos The starting code unit index into the text string
* @param amt The number of code points to move by; a negative value moves
* backward toward the start of the string.
* @return The adjusted code unit index, pinned to 0 when moving backward past
* the start and to the string's length when moving forward past the end.
*/
static int moveIndex32(StringBuffer s, int pos, int amt) {
    // Positive amt steps forward, negative amt steps backward, zero leaves pos as is.
    // A well-formed surrogate pair counts as a single step in either direction;
    // an unpaired surrogate counts as one step on its own.
    final int length = s.length();
    if (amt > 0) {
        for (int step = 0; step < amt; step++) {
            if (pos >= length) {
                return length;
            }
            char unit = s.charAt(pos);
            pos++;
            if (Character.isHighSurrogate(unit)
                    && pos < length
                    && Character.isLowSurrogate(s.charAt(pos))) {
                pos++;
            }
        }
    } else {
        for (int step = 0; step > amt; step--) {
            if (pos <= 0) {
                return 0;
            }
            pos--;
            char unit = s.charAt(pos);
            // Landed on the second half of a pair: look at the unit BEFORE it
            // (pos - 1), not at the same unit again, and step onto the first
            // half so the result is a code point boundary.
            if (Character.isLowSurrogate(unit)
                    && pos > 0
                    && Character.isHighSurrogate(s.charAt(pos - 1))) {
                pos--;
            }
        }
    }
    return pos;
}
/**
* No-exceptions form of UnicodeSet.contains(c).
* Simplifies loops that terminate with an end-of-input character value.
* @param s A unicode set
* @param c A code point value
* @return true if the set contains c.
*/
static boolean setContains(UnicodeSet s, int c) {
    // Values outside the code point range (such as a -1 end-of-input marker)
    // are never handed to the set; they are simply reported as "not contained".
    final boolean isCodePoint = c >= 0 && c <= UTF16.CODEPOINT_MAX_VALUE;
    return isCodePoint && s.contains(c);
}
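/**
* Return the index of the next code point in the input text.
* @param s the input text
* @param i the index of the preceding code point, or -1 for end of input
* @return the index just past the code point at i, or -1 if i is -1 or is
* at or beyond the end of the text
*/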
static int nextCP(StringBuffer s, int i) {
    // -1 is the end-of-input marker and is sticky; an index at or past the
    // end of the text also maps to it.
    if (i == -1 || i + 1 > s.length()) {
        return -1;
    }
    final int cp = UTF16.charAt(s, i);
    // Skip two code units only when i sits on the first unit of a supplementary character.
    final boolean twoUnits =
            cp >= UTF16.SUPPLEMENTARY_MIN_VALUE && UTF16.isLeadSurrogate(s.charAt(i));
    return twoUnits ? i + 2 : i + 1;
}
/**
* random number generator. Not using Java's built-in Randoms for two reasons:
* 1. Using this code allows obtaining the same sequences as those from the ICU4C monkey test.
* 2. We need to get and restore the seed from values occurring in the middle
* of a long sequence, to more easily reproduce failing cases.
*/
private static int m_seed = 1;

private static int m_rand() {
    // Linear congruential step; int arithmetic wraps modulo 2^32.
    final int advanced = 1103515245 * m_seed + 12345;
    m_seed = advanced;
    // The unsigned shift leaves a value in 0..65535; keep its low 15 bits (0..32767).
    return (advanced >>> 16) & 0x7FFF;
}
// Helper function for formatting error output.
// Append a string into a fixed-size field in a StringBuffer.
// Blank-pad the string if it is shorter than the field.
// Truncate the source string if it is too long.
//
private static void appendToBuf(StringBuffer dest, String src, int fieldLen) {
    final int srcLen = src.length();
    if (srcLen < fieldLen) {
        // Short source: copy it whole, then pad with blanks up to the field width.
        dest.append(src);
        for (int pad = fieldLen - srcLen; pad > 0; pad--) {
            dest.append(' ');
        }
        return;
    }
    // Source fills or overflows the field: keep only the leading fieldLen characters.
    dest.append(src.substring(0, fieldLen));
}
// Helper function for formatting error output.
// Display a code point in "\\uxxxx" or "\Uxxxxxxxx" format
private static void appendCharToBuf(StringBuffer dest, int c, int fieldLen) {
    // Values below 0x10000 use the short form (lower-case marker, four hex digits);
    // everything else uses the long form (upper-case marker, eight hex digits).
    final boolean shortForm = c < 0x10000;
    final String marker = shortForm ? "\\u" : "\\U";
    final int topShift = shortForm ? 12 : 28;
    final int escapeWidth = shortForm ? 6 : 10;

    dest.append(marker);
    for (int shift = topShift; shift >= 0; shift -= 4) {
        dest.append(Character.forDigit((c >> shift) & 0xf, 16));
    }
    // Blank-fill whatever remains of the field after the escape itself.
    appendToBuf(dest, " ", fieldLen - escapeWidth);
}
/**
* Run a RBBI monkey test. Common routine, for all break iterator types.
* Parameters:
* bi - the break iterator to use
* mk - MonkeyKind, abstraction for obtaining expected results
* name - Name of test (char, word, etc.) for use in error messages
* seed - Seed for starting random number generator (parameter from user)
* numIterations
*/
void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int numIterations) {
    // Each iteration builds a random test string from mk's character classes, asks mk
    // for the expected break positions, collects the positions reported by bi through
    // next(), previous(), isBoundary(), following() and preceding(), and reports the
    // first position at which any of them disagrees with the expected result.
    // A numIterations of -1 loops without end.
    int TESTSTRINGLEN = 500;   // Not final: the debugging block below may override it.
    StringBuffer testText = new StringBuffer();
    int numCharClasses;
    List chClasses;
    // Arrays are indexed by code unit offset; a code point may take two code units.
    int[] expected = new int[TESTSTRINGLEN*2 + 1];
    int expectedCount = 0;
    boolean[] expectedBreaks = new boolean[TESTSTRINGLEN*2 + 1];
    boolean[] forwardBreaks = new boolean[TESTSTRINGLEN*2 + 1];
    boolean[] reverseBreaks = new boolean[TESTSTRINGLEN*2 + 1];
    boolean[] isBoundaryBreaks = new boolean[TESTSTRINGLEN*2 + 1];
    boolean[] followingBreaks = new boolean[TESTSTRINGLEN*2 + 1];
    boolean[] precedingBreaks = new boolean[TESTSTRINGLEN*2 + 1];
    int i;
    int loopCount = 0;
    boolean printTestData = false;
    boolean printBreaksFromBI = false;

    m_seed = seed;
    chClasses = mk.charClasses();
    numCharClasses = chClasses.size();

    // Verify that the character classes all have at least one member.
    for (i=0; i<numCharClasses; i++) {
        UnicodeSet s = (UnicodeSet)chClasses.get(i);
        if (s == null || s.size() == 0) {
            errln("Character Class " + i + " is null or of zero size.");
            return;
        }
    }

    //--------------------------------------------------------------------------------------------
    //
    // Debugging settings. Comment out everything in the following block for normal operation
    //
    //--------------------------------------------------------------------------------------------
    // numIterations = -1;
    // RuleBasedBreakIterator_New.fTrace = true;
    // m_seed = 859056465;
    // TESTSTRINGLEN = 50;
    // printTestData = true;
    // printBreaksFromBI = true;
    // ((RuleBasedBreakIterator_New)bi).dump();
    //--------------------------------------------------------------------------------------------
    //
    // End of Debugging settings.
    //
    //--------------------------------------------------------------------------------------------

    int dotsOnLine = 0;
    while (loopCount < numIterations || numIterations == -1) {
        if (numIterations == -1 && loopCount % 10 == 0) {
            // If test is running in an infinite loop, display a periodic tic so
            // we can tell that it is making progress.
            System.out.print(".");
            if (dotsOnLine++ >= 80){
                System.out.println();
                dotsOnLine = 0;
            }
        }

        // Save current random number seed, so that we can recreate the random numbers
        // for this loop iteration in event of an error.
        seed = m_seed;

        testText.setLength(0);
        // Populate a test string with data.
        if (printTestData) {
            System.out.println("Test Data string ...");
        }
        for (i=0; i<TESTSTRINGLEN; i++) {
            int aClassNum = m_rand() % numCharClasses;
            UnicodeSet classSet = (UnicodeSet)chClasses.get(aClassNum);
            int charIdx = m_rand() % classSet.size();
            int c = classSet.charAt(charIdx);
            if (c < 0) { // TODO: deal with sets containing strings.
                errln("c < 0");
            }
            UTF16.appendCodePoint(testText, c);
            if (printTestData) {
                System.out.print(Integer.toHexString(c) + " ");
            }
        }
        if (printTestData) {
            System.out.println();
        }

        Arrays.fill(expected, 0);
        Arrays.fill(expectedBreaks, false);
        Arrays.fill(forwardBreaks, false);
        Arrays.fill(reverseBreaks, false);
        Arrays.fill(isBoundaryBreaks, false);
        Arrays.fill(followingBreaks, false);
        Arrays.fill(precedingBreaks, false);

        // Calculate the expected results for this test string.
        mk.setText(testText);
        expectedCount = 0;
        expectedBreaks[0] = true;
        expected[expectedCount ++] = 0;
        int breakPos = 0;
        int lastBreakPos = -1;
        for (;;) {
            lastBreakPos = breakPos;
            breakPos = mk.next(breakPos);
            if (breakPos == -1) {
                break;
            }
            if (breakPos > testText.length()) {
                errln("breakPos > testText.length()");
                // Do not use an out-of-range position as an array index. Nothing past
                // the end of the text is compared below, so stopping here loses nothing.
                break;
            }
            if (lastBreakPos >= breakPos) {
                errln("Next() not increasing.");
                // break;
            }
            expectedBreaks[breakPos] = true;
            expected[expectedCount ++] = breakPos;
        }

        // Find the break positions using forward iteration
        if (printBreaksFromBI) {
            System.out.println("Breaks from BI...");
        }
        bi.setText(testText.toString());
        for (i=bi.first(); i != BreakIterator.DONE; i=bi.next()) {
            if (i < 0 || i > testText.length()) {
                errln(name + " break monkey test: Out of range value returned by breakIterator::next()");
                break;
            }
            if (printBreaksFromBI) {
                System.out.print(Integer.toHexString(i) + " ");
            }
            forwardBreaks[i] = true;
        }
        if (printBreaksFromBI) {
            System.out.println();
        }

        // Find the break positions using reverse iteration
        for (i=bi.last(); i != BreakIterator.DONE; i=bi.previous()) {
            if (i < 0 || i > testText.length()) {
                errln(name + " break monkey test: Out of range value returned by breakIterator.previous()");
                break;
            }
            reverseBreaks[i] = true;
        }

        // Find the break positions using isBoundary() tests.
        for (i=0; i<=testText.length(); i++) {
            isBoundaryBreaks[i] = bi.isBoundary(i);
        }

        // Find the break positions using the following() function.
        lastBreakPos = 0;
        followingBreaks[0] = true;
        for (i=0; i<testText.length(); i++) {
            breakPos = bi.following(i);
            if (breakPos <= i ||
                    breakPos < lastBreakPos ||
                    breakPos > testText.length() ||
                    breakPos > lastBreakPos && lastBreakPos > i ) {
                errln(name + " break monkey test: " +
                        "Out of range value returned by BreakIterator::following().\n" +
                        "index=" + i + "; following returned=" + breakPos +
                        "; lastBreak=" + lastBreakPos);
                // Force a mismatch in the following() results at this index so the
                // comparison below reports it with context. Later iterations only
                // mark positions greater than i, so this entry is not overwritten.
                followingBreaks[i] = !expectedBreaks[i];
            } else {
                followingBreaks[breakPos] = true;
                lastBreakPos = breakPos;
            }
        }

        // Find the break positions using the preceding() function.
        lastBreakPos = testText.length();
        precedingBreaks[testText.length()] = true;
        for (i=testText.length(); i>0; i--) {
            breakPos = bi.preceding(i);
            if (breakPos >= i ||
                    breakPos > lastBreakPos ||
                    breakPos < 0 ||
                    breakPos < lastBreakPos && lastBreakPos < i ) {
                errln(name + " break monkey test: " +
                        "Out of range value returned by BreakIterator::preceding().\n" +
                        "index=" + i + "; preceding returned=" + breakPos +
                        "; lastBreak=" + lastBreakPos);
                precedingBreaks[i] = !expectedBreaks[i]; // Forces an error.
            } else {
                precedingBreaks[breakPos] = true;
                lastBreakPos = breakPos;
            }
        }

        // Compare the expected and actual results.
        for (i=0; i<=testText.length(); i++) {
            String errorType = null;
            if (forwardBreaks[i] != expectedBreaks[i]) {
                errorType = "next()";
            } else if (reverseBreaks[i] != forwardBreaks[i]) {
                errorType = "previous()";
            } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
                errorType = "isBoundary()";
            } else if (followingBreaks[i] != expectedBreaks[i]) {
                errorType = "following()";
            } else if (precedingBreaks[i] != expectedBreaks[i]) {
                errorType = "preceding()";
            }

            if (errorType != null) {
                // Format a range of the test text that includes the failure as
                // a data item that can be included in the rbbi test data file.

                // Start of the range is the last point where expected and actual results
                // both agreed that there was a break position.
                int startContext = i;
                int count = 0;
                for (;;) {
                    if (startContext==0) { break; }
                    startContext --;
                    if (expectedBreaks[startContext]) {
                        if (count == 2) break;
                        count ++;
                    }
                }

                // End of range is two expected breaks past the start position.
                int endContext = i + 1;
                int ci;
                for (ci=0; ci<2; ci++) { // Number of items to include in error text.
                    for (;;) {
                        if (endContext >= testText.length()) {break;}
                        if (expectedBreaks[endContext-1]) {
                            if (count == 0) break;
                            count --;
                        }
                        endContext ++;
                    }
                }

                // Format looks like "<data><>\uabcd\uabcd<>\U0001abcd...</data>"
                StringBuffer errorText = new StringBuffer();
                int c; // Char from test data
                for (ci = startContext; ci <= endContext && ci != -1; ci = nextCP(testText, ci)) {
                    if (ci == i) {
                        // This is the location of the error.
                        errorText.append("<?>---------------------------------\n");
                    } else if (expectedBreaks[ci]) {
                        // This a non-error expected break position.
                        errorText.append("------------------------------------\n");
                    }
                    if (ci < testText.length()) {
                        // One line per code point: escape, general category, the
                        // property value selected by mk.fCharProperty, and the name.
                        c = UTF16.charAt(testText, ci);
                        appendCharToBuf(errorText, c, 11);
                        String gc = UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, UCharacter.getType(c), UProperty.NameChoice.SHORT);
                        appendToBuf(errorText, gc, 8);
                        int extraProp = UCharacter.getIntPropertyValue(c, mk.fCharProperty);
                        String extraPropValue =
                            UCharacter.getPropertyValueName(mk.fCharProperty, extraProp, UProperty.NameChoice.LONG);
                        appendToBuf(errorText, extraPropValue, 20);
                        String charName = UCharacter.getExtendedName(c);
                        appendToBuf(errorText, charName, 40);
                        errorText.append('\n');
                    }
                }
                if (ci == testText.length() && ci != -1) {
                    errorText.append("<>");
                }
                errorText.append("</data>\n");

                // Output the error
                errln(name + " break monkey test error. " +
                        (expectedBreaks[i]? "Break expected but not found." : "Break found but not expected.") +
                        "\nOperation = " + errorType + "; random seed = " + seed + "; buf Idx = " + i + "\n" +
                        errorText);
                // Only the first mismatch of each test string is reported.
                break;
            }
        }
        loopCount++;
    }
}
/**
 * Monkey test of the default character break iterator for Locale.US.
 * Runs 500 iterations, or 10000 when params.inclusion is 9 or higher.
 */
public void TestCharMonkey() {
    final int iterations = (params.inclusion >= 9) ? 10000 : 500;
    final int initialSeed = 1;
    RBBICharMonkey monkey = new RBBICharMonkey();
    BreakIterator charIter = BreakIterator.getCharacterInstance(Locale.US);
    RunMonkey(charIter, monkey, "char", initialSeed, iterations);
}
/**
 * Monkey test of the default word break iterator for Locale.US.
 * Runs 500 iterations, or 10000 when params.inclusion is 9 or higher.
 */
public void TestWordMonkey() {
    final int iterations = (params.inclusion >= 9) ? 10000 : 500;
    final int initialSeed = 1;
    logln("Word Break Monkey Test");
    RBBIWordMonkey monkey = new RBBIWordMonkey();
    BreakIterator wordIter = BreakIterator.getWordInstance(Locale.US);
    RunMonkey(wordIter, monkey, "word", initialSeed, iterations);
}
/**
 * Monkey test of the default line break iterator for Locale.US.
 * Runs 500 iterations, 10000 when params.inclusion is 9 or higher, or a
 * reduced 50 when no test parameters are available.
 */
public void TestLineMonkey() {
    int loopCount = 500;
    int seed = 1;
    // Check for missing parameters before reading them; the null test used to
    // sit after the params.inclusion access, where it could never take effect.
    if (params == null) {
        loopCount = 50;
    } else if (params.inclusion >= 9) {
        loopCount = 10000;
    }
    logln("Line Break Monkey Test");
    RBBILineMonkey m = new RBBILineMonkey();
    BreakIterator bi = BreakIterator.getLineInstance(Locale.US);
    RunMonkey(bi, m, "line", seed, loopCount);
}
/**
 * Monkey test of the default sentence break iterator for Locale.US.
 * Runs 500 iterations, 3000 when params.inclusion is 9 or higher, or a
 * reduced 30 when no test parameters are available.
 */
public void TestSentMonkey() {
    int loopCount = 500;
    int seed = 1;
    // Check for missing parameters before reading them; the null test used to
    // sit after the params.inclusion access, where it could never take effect.
    if (params == null) {
        loopCount = 30;
    } else if (params.inclusion >= 9) {
        loopCount = 3000;
    }
    logln("Sentence Break Monkey Test");
    RBBISentenceMonkey m = new RBBISentenceMonkey();
    BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
    RunMonkey(bi, m, "sent", seed, loopCount);
}
//
// Round-trip monkey tests.
// Verify that break iterators created from the rule source from the default
// break iterators still pass the monkey test for the iterator type.
//
// This is a major test for the Rule Compiler. The default break iterators are built
// from pre-compiled binary rule data that was created using ICU4C; these
// round-trip rule recompile tests verify that the Java rule compiler can
// rebuild break iterators from the original source rules.
//
/**
 * Round-trip character monkey test: rebuilds a RuleBasedBreakIterator from the
 * string form of the default character iterator and monkey-tests the result.
 * Runs 200 iterations, or 2000 when params.inclusion is 9 or higher.
 */
public void TestRTCharMonkey() {
    final int iterations = (params.inclusion >= 9) ? 2000 : 200;
    final int initialSeed = 1;
    RBBICharMonkey monkey = new RBBICharMonkey();
    String ruleSource = BreakIterator.getCharacterInstance(Locale.US).toString();
    BreakIterator roundTripped = new RuleBasedBreakIterator(ruleSource);
    RunMonkey(roundTripped, monkey, "char", initialSeed, iterations);
}
/**
 * Round-trip word monkey test: rebuilds a RuleBasedBreakIterator from the
 * string form of the default word iterator and monkey-tests the result.
 * Runs 200 iterations, or 2000 when params.inclusion is 9 or higher.
 */
public void TestRTWordMonkey() {
    final int iterations = (params.inclusion >= 9) ? 2000 : 200;
    final int initialSeed = 1;
    logln("Word Break Monkey Test");
    RBBIWordMonkey monkey = new RBBIWordMonkey();
    String ruleSource = BreakIterator.getWordInstance(Locale.US).toString();
    BreakIterator roundTripped = new RuleBasedBreakIterator(ruleSource);
    RunMonkey(roundTripped, monkey, "word", initialSeed, iterations);
}
/**
 * Round-trip line monkey test: rebuilds a RuleBasedBreakIterator from the
 * string form of the default line iterator and monkey-tests the result.
 * Runs 200 iterations, 2000 when params.inclusion is 9 or higher, or a
 * reduced 50 when no test parameters are available.
 */
public void TestRTLineMonkey() {
    int loopCount = 200;
    int seed = 1;
    // Check for missing parameters before reading them; the null test used to
    // sit after the params.inclusion access, where it could never take effect.
    if (params == null) {
        loopCount = 50;
    } else if (params.inclusion >= 9) {
        loopCount = 2000;
    }
    logln("Line Break Monkey Test");
    RBBILineMonkey m = new RBBILineMonkey();
    BreakIterator bi = BreakIterator.getLineInstance(Locale.US);
    String rules = bi.toString();
    BreakIterator rtbi = new RuleBasedBreakIterator(rules);
    RunMonkey(rtbi, m, "line", seed, loopCount);
}
/**
 * Round-trip sentence monkey test: rebuilds a RuleBasedBreakIterator from the
 * string form of the default sentence iterator and monkey-tests the result.
 * Runs 200 iterations, 1000 when params.inclusion is 9 or higher, or a
 * reduced 30 when no test parameters are available.
 */
public void TestRTSentMonkey() {
    int loopCount = 200;
    int seed = 1;
    // Check for missing parameters before reading them; the null test used to
    // sit after the params.inclusion access, where it could never take effect.
    if (params == null) {
        loopCount = 30;
    } else if (params.inclusion >= 9) {
        loopCount = 1000;
    }
    logln("Sentence Break Monkey Test");
    RBBISentenceMonkey m = new RBBISentenceMonkey();
    BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
    String rules = bi.toString();
    BreakIterator rtbi = new RuleBasedBreakIterator(rules);
    RunMonkey(rtbi, m, "sent", seed, loopCount);
}
}