| /* |
| ******************************************************************************* |
| * Copyright (C) 2003-2005 International Business Machines Corporation and * |
| * others. All Rights Reserved. * |
| ******************************************************************************* |
| */ |
| package com.ibm.icu.dev.test.rbbi; |
| |
| |
| // Monkey testing of RuleBasedBreakIterator |
| import com.ibm.icu.dev.test.*; |
| import com.ibm.icu.text.BreakIterator; |
| import com.ibm.icu.text.UTF16; |
| import com.ibm.icu.text.UnicodeSet; |
| import com.ibm.icu.lang.UCharacter; |
| import com.ibm.icu.lang.UProperty; |
| import java.util.List; |
| import java.util.Arrays; |
| import java.util.ArrayList; |
| import java.util.Locale; |
| |
| |
| /** |
| * Monkey tests for RBBI. These tests have independent implementations of |
| * the Unicode TR boundary rules, and compare results between these and ICU's |
| * implementation, using random data. |
| * |
| * Tests cover Grapheme Cluster (char), Word and Line breaks |
| * |
| * Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp |
| * |
| */ |
| public class RBBITestMonkey extends TestFmwk { |
| |
| public static void main(String[] args) { |
| new RBBITestMonkey().run(args); |
| } |
| |
| // |
| // classs RBBIMonkeyKind |
| // |
| // Monkey Test for Break Iteration |
| // Abstract interface class. Concrete derived classes independently |
| // implement the break rules for different iterator types. |
| // |
| // The Monkey Test itself uses doesn't know which type of break iterator it is |
| // testing, but works purely in terms of the interface defined here. |
| // |
| abstract static class RBBIMonkeyKind { |
| |
| // Return a List of UnicodeSets, representing the character classes used |
| // for this type of iterator. |
| abstract List charClasses(); |
| |
| // Set the test text on which subsequent calls to next() will operate |
| abstract void setText(StringBuffer text); |
| |
| // Find the next break postion, starting from the specified position. |
| // Return -1 after reaching end of string. |
| abstract int next(int i); |
| } |
| |
| |
| /** |
| * Monkey test subclass for testing Character (Grapheme Cluster) boundaries. |
| */ |
| static class RBBICharMonkey extends RBBIMonkeyKind { |
| List fSets; |
| |
| UnicodeSet fCRLFSet; |
| UnicodeSet fControlSet; |
| UnicodeSet fExtendSet; |
| UnicodeSet fHangulSet; |
| UnicodeSet fAnySet; |
| |
| StringBuffer fText; |
| |
| |
| RBBICharMonkey() { |
| fText = null; |
| fCRLFSet = new UnicodeSet("[\\r\\n]"); |
| fControlSet = new UnicodeSet("[[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}]-[\\n]-[\\r]]"); |
| fExtendSet = new UnicodeSet("[\\p{Grapheme_Extend}]"); |
| fHangulSet = new UnicodeSet( |
| "[\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=T}" + |
| "\\p{Hangul_Syllable_Type=LV}\\p{Hangul_Syllable_Type=LVT}]"); |
| fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]"); |
| |
| fSets = new ArrayList(); |
| fSets.add(fCRLFSet); |
| fSets.add(fControlSet); |
| fSets.add(fExtendSet); |
| fSets.add(fHangulSet); |
| fSets.add(fAnySet); |
| }; |
| |
| |
| void setText(StringBuffer s) { |
| fText = s; |
| } |
| |
| List charClasses() { |
| return fSets; |
| } |
| |
| int next(int i) { |
| return nextGC(fText, i); |
| } |
| } |
| |
| |
| /** |
| * |
| * Word Monkey Test Class |
| * |
| * |
| * |
| */ |
| static class RBBIWordMonkey extends RBBIMonkeyKind { |
| List fSets; |
| StringBuffer fText; |
| |
| UnicodeSet fKatakanaSet; |
| UnicodeSet fALetterSet; |
| UnicodeSet fMidLetterSet; |
| UnicodeSet fMidNumSet; |
| UnicodeSet fNumericSet; |
| UnicodeSet fFormatSet; |
| UnicodeSet fExtendSet; |
| UnicodeSet fExtendNumLetSet; |
| UnicodeSet fOtherSet; |
| |
| |
| RBBIWordMonkey() { |
| fSets = new ArrayList(); |
| |
| fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}]"); |
| fKatakanaSet = new UnicodeSet("[\\p{Word_Break = Katakana}]"); |
| fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter}]"); |
| fMidNumSet = new UnicodeSet("[\\p{Word_Break = MidNum}]"); |
| fNumericSet = new UnicodeSet("[\\p{Word_Break = Numeric}]"); |
| fFormatSet = new UnicodeSet("[\\p{Word_Break = Format}]"); |
| fExtendSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]"); |
| fExtendNumLetSet = new UnicodeSet("[\\p{General_Category = Connector_Punctuation}]"); |
| fOtherSet = new UnicodeSet(); |
| |
| fOtherSet.complement(); |
| fOtherSet.removeAll(fALetterSet); |
| fOtherSet.removeAll(fKatakanaSet); |
| fOtherSet.removeAll(fMidLetterSet); |
| fOtherSet.removeAll(fMidNumSet); |
| fOtherSet.removeAll(fNumericSet); |
| fOtherSet.removeAll(fFormatSet); |
| fOtherSet.removeAll(fExtendSet); |
| fOtherSet.removeAll(fExtendNumLetSet); |
| |
| fSets.add(fALetterSet); |
| fSets.add(fKatakanaSet); |
| fSets.add(fMidLetterSet); |
| fSets.add(fMidNumSet); |
| fSets.add(fNumericSet); |
| fSets.add(fFormatSet); |
| fSets.add(fExtendSet); |
| fSets.add(fExtendNumLetSet); |
| fSets.add(fOtherSet); |
| } |
| |
| |
| List charClasses() { |
| return fSets; |
| } |
| |
| void setText(StringBuffer s) { |
| fText = s; |
| } |
| |
| int next(int prevPos) { |
| int p0, p1, p2, p3; // Indices of the significant code points around the |
| // break position being tested. The candidate break |
| // location is before p2. |
| int breakPos = -1; |
| |
| int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. |
| |
| // Prev break at end of string. return DONE. |
| if (prevPos >= fText.length()) { |
| return -1; |
| } |
| p0 = p1 = p2 = p3 = prevPos; |
| c3 = UTF16.charAt(fText, prevPos); |
| c0 = c1 = c2 = 0; |
| |
| |
| // Format char after prev break? Special case, see last Note for Word Boundaries TR. |
| // break immdiately after the format char. |
| if (breakPos >= 0 && fFormatSet.contains(c3) && breakPos < (fText.length() -1)) { |
| breakPos = UTF16.moveCodePointOffset(fText, breakPos, 1); |
| return breakPos; |
| } |
| |
| |
| // Loop runs once per "significant" character position in the input text. |
| for (;;) { |
| // Move all of the positions forward in the input string. |
| p0 = p1; c0 = c1; |
| p1 = p2; c1 = c2; |
| p2 = p3; c2 = c3; |
| |
| // Advancd p3 by (GC Format*) Rules 3, 4 |
| p3 = nextGC(fText, p3); |
| if (p3 == -1 || p3 >= fText.length()) { |
| p3 = fText.length(); |
| c3 = 0; |
| } else { |
| c3 = UTF16.charAt(fText, p3); |
| while (fFormatSet.contains(c3)) { |
| p3 = moveIndex32(fText, p3, 1); |
| c3 = 0; |
| if (p3 < fText.length()) { |
| c3 = UTF16.charAt(fText, p3); |
| } |
| } |
| } |
| |
| if (p1 == p2) { |
| // Still warming up the loop. (won't work with zero length strings, but we don't care) |
| continue; |
| } |
| if (p2 == fText.length()) { |
| // Reached end of string. Always a break position. |
| break; |
| } |
| |
| // Rule (5). ALetter x ALetter |
| if (fALetterSet.contains(c1) && |
| fALetterSet.contains(c2)) { |
| continue; |
| } |
| |
| // Rule (6) ALetter x MidLetter ALetter |
| // |
| if ( fALetterSet.contains(c1) && |
| fMidLetterSet.contains(c2) && |
| fALetterSet.contains(c3)) { |
| continue; |
| } |
| |
| |
| // Rule (7) ALetter MidLetter x ALetter |
| if (fALetterSet.contains(c0) && |
| fMidLetterSet.contains(c1) && |
| fALetterSet.contains(c2)) { |
| continue; |
| } |
| |
| // Rule (8) Numeric x Numeric |
| if (fNumericSet.contains(c1) && |
| fNumericSet.contains(c2)) { |
| continue; |
| } |
| |
| // Rule (9) ALetter x Numeric |
| if (fALetterSet.contains(c1) && |
| fNumericSet.contains(c2)) { |
| continue; |
| } |
| |
| // Rule (10) Numeric x ALetter |
| if (fNumericSet.contains(c1) && |
| fALetterSet.contains(c2)) { |
| continue; |
| } |
| |
| // Rule (11) Numeric (MidNum | MidNumLet) x Numeric |
| if ( fNumericSet.contains(c0) && |
| fMidNumSet.contains(c1) && |
| fNumericSet.contains(c2)) { |
| continue; |
| } |
| |
| // Rule (12) Numeric x (MidNum | MidNumLet) Numeric |
| if (fNumericSet.contains(c1) && |
| fMidNumSet.contains(c2) && |
| fNumericSet.contains(c3)) { |
| continue; |
| } |
| |
| // Rule (13) Katakana x Katakana |
| if (fKatakanaSet.contains(c1) && |
| fKatakanaSet.contains(c2)) { |
| continue; |
| } |
| |
| // Rule 13a (ALetter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet |
| if ((fALetterSet.contains(c1) || fNumericSet.contains(c1) || |
| fKatakanaSet.contains(c1) || fExtendNumLetSet.contains(c1)) && |
| fExtendNumLetSet.contains(c2)) { |
| continue; |
| } |
| // Rule 13b ExtendNumLet x (ALetter | Numeric | Katakana | ExtendNumLet) |
| if (fExtendNumLetSet.contains(c1) && |
| (fALetterSet.contains(c2) || fNumericSet.contains(c2) || |
| fKatakanaSet.contains(c2) || fExtendNumLetSet.contains(c2))) { |
| continue; |
| } |
| |
| // Rule 14. Break found here. |
| break; |
| } |
| |
| |
| // Rule 4 fixup, back up before any trailing |
| // format characters at the end of the word. |
| breakPos = p2; |
| int t = nextGC(fText, p1); |
| if (t > p1) { |
| breakPos = t; |
| } |
| return breakPos; |
| } |
| |
| } |
| |
| |
| static class RBBILineMonkey extends RBBIMonkeyKind { |
| |
| List fSets; |
| |
| UnicodeSet fBK; |
| UnicodeSet fCR; |
| UnicodeSet fLF; |
| UnicodeSet fCM; |
| UnicodeSet fNL; |
| UnicodeSet fSG; |
| UnicodeSet fWJ; |
| UnicodeSet fZW; |
| UnicodeSet fGL; |
| UnicodeSet fCB; |
| UnicodeSet fSP; |
| UnicodeSet fB2; |
| UnicodeSet fBA; |
| UnicodeSet fBB; |
| UnicodeSet fHY; |
| UnicodeSet fCL; |
| UnicodeSet fEX; |
| UnicodeSet fIN; |
| UnicodeSet fNS; |
| UnicodeSet fOP; |
| UnicodeSet fQU; |
| UnicodeSet fIS; |
| UnicodeSet fNU; |
| UnicodeSet fPO; |
| UnicodeSet fPR; |
| UnicodeSet fSY; |
| UnicodeSet fAI; |
| UnicodeSet fAL; |
| UnicodeSet fID; |
| UnicodeSet fSA; |
| UnicodeSet fJL; |
| UnicodeSet fJV; |
| UnicodeSet fJT; |
| UnicodeSet fH2; |
| UnicodeSet fH3; |
| UnicodeSet fXX; |
| |
| StringBuffer fText; |
| int fOrigPositions; |
| |
| |
| |
| RBBILineMonkey() |
| { |
| fSets = new ArrayList(); |
| |
| fBK = new UnicodeSet("[\\p{Line_Break=BK}]"); |
| fCR = new UnicodeSet("[\\p{Line_break=CR}]"); |
| fLF = new UnicodeSet("[\\p{Line_break=LF}]"); |
| fCM = new UnicodeSet("[\\p{Line_break=CM}]"); |
| fNL = new UnicodeSet("[\\p{Line_break=NL}]"); |
| fWJ = new UnicodeSet("[\\p{Line_break=WJ}]"); |
| fZW = new UnicodeSet("[\\p{Line_break=ZW}]"); |
| fGL = new UnicodeSet("[\\p{Line_break=GL}]"); |
| fCB = new UnicodeSet("[\\p{Line_break=CB}]"); |
| fSP = new UnicodeSet("[\\p{Line_break=SP}]"); |
| fB2 = new UnicodeSet("[\\p{Line_break=B2}]"); |
| fBA = new UnicodeSet("[\\p{Line_break=BA}]"); |
| fBB = new UnicodeSet("[\\p{Line_break=BB}]"); |
| fHY = new UnicodeSet("[\\p{Line_break=HY}]"); |
| fCL = new UnicodeSet("[\\p{Line_break=CL}]"); |
| fEX = new UnicodeSet("[\\p{Line_break=EX}]"); |
| fIN = new UnicodeSet("[\\p{Line_break=IN}]"); |
| fNS = new UnicodeSet("[\\p{Line_break=NS}]"); |
| fOP = new UnicodeSet("[\\p{Line_break=OP}]"); |
| fQU = new UnicodeSet("[\\p{Line_break=QU}]"); |
| fIS = new UnicodeSet("[\\p{Line_break=IS}]"); |
| fNU = new UnicodeSet("[\\p{Line_break=NU}]"); |
| fPO = new UnicodeSet("[\\p{Line_break=PO}]"); |
| fPR = new UnicodeSet("[\\p{Line_break=PR}]"); |
| fSY = new UnicodeSet("[\\p{Line_break=SY}]"); |
| fAI = new UnicodeSet("[\\p{Line_break=AI}]"); |
| fAL = new UnicodeSet("[\\p{Line_break=AL}]"); |
| fID = new UnicodeSet("[\\p{Line_break=ID}]"); |
| fSA = new UnicodeSet("[\\p{Line_break=SA}]"); |
| fJL = new UnicodeSet("[\\p{Line_break=JL}]"); |
| fJV = new UnicodeSet("[\\p{Line_break=JV}]"); |
| fJT = new UnicodeSet("[\\p{Line_break=JT}]"); |
| fH2 = new UnicodeSet("[\\p{Line_break=H2}]"); |
| fH3 = new UnicodeSet("[\\p{Line_break=H3}]"); |
| fXX = new UnicodeSet("[\\p{Line_break=XX}]"); |
| |
| fAL.addAll(fXX); // Default behavior for XX is identical to AL |
| fAL.addAll(fAI); // Default behavior for AI is identical to AL |
| fAL.addAll(fSA); // Default behavior for SA is XX, which defaults to AL |
| |
| |
| |
| fSets.add(fBK); |
| fSets.add(fCR); |
| fSets.add(fLF); |
| fSets.add(fCM); |
| fSets.add(fNL); |
| fSets.add(fWJ); |
| fSets.add(fZW); |
| fSets.add(fGL); |
| fSets.add(fCB); |
| fSets.add(fSP); |
| fSets.add(fB2); |
| fSets.add(fBA); |
| fSets.add(fBB); |
| fSets.add(fHY); |
| fSets.add(fCL); |
| fSets.add(fEX); |
| fSets.add(fIN); |
| fSets.add(fNS); |
| fSets.add(fOP); |
| fSets.add(fQU); |
| fSets.add(fIS); |
| fSets.add(fNU); |
| fSets.add(fPO); |
| fSets.add(fPR); |
| fSets.add(fSY); |
| fSets.add(fAI); |
| fSets.add(fAL); |
| fSets.add(fID); |
| fSets.add(fWJ); |
| fSets.add(fSA); |
| |
| } |
| |
| void setText(StringBuffer s) { |
| fText = s; |
| } |
| |
| |
| |
| |
| int next(int startPos) { |
| int pos; // Index of the char following a potential break position |
| int thisChar; // Character at above position "pos" |
| |
| int prevPos; // Index of the char preceding a potential break position |
| int prevChar; // Character at above position. Note that prevChar |
| // and thisChar may not be adjacent because combining |
| // characters between them will be ignored. |
| |
| int nextPos; // Index of the next character following pos. |
| // Usually skips over combining marks. |
| int tPos; // temp value. |
| int c; |
| int matchVals[] = null; // Number Expression Match Results |
| |
| |
| if (startPos >= fText.length()) { |
| return -1; |
| } |
| |
| |
| // Initial values for loop. Loop will run the first time without finding breaks, |
| // while the invalid values shift out and the "this" and |
| // "prev" positions are filled in with good values. |
| pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration. |
| thisChar = prevChar = 0; |
| nextPos = startPos; |
| |
| |
| // Loop runs once per position in the test text, until a break position |
| // is found. In each iteration, we are testing for a possible break |
| // just preceding the character at index "pos". The character preceding |
| // this char is at postion "prevPos"; because of combining sequences, |
| // "prevPos" can be arbitrarily far before "pos". |
| for (;;) { |
| // Advance to the next position to be tested. |
| prevPos = pos; |
| prevChar = thisChar; |
| pos = nextPos; |
| nextPos = moveIndex32(fText, pos, 1); |
| |
| // Rule LB2 - Break at end of text. |
| if (pos >= fText.length()) { |
| break; |
| } |
| |
| // Rule LB 7 - adjust for combining sequences. |
| // We do this rule out-of-order because the adjustment does |
| // not effect the way that rules LB 3 through LB 6 match, |
| // and doing it here rather than after LB 6 is substantially |
| // simpler when combining sequences do occur. |
| |
| |
| // LB 7b Keep combining sequences together. |
| // advance over any CM class chars at "pos", |
| // result is "nextPos" for the following loop iteration. |
| thisChar = UTF16.charAt(fText, pos); |
| if (!(fSP.contains(thisChar) || fBK.contains(thisChar) || thisChar==0x0d || |
| thisChar==0x0a || fNL.contains(thisChar) || fZW.contains(thisChar) )) { |
| for (;;) { |
| if (nextPos == fText.length()) { |
| break; |
| } |
| int nextChar = UTF16.charAt(fText, nextPos); |
| if (!fCM.contains(nextChar)) { |
| break; |
| } |
| nextPos = moveIndex32(fText, nextPos, 1); |
| } |
| } |
| |
| // LB 7c Treat unattached combining chars as AL |
| if (fCM.contains(thisChar)) { |
| thisChar = 'A'; |
| } |
| |
| |
| // If the loop is still warming up - if we haven't shifted the initial |
| // -1 positions out of prevPos yet - loop back to advance the |
| // position in the input without any further looking for breaks. |
| if (prevPos == -1) { |
| continue; |
| } |
| |
| // LB 3a Always break after hard line breaks, |
| if (fBK.contains(prevChar)) { |
| break; |
| } |
| |
| // LB 3b Break after CR, LF, NL, but not inside CR LF |
| if (fCR.contains(prevChar) && fLF.contains(thisChar)) { |
| continue; |
| } |
| if (fCR.contains(prevChar) || |
| fLF.contains(prevChar) || |
| fNL.contains(prevChar)) { |
| break; |
| } |
| |
| // LB 3c Don't break before hard line breaks |
| if (fBK.contains(thisChar) || fCR.contains(thisChar) || |
| fLF.contains(thisChar) || fNL.contains(thisChar) ) { |
| continue; |
| } |
| |
| |
| // LB 4 Don't break before spaces or zero-width space. |
| if (fSP.contains(thisChar)) { |
| continue; |
| } |
| |
| if (fZW.contains(thisChar)) { |
| continue; |
| } |
| |
| // LB 5 Break after zero width space |
| if (fZW.contains(prevChar)) { |
| break; |
| } |
| |
| // LB 7 Already done, at top of loop. |
| // |
| |
| |
| // LB 8 Don't break before closings. |
| // NU x CL and NU x IS are not matched here so that they will |
| // fall into LB 17 and the more general number regular expression. |
| // |
| if (!fNU.contains(prevChar) && fCL.contains(thisChar) || |
| fEX.contains(thisChar) || |
| !fNU.contains(prevChar) && fIS.contains(thisChar) || |
| !fNU.contains(prevChar) && fSY.contains(thisChar)) { |
| continue; |
| } |
| |
| // LB 9 Don't break after OP SP* |
| // Scan backwards, checking for this sequence. |
| // The OP char could include combining marks, so we acually check for |
| // OP CM* SP* x |
| tPos = prevPos; |
| while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { |
| tPos=moveIndex32(fText, tPos, -1); |
| } |
| while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { |
| tPos=moveIndex32(fText, tPos, -1); |
| } |
| if (fOP.contains(UTF16.charAt(fText, tPos))) { |
| continue; |
| } |
| |
| // LB 10 Do not break withing "[ |
| // QU CM* SP* x OP |
| if (fOP.contains(thisChar)) { |
| tPos = prevPos; |
| while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { |
| tPos = moveIndex32(fText, tPos, -1); |
| } |
| while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { |
| tPos = moveIndex32(fText, tPos, -1); |
| } |
| if (fQU.contains(UTF16.charAt(fText, tPos))) { |
| continue; |
| } |
| } |
| |
| // LB 11 CL SP* x NS |
| if (fNS.contains(thisChar)) { |
| tPos = prevPos; |
| while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { |
| tPos = moveIndex32(fText, tPos, -1); |
| } |
| while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { |
| tPos = moveIndex32(fText, tPos, -1); |
| } |
| if (fCL.contains(UTF16.charAt(fText, tPos))) { |
| continue; |
| } |
| } |
| |
| |
| // LB 11a B2 SP* x B2 |
| if (fB2.contains(thisChar)) { |
| tPos = prevPos; |
| while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { |
| tPos = moveIndex32(fText, tPos, -1); |
| } |
| while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { |
| tPos = moveIndex32(fText, tPos, -1); |
| } |
| if (fB2.contains(UTF16.charAt(fText, tPos))) { |
| continue; |
| } |
| } |
| |
| // LB 11b |
| // x WJ |
| // WJ x |
| if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) { |
| continue; |
| } |
| |
| // LB 12 break after space |
| if (fSP.contains(prevChar)) { |
| break; |
| } |
| |
| // LB 13 |
| // x GL |
| // GL x |
| if (fGL.contains(thisChar) || fGL.contains(prevChar)) { |
| continue; |
| } |
| |
| |
| // LB 14 |
| // x QU |
| // QU x |
| if (fQU.contains(thisChar) || fQU.contains(prevChar)) { |
| continue; |
| } |
| |
| // LB 14a Break around a CB |
| if (fCB.contains(thisChar) || fCB.contains(prevChar)) { |
| break; |
| } |
| |
| // LB 15 |
| if (fBA.contains(thisChar) || |
| fHY.contains(thisChar) || |
| fNS.contains(thisChar) || |
| fBB.contains(prevChar) ) { |
| continue; |
| } |
| |
| // LB 16 |
| if (fAL.contains(prevChar) && fIN.contains(thisChar) || |
| fID.contains(prevChar) && fIN.contains(thisChar) || |
| fIN.contains(prevChar) && fIN.contains(thisChar) || |
| fNU.contains(prevChar) && fIN.contains(thisChar) ) { |
| continue; |
| } |
| |
| |
| // LB 17 ID x PO (Note: Leading CM behaves like ID) |
| // AL x NU |
| // NU x AL |
| if (fID.contains(prevChar) && fPO.contains(thisChar) || |
| fAL.contains(prevChar) && fNU.contains(thisChar) || |
| fNU.contains(prevChar) && fAL.contains(thisChar) ) { |
| continue; |
| } |
| |
| // LB 18 Numbers |
| matchVals = LBNumberCheck(fText, prevPos, matchVals); |
| if (matchVals[0] != -1) { |
| // Matched a number. But could have been just a single digit, which would |
| // not represent a "no break here" between prevChar and thisChar |
| int numEndIdx = matchVals[1]; // idx of first char following num |
| if (numEndIdx > pos) { |
| // Number match includes at least the two chars being checked |
| if (numEndIdx > nextPos) { |
| // Number match includes additional chars. Update pos and nextPos |
| // so that next loop iteration will continue at the end of the number, |
| // checking for breaks between last char in number & whatever follows. |
| nextPos = numEndIdx; |
| pos = numEndIdx; |
| do { |
| pos = moveIndex32(fText, pos, -1); |
| thisChar = UTF16.charAt(fText, pos); |
| } |
| while (fCM.contains(thisChar)); |
| } |
| continue; |
| } |
| } |
| if (fPR.contains(prevChar) && fAL.contains(thisChar)) { |
| continue; |
| } |
| if (fPR.contains(prevChar) && fID.contains(thisChar)) { |
| continue; |
| } |
| |
| |
| // LB 18b Do not break Korean Syllables |
| if (fJL.contains(prevChar) && (fJL.contains(thisChar) || |
| fJV.contains(thisChar) || |
| fH2.contains(thisChar) || |
| fH3.contains(thisChar))) { |
| continue; |
| } |
| |
| if ((fJV.contains(prevChar) || fH2.contains(prevChar)) && |
| (fJV.contains(thisChar) || fJT.contains(thisChar))) { |
| continue; |
| } |
| |
| if ((fJT.contains(prevChar) || fH3.contains(prevChar)) && |
| fJT.contains(thisChar)) { |
| continue; |
| } |
| |
| // LB 18c more Korean |
| if ((fJL.contains(prevChar) || fJV.contains(prevChar) || |
| fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) && |
| fIN.contains(thisChar)) { |
| continue; |
| } |
| if ((fJL.contains(prevChar) || fJV.contains(prevChar) || |
| fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) && |
| fPO.contains(thisChar)) { |
| continue; |
| } |
| if (fPR.contains(prevChar) && (fJL.contains(thisChar) || fJV.contains(thisChar) || |
| fJT.contains(thisChar) || fH2.contains(thisChar) || fH3.contains(thisChar))) { |
| continue; |
| } |
| |
| |
| |
| // LB 19 |
| if (fAL.contains(prevChar) && fAL.contains(thisChar)) { |
| continue; |
| } |
| |
| // LB 19b |
| if (fIS.contains(prevChar) && fAL.contains(thisChar)) { |
| continue; |
| } |
| |
| // LB 20 Break everywhere else |
| break; |
| |
| } |
| |
| return pos; |
| } |
| |
| |
| |
| // Match the following regular expression in the input text. |
| // (PR CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)? (PO CM*)? |
| // 0 1 3 3 3 7 7 7 7 7 9 9 11 11 (match states) |
| // retVals array [0] index of the start of the match, or -1 if no match |
| // [1] index of first char following the match. |
| // Can not use Java regex because need supplementary character support, |
| // and because Unicode char properties version must be the same as in |
| // the version of ICU being tested. |
| private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) { |
| if (retVals == null) { |
| retVals = new int[2]; |
| } |
| retVals[0] = -1; // Indicates no match. |
| int matchState = 0; |
| int idx = startIdx; |
| |
| matchLoop: for (idx = startIdx; idx<s.length(); idx = moveIndex32(s, idx, 1)){ |
| int c = UTF16.charAt(s, idx); |
| int cLBType = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK); |
| switch (matchState) { |
| case 0: |
| if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) { |
| matchState = 1; |
| break; |
| } |
| if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) { |
| matchState = 4; |
| break; |
| } |
| if (cLBType == UCharacter.LineBreak.HYPHEN) { |
| matchState = 4; |
| break; |
| } |
| if (cLBType == UCharacter.LineBreak.NUMERIC) { |
| matchState = 7; |
| break; |
| } |
| break matchLoop; /* No Match */ |
| |
| case 1: |
| if (cLBType == UCharacter.LineBreak.COMBINING_MARK) { |
| matchState = 1; |
| break; |
| } |
| if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) { |
| matchState = 4; |
| break; |
| } |
| if (cLBType == UCharacter.LineBreak.HYPHEN) { |
| matchState = 4; |
| break; |
| } |
| if (cLBType == UCharacter.LineBreak.NUMERIC) { |
| matchState = 7; |
| break; |
| } |
| break matchLoop; /* No Match */ |
| |
| |
| case 4: |
| if (cLBType == UCharacter.LineBreak.COMBINING_MARK) { |
| matchState = 4; |
| break; |
| } |
| if (cLBType == UCharacter.LineBreak.NUMERIC) { |
| matchState = 7; |
| break; |
| } |
| break matchLoop; /* No Match */ |
| // (PR CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)? (PO CM*)? |
| // 0 1 3 3 4 7 7 7 7 7 7 9 9 11 11 (match states) |
| |
| case 7: |
| if (cLBType == UCharacter.LineBreak.COMBINING_MARK) { |
| matchState = 7; |
| break; |
| } |
| if (cLBType == UCharacter.LineBreak.NUMERIC) { |
| matchState = 7; |
| break; |
| } |
| if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) { |
| matchState = 7; |
| break; |
| } |
| if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) { |
| matchState = 7; |
| break; |
| } |
| if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) { |
| matchState = 9; |
| break; |
| } |
| if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) { |
| matchState = 11; |
| break; |
| } |
| break matchLoop; // Match Complete. |
| case 9: |
| if (cLBType == UCharacter.LineBreak.COMBINING_MARK) { |
| matchState = 9; |
| break; |
| } |
| if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) { |
| matchState = 11; |
| break; |
| } |
| break matchLoop; // Match Complete. |
| case 11: |
| if (cLBType == UCharacter.LineBreak.COMBINING_MARK) { |
| matchState = 11; |
| break; |
| } |
| break matchLoop; // Match Complete. |
| } |
| } |
| if (matchState > 4) { |
| retVals[0] = startIdx; |
| retVals[1] = idx; |
| } |
| return retVals; |
| } |
| |
| |
| List charClasses() { |
| return fSets; |
| } |
| |
| |
| |
| } |
| |
| |
| /** |
| * Move an index into a string by n code points. |
| * Similar to UTF16.moveCodePointOffset, but without the exceptions, which were |
| * complicating usage. |
| * @param s a Text string |
| * @param i The starting code unit index into the text string |
| * @param amt The amount to adjust the string by. |
| * @return The adjusted code unit index, pinned to the string's length, or |
| * unchanged if input index was outside of the string. |
| */ |
| static int moveIndex32(StringBuffer s, int pos, int amt) { |
| int i; |
| char c; |
| if (amt>0) { |
| for (i=0; i<amt; i++) { |
| if (pos >= s.length()) { |
| return s.length(); |
| } |
| c = s.charAt(pos); |
| pos++; |
| if (UTF16.isLeadSurrogate(c) && pos < s.length()) { |
| c = s.charAt(pos); |
| if (UTF16.isTrailSurrogate(c)) { |
| pos++; |
| } |
| } |
| } |
| } else { |
| for (i=0; i>amt; i--) { |
| if (pos <= 0) { |
| return 0; |
| } |
| pos--; |
| c = s.charAt(pos); |
| if (UTF16.isTrailSurrogate(c) && pos >= 0) { |
| c = s.charAt(pos); |
| if (UTF16.isLeadSurrogate(c)) { |
| pos--; |
| } |
| } |
| } |
| } |
| return pos; |
| } |
| |
| |
| /** |
| * return the index of the next code point in the input text. |
| * @param i the preceding index |
| * @return |
| * @internal |
| */ |
| static int nextCP(StringBuffer s, int i) { |
| if (i == -1) { |
| // End of Input indication. Continue to return end value. |
| return -1; |
| } |
| int retVal = i + 1; |
| if (retVal > s.length()) { |
| return -1; |
| } |
| int c = UTF16.charAt(s, i); |
| if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE) { |
| retVal++; |
| } |
| return retVal; |
| } |
| |
| |
| // |
| // The following UnicodeSets are used in matching a Grapheme Cluster |
| // |
| private static UnicodeSet GC_Control; |
| |
| private static UnicodeSet GC_Extend ; |
| |
| private static UnicodeSet GC_L ; |
| |
| private static UnicodeSet GC_V ; |
| |
| private static UnicodeSet GC_T ; |
| |
| private static UnicodeSet GC_LV; |
| |
| private static UnicodeSet GC_LVT ; |
| |
| protected void init()throws Exception{ |
| GC_Control = new UnicodeSet("[[:Zl:][:Zp:][:Cc:][:Cf:]-[\\u000d\\u000a]-[:Grapheme_Extend:]]"); |
| |
| GC_Extend = new UnicodeSet("[[:Grapheme_Extend:]]"); |
| |
| GC_L = new UnicodeSet("[[:Hangul_Syllable_Type=L:]]"); |
| |
| GC_V = new UnicodeSet("[[:Hangul_Syllable_Type=V:]]"); |
| |
| GC_T = new UnicodeSet("[[:Hangul_Syllable_Type=T:]]"); |
| |
| GC_LV = new UnicodeSet("[[:Hangul_Syllable_Type=LV:]]"); |
| |
| GC_LVT = new UnicodeSet("[[:Hangul_Syllable_Type=LVT:]]"); |
| } |
| /** |
| * Find the end of the extent of a grapheme cluster. |
| * This is the reference implementation used by the monkey test for comparison |
| * with the RBBI results. |
| * @param s The string containing the text to be analyzed |
| * @param i The index of the start of the grapheme cluster. |
| * @return The index of the first code point following the grapheme cluster |
| * @internal |
| */ |
| private static int nextGC(StringBuffer s, int i) { |
| if (i >= s.length() || i == -1 ) { |
| return -1; |
| } |
| |
| int c = UTF16.charAt(s, i); |
| int pos = i; |
| |
| if (c == 0x0d) { |
| pos = nextCP(s, i); |
| if (pos >= s.length()) { |
| return pos; |
| } |
| c = UTF16.charAt(s, pos); |
| if (c == 0x0a) { |
| pos = nextCP(s, pos); |
| } |
| return pos; |
| } |
| |
| if (GC_Control.contains(c) || c == 0x0a) { |
| pos = nextCP(s, pos); |
| return pos; |
| } |
| |
| // Little state machine to consume Hangul Syllables |
| int hangulState = 1; |
| state_loop: for (;;) { |
| switch (hangulState) { |
| case 1: |
| if (GC_L.contains(c)) { |
| hangulState = 2; |
| break; |
| } |
| if (GC_V.contains(c) || GC_LV.contains(c)) { |
| hangulState = 3; |
| break; |
| } |
| if (GC_T.contains(c) || GC_LVT.contains(c)) { |
| hangulState = 4; |
| break; |
| } |
| break state_loop; |
| case 2: |
| if (GC_L.contains(c)) { |
| // continue in state 2. |
| break; |
| } |
| if (GC_V.contains(c) || GC_LV.contains(c)) { |
| hangulState = 3; |
| break; |
| } |
| if (GC_LVT.contains(c)) { |
| hangulState = 4; |
| break; |
| } |
| if (GC_Extend.contains(c)) { |
| hangulState = 5; |
| break; |
| } |
| break state_loop; |
| case 3: |
| if (GC_V.contains(c)) { |
| // continue in state 3; |
| break; |
| } |
| if (GC_T.contains(c)) { |
| hangulState = 4; |
| break; |
| } |
| if (GC_Extend.contains(c)) { |
| hangulState = 5; |
| break; |
| } |
| break state_loop; |
| case 4: |
| if (GC_T.contains(c)) { |
| // continue in state 4 |
| break; |
| } |
| if (GC_Extend.contains(c)) { |
| hangulState = 5; |
| break; |
| } |
| break state_loop; |
| case 5: |
| if (GC_Extend.contains(c)) { |
| hangulState = 5; |
| break; |
| } |
| break state_loop; |
| } |
| // We have exited the switch statement, but are still in the loop. |
| // Still in a Hangul Syllable, advance to the next code point. |
| pos = nextCP(s, pos); |
| if (pos >= s.length()) { |
| break; |
| } |
| c = UTF16.charAt(s, pos); |
| } // end of loop |
| |
| if (hangulState != 1) { |
| // We found a Hangul. We're done. |
| return pos; |
| } |
| |
| // Ordinary characters. Consume one codepoint unconditionally, then any following Extends. |
| for (;;) { |
| pos = nextCP(s, pos); |
| if (pos >= s.length()) { |
| break; |
| } |
| c = UTF16.charAt(s, pos); |
| if (GC_Extend.contains(c) == false) { |
| break; |
| } |
| } |
| |
| return pos; |
| } |
| |
| |
| /** |
| * random number generator. Not using Java's built-in Randoms for two reasons: |
| * 1. Using this code allows obtaining the same sequences as those from the ICU4C monkey test. |
| * 2. We need to get and restore the seed from values occuring in the middle |
| * of a long sequence, to more easily reproduce failing cases. |
| */ |
| private static int m_seed = 1; |
| private static int m_rand() |
| { |
| m_seed = m_seed * 1103515245 + 12345; |
| return (int)(m_seed >>> 16) % 32768; |
| } |
| |
| |
| /** |
| * Run a RBBI monkey test. Common routine, for all break iterator types. |
| * Parameters: |
| * bi - the break iterator to use |
| * mk - MonkeyKind, abstraction for obtaining expected results |
| * name - Name of test (char, word, etc.) for use in error messages |
| * seed - Seed for starting random number generator (parameter from user) |
| * numIterations |
| */ |
| void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int numIterations) { |
| int TESTSTRINGLEN = 500; |
| StringBuffer testText = new StringBuffer(); |
| int numCharClasses; |
| List chClasses; |
| int[] expected = new int[TESTSTRINGLEN*2 + 1]; |
| int expectedCount = 0; |
| boolean[] expectedBreaks = new boolean[TESTSTRINGLEN*2 + 1]; |
| boolean[] forwardBreaks = new boolean[TESTSTRINGLEN*2 + 1]; |
| boolean[] reverseBreaks = new boolean[TESTSTRINGLEN*2 + 1]; |
| boolean[] isBoundaryBreaks = new boolean[TESTSTRINGLEN*2 + 1]; |
| boolean[] followingBreaks = new boolean[TESTSTRINGLEN*2 + 1]; |
| boolean[] precedingBreaks = new boolean[TESTSTRINGLEN*2 + 1]; |
| int i; |
| int loopCount = 0; |
| boolean printTestData = false; |
| boolean printBreaksFromBI = false; |
| |
| m_seed = seed; |
| |
| numCharClasses = mk.charClasses().size(); |
| chClasses = mk.charClasses(); |
| |
| // Verify that the character classes all have at least one member. |
| for (i=0; i<numCharClasses; i++) { |
| UnicodeSet s = (UnicodeSet)chClasses.get(i); |
| if (s == null || s.size() == 0) { |
| errln("Character Class " + i + " is null or of zero size."); |
| return; |
| } |
| } |
| |
| //-------------------------------------------------------------------------------------------- |
| // |
| // Debugging settings. Comment out everything in the following block for normal operation |
| // |
| //-------------------------------------------------------------------------------------------- |
| // numIterations = -1; |
| // RuleBasedBreakIterator_New.fTrace = true; |
| // m_seed = 668686441; |
| // TESTSTRINGLEN = 50; |
| // printTestData = true; |
| // printBreaksFromBI = true; |
| // ((RuleBasedBreakIterator_New)bi).dump(); |
| |
| //-------------------------------------------------------------------------------------------- |
| // |
| // End of Debugging settings. |
| // |
| //-------------------------------------------------------------------------------------------- |
| |
| int dotsOnLine = 0; |
| while (loopCount < numIterations || numIterations == -1) { |
| if (numIterations == -1 && loopCount % 10 == 0) { |
| // If test is running in an infinite loop, display a periodic tic so |
| // we can tell that it is making progress. |
| System.out.print("."); |
| if (dotsOnLine++ >= 80){ |
| System.out.println(); |
| dotsOnLine = 0; |
| } |
| } |
| // Save current random number seed, so that we can recreate the random numbers |
| // for this loop iteration in event of an error. |
| seed = m_seed; |
| |
| testText.setLength(0); |
| // Populate a test string with data. |
| if (printTestData) { |
| System.out.println("Test Data string ..."); |
| } |
| for (i=0; i<TESTSTRINGLEN; i++) { |
| int aClassNum = m_rand() % numCharClasses; |
| UnicodeSet classSet = (UnicodeSet)chClasses.get(aClassNum); |
| int charIdx = m_rand() % classSet.size(); |
| int c = classSet.charAt(charIdx); |
| if (c < 0) { // TODO: deal with sets containing strings. |
| errln("c < 0"); |
| } |
| UTF16.appendCodePoint(testText, c); |
| if (printTestData) { |
| System.out.print(Integer.toHexString(c) + " "); |
| } |
| } |
| if (printTestData) { |
| System.out.println(); |
| } |
| |
| Arrays.fill(expected, 0); |
| Arrays.fill(expectedBreaks, false); |
| Arrays.fill(forwardBreaks, false); |
| Arrays.fill(reverseBreaks, false); |
| Arrays.fill(isBoundaryBreaks, false); |
| Arrays.fill(followingBreaks, false); |
| Arrays.fill(precedingBreaks, false); |
| |
| // Calculate the expected results for this test string. |
| mk.setText(testText); |
| expectedCount = 0; |
| expectedBreaks[0] = true; |
| expected[expectedCount ++] = 0; |
| int breakPos = 0; |
| int lastBreakPos = -1; |
| for (;;) { |
| lastBreakPos = breakPos; |
| breakPos = mk.next(breakPos); |
| if (breakPos == -1) { |
| break; |
| } |
| if (breakPos > testText.length()) { |
| errln("breakPos > testText.length()"); |
| } |
| if (lastBreakPos >= breakPos) { |
| errln("Next() not increasing."); |
| // break; |
| } |
| expectedBreaks[breakPos] = true; |
| expected[expectedCount ++] = breakPos; |
| } |
| |
| // Find the break positions using forward iteration |
| if (printBreaksFromBI) { |
| System.out.println("Breaks from BI..."); |
| } |
| bi.setText(testText.toString()); |
| for (i=bi.first(); i != BreakIterator.DONE; i=bi.next()) { |
| if (i < 0 || i > testText.length()) { |
| errln(name + " break monkey test: Out of range value returned by breakIterator::next()"); |
| break; |
| } |
| if (printBreaksFromBI) { |
| System.out.print(Integer.toHexString(i) + " "); |
| } |
| forwardBreaks[i] = true; |
| } |
| if (printBreaksFromBI) { |
| System.out.println(); |
| } |
| |
| // Find the break positions using reverse iteration |
| for (i=bi.last(); i != BreakIterator.DONE; i=bi.previous()) { |
| if (i < 0 || i > testText.length()) { |
| errln(name + " break monkey test: Out of range value returned by breakIterator.next()" + name); |
| break; |
| } |
| reverseBreaks[i] = true; |
| } |
| |
| // Find the break positions using isBoundary() tests. |
| for (i=0; i<=testText.length(); i++) { |
| isBoundaryBreaks[i] = bi.isBoundary(i); |
| } |
| |
| // Find the break positions using the following() function. |
| lastBreakPos = 0; |
| followingBreaks[0] = true; |
| for (i=0; i<testText.length(); i++) { |
| breakPos = bi.following(i); |
| if (breakPos <= i || |
| breakPos < lastBreakPos || |
| breakPos > testText.length() || |
| breakPos > lastBreakPos && lastBreakPos > i ) { |
| errln(name + " break monkey test: " + |
| "Out of range value returned by BreakIterator::following().\n" + |
| "index=" + i + "following returned=" + breakPos + |
| "lastBreak=" + lastBreakPos); |
| precedingBreaks[i] = !expectedBreaks[i]; // Forces an error. |
| } else { |
| followingBreaks[breakPos] = true; |
| lastBreakPos = breakPos; |
| } |
| } |
| |
| // Find the break positions using the preceding() function. |
| lastBreakPos = testText.length(); |
| precedingBreaks[testText.length()] = true; |
| for (i=testText.length(); i>0; i--) { |
| breakPos = bi.preceding(i); |
| if (breakPos >= i || |
| breakPos > lastBreakPos || |
| breakPos < 0 || |
| breakPos < lastBreakPos && lastBreakPos < i ) { |
| errln(name + " break monkey test: " + |
| "Out of range value returned by BreakIterator::preceding().\n" + |
| "index=" + i + "preceding returned=" + breakPos + |
| "lastBreak=" + lastBreakPos); |
| precedingBreaks[i] = !expectedBreaks[i]; // Forces an error. |
| } else { |
| precedingBreaks[breakPos] = true; |
| lastBreakPos = breakPos; |
| } |
| } |
| |
| |
| |
| // Compare the expected and actual results. |
| for (i=0; i<=testText.length(); i++) { |
| String errorType = null; |
| if (forwardBreaks[i] != expectedBreaks[i]) { |
| errorType = "next()"; |
| } else if (reverseBreaks[i] != forwardBreaks[i]) { |
| errorType = "previous()"; |
| } else if (isBoundaryBreaks[i] != expectedBreaks[i]) { |
| errorType = "isBoundary()"; |
| } else if (followingBreaks[i] != expectedBreaks[i]) { |
| errorType = "following()"; |
| } else if (precedingBreaks[i] != expectedBreaks[i]) { |
| errorType = "preceding()"; |
| } |
| |
| |
| if (errorType != null) { |
| // Format a range of the test text that includes the failure as |
| // a data item that can be included in the rbbi test data file. |
| |
| // Start of the range is the last point where expected and actual results |
| // both agreed that there was a break position. |
| int startContext = i; |
| int count = 0; |
| for (;;) { |
| if (startContext==0) { break; } |
| startContext --; |
| if (expectedBreaks[startContext]) { |
| if (count == 2) break; |
| count ++; |
| } |
| } |
| |
| // End of range is two expected breaks past the start position. |
| int endContext = i + 1; |
| int ci; |
| for (ci=0; ci<2; ci++) { // Number of items to include in error text. |
| for (;;) { |
| if (endContext >= testText.length()) {break;} |
| if (expectedBreaks[endContext-1]) { |
| if (count == 0) break; |
| count --; |
| } |
| endContext ++; |
| } |
| } |
| |
| // Format looks like "<data><>\uabcd\uabcd<>\U0001abcd...</data>" |
| StringBuffer errorText = new StringBuffer(); |
| errorText.append("<data>"); |
| |
| String hexChars = "0123456789abcdef"; |
| int c; // Char from test data |
| int bn; |
| for (ci = startContext; ci <= endContext && ci != -1; ci = nextCP(testText, ci)) { |
| if (ci == i) { |
| // This is the location of the error. |
| errorText.append("<?>"); |
| } else if (expectedBreaks[ci]) { |
| // This a non-error expected break position. |
| errorText.append("<>"); |
| } |
| if (ci < testText.length()) { |
| c = UTF16.charAt(testText, ci); |
| if (c < 0x10000) { |
| errorText.append("\\u"); |
| for (bn=12; bn>=0; bn-=4) { |
| errorText.append(hexChars.charAt((((int)c)>>bn)&0xf)); |
| } |
| } else { |
| errorText.append("\\U"); |
| for (bn=28; bn>=0; bn-=4) { |
| errorText.append(hexChars.charAt((((int)c)>>bn)&0xf)); |
| } |
| } |
| } |
| } |
| if (ci == testText.length() && ci != -1) { |
| errorText.append("<>"); |
| } |
| errorText.append("</data>\n"); |
| |
| // Output the error |
| errln(name + " break monkey test error. " + |
| (expectedBreaks[i]? "Break expected but not found." : "Break found but not expected.") + |
| "\nOperation = " + errorType + "; random seed = " + seed + "; buf Idx = " + i + "\n" + |
| errorText); |
| break; |
| } |
| } |
| |
| loopCount++; |
| } |
| } |
| |
| public void TestCharMonkey() { |
| |
| int loopCount = 500; |
| int seed = 1; |
| |
| if (params.inclusion >= 9) { |
| loopCount = 10000; |
| } |
| |
| RBBICharMonkey m = new RBBICharMonkey(); |
| BreakIterator bi = BreakIterator.getCharacterInstance(Locale.US); |
| RunMonkey(bi, m, "char", seed, loopCount); |
| } |
| |
| public void TestWordMonkey() { |
| |
| int loopCount = 500; |
| int seed = 1; |
| |
| if (params.inclusion >= 9) { |
| loopCount = 10000; |
| } |
| |
| logln("Word Break Monkey Test"); |
| RBBIWordMonkey m = new RBBIWordMonkey(); |
| BreakIterator bi = BreakIterator.getWordInstance(Locale.US); |
| RunMonkey(bi, m, "word", seed, loopCount); |
| } |
| |
| public void TestLineMonkey() { |
| |
| int loopCount = 500; |
| int seed = 1; |
| |
| if (params.inclusion >= 9) { |
| loopCount = 10000; |
| } |
| |
| logln("Line Break Monkey Test"); |
| RBBILineMonkey m = new RBBILineMonkey(); |
| BreakIterator bi = BreakIterator.getLineInstance(Locale.US); |
| if (params == null) { |
| loopCount = 50; |
| } |
| RunMonkey(bi, m, "line", seed, loopCount); |
| } |
| |
| } |
| |