blob: 1a2188679dd595248a8f1025d63fb1306c7aadab [file] [log] [blame]
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package com.ibm.icu.dev.test.rbbi;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.impl.UCharacterName;
import com.ibm.icu.impl.UCharacterNameChoice;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;
/**
* RBBI Monkey Test. Ported from ICU4C test/intltest/rbbimonkeytest.cpp.
* This is the newer, data driven monkey test. It is completely separate from the
* older class RBBITestMonkey.
*/
@RunWith(JUnit4.class)
public class RBBIMonkeyTest extends TestFmwk {
/**
 * A single named character class taken from the source break rules.
 * Plain data holder; all fields are assigned once by the constructor.
 */
static class CharClass {
    /** Name under which the class was defined. */
    String fName;
    /** Set definition as it appeared in user supplied rules. */
    String fOriginalDef;
    /** Set definition with any embedded named sets replaced by their defs, recursively. */
    String fExpandedDef;
    /** The set supplied to the constructor for this class. */
    UnicodeSet fSet;

    /**
     * @param name        the class name
     * @param originalDef the definition as written in the rule source
     * @param expandedDef the definition after expansion of embedded set references
     * @param set         the set for this class
     */
    CharClass(String name, String originalDef, String expandedDef, UnicodeSet set) {
        this.fSet = set;
        this.fExpandedDef = expandedDef;
        this.fOriginalDef = originalDef;
        this.fName = name;
    }
}
/**
 * Struct-like record of a single rule from a set of break rules.
 * The rule is kept both as written and with its set definitions expanded,
 * together with the regular expression matcher compiled from the expanded form.
 */
static class BreakRule {
    /** Name of the rule. */
    String fName;
    /** Rule expression, excluding the name, as written in user source. */
    String fRule;
    /** Rule expression after expanding the set definitions. */
    String fExpandedRule;
    /** Regular expression that matches the rule. */
    Matcher fRuleMatcher;
    /** True if rule begins with '^', meaning no chaining. Defaults to false. */
    boolean fInitialMatchOnly;
}
// class BreakRules represents a complete set of break rules, possibly tailored,
// compiled from testdata break rules.
static class BreakRules {
// Creates an empty rule set owned by monkeyImpl, and compiles the regular expressions
// that compileRules(), addCharClass() and addRule() use to parse a rule file.
// Each Matcher is created against an empty input and is reset() on the real text before use.
BreakRules(RBBIMonkeyImpl monkeyImpl) {
fMonkeyImpl = monkeyImpl;
fBreakRules = new ArrayList<>();
// Initial value only. createICUBreakIterator() has no case for KIND_TITLE, so the type
// is expected to be replaced by a "type = ..." keyword line via setKeywordParameter().
fType = BreakIterator.KIND_TITLE;
fCharClasses = new HashMap<>();
fCharClassList = new ArrayList<>();
fDictionarySet = new UnicodeSet();
// Match an alpha-numeric identifier in a rule. Will be a set name.
// Use negative look-behind to exclude non-identifiers, mostly property names or values.
// The look-behinds reject an identifier that follows '{', '=', "[:" (each optionally
// followed by up to four spaces or tabs), a backslash, or another identifier character.
fSetRefsMatcher = Pattern.compile(
"(?<!\\{[ \\t]{0,4})" +
"(?<!=[ \\t]{0,4})" +
"(?<!\\[:[ \\t]{0,4})" +
"(?<!\\\\)" +
"(?<![A-Za-z0-9_])" +
"([A-Za-z_][A-Za-z0-9_]*)"). // The char class name
matcher("");
// Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules.
// compileRules() applies this to one line at a time, so '^' and '$' are the line's start and end.
fCommentsMatcher = Pattern.compile("" +
"(^|(?<=;))" + // Start either at start of line, or just after a ';' (look-behind for ';')
"[ \\t]*+" + // Match white space.
"(#.*)?+" + // Optional # plus whatever follows
"$"). // new-line at end of line.
matcher("");
// Match (initial parse) of a character class definition line.
// Group 1 is the class name, group 2 the set expression.
fClassDefMatcher = Pattern.compile("" +
"[ \\t]*" + // leading white space
"([A-Za-z_][A-Za-z0-9_]*)" + // The char class name
"[ \\t]*=[ \\t]*" + // =
"(.*?)" + // The char class UnicodeSet expression
"[ \\t]*;$"). // ; <end of line>
matcher("");
// Match (initial parse) of a break rule line.
// Group 1 is the rule name (which, unlike a class name, may contain '.'), group 2 the rule body.
fRuleDefMatcher = Pattern.compile("" +
"[ \\t]*" + // leading white space
"([A-Za-z_][A-Za-z0-9_.]*)" + // The rule name
"[ \\t]*:[ \\t]*" + // :
"(.*?)" + // The rule definition
"[ \\t]*;$"). // ; <end of line>
matcher("");
// Match a property expression, either [:xxx:] or \p{...} (or the negated form \P{...}).
fPropertyMatcher = Pattern.compile("" +
"\\[:.*?:]|\\\\(?:p|P)\\{.*?\\}").
matcher("");
}
/**
 * Registers a character class under the given name.
 *
 * Any reference in the definition to a previously registered class is replaced by that
 * class's expanded definition; identifiers that are not registered class names are left
 * as written. The result is parsed as a UnicodeSet, and the pattern regenerated from that
 * set becomes the stored expanded definition.
 *
 * @param name       the class name; an existing class of the same name is replaced
 * @param definition the set expression as written in the rules
 * @return the newly created class, which has also been stored in fCharClasses
 * @throws IllegalArgumentException if the expanded definition is rejected by UnicodeSet
 */
CharClass addCharClass(String name, String definition) {
    // Splice the expansions of any referenced classes into the definition.
    StringBuffer spliced = new StringBuffer();
    Matcher refs = fSetRefsMatcher.reset(definition);
    while (refs.find()) {
        String refName = refs.group(/*"ClassName"*/ 1);
        CharClass known = fCharClasses.get(refName);
        // Copy the text before the reference, then add the expansion verbatim.
        refs.appendReplacement(spliced, "");
        if (known == null) {
            spliced.append(refName);
        } else {
            spliced.append(known.fExpandedDef);
        }
    }
    refs.appendTail(spliced);
    String splicedDef = spliced.toString();

    final boolean dumping = fMonkeyImpl.fDumpExpansions;
    if (dumping) {
        System.out.printf("addCharClass(\"%s\"\n", name);
        System.out.printf(" %s\n", definition);
        System.out.printf("expandedDef: %s\n", splicedDef);
    }

    // Building the set doubles as validation of the expanded definition.
    final UnicodeSet parsedSet;
    try {
        parsedSet = new UnicodeSet(splicedDef, UnicodeSet.IGNORE_SPACE);
    } catch (IllegalArgumentException e) {
        System.err.printf("%s: error %s creating UnicodeSet %s", fMonkeyImpl.fRuleFileName, e.toString(), name);
        throw e;
    }

    // Regenerate an equivalent pattern from the set itself.
    // This removes set difference operators, which would fail if passed through to Java regex.
    StringBuffer regenerated = new StringBuffer();
    parsedSet._generatePattern(regenerated, true);
    String finalDef = regenerated.toString();
    if (dumping) {
        System.out.printf("expandedDef2: %s\n", finalDef);
    }

    CharClass created = new CharClass(name, definition, finalDef, parsedSet);
    // TODO: decide whether or not to allow redefinitions. Can be convenient in some cases.
    // For now a redefinition silently replaces the earlier class.
    fCharClasses.put(name, created);
    return created;
}
/**
 * Compiles one reference break rule into a Java regular expression and appends it
 * to fBreakRules.
 *
 * The rule text goes through these steps, in order: set names are replaced by their
 * expanded definitions; property expressions are replaced by explicit sets generated
 * by UnicodeSet; a leading '^' is stripped and recorded as fInitialMatchOnly; negated
 * sets with nested sets are flattened; the single permitted '÷' becomes an empty capture
 * group; empty sets, Unicode escapes and '#' are rewritten into forms Java regex accepts.
 *
 * @param name       the rule name
 * @param definition the rule expression as written in the rule source, without the name
 * @throws IllegalArgumentException if the rule contains more than one '÷'
 * @throws PatternSyntaxException   if the resulting expression is not a valid Java regex
 */
void addRule(String name, String definition) {
    BreakRule thisRule = new BreakRule();
    thisRule.fName = name;
    thisRule.fRule = definition;

    // Expand the char class definitions within the rule.
    StringBuffer expandedDefsRule = new StringBuffer();
    fSetRefsMatcher.reset(definition);
    while (fSetRefsMatcher.find()) {
        String sname = fSetRefsMatcher.group(/*"ClassName"*/ 1);
        CharClass nameClass = fCharClasses.get(sname);
        if (nameClass == null) {
            System.err.printf("char class \"%s\" unrecognized in rule \"%s\"\n", sname, definition);
        }
        String expansionForName = nameClass != null ? nameClass.fExpandedDef : sname;
        // The expansion is appended directly, not passed as the replacement string,
        // because appendReplacement() gives '\' and '$' special meaning.
        fSetRefsMatcher.appendReplacement(expandedDefsRule, "");
        expandedDefsRule.append(expansionForName);
    }
    fSetRefsMatcher.appendTail(expandedDefsRule);

    // Replace any property expressions, \p{...} or [:...:] with an equivalent expansion,
    // obtained from ICU UnicodeSet. Need to do this substitution because Java regex
    // does not recognize all properties, and because Java's definitions are likely
    // older than ICU's.
    StringBuffer expandedRule = new StringBuffer();
    fPropertyMatcher.reset(expandedDefsRule);
    while (fPropertyMatcher.find()) {
        String prop = fPropertyMatcher.group();
        UnicodeSet propSet = new UnicodeSet("[" + prop + "]");
        StringBuffer propExpansion = new StringBuffer();
        propSet._generatePattern(propExpansion, true);
        // As above: the generated pattern contains backslash escapes, which would be
        // stripped if it were handed to appendReplacement() as the replacement string.
        fPropertyMatcher.appendReplacement(expandedRule, "");
        expandedRule.append(propExpansion);
    }
    fPropertyMatcher.appendTail(expandedRule);

    // If rule begins with a '^' rule chaining is disallowed.
    // Strip off the '^' from the rule expression, and set the flag.
    // The length check keeps an empty rule body from throwing here.
    if (expandedRule.length() > 0 && expandedRule.charAt(0) == '^') {
        thisRule.fInitialMatchOnly = true;
        expandedRule.deleteCharAt(0);
        expandedRule = new StringBuffer(expandedRule.toString().trim());
    }

    // Replace any [^negated sets] with equivalent flattened sets generated by
    // ICU UnicodeSet. [^ ...] in Java Regex character classes does not apply
    // to any nested classes. Variable substitution in rules produces
    // nested sets that [^negation] needs to apply to.
    StringBuffer ruleWithFlattenedSets = new StringBuffer();
    int idx = 0;
    while (idx < expandedRule.length()) {
        int setOpenPos = expandedRule.indexOf("[^", idx);
        if (setOpenPos < 0) {
            break;
        }
        if (setOpenPos > idx) {
            // Move anything from the source rule preceding the [^ into the processed rule, unchanged.
            ruleWithFlattenedSets.append(expandedRule.substring(idx, setOpenPos));
        }
        // Scan forward for the ']' that closes this set, tracking nested '[' ... ']' pairs.
        int nestingLevel = 1;
        boolean haveNesting = false;
        int setClosePos;
        for (setClosePos = setOpenPos + 2; nestingLevel > 0 && setClosePos < expandedRule.length(); ++setClosePos) {
            char c = expandedRule.charAt(setClosePos);
            if (c == '\\') {
                // Skip the escaped character so an escaped bracket is not counted.
                ++setClosePos;
            } else if (c == '[') {
                ++nestingLevel;
                haveNesting = true;
            } else if (c == ']') {
                --nestingLevel;
            }
        }
        if (haveNesting && nestingLevel == 0) {
            // Found one, a negated set that includes interior nested sets.
            // Create an ICU UnicodeSet from the source pattern, and obtain an
            // equivalent flattened pattern from that.
            UnicodeSet uset = new UnicodeSet(expandedRule.substring(setOpenPos, setClosePos), true);
            uset._generatePattern(ruleWithFlattenedSets, true);
        } else {
            // The [^ set definition did not include any nested sets.
            // Copy the original definition without change.
            // Java regular expressions will handle it without needing to recast it.
            if (nestingLevel > 0) {
                // Error case of an unclosed character class expression.
                // Java regex will also eventually flag the error.
                System.err.printf("No closing ] found in rule %s\n", name);
            }
            ruleWithFlattenedSets.append(expandedRule.substring(setOpenPos, setClosePos));
        }
        idx = setClosePos;
    }
    if (idx < expandedRule.length()) {
        ruleWithFlattenedSets.append(expandedRule.substring(idx, expandedRule.length()));
    }
    String regex = ruleWithFlattenedSets.toString();

    // Replace the divide sign (\u00f7) with an empty capturing group.
    // When running the rules, a match that includes this group means we found a break position.
    // Only one divide sign is allowed per rule; the check must be made before the sign is
    // replaced, otherwise there is nothing left to detect.
    int dividePos = regex.indexOf("÷");
    if (dividePos >= 0) {
        if (regex.indexOf("÷", dividePos + 1) >= 0) {
            String msg = String.format("%s Rule %s contains multiple ÷ signs", fMonkeyImpl.fRuleFileName, name);
            System.err.println(msg);
            throw new IllegalArgumentException(msg);
        }
        regex = regex.substring(0, dividePos) + "()" + regex.substring(dividePos + 1);
    }

    // UAX break rule set definitions can be empty, just [].
    // Regular expression set expressions don't accept this. Substitute with [a&&[^a]], which
    // also matches nothing.
    regex = regex.replace("[]", "[a&&[^a]]");

    // Change Unicode escape syntax for compatibility with Java regular expressions
    // \udddd => \x{dddd}
    // \U00hhhhhh => \x{hhhhhh}
    regex = regex.replaceAll("\\\\u([0-9A-Fa-f]{4})", "\\\\x{$1}");
    regex = regex.replaceAll("\\\\U00([0-9A-Fa-f]{6})", "\\\\x{$1}");

    // Escape any literal '#' in the rule expression. Without escaping, these introduce a comment.
    // UnicodeSet._generatePattern() inserts un-escaped "#"s
    regex = regex.replace("#", "\\#");

    thisRule.fExpandedRule = regex;
    if (fMonkeyImpl.fDumpExpansions) {
        System.out.printf("fExpandedRule: %s\n", thisRule.fExpandedRule);
    }

    // Compile a regular expression for this rule.
    try {
        thisRule.fRuleMatcher = Pattern.compile(thisRule.fExpandedRule, Pattern.COMMENTS | Pattern.DOTALL).matcher("");
    } catch (PatternSyntaxException e) {
        System.err.printf("%s: Error creating regular expression for rule %s. Expansion is \n\"%s\"",
                fMonkeyImpl.fRuleFileName, name, thisRule.fExpandedRule);
        throw e;
    }

    // Put this new rule into the list of all rules.
    fBreakRules.add(thisRule);
}
/**
 * Converts a hexadecimal code point value, e.g. "1F600", to the String containing
 * that single code point.
 *
 * @param hex the code point value in hexadecimal, without any prefix
 * @return a String of one or two chars holding the code point
 * @throws NumberFormatException    if hex is not a parsable hexadecimal int
 * @throws IllegalArgumentException if the value is not a valid Unicode code point
 */
@SuppressWarnings("unused")
private static String hexToCodePoint(String hex) {
    char[] utf16 = Character.toChars(Integer.parseInt(hex, 16));
    return new String(utf16);
}
/**
 * Handles a "keyword = value" line from the rule file.
 *
 * Recognized keywords are "locale", which sets fLocale, and "type", which sets fType
 * from one of "grapheme", "word", "line" or "sentence".
 *
 * @param keyword the identifier to the left of the '='
 * @param value   the text to the right of the '='
 * @return true if keyword was recognized and consumed, false if it is not a keyword
 * @throws IllegalArgumentException if keyword is "type" and value is not a known break type
 */
boolean setKeywordParameter(String keyword, String value) {
    switch (keyword) {
    case "locale":
        fLocale = new ULocale(value);
        return true;

    case "type":
        switch (value) {
        case "grapheme":
            fType = BreakIterator.KIND_CHARACTER;
            break;
        case "word":
            fType = BreakIterator.KIND_WORD;
            break;
        case "line":
            fType = BreakIterator.KIND_LINE;
            break;
        case "sentence":
            fType = BreakIterator.KIND_SENTENCE;
            break;
        default:
            String msg = String.format("%s: Unrecognized break type %s", fMonkeyImpl.fRuleFileName, value);
            System.err.println(msg);
            throw new IllegalArgumentException(msg);
        }
        return true;

    default:
        // Not a keyword; the caller treats the line as a character class definition.
        return false;
    }
}
/**
 * Creates the ICU break iterator under test, of the kind given by fType and for the
 * locale given by fLocale.
 *
 * @return the iterator obtained from the BreakIterator factory, cast to RuleBasedBreakIterator
 * @throws IllegalArgumentException if fType is not one of the character, word, line or
 *         sentence kinds
 */
RuleBasedBreakIterator createICUBreakIterator() {
    final BreakIterator iterator;
    if (fType == BreakIterator.KIND_CHARACTER) {
        iterator = BreakIterator.getCharacterInstance(fLocale);
    } else if (fType == BreakIterator.KIND_WORD) {
        iterator = BreakIterator.getWordInstance(fLocale);
    } else if (fType == BreakIterator.KIND_LINE) {
        iterator = BreakIterator.getLineInstance(fLocale);
    } else if (fType == BreakIterator.KIND_SENTENCE) {
        iterator = BreakIterator.getSentenceInstance(fLocale);
    } else {
        String msg = String.format("%s: Bad break iterator type of %d", fMonkeyImpl.fRuleFileName, fType);
        System.err.println(msg);
        throw new IllegalArgumentException(msg);
    }
    return (RuleBasedBreakIterator) iterator;
}
// Parses the complete text of a rule file, one line at a time.
//
// After comments are stripped, each non-empty line must be one of
//   name = value;      a keyword setting ("locale", "type") or a character class definition
//   name : rule;       a break rule
// Any other line is an error.
//
// When all lines are processed, fCharClassList is filled from fCharClasses and an extra
// class "__Others" is added for the code points not covered by any defined class.
//
// @param rules the contents of the rule file
// @throws IllegalArgumentException for an unrecognized line, or as thrown by
//         setKeywordParameter(), addCharClass() or addRule()
void compileRules(String rules) {
int lineNumber = 0;
for (String line: rules.split("\\r?\\n")) {
++lineNumber;
// Strip comment lines.
// Only the first match is removed: a whole-line comment or blank line, or a comment
// following a ';'.
fCommentsMatcher.reset(line);
line = fCommentsMatcher.replaceFirst("");
if (line.isEmpty()) {
continue;
}
// Recognize character class definition and keyword lines
fClassDefMatcher.reset(line);
if (fClassDefMatcher.matches()) {
String className = fClassDefMatcher.group(/*"ClassName"*/ 1);
String classDef = fClassDefMatcher.group(/*"ClassDef"*/ 2);
if (fMonkeyImpl.fDumpExpansions) {
System.out.printf("scanned class: %s = %s\n", className, classDef);
}
if (setKeywordParameter(className, classDef)) {
// The scanned item was "type = ..." or "locale = ...", etc.
// which are not actual character classes.
continue;
}
addCharClass(className, classDef);
continue;
}
// Recognize rule lines.
fRuleDefMatcher.reset(line);
if (fRuleDefMatcher.matches()) {
String ruleName = fRuleDefMatcher.group(/*"RuleName"*/ 1);
String ruleDef = fRuleDefMatcher.group(/*"RuleDef"*/ 2);
if (fMonkeyImpl.fDumpExpansions) {
System.out.printf("scanned rule: %s : %s\n", ruleName, ruleDef);
}
addRule(ruleName, ruleDef);
continue;
}
// Neither a class definition, a keyword nor a rule.
String msg = String.format("Unrecognized line in rule file %s:%d \"%s\"",
fMonkeyImpl.fRuleFileName, lineNumber, line);
System.err.println(msg);
throw new IllegalArgumentException(msg);
}
// Build the list of char classes, omitting the dictionary class if there is one.
// This will be used when constructing the random text to be tested.
// Also compute the "other" set, consisting of any characters not included in
// one or more of the user defined sets.
// The list order is the iteration order of the fCharClasses map; MonkeyTestData.set()
// picks classes from the list by index.
UnicodeSet otherSet = new UnicodeSet(0, 0x10ffff);
for (Map.Entry<String, CharClass> el: fCharClasses.entrySet()) {
String ccName = el.getKey();
CharClass cclass = el.getValue();
// System.out.printf(" Adding %s\n", ccName);
if (!ccName.equals(cclass.fName)) {
throw new IllegalArgumentException(
String.format("%s: internal error, set names (%s, %s) inconsistent.\n",
fMonkeyImpl.fRuleFileName, ccName, cclass.fName));
}
// The dictionary class also counts as "covered" for the purpose of the other set.
otherSet.removeAll(cclass.fSet);
if (ccName.equals("dictionary")) {
fDictionarySet = cclass.fSet;
} else {
fCharClassList.add(cclass);
}
}
if (!otherSet.isEmpty()) {
// System.out.printf("have an other set.\n");
// Done after the loop above has finished, since addCharClass() modifies fCharClasses.
CharClass cclass = addCharClass("__Others", otherSet.toPattern(true));
fCharClassList.add(cclass);
}
};
/**
 * Finds the character class that a code point belongs to.
 *
 * @param c a code point
 * @return the first class in fCharClassList whose set contains c, or null if there is none
 */
CharClass getClassForChar(int c) {
    final int classCount = fCharClassList.size();
    for (int i = 0; i < classCount; i++) {
        CharClass candidate = fCharClassList.get(i);
        if (candidate.fSet.contains(c)) {
            return candidate;
        }
    }
    return null;
}
RBBIMonkeyImpl fMonkeyImpl; // Reference back to the owning RBBIMonkeyImpl instance.
List<BreakRule> fBreakRules; // The compiled rules, in the order they were added by addRule().
Map<String, CharClass> fCharClasses; // Key is the set name.
// Value is the corresponding CharClass.
List<CharClass> fCharClassList; // Char classes, same contents as fCharClasses values,
// except that the class named "dictionary" is omitted. Filled in by compileRules().
UnicodeSet fDictionarySet; // Dictionary set, empty if none is defined.
ULocale fLocale; // Set by a "locale = ..." line in the rules; not initialized by the constructor.
int fType; // BreakIterator.KIND_WORD, etc.
// Pre-compiled matchers used to parse the rule source; see the constructor for the patterns.
Matcher fSetRefsMatcher;
Matcher fCommentsMatcher;
Matcher fClassDefMatcher;
Matcher fRuleDefMatcher;
Matcher fPropertyMatcher;
};
// class MonkeyTestData represents a randomly synthesized test data string together
// with the expected break positions obtained by applying
// the test break rules.
static class MonkeyTestData{
// Generates a fresh random test string from the character classes of the given rules,
// then applies the reference rules to it to compute the expected break positions.
//
// On return fString, fExpectedBreaks, fRuleForPosition and f2ndRuleForPos describe the new
// data, and fActualBreaks is a new array with all entries false.
//
// @param rules the compiled reference rules; rules.fCharClassList must not be empty
// @param rand  source of random numbers; its seed on entry is saved in fRandomSeed
// @throws IllegalArgumentException if no rule matches at some position, or if the
//         matching rule matched zero characters or failed to make progress
void set(BreakRules rules, ICU_Rand rand) {
int dataLength = 1000; // length of test data to generate, in code points.
// Fill the test string with random characters.
// First randomly pick a char class, then randomly pick a character from that class.
// Exclude any characters from the dictionary set.
// System.out.println("Populating Test Data");
fRandomSeed = rand.getSeed(); // Save initial seed for use in error messages,
// allowing recreation of failing data.
fBkRules = rules;
StringBuilder newString = new StringBuilder();
// n counts accepted code points only; rejected picks are simply retried.
for (int n=0; n<dataLength;) {
int charClassIndex = rand.next() % rules.fCharClassList.size();
CharClass cclass = rules.fCharClassList.get(charClassIndex);
if (cclass.fSet.size() == 0) {
// Some rules or tailorings do end up with empty char classes.
continue;
}
int charIndex = rand.next() % cclass.fSet.size();
int c = cclass.fSet.charAt(charIndex);
if (/*Character.isBmpCodePoint(c)*/ c<=0x0ffff && Character.isLowSurrogate((char)c) &&
newString.length() > 0 && Character.isHighSurrogate(newString.charAt(newString.length()-1))) {
// Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
// Don't let random unpaired surrogates combine in the test data because they might
// produce an unwanted dictionary character.
continue;
}
if (!rules.fDictionarySet.contains(c)) {
newString.appendCodePoint(c);
++n;
}
}
fString = newString.toString();
// Init the expectedBreaks, actualBreaks and ruleForPosition.
// Expected and Actual breaks are one longer than the input string; a true value
// will indicate a boundary preceding that position.
// All of these arrays are indexed by char (UTF-16 code unit) offset into fString.
fActualBreaks = new boolean[fString.length()+1];
fExpectedBreaks = new boolean[fString.length()+1];
fRuleForPosition = new int[fString.length()+1];
f2ndRuleForPos = new int[fString.length()+1];
// Apply reference rules to find the expected breaks.
fExpectedBreaks[0] = true; // Force an expected break before the start of the text.
// ICU always reports a break there.
// The reference rules do not have a means to do so.
int strIdx = 0;
boolean initialMatch = true; // True at start of text, and immediately after each boundary,
// for control over rule chaining.
while (strIdx < fString.length()) {
BreakRule matchingRule = null;
boolean hasBreak = false;
int ruleNum = 0;
int matchStart = 0;
int matchEnd = 0;
// Try the rules in order; the first acceptable match wins.
// ruleNum is left at the index of the winning rule when the loop exits via break.
for (ruleNum=0; ruleNum<rules.fBreakRules.size(); ruleNum++) {
BreakRule rule = rules.fBreakRules.get(ruleNum);
if (rule.fInitialMatchOnly && !initialMatch) {
// Skip checking this '^' rule. (No rule chaining)
continue;
}
// The matcher is given only the text from strIdx on, so the offsets it reports are
// relative to strIdx.
rule.fRuleMatcher.reset(fString.substring(strIdx));
if (rule.fRuleMatcher.lookingAt()) {
// A candidate rule match, check further to see if we take it or continue to check other rules.
// Matches of zero or one code point count only if they also specify a break.
matchStart = strIdx;
matchEnd = strIdx + rule.fRuleMatcher.end();
hasBreak = BreakGroupStart(rule.fRuleMatcher) >= 0;
if (hasBreak ||
(matchStart < fString.length() && fString.offsetByCodePoints(matchStart, 1) < matchEnd)) {
matchingRule = rule;
break;
}
}
}
if (matchingRule == null) {
// No reference rule matched. This is an error in the rules that should never happen.
String msg = String.format("%s: No reference rules matched at position %d. ",
rules.fMonkeyImpl.fRuleFileName, strIdx);
System.err.println(msg);
dump(strIdx);
throw new IllegalArgumentException(msg);
}
if (matchingRule.fRuleMatcher.group().length() == 0) {
// Zero length rule match. This is also an error in the rule expressions.
String msg = String.format("%s:%s: Zero length rule match at %d.",
rules.fMonkeyImpl.fRuleFileName, matchingRule.fName, strIdx);
System.err.println(msg);
dump(strIdx);
throw new IllegalArgumentException(msg);
}
// Record which rule matched over the length of the match.
// NOTE(review): 0 serves as "no rule recorded yet", so a position first matched by the
// rule at index 0 is indistinguishable from an unset position.
for (int i = matchStart; i < matchEnd; i++) {
if (fRuleForPosition[i] == 0) {
fRuleForPosition[i] = ruleNum;
} else {
f2ndRuleForPos[i] = ruleNum;
}
}
// Break positions appear in rules as a zero length capture group at the break position;
// addRule() replaces the divide sign in the rule with "()".
if (hasBreak) {
int breakPos = strIdx + BreakGroupStart(matchingRule.fRuleMatcher);
fExpectedBreaks[breakPos] = true;
// System.out.printf("recording break at %d\n", breakPos);
// For the next iteration, pick up applying rules immediately after the break,
// which may differ from end of the match. The matching rule may have included
// context following the boundary that needs to be looked at again.
strIdx = breakPos;
initialMatch = true;
} else {
// Original rule didn't specify a break.
// Continue applying rules starting on the last code point of this match.
int updatedStrIdx = fString.offsetByCodePoints(matchEnd, -1);
if (updatedStrIdx == matchStart) {
// Match was only one code point, no progress if we continue.
// Shouldn't get here, case is filtered out at top of loop.
throw new IllegalArgumentException(String.format("%s: Rule %s internal error.",
rules.fMonkeyImpl.fRuleFileName, matchingRule.fName));
}
strIdx = updatedStrIdx;
initialMatch = false;
}
}
};
/**
 * Finds where the break position capture group matched.
 *
 * The break position is taken to be the first capturing group that took part in the
 * match and captured the empty string; this assumes that the only zero-length capturing
 * group in a reference rule expression is the one that corresponds to a "÷".
 *
 * TODO: use a named capture group, m.start("BreakPosition"), instead of this convention.
 *
 * @param m a Java regex Matcher that has completed a matching operation.
 * @return the start index of that group, or -1 if there is no such group
 *         or the group did not participate in the match.
 */
static int BreakGroupStart(Matcher m) {
    final int groupTotal = m.groupCount();
    int groupNum = 1;
    while (groupNum <= groupTotal) {
        String captured = m.group(groupNum);
        // A null capture means the group did not take part in the match.
        if (captured != null && captured.isEmpty()) {
            return m.start(groupNum);
        }
        ++groupNum;
    }
    return -1;
}
/**
 * Prints a table of the test data to System.out, one row per code point, showing the
 * position, code point, character class, expected and actual break flags, the rules
 * recorded for the position, and the character name.
 *
 * @param around -1 to print the entire test string; otherwise a string index, and
 *               up to 30 code points on either side of it are printed
 */
void dump(int around) {
    System.out.print("\n"
            + " char break Rule Character\n"
            + " pos code class R I name name\n"
            + "---------------------------------------------------------------------------------------------\n");

    // Default to the whole string; narrow to a window when a position is given.
    int start = 0;
    int end = fString.length();
    if (around != -1) {
        // Display context around a failure.
        // offsetByCodePoints throws when the window would run off either end of the text.
        try {
            start = fString.offsetByCodePoints(around, -30);
        } catch (Exception e) {
            start = 0;
        }
        try {
            end = fString.offsetByCodePoints(around, +30);
        } catch (Exception e) {
            end = fString.length();
        }
    }

    int charIdx = start;
    while (charIdx < end) {
        int c = fString.codePointAt(charIdx);
        CharClass cc = fBkRules.getClassForChar(c);
        BreakRule rule = fBkRules.fBreakRules.get(fRuleForPosition[charIdx]);
        int secondRuleIndex = f2ndRuleForPos[charIdx];
        String secondRuleName = secondRuleIndex > 0
                ? fBkRules.fBreakRules.get(secondRuleIndex).fName
                : "";
        String cName = UCharacterName.INSTANCE.getName(c, UCharacterNameChoice.EXTENDED_CHAR_NAME);
        char expectedMark = fExpectedBreaks[charIdx] ? '*' : '.';
        char actualMark = fActualBreaks[charIdx] ? '*' : '.';
        System.out.printf(" %4d %6x %-20s %c %c %-10s %-10s %s\n",
                charIdx, c, cc.fName, expectedMark, actualMark, rule.fName, secondRuleName, cName);
        charIdx = fString.offsetByCodePoints(charIdx, 1);
    }
}
/** Resets every entry of fActualBreaks to false, ready for the next test pass. */
void clearActualBreaks() {
    final boolean[] breaks = fActualBreaks;
    Arrays.fill(breaks, 0, breaks.length, false);
}
int fRandomSeed; // The seed of the random number generator at the start of set().
BreakRules fBkRules; // The break rules used to generate this data.
String fString; // The text.
boolean fExpectedBreaks[]; // Breaks as found by the reference rules.
// Parallel to fString, one element longer. true if break preceding.
boolean fActualBreaks[]; // Breaks as found by ICU break iterator. Same layout as fExpectedBreaks.
int fRuleForPosition[]; // Index into BreakRules.fBreakRules of rule that applied at each position.
// Also parallel to fString.
int f2ndRuleForPos[]; // As above. A 2nd rule applies when the preceding rule
// didn't cause a break, and a subsequent rule match starts
// on the last code point of the preceding match.
}
// class RBBIMonkeyImpl holds (some indirectly) everything associated with running a monkey
// test for one set of break rules.
//
static class RBBIMonkeyImpl extends Thread {
// Prepares this instance for running: reads the rule file, compiles the reference rules,
// creates the ICU break iterator to be tested, and allocates the test data holder.
// fDumpExpansions must be set before calling, since it is consulted while compiling.
//
// @param ruleFile name of the rule file, relative to the break_rules resource directory
void setup(String ruleFile) {
fRuleFileName = ruleFile;
// Leaves the file contents in fRuleCharBuffer.
openBreakRules(ruleFile);
fRuleSet = new BreakRules(this);
fRuleSet.compileRules(fRuleCharBuffer);
// The iterator kind and locale come from the "type" and "locale" lines of the rule file.
fBI = fRuleSet.createICUBreakIterator();
fTestData = new MonkeyTestData();
};
/**
 * Reads the named rule file, as UTF-8, into fRuleCharBuffer.
 *
 * The file is loaded as a class resource from the "break_rules/" directory relative to
 * this class. A byte order mark at the very start of the file is discarded.
 *
 * If the resource cannot be found the problem is reported with errln() and
 * fRuleCharBuffer is left unchanged. If reading fails part way through, the error is
 * reported with errln() and fRuleCharBuffer receives whatever was read up to that point.
 *
 * @param fileName name of the rule file, e.g. "line.txt"
 */
void openBreakRules(String fileName) {
    StringBuilder testFileBuf = new StringBuilder();
    String filePath = "break_rules/" + fileName;
    // try-with-resources closes the stream and the reader on every path, exactly once.
    // A null resource is permitted and is simply not closed.
    try (InputStream is = RBBIMonkeyImpl.class.getResourceAsStream(filePath)) {
        if (is == null) {
            errln("Could not open test data file " + fileName);
            return;
        }
        try (InputStreamReader isr = new InputStreamReader(is, "UTF-8")) {
            boolean atStart = true;
            int c;
            while ((c = isr.read()) >= 0) {
                if (atStart) {
                    atStart = false;
                    if (c == 0xFEFF) {
                        // BOM in the test data file. Discard it.
                        continue;
                    }
                }
                // read() returns one UTF-16 code unit at a time.
                testFileBuf.append((char) c);
            }
        }
    } catch (IOException e) {
        errln(e.toString());
    }
    fRuleCharBuffer = testFileBuf.toString(); /* the file as a String */
}
/**
 * Thrown by the individual test functions when the break iterator under test
 * disagrees with the reference rules or otherwise misbehaves.
 */
class MonkeyException extends RuntimeException {
    private static final long serialVersionUID = 1L;

    /** Position of the failure in the test data. */
    public int fPosition;

    /**
     * @param description what went wrong; becomes the exception message
     * @param pos         index into the test string at which the failure was detected
     */
    MonkeyException(String description, int pos) {
        super(description);
        this.fPosition = pos;
    }
}
/**
 * Thread body: repeatedly generates random test data and checks the break iterator
 * against it in each direction.
 *
 * Runs fLoopCount iterations, or forever if fLoopCount is negative. Every failure is
 * appended to fErrorMsgs, which the caller inspects after join(). The loop gives up after
 * more than ten break iterator failures, or immediately on any other runtime exception.
 */
@Override
public void run() {
    int errorCount = 0;
    if (fBI == null) {
        fErrorMsgs.append("Unable to run test because fBI is null.\n");
        return;
    }
    for (long loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) {
        try {
            fTestData.set(fRuleSet, fRandomGenerator);
            // fTestData.dump(-1);
            testForwards();
            testPrevious();
            testFollowing();
            testPreceding();
            testIsBoundary();
        } catch (MonkeyException e) {
            String formattedMsg = String.format(
                    "%s at index %d. VM Arguments to reproduce: -Drules=%s -Dseed=%d -Dloop=1 -Dverbose=1 \"\n",
                    e.getMessage(), e.fPosition, fRuleFileName, fTestData.fRandomSeed);
            System.err.print(formattedMsg);
            if (fVerbose) {
                fTestData.dump(e.fPosition);
            }
            fErrorMsgs.append(formattedMsg);
            if (++errorCount > 10) {
                return;
            }
        } catch (RuntimeException e) {
            // Anything else (a problem in the reference rules, an exception from the break
            // iterator, ...) must be recorded too. Left uncaught it would only terminate
            // this thread, fErrorMsgs would stay empty, and the test would pass.
            String formattedMsg = String.format(
                    "%s: unexpected %s. VM Arguments to reproduce: -Drules=%s -Dseed=%d -Dloop=1 -Dverbose=1\n",
                    fRuleFileName, e, fRuleFileName, fTestData.fRandomSeed);
            System.err.print(formattedMsg);
            e.printStackTrace();
            fErrorMsgs.append(formattedMsg);
            return;
        }
        if (fLoopCount < 0 && loopCount % 100 == 0) {
            // Progress indicator when running forever.
            System.err.print(".");
        }
    }
}
/** Order in which checkResults() compares expected and actual break positions. */
enum CheckDirection { FORWARD, REVERSE }
/**
 * Iterates over the test string with first() / next(), records each boundary reported
 * in fActualBreaks, then compares against the expected breaks.
 *
 * @throws MonkeyException if the iterator fails to advance, reports a boundary outside
 *         the text, or the boundaries differ from those expected
 */
void testForwards() {
    fTestData.clearActualBreaks();
    fBI.setText(fTestData.fString);
    int previousBreak = -2;
    for (int bk = fBI.first(); bk != BreakIterator.DONE; bk = fBI.next()) {
        if (bk <= previousBreak) {
            throw new MonkeyException("Break Iterator Stall", bk);
        }
        if (bk < 0 || bk > fTestData.fString.length()) {
            throw new MonkeyException("Boundary out of bounds", bk);
        }
        fTestData.fActualBreaks[bk] = true;
        // Remember this boundary; without it the stall check above can never fire
        // and a stalled iterator would loop forever.
        previousBreak = bk;
    }
    checkResults("testForwards", CheckDirection.FORWARD);
}
// Calls following(i) for every index i from -1 up to, but not including, the length of the
// test string. Each distinct boundary returned is recorded in fActualBreaks, and the
// result is then compared against the expected breaks.
//
// nextBreak is the most recently discovered boundary. For each i, following(i) must either
// return that same boundary (i lies before it), or, when i has reached nextBreak, a new
// boundary further on. Anything else is reported as a failure at i.
void testFollowing() {
fTestData.clearActualBreaks();
fBI.setText(fTestData.fString);
int nextBreak = -1;
for (int i=-1 ; i<fTestData.fString.length(); ++i) {
int bk = fBI.following(i);
// NOTE(review): the loop stops before i reaches the string length, so the second half of
// this condition is never true and following(length) is never exercised. Confirm
// whether the loop bound was meant to be inclusive.
if (bk == BreakIterator.DONE && i == fTestData.fString.length()) {
continue;
}
if (bk == nextBreak && bk > i) {
// i is in the gap between two breaks.
continue;
}
if (i == nextBreak && bk > nextBreak) {
// i is sitting on the previous boundary; bk is the next one along.
fTestData.fActualBreaks[bk] = true;
nextBreak = bk;
continue;
}
throw new MonkeyException("following(i)", i);
}
checkResults("testFollowing", CheckDirection.FORWARD);
};
/**
 * Iterates backwards over the test string with last() / previous(), records each
 * boundary reported in fActualBreaks, then compares against the expected breaks.
 *
 * @throws MonkeyException if the iterator fails to move backwards, reports a boundary
 *         outside the text, or the boundaries differ from those expected
 */
void testPrevious() {
    fTestData.clearActualBreaks();
    fBI.setText(fTestData.fString);
    int previousBreak = Integer.MAX_VALUE;
    for (int bk = fBI.last(); bk != BreakIterator.DONE; bk = fBI.previous()) {
        if (bk >= previousBreak) {
            throw new MonkeyException("Break Iterator Stall", bk);
        }
        if (bk < 0 || bk > fTestData.fString.length()) {
            throw new MonkeyException("Boundary out of bounds", bk);
        }
        fTestData.fActualBreaks[bk] = true;
        // Remember this boundary; without it the stall check above can never fire
        // and a stalled iterator would loop forever.
        previousBreak = bk;
    }
    checkResults("testPrevious", CheckDirection.REVERSE);
}
/**
 * Moves an index that points at the trail surrogate of a surrogate pair back to the
 * lead surrogate, which is the start of the code point. Any other index, including
 * one that is out of range for the string, is returned unchanged.
 *
 * @param s the String.
 * @param i the initial index
 * @return the adjusted index
 */
private int getChar32Start(String s, int i) {
    boolean hasCharOnBothSides = i > 0 && i < s.length();
    if (!hasCharOnBothSides) {
        return i;
    }
    boolean splitsPair = Character.isLowSurrogate(s.charAt(i)) && Character.isHighSurrogate(s.charAt(i - 1));
    return splitsPair ? i - 1 : i;
}
// Calls preceding(i) for every index i from one past the end of the test string down to 0.
// Each distinct boundary returned is recorded in fActualBreaks, and the result is then
// compared against the expected breaks in reverse order.
//
// nextBreak is the most recently discovered boundary (initially one past the end, so
// that the first call is treated as sitting on a boundary). For each i, preceding(i) must
// either return that same boundary (i lies after it), or, when i has reached nextBreak,
// a new boundary further back. Anything else is reported as a failure at i.
void testPreceding() {
fTestData.clearActualBreaks();
fBI.setText(fTestData.fString);
int nextBreak = fTestData.fString.length()+1;
for (int i=fTestData.fString.length()+1 ; i>=0; --i) {
int bk = fBI.preceding(i);
// System.err.printf("testPreceding() i:%d bk:%d nextBreak:%d\n", i, bk, nextBreak);
if (bk == BreakIterator.DONE && i == 0) {
// Nothing precedes the start of the text.
continue;
}
if (bk == nextBreak && bk < i) {
// i is in the gap between two breaks.
continue;
}
if (i<fTestData.fString.length() && getChar32Start(fTestData.fString, i) < i) {
// i indexes to a trailing surrogate.
// Break Iterators treat an index to either half as referring to the supplemental code point,
// with preceding going to some preceding code point.
// Only consistency between the two halves is checked here; no boundary is recorded.
if (fBI.preceding(i) != fBI.preceding(getChar32Start(fTestData.fString, i))) {
throw new MonkeyException("preceding of trailing surrogate error", i);
}
continue;
}
if (i == nextBreak && bk < nextBreak) {
// i is sitting on the previously found boundary; bk is the next one back.
fTestData.fActualBreaks[bk] = true;
nextBreak = bk;
continue;
}
throw new MonkeyException("preceding(i)", i);
}
checkResults("testPreceding", CheckDirection.REVERSE);
};
/**
 * Calls isBoundary(i) for every index of the test string, from the end down to 0,
 * records the positions reported as boundaries in fActualBreaks, then compares against
 * the expected breaks.
 *
 * @throws MonkeyException if the boundaries differ from those expected
 */
void testIsBoundary() {
    fTestData.clearActualBreaks();
    fBI.setText(fTestData.fString);
    for (int i = fTestData.fString.length(); i >= 0; --i) {
        if (fBI.isBoundary(i)) {
            fTestData.fActualBreaks[i] = true;
        }
    }
    // Report failures under this test's own name, not testForwards'.
    checkResults("testIsBoundary", CheckDirection.FORWARD);
}
/**
 * Compares the actual break positions with the expected ones and reports the first
 * position at which they differ.
 *
 * @param msg       description of the test being checked, used as the exception message
 * @param direction FORWARD to report the lowest differing index, otherwise the highest
 * @throws MonkeyException at the first differing position in the given direction
 */
void checkResults(String msg, CheckDirection direction) {
    final int length = fTestData.fString.length();
    final boolean ascending = direction == CheckDirection.FORWARD;
    for (int step = 0; step <= length; step++) {
        // Walk 0..length when ascending, length..0 otherwise.
        int i = ascending ? step : length - step;
        if (fTestData.fExpectedBreaks[i] != fTestData.fActualBreaks[i]) {
            throw new MonkeyException(msg, i);
        }
    }
}
String fRuleCharBuffer; // source file contents of the reference rules.
BreakRules fRuleSet; // The reference rules compiled from fRuleCharBuffer by setup().
RuleBasedBreakIterator fBI; // The ICU break iterator being tested.
MonkeyTestData fTestData; // Random text and expected breaks; regenerated on each loop of run().
ICU_Rand fRandomGenerator; // Source of random numbers for generating test data.
String fRuleFileName; // Name of the rule file, used in messages.
boolean fVerbose; // True to do long dump of failing data.
int fLoopCount; // Number of iterations for run(); negative means run forever.
int fErrorCount; // NOTE(review): not referenced in this class; run() counts errors in a local.
boolean fDumpExpansions; // Debug flag to output expanded form of rules and sets.
StringBuilder fErrorMsgs = new StringBuilder(); // Failure messages accumulated by run().
}
// Test parameters, specified via Java properties.
//
// rules=file_name Name of file containing the reference rules.
// seed=nnnnn Random number starting seed.
// Setting the seed allows errors to be reproduced.
// loop=nnn Looping count. Controls running time.
// -1: run forever.
// 0 or greater: run length.
// expansions debug option, show expansions of rules and sets.
// verbose Display details of the failure.
//
// Parameters are passed to the JVM on the command line, or
// via the Eclipse Run Configuration settings, arguments tab, VM parameters.
// For example,
// -ea -Drules=line.txt -Dloop=-1
//
/**
 * Runs the monkey test for each set of reference rules, one thread per rule file,
 * and fails if any of them recorded an error.
 */
@Test
public void TestMonkey() {
    String tests[] = {"grapheme.txt", "word.txt", "line.txt", "line_cj.txt", "sentence.txt", "line_normal.txt",
            "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt"
    };
    // A "rules" property restricts the run to a single rule file.
    String testNameFromParams = getProperty("rules");
    if (testNameFromParams != null) {
        tests = new String[] {testNameFromParams};
    }
    int loopCount = getIntProperty("loop", isQuick() ? 100 : 5000);
    boolean dumpExpansions = getBooleanProperty("expansions", false);
    boolean verbose = getBooleanProperty("verbose", false);
    int seed = getIntProperty("seed", 1);
    List<RBBIMonkeyImpl> startedTests = new ArrayList<>();

    // Monkey testing is multi-threaded.
    // Each set of break rules to be tested is run in a separate thread.
    // Each thread/set of rules gets a separate RBBIMonkeyImpl object.
    for (String testName: tests) {
        logln(String.format("beginning testing of %s", testName));
        RBBIMonkeyImpl test = new RBBIMonkeyImpl();
        test.fDumpExpansions = dumpExpansions;
        test.fVerbose = verbose;
        test.fRandomGenerator = new ICU_Rand(seed);
        test.fLoopCount = loopCount;
        test.setup(testName);
        test.start();
        startedTests.add(test);
    }

    // Wait for every thread and collect whatever failures it recorded.
    StringBuilder errors = new StringBuilder();
    for (RBBIMonkeyImpl test: startedTests) {
        try {
            test.join();
            errors.append(test.fErrorMsgs);
        } catch (InterruptedException e) {
            errors.append(e + "\n");
            // Restore the interrupt status so that code further up the stack can see it.
            Thread.currentThread().interrupt();
        }
    }
    String errorMsgs = errors.toString();
    assertEquals(errorMsgs, "", errorMsgs);
}
}