// icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBIMonkeyTest.java - external/github.com/unicode-org/icu - Git at Google

 // © 2016 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html

 package com.ibm.icu.dev.test.rbbi;

 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;

 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.JUnit4;

 import com.ibm.icu.dev.test.TestFmwk;
 import com.ibm.icu.impl.UCharacterName;
 import com.ibm.icu.impl.UCharacterNameChoice;
 import com.ibm.icu.text.BreakIterator;
 import com.ibm.icu.text.RuleBasedBreakIterator;
 import com.ibm.icu.text.UnicodeSet;
 import com.ibm.icu.util.ULocale;

 /**
  * RBBI Monkey Test. Ported from ICU4C test/intltest/rbbimonkeytest.cpp.
  * This is the newer, data driven monkey test. It is completely separate from the
  * older class RBBITestMonkey.
  */

 @RunWith(JUnit4.class)
 public class RBBIMonkeyTest extends TestFmwk {


     /**
      * A single named character class taken from the source break rules.
      * Holds the class name, the definition text in both its original and its
      * expanded form, and the UnicodeSet built for it.
      */
     static class CharClass {
         /** Name of the class, as used in the rules. */
         String         fName;
         /** Set definition as it appeared in the user supplied rules. */
         String         fOriginalDef;
         /** Set definition with any embedded named sets replaced by their definitions, recursively. */
         String         fExpandedDef;
         /** The set of characters belonging to this class. */
         UnicodeSet     fSet;

         CharClass(String name, String originalDef, String expandedDef, UnicodeSet set) {
             this.fName = name;
             this.fOriginalDef = originalDef;
             this.fExpandedDef = expandedDef;
             this.fSet = set;
         }
     }


     /**
      * Struct-like holder for a single rule from a set of break rules.
      * Each rule has its set definitions expanded and is compiled to a regular expression.
      */
     static class BreakRule {
         /** Name of the rule. */
         String    fName;
         /** Rule expression, excluding the name, as written in user source. */
         String    fRule;
         /** Rule expression after expanding the set definitions. */
         String    fExpandedRule;
         /** Matcher for the regular expression compiled from the rule. */
         Matcher   fRuleMatcher;
         /** True if the rule begins with '^', meaning no chaining. Starts out false. */
         boolean   fInitialMatchOnly;
     }


     // class BreakRules    represents a complete set of break rules, possibly tailored,
     //                     compiled from testdata break rules.

     static class BreakRules {
         BreakRules(RBBIMonkeyImpl monkeyImpl) {
             fMonkeyImpl = monkeyImpl;
             fBreakRules = new ArrayList<>();
             fType = BreakIterator.KIND_TITLE;
             fCharClasses = new HashMap<>();
             fCharClassList = new ArrayList<>();
             fDictionarySet = new UnicodeSet();

             // Match an alpha-numeric identifier in a rule. Will be a set name.
             // Use negative look-behind to exclude non-identifiers, mostly property names or values.
             fSetRefsMatcher = Pattern.compile(
                     "(?<!\\{[ \\t]{0,4})" +
                     "(?<!=[ \\t]{0,4})" +
                     "(?<!\\[:[ \\t]{0,4})" +
                     "(?<!\\\\)" +
                     "(?<![A-Za-z0-9_])" +
                     "([A-Za-z_][A-Za-z0-9_]*)").     // The char class name
                     matcher("");

             // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules.
             fCommentsMatcher = Pattern.compile("" +
                     "(^|(?<=;))"   +                // Start either at start of line, or just after a ';' (look-behind for ';')
                     "[ \\t]*+"     +                //   Match white space.
                     "(#.*)?+"      +                //   Optional # plus whatever follows
                     "$").                           //   new-line at end of line.
                     matcher("");

             // Match (initial parse) of a character class definition line.
             fClassDefMatcher = Pattern.compile("" +
                     "[ \\t]*"           +                    // leading white space
                     "([A-Za-z_][A-Za-z0-9_]*)" +             // The char class name
                     "[ \\t]*=[ \\t]*"   +                    //   =
                     "(.*?)"  +                               // The char class UnicodeSet expression
                     "[ \\t]*;$").                            // ; <end of line>
                     matcher("");

             // Match (initial parse) of a break rule line.
             fRuleDefMatcher = Pattern.compile("" +
                     "[ \\t]*"           +                     // leading white space
                     "([A-Za-z_][A-Za-z0-9_.]*)" +             // The rule name
                     "[ \\t]*:[ \\t]*"   +                     //   :
                     "(.*?)"   +                               // The rule definition
                     "[ \\t]*;$").                             // ; <end of line>
                     matcher("");

             // Match a property expression, either [:xxx:] or \p{...}
             fPropertyMatcher = Pattern.compile("" +
                     "\\[:.*?:]|\\\\(?:p|P)\\{.*?\\}").
                     matcher("");


         }

         /**
          * Define (or redefine) a named character class.
          * References to previously defined classes inside {@code definition} are replaced by
          * their expanded definitions, the result is parsed as a UnicodeSet, and the pattern
          * generated back from that set becomes the stored expanded definition.
          *
          * @param name        the class name.
          * @param definition  the set expression as written in the rule source.
          * @return the newly created CharClass, which has also been stored in fCharClasses.
          * @throws IllegalArgumentException if the expanded definition is rejected by UnicodeSet.
          */
         CharClass  addCharClass(String name, String definition) {
             // Substitute every identifier that names a known class; unknown identifiers are kept as-is.
             StringBuffer substituted = new StringBuffer();
             fSetRefsMatcher.reset(definition);
             while (fSetRefsMatcher.find()) {
                 String refName = fSetRefsMatcher.group(/*"ClassName"*/ 1);
                 CharClass referenced = fCharClasses.get(refName);
                 // Replace with "" and append separately, so the expansion text is copied literally.
                 fSetRefsMatcher.appendReplacement(substituted, "");
                 if (referenced == null) {
                     substituted.append(refName);
                 } else {
                     substituted.append(referenced.fExpandedDef);
                 }
             }
             fSetRefsMatcher.appendTail(substituted);
             String setSource = substituted.toString();

             final boolean dumping = fMonkeyImpl.fDumpExpansions;
             if (dumping) {
                 System.out.printf("addCharClass(\"%s\"\n", name);
                 System.out.printf("             %s\n", definition);
                 System.out.printf("expandedDef: %s\n", setSource);
             }

             // Building the set doubles as validation of the expanded definition.
             final UnicodeSet parsedSet;
             try {
                 parsedSet = new UnicodeSet(setSource, UnicodeSet.IGNORE_SPACE);
             } catch (IllegalArgumentException e) {
                 System.err.printf("%s: error %s creating UnicodeSet %s", fMonkeyImpl.fRuleFileName, e.toString(), name);
                 throw e;
             }

             // Ask the UnicodeSet for an equivalent pattern. This removes set difference
             // operators, which would fail if passed through to Java regex.
             StringBuffer regenerated = new StringBuffer();
             parsedSet._generatePattern(regenerated, true);
             String flatDefinition = regenerated.toString();
             if (dumping) {
                 System.out.printf("expandedDef2: %s\n", flatDefinition);
             }

             CharClass result = new CharClass(name, definition, flatDefinition, parsedSet);

             // A class of the same name that already exists is silently replaced.
             // TODO: decide whether or not to allow redefinitions.
             //       Can be convenient in some cases.
             fCharClasses.put(name, result);
             return result;
         }


         /**
          * Compile a single break rule and append it to {@code fBreakRules}.
          *
          * <p>The rule text goes through these transformations, in order:
          * <ol>
          *   <li>identifiers naming known character classes are replaced by their expanded definitions;</li>
          *   <li>property expressions, \p{...} or [:...:], are replaced by a pattern generated by ICU UnicodeSet;</li>
          *   <li>a leading '^' is stripped and recorded as {@code fInitialMatchOnly};</li>
          *   <li>negated sets that contain nested sets are replaced by a flattened equivalent;</li>
          *   <li>the divide sign is replaced by an empty capturing group marking the break position;</li>
          *   <li>empty sets, Unicode escapes and '#' are rewritten into forms that Java regex accepts.</li>
          * </ol>
          * The result is compiled with {@code Pattern.COMMENTS | Pattern.DOTALL}.
          *
          * @param name        the rule name, used in diagnostics.
          * @param definition  the rule expression as written in the rule source, excluding the name.
          * @throws IllegalArgumentException if the rule contains more than one divide sign.
          * @throws PatternSyntaxException if the fully expanded rule is not a valid Java regular expression.
          */
         void addRule(String  name, String  definition) {
             BreakRule  thisRule = new BreakRule();
             StringBuffer expandedDefsRule = new StringBuffer();
             thisRule.fName = name;
             thisRule.fRule = definition;

             // Expand the char class definitions within the rule.
             fSetRefsMatcher.reset(definition);
             while (fSetRefsMatcher.find()) {
                 String sname = fSetRefsMatcher.group(/*"ClassName"*/ 1);
                 CharClass nameClass = fCharClasses.get(sname);
                 if (nameClass == null) {
                     System.err.printf("char class \"%s\" unrecognized in rule \"%s\"\n", sname, definition);
                 }
                 String expansionForName = nameClass != null ? nameClass.fExpandedDef : sname;
                 // Replace with "" and append the expansion separately, so that '\' and '$'
                 // in the expansion are copied literally.
                 fSetRefsMatcher.appendReplacement(expandedDefsRule, "");
                 expandedDefsRule.append(expansionForName);
             }
             fSetRefsMatcher.appendTail(expandedDefsRule);

             // Replace any property expressions, \p{...} or [:...:] with an equivalent expansion,
             // obtained from ICU UnicodeSet. Need to do this substitution because Java regex
             // does not recognize all properties, and because Java's definitions are likely
             // older than ICU's.

             StringBuffer expandedRule = new StringBuffer();
             fPropertyMatcher.reset(expandedDefsRule);
             while (fPropertyMatcher.find()) {
                 String prop = fPropertyMatcher.group();
                 UnicodeSet propSet = new UnicodeSet("[" + prop + "]");
                 StringBuffer propExpansion = new StringBuffer();
                 propSet._generatePattern(propExpansion, true);
                 // The generated pattern contains backslash escapes. Handing it to appendReplacement()
                 // as the replacement string would strip those backslashes, so append it literally,
                 // the same way the set name expansions are handled above.
                 fPropertyMatcher.appendReplacement(expandedRule, "");
                 expandedRule.append(propExpansion);
             }
             fPropertyMatcher.appendTail(expandedRule);

             // If rule begins with a '^' rule chaining is disallowed.
             // Strip off the '^' from the rule expression, and set the flag.
             // (The length check keeps an empty rule expression from failing in charAt().)
             if (expandedRule.length() > 0 && expandedRule.charAt(0) == '^') {
                 thisRule.fInitialMatchOnly = true;
                 expandedRule.deleteCharAt(0);
                 expandedRule = new StringBuffer(expandedRule.toString().trim());
             }

             //   Replace any [^negated sets] with equivalent flattened sets generated by
             //   ICU UnicodeSet. [^ ...] in Java Regex character classes does not apply
             //   to any nested classes. Variable substitution in rules produces
             //   nested sets that [^negation] needs to apply to.

             StringBuffer ruleWithFlattenedSets = new StringBuffer();
             int idx = 0;
             while (idx<expandedRule.length()) {
                 int setOpenPos = expandedRule.indexOf("[^", idx);
                 if (setOpenPos < 0) {
                     break;
                 }
                 if (setOpenPos > idx) {
                     // Move anything from the source rule preceding the [^ into the processed rule, unchanged.
                     ruleWithFlattenedSets.append(expandedRule.substring(idx,  setOpenPos));
                 }
                 // Scan for the ']' that closes this set, tracking nested '[' ... ']' pairs
                 // and skipping over backslash-escaped characters.
                 int nestingLevel = 1;
                 boolean haveNesting = false;
                 int setClosePos;
                 for (setClosePos = setOpenPos + 2; nestingLevel > 0 && setClosePos<expandedRule.length(); ++setClosePos) {
                     char c = expandedRule.charAt(setClosePos);
                     if (c == '\\') {
                         ++setClosePos;
                     } else if (c == '[') {
                         ++nestingLevel;
                         haveNesting = true;
                     } else if (c == ']') {
                         --nestingLevel;
                     }
                 }
                 // A backslash as the very last character steps the scan position one past the
                 // end of the rule; pull it back so the substring() calls below stay in range.
                 if (setClosePos > expandedRule.length()) {
                     setClosePos = expandedRule.length();
                 }
                 if (haveNesting && nestingLevel == 0) {
                     // Found one, a negated set that includes interior nested sets.
                     // Create an ICU UnicodeSet from the source pattern, and obtain an
                     // equivalent flattened pattern from that.
                     UnicodeSet uset = new UnicodeSet(expandedRule.substring(setOpenPos, setClosePos), true);
                     uset._generatePattern(ruleWithFlattenedSets, true);
                 } else {
                     // The [^ set definition did not include any nested sets.
                     // Copy the original definition without change.
                     // Java regular expressions will handle it without needing to recast it.
                     if (nestingLevel > 0) {
                         // Error case of an unclosed character class expression.
                         // Java regex will also eventually flag the error.
                         System.err.printf("No closing ] found in rule %s\n", name);
                     }
                     ruleWithFlattenedSets.append(expandedRule.substring(setOpenPos, setClosePos));
                 }
                 idx = setClosePos;
             }

             if (idx < expandedRule.length()) {
                 ruleWithFlattenedSets.append(expandedRule.substring(idx, expandedRule.length()));
             }

             String flattenedRule = ruleWithFlattenedSets.toString();

             // A rule may mark at most one break position. This has to be checked before the
             // substitution below, because String.replace() replaces every occurrence and
             // would leave nothing behind to detect.
             int firstBreakSign = flattenedRule.indexOf("÷");
             if (firstBreakSign >= 0 && flattenedRule.indexOf("÷", firstBreakSign + 1) >= 0) {
                 String msg = String.format("%s Rule %s contains multiple ÷ signs", fMonkeyImpl.fRuleFileName, name);
                 System.err.println(msg);
                 throw new IllegalArgumentException(msg);
             }

             // Replace the divide sign with an empty capturing group.
             // When running the rules, a match that includes this group means we found a break position.
             // (A named capture, "(?<BreakPosition>)", would be the more direct equivalent.)
             thisRule.fExpandedRule = flattenedRule.replace("÷", "()");

             // UAX break rule set definitions can be empty, just [].
             // Regular expression set expressions don't accept this. Substitute with [a&&[^a]], which
             // also matches nothing.

             thisRule.fExpandedRule = thisRule.fExpandedRule.replace("[]", "[a&&[^a]]");

             // Change Unicode escape syntax for compatibility with Java regular expressions
             //    backslash-u dddd       => \x{dddd}
             //    backslash-U 00hhhhhh   => \x{hhhhhh}

             thisRule.fExpandedRule = thisRule.fExpandedRule.replaceAll("\\\\u([0-9A-Fa-f]{4})", "\\\\x{$1}");
             thisRule.fExpandedRule = thisRule.fExpandedRule.replaceAll("\\\\U00([0-9A-Fa-f]{6})", "\\\\x{$1}");

             // Escape any literal '#' in the rule expression. Without escaping, these introduce a comment.
             // UnicodeSet._generatePattern() inserts un-escaped "#"s

             thisRule.fExpandedRule = thisRule.fExpandedRule.replace("#", "\\#");
             if (fMonkeyImpl.fDumpExpansions) {
                 System.out.printf("fExpandedRule: %s\n", thisRule.fExpandedRule);
             }

             // Compile a regular expression for this rule.

             try {
                 thisRule.fRuleMatcher = Pattern.compile(thisRule.fExpandedRule, Pattern.COMMENTS | Pattern.DOTALL).matcher("");
             } catch (PatternSyntaxException e) {
                 System.err.printf("%s: Error creating regular expression for rule %s. Expansion is \n\"%s\"",
                         fMonkeyImpl.fRuleFileName, name, thisRule.fExpandedRule);
                 throw e;
             }

             // Put this new rule into the list of all rules.

             fBreakRules.add(thisRule);
         }

         /**
          * Convert a hexadecimal code point value into the String holding that one code point.
          *
          * @param hex  the code point value as hex digits, e.g. "1F600".
          * @return a String of one or two chars representing the code point.
          */
         @SuppressWarnings("unused")
         private static String hexToCodePoint(String hex) {
             final int codePoint = Integer.parseInt(hex, 16);
             return new String(Character.toChars(codePoint));
         }


         /**
          * Handle a "keyword = value" line from the rule source.
          * Recognized keywords are "locale", which sets fLocale, and "type", which sets fType
          * from one of "grapheme", "word", "line" or "sentence".
          *
          * @param keyword  the text to the left of the '='.
          * @param value    the text to the right of the '='.
          * @return true if {@code keyword} was a recognized keyword and has been applied;
          *         false if it is not a keyword.
          * @throws IllegalArgumentException if the keyword is "type" and the value is not a known break type.
          */
         boolean setKeywordParameter(String keyword, String value) {
             switch (keyword) {
                 case "locale":
                     fLocale = new ULocale(value);
                     return true;

                 case "type":
                     switch (value) {
                         case "grapheme":
                             fType = BreakIterator.KIND_CHARACTER;
                             break;
                         case "word":
                             fType = BreakIterator.KIND_WORD;
                             break;
                         case "line":
                             fType = BreakIterator.KIND_LINE;
                             break;
                         case "sentence":
                             fType = BreakIterator.KIND_SENTENCE;
                             break;
                         default:
                             String msg = String.format("%s: Unrecognized break type %s", fMonkeyImpl.fRuleFileName, value);
                             System.err.println(msg);
                             throw new IllegalArgumentException(msg);
                     }
                     return true;

                 default:
                     return false;
             }
         }


         /**
          * Create the ICU break iterator matching the type and locale given in the rule source.
          *
          * @return the break iterator for fType and fLocale, cast to RuleBasedBreakIterator.
          * @throws IllegalArgumentException if fType is not one of the character, word,
          *         line or sentence kinds.
          */
         RuleBasedBreakIterator createICUBreakIterator() {
             final BreakIterator iterator;
             if (fType == BreakIterator.KIND_CHARACTER) {
                 iterator = BreakIterator.getCharacterInstance(fLocale);
             } else if (fType == BreakIterator.KIND_WORD) {
                 iterator = BreakIterator.getWordInstance(fLocale);
             } else if (fType == BreakIterator.KIND_LINE) {
                 iterator = BreakIterator.getLineInstance(fLocale);
             } else if (fType == BreakIterator.KIND_SENTENCE) {
                 iterator = BreakIterator.getSentenceInstance(fLocale);
             } else {
                 String msg = String.format("%s: Bad break iterator type of %d", fMonkeyImpl.fRuleFileName, fType);
                 System.err.println(msg);
                 throw new IllegalArgumentException(msg);
             }
             return (RuleBasedBreakIterator) iterator;
         }


         /**
          * Parse the complete text of a rule file.
          * Each line, once comments are stripped, must be blank, a "name = value;" line
          * (a keyword setting or a character class definition), or a "name : rule;" line.
          * After all lines are processed the list of character classes used for generating
          * test data is built, including an "__Others" class for any code points that no
          * defined class covers.
          *
          * @param rules  the contents of the rule file.
          * @throws IllegalArgumentException if a line is not recognized.
          */
         void compileRules(String rules) {
             final boolean dumping = fMonkeyImpl.fDumpExpansions;
             String[] sourceLines = rules.split("\\r?\\n");

             for (int lineIdx = 0; lineIdx < sourceLines.length; lineIdx++) {
                 // Strip comments; skip lines with nothing left.
                 String text = fCommentsMatcher.reset(sourceLines[lineIdx]).replaceFirst("");
                 if (text.isEmpty()) {
                     continue;
                 }

                 if (fClassDefMatcher.reset(text).matches()) {
                     // A character class definition or a keyword line.
                     String className = fClassDefMatcher.group(/*"ClassName"*/ 1);
                     String classDef  = fClassDefMatcher.group(/*"ClassDef"*/ 2);
                     if (dumping) {
                         System.out.printf("scanned class: %s = %s\n", className, classDef);
                     }
                     // "type = ..." or "locale = ..." lines are settings, not actual character classes.
                     boolean wasKeyword = setKeywordParameter(className, classDef);
                     if (!wasKeyword) {
                         addCharClass(className, classDef);
                     }
                 } else if (fRuleDefMatcher.reset(text).matches()) {
                     // A rule line.
                     String ruleName = fRuleDefMatcher.group(/*"RuleName"*/ 1);
                     String ruleDef  = fRuleDefMatcher.group(/*"RuleDef"*/ 2);
                     if (dumping) {
                         System.out.printf("scanned rule: %s : %s\n", ruleName, ruleDef);
                     }
                     addRule(ruleName, ruleDef);
                 } else {
                     // Line numbers in the message are 1-based.
                     String msg = String.format("Unrecognized line in rule file %s:%d \"%s\"",
                             fMonkeyImpl.fRuleFileName, lineIdx + 1, text);
                     System.err.println(msg);
                     throw new IllegalArgumentException(msg);
                 }
             }

             // Build the list of char classes, leaving out the dictionary class if there is one.
             // The list is used when constructing the random text to be tested.
             // Along the way compute the "other" set: every code point that is not included in
             // one or more of the user defined sets.

             UnicodeSet uncovered = new UnicodeSet(0, 0x10ffff);

             for (Map.Entry<String, CharClass> entry: fCharClasses.entrySet()) {
                 String key = entry.getKey();
                 CharClass charClass = entry.getValue();

                 if (!key.equals(charClass.fName)) {
                     throw new IllegalArgumentException(
                             String.format("%s: internal error, set names (%s, %s) inconsistent.\n",
                                     fMonkeyImpl.fRuleFileName, key, charClass.fName));
                 }
                 uncovered.removeAll(charClass.fSet);
                 if ("dictionary".equals(key)) {
                     fDictionarySet = charClass.fSet;
                 } else {
                     fCharClassList.add(charClass);
                 }
             }

             if (!uncovered.isEmpty()) {
                 CharClass others = addCharClass("__Others", uncovered.toPattern(true));
                 fCharClassList.add(others);
             }
         }

         /**
          * Find the character class that a code point belongs to.
          *
          * @param c  a code point.
          * @return the first class in fCharClassList whose set contains {@code c},
          *         or null if no class in the list contains it.
          */
         CharClass getClassForChar(int c) {
             CharClass found = null;
             for (int i = 0; found == null && i < fCharClassList.size(); i++) {
                 CharClass candidate = fCharClassList.get(i);
                 if (candidate.fSet.contains(c)) {
                     found = candidate;
                 }
             }
             return found;
         }


         RBBIMonkeyImpl          fMonkeyImpl;        // Reference back to the owning MonkeyImpl instance.
         List<BreakRule>         fBreakRules;        // The compiled rules, in the order they were added.

         Map<String, CharClass>  fCharClasses;       // Key is the set name.
         //                                          // Value is the corresponding CharClass
         List<CharClass>         fCharClassList;     // Char Classes, same contents as fCharClasses values,
         //                                          // except that the "dictionary" class is left out.

         UnicodeSet              fDictionarySet;     // Dictionary set, empty if none is defined.
         ULocale                 fLocale;            // Set from a "locale = ..." line in the rules.
         int                     fType;              // BreakIterator.KIND_WORD, etc.


         // Reusable matchers, created once in the constructor and reset() for each use.
         Matcher fSetRefsMatcher;                    // Finds set name references within a definition or rule.
         Matcher fCommentsMatcher;                   // Matches comments and blank lines, for stripping.
         Matcher fClassDefMatcher;                   // Parses a "name = definition;" line.
         Matcher fRuleDefMatcher;                    // Parses a "name : rule;" line.
         Matcher fPropertyMatcher;                   // Finds property expressions, [:xxx:] or \p{...}
     };


     // class MonkeyTestData    represents a randomly synthesized test data string together
     //                         with the expected break positions obtained by applying
     //                         the test break rules.

     static class MonkeyTestData{

         // Populate this test data object:
         //   1. Build a random test string of dataLength code points drawn from the
         //      character classes of the supplied rules.
         //   2. Apply the reference rules to that string, recording the expected break
         //      positions in fExpectedBreaks and, for each position, which rule(s) matched.
         //
         // @param rules  the compiled reference break rules.
         // @param rand   source of random numbers; its seed on entry is saved in fRandomSeed.
         // @throws IllegalArgumentException if no rule matches at some position, if the rule
         //         selected at a position matched zero characters, or if a non-breaking
         //         match leaves no way to advance.
         void set(BreakRules rules, ICU_Rand rand) {
             int dataLength = 1000;   // length of test data to generate, in code points.

             // Fill the test string with random characters.
             // First randomly pick a char class, then randomly pick a character from that class.
             // Exclude any characters from the dictionary set.

             // System.out.println("Populating Test Data");
             fRandomSeed = rand.getSeed();         // Save initial seed for use in error messages,
                                                   // allowing recreation of failing data.
             fBkRules = rules;
             StringBuilder newString = new StringBuilder();
             // n counts accepted code points only; rejected picks loop again without advancing.
             for (int n=0; n<dataLength;) {
                 int charClassIndex = rand.next() % rules.fCharClassList.size();
                 CharClass cclass = rules.fCharClassList.get(charClassIndex);
                 if (cclass.fSet.size() == 0) {
                     // Some rules or tailorings do end up with empty char classes.
                     continue;
                 }
                 int charIndex = rand.next() % cclass.fSet.size();
                 int c = cclass.fSet.charAt(charIndex);
                 if (/*Character.isBmpCodePoint(c)*/ c<=0x0ffff && Character.isLowSurrogate((char)c) &&
                         newString.length() > 0 && Character.isHighSurrogate(newString.charAt(newString.length()-1))) {
                     // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
                     // Don't let random unpaired surrogates combine in the test data because they might
                     // produce an unwanted dictionary character.
                     continue;
                 }

                 if (!rules.fDictionarySet.contains(c)) {
                     newString.appendCodePoint(c);
                     ++n;
                 }
             }
             fString = newString.toString();

             // Init the expectedBreaks, actualBreaks and ruleForPosition.
             // Expected and Actual breaks are one longer than the input string; a true value
             // will indicate a boundary preceding that position.
             // All of these arrays are indexed by char (UTF-16 code unit) offset into fString.

             fActualBreaks    = new boolean[fString.length()+1];
             fExpectedBreaks  = new boolean[fString.length()+1];
             fRuleForPosition = new int[fString.length()+1];
             f2ndRuleForPos   = new int[fString.length()+1];

             // Apply reference rules to find the expected breaks.

             fExpectedBreaks[0] = true;       // Force an expected break before the start of the text.
                                              // ICU always reports a break there.
                                              // The reference rules do not have a means to do so.
             int strIdx = 0;
             boolean initialMatch = true;     // True at start of text, and immediately after each boundary,
             //                               // for control over rule chaining.

             while (strIdx < fString.length()) {
                 BreakRule matchingRule = null;
                 boolean hasBreak = false;
                 int ruleNum = 0;
                 int matchStart = 0;
                 int matchEnd = 0;
                 // Try the rules in order; the first acceptable match wins.
                 // On exit from the loop via break, ruleNum is the index of the matching rule.
                 for (ruleNum=0; ruleNum<rules.fBreakRules.size(); ruleNum++) {
                     BreakRule rule = rules.fBreakRules.get(ruleNum);
                     if (rule.fInitialMatchOnly && !initialMatch) {
                         // Skip checking this '^' rule. (No rule chaining)
                         continue;
                     }
                     // The matcher is given only the text from strIdx onward, so offsets it
                     // reports are relative to strIdx.
                     rule.fRuleMatcher.reset(fString.substring(strIdx));
                     if (rule.fRuleMatcher.lookingAt()) {
                         // A candidate rule match, check further to see if we take it or continue to check other rules.
                         // Matches of zero or one code point count only if they also specify a break.
                         matchStart = strIdx;
                         matchEnd = strIdx + rule.fRuleMatcher.end();
                         hasBreak = BreakGroupStart(rule.fRuleMatcher) >= 0;
                         if (hasBreak ||
                                 (matchStart < fString.length() && fString.offsetByCodePoints(matchStart, 1) < matchEnd)) {
                             matchingRule = rule;
                             break;
                         }
                     }
                 }
                 if (matchingRule == null) {
                     // No reference rule matched. This is an error in the rules that should never happen.
                     String msg = String.format("%s: No reference rules matched at position %d. ",
                             rules.fMonkeyImpl.fRuleFileName, strIdx);
                     System.err.println(msg);
                     dump(strIdx);
                     throw new IllegalArgumentException(msg);
                 }
                 if (matchingRule.fRuleMatcher.group().length() == 0) {
                     // Zero length rule match. This is also an error in the rule expressions.
                     String msg = String.format("%s:%s: Zero length rule match at %d.",
                             rules.fMonkeyImpl.fRuleFileName, matchingRule.fName, strIdx);
                     System.err.println(msg);
                     dump(strIdx);
                     throw new IllegalArgumentException(msg);
                 }

                 // Record which rule matched over the length of the match.
                 // A value of 0 in fRuleForPosition is taken to mean "nothing recorded yet";
                 // once a position has a non-zero entry, later matches go to f2ndRuleForPos.
                 // NOTE(review): rule index 0 is indistinguishable from "not recorded" here.
                 for (int i = matchStart; i < matchEnd; i++) {
                     if (fRuleForPosition[i] == 0) {
                         fRuleForPosition[i] = ruleNum;
                     } else {
                         f2ndRuleForPos[i] = ruleNum;
                     }
                 }

                 // Break positions appear in rules as a matching capture group of zero length at the break position;
                 //   the adjusted pattern contains "()" where the rule had a divide sign.
                 if (hasBreak) {
                     int breakPos = strIdx + BreakGroupStart(matchingRule.fRuleMatcher);
                     fExpectedBreaks[breakPos] = true;
                     // System.out.printf("recording break at %d\n", breakPos);
                     // For the next iteration, pick up applying rules immediately after the break,
                     // which may differ from end of the match. The matching rule may have included
                     // context following the boundary that needs to be looked at again.
                     strIdx = breakPos;
                     initialMatch = true;
                 } else {
                     // Original rule didn't specify a break.
                     // Continue applying rules starting on the last code point of this match.
                     int updatedStrIdx = fString.offsetByCodePoints(matchEnd, -1);
                     if (updatedStrIdx == matchStart) {
                         // Match was only one code point, no progress if we continue.
                         // Shouldn't get here, case is filtered out at top of loop.
                         throw new IllegalArgumentException(String.format("%s: Rule %s internal error.",
                                 rules.fMonkeyImpl.fRuleFileName, matchingRule.fName));
                     }
                     strIdx = updatedStrIdx;
                     initialMatch = false;
                 }
             }
         };

         // Helper function to find where the break position capture group matched.
         // @param m: a Java regex Matcher that has completed a matching operation.
         // @return the start index of the first capturing group that took part in the match
         //         and matched the empty string,
         //         or -1 if there is no such group.
         //
         // TODO: this becomes m.start("BreakPosition") with Java 8, once the rules are
         //       compiled with a named capture group.
         //       In the mean time, assume that the only zero-length capturing group in
         //       a reference rule expression is the one that corresponds to a "÷".

         static int BreakGroupStart(Matcher m) {
             final int groupCount = m.groupCount();
             int groupNum = 1;
             while (groupNum <= groupCount) {
                 // start() is -1 for a group that did not participate in the match.
                 int groupStart = m.start(groupNum);
                 boolean participated = groupStart >= 0;
                 if (participated && m.end(groupNum) == groupStart) {
                     // Zero length group: this is the break position.
                     return groupStart;
                 }
                 groupNum++;
             }
             return -1;
         }

         // Print a table describing the test data to System.out, one row per code point:
         // string index, code point, character class name, expected (R) and actual (I) break
         // flags, the name(s) of the rule(s) recorded for the position, and the character name.
         // @param around  -1 to dump the whole string; otherwise a string index, in which case
         //                only up to 30 code points on either side of it are shown.
         void dump(int around) {
             final String header = "\n"
                     +        "         char                        break  Rule                     Character\n"
                     +        "   pos   code   class                 R I   name                     name\n"
                     +        "---------------------------------------------------------------------------------------------\n";
             System.out.print(header);

             // Start from the whole string, then narrow to a window around the failure.
             // If the window would run off either end of the string (or 'around' is itself
             // out of range), offsetByCodePoints throws and that edge is clamped.
             int start = 0;
             int end = fString.length();
             if (around != -1) {
                 try {
                     start = fString.offsetByCodePoints(around, -30);
                 } catch (Exception e) {
                     start = 0;
                 }
                 try {
                     end = fString.offsetByCodePoints(around, +30);
                 } catch (Exception e) {
                     end = fString.length();
                 }
             }

             int pos = start;
             while (pos < end) {
                 int codePoint = fString.codePointAt(pos);
                 CharClass charClass = fBkRules.getClassForChar(codePoint);
                 BreakRule primaryRule = fBkRules.fBreakRules.get(fRuleForPosition[pos]);

                 // A second rule is only reported when its recorded index is positive.
                 int secondRuleIdx = f2ndRuleForPos[pos];
                 String secondRuleName = "";
                 if (secondRuleIdx > 0) {
                     secondRuleName = fBkRules.fBreakRules.get(secondRuleIdx).fName;
                 }
                 String charName = UCharacterName.INSTANCE.getName(codePoint, UCharacterNameChoice.EXTENDED_CHAR_NAME);

                 System.out.printf("  %4d %6x   %-20s  %c %c   %-10s %-10s    %s\n",
                         pos, codePoint, charClass.fName,
                         fExpectedBreaks[pos] ? '*' : '.',
                         fActualBreaks[pos] ? '*' : '.',
                         primaryRule.fName, secondRuleName, charName);

                 // Advance by a whole code point so surrogate pairs produce a single row.
                 pos = fString.offsetByCodePoints(pos, 1);
             }
         }

         // Reset every entry of fActualBreaks to false, ready for a fresh pass
         // over the same test string.
         void clearActualBreaks() {
             final int entryCount = fActualBreaks.length;
             Arrays.fill(fActualBreaks, 0, entryCount, false);
         }


         // State describing one test string and the breaks found in it.
         // The arrays below are indexed by string (UTF-16) index; checkResults() compares
         // fExpectedBreaks and fActualBreaks at every index from 0 through fString.length() inclusive.

         int               fRandomSeed;        // The initial seed value from the random number generator.
                                               //     Reported in failure messages so a run can be reproduced.
         BreakRules        fBkRules;           // The break rules used to generate this data.
         String            fString;            // The text.
         boolean           fExpectedBreaks[];  // Breaks as found by the reference rules.
                                               //     Parallel to fString. true if break preceding.
         boolean           fActualBreaks[];    // Breaks as found by ICU break iterator.
                                               //     Reset by clearActualBreaks() before each test pass.
         int               fRuleForPosition[]; // Index into BreakRules.fBreakRules of rule that applied at each position.
                                               // Also parallel to fString.
         int               f2ndRuleForPos[];   // As above. A 2nd rule applies when the preceding rule
                                               //   didn't cause a break, and a subsequent rule match starts
                                               //   on the last code point of the preceding match.
                                               //   dump() only reports a 2nd rule when this value is > 0.

     }


     // class RBBIMonkeyImpl     holds (some indirectly) everything associated with running a monkey
     //                          test for one set of break rules.
     //

     static class RBBIMonkeyImpl extends Thread {

         /**
          * Prepares this instance to test one set of reference break rules.
          * Loads the rule file, compiles it into reference rules, creates the ICU
          * break iterator to be tested, and allocates the test data holder.
          *
          * @param ruleFile name of the rule file, resolved by {@link #openBreakRules(String)}.
          */
         void setup(String ruleFile) {
             this.fRuleFileName = ruleFile;
             this.openBreakRules(ruleFile);
             this.fRuleSet = new BreakRules(this);
             this.fRuleSet.compileRules(this.fRuleCharBuffer);
             this.fBI = this.fRuleSet.createICUBreakIterator();
             this.fTestData = new MonkeyTestData();
         }

         /**
          * Reads the rule file "break_rules/" + fileName as a class resource, decoded as UTF-8,
          * and stores its contents in fRuleCharBuffer. A BOM appearing as the very first
          * character is dropped.
          *
          * If the resource cannot be found, an error is logged and fRuleCharBuffer is left
          * untouched. If an I/O error occurs, it is logged and whatever was read up to that
          * point is stored.
          *
          * @param fileName name of the rule file, relative to the break_rules directory.
          */
         void openBreakRules(String fileName) {
             final StringBuilder contents = new StringBuilder();
             final String resourcePath = "break_rules/" + fileName;
             InputStream rawStream = null;
             try {
                 rawStream = RBBIMonkeyImpl.class.getResourceAsStream(resourcePath);
                 if (rawStream == null) {
                     errln("Could not open test data file " + fileName);
                     return;
                 }
                 InputStreamReader reader = new InputStreamReader(rawStream, "UTF-8");
                 try {
                     boolean atFirstChar = true;
                     int ch;
                     while ((ch = reader.read()) >= 0) {
                         // Only a BOM in the very first position is discarded.
                         boolean leadingBom = atFirstChar && ch == 0xFEFF;
                         atFirstChar = false;
                         if (!leadingBom) {
                             contents.appendCodePoint(ch);
                         }
                     }
                 } finally {
                     reader.close();
                 }
             } catch (IOException e) {
                 try {
                     rawStream.close();
                 } catch (IOException ignored) {
                 }
                 errln(e.toString());
             }
             fRuleCharBuffer = contents.toString();  /* the file as a String */
         }

         /**
          * Thrown by the individual test passes when the break iterator under test
          * misbehaves. Carries the position in the test data at which the problem was seen.
          */
         class MonkeyException extends RuntimeException  {
             private static final long serialVersionUID = 1L;

             /** Position of the failure in the test data. */
             public int fPosition;

             /**
              * @param description short text identifying the failing check.
              * @param pos         position of the failure in the test data.
              */
             MonkeyException(String description, int pos) {
                 super(description);
                 this.fPosition = pos;
             }
         }

         /**
          * Thread body. Repeatedly generates random test data from the reference rules and
          * runs every iteration style of the break iterator against it.
          *
          * Runs fLoopCount iterations, or forever if fLoopCount is negative. Each
          * MonkeyException is reported to System.err and appended to fErrorMsgs; the thread
          * gives up after more than 10 such failures.
          *
          * Any other RuntimeException is also recorded in fErrorMsgs before being rethrown.
          * Without that, the thread would die with fErrorMsgs empty and the joining test
          * would wrongly pass.
          */
         @Override
         public void run() {
             int errorCount = 0;
             if (fBI == null) {
                 fErrorMsgs.append("Unable to run test because fBI is null.\n");
                 return;
             }
             for (long loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) {
                 try {
                     fTestData.set(fRuleSet, fRandomGenerator);
                     // fTestData.dump(-1);
                     testForwards();
                     testPrevious();
                     testFollowing();
                     testPreceding();
                     testIsBoundary();
                 } catch (MonkeyException e) {
                     String formattedMsg = String.format(
                             "%s at index %d. VM Arguments to reproduce: -Drules=%s -Dseed=%d -Dloop=1 -Dverbose=1\n",
                             e.getMessage(), e.fPosition, fRuleFileName, fTestData.fRandomSeed);
                     System.err.print(formattedMsg);
                     if (fVerbose) {
                         fTestData.dump(e.fPosition);
                     }
                     fErrorMsgs.append(formattedMsg);
                     if (++errorCount > 10) {
                         return;
                     }
                 } catch (RuntimeException e) {
                     // Not a break iterator mismatch but an unexpected failure (for example an
                     // internal error in the reference rules). Record it so the joining test
                     // fails, then rethrow so the thread still terminates with the stack trace.
                     fErrorMsgs.append(String.format(
                             "Unexpected %s. VM Arguments to reproduce: -Drules=%s -Dseed=%d -Dloop=1 -Dverbose=1\n",
                             e, fRuleFileName, fTestData.fRandomSeed));
                     throw e;
                 }
                 if (fLoopCount < 0 && loopCount % 100 == 0) {
                     // Progress indicator when running forever.
                     System.err.print(".");
                 }
             }
         }

         /** Order in which checkResults() scans the expected and actual break arrays. */
         enum CheckDirection {
             /** Scan from index 0 up to the end of the text. */
             FORWARD,
             /** Scan from the end of the text down to index 0. */
             REVERSE
         }

         /**
          * Iterates forwards over the test string with first()/next(), records every
          * boundary returned, and compares the result with the expected breaks.
          *
          * @throws MonkeyException if the iterator fails to advance, returns a boundary
          *         outside the string, or the boundaries differ from the expected ones.
          */
         void testForwards() {
             fTestData.clearActualBreaks();
             fBI.setText(fTestData.fString);
             int previousBreak = -2;
             for (int bk=fBI.first(); bk != BreakIterator.DONE; bk=fBI.next()) {
                 if (bk <= previousBreak) {
                     throw new MonkeyException("Break Iterator Stall", bk);
                 }
                 if (bk < 0 || bk > fTestData.fString.length()) {
                     throw new MonkeyException("Boundary out of bounds", bk);
                 }
                 fTestData.fActualBreaks[bk] = true;
                 // Remember this boundary so that the stall check above can detect an
                 // iterator that stops making forward progress.
                 previousBreak = bk;
             }
             checkResults("testForwards", CheckDirection.FORWARD);
         }


        /**
         * Exercises following(i) for every index from -1 up to, but not including, the
         * length of the test string. Each call must either return the boundary already
         * seen (when i lies strictly before it) or, when i is itself the last boundary seen,
         * a new boundary further on. The boundaries collected are then compared with the
         * expected breaks.
         *
         * @throws MonkeyException if following(i) returns an inconsistent result, or the
         *         boundaries differ from the expected ones.
         */
        void testFollowing() {
            fTestData.clearActualBreaks();
            final String text = fTestData.fString;
            fBI.setText(text);
            int lastBoundary = -1;
            int probe = -1;
            while (probe < text.length()) {
                final int found = fBI.following(probe);
                final boolean doneAtEnd = found == BreakIterator.DONE && probe == text.length();
                // probe sits in the gap before a boundary that has already been recorded.
                final boolean insideGap = found == lastBoundary && found > probe;
                if (!doneAtEnd && !insideGap) {
                    if (probe != lastBoundary || found <= lastBoundary) {
                        throw new MonkeyException("following(i)", probe);
                    }
                    fTestData.fActualBreaks[found] = true;
                    lastBoundary = found;
                }
                ++probe;
            }
            checkResults("testFollowing", CheckDirection.FORWARD);
        }


         /**
          * Iterates backwards over the test string with last()/previous(), records every
          * boundary returned, and compares the result with the expected breaks.
          *
          * @throws MonkeyException if the iterator fails to move backwards, returns a
          *         boundary outside the string, or the boundaries differ from the expected ones.
          */
         void testPrevious() {
             fTestData.clearActualBreaks();
             fBI.setText(fTestData.fString);
             int previousBreak = Integer.MAX_VALUE;
             for (int bk=fBI.last(); bk != BreakIterator.DONE; bk=fBI.previous()) {
                 if (bk >= previousBreak) {
                     throw new MonkeyException("Break Iterator Stall", bk);
                 }
                 if (bk < 0 || bk > fTestData.fString.length()) {
                     throw new MonkeyException("Boundary out of bounds", bk);
                 }
                 fTestData.fActualBreaks[bk] = true;
                 // Remember this boundary so that the stall check above can detect an
                 // iterator that stops making backward progress.
                 previousBreak = bk;
             }
             checkResults("testPrevious", CheckDirection.REVERSE);
         }


         /**
          * Snaps a string index back to the start of the code point it falls in.
          * If the index refers to the low (trail) half of a surrogate pair, the index of
          * the high (lead) half is returned; every other index is returned unchanged.
          * @param s the String.
          * @param i the initial index
          * @return the adjusted index
          */
         private int getChar32Start(String s, int i) {
             // Index 0 has no preceding unit, and indices at or beyond the end have no unit at all.
             final boolean hasUnitOnBothSides = i > 0 && i < s.length();
             if (!hasUnitOnBothSides) {
                 return i;
             }
             final boolean splitsSurrogatePair =
                     Character.isLowSurrogate(s.charAt(i)) && Character.isHighSurrogate(s.charAt(i - 1));
             return splitsSurrogatePair ? i - 1 : i;
         }


         /**
          * Exercises preceding(i) for every index from one past the end of the test string
          * down to 0. Each call must either return the boundary already seen (when i lies
          * strictly after it) or, when i is itself the last boundary seen, a new boundary
          * further back. An index on the trailing half of a surrogate pair is only checked
          * for agreement with the index of the leading half. The boundaries collected are
          * then compared with the expected breaks.
          *
          * @throws MonkeyException if preceding(i) returns an inconsistent result, or the
          *         boundaries differ from the expected ones.
          */
         void testPreceding() {
             fTestData.clearActualBreaks();
             final String text = fTestData.fString;
             final int textLength = text.length();
             fBI.setText(text);
             int lastBoundary = textLength + 1;
             for (int probe = textLength + 1; probe >= 0; --probe) {
                 final int found = fBI.preceding(probe);
                 // System.err.printf("testPreceding() i:%d  bk:%d  nextBreak:%d\n", i, bk, nextBreak);
                 if (found == BreakIterator.DONE && probe == 0) {
                     continue;
                 }
                 if (found == lastBoundary && found < probe) {
                     // probe is in the gap between two breaks.
                     continue;
                 }
                 if (probe < textLength) {
                     final int codePointStart = getChar32Start(text, probe);
                     if (codePointStart < probe) {
                         // probe indexes a trailing surrogate.
                         // Break Iterators treat an index to either half as referring to the
                         // supplemental code point, with preceding going to some preceding code point.
                         if (fBI.preceding(probe) != fBI.preceding(codePointStart)) {
                             throw new MonkeyException("preceding of trailing surrogate error", probe);
                         }
                         continue;
                     }
                 }
                 if (probe != lastBoundary || found >= lastBoundary) {
                     throw new MonkeyException("preceding(i)", probe);
                 }
                 fTestData.fActualBreaks[found] = true;
                 lastBoundary = found;
             }
             checkResults("testPreceding", CheckDirection.REVERSE);
         }


         /**
          * Calls isBoundary(i) for every index from the end of the test string down to 0,
          * records the indices reported as boundaries, and compares them with the
          * expected breaks.
          *
          * @throws MonkeyException labelled "testIsBoundary" if the boundaries differ from
          *         the expected ones.
          */
         void testIsBoundary() {
             fTestData.clearActualBreaks();
             fBI.setText(fTestData.fString);
             for (int i=fTestData.fString.length(); i>=0; --i) {
                 if (fBI.isBoundary(i)) {
                     fTestData.fActualBreaks[i] = true;
                 }
             }
             // Report failures under this test's own name, not "testForwards".
             checkResults("testIsBoundary", CheckDirection.FORWARD);
         }


         /**
          * Compares the expected and actual break flags at every index from 0 through the
          * length of the test string, and fails at the first index where they differ.
          *
          * @param msg       text used as the description of the MonkeyException on a mismatch.
          * @param direction FORWARD to scan from index 0 upwards; anything else scans from the
          *                  end of the string downwards. This decides which mismatch is reported first.
          * @throws MonkeyException at the first mismatching index in scan order.
          */
         void checkResults(String msg, CheckDirection direction) {
             final int lastIndex = fTestData.fString.length();
             final boolean ascending = direction == CheckDirection.FORWARD;
             final int step = ascending ? 1 : -1;
             int idx = ascending ? 0 : lastIndex;
             // lastIndex + 1 positions are checked: the end-of-string position is included.
             for (int remaining = lastIndex + 1; remaining > 0; --remaining, idx += step) {
                 if (fTestData.fExpectedBreaks[idx] != fTestData.fActualBreaks[idx]) {
                     throw new MonkeyException(msg, idx);
                 }
             }
         }

         String                 fRuleCharBuffer;         // source file contents of the reference rules.
         BreakRules             fRuleSet;                // Reference rules, compiled from fRuleCharBuffer by setup().
         RuleBasedBreakIterator fBI;                     // ICU break iterator under test, created from fRuleSet by setup().
                                                         //   run() records an error and exits if this is null.
         MonkeyTestData         fTestData;               // Test string and break data; refilled on each iteration of run().
         ICU_Rand               fRandomGenerator;        // Random source handed to fTestData.set() by run().
         String                 fRuleFileName;           // Rule file name given to setup(); included in failure messages.
         boolean                fVerbose;                 // True to do long dump of failing data.
         int                    fLoopCount;              // Number of iterations for run(); negative means loop forever.
         int                    fErrorCount;             // NOTE(review): run() counts errors in a local variable, not here.

         boolean                fDumpExpansions;          // Debug flag to output expanded form of rules and sets.
         StringBuilder          fErrorMsgs = new StringBuilder();  // Failure messages appended by run(); read by TestMonkey() after join().

     }

     //  Test parameters, specified via Java properties.
     //
     //  rules=file_name   Name of file containing the reference rules.
     //  seed=nnnnn        Random number starting seed.
     //                    Setting the seed allows errors to be reproduced.
     //  loop=nnn          Looping count.  Controls running time.
     //                    -1:  run forever.
     //                     0 or greater:  run length.
     //  expansions        debug option, show expansions of rules and sets.
     //  verbose           Display details of the failure.
     //
     // Parameters are passed to the JVM on the command line, or
     // via the Eclipse Run Configuration settings, arguments tab, VM parameters.
     // For example,
     //      -ea -Drules=line.txt -Dloop=-1
     //
     @Test
     public void TestMonkey() {
         // Rule files tested by default; a "rules" property replaces the whole list.
         String[] ruleFiles = {"grapheme.txt", "word.txt", "line.txt", "line_cj.txt", "sentence.txt", "line_normal.txt",
                 "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt"
         };
         String rulesOverride = getProperty("rules");
         if (rulesOverride != null) {
             ruleFiles = new String[] {rulesOverride};
         }

         final int iterations = getIntProperty("loop", isQuick() ? 100 : 5000);
         final boolean showExpansions =  getBooleanProperty("expansions", false);
         final boolean verboseDump = getBooleanProperty("verbose", false);
         final int initialSeed = getIntProperty("seed", 1);

         // Monkey testing is multi-threaded: each set of break rules gets its own
         // RBBIMonkeyImpl object, which is configured, set up and then started as a thread.
         List<RBBIMonkeyImpl> running = new ArrayList<>();
         for (int idx = 0; idx < ruleFiles.length; idx++) {
             String ruleFile = ruleFiles[idx];
             logln(String.format("beginning testing of %s", ruleFile));

             RBBIMonkeyImpl worker = new RBBIMonkeyImpl();
             worker.fDumpExpansions = showExpansions;
             worker.fVerbose = verboseDump;
             worker.fRandomGenerator = new ICU_Rand(initialSeed);
             worker.fLoopCount = iterations;
             worker.setup(ruleFile);

             worker.start();
             running.add(worker);
         }

         // Wait for every worker in start order and gather whatever failures it reported.
         StringBuilder collected = new StringBuilder();
         for (RBBIMonkeyImpl worker: running) {
             try {
                 worker.join();
                 collected.append(worker.fErrorMsgs);
             } catch (InterruptedException e) {
                 collected.append(e + "\n");
             }
         }

         // The test passes only if no worker reported anything.
         String errorMsgs = collected.toString();
         assertEquals(errorMsgs, "", errorMsgs);
     }


 }