| // © 2016 and later: Unicode, Inc. and others. |
| // License & terms of use: http://www.unicode.org/copyright.html |
| /******************************************************************** |
| * Copyright (c) 2016, International Business Machines Corporation and |
| * others. All Rights Reserved. |
| ********************************************************************/ |
| |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING |
| |
| #include "rbbimonkeytest.h" |
| #include "unicode/utypes.h" |
| #include "unicode/brkiter.h" |
| #include "unicode/utf16.h" |
| #include "unicode/uniset.h" |
| #include "unicode/unistr.h" |
| |
| #include "charstr.h" |
| #include "cmemory.h" |
| #include "cstr.h" |
| #include "uelement.h" |
| #include "uhash.h" |
| |
| #include <iostream> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string> |
| |
| using namespace icu; |
| |
| |
// Test framework entry point for this test class.
// Saves the raw parameter string in fParams, then uses the TESTCASE_AUTO
// macros (defined outside this file) to list testMonkey as the only test case.
void RBBIMonkeyTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* params) {
    fParams = params;     // Work around TESTCASE_AUTO not being able to pass params to test function.

    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(testMonkey);
    TESTCASE_AUTO_END;
}
| |
| //--------------------------------------------------------------------------------------- |
| // |
| // class BreakRule implementation. |
| // |
| //--------------------------------------------------------------------------------------- |
| |
| BreakRule::BreakRule() // : all field default initialized. |
| { |
| } |
| |
| BreakRule::~BreakRule() {} |
| |
| |
| //--------------------------------------------------------------------------------------- |
| // |
| // class BreakRules implementation. |
| // |
| //--------------------------------------------------------------------------------------- |
| BreakRules::BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status) : |
| fMonkeyImpl(monkeyImpl), fBreakRules(status), fType(UBRK_COUNT) { |
| fCharClasses.adoptInstead(uhash_open(uhash_hashUnicodeString, |
| uhash_compareUnicodeString, |
| NULL, // value comparator. |
| &status)); |
| if (U_FAILURE(status)) { |
| return; |
| } |
| uhash_setKeyDeleter(fCharClasses.getAlias(), uprv_deleteUObject); |
| uhash_setValueDeleter(fCharClasses.getAlias(), uprv_deleteUObject); |
| fBreakRules.setDeleter(uprv_deleteUObject); |
| |
| fCharClassList.adoptInstead(new UVector(status)); |
| |
| fSetRefsMatcher.adoptInstead(new RegexMatcher(UnicodeString( |
| "(?!(?:\\{|=|\\[:)[ \\t]{0,4})" // Negative look behind for '{' or '=' or '[:' |
| // (the identifier is a unicode property name or value) |
| "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"), // The char class name |
| 0, status)); |
| |
| // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules. |
| fCommentsMatcher.adoptInstead(new RegexMatcher(UnicodeString( |
| "(^|(?<=;))" // Start either at start of line, or just after a ';' (look-behind for ';') |
| "[ \\t]*+" // Match white space. |
| "(#.*)?+" // Optional # plus whatever follows |
| "\\R$" // new-line at end of line. |
| ), 0, status)); |
| |
| // Match (initial parse) of a character class definition line. |
| fClassDefMatcher.adoptInstead(new RegexMatcher(UnicodeString( |
| "[ \\t]*" // leading white space |
| "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)" // The char class name |
| "[ \\t]*=[ \\t]*" // = |
| "(?<ClassDef>.*?)" // The char class UnicodeSet expression |
| "[ \\t]*;$"), // ; <end of line> |
| 0, status)); |
| |
| // Match (initial parse) of a break rule line. |
| fRuleDefMatcher.adoptInstead(new RegexMatcher(UnicodeString( |
| "[ \\t]*" // leading white space |
| "(?<RuleName>[A-Za-z_][A-Za-z0-9_.]*)" // The rule name |
| "[ \\t]*:[ \\t]*" // : |
| "(?<RuleDef>.*?)" // The rule definition |
| "[ \\t]*;$"), // ; <end of line> |
| 0, status)); |
| |
| } |
| |
| |
| BreakRules::~BreakRules() {} |
| |
| |
| CharClass *BreakRules::addCharClass(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) { |
| |
| // Create the expanded definition for this char class, |
| // replacing any set references with the corresponding definition. |
| |
| UnicodeString expandedDef; |
| UnicodeString emptyString; |
| fSetRefsMatcher->reset(definition); |
| while (fSetRefsMatcher->find() && U_SUCCESS(status)) { |
| const UnicodeString name = |
| fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status); |
| CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name)); |
| const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name; |
| |
| fSetRefsMatcher->appendReplacement(expandedDef, emptyString, status); |
| expandedDef.append(expansionForName); |
| } |
| fSetRefsMatcher->appendTail(expandedDef); |
| |
| // Verify that the expanded set definition is valid. |
| |
| if (fMonkeyImpl->fDumpExpansions) { |
| printf("epandedDef: %s\n", CStr(expandedDef)()); |
| } |
| |
| LocalPointer<UnicodeSet> s(new UnicodeSet(expandedDef, USET_IGNORE_SPACE, NULL, status), status); |
| if (U_FAILURE(status)) { |
| IntlTest::gTest->errln("%s:%d: error %s creating UnicodeSet %s\n Expanded set definition: %s", |
| __FILE__, __LINE__, u_errorName(status), CStr(name)(), CStr(expandedDef)()); |
| return nullptr; |
| } |
| CharClass *cclass = new CharClass(name, definition, expandedDef, s.orphan()); |
| CharClass *previousClass = static_cast<CharClass *>(uhash_put(fCharClasses.getAlias(), |
| new UnicodeString(name), // Key, owned by hash table. |
| cclass, // Value, owned by hash table. |
| &status)); |
| |
| if (previousClass != NULL) { |
| // Duplicate class def. |
| // These are legitimate, they are adjustments of an existing class. |
| // TODO: will need to keep the old around when we handle tailorings. |
| IntlTest::gTest->logln("Redefinition of character class %s\n", CStr(cclass->fName)()); |
| delete previousClass; |
| } |
| return cclass; |
| } |
| |
| |
| void BreakRules::addRule(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) { |
| LocalPointer<BreakRule> thisRule(new BreakRule); |
| thisRule->fName = name; |
| thisRule->fRule = definition; |
| |
| // If the rule name contains embedded digits, pad the first numeric field to a fixed length with leading zeroes, |
| // This gives a numeric sort order that matches Unicode UAX rule numbering conventions. |
| UnicodeString emptyString; |
| |
| // Expand the char class definitions within the rule. |
| fSetRefsMatcher->reset(definition); |
| while (fSetRefsMatcher->find() && U_SUCCESS(status)) { |
| const UnicodeString name = |
| fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status); |
| CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name)); |
| if (!nameClass) { |
| IntlTest::gTest->errln("%s:%d char class \"%s\" unrecognized in rule \"%s\"", |
| __FILE__, __LINE__, CStr(name)(), CStr(definition)()); |
| } |
| const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name; |
| |
| fSetRefsMatcher->appendReplacement(thisRule->fExpandedRule, emptyString, status); |
| thisRule->fExpandedRule.append(expansionForName); |
| } |
| fSetRefsMatcher->appendTail(thisRule->fExpandedRule); |
| |
| // If rule begins with a '^' rule chaining is disallowed. |
| // Strip off the '^' from the rule expression, and set the flag. |
| if (thisRule->fExpandedRule.charAt(0) == u'^') { |
| thisRule->fInitialMatchOnly = true; |
| thisRule->fExpandedRule.remove(0, 1); |
| thisRule->fExpandedRule.trim(); |
| } |
| |
| // Replace the divide sign (\u00f7) with a regular expression named capture. |
| // When running the rules, a match that includes this group means we found a break position. |
| |
| int32_t dividePos = thisRule->fExpandedRule.indexOf((UChar)0x00f7); |
| if (dividePos >= 0) { |
| thisRule->fExpandedRule.replace(dividePos, 1, UnicodeString("(?<BreakPosition>)")); |
| } |
| if (thisRule->fExpandedRule.indexOf((UChar)0x00f7) != -1) { |
| status = U_ILLEGAL_ARGUMENT_ERROR; // TODO: produce a good error message. |
| } |
| |
| // UAX break rule set definitions can be empty, just []. |
| // Regular expression set expressions don't accept this. Substitute with [^\u0000-\U0010ffff], which |
| // also matches nothing. |
| |
| static const UChar emptySet[] = {(UChar)0x5b, (UChar)0x5d, 0}; |
| int32_t where = 0; |
| while ((where = thisRule->fExpandedRule.indexOf(emptySet, 2, 0)) >= 0) { |
| thisRule->fExpandedRule.replace(where, 2, UnicodeString("[^\\u0000-\\U0010ffff]")); |
| } |
| if (fMonkeyImpl->fDumpExpansions) { |
| printf("fExpandedRule: %s\n", CStr(thisRule->fExpandedRule)()); |
| } |
| |
| // Compile a regular expression for this rule. |
| thisRule->fRuleMatcher.adoptInstead(new RegexMatcher(thisRule->fExpandedRule, UREGEX_COMMENTS | UREGEX_DOTALL, status)); |
| if (U_FAILURE(status)) { |
| IntlTest::gTest->errln("%s:%d Error creating regular expression for %s", |
| __FILE__, __LINE__, CStr(thisRule->fExpandedRule)()); |
| return; |
| } |
| |
| // Put this new rule into the vector of all Rules. |
| fBreakRules.addElement(thisRule.orphan(), status); |
| } |
| |
| |
| bool BreakRules::setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status) { |
| if (keyword == UnicodeString("locale")) { |
| CharString localeName; |
| localeName.append(CStr(value)(), -1, status); |
| fLocale = Locale::createFromName(localeName.data()); |
| return true; |
| } |
| if (keyword == UnicodeString("type")) { |
| if (value == UnicodeString("grapheme")) { |
| fType = UBRK_CHARACTER; |
| } else if (value == UnicodeString("word")) { |
| fType = UBRK_WORD; |
| } else if (value == UnicodeString("line")) { |
| fType = UBRK_LINE; |
| } else if (value == UnicodeString("sentence")) { |
| fType = UBRK_SENTENCE; |
| } else { |
| IntlTest::gTest->errln("%s:%d Unrecognized break type %s", __FILE__, __LINE__, CStr(value)()); |
| } |
| return true; |
| } |
| // TODO: add tailoring base setting here. |
| return false; |
| } |
| |
| RuleBasedBreakIterator *BreakRules::createICUBreakIterator(UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return NULL; |
| } |
| RuleBasedBreakIterator *bi = NULL; |
| switch(fType) { |
| case UBRK_CHARACTER: |
| bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createCharacterInstance(fLocale, status)); |
| break; |
| case UBRK_WORD: |
| bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createWordInstance(fLocale, status)); |
| break; |
| case UBRK_LINE: |
| bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createLineInstance(fLocale, status)); |
| break; |
| case UBRK_SENTENCE: |
| bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createSentenceInstance(fLocale, status)); |
| break; |
| default: |
| IntlTest::gTest->errln("%s:%d Bad break iterator type of %d", __FILE__, __LINE__, fType); |
| status = U_ILLEGAL_ARGUMENT_ERROR; |
| } |
| return bi; |
| } |
| |
| |
| void BreakRules::compileRules(UCHARBUF *rules, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| |
| UnicodeString emptyString; |
| for (int32_t lineNumber=0; ;lineNumber++) { // Loop once per input line. |
| if (U_FAILURE(status)) { |
| return; |
| } |
| int32_t lineLength = 0; |
| const UChar *lineBuf = ucbuf_readline(rules, &lineLength, &status); |
| if (lineBuf == NULL) { |
| break; |
| } |
| UnicodeString line(lineBuf, lineLength); |
| |
| // Strip comment lines. |
| fCommentsMatcher->reset(line); |
| line = fCommentsMatcher->replaceFirst(emptyString, status); |
| if (line.isEmpty()) { |
| continue; |
| } |
| |
| // Recognize character class definition and keyword lines |
| fClassDefMatcher->reset(line); |
| if (fClassDefMatcher->matches(status)) { |
| UnicodeString className = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassName", status), status); |
| UnicodeString classDef = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassDef", status), status); |
| if (fMonkeyImpl->fDumpExpansions) { |
| printf("scanned class: %s = %s\n", CStr(className)(), CStr(classDef)()); |
| } |
| if (setKeywordParameter(className, classDef, status)) { |
| // The scanned item was "type = ..." or "locale = ...", etc. |
| // which are not actual character classes. |
| continue; |
| } |
| addCharClass(className, classDef, status); |
| continue; |
| } |
| |
| // Recognize rule lines. |
| fRuleDefMatcher->reset(line); |
| if (fRuleDefMatcher->matches(status)) { |
| UnicodeString ruleName = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleName", status), status); |
| UnicodeString ruleDef = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleDef", status), status); |
| if (fMonkeyImpl->fDumpExpansions) { |
| printf("scanned rule: %s : %s\n", CStr(ruleName)(), CStr(ruleDef)()); |
| } |
| addRule(ruleName, ruleDef, status); |
| continue; |
| } |
| |
| IntlTest::gTest->errln("%s:%d: Unrecognized line in rule file %s: \"%s\"\n", |
| __FILE__, __LINE__, fMonkeyImpl->fRuleFileName, CStr(line)()); |
| } |
| |
| // Build the vector of char classes, omitting the dictionary class if there is one. |
| // This will be used when constructing the random text to be tested. |
| |
| // Also compute the "other" set, consisting of any characters not included in |
| // one or more of the user defined sets. |
| |
| UnicodeSet otherSet((UChar32)0, 0x10ffff); |
| int32_t pos = UHASH_FIRST; |
| const UHashElement *el = NULL; |
| while ((el = uhash_nextElement(fCharClasses.getAlias(), &pos)) != NULL) { |
| const UnicodeString *ccName = static_cast<const UnicodeString *>(el->key.pointer); |
| CharClass *cclass = static_cast<CharClass *>(el->value.pointer); |
| // printf(" Adding %s\n", CStr(*ccName)()); |
| if (*ccName != cclass->fName) { |
| IntlTest::gTest->errln("%s:%d: internal error, set names (%s, %s) inconsistent.\n", |
| __FILE__, __LINE__, CStr(*ccName)(), CStr(cclass->fName)()); |
| } |
| const UnicodeSet *set = cclass->fSet.getAlias(); |
| otherSet.removeAll(*set); |
| if (*ccName == UnicodeString("dictionary")) { |
| fDictionarySet = *set; |
| } else { |
| fCharClassList->addElement(cclass, status); |
| } |
| } |
| |
| if (!otherSet.isEmpty()) { |
| // fprintf(stderr, "have an other set.\n"); |
| UnicodeString pattern; |
| CharClass *cclass = addCharClass(UnicodeString("__Others"), otherSet.toPattern(pattern), status); |
| fCharClassList->addElement(cclass, status); |
| } |
| } |
| |
| |
| const CharClass *BreakRules::getClassForChar(UChar32 c, int32_t *iter) const { |
| int32_t localIter = 0; |
| int32_t &it = iter? *iter : localIter; |
| |
| while (it < fCharClassList->size()) { |
| const CharClass *cc = static_cast<const CharClass *>(fCharClassList->elementAt(it)); |
| ++it; |
| if (cc->fSet->contains(c)) { |
| return cc; |
| } |
| } |
| return NULL; |
| } |
| |
| //--------------------------------------------------------------------------------------- |
| // |
| // class MonkeyTestData implementation. |
| // |
| //--------------------------------------------------------------------------------------- |
| |
| void MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status) { |
| const int32_t dataLength = 1000; |
| |
| // Fill the test string with random characters. |
| // First randomly pick a char class, then randomly pick a character from that class. |
| // Exclude any characters from the dictionary set. |
| |
| // std::cout << "Populating Test Data" << std::endl; |
| fRandomSeed = rand.getSeed(); // Save initial seed for use in error messages, |
| // allowing recreation of failing data. |
| fBkRules = rules; |
| fString.remove(); |
| for (int32_t n=0; n<dataLength;) { |
| int charClassIndex = rand() % rules->fCharClassList->size(); |
| const CharClass *cclass = static_cast<CharClass *>(rules->fCharClassList->elementAt(charClassIndex)); |
| if (cclass->fSet->size() == 0) { |
| // Some rules or tailorings do end up with empty char classes. |
| continue; |
| } |
| int32_t charIndex = rand() % cclass->fSet->size(); |
| UChar32 c = cclass->fSet->charAt(charIndex); |
| if (U16_IS_TRAIL(c) && fString.length() > 0 && U16_IS_LEAD(fString.charAt(fString.length()-1))) { |
| // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control. |
| // Don't let random unpaired surrogates combine in the test data because they might |
| // produce an unwanted dictionary character. |
| continue; |
| } |
| |
| if (!rules->fDictionarySet.contains(c)) { |
| fString.append(c); |
| ++n; |
| } |
| } |
| |
| // Reset each rule matcher regex with this new string. |
| // (Although we are always using the same string object, ICU regular expressions |
| // don't like the underlying string data changing without doing a reset). |
| |
| for (int32_t ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) { |
| BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum)); |
| rule->fRuleMatcher->reset(fString); |
| } |
| |
| // Init the expectedBreaks, actualBreaks and ruleForPosition strings (used as arrays). |
| // Expected and Actual breaks are one longer than the input string; a non-zero value |
| // will indicate a boundary preceding that position. |
| |
| clearActualBreaks(); |
| fExpectedBreaks = fActualBreaks; |
| fRuleForPosition = fActualBreaks; |
| f2ndRuleForPos = fActualBreaks; |
| |
| // Apply reference rules to find the expected breaks. |
| |
| fExpectedBreaks.setCharAt(0, (UChar)1); // Force an expected break before the start of the text. |
| // ICU always reports a break there. |
| // The reference rules do not have a means to do so. |
| int32_t strIdx = 0; |
| bool initialMatch = true; // True at start of text, and immediately after each boundary, |
| // for control over rule chaining. |
| while (strIdx < fString.length()) { |
| BreakRule *matchingRule = NULL; |
| UBool hasBreak = FALSE; |
| int32_t ruleNum = 0; |
| int32_t matchStart = 0; |
| int32_t matchEnd = 0; |
| int32_t breakGroup = 0; |
| for (ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) { |
| BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum)); |
| if (rule->fInitialMatchOnly && !initialMatch) { |
| // Skip checking this '^' rule. (No rule chaining) |
| continue; |
| } |
| rule->fRuleMatcher->reset(); |
| if (rule->fRuleMatcher->lookingAt(strIdx, status)) { |
| // A candidate rule match, check further to see if we take it or continue to check other rules. |
| // Matches of zero or one codepoint count only if they also specify a break. |
| matchStart = rule->fRuleMatcher->start(status); |
| matchEnd = rule->fRuleMatcher->end(status); |
| breakGroup = rule->fRuleMatcher->pattern().groupNumberFromName("BreakPosition", status); |
| hasBreak = U_SUCCESS(status); |
| if (status == U_REGEX_INVALID_CAPTURE_GROUP_NAME) { |
| status = U_ZERO_ERROR; |
| } |
| if (hasBreak || fString.moveIndex32(matchStart, 1) < matchEnd) { |
| matchingRule = rule; |
| break; |
| } |
| } |
| } |
| if (matchingRule == NULL) { |
| // No reference rule matched. This is an error in the rules that should never happen. |
| IntlTest::gTest->errln("%s:%d Trouble with monkey test reference rules at position %d. ", |
| __FILE__, __LINE__, strIdx); |
| dump(strIdx); |
| status = U_INVALID_FORMAT_ERROR; |
| return; |
| } |
| if (matchingRule->fRuleMatcher->group(status).length() == 0) { |
| // Zero length rule match. This is also an error in the rule expressions. |
| IntlTest::gTest->errln("%s:%d Zero length rule match.", |
| __FILE__, __LINE__); |
| status = U_INVALID_FORMAT_ERROR; |
| return; |
| } |
| |
| // Record which rule matched over the length of the match. |
| for (int i = matchStart; i < matchEnd; i++) { |
| if (fRuleForPosition.charAt(i) == 0) { |
| fRuleForPosition.setCharAt(i, (UChar)ruleNum); |
| } else { |
| f2ndRuleForPos.setCharAt(i, (UChar)ruleNum); |
| } |
| } |
| |
| // Break positions appear in rules as a matching named capture of zero length at the break position, |
| // the adjusted pattern contains (?<BreakPosition>) |
| if (hasBreak) { |
| int32_t breakPos = matchingRule->fRuleMatcher->start(breakGroup, status); |
| if (U_FAILURE(status) || breakPos < 0) { |
| // Rule specified a break, but that break wasn't part of the match, even |
| // though the rule as a whole matched. |
| // Can't happen with regular expressions derived from (equivalent to) ICU break rules. |
| // Shouldn't get here. |
| IntlTest::gTest->errln("%s:%d Internal Rule Error.", __FILE__, __LINE__); |
| status = U_INVALID_FORMAT_ERROR; |
| break; |
| } |
| fExpectedBreaks.setCharAt(breakPos, (UChar)1); |
| // printf("recording break at %d\n", breakPos); |
| // For the next iteration, pick up applying rules immediately after the break, |
| // which may differ from end of the match. The matching rule may have included |
| // context following the boundary that needs to be looked at again. |
| strIdx = matchingRule->fRuleMatcher->end(breakGroup, status); |
| initialMatch = true; |
| } else { |
| // Original rule didn't specify a break. |
| // Continue applying rules starting on the last code point of this match. |
| strIdx = fString.moveIndex32(matchEnd, -1); |
| initialMatch = false; |
| if (strIdx == matchStart) { |
| // Match was only one code point, no progress if we continue. |
| // Shouldn't get here, case is filtered out at top of loop. |
| CharString ruleName; |
| ruleName.appendInvariantChars(matchingRule->fName, status); |
| IntlTest::gTest->errln("%s:%d Rule %s internal error", |
| __FILE__, __LINE__, ruleName.data()); |
| status = U_INVALID_FORMAT_ERROR; |
| break; |
| } |
| } |
| if (U_FAILURE(status)) { |
| IntlTest::gTest->errln("%s:%d status = %s. Unexpected failure, perhaps problem internal to test.", |
| __FILE__, __LINE__, u_errorName(status)); |
| break; |
| } |
| } |
| } |
| |
| void MonkeyTestData::clearActualBreaks() { |
| fActualBreaks.remove(); |
| // Actual Breaks length is one longer than the data string length, allowing |
| // for breaks before the first and after the last character in the data. |
| for (int32_t i=0; i<=fString.length(); i++) { |
| fActualBreaks.append((UChar)0); |
| } |
| } |
| |
| void MonkeyTestData::dump(int32_t around) const { |
| printf("\n" |
| " char break Rule Character\n" |
| " pos code class R I name name\n" |
| "---------------------------------------------------------------------------------------------\n"); |
| |
| int32_t start; |
| int32_t end; |
| |
| if (around == -1) { |
| start = 0; |
| end = fString.length(); |
| } else { |
| // Display context around a failure. |
| start = fString.moveIndex32(around, -30); |
| end = fString.moveIndex32(around, +30); |
| } |
| |
| for (int charIdx = start; charIdx < end; charIdx=fString.moveIndex32(charIdx, 1)) { |
| UErrorCode status = U_ZERO_ERROR; |
| UChar32 c = fString.char32At(charIdx); |
| const CharClass *cc = fBkRules->getClassForChar(c); |
| CharString ccName; |
| ccName.appendInvariantChars(cc->fName, status); |
| CharString ruleName, secondRuleName; |
| const BreakRule *rule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(fRuleForPosition.charAt(charIdx))); |
| ruleName.appendInvariantChars(rule->fName, status); |
| if (f2ndRuleForPos.charAt(charIdx) > 0) { |
| const BreakRule *secondRule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(f2ndRuleForPos.charAt(charIdx))); |
| secondRuleName.appendInvariantChars(secondRule->fName, status); |
| } |
| char cName[200]; |
| u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status); |
| |
| printf(" %4.1d %6.4x %-20s %c %c %-10s %-10s %s\n", |
| charIdx, c, ccName.data(), |
| fExpectedBreaks.charAt(charIdx) ? '*' : '.', |
| fActualBreaks.charAt(charIdx) ? '*' : '.', |
| ruleName.data(), secondRuleName.data(), cName |
| ); |
| } |
| } |
| |
| |
| //--------------------------------------------------------------------------------------- |
| // |
| // class RBBIMonkeyImpl |
| // |
| //--------------------------------------------------------------------------------------- |
| |
| RBBIMonkeyImpl::RBBIMonkeyImpl(UErrorCode &status) : fDumpExpansions(FALSE), fThread(this) { |
| (void)status; // suppress unused parameter compiler warning. |
| } |
| |
| |
| // RBBIMonkeyImpl setup does all of the setup for a single rule set - compiling the |
| // reference rules and creating the icu breakiterator to test, |
| // with its type and locale coming from the reference rules. |
| |
| void RBBIMonkeyImpl::setup(const char *ruleFile, UErrorCode &status) { |
| fRuleFileName = ruleFile; |
| openBreakRules(ruleFile, status); |
| if (U_FAILURE(status)) { |
| IntlTest::gTest->errln("%s:%d Error %s opening file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile); |
| return; |
| } |
| fRuleSet.adoptInstead(new BreakRules(this, status)); |
| fRuleSet->compileRules(fRuleCharBuffer.getAlias(), status); |
| if (U_FAILURE(status)) { |
| IntlTest::gTest->errln("%s:%d Error %s processing file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile); |
| return; |
| } |
| fBI.adoptInstead(fRuleSet->createICUBreakIterator(status)); |
| fTestData.adoptInstead(new MonkeyTestData()); |
| } |
| |
| |
| RBBIMonkeyImpl::~RBBIMonkeyImpl() { |
| } |
| |
| |
| void RBBIMonkeyImpl::openBreakRules(const char *fileName, UErrorCode &status) { |
| CharString path; |
| path.append(IntlTest::getSourceTestData(status), status); |
| path.append("break_rules" U_FILE_SEP_STRING, status); |
| path.appendPathPart(fileName, status); |
| const char *codePage = "UTF-8"; |
| fRuleCharBuffer.adoptInstead(ucbuf_open(path.data(), &codePage, TRUE, FALSE, &status)); |
| } |
| |
| |
// Begin running the test by starting this object's thread, fThread.
void RBBIMonkeyImpl::startTest() {
    fThread.start();   // invokes runTest() in a separate thread.
}
| |
// Join this object's thread, fThread (the one launched by startTest()).
void RBBIMonkeyImpl::join() {
    fThread.join();
}
| |
| |
// Report a failure found while exercising the break iterator.
// Logs msg and index together with the rule file name and random seed needed to
// reproduce the failure, dumps the test data around index when fVerbose is set,
// and sets the local variable 'status' to U_INVALID_STATE_ERROR.
// For use only inside RBBIMonkeyImpl member functions that have a UErrorCode
// variable named 'status' in scope.
#define MONKEY_ERROR(msg, index) UPRV_BLOCK_MACRO_BEGIN { \
    IntlTest::gTest->errln("%s:%d %s at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", \
                    __FILE__, __LINE__, msg, index, fRuleFileName, fTestData->fRandomSeed); \
    if (fVerbose) { fTestData->dump(index); } \
    status = U_INVALID_STATE_ERROR; \
} UPRV_BLOCK_MACRO_END
| |
| void RBBIMonkeyImpl::runTest() { |
| UErrorCode status = U_ZERO_ERROR; |
| int32_t errorCount = 0; |
| for (int64_t loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) { |
| status = U_ZERO_ERROR; |
| fTestData->set(fRuleSet.getAlias(), fRandomGenerator, status); |
| if (fBI.isNull()) { |
| IntlTest::gTest->dataerrln("Unable to run test because fBI is null."); |
| return; |
| } |
| // fTestData->dump(); |
| testForwards(status); |
| testPrevious(status); |
| testFollowing(status); |
| testPreceding(status); |
| testIsBoundary(status); |
| testIsBoundaryRandom(status); |
| |
| if (fLoopCount < 0 && loopCount % 100 == 0) { |
| fprintf(stderr, "."); |
| } |
| if (U_FAILURE(status)) { |
| if (++errorCount > 10) { |
| return; |
| } |
| } |
| } |
| } |
| |
| void RBBIMonkeyImpl::testForwards(UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| fTestData->clearActualBreaks(); |
| fBI->setText(fTestData->fString); |
| int32_t previousBreak = -2; |
| for (int32_t bk=fBI->first(); bk != BreakIterator::DONE; bk=fBI->next()) { |
| if (bk <= previousBreak) { |
| MONKEY_ERROR("Break Iterator Stall", bk); |
| return; |
| } |
| if (bk < 0 || bk > fTestData->fString.length()) { |
| MONKEY_ERROR("Boundary out of bounds", bk); |
| return; |
| } |
| fTestData->fActualBreaks.setCharAt(bk, 1); |
| } |
| checkResults("testForwards", FORWARD, status); |
| } |
| |
| void RBBIMonkeyImpl::testFollowing(UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| fTestData->clearActualBreaks(); |
| fBI->setText(fTestData->fString); |
| int32_t nextBreak = -1; |
| for (int32_t i=-1 ; i<fTestData->fString.length(); ++i) { |
| int32_t bk = fBI->following(i); |
| if (bk == BreakIterator::DONE && i == fTestData->fString.length()) { |
| continue; |
| } |
| if (bk == nextBreak && bk > i) { |
| // i is in the gap between two breaks. |
| continue; |
| } |
| if (i == nextBreak && bk > nextBreak) { |
| fTestData->fActualBreaks.setCharAt(bk, 1); |
| nextBreak = bk; |
| continue; |
| } |
| MONKEY_ERROR("following(i)", i); |
| return; |
| } |
| checkResults("testFollowing", FORWARD, status); |
| } |
| |
| |
| |
| void RBBIMonkeyImpl::testPrevious(UErrorCode &status) { |
| if (U_FAILURE(status)) {return;} |
| |
| fTestData->clearActualBreaks(); |
| fBI->setText(fTestData->fString); |
| int32_t previousBreak = INT32_MAX; |
| for (int32_t bk=fBI->last(); bk != BreakIterator::DONE; bk=fBI->previous()) { |
| if (bk >= previousBreak) { |
| MONKEY_ERROR("Break Iterator Stall", bk); |
| return; |
| } |
| if (bk < 0 || bk > fTestData->fString.length()) { |
| MONKEY_ERROR("Boundary out of bounds", bk); |
| return; |
| } |
| fTestData->fActualBreaks.setCharAt(bk, 1); |
| } |
| checkResults("testPrevius", REVERSE, status); |
| } |
| |
| |
| void RBBIMonkeyImpl::testPreceding(UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| fTestData->clearActualBreaks(); |
| fBI->setText(fTestData->fString); |
| int32_t nextBreak = fTestData->fString.length()+1; |
| for (int32_t i=fTestData->fString.length()+1 ; i>=0; --i) { |
| int32_t bk = fBI->preceding(i); |
| // printf("i:%d bk:%d nextBreak:%d\n", i, bk, nextBreak); |
| if (bk == BreakIterator::DONE && i == 0) { |
| continue; |
| } |
| if (bk == nextBreak && bk < i) { |
| // i is in the gap between two breaks. |
| continue; |
| } |
| if (i<fTestData->fString.length() && fTestData->fString.getChar32Start(i) < i) { |
| // i indexes to a trailing surrogate. |
| // Break Iterators treat an index to either half as referring to the supplemental code point, |
| // with preceding going to some preceding code point. |
| if (fBI->preceding(i) != fBI->preceding(fTestData->fString.getChar32Start(i))) { |
| MONKEY_ERROR("preceding of trailing surrogate error", i); |
| } |
| continue; |
| } |
| if (i == nextBreak && bk < nextBreak) { |
| fTestData->fActualBreaks.setCharAt(bk, 1); |
| nextBreak = bk; |
| continue; |
| } |
| MONKEY_ERROR("preceding(i)", i); |
| return; |
| } |
| checkResults("testPreceding", REVERSE, status); |
| } |
| |
| |
| void RBBIMonkeyImpl::testIsBoundary(UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| fTestData->clearActualBreaks(); |
| fBI->setText(fTestData->fString); |
| for (int i=fTestData->fString.length(); i>=0; --i) { |
| if (fBI->isBoundary(i)) { |
| fTestData->fActualBreaks.setCharAt(i, 1); |
| } |
| } |
| checkResults("testForwards", FORWARD, status); |
| } |
| |
| void RBBIMonkeyImpl::testIsBoundaryRandom(UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| fBI->setText(fTestData->fString); |
| |
| int stringLen = fTestData->fString.length(); |
| for (int i=stringLen; i>=0; --i) { |
| int strIdx = fRandomGenerator() % stringLen; |
| if (fTestData->fExpectedBreaks.charAt(strIdx) != fBI->isBoundary(strIdx)) { |
| IntlTest::gTest->errln("%s:%d testIsBoundaryRandom failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", |
| __FILE__, __LINE__, strIdx, fRuleFileName, fTestData->fRandomSeed); |
| if (fVerbose) { |
| fTestData->dump(i); |
| } |
| status = U_INVALID_STATE_ERROR; |
| break; |
| } |
| } |
| } |
| |
| |
| |
| void RBBIMonkeyImpl::checkResults(const char *msg, CheckDirection direction, UErrorCode &status) { |
| if (U_FAILURE(status)) { |
| return; |
| } |
| if (direction == FORWARD) { |
| for (int i=0; i<=fTestData->fString.length(); ++i) { |
| if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) { |
| IntlTest::gTest->errln("%s:%d %s failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", |
| __FILE__, __LINE__, msg, i, fRuleFileName, fTestData->fRandomSeed); |
| if (fVerbose) { |
| fTestData->dump(i); |
| } |
| status = U_INVALID_STATE_ERROR; // Prevent the test from continuing, which would likely |
| break; // produce many redundant errors. |
| } |
| } |
| } else { |
| for (int i=fTestData->fString.length(); i>=0; i--) { |
| if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) { |
| IntlTest::gTest->errln("%s:%d %s failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", |
| __FILE__, __LINE__, msg, i, fRuleFileName, fTestData->fRandomSeed); |
| if (fVerbose) { |
| fTestData->dump(i); |
| } |
| status = U_INVALID_STATE_ERROR; |
| break; |
| } |
| } |
| } |
| } |
| |
| |
| |
| //--------------------------------------------------------------------------------------- |
| // |
| // class RBBIMonkeyTest implementation. |
| // |
| //--------------------------------------------------------------------------------------- |
| RBBIMonkeyTest::RBBIMonkeyTest() { |
| } |
| |
| RBBIMonkeyTest::~RBBIMonkeyTest() { |
| } |
| |
| |
| // params, taken from this->fParams. |
| // rules=file_name Name of file containing the reference rules. |
| // seed=nnnnn Random number starting seed. |
| // Setting the seed allows errors to be reproduced. |
| // loop=nnn Looping count. Controls running time. |
| // -1: run forever. |
| // 0 or greater: run length. |
| // expansions debug option, show expansions of rules and sets. |
| // verbose Display details of the failure. |
| // |
| // Parameters on the intltest command line follow the test name, and are preceded by '@'. |
| // For example, |
| // intltest rbbi/RBBIMonkeyTest/testMonkey@rules=line.txt,loop=-1 |
| // |
| void RBBIMonkeyTest::testMonkey() { |
| // printf("Test parameters: %s\n", fParams); |
| UnicodeString params(fParams); |
| UErrorCode status = U_ZERO_ERROR; |
| |
| const char *tests[] = {"grapheme.txt", "word.txt", "line.txt", "line_cj.txt", "sentence.txt", "line_normal.txt", |
| "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt", |
| NULL }; |
| CharString testNameFromParams; |
| if (getStringParam("rules", params, testNameFromParams, status)) { |
| tests[0] = testNameFromParams.data(); |
| tests[1] = NULL; |
| } |
| |
| int64_t loopCount = quick? 100 : 5000; |
| getIntParam("loop", params, loopCount, status); |
| |
| UBool dumpExpansions = FALSE; |
| getBoolParam("expansions", params, dumpExpansions, status); |
| |
| UBool verbose = FALSE; |
| getBoolParam("verbose", params, verbose, status); |
| |
| int64_t seed = 0; |
| getIntParam("seed", params, seed, status); |
| |
| if (params.length() != 0) { |
| // Options processing did not consume all of the parameters. Something unrecognized was present. |
| CharString unrecognizedParameters; |
| unrecognizedParameters.append(CStr(params)(), -1, status); |
| errln("%s:%d unrecognized test parameter(s) \"%s\"", __FILE__, __LINE__, unrecognizedParameters.data()); |
| return; |
| } |
| |
| UVector startedTests(status); |
| if (U_FAILURE(status)) { |
| errln("%s:%d: error %s while setting up test.", __FILE__, __LINE__, u_errorName(status)); |
| return; |
| } |
| |
| // Monkey testing is multi-threaded. |
| // Each set of break rules to be tested is run in a separate thread. |
| // Each thread/set of rules gets a separate RBBIMonkeyImpl object. |
| int32_t i; |
| for (i=0; tests[i] != NULL; ++i) { |
| logln("beginning testing of %s", tests[i]); |
| LocalPointer<RBBIMonkeyImpl> test(new RBBIMonkeyImpl(status)); |
| if (U_FAILURE(status)) { |
| dataerrln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]); |
| break; |
| } |
| test->fDumpExpansions = dumpExpansions; |
| test->fVerbose = verbose; |
| test->fRandomGenerator.seed(static_cast<uint32_t>(seed)); |
| test->fLoopCount = static_cast<int32_t>(loopCount); |
| test->setup(tests[i], status); |
| if (U_FAILURE(status)) { |
| dataerrln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]); |
| break; |
| } |
| test->startTest(); |
| startedTests.addElement(test.orphan(), status); |
| if (U_FAILURE(status)) { |
| errln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]); |
| break; |
| } |
| } |
| |
| for (i=0; i<startedTests.size(); ++i) { |
| RBBIMonkeyImpl *test = static_cast<RBBIMonkeyImpl *>(startedTests.elementAt(i)); |
| test->join(); |
| delete test; |
| } |
| } |
| |
| |
| UBool RBBIMonkeyTest::getIntParam(UnicodeString name, UnicodeString ¶ms, int64_t &val, UErrorCode &status) { |
| name.append(" *= *(-?\\d+) *,? *"); |
| RegexMatcher m(name, params, 0, status); |
| if (m.find()) { |
| // The param exists. Convert the string to an int. |
| CharString str; |
| str.append(CStr(m.group(1, status))(), -1, status); |
| val = strtol(str.data(), NULL, 10); |
| |
| // Delete this parameter from the params string. |
| m.reset(); |
| params = m.replaceFirst(UnicodeString(), status); |
| return TRUE; |
| } |
| return FALSE; |
| } |
| |
| UBool RBBIMonkeyTest::getStringParam(UnicodeString name, UnicodeString ¶ms, CharString &dest, UErrorCode &status) { |
| name.append(" *= *([^ ,]*) *,? *"); |
| RegexMatcher m(name, params, 0, status); |
| if (m.find()) { |
| // The param exists. |
| dest.append(CStr(m.group(1, status))(), -1, status); |
| |
| // Delete this parameter from the params string. |
| m.reset(); |
| params = m.replaceFirst(UnicodeString(), status); |
| return TRUE; |
| } |
| return FALSE; |
| } |
| |
| UBool RBBIMonkeyTest::getBoolParam(UnicodeString name, UnicodeString ¶ms, UBool &dest, UErrorCode &status) { |
| name.append("(?: *= *(true|false))? *,? *"); |
| RegexMatcher m(name, params, UREGEX_CASE_INSENSITIVE, status); |
| if (m.find()) { |
| if (m.start(1, status) > 0) { |
| // user option included a value. |
| dest = m.group(1, status).caseCompare(UnicodeString("true"), U_FOLD_CASE_DEFAULT) == 0; |
| } else { |
| // No explicit user value, implies true. |
| dest = TRUE; |
| } |
| |
| // Delete this parameter from the params string. |
| m.reset(); |
| params = m.replaceFirst(UnicodeString(), status); |
| return TRUE; |
| } |
| return FALSE; |
| } |
| |
| #endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */ |