ICU-21071 Fix lenient parse rules
- Check non-lenient rules before call lenint parsing
- Remove logKnownIssue 9503 from test code
- Adjust TestAllLocales test on ICU4C
- Add lenient checks on ICU4J
diff --git a/icu4c/source/i18n/nfrule.cpp b/icu4c/source/i18n/nfrule.cpp
index 3ad0291..4f809f9 100644
--- a/icu4c/source/i18n/nfrule.cpp
+++ b/icu4c/source/i18n/nfrule.cpp
@@ -1297,6 +1297,10 @@
#if !UCONFIG_NO_COLLATION
// go through all this grief if we're in lenient-parse mode
if (formatter->isLenient()) {
+ // Check if non-lenient rule finds the text before call lenient parsing
+ if (str.startsWith(prefix)) {
+ return prefix.length();
+ }
// get the formatter's collator and use it to create two
// collation element iterators, one over the target string
// and another over the prefix (right now, we'll throw an
@@ -1505,9 +1509,15 @@
return str.indexOf(key, startingAt);
}
else {
- // but if lenient parsing is turned ON, we've got some work
- // ahead of us
- return findTextLenient(str, key, startingAt, length);
+ // Check if non-lenient rule finds the text before call lenient parsing
+ *length = key.length();
+ int32_t pos = str.indexOf(key, startingAt);
+ if(pos >= 0) {
+ return pos;
+ } else {
+ // but if lenient parsing is turned ON, we've got some work ahead of us
+ return findTextLenient(str, key, startingAt, length);
+ }
}
}
diff --git a/icu4c/source/i18n/plurfmt.cpp b/icu4c/source/i18n/plurfmt.cpp
index b994376..aac35c5 100644
--- a/icu4c/source/i18n/plurfmt.cpp
+++ b/icu4c/source/i18n/plurfmt.cpp
@@ -549,9 +549,15 @@
UnicodeString currArg = pattern.tempSubString(partStart->getLimit(), partLimit->getIndex() - partStart->getLimit());
if (rbnfLenientScanner != NULL) {
- // If lenient parsing is turned ON, we've got some time consuming parsing ahead of us.
- int32_t length = -1;
- currMatchIndex = rbnfLenientScanner->findTextLenient(source, currArg, startingAt, &length);
+ // Check if non-lenient rule finds the text before call lenient parsing
+ int32_t tempIndex = source.indexOf(currArg, startingAt);
+ if (tempIndex >= 0) {
+ currMatchIndex = tempIndex;
+ } else {
+ // If lenient parsing is turned ON, we've got some time consuming parsing ahead of us.
+ int32_t length = -1;
+ currMatchIndex = rbnfLenientScanner->findTextLenient(source, currArg, startingAt, &length);
+ }
}
else {
currMatchIndex = source.indexOf(currArg, startingAt);
diff --git a/icu4c/source/test/intltest/itrbnf.cpp b/icu4c/source/test/intltest/itrbnf.cpp
index ed4e865..efddaa7 100644
--- a/icu4c/source/test/intltest/itrbnf.cpp
+++ b/icu4c/source/test/intltest/itrbnf.cpp
@@ -1889,16 +1889,19 @@
UErrorCode status = U_ZERO_ERROR;
RuleBasedNumberFormat* f = new RuleBasedNumberFormat((URBNFRuleSetTag)j, *loc, status);
- if (status == U_USING_DEFAULT_WARNING || status == U_USING_FALLBACK_WARNING) {
- // Skip it.
- delete f;
- break;
- }
if (U_FAILURE(status)) {
errln(UnicodeString(loc->getName()) + names[j]
+ "ERROR could not instantiate -> " + u_errorName(status));
continue;
}
+
+ Locale actualLocale = f->getLocale(ULOC_ACTUAL_LOCALE, status);
+ if (actualLocale != *loc) {
+ // Skip the redundancy
+ delete f;
+ break;
+ }
+
#if !UCONFIG_NO_COLLATION
for (unsigned int numidx = 0; numidx < UPRV_LENGTHOF(numbers); numidx++) {
double n = numbers[numidx];
@@ -1936,28 +1939,26 @@
+ UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getDouble());
}
}
- if (!quick && !logKnownIssue("9503") ) {
- // lenient parse
- status = U_ZERO_ERROR;
- f->setLenient(TRUE);
- f->parse(str, num, status);
- if (U_FAILURE(status)) {
+ // lenient parse
+ status = U_ZERO_ERROR;
+ f->setLenient(TRUE);
+ f->parse(str, num, status);
+ if (U_FAILURE(status)) {
+ errln(UnicodeString(loc->getName()) + names[j]
+ + "ERROR could not parse(lenient) '" + str + "' -> " + u_errorName(status));
+ }
+ // We only check the spellout. The behavior is undefined for numbers < 1 and fractional numbers.
+ if (j == 0) {
+ if (num.getType() == Formattable::kLong && num.getLong() != n) {
errln(UnicodeString(loc->getName()) + names[j]
- + "ERROR could not parse(lenient) '" + str + "' -> " + u_errorName(status));
+ + UnicodeString("ERROR could not roundtrip ") + n
+ + UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getLong());
}
- // We only check the spellout. The behavior is undefined for numbers < 1 and fractional numbers.
- if (j == 0) {
- if (num.getType() == Formattable::kLong && num.getLong() != n) {
- errln(UnicodeString(loc->getName()) + names[j]
- + UnicodeString("ERROR could not roundtrip ") + n
- + UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getLong());
- }
- else if (num.getType() == Formattable::kDouble && (int64_t)(num.getDouble() * 1000) != (int64_t)(n*1000)) {
- // The epsilon difference is too high.
- errln(UnicodeString(loc->getName()) + names[j]
- + UnicodeString("ERROR could not roundtrip ") + n
- + UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getDouble());
- }
+ else if (num.getType() == Formattable::kDouble && (int64_t)(num.getDouble() * 1000) != (int64_t)(n*1000)) {
+ // The epsilon difference is too high.
+ errln(UnicodeString(loc->getName()) + names[j]
+ + UnicodeString("ERROR could not roundtrip ") + n
+ + UnicodeString(" -> ") + str + UnicodeString(" -> ") + num.getDouble());
}
}
}
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/NFRule.java b/icu4j/main/classes/core/src/com/ibm/icu/text/NFRule.java
index b54fe8a..4a0c4a8 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/NFRule.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/NFRule.java
@@ -1241,6 +1241,10 @@
RbnfLenientScanner scanner = formatter.getLenientScanner();
if (scanner != null) {
+ // Check if non-lenient rule finds the text before call lenient parsing
+ if (str.startsWith(prefix)) {
+ return prefix.length();
+ }
return scanner.prefixLength(str, prefix);
}
@@ -1290,9 +1294,14 @@
}
if (scanner != null) {
- // if lenient parsing is turned ON, we've got some work
- // ahead of us
- return scanner.findText(str, key, startingAt);
+ // Check if non-lenient rule finds the text before call lenient parsing
+ int pos[] = new int[] { str.indexOf(key, startingAt), key.length() };
+ if (pos[0] >= 0) {
+ return pos;
+ } else {
+ // if lenient parsing is turned ON, we've got some work ahead of us
+ return scanner.findText(str, key, startingAt);
+ }
}
// if lenient parsing is turned off, this is easy. Just call
// String.indexOf() and we're done
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/PluralFormat.java b/icu4j/main/classes/core/src/com/ibm/icu/text/PluralFormat.java
index dee6513..3d73861 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/PluralFormat.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/PluralFormat.java
@@ -760,9 +760,15 @@
String currArg = pattern.substring(partStart.getLimit(), partLimit.getIndex());
if (scanner != null) {
- // If lenient parsing is turned ON, we've got some time consuming parsing ahead of us.
- int[] scannerMatchResult = scanner.findText(source, currArg, startingAt);
- currMatchIndex = scannerMatchResult[0];
+ // Check if non-lenient rule finds the text before call lenient parsing
+ int tempPos = source.indexOf(currArg, startingAt);
+ if (tempPos >= 0) {
+ currMatchIndex = tempPos;
+ } else {
+ // If lenient parsing is turned ON, we've got some time consuming parsing ahead of us.
+ int[] scannerMatchResult = scanner.findText(source, currArg, startingAt);
+ currMatchIndex = scannerMatchResult[0];
+ }
}
else {
currMatchIndex = source.indexOf(currArg, startingAt);
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/RbnfTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/RbnfTest.java
index 0dbe81e..b278649 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/RbnfTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/RbnfTest.java
@@ -296,6 +296,17 @@
};
doTest(formatter, testData, true);
+
+ String[][] testDataLenient = {
+ { "fifty-7", "57" },
+ { " fifty-7", "57" },
+ { " fifty-7", "57" },
+ { "2 thousand six HUNDRED fifty-7", "2,657" },
+ { "fifteen hundred and zero", "1,500" },
+ { "FOurhundred thiRTY six", "436" }
+ };
+
+ doParsingTest(formatter, testDataLenient, true);
}
/**
@@ -350,6 +361,12 @@
};
doTest(formatter, testData, true);
+
+ String[][] testDataLenient = {
+ { "2-51-33", "10,293" },
+ };
+
+ doParsingTest(formatter, testDataLenient, true);
}
/**
@@ -425,6 +442,13 @@
};
doTest(formatter, testData, true);
+
+ String[][] testDataLenient = {
+ { "trente-et-un", "31" },
+ { "un cent quatre vingt dix huit", "198" },
+ };
+
+ doParsingTest(formatter, testDataLenient, true);
}
/**
@@ -529,6 +553,12 @@
};
doTest(formatter, testData, true);
+
+ String[][] testDataLenient = {
+ { "ein Tausend sechs Hundert fuenfunddreissig", "1,635" },
+ };
+
+ doParsingTest(formatter, testDataLenient, true);
}
/**
@@ -1117,6 +1147,10 @@
" (ordinal) "
//" (duration) " // English only
};
+ boolean[] lenientMode = {
+ false, // non-lenient mode
+ true // lenient mode
+ };
double[] numbers = {45.678, 1, 2, 10, 11, 100, 110, 200, 1000, 1111, -1111};
int count = numbers.length;
Random r = (count <= numbers.length ? null : createRandom());
@@ -1142,25 +1176,25 @@
logln(loc.getName() + names[j] + "success format: " + n + " -> " + s);
}
- try {
- // RBNF parse is extremely slow when lenient option is enabled.
- // non-lenient parse
- fmt.setLenientParseMode(false);
- Number num = fmt.parse(s);
- if (isVerbose()) {
- logln(loc.getName() + names[j] + "success parse: " + s + " -> " + num);
+ for (int k = 0; k < lenientMode.length; k++) {
+ try {
+ fmt.setLenientParseMode(lenientMode[k]);
+ Number num = fmt.parse(s);
+ if (isVerbose()) {
+ logln(loc.getName() + names[j] + "success parse: " + s + " -> " + num);
+ }
+ if (j != 0) {
+ // TODO: Fix the ordinal rules.
+ continue;
+ }
+ if (n != num.doubleValue()) {
+ errors.append("\n" + loc + names[j] + "got " + num + " expected " + n);
+ }
+ } catch (ParseException pe) {
+ String msg = loc.getName() + names[j] + "ERROR:" + pe.getMessage();
+ logln(msg);
+ errors.append("\n" + msg);
}
- if (j != 0) {
- // TODO: Fix the ordinal rules.
- continue;
- }
- if (n != num.doubleValue()) {
- errors.append("\n" + loc + names[j] + "got " + num + " expected " + n);
- }
- } catch (ParseException pe) {
- String msg = loc.getName() + names[j] + "ERROR:" + pe.getMessage();
- logln(msg);
- errors.append("\n" + msg);
}
}
}
@@ -1170,10 +1204,12 @@
}
}
- void doTest(RuleBasedNumberFormat formatter, String[][] testData,
- boolean testParsing) {
- // NumberFormat decFmt = NumberFormat.getInstance(Locale.US);
- NumberFormat decFmt = new DecimalFormat("#,###.################");
+ NumberFormat createDecimalFormatter() {
+ return new DecimalFormat("#,###.################");
+ }
+
+ void doTest(RuleBasedNumberFormat formatter, String[][] testData, boolean testParsing) {
+ NumberFormat decFmt = createDecimalFormatter();
try {
for (int i = 0; i < testData.length; i++) {
String number = testData[i][0];
@@ -1207,6 +1243,35 @@
}
}
+ void doParsingTest(RuleBasedNumberFormat formatter, String[][] testData, boolean lenient) {
+ NumberFormat decFmt = createDecimalFormatter();
+
+ if (lenient) {
+ formatter.setLenientParseMode(true);
+ }
+ for (int i = 0; i < testData.length; i++) {
+ try {
+ String s = testData[i][0];
+ Number expectedNumber = decFmt.parse(testData[i][1]);
+ if (isVerbose()) {
+ logln("test[" + i + "] spellout value: (" + s + ") target: " + expectedNumber);
+ }
+
+ Number num = formatter.parse(s);
+ if (isVerbose()) {
+ logln("success parse: (" + s + ") -> " + num);
+ }
+
+ if (expectedNumber.doubleValue() != num.doubleValue()) {
+ errln("\nParsing (" + s + ") failed: got " + num + " expected " + expectedNumber);
+ }
+ } catch (Throwable e) {
+ e.printStackTrace();
+ errln("Test failed with exception: " + e.toString());
+ }
+ }
+ }
+
/* Tests the method
* public boolean equals(Object that)
*/