ICU-21127 Error when rbbi got unpaired surrogate char
See #1520
diff --git a/icu4c/source/common/rbbiscan.cpp b/icu4c/source/common/rbbiscan.cpp
index 10b7e9b..45911b1 100644
--- a/icu4c/source/common/rbbiscan.cpp
+++ b/icu4c/source/common/rbbiscan.cpp
@@ -856,6 +856,10 @@
return (UChar32)-1;
}
ch = fRB->fRules.char32At(fNextIndex);
+ if (U_IS_SURROGATE(ch)) {
+ error(U_ILLEGAL_CHAR_FOUND);
+ return U_SENTINEL;
+ }
fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1);
if (ch == chCR ||
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp
index 8e3086b..b02478c 100644
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -134,6 +134,7 @@
TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable);
TESTCASE_AUTO(TestTable_8_16_Bits);
TESTCASE_AUTO(TestBug13590);
+ TESTCASE_AUTO(TestUnpairedSurrogate);
#if U_ENABLE_TRACING
TESTCASE_AUTO(TestTraceCreateCharacter);
@@ -5323,4 +5324,43 @@
}
#endif
+void RBBITest::TestUnpairedSurrogate() {
+ UnicodeString rules(u"ab;");
+
+ UErrorCode status = U_ZERO_ERROR;
+ UParseError pe;
+ RuleBasedBreakIterator bi1(rules, pe, status);
+ assertSuccess(WHERE, status);
+ UnicodeString rtRules = bi1.getRules();
+ // make sure the simple one work first.
+ assertEquals(WHERE, rules, rtRules);
+
+
+ rules = UnicodeString(u"a\\ud800b;").unescape();
+ pe.line = 0;
+ pe.offset = 0;
+ RuleBasedBreakIterator bi2(rules, pe, status);
+ assertEquals(WHERE "unpaired lead surrogate", U_ILLEGAL_CHAR_FOUND , status);
+ if (pe.line != 1 || pe.offset != 1) {
+ errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
+ }
+
+ status = U_ZERO_ERROR;
+ rules = UnicodeString(u"a\\ude00b;").unescape();
+ pe.line = 0;
+ pe.offset = 0;
+ RuleBasedBreakIterator bi3(rules, pe, status);
+ assertEquals(WHERE "unpaired tail surrogate", U_ILLEGAL_CHAR_FOUND , status);
+ if (pe.line != 1 || pe.offset != 1) {
+ errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
+ }
+
+ // make sure the surrogate one work too.
+ status = U_ZERO_ERROR;
+ rules = UnicodeString(u"a😀b;");
+ RuleBasedBreakIterator bi4(rules, pe, status);
+ rtRules = bi4.getRules();
+ assertEquals(WHERE, rules, rtRules);
+}
+
#endif // #if !UCONFIG_NO_BREAK_ITERATION
diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h
index da14411..754b3e6 100644
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@@ -83,6 +83,7 @@
void TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi);
void TestBug13692();
void TestDebugRules();
+ void TestUnpairedSurrogate();
void TestDebug();
void TestProperties();
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java
index b086e32..1da2703 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java
@@ -85,6 +85,9 @@
// using these simplified the porting, and consolidated the
// creation of Java exceptions
//
+ static final int U_ILLEGAL_CHAR_FOUND = 12;
+ /**< Character conversion: Illegal input sequence/combination of input units. */
+
static final int U_BRK_ERROR_START = 0x10200;
/**< Start of codes indicating Break Iterator failures */
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java
index fae2773..c9a8aff 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java
@@ -723,6 +723,9 @@
return -1;
}
ch = UTF16.charAt(fRB.fRules, fNextIndex);
+ if (Character.isBmpCodePoint(ch) && Character.isSurrogate((char)ch)) {
+ error(RBBIRuleBuilder.U_ILLEGAL_CHAR_FOUND);
+ }
fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1);
if (ch == '\r' ||
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
index 268f6c0..dae29ad 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
@@ -905,4 +905,41 @@
assertEquals("Wrong number of breaks found", 2, breaksFound);
}
+ /* Test handling of unpair surrogate.
+ */
+ @Test
+ public void TestUnpairedSurrogate() {
+ // make sure the simple one work first.
+ String rules = "ab;";
+ RuleBasedBreakIterator bi = new RuleBasedBreakIterator(rules);
+ assertEquals("Rules does not match", rules, bi.toString());
+
+ try {
+ new RuleBasedBreakIterator("a\ud800b;");
+ fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpair low surrogate.");
+ }
+ catch (IllegalArgumentException e) {
+ // expected exception with unpair surrogate.
+ }
+ catch (Exception e) {
+ fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpair low surrogate: " + e);
+ }
+
+ try {
+ new RuleBasedBreakIterator("a\ude00b;");
+ fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpair high surrogate.");
+ }
+ catch (IllegalArgumentException e) {
+ // expected exception with unpair surrogate.
+ }
+ catch (Exception e) {
+ fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpair high surrogate: " + e);
+ }
+
+
+ // make sure the surrogate one work too.
+ rules = "a😀b;";
+ bi = new RuleBasedBreakIterator(rules);
+ assertEquals("Rules does not match", rules, bi.toString());
+ }
}