ICU-21127 Error when rbbi got unpaired surrogate char See #1520

commit: ce640dc85040b5312b2fb22439853cf3107eb02c [log] [tgz]
author: Frank Tang <ftang@chromium.org> Fri Mar 05 22:25:53 2021 +0000
committer: Frank Yung-Fong Tang <ftang@google.com> Fri Mar 05 15:45:37 2021 -0800
tree: 45e1bf369b6aaf884abd85a8c86e7bd6919fc44e
parent: b1a685a67649152dd729c975c1ab3f09a5dfd33a [diff]
diff --git a/icu4c/source/common/rbbiscan.cpp b/icu4c/source/common/rbbiscan.cpp
index 10b7e9b..45911b1 100644
--- a/icu4c/source/common/rbbiscan.cpp
+++ b/icu4c/source/common/rbbiscan.cpp

@@ -856,6 +856,10 @@
         return (UChar32)-1;
     }
     ch         = fRB->fRules.char32At(fNextIndex);
+    if (U_IS_SURROGATE(ch)) {
+        error(U_ILLEGAL_CHAR_FOUND);
+        return U_SENTINEL;
+    }
     fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1);
 
     if (ch == chCR ||

diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp
index 8e3086b..b02478c 100644
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp

@@ -134,6 +134,7 @@
     TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable);
     TESTCASE_AUTO(TestTable_8_16_Bits);
     TESTCASE_AUTO(TestBug13590);
+    TESTCASE_AUTO(TestUnpairedSurrogate);
 
 #if U_ENABLE_TRACING
     TESTCASE_AUTO(TestTraceCreateCharacter);
@@ -5323,4 +5324,43 @@
 }
 #endif
 
+// ICU-21127: unpaired surrogates in rules must fail with U_ILLEGAL_CHAR_FOUND; valid pairs must still work.
+void RBBITest::TestUnpairedSurrogate() {
+    UnicodeString rules(u"ab;");
+    UErrorCode status = U_ZERO_ERROR;
+    UParseError pe;
+    // Make sure a simple rule without any surrogate works first.
+    RuleBasedBreakIterator bi1(rules, pe, status);
+    assertSuccess(WHERE, status);
+    UnicodeString rtRules = bi1.getRules();
+    assertEquals(WHERE, rules,  rtRules);

+    rules = UnicodeString(u"a\\ud800b;").unescape();
+    pe.line = 0;
+    pe.offset = 0;
+    RuleBasedBreakIterator bi2(rules, pe, status);
+    assertEquals(WHERE "unpaired lead surrogate", U_ILLEGAL_CHAR_FOUND , status);
+    if (pe.line != 1 || pe.offset != 1) {
+        errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
+    }

+    status = U_ZERO_ERROR;
+    rules = UnicodeString(u"a\\ude00b;").unescape();
+    pe.line = 0;
+    pe.offset = 0;
+    RuleBasedBreakIterator bi3(rules, pe, status);
+    assertEquals(WHERE "unpaired tail surrogate", U_ILLEGAL_CHAR_FOUND , status);
+    if (pe.line != 1 || pe.offset != 1) {
+        errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
+    }

+    // Make sure a well-formed surrogate pair works too.
+    status = U_ZERO_ERROR;
+    rules = UnicodeString(u"a😀b;");
+    RuleBasedBreakIterator bi4(rules, pe, status);
+    assertSuccess(WHERE, status);  // report a construction failure directly
+    rtRules = bi4.getRules();
+    assertEquals(WHERE, rules, rtRules);
+}
+
 #endif // #if !UCONFIG_NO_BREAK_ITERATION

diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h
index da14411..754b3e6 100644
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h

@@ -83,6 +83,7 @@
     void TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi);
     void TestBug13692();
     void TestDebugRules();
+    void TestUnpairedSurrogate();
 
     void TestDebug();
     void TestProperties();

diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java
index b086e32..1da2703 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java

@@ -85,6 +85,9 @@
     //    using these simplified the porting, and consolidated the
     //    creation of Java exceptions
     //
+    static final int U_ILLEGAL_CHAR_FOUND = 12;
+    /**< Character conversion: Illegal input sequence/combination of input units. */
+
     static final int U_BRK_ERROR_START = 0x10200;
     /**< Start of codes indicating Break Iterator failures */
 

diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java
index fae2773..c9a8aff 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java

@@ -723,6 +723,9 @@
             return -1;
         }
         ch = UTF16.charAt(fRB.fRules, fNextIndex);
+        if (Character.isBmpCodePoint(ch) && Character.isSurrogate((char)ch)) {
+            error(RBBIRuleBuilder.U_ILLEGAL_CHAR_FOUND);
+        }
         fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1);
 
         if (ch == '\r' ||

diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
index 268f6c0..dae29ad 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java

@@ -905,4 +905,41 @@
         assertEquals("Wrong number of breaks found", 2, breaksFound);
     }
 
+    /* Test that rules containing an unpaired surrogate are rejected,
+     * while rules containing a well-formed surrogate pair are accepted. */
+    @Test
+    public void TestUnpairedSurrogate() {
+        // Make sure a simple rule without any surrogate works first.
+        String rules = "ab;";
+        RuleBasedBreakIterator bi = new RuleBasedBreakIterator(rules);
+        assertEquals("Rules does not match", rules, bi.toString());

+        try {
+            new RuleBasedBreakIterator("a\ud800b;");  // U+D800: high (lead) surrogate with no low surrogate after it
+            fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpair high surrogate.");
+        }
+        catch (IllegalArgumentException e) {
+            // Expected: the constructor rejects the unpaired surrogate.
+        }
+        catch (Exception e) {
+            fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpair high surrogate: " + e);
+        }

+        try {
+            new RuleBasedBreakIterator("a\ude00b;");  // U+DE00: low (trail) surrogate with no high surrogate before it
+            fail("TestUnpairedSurrogate: RuleBasedBreakIterator() failed to throw an exception with unpair low surrogate.");
+        }
+        catch (IllegalArgumentException e) {
+            // Expected: the constructor rejects the unpaired surrogate.
+        }
+        catch (Exception e) {
+            fail("TestUnpairedSurrogate: Unexpected exception while new RuleBasedBreakIterator() with unpair low surrogate: " + e);
+        }


+        // Make sure a well-formed surrogate pair works too.
+        rules = "a😀b;";
+        bi = new RuleBasedBreakIterator(rules);
+        assertEquals("Rules does not match", rules, bi.toString());
+    }
 }
commit	ce640dc85040b5312b2fb22439853cf3107eb02c	[log] [tgz]
author	Frank Tang <ftang@chromium.org>	Fri Mar 05 22:25:53 2021 +0000
committer	Frank Yung-Fong Tang <ftang@google.com>	Fri Mar 05 15:45:37 2021 -0800
tree	45e1bf369b6aaf884abd85a8c86e7bd6919fc44e
parent	b1a685a67649152dd729c975c1ab3f09a5dfd33a [diff]