ICU-21030 validate ACE label edge cases

commit: 9279e49d2fdf5e0d554c7de03ef6d5f171dff147 [log] [tgz]
author: Markus Scherer <markus.icu@gmail.com> Fri Aug 14 13:33:34 2020 -0700
committer: Markus Scherer <markus.icu@gmail.com> Fri Aug 14 14:32:47 2020 -0700
tree: 809beab6bc47a087f1e6f67d29f9a88b96488c3d
parent: e19d12997b1ab9599125c77709bfaf3c8c9ec09f [diff]
diff --git a/icu4c/source/common/uts46.cpp b/icu4c/source/common/uts46.cpp
index b9e6cb0..f25b4e1 100644
--- a/icu4c/source/common/uts46.cpp
+++ b/icu4c/source/common/uts46.cpp

@@ -714,6 +714,16 @@
     UBool wasPunycode;
     if(labelLength>=4 && label[0]==0x78 && label[1]==0x6e && label[2]==0x2d && label[3]==0x2d) {
         // Label starts with "xn--", try to un-Punycode it.
+        // In IDNA2008, labels like "xn--" (decodes to an empty string) and
+        // "xn--ASCII-" (decodes to just "ASCII") fail the round-trip validation from
+        // comparing the ToUnicode input with the back-to-ToASCII output.
+        // They are alternate encodings of the respective ASCII labels.
+        // Ignore "xn---" here: It will fail Punycode.decode() which logically comes before
+        // the round-trip verification.
+        if(labelLength==4 || (labelLength>5 && label[labelLength-1]==u'-')) {
+            info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
+            return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
+        }
         wasPunycode=TRUE;
         UChar *unicodeBuffer=fromPunycode.getBuffer(-1);  // capacity==-1: most labels should fit
         if(unicodeBuffer==NULL) {
@@ -925,10 +935,10 @@
     UBool isASCII=TRUE;
     UBool onlyLDH=TRUE;
     const UChar *label=dest.getBuffer()+labelStart;
-    // Ok to cast away const because we own the UnicodeString.
-    UChar *s=(UChar *)label+4;  // After the initial "xn--".
     const UChar *limit=label+labelLength;
-    do {
+    // Start after the initial "xn--".
+    // Ok to cast away const because we own the UnicodeString.
+    for(UChar *s=const_cast<UChar *>(label+4); s<limit; ++s) {
         UChar c=*s;
         if(c<=0x7f) {
             if(c==0x2e) {
@@ -945,7 +955,7 @@
         } else {
             isASCII=onlyLDH=FALSE;
         }
-    } while(++s<limit);
+    }
     if(onlyLDH) {
         dest.insert(labelStart+labelLength, (UChar)0xfffd);
         if(dest.isBogus()) {

diff --git a/icu4c/source/test/intltest/uts46test.cpp b/icu4c/source/test/intltest/uts46test.cpp
index 39ba01d..6700d0b 100644
--- a/icu4c/source/test/intltest/uts46test.cpp
+++ b/icu4c/source/test/intltest/uts46test.cpp

@@ -40,6 +40,7 @@
     void TestAPI();
     void TestNotSTD3();
     void TestInvalidPunycodeDigits();
+    void TestACELabelEdgeCases();
     void TestSomeCases();
     void IdnaTest();
 
@@ -84,6 +85,7 @@
     TESTCASE_AUTO(TestAPI);
     TESTCASE_AUTO(TestNotSTD3);
     TESTCASE_AUTO(TestInvalidPunycodeDigits);
+    TESTCASE_AUTO(TestACELabelEdgeCases);
     TESTCASE_AUTO(TestSomeCases);
     TESTCASE_AUTO(IdnaTest);
     TESTCASE_AUTO_END;
@@ -312,6 +314,35 @@
     }
 }
 
+void UTS46Test::TestACELabelEdgeCases() {
+    // In IDNA2008, these labels fail the round-trip validation from comparing
+    // the ToUnicode input with the back-to-ToASCII output.
+    IcuTestErrorCode errorCode(*this, "TestACELabelEdgeCases()");
+    LocalPointer<IDNA> idna(IDNA::createUTS46Instance(0, errorCode));
+    if(errorCode.isFailure()) {
+        return;
+    }
+    UnicodeString result;
+    {
+        IDNAInfo info;
+        idna->labelToUnicode(u"xn--", result, info, errorCode);
+        assertTrue("empty xn--", (info.getErrors()&UIDNA_ERROR_INVALID_ACE_LABEL)!=0);
+    }
+    {
+        IDNAInfo info;
+        idna->labelToUnicode(u"xN--ASCII-", result, info, errorCode);
+        assertTrue("nothing but ASCII", (info.getErrors()&UIDNA_ERROR_INVALID_ACE_LABEL)!=0);
+    }
+    {
+        // Different error: The Punycode decoding procedure does not consume the last delimiter
+        // if it is right after the xn-- so the main decoding loop fails because the hyphen
+        // is not a valid Punycode digit.
+        IDNAInfo info;
+        idna->labelToUnicode(u"Xn---", result, info, errorCode);
+        assertTrue("empty Xn---", (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+    }
+}
+
 struct TestCase {
     // Input string and options string (Nontransitional/Transitional/Both).
     const char *s, *o;
@@ -558,13 +589,13 @@
       UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|
       UIDNA_ERROR_HYPHEN_3_4 },
     { "a..c", "B", "a..c", UIDNA_ERROR_EMPTY_LABEL },
-    { "a.xn--.c", "B", "a..c", UIDNA_ERROR_EMPTY_LABEL },
+    { "a.xn--.c", "B", "a.xn--\\uFFFD.c", UIDNA_ERROR_INVALID_ACE_LABEL },
     { "a.-b.", "B", "a.-b.", UIDNA_ERROR_LEADING_HYPHEN },
     { "a.b-.c", "B", "a.b-.c", UIDNA_ERROR_TRAILING_HYPHEN },
     { "a.-.c", "B", "a.-.c", UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN },
     { "a.bc--de.f", "B", "a.bc--de.f", UIDNA_ERROR_HYPHEN_3_4 },
     { "\\u00E4.\\u00AD.c", "B", "\\u00E4..c", UIDNA_ERROR_EMPTY_LABEL },
-    { "\\u00E4.xn--.c", "B", "\\u00E4..c", UIDNA_ERROR_EMPTY_LABEL },
+    { "\\u00E4.xn--.c", "B", "\\u00E4.xn--\\uFFFD.c", UIDNA_ERROR_INVALID_ACE_LABEL },
     { "\\u00E4.-b.", "B", "\\u00E4.-b.", UIDNA_ERROR_LEADING_HYPHEN },
     { "\\u00E4.b-.c", "B", "\\u00E4.b-.c", UIDNA_ERROR_TRAILING_HYPHEN },
     { "\\u00E4.-.c", "B", "\\u00E4.-.c", UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN },

diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java
index c830bc3..1e52f8f 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java

@@ -341,6 +341,16 @@
             dest.charAt(labelStart+2)=='-' && dest.charAt(labelStart+3)=='-'
         ) {
             // Label starts with "xn--", try to un-Punycode it.
+            // In IDNA2008, labels like "xn--" (decodes to an empty string) and
+            // "xn--ASCII-" (decodes to just "ASCII") fail the round-trip validation from
+            // comparing the ToUnicode input with the back-to-ToASCII output.
+            // They are alternate encodings of the respective ASCII labels.
+            // Ignore "xn---" here: It will fail Punycode.decode() which logically comes before
+            // the round-trip verification.
+            if(labelLength==4 || (labelLength>5 && dest.charAt(labelStart+labelLength-1)=='-')) {
+                addLabelError(info, Error.INVALID_ACE_LABEL);
+                return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
+            }
             wasPunycode=true;
             try {
                 fromPunycode=Punycode.decode(dest.subSequence(labelStart+4, labelStart+labelLength), null);
@@ -496,9 +506,9 @@
         boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0;
         boolean isASCII=true;
         boolean onlyLDH=true;
-        int i=labelStart+4;  // After the initial "xn--".
         int limit=labelStart+labelLength;
-        do {
+        // Start after the initial "xn--".
+        for(int i=labelStart+4; i<limit; ++i) {
             char c=dest.charAt(i);
             if(c<=0x7f) {
                 if(c=='.') {
@@ -515,7 +525,7 @@
             } else {
                 isASCII=onlyLDH=false;
             }
-        } while(++i<limit);
+        }
         if(onlyLDH) {
             dest.insert(labelStart+labelLength, '\ufffd');
             ++labelLength;

diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java
index 8ab2d72..25ee84a 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java

@@ -152,6 +152,28 @@
                 info.getErrors().contains(IDNA.Error.PUNYCODE));
     }
 
+    @Test
+    public void TestACELabelEdgeCases() {
+        // In IDNA2008, these labels fail the round-trip validation from comparing
+        // the ToUnicode input with the back-to-ToASCII output.
+        IDNA idna=IDNA.getUTS46Instance(0);
+        StringBuilder result=new StringBuilder();
+        IDNA.Info info=new IDNA.Info();
+        idna.labelToUnicode("xn--", result, info);
+        assertTrue("empty xn--", info.getErrors().contains(IDNA.Error.INVALID_ACE_LABEL));
+
+        info=new IDNA.Info();
+        idna.labelToUnicode("xN--ASCII-", result, info);
+        assertTrue("nothing but ASCII", info.getErrors().contains(IDNA.Error.INVALID_ACE_LABEL));
+
+        // Different error: The Punycode decoding procedure does not consume the last delimiter
+        // if it is right after the xn-- so the main decoding loop fails because the hyphen
+        // is not a valid Punycode digit.
+        info=new IDNA.Info();
+        idna.labelToUnicode("Xn---", result, info);
+        assertTrue("empty Xn---", info.getErrors().contains(IDNA.Error.PUNYCODE));
+    }
+
     private static final Map<String, IDNA.Error> errorNamesToErrors;
     static {
         errorNamesToErrors=new TreeMap<>();
@@ -432,13 +454,13 @@
           "UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|"+
           "UIDNA_ERROR_HYPHEN_3_4" },
         { "a..c", "B", "a..c", "UIDNA_ERROR_EMPTY_LABEL" },
-        { "a.xn--.c", "B", "a..c", "UIDNA_ERROR_EMPTY_LABEL" },
+        { "a.xn--.c", "B", "a.xn--\uFFFD.c", "UIDNA_ERROR_INVALID_ACE_LABEL" },
         { "a.-b.", "B", "a.-b.", "UIDNA_ERROR_LEADING_HYPHEN" },
         { "a.b-.c", "B", "a.b-.c", "UIDNA_ERROR_TRAILING_HYPHEN" },
         { "a.-.c", "B", "a.-.c", "UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN" },
         { "a.bc--de.f", "B", "a.bc--de.f", "UIDNA_ERROR_HYPHEN_3_4" },
         { "\u00E4.\u00AD.c", "B", "\u00E4..c", "UIDNA_ERROR_EMPTY_LABEL" },
-        { "\u00E4.xn--.c", "B", "\u00E4..c", "UIDNA_ERROR_EMPTY_LABEL" },
+        { "\u00E4.xn--.c", "B", "\u00E4.xn--\uFFFD.c", "UIDNA_ERROR_INVALID_ACE_LABEL" },
         { "\u00E4.-b.", "B", "\u00E4.-b.", "UIDNA_ERROR_LEADING_HYPHEN" },
         { "\u00E4.b-.c", "B", "\u00E4.b-.c", "UIDNA_ERROR_TRAILING_HYPHEN" },
         { "\u00E4.-.c", "B", "\u00E4.-.c", "UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN" },
commit	9279e49d2fdf5e0d554c7de03ef6d5f171dff147	[log] [tgz]
author	Markus Scherer <markus.icu@gmail.com>	Fri Aug 14 13:33:34 2020 -0700
committer	Markus Scherer <markus.icu@gmail.com>	Fri Aug 14 14:32:47 2020 -0700
tree	809beab6bc47a087f1e6f67d29f9a88b96488c3d
parent	e19d12997b1ab9599125c77709bfaf3c8c9ec09f [diff]