ICU-21212 full range check for Punycode digits

commit: e19d12997b1ab9599125c77709bfaf3c8c9ec09f [log] [tgz]
author: Markus Scherer <markus.icu@gmail.com> Thu Aug 13 15:35:17 2020 -0700
committer: Markus Scherer <markus.icu@gmail.com> Fri Aug 14 11:37:40 2020 -0700
tree: c34944173811b9f2bdc99f5bb8fbeec729d5db2a
parent: e3f2c0dd70018d924bf22a9b3f0cbf387316b50b [diff]
diff --git a/icu4c/source/common/punycode.cpp b/icu4c/source/common/punycode.cpp
index 90fe1ec..8f14a7a 100644
--- a/icu4c/source/common/punycode.cpp
+++ b/icu4c/source/common/punycode.cpp

@@ -107,36 +107,26 @@
 }
 
 /**
- * basicToDigit[] contains the numeric value of a basic code
- * point (for use in representing integers) in the range 0 to
- * BASE-1, or -1 if b is does not represent a value.
+ * @return the numeric value of a basic code point (for use in representing integers)
+ *         in the range 0 to BASE-1, or a negative value if cp is invalid.
  */
-static const int8_t
-basicToDigit[256]={
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
-
-    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
-    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-
-    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
-    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
-};
+static int32_t decodeDigit(int32_t cp) {
+    if(cp<=u'Z') {
+        if(cp<=u'9') {
+            if(cp<u'0') {
+                return -1;
+            } else {
+                return cp-u'0'+26;  // 0..9 -> 26..35
+            }
+        } else {
+            return cp-u'A';  // A-Z -> 0..25
+        }
+    } else if(cp<=u'z') {
+        return cp-'a';  // a..z -> 0..25
+    } else {
+        return -1;
+    }
+}
 
 static inline char
 asciiCaseMap(char b, UBool uppercase) {
@@ -455,7 +445,7 @@
                 return 0;
             }
 
-            digit=basicToDigit[(uint8_t)src[in++]];
+            digit=decodeDigit(src[in++]);
             if(digit<0) {
                 *pErrorCode=U_INVALID_CHAR_FOUND;
                 return 0;

diff --git a/icu4c/source/test/intltest/uts46test.cpp b/icu4c/source/test/intltest/uts46test.cpp
index b399d2d..39ba01d 100644
--- a/icu4c/source/test/intltest/uts46test.cpp
+++ b/icu4c/source/test/intltest/uts46test.cpp

@@ -39,6 +39,7 @@
     void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
     void TestAPI();
     void TestNotSTD3();
+    void TestInvalidPunycodeDigits();
     void TestSomeCases();
     void IdnaTest();
 
@@ -82,6 +83,7 @@
     TESTCASE_AUTO_BEGIN;
     TESTCASE_AUTO(TestAPI);
     TESTCASE_AUTO(TestNotSTD3);
+    TESTCASE_AUTO(TestInvalidPunycodeDigits);
     TESTCASE_AUTO(TestSomeCases);
     TESTCASE_AUTO(IdnaTest);
     TESTCASE_AUTO_END;
@@ -245,6 +247,71 @@
     }
 }
 
+void UTS46Test::TestInvalidPunycodeDigits() {
+    IcuTestErrorCode errorCode(*this, "TestInvalidPunycodeDigits()");
+    LocalPointer<IDNA> idna(IDNA::createUTS46Instance(0, errorCode));
+    if(errorCode.isFailure()) {
+        return;
+    }
+    UnicodeString result;
+    {
+        IDNAInfo info;
+        idna->nameToUnicode(u"xn--pleP", result, info, errorCode);  // P=U+0050
+        assertFalse("nameToUnicode() should succeed",
+                    (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+        assertEquals("normal result", u"ᔼᔴ", result);
+    }
+    {
+        IDNAInfo info;
+        idna->nameToUnicode(u"xn--pleѐ", result, info, errorCode);  // ends with non-ASCII U+0450
+        assertTrue("nameToUnicode() should detect non-ASCII",
+                   (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+    }
+
+    // Test with ASCII characters adjacent to LDH.
+    {
+        IDNAInfo info;
+        idna->nameToUnicode(u"xn--ple/", result, info, errorCode);
+        assertTrue("nameToUnicode() should detect '/'",
+                   (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+    }
+
+    {
+        IDNAInfo info;
+        idna->nameToUnicode(u"xn--ple:", result, info, errorCode);
+        assertTrue("nameToUnicode() should detect ':'",
+                   (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+    }
+
+    {
+        IDNAInfo info;
+        idna->nameToUnicode(u"xn--ple@", result, info, errorCode);
+        assertTrue("nameToUnicode() should detect '@'",
+                   (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+    }
+
+    {
+        IDNAInfo info;
+        idna->nameToUnicode(u"xn--ple[", result, info, errorCode);
+        assertTrue("nameToUnicode() should detect '['",
+                   (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+    }
+
+    {
+        IDNAInfo info;
+        idna->nameToUnicode(u"xn--ple`", result, info, errorCode);
+        assertTrue("nameToUnicode() should detect '`'",
+                   (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+    }
+
+    {
+        IDNAInfo info;
+        idna->nameToUnicode(u"xn--ple{", result, info, errorCode);
+        assertTrue("nameToUnicode() should detect '{'",
+                   (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+    }
+}
+
 struct TestCase {
     // Input string and options string (Nontransitional/Transitional/Both).
     const char *s, *o;

diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/Punycode.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/Punycode.java
index 5cdcdb8..7b2395c 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/Punycode.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/Punycode.java

@@ -13,7 +13,7 @@
 import com.ibm.icu.text.UTF16;
 
 /**
- * Ported code from ICU punycode.c 
+ * Ported code from ICU punycode.c
  * @author ram
  */
 public final class Punycode {
@@ -26,17 +26,17 @@
     private static final int DAMP           = 700;
     private static final int INITIAL_BIAS   = 72;
     private static final int INITIAL_N      = 0x80;
-    
+
     /* "Basic" Unicode/ASCII code points */
     private static final char HYPHEN        = 0x2d;
     private static final char DELIMITER     = HYPHEN;
-    
+
     private static final int ZERO           = 0x30;
     //private static final int NINE           = 0x39;
-    
+
     private static final int SMALL_A        = 0x61;
     private static final int SMALL_Z        = 0x7a;
-    
+
     private static final int CAPITAL_A      = 0x41;
     private static final int CAPITAL_Z      = 0x5a;
 
@@ -53,39 +53,30 @@
             delta/=(BASE-TMIN);
         }
 
-        return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));         
+        return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
     }
 
     /**
-     * basicToDigit[] contains the numeric value of a basic code
-     * point (for use in representing integers) in the range 0 to
-     * BASE-1, or -1 if b is does not represent a value.
+     * @return the numeric value of a basic code point (for use in representing integers)
+     *         in the range 0 to BASE-1, or a negative value if cp is invalid.
      */
-    static final int[]    basicToDigit= new int[]{
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-        26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
-    
-        -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
-        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-    
-        -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
-        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-    
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
-    };
+    private static final int decodeDigit(int cp) {
+        if(cp<='Z') {
+            if(cp<='9') {
+                if(cp<'0') {
+                    return -1;
+                } else {
+                    return cp-'0'+26;  // 0..9 -> 26..35
+                }
+            } else {
+                return cp-'A';  // A-Z -> 0..25
+            }
+        } else if(cp<='z') {
+            return cp-'a';  // a..z -> 0..25
+        } else {
+            return -1;
+        }
+    }
 
     ///CLOVER:OFF
     private static char asciiCaseMap(char b, boolean uppercase) {
@@ -99,7 +90,7 @@
             }
         }
         return b;
-    }    
+    }
     ///CLOVER:ON
     /**
      * digitToBasic() returns the basic code point whose value
@@ -124,7 +115,7 @@
      * Converts Unicode to Punycode.
      * The input string must not contain single, unpaired surrogates.
      * The output will be represented as an array of ASCII code points.
-     * 
+     *
      * @param src The source of the String Buffer passed.
      * @param caseFlags The boolean array of case flags.
      * @return An array of ASCII code points.
@@ -140,7 +131,7 @@
          * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
          */
         srcCPCount=0;
-        
+
         for(j=0; j<srcLength; ++j) {
             c=src.charAt(j);
             if(isBasic(c)) {
@@ -152,7 +143,7 @@
                     n|=c;
                 } else if(UTF16.isLeadSurrogate(c) && (j+1)<srcLength && UTF16.isTrailSurrogate(c2=src.charAt(j+1))) {
                     ++j;
-                    
+
                     n|=UCharacter.getCodePoint(c, c2);
                 } else {
                     /* error: unmatched surrogate */
@@ -211,7 +202,7 @@
                     /* Represent delta as a generalized variable-length integer: */
                     for(q=delta, k=BASE; /* no condition */; k+=BASE) {
 
-                        /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt   
+                        /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
 
                         t=k-bias;
                         if(t<TMIN) {
@@ -220,7 +211,7 @@
                             t=TMAX;
                         }
                         */
-                    
+
                         t=k-bias;
                         if(t<TMIN) {
                             t=TMIN;
@@ -249,7 +240,7 @@
 
         return dest;
     }
-    
+
     private static boolean isBasic(int ch){
         return (ch < INITIAL_N);
     }
@@ -264,12 +255,12 @@
     /**
      * Converts Punycode to Unicode.
      * The Unicode string will be at most as long as the Punycode string.
-     * 
+     *
      * @param src The source of the string buffer being passed.
      * @param caseFlags The array of boolean case flags.
      * @return StringBuilder string.
      */
-    public static StringBuilder decode(CharSequence src, boolean[] caseFlags) 
+    public static StringBuilder decode(CharSequence src, boolean[] caseFlags)
                                throws StringPrepParseException{
         int srcLength = src.length();
         StringBuilder dest = new StringBuilder(src.length());
@@ -330,7 +321,7 @@
                     throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
                 }
 
-                digit=basicToDigit[src.charAt(in++) & 0xFF];
+                digit=decodeDigit(src.charAt(in++));
                 if(digit<0) {
                     throw new StringPrepParseException("Invalid char found", StringPrepParseException.INVALID_CHAR_FOUND);
                 }

diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java
index 0401e16..8ab2d72 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java

@@ -105,9 +105,56 @@
         }
     }
 
+    @Test
+    public void TestInvalidPunycodeDigits() {
+        IDNA idna=IDNA.getUTS46Instance(0);
+        StringBuilder result=new StringBuilder();
+        IDNA.Info info=new IDNA.Info();
+        idna.nameToUnicode("xn--pleP", result, info);  // P=U+0050
+        assertFalse("nameToUnicode() should succeed",
+                info.getErrors().contains(IDNA.Error.PUNYCODE));
+        assertEquals("normal result", "ᔼᔴ", result.toString());
+
+        info=new IDNA.Info();
+        idna.nameToUnicode("xn--pleѐ", result, info);  // ends with non-ASCII U+0450
+        assertTrue("nameToUnicode() should detect non-ASCII",
+                info.getErrors().contains(IDNA.Error.PUNYCODE));
+
+        // Test with ASCII characters adjacent to LDH.
+        info=new IDNA.Info();
+        idna.nameToUnicode("xn--PLE/", result, info);
+        assertTrue("nameToUnicode() should detect '/'",
+                info.getErrors().contains(IDNA.Error.PUNYCODE));
+
+        info=new IDNA.Info();
+        idna.nameToUnicode("xn--ple:", result, info);
+        assertTrue("nameToUnicode() should detect ':'",
+                info.getErrors().contains(IDNA.Error.PUNYCODE));
+
+        info=new IDNA.Info();
+        idna.nameToUnicode("xn--ple@", result, info);
+        assertTrue("nameToUnicode() should detect '@'",
+                info.getErrors().contains(IDNA.Error.PUNYCODE));
+
+        info=new IDNA.Info();
+        idna.nameToUnicode("xn--ple[", result, info);
+        assertTrue("nameToUnicode() should detect '['",
+                info.getErrors().contains(IDNA.Error.PUNYCODE));
+
+        info=new IDNA.Info();
+        idna.nameToUnicode("xn--ple`", result, info);
+        assertTrue("nameToUnicode() should detect '`'",
+                info.getErrors().contains(IDNA.Error.PUNYCODE));
+
+        info=new IDNA.Info();
+        idna.nameToUnicode("xn--ple{", result, info);
+        assertTrue("nameToUnicode() should detect '{'",
+                info.getErrors().contains(IDNA.Error.PUNYCODE));
+    }
+
     private static final Map<String, IDNA.Error> errorNamesToErrors;
     static {
-        errorNamesToErrors=new TreeMap<String, IDNA.Error>();
+        errorNamesToErrors=new TreeMap<>();
         errorNamesToErrors.put("UIDNA_ERROR_EMPTY_LABEL", IDNA.Error.EMPTY_LABEL);
         errorNamesToErrors.put("UIDNA_ERROR_LABEL_TOO_LONG", IDNA.Error.LABEL_TOO_LONG);
         errorNamesToErrors.put("UIDNA_ERROR_DOMAIN_NAME_TOO_LONG", IDNA.Error.DOMAIN_NAME_TOO_LONG);
commit	e19d12997b1ab9599125c77709bfaf3c8c9ec09f	[log] [tgz]
author	Markus Scherer <markus.icu@gmail.com>	Thu Aug 13 15:35:17 2020 -0700
committer	Markus Scherer <markus.icu@gmail.com>	Fri Aug 14 11:37:40 2020 -0700
tree	c34944173811b9f2bdc99f5bb8fbeec729d5db2a
parent	e3f2c0dd70018d924bf22a9b3f0cbf387316b50b [diff]