ICU-21212 full range check for Punycode digits
diff --git a/icu4c/source/common/punycode.cpp b/icu4c/source/common/punycode.cpp
index 90fe1ec..8f14a7a 100644
--- a/icu4c/source/common/punycode.cpp
+++ b/icu4c/source/common/punycode.cpp
@@ -107,36 +107,26 @@
}
/**
- * basicToDigit[] contains the numeric value of a basic code
- * point (for use in representing integers) in the range 0 to
- * BASE-1, or -1 if b is does not represent a value.
+ * @return the numeric value of a basic code point (for use in representing integers)
+ * in the range 0 to BASE-1, or a negative value if cp is invalid.
*/
-static const int8_t
-basicToDigit[256]={
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
-
- -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-
- -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
-};
+static int32_t decodeDigit(int32_t cp) { // all bounds are u'' literals: fixed UTF-16 values, independent of the execution charset
+ if(cp<=u'Z') {
+ if(cp<=u'9') {
+ if(cp<u'0') {
+ return -1; // below '0' (including negative cp): not a digit
+ } else {
+ return cp-u'0'+26; // 0..9 -> 26..35
+ }
+ } else {
+ return cp-u'A'; // A-Z -> 0..25; ':'..'@' yield -7..-1, i.e. invalid
+ }
+ } else if(cp<=u'z') {
+ return cp-u'a'; // a..z -> 0..25; '['..'`' yield -6..-1, i.e. invalid
+ } else {
+ return -1; // above 'z' (including all non-ASCII): not a digit
+ }
+}
static inline char
asciiCaseMap(char b, UBool uppercase) {
@@ -455,7 +445,7 @@
return 0;
}
- digit=basicToDigit[(uint8_t)src[in++]];
+ digit=decodeDigit(src[in++]);
if(digit<0) {
*pErrorCode=U_INVALID_CHAR_FOUND;
return 0;
diff --git a/icu4c/source/test/intltest/uts46test.cpp b/icu4c/source/test/intltest/uts46test.cpp
index b399d2d..39ba01d 100644
--- a/icu4c/source/test/intltest/uts46test.cpp
+++ b/icu4c/source/test/intltest/uts46test.cpp
@@ -39,6 +39,7 @@
void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
void TestAPI();
void TestNotSTD3();
+ void TestInvalidPunycodeDigits();
void TestSomeCases();
void IdnaTest();
@@ -82,6 +83,7 @@
TESTCASE_AUTO_BEGIN;
TESTCASE_AUTO(TestAPI);
TESTCASE_AUTO(TestNotSTD3);
+ TESTCASE_AUTO(TestInvalidPunycodeDigits);
TESTCASE_AUTO(TestSomeCases);
TESTCASE_AUTO(IdnaTest);
TESTCASE_AUTO_END;
@@ -245,6 +247,71 @@
}
}
+void UTS46Test::TestInvalidPunycodeDigits() { // ICU-21212: characters outside 0-9/A-Z/a-z in a Punycode label must set UIDNA_ERROR_PUNYCODE
+ IcuTestErrorCode errorCode(*this, "TestInvalidPunycodeDigits()");
+ LocalPointer<IDNA> idna(IDNA::createUTS46Instance(0, errorCode));
+ if(errorCode.isFailure()) {
+ return; // could not create the UTS46 instance
+ }
+ UnicodeString result;
+ {
+ IDNAInfo info;
+ idna->nameToUnicode(u"xn--pleP", result, info, errorCode); // P=U+0050, a valid (uppercase) digit: baseline case
+ assertFalse("nameToUnicode() should succeed",
+ (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+ assertEquals("normal result", u"ᔼᔴ", result);
+ }
+ {
+ IDNAInfo info;
+ idna->nameToUnicode(u"xn--pleѐ", result, info, errorCode); // ends with non-ASCII U+0450; its low byte 0x50 is 'P', so an 8-bit-truncating lookup would accept it
+ assertTrue("nameToUnicode() should detect non-ASCII",
+ (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+ }
+
+ // Test with ASCII characters adjacent to LDH (letters, digits, hyphen).
+ {
+ IDNAInfo info;
+ idna->nameToUnicode(u"xn--ple/", result, info, errorCode); // '/'=U+002F, just below '0'
+ assertTrue("nameToUnicode() should detect '/'",
+ (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+ }
+
+ {
+ IDNAInfo info;
+ idna->nameToUnicode(u"xn--ple:", result, info, errorCode); // ':'=U+003A, just above '9'
+ assertTrue("nameToUnicode() should detect ':'",
+ (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+ }
+
+ {
+ IDNAInfo info;
+ idna->nameToUnicode(u"xn--ple@", result, info, errorCode); // '@'=U+0040, just below 'A'
+ assertTrue("nameToUnicode() should detect '@'",
+ (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+ }
+
+ {
+ IDNAInfo info;
+ idna->nameToUnicode(u"xn--ple[", result, info, errorCode); // '['=U+005B, just above 'Z'
+ assertTrue("nameToUnicode() should detect '['",
+ (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+ }
+
+ {
+ IDNAInfo info;
+ idna->nameToUnicode(u"xn--ple`", result, info, errorCode); // '`'=U+0060, just below 'a'
+ assertTrue("nameToUnicode() should detect '`'",
+ (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+ }
+
+ {
+ IDNAInfo info;
+ idna->nameToUnicode(u"xn--ple{", result, info, errorCode); // '{'=U+007B, just above 'z'
+ assertTrue("nameToUnicode() should detect '{'",
+ (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+ }
+}
+
struct TestCase {
// Input string and options string (Nontransitional/Transitional/Both).
const char *s, *o;
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/Punycode.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/Punycode.java
index 5cdcdb8..7b2395c 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/Punycode.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/Punycode.java
@@ -13,7 +13,7 @@
import com.ibm.icu.text.UTF16;
/**
- * Ported code from ICU punycode.c
+ * Ported code from ICU punycode.cpp
* @author ram
*/
public final class Punycode {
@@ -26,17 +26,17 @@
private static final int DAMP = 700;
private static final int INITIAL_BIAS = 72;
private static final int INITIAL_N = 0x80;
-
+
/* "Basic" Unicode/ASCII code points */
private static final char HYPHEN = 0x2d;
private static final char DELIMITER = HYPHEN;
-
+
private static final int ZERO = 0x30;
//private static final int NINE = 0x39;
-
+
private static final int SMALL_A = 0x61;
private static final int SMALL_Z = 0x7a;
-
+
private static final int CAPITAL_A = 0x41;
private static final int CAPITAL_Z = 0x5a;
@@ -53,39 +53,30 @@
delta/=(BASE-TMIN);
}
- return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
+ return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
}
/**
- * basicToDigit[] contains the numeric value of a basic code
- * point (for use in representing integers) in the range 0 to
- * BASE-1, or -1 if b is does not represent a value.
+ * @return the numeric value of a basic code point (for use in representing integers)
+ * in the range 0 to BASE-1, or a negative value if cp is invalid.
*/
- static final int[] basicToDigit= new int[]{
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
-
- -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-
- -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
- };
+ private static final int decodeDigit(int cp) { // takes the full code unit value; no masking to 8 bits
+ if(cp<='Z') {
+ if(cp<='9') {
+ if(cp<'0') {
+ return -1; // below '0' (including negative cp): not a digit
+ } else {
+ return cp-'0'+26; // 0..9 -> 26..35
+ }
+ } else {
+ return cp-'A'; // A-Z -> 0..25; ':'..'@' yield -7..-1, i.e. invalid
+ }
+ } else if(cp<='z') {
+ return cp-'a'; // a..z -> 0..25; '['..'`' yield -6..-1, i.e. invalid
+ } else {
+ return -1; // above 'z' (including all non-ASCII): not a digit
+ }
+ }
///CLOVER:OFF
private static char asciiCaseMap(char b, boolean uppercase) {
@@ -99,7 +90,7 @@
}
}
return b;
- }
+ }
///CLOVER:ON
/**
* digitToBasic() returns the basic code point whose value
@@ -124,7 +115,7 @@
* Converts Unicode to Punycode.
* The input string must not contain single, unpaired surrogates.
* The output will be represented as an array of ASCII code points.
- *
+ *
* @param src The source of the String Buffer passed.
* @param caseFlags The boolean array of case flags.
* @return An array of ASCII code points.
@@ -140,7 +131,7 @@
* convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
*/
srcCPCount=0;
-
+
for(j=0; j<srcLength; ++j) {
c=src.charAt(j);
if(isBasic(c)) {
@@ -152,7 +143,7 @@
n|=c;
} else if(UTF16.isLeadSurrogate(c) && (j+1)<srcLength && UTF16.isTrailSurrogate(c2=src.charAt(j+1))) {
++j;
-
+
n|=UCharacter.getCodePoint(c, c2);
} else {
/* error: unmatched surrogate */
@@ -211,7 +202,7 @@
/* Represent delta as a generalized variable-length integer: */
for(q=delta, k=BASE; /* no condition */; k+=BASE) {
- /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
+ /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
t=k-bias;
if(t<TMIN) {
@@ -220,7 +211,7 @@
t=TMAX;
}
*/
-
+
t=k-bias;
if(t<TMIN) {
t=TMIN;
@@ -249,7 +240,7 @@
return dest;
}
-
+
private static boolean isBasic(int ch){
return (ch < INITIAL_N);
}
@@ -264,12 +255,12 @@
/**
* Converts Punycode to Unicode.
* The Unicode string will be at most as long as the Punycode string.
- *
+ *
* @param src The source of the string buffer being passed.
* @param caseFlags The array of boolean case flags.
* @return StringBuilder string.
*/
- public static StringBuilder decode(CharSequence src, boolean[] caseFlags)
+ public static StringBuilder decode(CharSequence src, boolean[] caseFlags)
throws StringPrepParseException{
int srcLength = src.length();
StringBuilder dest = new StringBuilder(src.length());
@@ -330,7 +321,7 @@
throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
}
- digit=basicToDigit[src.charAt(in++) & 0xFF];
+ digit=decodeDigit(src.charAt(in++));
if(digit<0) {
throw new StringPrepParseException("Invalid char found", StringPrepParseException.INVALID_CHAR_FOUND);
}
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java
index 0401e16..8ab2d72 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java
@@ -105,9 +105,56 @@
}
}
+ @Test
+ public void TestInvalidPunycodeDigits() { // ICU-21212: characters outside 0-9/A-Z/a-z in a Punycode label must report IDNA.Error.PUNYCODE
+ IDNA idna=IDNA.getUTS46Instance(0);
+ StringBuilder result=new StringBuilder();
+ IDNA.Info info=new IDNA.Info();
+ idna.nameToUnicode("xn--pleP", result, info); // P=U+0050, a valid (uppercase) digit: baseline case
+ assertFalse("nameToUnicode() should succeed",
+ info.getErrors().contains(IDNA.Error.PUNYCODE));
+ assertEquals("normal result", "ᔼᔴ", result.toString());
+
+ info=new IDNA.Info();
+ idna.nameToUnicode("xn--pleѐ", result, info); // ends with non-ASCII U+0450; its low byte 0x50 is 'P', so a lookup masked with 0xFF would accept it
+ assertTrue("nameToUnicode() should detect non-ASCII",
+ info.getErrors().contains(IDNA.Error.PUNYCODE));
+
+ // Test with ASCII characters adjacent to LDH (letters, digits, hyphen).
+ info=new IDNA.Info();
+ idna.nameToUnicode("xn--PLE/", result, info); // '/'=U+002F, just below '0'. NOTE(review): "PLE" is uppercase only in this case -- confirm intentional
+ assertTrue("nameToUnicode() should detect '/'",
+ info.getErrors().contains(IDNA.Error.PUNYCODE));
+
+ info=new IDNA.Info();
+ idna.nameToUnicode("xn--ple:", result, info); // ':'=U+003A, just above '9'
+ assertTrue("nameToUnicode() should detect ':'",
+ info.getErrors().contains(IDNA.Error.PUNYCODE));
+
+ info=new IDNA.Info();
+ idna.nameToUnicode("xn--ple@", result, info); // '@'=U+0040, just below 'A'
+ assertTrue("nameToUnicode() should detect '@'",
+ info.getErrors().contains(IDNA.Error.PUNYCODE));
+
+ info=new IDNA.Info();
+ idna.nameToUnicode("xn--ple[", result, info); // '['=U+005B, just above 'Z'
+ assertTrue("nameToUnicode() should detect '['",
+ info.getErrors().contains(IDNA.Error.PUNYCODE));
+
+ info=new IDNA.Info();
+ idna.nameToUnicode("xn--ple`", result, info); // '`'=U+0060, just below 'a'
+ assertTrue("nameToUnicode() should detect '`'",
+ info.getErrors().contains(IDNA.Error.PUNYCODE));
+
+ info=new IDNA.Info();
+ idna.nameToUnicode("xn--ple{", result, info); // '{'=U+007B, just above 'z'
+ assertTrue("nameToUnicode() should detect '{'",
+ info.getErrors().contains(IDNA.Error.PUNYCODE));
+ }
+
private static final Map<String, IDNA.Error> errorNamesToErrors;
static {
- errorNamesToErrors=new TreeMap<String, IDNA.Error>();
+ errorNamesToErrors=new TreeMap<>();
errorNamesToErrors.put("UIDNA_ERROR_EMPTY_LABEL", IDNA.Error.EMPTY_LABEL);
errorNamesToErrors.put("UIDNA_ERROR_LABEL_TOO_LONG", IDNA.Error.LABEL_TOO_LONG);
errorNamesToErrors.put("UIDNA_ERROR_DOMAIN_NAME_TOO_LONG", IDNA.Error.DOMAIN_NAME_TOO_LONG);