ICU-21648 limit backslash-uhhhh escapes to ASCII hex digits
diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h index d6fa180..dc3e0b2 100644 --- a/icu4c/source/common/unicode/uniset.h +++ b/icu4c/source/common/unicode/uniset.h
@@ -217,9 +217,8 @@ class RuleCharacterIterator; * </tr> * <tr align="top"> * <td nowrap valign="top" align="right"><code>hex := </code></td> - * <td valign="top"><em>any character for which - * </em><code>Character.digit(c, 16)</code><em> - * returns a non-negative result</em></td> + * <td valign="top"><code>'0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' |<br> + * 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'</code></td> * </tr> * <tr> * <td nowrap valign="top" align="right"><code>property := </code></td>
diff --git a/icu4c/source/common/ustring.cpp b/icu4c/source/common/ustring.cpp index d7f63ec..8477256 100644 --- a/icu4c/source/common/ustring.cpp +++ b/icu4c/source/common/ustring.cpp
@@ -1185,23 +1185,23 @@ static const UChar UNESCAPE_MAP[] = { enum { UNESCAPE_MAP_LENGTH = UPRV_LENGTHOF(UNESCAPE_MAP) }; /* Convert one octal digit to a numeric value 0..7, or -1 on failure */ -static int8_t _digit8(UChar c) { - if (c >= 0x0030 && c <= 0x0037) { - return (int8_t)(c - 0x0030); +static int32_t _digit8(UChar c) { + if (c >= u'0' && c <= u'7') { + return c - u'0'; } return -1; } /* Convert one hex digit to a numeric value 0..F, or -1 on failure */ -static int8_t _digit16(UChar c) { - if (c >= 0x0030 && c <= 0x0039) { - return (int8_t)(c - 0x0030); +static int32_t _digit16(UChar c) { + if (c >= u'0' && c <= u'9') { + return c - u'0'; } - if (c >= 0x0041 && c <= 0x0046) { - return (int8_t)(c - (0x0041 - 10)); + if (c >= u'A' && c <= u'F') { + return c - (u'A' - 10); } - if (c >= 0x0061 && c <= 0x0066) { - return (int8_t)(c - (0x0061 - 10)); + if (c >= u'a' && c <= u'f') { + return c - (u'a' - 10); } return -1; } @@ -1216,14 +1216,13 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt, void *context) { int32_t start = *offset; - UChar c; + UChar32 c; UChar32 result = 0; int8_t n = 0; int8_t minDig = 0; int8_t maxDig = 0; int8_t bitsPerDigit = 4; - int8_t dig; - int32_t i; + int32_t dig; UBool braces = FALSE; /* Check that offset is in range */ @@ -1236,15 +1235,15 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt, /* Convert hexadecimal and octal escapes */ switch (c) { - case 0x0075 /*'u'*/: + case u'u': minDig = maxDig = 4; break; - case 0x0055 /*'U'*/: + case u'U': minDig = maxDig = 8; break; - case 0x0078 /*'x'*/: + case u'x': minDig = 1; - if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) { + if (*offset < length && charAt(*offset, context) == u'{') { ++(*offset); braces = TRUE; maxDig = 8; @@ -1266,7 +1265,7 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt, if (minDig != 0) { while (*offset < length && n < maxDig) { c = charAt(*offset, context); - dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c)); + dig = (bitsPerDigit == 3) ? _digit8(c) : _digit16(c); if (dig < 0) { break; } @@ -1278,7 +1277,7 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt, goto err; } if (braces) { - if (c != 0x7D /*}*/) { + if (c != u'}') { goto err; } ++(*offset); @@ -1293,16 +1292,15 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt, if (*offset < length && U16_IS_LEAD(result)) { int32_t ahead = *offset + 1; c = charAt(*offset, context); - if (c == 0x5C /*'\\'*/ && ahead < length) { - // Calling u_unescapeAt recursively may cause a stack overflow if - // we have repeated surrogate lead after that. Limit the - // length to 5 ('u' and 4 hex) after ahead. - int32_t tailLimit = ahead + 5; + if (c == u'\\' && ahead < length) { + // Calling ourselves recursively may cause a stack overflow if + // we have repeated escaped lead surrogates. + // Limit the length to 11 ("x{0000DFFF}") after ahead. + int32_t tailLimit = ahead + 11; if (tailLimit > length) { tailLimit = length; } - c = (UChar) u_unescapeAt(charAt, &ahead, tailLimit, - context); + c = u_unescapeAt(charAt, &ahead, tailLimit, context); } if (U16_IS_TRAIL(c)) { *offset = ahead; @@ -1313,7 +1311,7 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt, } /* Convert C-style escapes in table */ - for (i=0; i<UNESCAPE_MAP_LENGTH; i+=2) { + for (int32_t i=0; i<UNESCAPE_MAP_LENGTH; i+=2) { if (c == UNESCAPE_MAP[i]) { return UNESCAPE_MAP[i+1]; } else if (c < UNESCAPE_MAP[i]) { @@ -1322,13 +1320,13 @@ u_unescapeAt(UNESCAPE_CHAR_AT charAt, } /* Map \cX to control-X: X & 0x1F */ - if (c == 0x0063 /*'c'*/ && *offset < length) { + if (c == u'c' && *offset < length) { c = charAt((*offset)++, context); if (U16_IS_LEAD(c) && *offset < length) { UChar c2 = charAt(*offset, context); if (U16_IS_TRAIL(c2)) { ++(*offset); - c = (UChar) U16_GET_SUPPLEMENTARY(c, c2); /* [sic] */ + c = U16_GET_SUPPLEMENTARY(c, c2); } } return 0x1F & c;
diff --git a/icu4c/source/test/intltest/ustrtest.cpp b/icu4c/source/test/intltest/ustrtest.cpp index 560f72a..b3af657 100644 --- a/icu4c/source/test/intltest/ustrtest.cpp +++ b/icu4c/source/test/intltest/ustrtest.cpp
@@ -1345,6 +1345,12 @@ void UnicodeStringTest::TestUnescape(void) { if (!UNICODE_STRING("wrong \\u sequence", 17).unescape().isEmpty()) { errln("FAIL: unescaping of a string with an illegal escape sequence did not return an empty string"); } + + // ICU-21648 limit backslash-uhhhh escapes to ASCII hex digits + UnicodeString euro = UnicodeString(u"\\u20aC").unescape(); + assertEquals("ASCII Euro", u"€", euro); + UnicodeString nonASCIIEuro = UnicodeString(u"\\u୨෦aC").unescape(); + assertTrue("unescape() accepted non-ASCII digits", nonASCIIEuro.isEmpty()); } /* test code point counting functions --------------------------------------- */
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java index 59e44d6..5308cad 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/Utility.java
@@ -777,6 +777,28 @@ public static final String escape(String s) { /*v*/ 0x76, 0x0b }; + /* Convert one octal digit to a numeric value 0..7, or -1 on failure */ + private static final int _digit8(int c) { + if (c >= '0' && c <= '7') { + return c - '0'; + } + return -1; + } + + /* Convert one hex digit to a numeric value 0..F, or -1 on failure */ + private static final int _digit16(int c) { + if (c >= '0' && c <= '9') { + return c - '0'; + } + if (c >= 'A' && c <= 'F') { + return c - ('A' - 10); + } + if (c >= 'a' && c <= 'f') { + return c - ('a' - 10); + } + return -1; + } + /** * Converts an escape to a code point value. We attempt * to parallel the icu4c unescapeAt() function. @@ -788,26 +810,26 @@ public static final String escape(String s) { * @return the code point and length, or -1 on error. */ public static int unescapeAndLengthAt(CharSequence s, int offset) { - int c; + return unescapeAndLengthAt(s, offset, s.length()); + } + + private static int unescapeAndLengthAt(CharSequence s, int offset, int length) { int result = 0; int n = 0; int minDig = 0; int maxDig = 0; int bitsPerDigit = 4; int dig; - int i; boolean braces = false; /* Check that offset is in range */ - int length = s.length(); if (offset < 0 || offset >= length) { return -1; } int start = offset; /* Fetch first UChar after '\\' */ - c = Character.codePointAt(s, offset); - offset += UTF16.getCharCount(c); + int c = s.charAt(offset++); /* Convert hexadecimal and octal escapes */ switch (c) { @@ -819,7 +841,7 @@ public static int unescapeAndLengthAt(CharSequence s, int offset) { break; case 'x': minDig = 1; - if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) { + if (offset < length && s.charAt(offset) == '{') { ++offset; braces = true; maxDig = 8; @@ -828,7 +850,7 @@ public static int unescapeAndLengthAt(CharSequence s, int offset) { } break; default: - dig = UCharacter.digit(c, 8); + dig = _digit8(c); if (dig >= 0) { minDig = 1; maxDig = 3; @@ -840,20 +862,20 @@ public static int unescapeAndLengthAt(CharSequence s, int offset) { } if (minDig != 0) { while (offset < length && n < maxDig) { - c = UTF16.charAt(s, offset); - dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16); + c = s.charAt(offset); + dig = (bitsPerDigit == 3) ? _digit8(c) : _digit16(c); if (dig < 0) { break; } result = (result << bitsPerDigit) | dig; - offset += UTF16.getCharCount(c); + ++offset; ++n; } if (n < minDig) { return -1; } if (braces) { - if (c != 0x7D /*}*/) { + if (c != '}') { return -1; } ++offset; @@ -867,9 +889,16 @@ public static int unescapeAndLengthAt(CharSequence s, int offset) { // supplementary. if (offset < length && UTF16.isLeadSurrogate(result)) { int ahead = offset+1; - c = s.charAt(offset); // [sic] get 16-bit code unit + c = s.charAt(offset); if (c == '\\' && ahead < length) { - int cpAndLength = unescapeAndLengthAt(s, ahead); + // Calling ourselves recursively may cause a stack overflow if + // we have repeated escaped lead surrogates. + // Limit the length to 11 ("x{0000DFFF}") after ahead. + int tailLimit = ahead + 11; + if (tailLimit > length) { + tailLimit = length; + } + int cpAndLength = unescapeAndLengthAt(s, ahead, tailLimit); if (cpAndLength >= 0) { c = cpAndLength >> 8; ahead += cpAndLength & 0xff; @@ -877,14 +906,14 @@ public static int unescapeAndLengthAt(CharSequence s, int offset) { } if (UTF16.isTrailSurrogate(c)) { offset = ahead; - result = Character.toCodePoint((char) result, (char) c); + result = UCharacter.toCodePoint(result, c); } } return codePointAndLength(result, start, offset); } /* Convert C-style escapes in table */ - for (i=0; i<UNESCAPE_MAP.length; i+=2) { + for (int i=0; i<UNESCAPE_MAP.length; i+=2) { if (c == UNESCAPE_MAP[i]) { return codePointAndLength(UNESCAPE_MAP[i+1], start, offset); } else if (c < UNESCAPE_MAP[i]) { @@ -894,12 +923,20 @@ public static int unescapeAndLengthAt(CharSequence s, int offset) { /* Map \cX to control-X: X & 0x1F */ if (c == 'c' && offset < length) { - c = UTF16.charAt(s, offset); - return codePointAndLength(c & 0x1F, start, offset + UTF16.getCharCount(c)); + c = Character.codePointAt(s, offset); + return codePointAndLength(c & 0x1F, start, offset + Character.charCount(c)); } /* If no special forms are recognized, then consider - * the backslash to generically escape the next character. */ + * the backslash to generically escape the next character. + * Deal with surrogate pairs. */ + if (UTF16.isLeadSurrogate(c) && offset < length) { + int c2 = s.charAt(offset); + if (UTF16.isTrailSurrogate(c2)) { + ++offset; + c = UCharacter.toCodePoint(c, c2); + } + } return codePointAndLength(c, start, offset); }
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java index 9126987..cda5bf1 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java
@@ -225,9 +225,8 @@ * </tr> * <tr style="vertical-align: top"> * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>hex := </code></td> - * <td style="vertical-align: top;"><em>any character for which - * </em><code>Character.digit(c, 16)</code><em> - * returns a non-negative result</em></td> + * <td style="vertical-align: top;"><code>'0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' |<br> + * 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'</code></td> * </tr> * <tr> * <td style="white-space: nowrap; vertical-align: top;" align="right"><code>property := </code></td>
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java index 15d1e60..ddc8b0d 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/UtilityTest.java
@@ -63,6 +63,17 @@ public void TestUnescape() { assertTrue(pattern + " contains U+1DA8B", set.contains(0x1DA8B)); assertTrue(pattern + " contains U+1DF00..U+1DF1E", set.contains(0x1DF00, 0x1DF1E)); assertFalse(pattern + " contains U+1DF1F", set.contains(0x1DF1F)); + + // ICU-21648 limit backslash-uhhhh escapes to ASCII hex digits + String euro = Utility.unescape("\\u20aC"); + assertEquals("ASCII Euro", "€", euro); + try { + Utility.unescape("\\u୨෦aC"); + fail("unescape() accepted non-ASCII digits"); + } catch(IllegalArgumentException expected) { + } + String euro2 = Utility.unescapeLeniently("\\u20aC\\u୨෦aC"); + assertEquals("lenient", "€\\u୨෦aC", euro2); } @Test