Add std/json quirk_allow_backslash_capital_u
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 3a8bc17..e59aa6d 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -20468,6 +20468,9 @@
uint64_t v_uni4_string = 0;
uint32_t v_uni4_value = 0;
uint32_t v_uni4_high_surrogate = 0;
+ uint8_t v_uni8_ok = 0;
+ uint64_t v_uni8_string = 0;
+ uint32_t v_uni8_value = 0;
uint32_t v_expect = 0;
uint32_t v_expect_after_value = 0;
@@ -20864,6 +20867,79 @@
(((uint64_t)(6)) << WUFFS_BASE__TOKEN__LENGTH__SHIFT));
goto label__string_loop_outer__continue;
}
+ } else if ((v_c == 85) &&
+ self->private_impl
+ .f_quirk_enabled_allow_backslash_capital_u) {
+ if (((uint64_t)(io2_a_src - iop_a_src)) < 10) {
+ if (a_src && a_src->meta.closed) {
+ status = wuffs_base__make_status(
+ wuffs_json__error__bad_backslash_escape);
+ goto exit;
+ }
+ status = wuffs_base__make_status(
+ wuffs_base__suspension__short_read);
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(9);
+ v_string_length = 0;
+ v_char = 0;
+ goto label__string_loop_outer__continue;
+ }
+ v_uni8_string =
+ wuffs_base__load_u64le__no_bounds_check(iop_a_src + 2);
+ v_uni8_value = 0;
+ v_uni8_ok = 128;
+ v_c = wuffs_json__lut_hexadecimal_digits[(
+ 255 & (v_uni8_string >> 0))];
+ v_uni8_ok &= v_c;
+ v_uni8_value |= (((uint32_t)((v_c & 15))) << 28);
+ v_c = wuffs_json__lut_hexadecimal_digits[(
+ 255 & (v_uni8_string >> 8))];
+ v_uni8_ok &= v_c;
+ v_uni8_value |= (((uint32_t)((v_c & 15))) << 24);
+ v_c = wuffs_json__lut_hexadecimal_digits[(
+ 255 & (v_uni8_string >> 16))];
+ v_uni8_ok &= v_c;
+ v_uni8_value |= (((uint32_t)((v_c & 15))) << 20);
+ v_c = wuffs_json__lut_hexadecimal_digits[(
+ 255 & (v_uni8_string >> 24))];
+ v_uni8_ok &= v_c;
+ v_uni8_value |= (((uint32_t)((v_c & 15))) << 16);
+ v_c = wuffs_json__lut_hexadecimal_digits[(
+ 255 & (v_uni8_string >> 32))];
+ v_uni8_ok &= v_c;
+ v_uni8_value |= (((uint32_t)((v_c & 15))) << 12);
+ v_c = wuffs_json__lut_hexadecimal_digits[(
+ 255 & (v_uni8_string >> 40))];
+ v_uni8_ok &= v_c;
+ v_uni8_value |= (((uint32_t)((v_c & 15))) << 8);
+ v_c = wuffs_json__lut_hexadecimal_digits[(
+ 255 & (v_uni8_string >> 48))];
+ v_uni8_ok &= v_c;
+ v_uni8_value |= (((uint32_t)((v_c & 15))) << 4);
+ v_c = wuffs_json__lut_hexadecimal_digits[(
+ 255 & (v_uni8_string >> 56))];
+ v_uni8_ok &= v_c;
+ v_uni8_value |= (((uint32_t)((v_c & 15))) << 0);
+ if (v_uni8_ok == 0) {
+ } else if ((v_uni8_value < 55296) ||
+ ((57343 < v_uni8_value) &&
+ (v_uni8_value <= 1114111))) {
+ (iop_a_src += 10, wuffs_base__make_empty_struct());
+ *iop_a_dst++ = wuffs_base__make_token(
+ (((uint64_t)((6291456 | (v_uni8_value & 2097151))))
+ << WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT) |
+ (((uint64_t)(3)) << WUFFS_BASE__TOKEN__LINK__SHIFT) |
+ (((uint64_t)(10)) << WUFFS_BASE__TOKEN__LENGTH__SHIFT));
+ goto label__string_loop_outer__continue;
+ } else if (self->private_impl
+ .f_quirk_enabled_replace_invalid_utf_8) {
+ (iop_a_src += 10, wuffs_base__make_empty_struct()); /* Consume the whole 10-byte backslash-capital-U unit; the token length below must match. */
+ *iop_a_dst++ = wuffs_base__make_token(
+ (((uint64_t)(6356989))
+ << WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT) |
+ (((uint64_t)(3)) << WUFFS_BASE__TOKEN__LINK__SHIFT) |
+ (((uint64_t)(10)) << WUFFS_BASE__TOKEN__LENGTH__SHIFT));
+ goto label__string_loop_outer__continue;
+ }
}
status = wuffs_base__make_status(
wuffs_json__error__bad_backslash_escape);
@@ -20900,7 +20976,7 @@
}
status = wuffs_base__make_status(
wuffs_base__suspension__short_read);
- WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(9);
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(10);
v_string_length = 0;
v_char = 0;
goto label__string_loop_outer__continue;
@@ -20956,7 +21032,7 @@
}
status = wuffs_base__make_status(
wuffs_base__suspension__short_read);
- WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(10);
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(11);
v_string_length = 0;
v_char = 0;
goto label__string_loop_outer__continue;
@@ -21017,7 +21093,7 @@
}
status = wuffs_base__make_status(
wuffs_base__suspension__short_read);
- WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(11);
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(12);
v_string_length = 0;
v_char = 0;
goto label__string_loop_outer__continue;
@@ -21087,13 +21163,13 @@
}
status =
wuffs_base__make_status(wuffs_base__suspension__short_read);
- WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(12);
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(13);
goto label__1__continue;
}
if (((uint64_t)(io2_a_dst - iop_a_dst)) <= 0) {
status =
wuffs_base__make_status(wuffs_base__suspension__short_write);
- WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(13);
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(14);
goto label__1__continue;
}
(iop_a_src += 1, wuffs_base__make_empty_struct());
@@ -21171,11 +21247,11 @@
} else {
status =
wuffs_base__make_status(wuffs_base__suspension__short_read);
- WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(14);
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(15);
while (((uint64_t)(io2_a_dst - iop_a_dst)) <= 0) {
status = wuffs_base__make_status(
wuffs_base__suspension__short_write);
- WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(15);
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(16);
}
}
}
@@ -21309,7 +21385,7 @@
} else if (v_match == 1) {
status =
wuffs_base__make_status(wuffs_base__suspension__short_read);
- WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(16);
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(17);
goto label__outer__continue;
}
} else if (v_class == 10) {
@@ -21330,7 +21406,7 @@
} else if (v_match == 1) {
status =
wuffs_base__make_status(wuffs_base__suspension__short_read);
- WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(17);
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(18);
goto label__outer__continue;
}
} else if (v_class == 11) {
@@ -21351,7 +21427,7 @@
} else if (v_match == 1) {
status =
wuffs_base__make_status(wuffs_base__suspension__short_read);
- WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(18);
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(19);
goto label__outer__continue;
}
}
@@ -21372,7 +21448,7 @@
if (a_src) {
a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr));
}
- WUFFS_BASE__COROUTINE_SUSPENSION_POINT(19);
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT(20);
status =
wuffs_json__decoder__decode_trailing_new_line(self, a_dst, a_src);
if (a_dst) {
diff --git a/std/json/decode_json.wuffs b/std/json/decode_json.wuffs
index 51edc4f..017e6b2 100644
--- a/std/json/decode_json.wuffs
+++ b/std/json/decode_json.wuffs
@@ -122,6 +122,10 @@
var uni4_value : base.u32[..= 0xFFFF]
var uni4_high_surrogate : base.u32[..= 0x10_FC00]
+ var uni8_ok : base.u8
+ var uni8_string : base.u64
+ var uni8_value : base.u32[..= 0xFFFF_FFFF]
+
// expect is a bitmask of what the next character class can be.
//
// expect_after_value is what to expect after seeing a value (a literal,
@@ -483,6 +487,73 @@
continue.string_loop_outer
}
// -------- END backslash-u.
+
+ } else if (c == 0x55) and
+ this.quirk_enabled_allow_backslash_capital_u { // 0x55 is 'U'.
+ // -------- BEGIN backslash-capital-u.
+ if args.src.available() < 10 {
+ if args.src.is_closed() {
+ return "#bad backslash-escape"
+ }
+ yield? base."$short read"
+ string_length = 0
+ char = 0
+ continue.string_loop_outer
+ }
+ uni8_string = args.src.peek_u64le_at(offset: 2)
+ uni8_value = 0
+ uni8_ok = 0x80
+
+ c = lut_hexadecimal_digits[0xFF & (uni8_string >> 0)]
+ uni8_ok &= c
+ uni8_value |= ((c & 0x0F) as base.u32) << 28
+ c = lut_hexadecimal_digits[0xFF & (uni8_string >> 8)]
+ uni8_ok &= c
+ uni8_value |= ((c & 0x0F) as base.u32) << 24
+ c = lut_hexadecimal_digits[0xFF & (uni8_string >> 16)]
+ uni8_ok &= c
+ uni8_value |= ((c & 0x0F) as base.u32) << 20
+ c = lut_hexadecimal_digits[0xFF & (uni8_string >> 24)]
+ uni8_ok &= c
+ uni8_value |= ((c & 0x0F) as base.u32) << 16
+ c = lut_hexadecimal_digits[0xFF & (uni8_string >> 32)]
+ uni8_ok &= c
+ uni8_value |= ((c & 0x0F) as base.u32) << 12
+ c = lut_hexadecimal_digits[0xFF & (uni8_string >> 40)]
+ uni8_ok &= c
+ uni8_value |= ((c & 0x0F) as base.u32) << 8
+ c = lut_hexadecimal_digits[0xFF & (uni8_string >> 48)]
+ uni8_ok &= c
+ uni8_value |= ((c & 0x0F) as base.u32) << 4
+ c = lut_hexadecimal_digits[0xFF & (uni8_string >> 56)]
+ uni8_ok &= c
+ uni8_value |= ((c & 0x0F) as base.u32) << 0
+
+ if uni8_ok == 0 {
+ // It wasn't 8 hexadecimal digits. No-op
+ // (and fall through to "#bad
+ // backslash-escape").
+
+ } else if (uni8_value < 0xD800) or (
+ (0xDFFF < uni8_value) and (uni8_value <= 0x10_FFFF)) {
+ // Not a Unicode surrogate. We're good.
+ args.src.skip32_fast!(actual: 10, worst_case: 10)
+ args.dst.write_fast_token!(
+ value_major: 0,
+ value_minor: 0x60_0000 | (uni8_value & 0x1F_FFFF),
+ link: 0x3,
+ length: 10)
+ continue.string_loop_outer
+ } else if this.quirk_enabled_replace_invalid_utf_8 { // Invalid code point: emit U+FFFD for the whole 10-byte unit.
+ args.src.skip32_fast!(actual: 10, worst_case: 10)
+ args.dst.write_fast_token!(
+ value_major: 0,
+ value_minor: 0x60_FFFD,
+ link: 0x3,
+ length: 10)
+ continue.string_loop_outer
+ }
+ // -------- END backslash-capital-u.
}
return "#bad backslash-escape"
diff --git a/std/json/decode_quirks.wuffs b/std/json/decode_quirks.wuffs
index c354505..4daf310 100644
--- a/std/json/decode_quirks.wuffs
+++ b/std/json/decode_quirks.wuffs
@@ -28,6 +28,8 @@
// When this quirk is enabled, e.g. "abc\U0001F4A9z" is accepted as a JSON
// string, equivalent to "abc\uD83D\uDCA9z", containing the U+0001F4A9 PILE OF
// POO Unicode code point. There are exactly 8 encoded bytes after each "\U".
+//
+// This quirk can combine with quirk_replace_invalid_utf_8.
pub const quirk_allow_backslash_capital_u base.u32 = 0x4909_9400 | 0x01
// When this quirk is enabled, e.g. "abc\ez" is accepted as a JSON string,
@@ -51,7 +53,9 @@
// decoding to 5 bytes: 0x61, 0x62, 0x63, 0xEF and 0x7A. There are exactly 2
// encoded bytes after each "\x".
//
-// Decoded strings are byte strings, no longer guaranteed to be valid UTF-8.
+// Decoded strings are byte strings, no longer guaranteed to be valid UTF-8,
+// and even if the overall byte string is valid UTF-8, tokens are also no
+// longer guaranteed to split on UTF-8 boundaries.
//
// "\x", "\x9", "\x9$" and "\X99" are all still rejected.
pub const quirk_allow_backslash_x base.u32 = 0x4909_9400 | 0x06
@@ -149,10 +153,16 @@
// When this quirk is enabled, invalid UTF-8 inside a JSON string is accepted.
// Each byte of invalid UTF-8 is equivalent to "\uFFFD", the Unicode
-// Replacement Character. Invalid UTF-8 outside a JSON string remains an error.
+// Replacement Character. The UTF-8 encoding of U+FFFD is "\xEF\xBF\xBD".
+//
+// Invalid UTF-8 outside a JSON string remains an error.
//
// Similarly, for backslash-u escapes featuring incorrectly paired Unicode
// surrogates, each backslash-u 6-byte unit is replaced. For example,
// "abc\uDC00z" and "ijk\uD800\uDBFFz" are equivalent to "abc\uFFFDz" and
// "ijk\uFFFD\uFFFDz".
+//
+// When combined with quirk_allow_backslash_capital_u, a "\U12345678" 10-byte
+// unit that is an invalid Unicode code point (i.e. in the range U+D800 ..=
+// U+DFFF or above U+10FFFF) is similarly replaced with U+FFFD.
pub const quirk_replace_invalid_utf_8 base.u32 = 0x4909_9400 | 0x0F
diff --git a/test/c/std/json.c b/test/c/std/json.c
index eb5d018..9de587d 100644
--- a/test/c/std/json.c
+++ b/test/c/std/json.c
@@ -1120,6 +1120,11 @@
.quirk = WUFFS_JSON__QUIRK_ALLOW_BACKSLASH_A,
},
{
+ .want = 0x0001F4A9,
+ .str = "\"\\U0001F4A9\"",
+ .quirk = WUFFS_JSON__QUIRK_ALLOW_BACKSLASH_CAPITAL_U,
+ },
+ {
.want = 0x1B,
.str = "\"\\e\"",
.quirk = WUFFS_JSON__QUIRK_ALLOW_BACKSLASH_E,