Have std/json reject invalid UTF-8
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index ba8c057..77efc36 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -18940,6 +18940,7 @@
uint8_t v_backslash = 0;
uint8_t v_char = 0;
uint8_t v_class = 0;
+ uint32_t v_multi_byte_utf8 = 0;
uint8_t v_uni4_ok = 0;
uint64_t v_uni4_string = 0;
uint32_t v_uni4_value = 0;
@@ -19110,11 +19111,11 @@
v_char = wuffs_json__lut_chars[v_c];
if (v_char == 0) {
(iop_a_src += 1, wuffs_base__make_empty_struct());
- if (v_string_length >= 65534) {
+ if (v_string_length >= 65531) {
*iop_a_dst++ = wuffs_base__make_token(
(((uint64_t)(2097697))
<< WUFFS_BASE__TOKEN__VALUE__SHIFT) |
- (((uint64_t)(65535))
+ (((uint64_t)(65532))
<< WUFFS_BASE__TOKEN__LENGTH__SHIFT));
v_string_length = 0;
goto label__string_loop__continue;
@@ -19281,19 +19282,111 @@
status = wuffs_base__make_status(
wuffs_json__error__bad_backslash_escape);
goto exit;
- } else if (v_char <= 16) {
- (iop_a_src += 1, wuffs_base__make_empty_struct());
- if (v_string_length >= 65534) {
- *iop_a_dst++ = wuffs_base__make_token(
- (((uint64_t)(2097697))
- << WUFFS_BASE__TOKEN__VALUE__SHIFT) |
- (((uint64_t)(65535))
- << WUFFS_BASE__TOKEN__LENGTH__SHIFT));
+ } else if (v_char == 3) {
+ if (((uint64_t)(io2_a_src - iop_a_src)) < 2) {
+ if (a_src && a_src->meta.closed) {
+ status =
+ wuffs_base__make_status(wuffs_json__error__bad_utf_8);
+ goto exit;
+ }
+ status = wuffs_base__make_status(
+ wuffs_base__suspension__short_read);
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(10);
v_string_length = 0;
+ v_char = 0;
goto label__string_loop__continue;
}
- v_string_length += 1;
- goto label__1__continue;
+ v_multi_byte_utf8 =
+ ((uint32_t)(wuffs_base__load_u16le(iop_a_src)));
+ if ((v_multi_byte_utf8 & 49152) == 32768) {
+ v_multi_byte_utf8 = ((1984 & (v_multi_byte_utf8 << 6)) |
+ (63 & (v_multi_byte_utf8 >> 8)));
+ (iop_a_src += 2, wuffs_base__make_empty_struct());
+ if (v_string_length >= 65528) {
+ *iop_a_dst++ = wuffs_base__make_token(
+ (((uint64_t)(2097697))
+ << WUFFS_BASE__TOKEN__VALUE__SHIFT) |
+ (((uint64_t)(((uint64_t)((v_string_length + 2)))))
+ << WUFFS_BASE__TOKEN__LENGTH__SHIFT));
+ v_string_length = 0;
+ goto label__string_loop__continue;
+ }
+ v_string_length += 2;
+ goto label__1__continue;
+ }
+ } else if (v_char == 4) {
+ if (((uint64_t)(io2_a_src - iop_a_src)) < 3) {
+ if (a_src && a_src->meta.closed) {
+ status =
+ wuffs_base__make_status(wuffs_json__error__bad_utf_8);
+ goto exit;
+ }
+ status = wuffs_base__make_status(
+ wuffs_base__suspension__short_read);
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(11);
+ v_string_length = 0;
+ v_char = 0;
+ goto label__string_loop__continue;
+ }
+ v_multi_byte_utf8 =
+ ((uint32_t)(wuffs_base__load_u24le(iop_a_src)));
+ if ((v_multi_byte_utf8 & 12632064) == 8421376) {
+ v_multi_byte_utf8 = ((61440 & (v_multi_byte_utf8 << 12)) |
+ (4032 & (v_multi_byte_utf8 >> 2)) |
+ (63 & (v_multi_byte_utf8 >> 16)));
+ if ((2047 < v_multi_byte_utf8) &&
+ ((v_multi_byte_utf8 < 55296) ||
+ (57343 < v_multi_byte_utf8))) {
+ (iop_a_src += 3, wuffs_base__make_empty_struct());
+ if (v_string_length >= 65528) {
+ *iop_a_dst++ = wuffs_base__make_token(
+ (((uint64_t)(2097697))
+ << WUFFS_BASE__TOKEN__VALUE__SHIFT) |
+ (((uint64_t)(((uint64_t)((v_string_length + 3)))))
+ << WUFFS_BASE__TOKEN__LENGTH__SHIFT));
+ v_string_length = 0;
+ goto label__string_loop__continue;
+ }
+ v_string_length += 3;
+ goto label__1__continue;
+ }
+ }
+ } else if (v_char == 5) {
+ if (((uint64_t)(io2_a_src - iop_a_src)) < 4) {
+ if (a_src && a_src->meta.closed) {
+ status =
+ wuffs_base__make_status(wuffs_json__error__bad_utf_8);
+ goto exit;
+ }
+ status = wuffs_base__make_status(
+ wuffs_base__suspension__short_read);
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(12);
+ v_string_length = 0;
+ v_char = 0;
+ goto label__string_loop__continue;
+ }
+ v_multi_byte_utf8 = wuffs_base__load_u32le(iop_a_src);
+ if ((v_multi_byte_utf8 & 3233857536) == 2155905024) {
+ v_multi_byte_utf8 = ((1835008 & (v_multi_byte_utf8 << 18)) |
+ (258048 & (v_multi_byte_utf8 << 4)) |
+ (4032 & (v_multi_byte_utf8 >> 10)) |
+ (63 & (v_multi_byte_utf8 >> 24)));
+ if ((65535 < v_multi_byte_utf8) &&
+ (v_multi_byte_utf8 <= 1114111)) {
+ (iop_a_src += 4, wuffs_base__make_empty_struct());
+ if (v_string_length >= 65528) {
+ *iop_a_dst++ = wuffs_base__make_token(
+ (((uint64_t)(2097697))
+ << WUFFS_BASE__TOKEN__VALUE__SHIFT) |
+ (((uint64_t)(((uint64_t)((v_string_length + 4)))))
+ << WUFFS_BASE__TOKEN__LENGTH__SHIFT));
+ v_string_length = 0;
+ goto label__string_loop__continue;
+ }
+ v_string_length += 4;
+ goto label__1__continue;
+ }
+ }
}
if (v_string_length > 0) {
*iop_a_dst++ = wuffs_base__make_token(
@@ -19321,13 +19414,13 @@
}
status =
wuffs_base__make_status(wuffs_base__suspension__short_read);
- WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(10);
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(13);
goto label__2__continue;
}
if (((uint64_t)(io2_a_dst - iop_a_dst)) <= 0) {
status =
wuffs_base__make_status(wuffs_base__suspension__short_write);
- WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(11);
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(14);
goto label__2__continue;
}
(iop_a_src += 1, wuffs_base__make_empty_struct());
@@ -19403,11 +19496,11 @@
} else {
status =
wuffs_base__make_status(wuffs_base__suspension__short_read);
- WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(12);
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(15);
while (((uint64_t)(io2_a_dst - iop_a_dst)) <= 0) {
status = wuffs_base__make_status(
wuffs_base__suspension__short_write);
- WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(13);
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(16);
}
}
}
@@ -19532,7 +19625,7 @@
} else if (v_match == 1) {
status =
wuffs_base__make_status(wuffs_base__suspension__short_read);
- WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(14);
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(17);
goto label__outer__continue;
}
} else if (v_class == 10) {
@@ -19552,7 +19645,7 @@
} else if (v_match == 1) {
status =
wuffs_base__make_status(wuffs_base__suspension__short_read);
- WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(15);
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(18);
goto label__outer__continue;
}
} else if (v_class == 11) {
@@ -19572,7 +19665,7 @@
} else if (v_match == 1) {
status =
wuffs_base__make_status(wuffs_base__suspension__short_read);
- WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(16);
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(19);
goto label__outer__continue;
}
}
diff --git a/std/json/decode_json.wuffs b/std/json/decode_json.wuffs
index 64746cf..959589e 100644
--- a/std/json/decode_json.wuffs
+++ b/std/json/decode_json.wuffs
@@ -48,7 +48,7 @@
var token_value : base.u64[..= 0xFF_FFFF]
var number_length : base.u32[..= 0x3FF]
var number_status : base.u32[..= 0x3]
- var string_length : base.u32[..= 0xFFFE]
+ var string_length : base.u32[..= 0xFFFB]
var whitespace_length : base.u32[..= 0xFFFE]
var depth : base.u32[..= 1024]
var stack_byte : base.u32[..= (1024 / 32) - 1]
@@ -58,6 +58,7 @@
var backslash : base.u8
var char : base.u8
var class : base.u8[..= 0x0F]
+ var multi_byte_utf8 : base.u32
var uni4_ok : base.u8
var uni4_string : base.u64
@@ -208,10 +209,10 @@
if char == 0x00 { // Non-special ASCII.
args.src.skip32_fast!(actual: 1, worst_case: 1)
- if string_length >= 0xFFFE {
+ if string_length >= 0xFFFB {
args.dst.write_fast_token!(
value: 0x20_0221,
- length: 0xFFFF)
+ length: 0xFFFC)
string_length = 0
continue.string_loop
}
@@ -381,18 +382,92 @@
return "#bad backslash-escape"
- } else if char <= 0x10 {
- // TODO: reject invalid UTF-8.
- args.src.skip32_fast!(actual: 1, worst_case: 1)
- if string_length >= 0xFFFE {
- args.dst.write_fast_token!(
- value: 0x20_0221,
- length: 0xFFFF)
+ } else if char == 0x03 { // 2-byte UTF-8.
+ if args.src.available() < 2 {
+ if args.src.is_closed() {
+ return "#bad UTF-8"
+ }
+ yield? base."$short read"
string_length = 0
+ char = 0
continue.string_loop
}
- string_length += 1
- continue
+ multi_byte_utf8 = args.src.peek_u16le_as_u32()
+ if (multi_byte_utf8 & 0xC000) == 0x8000 {
+ multi_byte_utf8 = (0x00_07C0 & (multi_byte_utf8 ~mod<< 6)) |
+ (0x00_003F & (multi_byte_utf8 >> 8))
+ args.src.skip32_fast!(actual: 2, worst_case: 2)
+ if string_length >= 0xFFF8 {
+ args.dst.write_fast_token!(
+ value: 0x20_0221,
+ length: (string_length + 2) as base.u64)
+ string_length = 0
+ continue.string_loop
+ }
+ string_length += 2
+ continue
+ }
+
+ } else if char == 0x04 { // 3-byte UTF-8.
+ if args.src.available() < 3 {
+ if args.src.is_closed() {
+ return "#bad UTF-8"
+ }
+ yield? base."$short read"
+ string_length = 0
+ char = 0
+ continue.string_loop
+ }
+ multi_byte_utf8 = args.src.peek_u24le_as_u32()
+ if (multi_byte_utf8 & 0xC0_C000) == 0x80_8000 {
+ multi_byte_utf8 = (0x00_F000 & (multi_byte_utf8 ~mod<< 12)) |
+ (0x00_0FC0 & (multi_byte_utf8 >> 2)) |
+ (0x00_003F & (multi_byte_utf8 >> 16))
+ if (0x07FF < multi_byte_utf8) and
+ ((multi_byte_utf8 < 0xD800) or (0xDFFF < multi_byte_utf8)) {
+
+ args.src.skip32_fast!(actual: 3, worst_case: 3)
+ if string_length >= 0xFFF8 {
+ args.dst.write_fast_token!(
+ value: 0x20_0221,
+ length: (string_length + 3) as base.u64)
+ string_length = 0
+ continue.string_loop
+ }
+ string_length += 3
+ continue
+ }
+ }
+
+ } else if char == 0x05 { // 4-byte UTF-8.
+ if args.src.available() < 4 {
+ if args.src.is_closed() {
+ return "#bad UTF-8"
+ }
+ yield? base."$short read"
+ string_length = 0
+ char = 0
+ continue.string_loop
+ }
+ multi_byte_utf8 = args.src.peek_u32le()
+ if (multi_byte_utf8 & 0xC0C0_C000) == 0x8080_8000 {
+ multi_byte_utf8 = (0x1C_0000 & (multi_byte_utf8 ~mod<< 18)) |
+ (0x03_F000 & (multi_byte_utf8 ~mod<< 4)) |
+ (0x00_0FC0 & (multi_byte_utf8 >> 10)) |
+ (0x00_003F & (multi_byte_utf8 >> 24))
+ if (0xFFFF < multi_byte_utf8) and (multi_byte_utf8 <= 0x10_FFFF) {
+ args.src.skip32_fast!(actual: 4, worst_case: 4)
+ if string_length >= 0xFFF8 {
+ args.dst.write_fast_token!(
+ value: 0x20_0221,
+ length: (string_length + 4) as base.u64)
+ string_length = 0
+ continue.string_loop
+ }
+ string_length += 4
+ continue
+ }
+ }
}
if string_length > 0 {
diff --git a/test/c/std/json.c b/test/c/std/json.c
index 8e52262..d913d11 100644
--- a/test/c/std/json.c
+++ b/test/c/std/json.c
@@ -401,10 +401,15 @@
{.want_status_repr = NULL, .str = "\"\xC2\x80\""}, // U+00000080.
{.want_status_repr = NULL, .str = "\"\xDF\xBF\""}, // U+000007FF.
{.want_status_repr = NULL, .str = "\"\xE0\xA0\x80\""}, // U+00000800.
- {.want_status_repr = NULL, .str = "\"\xED\xAF\xBF\""}, // U+0000DBFF.
+ {.want_status_repr = NULL, .str = "\"\xED\x80\x80\""}, // U+0000D000.
+ {.want_status_repr = NULL, .str = "\"\xED\x9F\xBF\""}, // U+0000D7FF.
{.want_status_repr = NULL, .str = "\"\xEE\x80\x80\""}, // U+0000E000.
{.want_status_repr = NULL, .str = "\"\xEF\xBF\xBF\""}, // U+0000FFFF.
{.want_status_repr = NULL, .str = "\"\xF0\x90\x80\x80\""}, // U+00010000.
+ {.want_status_repr = NULL, .str = "\"\xF0\xB0\x80\x81\""}, // U+00030001.
+ {.want_status_repr = NULL, .str = "\"\xF1\xB0\x80\x82\""}, // U+00070002.
+ {.want_status_repr = NULL, .str = "\"\xF3\xB0\x80\x83\""}, // U+000F0003.
+ {.want_status_repr = NULL, .str = "\"\xF4\x80\x80\x84\""}, // U+00100004.
{.want_status_repr = NULL, .str = "\"\xF4\x8F\xBF\xBF\""}, // U+0010FFFF.
{.want_status_repr = NULL, .str = "\"abc\""},
{.want_status_repr = NULL, .str = "\"i\x6Ak\""},
@@ -418,6 +423,21 @@
{.want_status_repr = bad_ccc, .str = "\"\x1F\""},
{.want_status_repr = bad_ccc, .str = "\"tab+\t+tab\""},
+ {.want_status_repr = bad_utf, .str = "\"\x80\""},
+ {.want_status_repr = bad_utf, .str = "\"\xBF\""},
+ {.want_status_repr = bad_utf, .str = "\"\xC1\x80\""},
+ {.want_status_repr = bad_utf, .str = "\"\xC2\x7F\""},
+ {.want_status_repr = bad_utf, .str = "\"\xDF\xC0\""},
+ {.want_status_repr = bad_utf, .str = "\"\xDF\xFF\""},
+ {.want_status_repr = bad_utf, .str = "\"\xE0\x9F\xBF\""},
+ {.want_status_repr = bad_utf, .str = "\"\xED\xA0\xB0\""}, // U+0000D830 (surrogate).
+ {.want_status_repr = bad_utf, .str = "\"\xED\xBF\xBF\""}, // U+0000DFFF.
+ {.want_status_repr = bad_utf, .str = "\"\xF0\x80\x80\""},
+ {.want_status_repr = bad_utf, .str = "\"\xF0\x8F\xBF\xBF\""},
+ {.want_status_repr = bad_utf, .str = "\"\xF2\x7F\x80\x80\""},
+ {.want_status_repr = bad_utf, .str = "\"\xF2\x80\x7F\x80\""},
+ {.want_status_repr = bad_utf, .str = "\"\xF2\x80\x80\x7F\""},
+ {.want_status_repr = bad_utf, .str = "\"\xF4\x90\x80\x80\""},
{.want_status_repr = bad_utf, .str = "\"\xF5\""},
{.want_status_repr = bad_utf, .str = "\"\xFF\xFF\xFF\xFF\""},
};