Add std/json quirk_allow_ascii_control_codes
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index f7409ff..b5cca28 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -6048,6 +6048,8 @@
#define WUFFS_JSON__DECODER_SRC_IO_BUFFER_LENGTH_MIN_INCL 100
+#define WUFFS_JSON__QUIRK_ALLOW_ASCII_CONTROL_CODES 1225364480
+
#define WUFFS_JSON__QUIRK_ALLOW_BACKSLASH_A 1225364481
#define WUFFS_JSON__QUIRK_ALLOW_BACKSLASH_CAPITAL_U 1225364482
@@ -6151,6 +6153,7 @@
wuffs_base__vtable null_vtable;
bool f_quirk_enabled_allow_backslash_etc[8];
+ bool f_quirk_enabled_allow_ascii_control_codes;
bool f_quirk_enabled_allow_backslash_capital_u;
bool f_quirk_enabled_allow_backslash_x;
bool f_quirk_enabled_allow_comment_block;
@@ -20842,9 +20845,9 @@
static const uint8_t //
wuffs_json__lut_chars[256] //
WUFFS_BASE__POTENTIALLY_UNUSED = {
- 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
- 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
- 128, 128, 128, 128, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+ 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141,
+ 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
+ 156, 157, 158, 159, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -20855,12 +20858,12 @@
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 129, 129, 3, 3,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 32, 32, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 5, 5, 5, 5, 5, 129, 129, 129, 129, 129, 129, 129,
- 129, 129, 129, 129,
+ 4, 4, 5, 5, 5, 5, 5, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
};
static const uint8_t //
@@ -21041,7 +21044,9 @@
return wuffs_base__make_empty_struct();
}
- if (a_quirk == 1225364481) {
+ if (a_quirk == 1225364480) {
+ self->private_impl.f_quirk_enabled_allow_ascii_control_codes = a_enabled;
+ } else if (a_quirk == 1225364481) {
self->private_impl.f_quirk_enabled_allow_backslash_etc[1] = a_enabled;
} else if (a_quirk == 1225364482) {
self->private_impl.f_quirk_enabled_allow_backslash_capital_u = a_enabled;
@@ -21862,7 +21867,17 @@
goto label__string_loop_outer__continue;
}
}
- if (v_char == 128) {
+ if ((v_char & 128) != 0) {
+ if (self->private_impl
+ .f_quirk_enabled_allow_ascii_control_codes) {
+ *iop_a_dst++ = wuffs_base__make_token(
+ (((uint64_t)((6291456 | ((uint32_t)((v_char & 127))))))
+ << WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT) |
+ (((uint64_t)(3)) << WUFFS_BASE__TOKEN__LINK__SHIFT) |
+ (((uint64_t)(1)) << WUFFS_BASE__TOKEN__LENGTH__SHIFT));
+ (iop_a_src += 1, wuffs_base__make_empty_struct());
+ goto label__string_loop_outer__continue;
+ }
status = wuffs_base__make_status(
wuffs_json__error__bad_c0_control_code);
goto exit;
diff --git a/std/json/common_consts.wuffs b/std/json/common_consts.wuffs
index 1beeae9..18a3139 100644
--- a/std/json/common_consts.wuffs
+++ b/std/json/common_consts.wuffs
@@ -134,8 +134,8 @@
// - 0x04 is the start of 3-byte UTF-8.
// - 0x05 is the start of 4-byte UTF-8.
// - 0x10 is a UTF-8 tail byte.
-// - 0x80 is invalid JSON (C0 control codes).
-// - 0x81 is invalid UTF-8.
+// - 0x20 is invalid UTF-8.
+// - 0x80 | c is invalid JSON: the C0 control code c (0x00 ..= 0x1F).
//
// RFC 3629 (UTF-8) gives this grammar for valid UTF-8:
// UTF8-1 = %x00-7F
@@ -148,10 +148,10 @@
pri const lut_chars array[256] base.u8 = [
// 0 1 2 3 4 5 6 7
// 8 9 A B C D E F
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, // 0x00 ..= 0x07. C0 control codes.
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, // 0x08 ..= 0x0F. C0 control codes.
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, // 0x10 ..= 0x17. C0 control codes.
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, // 0x18 ..= 0x1F. C0 control codes.
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, // 0x00 ..= 0x07. C0 control codes.
+ 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F, // 0x08 ..= 0x0F. C0 control codes.
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, // 0x10 ..= 0x17. C0 control codes.
+ 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F, // 0x18 ..= 0x1F. C0 control codes.
0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x20 ..= 0x27. UTF-8-1; '"'.
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x28 ..= 0x2F. UTF-8-1.
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x30 ..= 0x37. UTF-8-1.
@@ -175,14 +175,14 @@
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, // 0xB0 ..= 0xB7. UTF-8 tail.
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, // 0xB8 ..= 0xBF. UTF-8 tail.
- 0x81, 0x81, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, // 0xC0 ..= 0xC7. Invalid UTF-8; UTF-8-2.
+ 0x20, 0x20, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, // 0xC0 ..= 0xC7. Invalid UTF-8; UTF-8-2.
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, // 0xC8 ..= 0xCF. UTF-8-2.
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, // 0xD0 ..= 0xD7. UTF-8-2.
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, // 0xD8 ..= 0xDF. UTF-8-2.
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, // 0xE0 ..= 0xE7. UTF-8-3.
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, // 0xE8 ..= 0xEF. UTF-8-3.
- 0x05, 0x05, 0x05, 0x05, 0x05, 0x81, 0x81, 0x81, // 0xF0 ..= 0xF7. UTF-8-4; Invalid UTF-8.
- 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, // 0xF8 ..= 0xFF. Invalid UTF-8.
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x20, 0x20, 0x20, // 0xF0 ..= 0xF7. UTF-8-4; Invalid UTF-8.
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // 0xF8 ..= 0xFF. Invalid UTF-8.
// 0 1 2 3 4 5 6 7
// 8 9 A B C D E F
]
diff --git a/std/json/decode_json.wuffs b/std/json/decode_json.wuffs
index eda8234..c5169d3 100644
--- a/std/json/decode_json.wuffs
+++ b/std/json/decode_json.wuffs
@@ -17,6 +17,7 @@
// same enum as lut_quirky_backslashes.
quirk_enabled_allow_backslash_etc : array[8] base.bool,
+ quirk_enabled_allow_ascii_control_codes : base.bool,
quirk_enabled_allow_backslash_capital_u : base.bool,
quirk_enabled_allow_backslash_x : base.bool,
quirk_enabled_allow_comment_block : base.bool,
@@ -67,7 +68,9 @@
)
pub func decoder.set_quirk_enabled!(quirk: base.u32, enabled: base.bool) {
- if args.quirk == quirk_allow_backslash_a {
+ if args.quirk == quirk_allow_ascii_control_codes {
+ this.quirk_enabled_allow_ascii_control_codes = args.enabled
+ } else if args.quirk == quirk_allow_backslash_a {
this.quirk_enabled_allow_backslash_etc[1] = args.enabled
} else if args.quirk == quirk_allow_backslash_capital_u {
this.quirk_enabled_allow_backslash_capital_u = args.enabled
@@ -788,7 +791,16 @@
continue.string_loop_outer
}
}
- if char == 0x80 {
+ if (char & 0x80) <> 0 {
+ if this.quirk_enabled_allow_ascii_control_codes {
+ args.dst.write_simple_token_fast!(
+ value_major: 0,
+ value_minor: 0x60_0000 | ((char & 0x7F) as base.u32),
+ link: 0x3,
+ length: 1)
+ args.src.skip32_fast!(actual: 1, worst_case: 1)
+ continue.string_loop_outer
+ }
return "#bad C0 control code"
}
if this.quirk_enabled_replace_invalid_unicode {
diff --git a/std/json/decode_quirks.wuffs b/std/json/decode_quirks.wuffs
index fdb14b2..7feb5af 100644
--- a/std/json/decode_quirks.wuffs
+++ b/std/json/decode_quirks.wuffs
@@ -21,6 +21,20 @@
// --------
+// When this quirk is enabled, literal (not escaped) ASCII control codes, such
+// as backspaces, new lines and NUL bytes, are accepted in JSON strings.
+//
+// Regardless of whether this quirk is enabled, escaped ASCII control codes,
+// such as "\b" or "\u0008", are always accepted.
+//
+// This allows for multi-line strings, where a literal new line in the encoding
+// becomes a literal new line in the decoding. A backslash before a new line is
+// still a syntax error, unless combined with quirk_allow_backslash_new_line.
+//
+// Any indentation following a new line is not stripped, but remains part of
+// the decoded string.
+pub const quirk_allow_ascii_control_codes base.u32 = 0x4909_9400 | 0x00
+
// When this quirk is enabled, e.g. "abc\az" is accepted as a JSON string,
// equivalent to "abc\u0007z", containing an ASCII Bell control character.
pub const quirk_allow_backslash_a base.u32 = 0x4909_9400 | 0x01
diff --git a/test/c/std/json.c b/test/c/std/json.c
index 8ea6bc5..42f259e 100644
--- a/test/c/std/json.c
+++ b/test/c/std/json.c
@@ -1414,6 +1414,11 @@
uint32_t quirk;
} test_cases[] = {
{
+ .want = 0x09,
+ .str = "\"\t\"",
+ .quirk = WUFFS_JSON__QUIRK_ALLOW_ASCII_CONTROL_CODES,
+ },
+ {
.want = 0x07,
.str = "\"\\a\"",
.quirk = WUFFS_JSON__QUIRK_ALLOW_BACKSLASH_A,
@@ -1475,6 +1480,11 @@
.repr;
const char* want_status_repr =
q ? NULL : wuffs_json__error__bad_backslash_escape;
+ if ((test_cases[tc].quirk ==
+ WUFFS_JSON__QUIRK_ALLOW_ASCII_CONTROL_CODES) &&
+ want_status_repr) {
+ want_status_repr = wuffs_json__error__bad_c0_control_code;
+ }
if (have_status_repr != want_status_repr) {
RETURN_FAIL("tc=%d, q=%d: decode_tokens: have \"%s\", want \"%s\"", tc,
q, have_status_repr, want_status_repr);