Add json QUIRK_ALLOW_TRAILING_COMMENT

commit: fa50f4de67da25a320584164ed12869603898a16 [log] [tgz]
author: Nigel Tao <nigeltao@golang.org> Mon Sep 21 11:07:36 2020 +1000
committer: Nigel Tao <nigeltao@golang.org> Mon Sep 21 11:24:22 2020 +1000
tree: 10f71f0ccdc9bc1c0f2fabc5b3f48fff90aad29e
parent: 2912ea8e286a694a980c4d2701a533608e65fb2c [diff]
diff --git a/example/jsonptr/jsonptr.cc b/example/jsonptr/jsonptr.cc
index 84d4c44..0ee259b 100644
--- a/example/jsonptr/jsonptr.cc
+++ b/example/jsonptr/jsonptr.cc

@@ -934,6 +934,7 @@
   if (g_flags.input_allow_comments) {
     g_dec.set_quirk_enabled(WUFFS_JSON__QUIRK_ALLOW_COMMENT_BLOCK, true);
     g_dec.set_quirk_enabled(WUFFS_JSON__QUIRK_ALLOW_COMMENT_LINE, true);
+    g_dec.set_quirk_enabled(WUFFS_JSON__QUIRK_ALLOW_TRAILING_COMMENT, true);
   }
   if (g_flags.input_allow_extra_comma) {
     g_dec.set_quirk_enabled(WUFFS_JSON__QUIRK_ALLOW_EXTRA_COMMA, true);

diff --git a/fuzz/c/std/json_fuzzer.c b/fuzz/c/std/json_fuzzer.c
index c82946b..0a83aa5 100644
--- a/fuzz/c/std/json_fuzzer.c
+++ b/fuzz/c/std/json_fuzzer.c

@@ -240,6 +240,7 @@
       WUFFS_JSON__QUIRK_ALLOW_INF_NAN_NUMBERS,
       WUFFS_JSON__QUIRK_ALLOW_LEADING_ASCII_RECORD_SEPARATOR,
       WUFFS_JSON__QUIRK_ALLOW_LEADING_UNICODE_BYTE_ORDER_MARK,
+      WUFFS_JSON__QUIRK_ALLOW_TRAILING_COMMENT,
       WUFFS_JSON__QUIRK_ALLOW_TRAILING_NEW_LINE,
       WUFFS_JSON__QUIRK_REPLACE_INVALID_UNICODE,
       0,

diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 7ed4f1f..3f4ad79 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c

@@ -7661,6 +7661,8 @@
 
 #define WUFFS_JSON__QUIRK_ALLOW_LEADING_UNICODE_BYTE_ORDER_MARK 1225364496
 
+#define WUFFS_JSON__QUIRK_ALLOW_TRAILING_COMMENT 1225364497
+
 #define WUFFS_JSON__QUIRK_ALLOW_TRAILING_NEW_LINE 1225364498
 
 #define WUFFS_JSON__QUIRK_JSON_POINTER_ALLOW_TILDE_R_TILDE_N 1225364499
@@ -7758,7 +7760,9 @@
     bool f_quirks[21];
     bool f_allow_leading_ars;
     bool f_allow_leading_ubom;
+    bool f_allow_trailing_comment;
     bool f_end_of_data;
+    uint8_t f_comment_type;
 
     uint32_t p_decode_tokens[1];
     uint32_t p_decode_leading[1];
@@ -27227,6 +27231,7 @@
   switch (coro_susp_point) {
     WUFFS_BASE__COROUTINE_SUSPENSION_POINT_0;
 
+    self->private_impl.f_comment_type = 0;
     label__0__continue:;
     while ((((uint64_t)(io2_a_dst - iop_a_dst)) <= 0) || (((uint64_t)(io2_a_src - iop_a_src)) <= 1)) {
       if (((uint64_t)(io2_a_dst - iop_a_dst)) <= 0) {
@@ -27276,6 +27281,7 @@
             *iop_a_dst++ = wuffs_base__make_token(
                 (((uint64_t)(2)) << WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT) |
                 (((uint64_t)((v_length + 2))) << WUFFS_BASE__TOKEN__LENGTH__SHIFT));
+            self->private_impl.f_comment_type = 1;
             status = wuffs_base__make_status(NULL);
             goto ok;
           }
@@ -27325,6 +27331,7 @@
             *iop_a_dst++ = wuffs_base__make_token(
                 (((uint64_t)(4)) << WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT) |
                 (((uint64_t)((v_length + 1))) << WUFFS_BASE__TOKEN__LENGTH__SHIFT));
+            self->private_impl.f_comment_type = 2;
             status = wuffs_base__make_status(NULL);
             goto ok;
           }
@@ -27568,6 +27575,7 @@
   switch (coro_susp_point) {
     WUFFS_BASE__COROUTINE_SUSPENSION_POINT_0;
 
+    self->private_impl.f_allow_trailing_comment = self->private_impl.f_quirks[17];
     label__outer__continue:;
     while (true) {
       if (((uint64_t)(io2_a_dst - iop_a_dst)) <= 0) {
@@ -27600,6 +27608,33 @@
                 (((uint64_t)(v_whitespace_length)) << WUFFS_BASE__TOKEN__LENGTH__SHIFT));
             v_whitespace_length = 0;
           }
+          if (self->private_impl.f_allow_trailing_comment) {
+            if (a_dst) {
+              a_dst->meta.wi = ((size_t)(iop_a_dst - a_dst->data.ptr));
+            }
+            if (a_src) {
+              a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr));
+            }
+            WUFFS_BASE__COROUTINE_SUSPENSION_POINT(3);
+            status = wuffs_json__decoder__decode_comment(self, a_dst, a_src);
+            if (a_dst) {
+              iop_a_dst = a_dst->data.ptr + a_dst->meta.wi;
+            }
+            if (a_src) {
+              iop_a_src = a_src->data.ptr + a_src->meta.ri;
+            }
+            if (status.repr) {
+              goto suspend;
+            }
+            v_c = 0;
+            v_whitespace_length = 0;
+            if (self->private_impl.f_comment_type == 1) {
+              self->private_impl.f_allow_trailing_comment = false;
+              goto label__outer__continue;
+            } else if (self->private_impl.f_comment_type == 2) {
+              goto label__outer__break;
+            }
+          }
           status = wuffs_base__make_status(wuffs_json__error__bad_input);
           goto exit;
         }

diff --git a/std/json/decode_json.wuffs b/std/json/decode_json.wuffs
index 761ea00..df48da1 100644
--- a/std/json/decode_json.wuffs
+++ b/std/json/decode_json.wuffs

@@ -15,11 +15,18 @@
 pub struct decoder? implements base.token_decoder(
 	quirks : array[QUIRKS_COUNT] base.bool,
 
-	allow_leading_ars  : base.bool,
-	allow_leading_ubom : base.bool,
+	allow_leading_ars      : base.bool,
+	allow_leading_ubom     : base.bool,
+	allow_trailing_comment : base.bool,
 
 	end_of_data : base.bool,
 
+	// comment_type is set as a side-effect of decode_comment?.
+	//  - 0 means failure.
+	//  - 1 means a block comment.
+	//  - 2 means a line  comment.
+	comment_type : base.u8,
+
 	util : base.utility,
 )(
 	// stack is conceptually an array of bits, implemented as an array of u32.
@@ -1420,6 +1427,8 @@
 	var c2     : base.u16
 	var length : base.u32[..= 0xFFFD]
 
+	this.comment_type = 0
+
 	while (args.dst.length() <= 0) or (args.src.length() <= 1),
 		post args.dst.length() > 0,
 		post args.src.length() > 1,
@@ -1475,6 +1484,7 @@
 						base.TOKEN__VBD__FILLER__COMMENT_BLOCK,
 						continued: 0,
 						length: length + 2)
+					this.comment_type = 1
 					return ok
 				}
 
@@ -1533,6 +1543,7 @@
 						base.TOKEN__VBD__FILLER__COMMENT_LINE,
 						continued: 0,
 						length: length + 1)
+					this.comment_type = 2
 					return ok
 				}
 
@@ -1670,6 +1681,8 @@
 	var c                 : base.u8
 	var whitespace_length : base.u32[..= 0xFFFE]
 
+	this.allow_trailing_comment = this.quirks[QUIRK_ALLOW_TRAILING_COMMENT - QUIRKS_BASE]
+
 	while.outer true {
 		if args.dst.length() <= 0 {
 			yield? base."$short write"
@@ -1701,6 +1714,17 @@
 						value_major: 0, value_minor: 0, continued: 0, length: whitespace_length)
 					whitespace_length = 0
 				}
+				if this.allow_trailing_comment {
+					this.decode_comment?(dst: args.dst, src: args.src)
+					c = 0
+					whitespace_length = 0
+					if this.comment_type == 1 {  // Block comment.
+						this.allow_trailing_comment = false
+						continue.outer
+					} else if this.comment_type == 2 {  // Line comment.
+						break.outer
+					}
+				}
 				return "#bad input"
 			}
 

diff --git a/std/json/decode_quirks.wuffs b/std/json/decode_quirks.wuffs
index 81eb35a..5f31b5b 100644
--- a/std/json/decode_quirks.wuffs
+++ b/std/json/decode_quirks.wuffs

@@ -97,7 +97,7 @@
 pub const QUIRK_ALLOW_BACKSLASH_ZERO : base.u32 = 0x4909_9400 | 0x0A
 
 // When this quirk is enabled, "/* C/C++ style block comments */" are accepted
-// anywhere whitespace would be, although see the QUIRK_ALLOW_TRAILING_NEW_LINE
+// anywhere whitespace would be, although see the QUIRK_ALLOW_TRAILING_COMMENT
 // comment for additional interaction when combining multiple quirks.
 //
 // They produce WUFFS_BASE__TOKEN__VBD__FILLER__COMMENT_BLOCK tokens. The token
@@ -105,7 +105,7 @@
 pub const QUIRK_ALLOW_COMMENT_BLOCK : base.u32 = 0x4909_9400 | 0x0B
 
 // When this quirk is enabled, "// C/C++ style line comments\n" are accepted
-// anywhere whitespace would be, although see the QUIRK_ALLOW_TRAILING_NEW_LINE
+// anywhere whitespace would be, although see the QUIRK_ALLOW_TRAILING_COMMENT
 // comment for additional interaction when combining multiple quirks.
 //
 // A line comment may not omit the ending "\n", even if there is no input
@@ -152,6 +152,23 @@
 // may come first in the byte stream.
 pub const QUIRK_ALLOW_LEADING_UNICODE_BYTE_ORDER_MARK : base.u32 = 0x4909_9400 | 0x10
 
+// When this quirk is enabled and both:
+//  - QUIRK_ALLOW_TRAILING_NEW_LINE is enabled,
+//  - at least one of QUIRK_ALLOW_COMMENT_ETC is enabled,
+// the trailing whitespace may optionally contain a single comment. As per
+// QUIRK_ALLOW_TRAILING_NEW_LINE, processing will still stop at the first
+// trailing '\n' (outside of a block comment), even if more comments follow.
+//
+// For a trailing block comment, new lines within the comment are not counted
+// and after the comment concludes, the decoder will continue consuming
+// whitespace up to and including the next '\n' (or end-of-file).
+//
+// For a trailing line comment, the decoder stops immediately after the
+// comment. If not stopped by end-of-file, this stops after the '\n' that
+// concludes (and is part of) the comment. One implication is that if multiple
+// line comments trail a JSON value, only the first one will be processed.
+pub const QUIRK_ALLOW_TRAILING_COMMENT : base.u32 = 0x4909_9400 | 0x11
+
 // When this quirk is enabled, following a successful decoding of a top-level
 // JSON value, any trailing whitespace (ASCII characters 0x09, 0x0A, 0x0D or
 // 0x20) is also consumed (and WUFFS_BASE__TOKEN__VBC__FILLER tokens emitted)
@@ -159,14 +176,16 @@
 // also known as '\n'), whichever comes first. This trailing whitespace is not
 // mandatory, but it is consumed if present.
 //
-// When enabled, trailing non-whitespace (before a '\n') is an error. For
+// When enabled, trailing non-whitespace (before a '\n') is an error (unless it
+// is a comment and QUIRK_ALLOW_TRAILING_COMMENT is enabled; see above). For
 // example, with "007" input, decoding with this quirk disabled (the default
 // case) will consume just 1 byte ("0") and leave the rest ("07") unread
 // without error, as "0" is a perfectly valid JSON value (but "00" is not).
 // Decoding "007" (or "007\n") with this quirk enabled will return an error.
 //
-// When enabled, the decoder will not consume more than one trailing '\n', nor
-// will it consume any other whitespace immediately after a trailing '\n'.
+// When enabled, the decoder will not consume more than one trailing '\n'
+// (outside of a block comment), nor will it consume any other whitespace
+// immediately after a trailing '\n'.
 //
 // If a JSON encoder avoids emitting (optional) '\n' bytes, other than a single
 // '\n' after each top-level value in a multi-JSON-value stream, this format is
@@ -182,10 +201,11 @@
 // "application/json-seq".
 //
 // When combined with QUIRK_ALLOW_COMMENT_BLOCK or QUIRK_ALLOW_COMMENT_LINE, it
-// is an error for a comment to occur in this trailing whitespace, before an
-// end-of-file or '\n' is encountered. Treating this as an error avoids any
-// ambiguity in accounting for new lines within a block comment or ending a
-// line comment.
+// is an error for a comment to occur in this trailing whitespace, unless
+// QUIRK_ALLOW_TRAILING_COMMENT is also enabled. Be aware that block comments
+// can contain multiple new lines, so combining such quirks can break the
+// "exactly one JSON value per line" assumption for newline-delimited but
+// otherwise compact (minified) JSON.
 pub const QUIRK_ALLOW_TRAILING_NEW_LINE : base.u32 = 0x4909_9400 | 0x12
 
 // When this quirk is enabled, JSON Pointer strings containing "~r" or "~n",

diff --git a/test/c/std/json.c b/test/c/std/json.c
index c1ca60c..413e586 100644
--- a/test/c/std/json.c
+++ b/test/c/std/json.c

@@ -3386,7 +3386,72 @@
 }
 
 const char*  //
-test_wuffs_json_decode_quirk_allow_trailing_etc() {
+test_wuffs_json_decode_quirk_allow_trailing_comment() {
+  CHECK_FOCUS(__func__);
+
+  // These test cases all end with two '\n' bytes. If the first byte is '8'
+  // then decoding should succeed, and stop between those two '\n' bytes.
+  // Otherwise, decoding should fail.
+  const char* test_cases[] = {
+      "80\n\n",                //
+      "81 \n\n",               //
+      "82 /*foo*/ \n\n",       //
+      "83/*bar\nbaz*/\n\n",    //
+      "84 // qux\n\n",         //
+      "95 /*c0*/ /*c1*/\n\n",  //
+      "96 /*c0*/ // c2 \n\n",  //
+  };
+
+  int tc;
+  for (tc = 0; tc < WUFFS_TESTLIB_ARRAY_SIZE(test_cases); tc++) {
+    wuffs_json__decoder dec;
+    CHECK_STATUS("initialize", wuffs_json__decoder__initialize(
+                                   &dec, sizeof dec, WUFFS_VERSION,
+                                   WUFFS_INITIALIZE__DEFAULT_OPTIONS));
+    wuffs_json__decoder__set_quirk_enabled(
+        &dec, WUFFS_JSON__QUIRK_ALLOW_COMMENT_BLOCK, true);
+    wuffs_json__decoder__set_quirk_enabled(
+        &dec, WUFFS_JSON__QUIRK_ALLOW_COMMENT_LINE, true);
+    wuffs_json__decoder__set_quirk_enabled(
+        &dec, WUFFS_JSON__QUIRK_ALLOW_TRAILING_COMMENT, true);
+    wuffs_json__decoder__set_quirk_enabled(
+        &dec, WUFFS_JSON__QUIRK_ALLOW_TRAILING_NEW_LINE, true);
+
+    void* tc_ptr = (void*)(test_cases[tc]);
+    size_t tc_len = strlen(test_cases[tc]);
+    wuffs_base__token_buffer tok =
+        wuffs_base__slice_token__writer(g_have_slice_token);
+    wuffs_base__io_buffer src =
+        wuffs_base__ptr_u8__reader(tc_ptr, tc_len, true);
+    const char* have =
+        wuffs_json__decoder__decode_tokens(&dec, &tok, &src, g_work_slice_u8)
+            .repr;
+    const char* want =
+        (test_cases[tc][0] == '8') ? NULL : wuffs_json__error__bad_input;
+    if (have != want) {
+      RETURN_FAIL("tc=%d: decode_tokens: have \"%s\", want \"%s\"", tc, have,
+                  want);
+    } else if (have != NULL) {
+      continue;
+    }
+
+    size_t total_length = 0;
+    while (tok.meta.ri < tok.meta.wi) {
+      total_length += wuffs_base__token__length(&tok.data.ptr[tok.meta.ri++]);
+    }
+    if (total_length != src.meta.ri) {
+      RETURN_FAIL("tc=%d: total_length: have %zu, want %zu", tc, total_length,
+                  src.meta.ri);
+    } else if ((total_length + 1) != tc_len) {
+      RETURN_FAIL("tc=%d: total_length+1: have %zu, want %zu", tc,
+                  total_length + 1, tc_len);
+    }
+  }
+  return NULL;
+}
+
+const char*  //
+test_wuffs_json_decode_quirk_allow_trailing_new_line() {
   CHECK_FOCUS(__func__);
 
   struct {
@@ -4069,7 +4134,8 @@
     test_wuffs_json_decode_quirk_allow_extra_comma,
     test_wuffs_json_decode_quirk_allow_inf_nan_numbers,
     test_wuffs_json_decode_quirk_allow_leading_etc,
-    test_wuffs_json_decode_quirk_allow_trailing_etc,
+    test_wuffs_json_decode_quirk_allow_trailing_comment,
+    test_wuffs_json_decode_quirk_allow_trailing_new_line,
     test_wuffs_json_decode_quirk_replace_invalid_unicode,
     test_wuffs_json_decode_src_io_buffer_length,
     test_wuffs_json_decode_string,
commit	fa50f4de67da25a320584164ed12869603898a16	[log] [tgz]
author	Nigel Tao <nigeltao@golang.org>	Mon Sep 21 11:07:36 2020 +1000
committer	Nigel Tao <nigeltao@golang.org>	Mon Sep 21 11:24:22 2020 +1000
tree	10f71f0ccdc9bc1c0f2fabc5b3f48fff90aad29e
parent	2912ea8e286a694a980c4d2701a533608e65fb2c [diff]