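Optimize std/json string decoding

Inside a string, consume runs of non-special ASCII bytes 4 at a time (one
u32le peek plus four lut_chars lookups) before falling back to the existing
byte-at-a-time path. Most benchmarks improve; the gcc9 21k_formatted and
217k_stringy cases regress slightly (about 2%).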

name                                    old speed     new speed      delta

wuffs_json_decode_1k/clang5             729MB/s ± 1%  1130MB/s ± 0%  +54.99%  (p=0.000 n=10+10)
wuffs_json_decode_21k_formatted/clang5  541MB/s ± 1%   576MB/s ± 1%   +6.44%  (p=0.000 n=10+10)
wuffs_json_decode_26k_compact/clang5    603MB/s ± 0%   873MB/s ± 1%  +44.72%  (p=0.000 n=9+10)
wuffs_json_decode_217k_stringy/clang5   607MB/s ± 0%   727MB/s ± 0%  +19.78%  (p=0.000 n=10+10)

wuffs_json_decode_1k/gcc9               871MB/s ± 0%  1104MB/s ± 1%  +26.74%  (p=0.000 n=10+10)
wuffs_json_decode_21k_formatted/gcc9    641MB/s ± 1%   628MB/s ± 0%   -2.03%  (p=0.000 n=10+10)
wuffs_json_decode_26k_compact/gcc9      746MB/s ± 1%   892MB/s ± 0%  +19.54%  (p=0.000 n=10+10)
wuffs_json_decode_217k_stringy/gcc9     736MB/s ± 1%   724MB/s ± 1%   -1.68%  (p=0.000 n=9+10)
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 149522b..cb70b3b 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -19349,6 +19349,7 @@
   uint32_t v_stack_byte = 0;
   uint32_t v_stack_bit = 0;
   uint32_t v_match = 0;
+  uint32_t v_c_by_4 = 0;
   uint8_t v_c = 0;
   uint8_t v_backslash = 0;
   uint8_t v_char = 0;
@@ -19495,6 +19496,28 @@
                 v_string_length = 0;
                 goto label__string_loop_outer__continue;
               }
+              while (((uint64_t)(io2_a_src - iop_a_src)) > 4) {
+                v_c_by_4 = wuffs_base__load_u32le__no_bounds_check(iop_a_src);
+                if (0 != (wuffs_json__lut_chars[(255 & (v_c_by_4 >> 0))] |
+                          wuffs_json__lut_chars[(255 & (v_c_by_4 >> 8))] |
+                          wuffs_json__lut_chars[(255 & (v_c_by_4 >> 16))] |
+                          wuffs_json__lut_chars[(255 & (v_c_by_4 >> 24))])) {
+                  goto label__0__break;
+                }
+                (iop_a_src += 4, wuffs_base__make_empty_struct());
+                if (v_string_length > 65527) {
+                  *iop_a_dst++ = wuffs_base__make_token(
+                      (((uint64_t)(4194337))
+                       << WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT) |
+                      (((uint64_t)(3)) << WUFFS_BASE__TOKEN__LINK__SHIFT) |
+                      (((uint64_t)((v_string_length + 4)))
+                       << WUFFS_BASE__TOKEN__LENGTH__SHIFT));
+                  v_string_length = 0;
+                  goto label__string_loop_outer__continue;
+                }
+                v_string_length += 4;
+              }
+            label__0__break:;
               v_c = wuffs_base__load_u8be__no_bounds_check(iop_a_src);
               v_char = wuffs_json__lut_chars[v_c];
               if (v_char == 0) {
@@ -19839,7 +19862,7 @@
             }
           }
         label__string_loop_outer__break:;
-        label__0__continue:;
+        label__1__continue:;
           while (true) {
             if (((uint64_t)(io2_a_src - iop_a_src)) <= 0) {
               if (a_src && a_src->meta.closed) {
@@ -19849,13 +19872,13 @@
               status =
                   wuffs_base__make_status(wuffs_base__suspension__short_read);
               WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(11);
-              goto label__0__continue;
+              goto label__1__continue;
             }
             if (((uint64_t)(io2_a_dst - iop_a_dst)) <= 0) {
               status =
                   wuffs_base__make_status(wuffs_base__suspension__short_write);
               WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(12);
-              goto label__0__continue;
+              goto label__1__continue;
             }
             (iop_a_src += 1, wuffs_base__make_empty_struct());
             *iop_a_dst++ = wuffs_base__make_token(
@@ -19863,9 +19886,9 @@
                  << WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT) |
                 (((uint64_t)(2)) << WUFFS_BASE__TOKEN__LINK__SHIFT) |
                 (((uint64_t)(1)) << WUFFS_BASE__TOKEN__LENGTH__SHIFT));
-            goto label__0__break;
+            goto label__1__break;
           }
-        label__0__break:;
+        label__1__break:;
           if (0 == (v_expect & 16)) {
             v_expect = 8;
             goto label__outer__continue;
@@ -19910,7 +19933,7 @@
                    << WUFFS_BASE__TOKEN__VALUE_MINOR__SHIFT) |
                   (((uint64_t)(v_number_length))
                    << WUFFS_BASE__TOKEN__LENGTH__SHIFT));
-              goto label__1__break;
+              goto label__2__break;
             }
             while (v_number_length > 0) {
               v_number_length -= 1;
@@ -19940,7 +19963,7 @@
               }
             }
           }
-        label__1__break:;
+        label__2__break:;
           goto label__goto_parsed_a_leaf_value__break;
         } else if (v_class == 5) {
           v_vminor = 2113553;
diff --git a/std/json/decode_json.wuffs b/std/json/decode_json.wuffs
index c2f1efb..7f3f483 100644
--- a/std/json/decode_json.wuffs
+++ b/std/json/decode_json.wuffs
@@ -55,6 +55,7 @@
 	var stack_byte        : base.u32[..= (1024 / 32) - 1]
 	var stack_bit         : base.u32[..= 31]
 	var match             : base.u32[..= 2]
+	var c_by_4            : base.u32
 	var c                 : base.u8
 	var backslash         : base.u8
 	var char              : base.u8
@@ -190,6 +191,32 @@
 							string_length = 0
 							continue.string_loop_outer
 						}
+
+						// As an optimization, consume non-special ASCII 4 bytes at a time.
+						while args.src.available() > 4,
+							inv args.dst.available() > 0,
+							inv args.src.available() > 0,
+						{
+							c_by_4 = args.src.peek_u32le()
+							if 0x00 <> (lut_chars[0xFF & (c_by_4 >> 0)] |
+								lut_chars[0xFF & (c_by_4 >> 8)] |
+								lut_chars[0xFF & (c_by_4 >> 16)] |
+								lut_chars[0xFF & (c_by_4 >> 24)]) {
+								break
+							}
+							args.src.skip32_fast!(actual: 4, worst_case: 4)
+							if string_length > (0xFFFB - 4) {
+								args.dst.write_fast_token!(
+									value_major: 0,
+									value_minor: 0x40_0021,
+									link: 0x3,
+									length: string_length + 4)
+								string_length = 0
+								continue.string_loop_outer
+							}
+							string_length += 4
+						}
+
 						c = args.src.peek_u8()
 						char = lut_chars[c]