Re-organize internal/cgen/base/io-private.h
diff --git a/internal/cgen/base/io-private.h b/internal/cgen/base/io-private.h
index e4fd9ce..23114d0 100644
--- a/internal/cgen/base/io-private.h
+++ b/internal/cgen/base/io-private.h
@@ -32,6 +32,104 @@
   return wuffs_base__make_slice_u8(NULL, 0);
 }
 
+// --------
+
+// wuffs_base__io_reader__match7 returns whether the io_reader's upcoming bytes
+// start with the given prefix (up to 7 bytes long). It is peek-like, not
+// read-like, in that there are no side-effects.
+//
+// The low 3 bits of a hold the prefix length, n.
+//
+// The high 56 bits of a hold the prefix itself, in little-endian order. The
+// first prefix byte is in bits 8..=15, the second prefix byte is in bits
+// 16..=23, etc. The high (8 * (7 - n)) bits are ignored.
+//
+// There are three possible return values:
+//  - 0 means success.
+//  - 1 means inconclusive, equivalent to "$short read".
+//  - 2 means failure.
+static inline uint32_t  //
+wuffs_base__io_reader__match7(const uint8_t* iop_r,
+                              const uint8_t* io2_r,
+                              wuffs_base__io_buffer* r,
+                              uint64_t a) {
+  uint32_t n = a & 7;
+  a >>= 8;
+  if ((io2_r - iop_r) >= 8) {
+    uint64_t x = wuffs_base__load_u64le__no_bounds_check(iop_r);
+    uint32_t shift = 8 * (8 - n);
+    return ((a << shift) == (x << shift)) ? 0 : 2;
+  }
+  for (; n > 0; n--) {
+    if (iop_r >= io2_r) {
+      return (r && r->meta.closed) ? 2 : 1;
+    } else if (*iop_r != ((uint8_t)(a))) {
+      return 2;
+    }
+    iop_r++;
+    a >>= 8;
+  }
+  return 0;
+}
+
+static inline wuffs_base__io_buffer*  //
+wuffs_base__io_reader__set(wuffs_base__io_buffer* b,
+                           const uint8_t** ptr_iop_r,
+                           const uint8_t** ptr_io0_r,
+                           const uint8_t** ptr_io1_r,
+                           const uint8_t** ptr_io2_r,
+                           wuffs_base__slice_u8 data) {
+  b->data = data;
+  b->meta.wi = data.len;
+  b->meta.ri = 0;
+  b->meta.pos = 0;
+  b->meta.closed = false;
+
+  *ptr_iop_r = data.ptr;
+  *ptr_io0_r = data.ptr;
+  *ptr_io1_r = data.ptr;
+  *ptr_io2_r = data.ptr + data.len;
+
+  return b;
+}
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-qual"
+// TODO: can we avoid the const_cast (by deleting this function)? This might
+// involve converting the call sites to take an io_reader instead of a slice u8
+// (the result of io_reader.take).
+static inline wuffs_base__slice_u8  //
+wuffs_base__io_reader__take(const uint8_t** ptr_iop_r,
+                            const uint8_t* io2_r,
+                            uint64_t n) {
+  if (n <= ((size_t)(io2_r - *ptr_iop_r))) {
+    const uint8_t* p = *ptr_iop_r;
+    *ptr_iop_r += n;
+    // The arg is what C calls C++'s "const_cast<uint8_t*>(p)".
+    return wuffs_base__make_slice_u8((uint8_t*)(p), n);
+  }
+  return wuffs_base__make_slice_u8(NULL, 0);
+}
+#pragma GCC diagnostic pop
+
+// --------
+
+static inline uint64_t  //
+wuffs_base__io_writer__copy_from_slice(uint8_t** ptr_iop_w,
+                                       uint8_t* io2_w,
+                                       wuffs_base__slice_u8 src) {
+  uint8_t* iop_w = *ptr_iop_w;
+  size_t n = src.len;
+  if (n > ((size_t)(io2_w - iop_w))) {
+    n = (size_t)(io2_w - iop_w);
+  }
+  if (n > 0) {
+    memmove(iop_w, src.ptr, n);
+    *ptr_iop_w += n;
+  }
+  return (uint64_t)(n);
+}
+
 static inline uint32_t  //
 wuffs_base__io_writer__copy_n32_from_history(uint8_t** ptr_iop_w,
                                              uint8_t* io1_w,
@@ -125,22 +223,6 @@
   return (uint32_t)(n);
 }
 
-static inline uint64_t  //
-wuffs_base__io_writer__copy_from_slice(uint8_t** ptr_iop_w,
-                                       uint8_t* io2_w,
-                                       wuffs_base__slice_u8 src) {
-  uint8_t* iop_w = *ptr_iop_w;
-  size_t n = src.len;
-  if (n > ((size_t)(io2_w - iop_w))) {
-    n = (size_t)(io2_w - iop_w);
-  }
-  if (n > 0) {
-    memmove(iop_w, src.ptr, n);
-    *ptr_iop_w += n;
-  }
-  return (uint64_t)(n);
-}
-
 static inline uint32_t  //
 wuffs_base__io_writer__copy_n32_from_slice(uint8_t** ptr_iop_w,
                                            uint8_t* io2_w,
@@ -161,84 +243,6 @@
   return (uint32_t)(n);
 }
 
-// wuffs_base__io_reader__match7 returns whether the io_reader's upcoming bytes
-// start with the given prefix (up to 7 bytes long). It is peek-like, not
-// read-like, in that there are no side-effects.
-//
-// The low 3 bits of a hold the prefix length, n.
-//
-// The high 56 bits of a hold the prefix itself, in little-endian order. The
-// first prefix byte is in bits 8..=15, the second prefix byte is in bits
-// 16..=23, etc. The high (8 * (7 - n)) bits are ignored.
-//
-// There are three possible return values:
-//  - 0 means success.
-//  - 1 means inconclusive, equivalent to "$short read".
-//  - 2 means failure.
-static inline uint32_t  //
-wuffs_base__io_reader__match7(const uint8_t* iop_r,
-                              const uint8_t* io2_r,
-                              wuffs_base__io_buffer* r,
-                              uint64_t a) {
-  uint32_t n = a & 7;
-  a >>= 8;
-  if ((io2_r - iop_r) >= 8) {
-    uint64_t x = wuffs_base__load_u64le__no_bounds_check(iop_r);
-    uint32_t shift = 8 * (8 - n);
-    return ((a << shift) == (x << shift)) ? 0 : 2;
-  }
-  for (; n > 0; n--) {
-    if (iop_r >= io2_r) {
-      return (r && r->meta.closed) ? 2 : 1;
-    } else if (*iop_r != ((uint8_t)(a))) {
-      return 2;
-    }
-    iop_r++;
-    a >>= 8;
-  }
-  return 0;
-}
-
-static inline wuffs_base__io_buffer*  //
-wuffs_base__io_reader__set(wuffs_base__io_buffer* b,
-                           const uint8_t** ptr_iop_r,
-                           const uint8_t** ptr_io0_r,
-                           const uint8_t** ptr_io1_r,
-                           const uint8_t** ptr_io2_r,
-                           wuffs_base__slice_u8 data) {
-  b->data = data;
-  b->meta.wi = data.len;
-  b->meta.ri = 0;
-  b->meta.pos = 0;
-  b->meta.closed = false;
-
-  *ptr_iop_r = data.ptr;
-  *ptr_io0_r = data.ptr;
-  *ptr_io1_r = data.ptr;
-  *ptr_io2_r = data.ptr + data.len;
-
-  return b;
-}
-
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wcast-qual"
-// TODO: can we avoid the const_cast (by deleting this function)? This might
-// involve converting the call sites to take an io_reader instead of a slice u8
-// (the result of io_reader.take).
-static inline wuffs_base__slice_u8  //
-wuffs_base__io_reader__take(const uint8_t** ptr_iop_r,
-                            const uint8_t* io2_r,
-                            uint64_t n) {
-  if (n <= ((size_t)(io2_r - *ptr_iop_r))) {
-    const uint8_t* p = *ptr_iop_r;
-    *ptr_iop_r += n;
-    // The arg is what C calls C++'s "const_cast<uint8_t*>(p)".
-    return wuffs_base__make_slice_u8((uint8_t*)(p), n);
-  }
-  return wuffs_base__make_slice_u8(NULL, 0);
-}
-#pragma GCC diagnostic pop
-
 static inline wuffs_base__io_buffer*  //
 wuffs_base__io_writer__set(wuffs_base__io_buffer* b,
                            uint8_t** ptr_iop_w,
diff --git a/internal/cgen/data.go b/internal/cgen/data.go
index dc46046..1e6d285 100644
--- a/internal/cgen/data.go
+++ b/internal/cgen/data.go
@@ -325,14 +325,17 @@
 	""
 
 const baseIOPrivateH = "" +
-	"// ---------------- I/O\n\nstatic inline uint64_t  //\nwuffs_base__io__count_since(uint64_t mark, uint64_t index) {\n  if (index >= mark) {\n    return index - mark;\n  }\n  return 0;\n}\n\nstatic inline wuffs_base__slice_u8  //\nwuffs_base__io__since(uint64_t mark, uint64_t index, uint8_t* ptr) {\n  if (index >= mark) {\n    return wuffs_base__make_slice_u8(ptr + mark, index - mark);\n  }\n  return wuffs_base__make_slice_u8(NULL, 0);\n}\n\nstatic inline uint32_t  //\nwuffs_base__io_writer__copy_n32_from_history(uint8_t** ptr_iop_w,\n                                             uint8_t* io1_w,\n                                             uint8_t* io2_w,\n                                             uint32_t length,\n                                             uint32_t distance) {\n  if (!distance) {\n    return 0;\n  }\n  uint8_t* p = *ptr_iop_w;\n  if ((size_t)(p - io1_w) < (size_t)(distance)) {\n    return 0;\n  }\n  uint8_t* q = p - distance;\n  size_t n = (size_t)(io2_w - p);\n  if ((size_t)(length) > n) {\n    length = (uint32_t)(n);\n " +
-	" } else {\n    n = (size_t)(length);\n  }\n  // TODO: unrolling by 3 seems best for the std/deflate benchmarks, but that\n  // is mostly because 3 is the minimum length for the deflate format. This\n  // function implementation shouldn't overfit to that one format. Perhaps the\n  // copy_n32_from_history Wuffs method should also take an unroll hint\n  // argument, and the cgen can look if that argument is the constant\n  // expression '3'.\n  //\n  // See also wuffs_base__io_writer__copy_n32_from_history_fast below.\n  //\n  // Alternatively, or additionally, have a sloppy_copy_n32_from_history method\n  // that copies 8 bytes at a time, possibly writing more than length bytes?\n  for (; n >= 3; n -= 3) {\n    *p++ = *q++;\n    *p++ = *q++;\n    *p++ = *q++;\n  }\n  for (; n; n--) {\n    *p++ = *q++;\n  }\n  *ptr_iop_w = p;\n  return length;\n}\n\n// wuffs_base__io_writer__copy_n32_from_history_fast is like the\n// wuffs_base__io_writer__copy_n32_from_history function above, but has\n// stronger pre-conditions. The caller needs to prove" +
-	" that:\n//  - distance >  0\n//  - distance <= (*ptr_iop_w - io1_w)\n//  - length   <= (io2_w      - *ptr_iop_w)\nstatic inline uint32_t  //\nwuffs_base__io_writer__copy_n32_from_history_fast(uint8_t** ptr_iop_w,\n                                                  uint8_t* io1_w,\n                                                  uint8_t* io2_w,\n                                                  uint32_t length,\n                                                  uint32_t distance) {\n  uint8_t* p = *ptr_iop_w;\n  uint8_t* q = p - distance;\n  uint32_t n = length;\n  for (; n >= 3; n -= 3) {\n    *p++ = *q++;\n    *p++ = *q++;\n    *p++ = *q++;\n  }\n  for (; n; n--) {\n    *p++ = *q++;\n  }\n  *ptr_iop_w = p;\n  return length;\n}\n\nstatic inline uint32_t  //\nwuffs_base__io_writer__copy_n32_from_reader(uint8_t** ptr_iop_w,\n                                            uint8_t* io2_w,\n                                            uint32_t length,\n                                            const uint8_t** ptr_iop_r,\n                       " +
-	"                     const uint8_t* io2_r) {\n  uint8_t* iop_w = *ptr_iop_w;\n  size_t n = length;\n  if (n > ((size_t)(io2_w - iop_w))) {\n    n = (size_t)(io2_w - iop_w);\n  }\n  const uint8_t* iop_r = *ptr_iop_r;\n  if (n > ((size_t)(io2_r - iop_r))) {\n    n = (size_t)(io2_r - iop_r);\n  }\n  if (n > 0) {\n    memmove(iop_w, iop_r, n);\n    *ptr_iop_w += n;\n    *ptr_iop_r += n;\n  }\n  return (uint32_t)(n);\n}\n\nstatic inline uint64_t  //\nwuffs_base__io_writer__copy_from_slice(uint8_t** ptr_iop_w,\n                                       uint8_t* io2_w,\n                                       wuffs_base__slice_u8 src) {\n  uint8_t* iop_w = *ptr_iop_w;\n  size_t n = src.len;\n  if (n > ((size_t)(io2_w - iop_w))) {\n    n = (size_t)(io2_w - iop_w);\n  }\n  if (n > 0) {\n    memmove(iop_w, src.ptr, n);\n    *ptr_iop_w += n;\n  }\n  return (uint64_t)(n);\n}\n\nstatic inline uint32_t  //\nwuffs_base__io_writer__copy_n32_from_slice(uint8_t** ptr_iop_w,\n                                           uint8_t* io2_w,\n                                 " +
-	"          uint32_t length,\n                                           wuffs_base__slice_u8 src) {\n  uint8_t* iop_w = *ptr_iop_w;\n  size_t n = src.len;\n  if (n > length) {\n    n = length;\n  }\n  if (n > ((size_t)(io2_w - iop_w))) {\n    n = (size_t)(io2_w - iop_w);\n  }\n  if (n > 0) {\n    memmove(iop_w, src.ptr, n);\n    *ptr_iop_w += n;\n  }\n  return (uint32_t)(n);\n}\n\n// wuffs_base__io_reader__match7 returns whether the io_reader's upcoming bytes\n// start with the given prefix (up to 7 bytes long). It is peek-like, not\n// read-like, in that there are no side-effects.\n//\n// The low 3 bits of a hold the prefix length, n.\n//\n// The high 56 bits of a hold the prefix itself, in little-endian order. The\n// first prefix byte is in bits 8..=15, the second prefix byte is in bits\n// 16..=23, etc. The high (8 * (7 - n)) bits are ignored.\n//\n// There are three possible return values:\n//  - 0 means success.\n//  - 1 means inconclusive, equivalent to \"$short read\".\n//  - 2 means failure.\nstatic inline uint32_t  //\nwuffs_base__io" +
-	"_reader__match7(const uint8_t* iop_r,\n                              const uint8_t* io2_r,\n                              wuffs_base__io_buffer* r,\n                              uint64_t a) {\n  uint32_t n = a & 7;\n  a >>= 8;\n  if ((io2_r - iop_r) >= 8) {\n    uint64_t x = wuffs_base__load_u64le__no_bounds_check(iop_r);\n    uint32_t shift = 8 * (8 - n);\n    return ((a << shift) == (x << shift)) ? 0 : 2;\n  }\n  for (; n > 0; n--) {\n    if (iop_r >= io2_r) {\n      return (r && r->meta.closed) ? 2 : 1;\n    } else if (*iop_r != ((uint8_t)(a))) {\n      return 2;\n    }\n    iop_r++;\n    a >>= 8;\n  }\n  return 0;\n}\n\nstatic inline wuffs_base__io_buffer*  //\nwuffs_base__io_reader__set(wuffs_base__io_buffer* b,\n                           const uint8_t** ptr_iop_r,\n                           const uint8_t** ptr_io0_r,\n                           const uint8_t** ptr_io1_r,\n                           const uint8_t** ptr_io2_r,\n                           wuffs_base__slice_u8 data) {\n  b->data = data;\n  b->meta.wi = data.len;\n  b->" +
-	"meta.ri = 0;\n  b->meta.pos = 0;\n  b->meta.closed = false;\n\n  *ptr_iop_r = data.ptr;\n  *ptr_io0_r = data.ptr;\n  *ptr_io1_r = data.ptr;\n  *ptr_io2_r = data.ptr + data.len;\n\n  return b;\n}\n\n#pragma GCC diagnostic push\n#pragma GCC diagnostic ignored \"-Wcast-qual\"\n// TODO: can we avoid the const_cast (by deleting this function)? This might\n// involve converting the call sites to take an io_reader instead of a slice u8\n// (the result of io_reader.take).\nstatic inline wuffs_base__slice_u8  //\nwuffs_base__io_reader__take(const uint8_t** ptr_iop_r,\n                            const uint8_t* io2_r,\n                            uint64_t n) {\n  if (n <= ((size_t)(io2_r - *ptr_iop_r))) {\n    const uint8_t* p = *ptr_iop_r;\n    *ptr_iop_r += n;\n    // The arg is what C calls C++'s \"const_cast<uint8_t*>(p)\".\n    return wuffs_base__make_slice_u8((uint8_t*)(p), n);\n  }\n  return wuffs_base__make_slice_u8(NULL, 0);\n}\n#pragma GCC diagnostic pop\n\nstatic inline wuffs_base__io_buffer*  //\nwuffs_base__io_writer__set(wuffs_base__io_buff" +
-	"er* b,\n                           uint8_t** ptr_iop_w,\n                           uint8_t** ptr_io0_w,\n                           uint8_t** ptr_io1_w,\n                           uint8_t** ptr_io2_w,\n                           wuffs_base__slice_u8 data) {\n  b->data = data;\n  b->meta.wi = 0;\n  b->meta.ri = 0;\n  b->meta.pos = 0;\n  b->meta.closed = false;\n\n  *ptr_iop_w = data.ptr;\n  *ptr_io0_w = data.ptr;\n  *ptr_io1_w = data.ptr;\n  *ptr_io2_w = data.ptr + data.len;\n\n  return b;\n}\n\n  " +
+	"// ---------------- I/O\n\nstatic inline uint64_t  //\nwuffs_base__io__count_since(uint64_t mark, uint64_t index) {\n  if (index >= mark) {\n    return index - mark;\n  }\n  return 0;\n}\n\nstatic inline wuffs_base__slice_u8  //\nwuffs_base__io__since(uint64_t mark, uint64_t index, uint8_t* ptr) {\n  if (index >= mark) {\n    return wuffs_base__make_slice_u8(ptr + mark, index - mark);\n  }\n  return wuffs_base__make_slice_u8(NULL, 0);\n}\n\n" +
+	"" +
+	"// --------\n\n// wuffs_base__io_reader__match7 returns whether the io_reader's upcoming bytes\n// start with the given prefix (up to 7 bytes long). It is peek-like, not\n// read-like, in that there are no side-effects.\n//\n// The low 3 bits of a hold the prefix length, n.\n//\n// The high 56 bits of a hold the prefix itself, in little-endian order. The\n// first prefix byte is in bits 8..=15, the second prefix byte is in bits\n// 16..=23, etc. The high (8 * (7 - n)) bits are ignored.\n//\n// There are three possible return values:\n//  - 0 means success.\n//  - 1 means inconclusive, equivalent to \"$short read\".\n//  - 2 means failure.\nstatic inline uint32_t  //\nwuffs_base__io_reader__match7(const uint8_t* iop_r,\n                              const uint8_t* io2_r,\n                              wuffs_base__io_buffer* r,\n                              uint64_t a) {\n  uint32_t n = a & 7;\n  a >>= 8;\n  if ((io2_r - iop_r) >= 8) {\n    uint64_t x = wuffs_base__load_u64le__no_bounds_check(iop_r);\n    uint32_t shift = 8 * (8 - n);\n " +
+	"   return ((a << shift) == (x << shift)) ? 0 : 2;\n  }\n  for (; n > 0; n--) {\n    if (iop_r >= io2_r) {\n      return (r && r->meta.closed) ? 2 : 1;\n    } else if (*iop_r != ((uint8_t)(a))) {\n      return 2;\n    }\n    iop_r++;\n    a >>= 8;\n  }\n  return 0;\n}\n\nstatic inline wuffs_base__io_buffer*  //\nwuffs_base__io_reader__set(wuffs_base__io_buffer* b,\n                           const uint8_t** ptr_iop_r,\n                           const uint8_t** ptr_io0_r,\n                           const uint8_t** ptr_io1_r,\n                           const uint8_t** ptr_io2_r,\n                           wuffs_base__slice_u8 data) {\n  b->data = data;\n  b->meta.wi = data.len;\n  b->meta.ri = 0;\n  b->meta.pos = 0;\n  b->meta.closed = false;\n\n  *ptr_iop_r = data.ptr;\n  *ptr_io0_r = data.ptr;\n  *ptr_io1_r = data.ptr;\n  *ptr_io2_r = data.ptr + data.len;\n\n  return b;\n}\n\n#pragma GCC diagnostic push\n#pragma GCC diagnostic ignored \"-Wcast-qual\"\n// TODO: can we avoid the const_cast (by deleting this function)? This might\n// involve conver" +
+	"ting the call sites to take an io_reader instead of a slice u8\n// (the result of io_reader.take).\nstatic inline wuffs_base__slice_u8  //\nwuffs_base__io_reader__take(const uint8_t** ptr_iop_r,\n                            const uint8_t* io2_r,\n                            uint64_t n) {\n  if (n <= ((size_t)(io2_r - *ptr_iop_r))) {\n    const uint8_t* p = *ptr_iop_r;\n    *ptr_iop_r += n;\n    // The arg is what C calls C++'s \"const_cast<uint8_t*>(p)\".\n    return wuffs_base__make_slice_u8((uint8_t*)(p), n);\n  }\n  return wuffs_base__make_slice_u8(NULL, 0);\n}\n#pragma GCC diagnostic pop\n\n" +
+	"" +
+	"// --------\n\nstatic inline uint64_t  //\nwuffs_base__io_writer__copy_from_slice(uint8_t** ptr_iop_w,\n                                       uint8_t* io2_w,\n                                       wuffs_base__slice_u8 src) {\n  uint8_t* iop_w = *ptr_iop_w;\n  size_t n = src.len;\n  if (n > ((size_t)(io2_w - iop_w))) {\n    n = (size_t)(io2_w - iop_w);\n  }\n  if (n > 0) {\n    memmove(iop_w, src.ptr, n);\n    *ptr_iop_w += n;\n  }\n  return (uint64_t)(n);\n}\n\nstatic inline uint32_t  //\nwuffs_base__io_writer__copy_n32_from_history(uint8_t** ptr_iop_w,\n                                             uint8_t* io1_w,\n                                             uint8_t* io2_w,\n                                             uint32_t length,\n                                             uint32_t distance) {\n  if (!distance) {\n    return 0;\n  }\n  uint8_t* p = *ptr_iop_w;\n  if ((size_t)(p - io1_w) < (size_t)(distance)) {\n    return 0;\n  }\n  uint8_t* q = p - distance;\n  size_t n = (size_t)(io2_w - p);\n  if ((size_t)(length) > n) {\n    le" +
+	"ngth = (uint32_t)(n);\n  } else {\n    n = (size_t)(length);\n  }\n  // TODO: unrolling by 3 seems best for the std/deflate benchmarks, but that\n  // is mostly because 3 is the minimum length for the deflate format. This\n  // function implementation shouldn't overfit to that one format. Perhaps the\n  // copy_n32_from_history Wuffs method should also take an unroll hint\n  // argument, and the cgen can look if that argument is the constant\n  // expression '3'.\n  //\n  // See also wuffs_base__io_writer__copy_n32_from_history_fast below.\n  //\n  // Alternatively, or additionally, have a sloppy_copy_n32_from_history method\n  // that copies 8 bytes at a time, possibly writing more than length bytes?\n  for (; n >= 3; n -= 3) {\n    *p++ = *q++;\n    *p++ = *q++;\n    *p++ = *q++;\n  }\n  for (; n; n--) {\n    *p++ = *q++;\n  }\n  *ptr_iop_w = p;\n  return length;\n}\n\n// wuffs_base__io_writer__copy_n32_from_history_fast is like the\n// wuffs_base__io_writer__copy_n32_from_history function above, but has\n// stronger pre-conditions. Th" +
+	"e caller needs to prove that:\n//  - distance >  0\n//  - distance <= (*ptr_iop_w - io1_w)\n//  - length   <= (io2_w      - *ptr_iop_w)\nstatic inline uint32_t  //\nwuffs_base__io_writer__copy_n32_from_history_fast(uint8_t** ptr_iop_w,\n                                                  uint8_t* io1_w,\n                                                  uint8_t* io2_w,\n                                                  uint32_t length,\n                                                  uint32_t distance) {\n  uint8_t* p = *ptr_iop_w;\n  uint8_t* q = p - distance;\n  uint32_t n = length;\n  for (; n >= 3; n -= 3) {\n    *p++ = *q++;\n    *p++ = *q++;\n    *p++ = *q++;\n  }\n  for (; n; n--) {\n    *p++ = *q++;\n  }\n  *ptr_iop_w = p;\n  return length;\n}\n\nstatic inline uint32_t  //\nwuffs_base__io_writer__copy_n32_from_reader(uint8_t** ptr_iop_w,\n                                            uint8_t* io2_w,\n                                            uint32_t length,\n                                            const uint8_t** ptr_iop_r,\n" +
+	"                                            const uint8_t* io2_r) {\n  uint8_t* iop_w = *ptr_iop_w;\n  size_t n = length;\n  if (n > ((size_t)(io2_w - iop_w))) {\n    n = (size_t)(io2_w - iop_w);\n  }\n  const uint8_t* iop_r = *ptr_iop_r;\n  if (n > ((size_t)(io2_r - iop_r))) {\n    n = (size_t)(io2_r - iop_r);\n  }\n  if (n > 0) {\n    memmove(iop_w, iop_r, n);\n    *ptr_iop_w += n;\n    *ptr_iop_r += n;\n  }\n  return (uint32_t)(n);\n}\n\nstatic inline uint32_t  //\nwuffs_base__io_writer__copy_n32_from_slice(uint8_t** ptr_iop_w,\n                                           uint8_t* io2_w,\n                                           uint32_t length,\n                                           wuffs_base__slice_u8 src) {\n  uint8_t* iop_w = *ptr_iop_w;\n  size_t n = src.len;\n  if (n > length) {\n    n = length;\n  }\n  if (n > ((size_t)(io2_w - iop_w))) {\n    n = (size_t)(io2_w - iop_w);\n  }\n  if (n > 0) {\n    memmove(iop_w, src.ptr, n);\n    *ptr_iop_w += n;\n  }\n  return (uint32_t)(n);\n}\n\nstatic inline wuffs_base__io_buffer*  //\nwuffs_b" +
+	"ase__io_writer__set(wuffs_base__io_buffer* b,\n                           uint8_t** ptr_iop_w,\n                           uint8_t** ptr_io0_w,\n                           uint8_t** ptr_io1_w,\n                           uint8_t** ptr_io2_w,\n                           wuffs_base__slice_u8 data) {\n  b->data = data;\n  b->meta.wi = 0;\n  b->meta.ri = 0;\n  b->meta.pos = 0;\n  b->meta.closed = false;\n\n  *ptr_iop_w = data.ptr;\n  *ptr_io0_w = data.ptr;\n  *ptr_io1_w = data.ptr;\n  *ptr_io2_w = data.ptr + data.len;\n\n  return b;\n}\n\n  " +
 	"" +
 	"// ---------------- I/O (Utility)\n\n#define wuffs_base__utility__empty_io_reader wuffs_base__empty_io_reader\n#define wuffs_base__utility__empty_io_writer wuffs_base__empty_io_writer\n" +
 	""
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index b5bf088..e310080 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -7639,6 +7639,104 @@
   return wuffs_base__make_slice_u8(NULL, 0);
 }
 
+// --------
+
+// wuffs_base__io_reader__match7 returns whether the io_reader's upcoming bytes
+// start with the given prefix (up to 7 bytes long). It is peek-like, not
+// read-like, in that there are no side-effects.
+//
+// The low 3 bits of a hold the prefix length, n.
+//
+// The high 56 bits of a hold the prefix itself, in little-endian order. The
+// first prefix byte is in bits 8..=15, the second prefix byte is in bits
+// 16..=23, etc. The high (8 * (7 - n)) bits are ignored.
+//
+// There are three possible return values:
+//  - 0 means success.
+//  - 1 means inconclusive, equivalent to "$short read".
+//  - 2 means failure.
+static inline uint32_t  //
+wuffs_base__io_reader__match7(const uint8_t* iop_r,
+                              const uint8_t* io2_r,
+                              wuffs_base__io_buffer* r,
+                              uint64_t a) {
+  uint32_t n = a & 7;
+  a >>= 8;
+  if ((io2_r - iop_r) >= 8) {
+    uint64_t x = wuffs_base__load_u64le__no_bounds_check(iop_r);
+    uint32_t shift = 8 * (8 - n);
+    return ((a << shift) == (x << shift)) ? 0 : 2;
+  }
+  for (; n > 0; n--) {
+    if (iop_r >= io2_r) {
+      return (r && r->meta.closed) ? 2 : 1;
+    } else if (*iop_r != ((uint8_t)(a))) {
+      return 2;
+    }
+    iop_r++;
+    a >>= 8;
+  }
+  return 0;
+}
+
+static inline wuffs_base__io_buffer*  //
+wuffs_base__io_reader__set(wuffs_base__io_buffer* b,
+                           const uint8_t** ptr_iop_r,
+                           const uint8_t** ptr_io0_r,
+                           const uint8_t** ptr_io1_r,
+                           const uint8_t** ptr_io2_r,
+                           wuffs_base__slice_u8 data) {
+  b->data = data;
+  b->meta.wi = data.len;
+  b->meta.ri = 0;
+  b->meta.pos = 0;
+  b->meta.closed = false;
+
+  *ptr_iop_r = data.ptr;
+  *ptr_io0_r = data.ptr;
+  *ptr_io1_r = data.ptr;
+  *ptr_io2_r = data.ptr + data.len;
+
+  return b;
+}
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-qual"
+// TODO: can we avoid the const_cast (by deleting this function)? This might
+// involve converting the call sites to take an io_reader instead of a slice u8
+// (the result of io_reader.take).
+static inline wuffs_base__slice_u8  //
+wuffs_base__io_reader__take(const uint8_t** ptr_iop_r,
+                            const uint8_t* io2_r,
+                            uint64_t n) {
+  if (n <= ((size_t)(io2_r - *ptr_iop_r))) {
+    const uint8_t* p = *ptr_iop_r;
+    *ptr_iop_r += n;
+    // The arg is what C calls C++'s "const_cast<uint8_t*>(p)".
+    return wuffs_base__make_slice_u8((uint8_t*)(p), n);
+  }
+  return wuffs_base__make_slice_u8(NULL, 0);
+}
+#pragma GCC diagnostic pop
+
+// --------
+
+static inline uint64_t  //
+wuffs_base__io_writer__copy_from_slice(uint8_t** ptr_iop_w,
+                                       uint8_t* io2_w,
+                                       wuffs_base__slice_u8 src) {
+  uint8_t* iop_w = *ptr_iop_w;
+  size_t n = src.len;
+  if (n > ((size_t)(io2_w - iop_w))) {
+    n = (size_t)(io2_w - iop_w);
+  }
+  if (n > 0) {
+    memmove(iop_w, src.ptr, n);
+    *ptr_iop_w += n;
+  }
+  return (uint64_t)(n);
+}
+
 static inline uint32_t  //
 wuffs_base__io_writer__copy_n32_from_history(uint8_t** ptr_iop_w,
                                              uint8_t* io1_w,
@@ -7732,22 +7830,6 @@
   return (uint32_t)(n);
 }
 
-static inline uint64_t  //
-wuffs_base__io_writer__copy_from_slice(uint8_t** ptr_iop_w,
-                                       uint8_t* io2_w,
-                                       wuffs_base__slice_u8 src) {
-  uint8_t* iop_w = *ptr_iop_w;
-  size_t n = src.len;
-  if (n > ((size_t)(io2_w - iop_w))) {
-    n = (size_t)(io2_w - iop_w);
-  }
-  if (n > 0) {
-    memmove(iop_w, src.ptr, n);
-    *ptr_iop_w += n;
-  }
-  return (uint64_t)(n);
-}
-
 static inline uint32_t  //
 wuffs_base__io_writer__copy_n32_from_slice(uint8_t** ptr_iop_w,
                                            uint8_t* io2_w,
@@ -7768,84 +7850,6 @@
   return (uint32_t)(n);
 }
 
-// wuffs_base__io_reader__match7 returns whether the io_reader's upcoming bytes
-// start with the given prefix (up to 7 bytes long). It is peek-like, not
-// read-like, in that there are no side-effects.
-//
-// The low 3 bits of a hold the prefix length, n.
-//
-// The high 56 bits of a hold the prefix itself, in little-endian order. The
-// first prefix byte is in bits 8..=15, the second prefix byte is in bits
-// 16..=23, etc. The high (8 * (7 - n)) bits are ignored.
-//
-// There are three possible return values:
-//  - 0 means success.
-//  - 1 means inconclusive, equivalent to "$short read".
-//  - 2 means failure.
-static inline uint32_t  //
-wuffs_base__io_reader__match7(const uint8_t* iop_r,
-                              const uint8_t* io2_r,
-                              wuffs_base__io_buffer* r,
-                              uint64_t a) {
-  uint32_t n = a & 7;
-  a >>= 8;
-  if ((io2_r - iop_r) >= 8) {
-    uint64_t x = wuffs_base__load_u64le__no_bounds_check(iop_r);
-    uint32_t shift = 8 * (8 - n);
-    return ((a << shift) == (x << shift)) ? 0 : 2;
-  }
-  for (; n > 0; n--) {
-    if (iop_r >= io2_r) {
-      return (r && r->meta.closed) ? 2 : 1;
-    } else if (*iop_r != ((uint8_t)(a))) {
-      return 2;
-    }
-    iop_r++;
-    a >>= 8;
-  }
-  return 0;
-}
-
-static inline wuffs_base__io_buffer*  //
-wuffs_base__io_reader__set(wuffs_base__io_buffer* b,
-                           const uint8_t** ptr_iop_r,
-                           const uint8_t** ptr_io0_r,
-                           const uint8_t** ptr_io1_r,
-                           const uint8_t** ptr_io2_r,
-                           wuffs_base__slice_u8 data) {
-  b->data = data;
-  b->meta.wi = data.len;
-  b->meta.ri = 0;
-  b->meta.pos = 0;
-  b->meta.closed = false;
-
-  *ptr_iop_r = data.ptr;
-  *ptr_io0_r = data.ptr;
-  *ptr_io1_r = data.ptr;
-  *ptr_io2_r = data.ptr + data.len;
-
-  return b;
-}
-
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wcast-qual"
-// TODO: can we avoid the const_cast (by deleting this function)? This might
-// involve converting the call sites to take an io_reader instead of a slice u8
-// (the result of io_reader.take).
-static inline wuffs_base__slice_u8  //
-wuffs_base__io_reader__take(const uint8_t** ptr_iop_r,
-                            const uint8_t* io2_r,
-                            uint64_t n) {
-  if (n <= ((size_t)(io2_r - *ptr_iop_r))) {
-    const uint8_t* p = *ptr_iop_r;
-    *ptr_iop_r += n;
-    // The arg is what C calls C++'s "const_cast<uint8_t*>(p)".
-    return wuffs_base__make_slice_u8((uint8_t*)(p), n);
-  }
-  return wuffs_base__make_slice_u8(NULL, 0);
-}
-#pragma GCC diagnostic pop
-
 static inline wuffs_base__io_buffer*  //
 wuffs_base__io_writer__set(wuffs_base__io_buffer* b,
                            uint8_t** ptr_iop_w,