Add etc_copy_u32_from_history_8_byte_chunks_fast

On a mid-range x86_64 laptop:

----

$ ls -lh linux-5.11.3.tar.gz | awk '{print $5 " " $9}'
178M linux-5.11.3.tar.gz

$ # Debian's /usr/bin/zcat
$ time /usr/bin/zcat        < linux-5.11.3.tar.gz > /dev/null
real	0m8.423s

$ # Wuffs; before
$ time gen/bin/example-zcat < linux-5.11.3.tar.gz > /dev/null
real	0m4.054s

$ # Wuffs; after
$ time gen/bin/example-zcat < linux-5.11.3.tar.gz > /dev/null
real	0m2.651s

----

Wuffs benchmarks:

wuffs_deflate_decode_1k_full_init/clang9                  95.6MB/s ± 0%  98.1MB/s ± 1%   +2.62%  (p=0.008 n=5+5)
wuffs_deflate_decode_1k_part_init/clang9                   109MB/s ± 0%   112MB/s ± 0%   +3.31%  (p=0.008 n=5+5)
wuffs_deflate_decode_10k_full_init/clang9                  162MB/s ± 0%   201MB/s ± 0%  +24.32%  (p=0.008 n=5+5)
wuffs_deflate_decode_10k_part_init/clang9                  165MB/s ± 0%   206MB/s ± 0%  +25.16%  (p=0.008 n=5+5)
wuffs_deflate_decode_100k_just_one_read/clang9             170MB/s ± 0%   253MB/s ± 0%  +48.63%  (p=0.008 n=5+5)
wuffs_deflate_decode_100k_many_big_reads/clang9            147MB/s ± 0%   160MB/s ± 0%   +8.77%  (p=0.016 n=4+5)

wuffs_deflate_decode_1k_full_init/gcc10                   92.1MB/s ± 0%  91.7MB/s ± 0%   -0.44%  (p=0.016 n=5+5)
wuffs_deflate_decode_1k_part_init/gcc10                    107MB/s ± 0%   104MB/s ± 0%   -3.35%  (p=0.008 n=5+5)
wuffs_deflate_decode_10k_full_init/gcc10                   149MB/s ± 0%   195MB/s ± 0%  +30.68%  (p=0.008 n=5+5)
wuffs_deflate_decode_10k_part_init/gcc10                   152MB/s ± 1%   199MB/s ± 1%  +31.23%  (p=0.008 n=5+5)
wuffs_deflate_decode_100k_just_one_read/gcc10              213MB/s ± 0%   256MB/s ± 0%  +20.42%  (p=0.008 n=5+5)
wuffs_deflate_decode_100k_many_big_reads/gcc10             177MB/s ± 0%   177MB/s ± 1%     ~     (p=0.595 n=5+5)

mimic_deflate_decode_1k                                    128MB/s ± 0%   128MB/s ± 0%   +0.42%  (p=0.016 n=5+4)
mimic_deflate_decode_10k                                   143MB/s ± 0%   143MB/s ± 0%     ~     (p=0.310 n=5+5)
mimic_deflate_decode_100k_just_one_read                    176MB/s ± 0%   176MB/s ± 0%   -0.24%  (p=0.008 n=5+5)
mimic_deflate_decode_100k_many_big_reads                   135MB/s ± 0%   135MB/s ± 0%   +0.12%  (p=0.016 n=5+5)

----

wuffs_gzip_decode_10k/clang9                               166MB/s ± 0%   198MB/s ± 1%  +19.66%  (p=0.008 n=5+5)
wuffs_gzip_decode_100k/clang9                              178MB/s ± 0%   241MB/s ± 0%  +35.41%  (p=0.008 n=5+5)

wuffs_gzip_decode_10k/gcc10                                172MB/s ± 0%   199MB/s ± 0%  +15.67%  (p=0.008 n=5+5)
wuffs_gzip_decode_100k/gcc10                               198MB/s ± 0%   258MB/s ± 0%  +29.95%  (p=0.008 n=5+5)

mimic_gzip_decode_10k                                      121MB/s ± 0%   121MB/s ± 0%     ~     (p=0.421 n=5+5)
mimic_gzip_decode_100k                                     144MB/s ± 0%   144MB/s ± 0%     ~     (p=0.548 n=5+5)

----

wuffs_zlib_decode_10k/clang9                               154MB/s ± 0%   199MB/s ± 0%  +28.89%  (p=0.016 n=4+5)
wuffs_zlib_decode_100k/clang9                              186MB/s ± 1%   234MB/s ± 1%  +25.58%  (p=0.008 n=5+5)

wuffs_zlib_decode_10k/gcc10                                156MB/s ± 0%   202MB/s ± 0%  +30.01%  (p=0.008 n=5+5)
wuffs_zlib_decode_100k/gcc10                               198MB/s ± 0%   262MB/s ± 0%  +32.21%  (p=0.008 n=5+5)

mimic_zlib_decode_10k                                      133MB/s ± 0%   133MB/s ± 0%     ~     (p=1.000 n=5+5)
mimic_zlib_decode_100k                                     160MB/s ± 0%   160MB/s ± 0%     ~     (p=0.095 n=5+5)

----

wuffs_png_decode_image_19k_8bpp/clang9                     115MB/s ± 0%   131MB/s ± 0%  +14.45%  (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/clang9                    153MB/s ± 0%   153MB/s ± 0%   -0.51%  (p=0.008 n=5+5)
wuffs_png_decode_image_77k_8bpp/clang9                     415MB/s ± 0%   474MB/s ± 0%  +14.37%  (p=0.016 n=4+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/clang9   345MB/s ± 1%   371MB/s ± 0%   +7.62%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/clang9   335MB/s ± 0%   359MB/s ± 0%   +7.06%  (p=0.016 n=4+5)
wuffs_png_decode_image_4002k_24bpp/clang9                  153MB/s ± 0%   156MB/s ± 0%   +2.37%  (p=0.008 n=5+5)

wuffs_png_decode_image_19k_8bpp/gcc10                      123MB/s ± 0%   135MB/s ± 1%   +9.38%  (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/gcc10                     159MB/s ± 1%   163MB/s ± 0%   +2.60%  (p=0.016 n=5+4)
wuffs_png_decode_image_77k_8bpp/gcc10                      452MB/s ± 0%   479MB/s ± 0%   +6.00%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/gcc10    353MB/s ± 0%   387MB/s ± 0%   +9.42%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/gcc10    341MB/s ± 0%   372MB/s ± 0%   +9.07%  (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/gcc10                   161MB/s ± 0%   164MB/s ± 0%   +1.86%  (p=0.016 n=4+5)

mimic_png_decode_image_19k_8bpp                           58.1MB/s ± 0%  58.2MB/s ± 0%   +0.22%  (p=0.008 n=5+5)
mimic_png_decode_image_40k_24bpp                          73.2MB/s ± 0%  73.2MB/s ± 0%     ~     (p=0.341 n=5+5)
mimic_png_decode_image_77k_8bpp                            177MB/s ± 0%   177MB/s ± 0%     ~     (p=0.095 n=5+5)
mimic_png_decode_image_552k_32bpp_ignore_checksum               skipped
mimic_png_decode_image_552k_32bpp_verify_checksum          147MB/s ± 0%   146MB/s ± 0%     ~     (p=0.056 n=5+5)
mimic_png_decode_image_4002k_24bpp                         104MB/s ± 0%   104MB/s ± 0%     ~     (p=0.841 n=4+5)
diff --git a/internal/cgen/base/io-private.h b/internal/cgen/base/io-private.h
index 77e25a9..11b6274 100644
--- a/internal/cgen/base/io-private.h
+++ b/internal/cgen/base/io-private.h
@@ -204,7 +204,9 @@
 
 // wuffs_base__io_writer__limited_copy_u32_from_history_fast is like the
 // wuffs_base__io_writer__limited_copy_u32_from_history function above, but has
-// stronger pre-conditions. The caller needs to prove that:
+// stronger pre-conditions.
+//
+// The caller needs to prove that:
 //  - distance >  0
 //  - distance <= (*ptr_iop_w - io1_w)
 //  - length   <= (io2_w      - *ptr_iop_w)
@@ -229,6 +231,44 @@
   return length;
 }
 
+// wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast is
+// like the wuffs_base__io_writer__limited_copy_u32_from_history_fast function
+// above, but copies 8 byte chunks at a time.
+//
+// In terms of number of bytes copied, length is rounded up to a multiple of 8.
+// As a special case, a zero length rounds up to 8 (even though 0 is already a
+// multiple of 8), since there is always at least one 8 byte chunk copied.
+//
+// In terms of advancing *ptr_iop_w, length is not rounded up.
+//
+// The caller needs to prove that:
+//  - distance     >  0
+//  - distance     <= (*ptr_iop_w - io1_w)
+//  - (length + 8) <= (io2_w      - *ptr_iop_w)
+static inline uint32_t  //
+wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast(
+    uint8_t** ptr_iop_w,
+    uint8_t* io1_w,
+    uint8_t* io2_w,
+    uint32_t length,
+    uint32_t distance) {
+  uint8_t* p = *ptr_iop_w;
+  uint8_t* q = p - distance;
+  uint32_t n = length;
+  while (1) {
+    memcpy(p, q, 8);
+    if (n <= 8) {
+      p += n;
+      break;
+    }
+    p += 8;
+    q += 8;
+    n -= 8;
+  }
+  *ptr_iop_w = p;
+  return length;
+}
+
 static inline uint32_t  //
 wuffs_base__io_writer__limited_copy_u32_from_reader(uint8_t** ptr_iop_w,
                                                     uint8_t* io2_w,
diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go
index be32c70..cb10c1d 100644
--- a/internal/cgen/builtin.go
+++ b/internal/cgen/builtin.go
@@ -307,13 +307,11 @@
 	}
 
 	switch method {
-	case t.IDLimitedCopyU32FromHistory, t.IDLimitedCopyU32FromHistoryFast:
-		suffix := ""
-		if method == t.IDLimitedCopyU32FromHistoryFast {
-			suffix = "_fast"
-		}
-		b.printf("wuffs_base__io_writer__limited_copy_u32_from_history%s(\n&%s%s, %s%s, %s%s",
-			suffix, iopPrefix, recvName, io0Prefix, recvName, io2Prefix, recvName)
+	case t.IDLimitedCopyU32FromHistory,
+		t.IDLimitedCopyU32FromHistory8ByteChunksFast,
+		t.IDLimitedCopyU32FromHistoryFast:
+		b.printf("wuffs_base__io_writer__%s(\n&%s%s, %s%s, %s%s",
+			method.Str(g.tm), iopPrefix, recvName, io0Prefix, recvName, io2Prefix, recvName)
 		for _, o := range args {
 			b.writes(", ")
 			if err := g.writeExpr(b, o.AsArg().Value(), false, depth); err != nil {
diff --git a/internal/cgen/data/data.go b/internal/cgen/data/data.go
index 535b79a..1c91fcc 100644
--- a/internal/cgen/data/data.go
+++ b/internal/cgen/data/data.go
@@ -212,9 +212,11 @@
 	"" +
 	"// --------\n\nstatic inline uint64_t  //\nwuffs_base__io_writer__copy_from_slice(uint8_t** ptr_iop_w,\n                                       uint8_t* io2_w,\n                                       wuffs_base__slice_u8 src) {\n  uint8_t* iop_w = *ptr_iop_w;\n  size_t n = src.len;\n  if (n > ((size_t)(io2_w - iop_w))) {\n    n = (size_t)(io2_w - iop_w);\n  }\n  if (n > 0) {\n    memmove(iop_w, src.ptr, n);\n    *ptr_iop_w += n;\n  }\n  return (uint64_t)(n);\n}\n\nstatic inline void  //\nwuffs_base__io_writer__limit(uint8_t** ptr_io2_w,\n                             uint8_t* iop_w,\n                             uint64_t limit) {\n  if (((uint64_t)(*ptr_io2_w - iop_w)) > limit) {\n    *ptr_io2_w = iop_w + limit;\n  }\n}\n\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_history(uint8_t** ptr_iop_w,\n                                                     uint8_t* io1_w,\n                                                     uint8_t* io2_w,\n                                                     uint32_t length,\n           " +
 	"                                          uint32_t distance) {\n  if (!distance) {\n    return 0;\n  }\n  uint8_t* p = *ptr_iop_w;\n  if ((size_t)(p - io1_w) < (size_t)(distance)) {\n    return 0;\n  }\n  uint8_t* q = p - distance;\n  size_t n = (size_t)(io2_w - p);\n  if ((size_t)(length) > n) {\n    length = (uint32_t)(n);\n  } else {\n    n = (size_t)(length);\n  }\n  // TODO: unrolling by 3 seems best for the std/deflate benchmarks, but that\n  // is mostly because 3 is the minimum length for the deflate format. This\n  // function implementation shouldn't overfit to that one format. Perhaps the\n  // limited_copy_u32_from_history Wuffs method should also take an unroll hint\n  // argument, and the cgen can look if that argument is the constant\n  // expression '3'.\n  //\n  // See also wuffs_base__io_writer__limited_copy_u32_from_history_fast below.\n  //\n  // Alternatively or additionally, have a sloppy_limited_copy_u32_from_history\n  // method that copies 8 bytes at a time, which can more than length bytes?\n  for (; n >= 3; " +
-	"n -= 3) {\n    *p++ = *q++;\n    *p++ = *q++;\n    *p++ = *q++;\n  }\n  for (; n; n--) {\n    *p++ = *q++;\n  }\n  *ptr_iop_w = p;\n  return length;\n}\n\n// wuffs_base__io_writer__limited_copy_u32_from_history_fast is like the\n// wuffs_base__io_writer__limited_copy_u32_from_history function above, but has\n// stronger pre-conditions. The caller needs to prove that:\n//  - distance >  0\n//  - distance <= (*ptr_iop_w - io1_w)\n//  - length   <= (io2_w      - *ptr_iop_w)\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_history_fast(uint8_t** ptr_iop_w,\n                                                          uint8_t* io1_w,\n                                                          uint8_t* io2_w,\n                                                          uint32_t length,\n                                                          uint32_t distance) {\n  uint8_t* p = *ptr_iop_w;\n  uint8_t* q = p - distance;\n  uint32_t n = length;\n  for (; n >= 3; n -= 3) {\n    *p++ = *q++;\n    *p++ = *q++;\n    *p++ = *q++;\n" +
-	"  }\n  for (; n; n--) {\n    *p++ = *q++;\n  }\n  *ptr_iop_w = p;\n  return length;\n}\n\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_reader(uint8_t** ptr_iop_w,\n                                                    uint8_t* io2_w,\n                                                    uint32_t length,\n                                                    const uint8_t** ptr_iop_r,\n                                                    const uint8_t* io2_r) {\n  uint8_t* iop_w = *ptr_iop_w;\n  size_t n = length;\n  if (n > ((size_t)(io2_w - iop_w))) {\n    n = (size_t)(io2_w - iop_w);\n  }\n  const uint8_t* iop_r = *ptr_iop_r;\n  if (n > ((size_t)(io2_r - iop_r))) {\n    n = (size_t)(io2_r - iop_r);\n  }\n  if (n > 0) {\n    memmove(iop_w, iop_r, n);\n    *ptr_iop_w += n;\n    *ptr_iop_r += n;\n  }\n  return (uint32_t)(n);\n}\n\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_slice(uint8_t** ptr_iop_w,\n                                                   uint8_t* io2_w,\n                         " +
-	"                          uint32_t length,\n                                                   wuffs_base__slice_u8 src) {\n  uint8_t* iop_w = *ptr_iop_w;\n  size_t n = src.len;\n  if (n > length) {\n    n = length;\n  }\n  if (n > ((size_t)(io2_w - iop_w))) {\n    n = (size_t)(io2_w - iop_w);\n  }\n  if (n > 0) {\n    memmove(iop_w, src.ptr, n);\n    *ptr_iop_w += n;\n  }\n  return (uint32_t)(n);\n}\n\nstatic inline wuffs_base__io_buffer*  //\nwuffs_base__io_writer__set(wuffs_base__io_buffer* b,\n                           uint8_t** ptr_iop_w,\n                           uint8_t** ptr_io0_w,\n                           uint8_t** ptr_io1_w,\n                           uint8_t** ptr_io2_w,\n                           wuffs_base__slice_u8 data) {\n  b->data = data;\n  b->meta.wi = 0;\n  b->meta.ri = 0;\n  b->meta.pos = 0;\n  b->meta.closed = false;\n\n  *ptr_iop_w = data.ptr;\n  *ptr_io0_w = data.ptr;\n  *ptr_io1_w = data.ptr;\n  *ptr_io2_w = data.ptr + data.len;\n\n  return b;\n}\n\n" +
+	"n -= 3) {\n    *p++ = *q++;\n    *p++ = *q++;\n    *p++ = *q++;\n  }\n  for (; n; n--) {\n    *p++ = *q++;\n  }\n  *ptr_iop_w = p;\n  return length;\n}\n\n// wuffs_base__io_writer__limited_copy_u32_from_history_fast is like the\n// wuffs_base__io_writer__limited_copy_u32_from_history function above, but has\n// stronger pre-conditions.\n//\n// The caller needs to prove that:\n//  - distance >  0\n//  - distance <= (*ptr_iop_w - io1_w)\n//  - length   <= (io2_w      - *ptr_iop_w)\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_history_fast(uint8_t** ptr_iop_w,\n                                                          uint8_t* io1_w,\n                                                          uint8_t* io2_w,\n                                                          uint32_t length,\n                                                          uint32_t distance) {\n  uint8_t* p = *ptr_iop_w;\n  uint8_t* q = p - distance;\n  uint32_t n = length;\n  for (; n >= 3; n -= 3) {\n    *p++ = *q++;\n    *p++ = *q++;\n    *p++ = " +
+	"*q++;\n  }\n  for (; n; n--) {\n    *p++ = *q++;\n  }\n  *ptr_iop_w = p;\n  return length;\n}\n\n// wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast is\n// like the wuffs_base__io_writer__limited_copy_u32_from_history_fast function\n// above, but copies 8 byte chunks at a time.\n//\n// In terms of number of bytes copied, length is rounded up to a multiple of 8.\n// As a special case, a zero length rounds up to 8 (even though 0 is already a\n// multiple of 8), since there is always at least one 8 byte chunk copied.\n//\n// In terms of advancing *ptr_iop_w, length is not rounded up.\n//\n// The caller needs to prove that:\n//  - distance     >  0\n//  - distance     <= (*ptr_iop_w - io1_w)\n//  - (length + 8) <= (io2_w      - *ptr_iop_w)\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast(\n    uint8_t** ptr_iop_w,\n    uint8_t* io1_w,\n    uint8_t* io2_w,\n    uint32_t length,\n    uint32_t distance) {\n  uint8_t* p = *ptr_iop_w;\n  uint8_t* q = p - distance;\n  uint32_t" +
+	" n = length;\n  while (1) {\n    memcpy(p, q, 8);\n    if (n <= 8) {\n      p += n;\n      break;\n    }\n    p += 8;\n    q += 8;\n    n -= 8;\n  }\n  *ptr_iop_w = p;\n  return length;\n}\n\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_reader(uint8_t** ptr_iop_w,\n                                                    uint8_t* io2_w,\n                                                    uint32_t length,\n                                                    const uint8_t** ptr_iop_r,\n                                                    const uint8_t* io2_r) {\n  uint8_t* iop_w = *ptr_iop_w;\n  size_t n = length;\n  if (n > ((size_t)(io2_w - iop_w))) {\n    n = (size_t)(io2_w - iop_w);\n  }\n  const uint8_t* iop_r = *ptr_iop_r;\n  if (n > ((size_t)(io2_r - iop_r))) {\n    n = (size_t)(io2_r - iop_r);\n  }\n  if (n > 0) {\n    memmove(iop_w, iop_r, n);\n    *ptr_iop_w += n;\n    *ptr_iop_r += n;\n  }\n  return (uint32_t)(n);\n}\n\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_slice(uint8_t** ptr_iop_" +
+	"w,\n                                                   uint8_t* io2_w,\n                                                   uint32_t length,\n                                                   wuffs_base__slice_u8 src) {\n  uint8_t* iop_w = *ptr_iop_w;\n  size_t n = src.len;\n  if (n > length) {\n    n = length;\n  }\n  if (n > ((size_t)(io2_w - iop_w))) {\n    n = (size_t)(io2_w - iop_w);\n  }\n  if (n > 0) {\n    memmove(iop_w, src.ptr, n);\n    *ptr_iop_w += n;\n  }\n  return (uint32_t)(n);\n}\n\nstatic inline wuffs_base__io_buffer*  //\nwuffs_base__io_writer__set(wuffs_base__io_buffer* b,\n                           uint8_t** ptr_iop_w,\n                           uint8_t** ptr_io0_w,\n                           uint8_t** ptr_io1_w,\n                           uint8_t** ptr_io2_w,\n                           wuffs_base__slice_u8 data) {\n  b->data = data;\n  b->meta.wi = 0;\n  b->meta.ri = 0;\n  b->meta.pos = 0;\n  b->meta.closed = false;\n\n  *ptr_iop_w = data.ptr;\n  *ptr_io0_w = data.ptr;\n  *ptr_io1_w = data.ptr;\n  *ptr_io2_w = data.pt" +
+	"r + data.len;\n\n  return b;\n}\n\n" +
 	"" +
 	"// ---------------- I/O (Utility)\n\n#define wuffs_base__utility__empty_io_reader wuffs_base__empty_io_reader\n#define wuffs_base__utility__empty_io_writer wuffs_base__empty_io_writer\n" +
 	""
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index 375014e..487a4b0 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -538,10 +538,17 @@
 	// TODO: this should have explicit pre-conditions:
 	//  - up_to <= this.length()
 	//  - distance > 0
-	//  - distance <= this.since_mark().length()
+	//  - distance <= this.history_length()
 	// For now, that's all implicitly checked (i.e. hard coded).
 	"io_writer.limited_copy_u32_from_history_fast!(up_to: u32, distance: u32) u32",
 
+	// TODO: this should have explicit pre-conditions:
+	//  - (up_to + 8) <= this.length()
+	//  - distance > 0
+	//  - distance <= this.history_length()
+	// For now, that's all implicitly checked (i.e. hard coded).
+	"io_writer.limited_copy_u32_from_history_8_byte_chunks_fast!(up_to: u32, distance: u32) u32",
+
 	// ---- token_writer
 
 	"token_writer.write_simple_token_fast!(" +
diff --git a/lang/check/bounds.go b/lang/check/bounds.go
index 24df237..2fa3044 100644
--- a/lang/check/bounds.go
+++ b/lang/check/bounds.go
@@ -1167,8 +1167,13 @@
 				return bounds{}, err
 			}
 
+		} else if method == t.IDLimitedCopyU32FromHistory8ByteChunksFast {
+			if err := q.canLimitedCopyU32FromHistoryFast(recv, n.Args(), eight); err != nil {
+				return bounds{}, err
+			}
+
 		} else if method == t.IDLimitedCopyU32FromHistoryFast {
-			if err := q.canLimitedCopyU32FromHistoryFast(recv, n.Args()); err != nil {
+			if err := q.canLimitedCopyU32FromHistoryFast(recv, n.Args(), nil); err != nil {
 				return bounds{}, err
 			}
 
@@ -1281,11 +1286,13 @@
 	return fmt.Errorf("check: could not prove %s.can_undo_byte()", recv.Str(q.tm))
 }
 
-func (q *checker) canLimitedCopyU32FromHistoryFast(recv *a.Expr, args []*a.Node) error {
+func (q *checker) canLimitedCopyU32FromHistoryFast(recv *a.Expr, args []*a.Node, adj *big.Int) error {
 	// As per cgen's io-private.h, there are three pre-conditions:
-	//  - upTo <= this.length()
+	//  - (upTo + adj) <= this.length()
 	//  - distance > 0
 	//  - distance <= this.history_length()
+	//
+	// adj may be nil, in which case (upTo + adj) is just upTo.
 
 	if len(args) != 2 {
 		return fmt.Errorf("check: internal error: inconsistent limited_copy_u32_from_history_fast arguments")
@@ -1301,15 +1308,26 @@
 				continue
 			}
 
-			// Check that the LHS is "upTo as base.u64".
+			// Check that the LHS is "(upTo + adj) as base.u64".
 			lhs := x.LHS().AsExpr()
 			if lhs.Operator() != t.IDXBinaryAs {
 				continue
 			}
 			llhs, lrhs := lhs.LHS().AsExpr(), lhs.RHS().AsTypeExpr()
-			if !llhs.Eq(upTo) || !lrhs.Eq(typeExprU64) {
+			if !lrhs.Eq(typeExprU64) {
 				continue
 			}
+			if adj == nil {
+				if !llhs.Eq(upTo) {
+					continue
+				}
+			} else {
+				if (llhs.Operator() != t.IDXBinaryPlus) || !llhs.LHS().AsExpr().Eq(upTo) {
+					continue
+				} else if cv := llhs.RHS().AsExpr().ConstValue(); (cv == nil) || (cv.Cmp(adj) != 0) {
+					continue
+				}
+			}
 
 			// Check that the RHS is "recv.length()".
 			y, method, yArgs := splitReceiverMethodArgs(x.RHS().AsExpr())
@@ -1322,7 +1340,12 @@
 
 			break check0
 		}
-		return fmt.Errorf("check: could not prove (up_to as base.u64) <= %s.length()", recv.Str(q.tm))
+		if adj == nil {
+			return fmt.Errorf("check: could not prove (%s as base.u64) <= %s.length()",
+				upTo.Str(q.tm), recv.Str(q.tm))
+		}
+		return fmt.Errorf("check: could not prove ((%s + %v) as base.u64) <= %s.length()",
+			upTo.Str(q.tm), adj, recv.Str(q.tm))
 	}
 
 	// Check "distance > 0".
@@ -1340,7 +1363,7 @@
 			}
 			break check1
 		}
-		return fmt.Errorf("check: could not prove distance > 0")
+		return fmt.Errorf("check: could not prove %s > 0", distance.Str(q.tm))
 	}
 
 	// Check "distance <= this.history_length()".
@@ -1372,7 +1395,8 @@
 
 			break check2
 		}
-		return fmt.Errorf("check: could not prove distance <= %s.history_length()", recv.Str(q.tm))
+		return fmt.Errorf("check: could not prove %s <= %s.history_length()",
+			distance.Str(q.tm), recv.Str(q.tm))
 	}
 
 	return nil
diff --git a/lang/token/list.go b/lang/token/list.go
index 7cfbde7..797f983 100644
--- a/lang/token/list.go
+++ b/lang/token/list.go
@@ -504,12 +504,13 @@
 	IDSkipU32       = ID(0x16B)
 	IDSkipU32Fast   = ID(0x16C)
 
-	IDCopyFromSlice                 = ID(0x170)
-	IDLimitedCopyU32FromHistory     = ID(0x171)
-	IDLimitedCopyU32FromHistoryFast = ID(0x172)
-	IDLimitedCopyU32FromReader      = ID(0x173)
-	IDLimitedCopyU32FromSlice       = ID(0x174)
-	IDLimitedCopyU32ToSlice         = ID(0x175)
+	IDCopyFromSlice                            = ID(0x170)
+	IDLimitedCopyU32FromHistory                = ID(0x171)
+	IDLimitedCopyU32FromHistory8ByteChunksFast = ID(0x172)
+	IDLimitedCopyU32FromHistoryFast            = ID(0x173)
+	IDLimitedCopyU32FromReader                 = ID(0x174)
+	IDLimitedCopyU32FromSlice                  = ID(0x175)
+	IDLimitedCopyU32ToSlice                    = ID(0x176)
 
 	// -------- 0x180 block.
 
@@ -927,12 +928,13 @@
 	IDSkipU32:       "skip_u32",
 	IDSkipU32Fast:   "skip_u32_fast",
 
-	IDCopyFromSlice:                 "copy_from_slice",
-	IDLimitedCopyU32FromHistory:     "limited_copy_u32_from_history",
-	IDLimitedCopyU32FromHistoryFast: "limited_copy_u32_from_history_fast",
-	IDLimitedCopyU32FromReader:      "limited_copy_u32_from_reader",
-	IDLimitedCopyU32FromSlice:       "limited_copy_u32_from_slice",
-	IDLimitedCopyU32ToSlice:         "limited_copy_u32_to_slice",
+	IDCopyFromSlice:                            "copy_from_slice",
+	IDLimitedCopyU32FromHistory:                "limited_copy_u32_from_history",
+	IDLimitedCopyU32FromHistory8ByteChunksFast: "limited_copy_u32_from_history_8_byte_chunks_fast",
+	IDLimitedCopyU32FromHistoryFast:            "limited_copy_u32_from_history_fast",
+	IDLimitedCopyU32FromReader:                 "limited_copy_u32_from_reader",
+	IDLimitedCopyU32FromSlice:                  "limited_copy_u32_from_slice",
+	IDLimitedCopyU32ToSlice:                    "limited_copy_u32_to_slice",
 
 	// -------- 0x180 block.
 
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index b0f7e8a..d7e2bda 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -10055,7 +10055,9 @@
 
 // wuffs_base__io_writer__limited_copy_u32_from_history_fast is like the
 // wuffs_base__io_writer__limited_copy_u32_from_history function above, but has
-// stronger pre-conditions. The caller needs to prove that:
+// stronger pre-conditions.
+//
+// The caller needs to prove that:
 //  - distance >  0
 //  - distance <= (*ptr_iop_w - io1_w)
 //  - length   <= (io2_w      - *ptr_iop_w)
@@ -10080,6 +10082,44 @@
   return length;
 }
 
+// wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast is
+// like the wuffs_base__io_writer__limited_copy_u32_from_history_fast function
+// above, but copies 8 byte chunks at a time.
+//
+// In terms of number of bytes copied, length is rounded up to a multiple of 8.
+// As a special case, a zero length rounds up to 8 (even though 0 is already a
+// multiple of 8), since there is always at least one 8 byte chunk copied.
+//
+// In terms of advancing *ptr_iop_w, length is not rounded up.
+//
+// The caller needs to prove that:
+//  - distance     >  0
+//  - distance     <= (*ptr_iop_w - io1_w)
+//  - (length + 8) <= (io2_w      - *ptr_iop_w)
+static inline uint32_t  //
+wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast(
+    uint8_t** ptr_iop_w,
+    uint8_t* io1_w,
+    uint8_t* io2_w,
+    uint32_t length,
+    uint32_t distance) {
+  uint8_t* p = *ptr_iop_w;
+  uint8_t* q = p - distance;
+  uint32_t n = length;
+  while (1) {
+    memcpy(p, q, 8);
+    if (n <= 8) {
+      p += n;
+      break;
+    }
+    p += 8;
+    q += 8;
+    n -= 8;
+  }
+  *ptr_iop_w = p;
+  return length;
+}
+
 static inline uint32_t  //
 wuffs_base__io_writer__limited_copy_u32_from_reader(uint8_t** ptr_iop_w,
                                                     uint8_t* io2_w,
@@ -24151,7 +24191,7 @@
   v_lmask = ((((uint64_t)(1)) << self->private_impl.f_n_huffs_bits[0]) - 1);
   v_dmask = ((((uint64_t)(1)) << self->private_impl.f_n_huffs_bits[1]) - 1);
   label__loop__continue:;
-  while ((((uint64_t)(io2_a_dst - iop_a_dst)) >= 258) && (((uint64_t)(io2_a_src - iop_a_src)) >= 8)) {
+  while ((((uint64_t)(io2_a_dst - iop_a_dst)) >= 266) && (((uint64_t)(io2_a_src - iop_a_src)) >= 8)) {
     v_bits |= ((uint64_t)(wuffs_base__peek_u64le__no_bounds_check(iop_a_src) << (v_n_bits & 63)));
     iop_a_src += ((63 - (v_n_bits & 63)) >> 3);
     v_n_bits |= 56;
@@ -24250,14 +24290,18 @@
         if (v_length == 0) {
           goto label__loop__continue;
         }
-        if ((((uint64_t)((v_dist_minus_1 + 1))) > ((uint64_t)(iop_a_dst - io0_a_dst))) || (((uint64_t)(v_length)) > ((uint64_t)(io2_a_dst - iop_a_dst)))) {
+        if ((((uint64_t)((v_dist_minus_1 + 1))) > ((uint64_t)(iop_a_dst - io0_a_dst))) || (((uint64_t)(v_length)) > ((uint64_t)(io2_a_dst - iop_a_dst))) || (((uint64_t)((v_length + 8))) > ((uint64_t)(io2_a_dst - iop_a_dst)))) {
           status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_distance);
           goto exit;
         }
-      } else {
       }
-      wuffs_base__io_writer__limited_copy_u32_from_history_fast(
-          &iop_a_dst, io0_a_dst, io2_a_dst, v_length, (v_dist_minus_1 + 1));
+      if ((v_dist_minus_1 + 1) >= 8) {
+        wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast(
+            &iop_a_dst, io0_a_dst, io2_a_dst, v_length, (v_dist_minus_1 + 1));
+      } else {
+        wuffs_base__io_writer__limited_copy_u32_from_history_fast(
+            &iop_a_dst, io0_a_dst, io2_a_dst, v_length, (v_dist_minus_1 + 1));
+      }
       goto label__0__break;
     }
     label__0__break:;
diff --git a/std/deflate/decode_huffman_fast64.wuffs b/std/deflate/decode_huffman_fast64.wuffs
index d714b70..9e0e8e2 100644
--- a/std/deflate/decode_huffman_fast64.wuffs
+++ b/std/deflate/decode_huffman_fast64.wuffs
@@ -44,7 +44,7 @@
 	dmask = ((1 as base.u64) << this.n_huffs_bits[1]) - 1
 
 	// Check up front, on each iteration, that we have enough buffer space to
-	// both read (8 bytes) and write (258 bytes) as much as we need to. Doing
+	// both read (8 bytes) and write (266 bytes) as much as we need to. Doing
 	// this check once (per iteration), up front, removes the need to check
 	// multiple times inside the loop body, so it's faster overall.
 	//
@@ -52,12 +52,18 @@
 	// 258 is the maximum length in a length-distance pair, as specified in the
 	// RFC section 3.2.5. Compressed blocks (length and distance codes).
 	//
+	// We adjust 258 up to (258 + 8) = 266 since it can be faster to overshoot
+	// a little and make multiple-of-8-byte copies even when the length in the
+	// length-distance pair isn't an exact multiple-of-8. Strictly speaking,
+	// 264 (an exact multiple-of-8) is the tight bound but (258 + 8) is easier
+	// for the Wuffs proof system, as length's type is refined to [..= 258].
+	//
 	// For reading, strictly speaking, we only need 6 bytes (48 bits) of
 	// available input, because the H-L Literal/Length code is up to 15 bits
 	// plus up to 5 extra bits, the H-D Distance code is up to 15 bits plus up
 	// to 13 extra bits and 15 + 5 + 15 + 13 == 48. However, it's faster to
 	// read 64 bits than 48 bits or (48 - n_bits) bits.
-	while.loop(args.dst.length() >= 258) and (args.src.length() >= 8) {
+	while.loop(args.dst.length() >= 266) and (args.src.length() >= 8) {
 		// Ensure that we have at least 56 bits of input.
 		//
 		// This is "Variant 4" of
@@ -178,8 +184,12 @@
 		// its presence minimizes the diff between decode_huffman_fastxx and
 		// decode_huffman_slow.
 		while true,
-			pre args.dst.length() >= 258,
+			pre args.dst.length() >= 266,
 		{
+			// We can therefore prove:
+			assert (length as base.u64) <= args.dst.length() via "a <= b: a <= c; c <= b"(c: 266)
+			assert ((length + 8) as base.u64) <= args.dst.length() via "a <= b: a <= c; c <= b"(c: 266)
+
 			// Copy from this.history.
 			if ((dist_minus_1 + 1) as base.u64) > args.dst.history_length() {
 				// Set (hlen, hdist) to be the length-distance pair to copy
@@ -217,23 +227,33 @@
 				}
 
 				if (((dist_minus_1 + 1) as base.u64) > args.dst.history_length()) or
-					((length as base.u64) > args.dst.length()) {
+					((length as base.u64) > args.dst.length()) or
+					(((length + 8) as base.u64) > args.dst.length()) {
 					return "#internal error: inconsistent distance"
 				}
-			} else {
-				assert (length as base.u64) <= 258
-				assert args.dst.length() >= 258
-				// We can therefore prove:
-				assert (length as base.u64) <= args.dst.length() via "a <= b: a <= c; c <= b"(c: 258)
 			}
 			// Once again, redundant but explicit assertions.
 			assert (dist_minus_1 + 1) > 0
 			assert ((dist_minus_1 + 1) as base.u64) <= args.dst.history_length()
 			assert (length as base.u64) <= args.dst.length()
+			assert ((length + 8) as base.u64) <= args.dst.length()
 
 			// Copy from args.dst.
-			args.dst.limited_copy_u32_from_history_fast!(
-				up_to: length, distance: (dist_minus_1 + 1))
+			//
+			// For short distances, less than 8 bytes, copying atomic 8-byte
+			// chunks can result in incorrect output, so we fall back to a
+			// slower 1-byte-at-a-time copy. Deflate's copy-from-history can
+			// pick up freshly written bytes. A length = 5, distance = 2 copy
+			// starting with "abc" should give "abcbcbcb" but the 8-byte chunk
+			// technique would give "abcbc???", the exact output depending on
+			// what was previously in the writer buffer.
+			if (dist_minus_1 + 1) >= 8 {
+				args.dst.limited_copy_u32_from_history_8_byte_chunks_fast!(
+					up_to: length, distance: (dist_minus_1 + 1))
+			} else {
+				args.dst.limited_copy_u32_from_history_fast!(
+					up_to: length, distance: (dist_minus_1 + 1))
+			}
 			break
 		} endwhile
 	} endwhile.loop