Optimize std/deflate decoding for RLE sections

name                                                      old speed      new speed      delta

wuffs_deflate_decode_1k_full_init/clang9                   104MB/s ± 1%   104MB/s ± 1%     ~     (p=0.161 n=9+9)
wuffs_deflate_decode_1k_part_init/clang9                   121MB/s ± 1%   119MB/s ± 1%   -0.92%  (p=0.000 n=9+9)
wuffs_deflate_decode_10k_full_init/clang9                  223MB/s ± 1%   223MB/s ± 1%     ~     (p=0.796 n=9+9)
wuffs_deflate_decode_10k_part_init/clang9                  230MB/s ± 1%   228MB/s ± 2%     ~     (p=0.065 n=10+9)
wuffs_deflate_decode_100k_just_one_read/clang9             279MB/s ± 1%   292MB/s ± 0%   +4.78%  (p=0.000 n=9+8)
wuffs_deflate_decode_100k_many_big_reads/clang9            178MB/s ± 5%   189MB/s ± 2%   +6.41%  (p=0.000 n=10+10)

wuffs_deflate_decode_1k_full_init/gcc10                    103MB/s ± 1%   102MB/s ± 1%   -0.57%  (p=0.022 n=10+9)
wuffs_deflate_decode_1k_part_init/gcc10                    122MB/s ± 1%   121MB/s ± 3%   -1.46%  (p=0.000 n=10+9)
wuffs_deflate_decode_10k_full_init/gcc10                   235MB/s ± 1%   231MB/s ± 1%   -1.34%  (p=0.000 n=10+8)
wuffs_deflate_decode_10k_part_init/gcc10                   242MB/s ± 1%   239MB/s ± 0%   -1.40%  (p=0.001 n=9+8)
wuffs_deflate_decode_100k_just_one_read/gcc10              300MB/s ± 4%   297MB/s ± 2%     ~     (p=0.156 n=10+9)
wuffs_deflate_decode_100k_many_big_reads/gcc10             193MB/s ± 2%   197MB/s ± 1%   +2.29%  (p=0.000 n=10+9)

wuffs_gzip_decode_10k/clang9                               220MB/s ± 1%   219MB/s ± 1%     ~     (p=0.605 n=9+9)
wuffs_gzip_decode_100k/clang9                              269MB/s ± 2%   283MB/s ± 2%   +4.98%  (p=0.000 n=9+10)

wuffs_gzip_decode_10k/gcc10                                227MB/s ± 1%   224MB/s ± 1%   -1.42%  (p=0.000 n=10+9)
wuffs_gzip_decode_100k/gcc10                               283MB/s ± 1%   285MB/s ± 4%     ~     (p=0.182 n=9+10)

wuffs_zlib_decode_10k/clang9                               226MB/s ± 1%   220MB/s ± 4%   -2.66%  (p=0.000 n=9+10)
wuffs_zlib_decode_100k/clang9                              272MB/s ± 2%   285MB/s ± 1%   +4.77%  (p=0.000 n=10+8)

wuffs_zlib_decode_10k/gcc10                                227MB/s ± 1%   212MB/s ± 1%   -6.80%  (p=0.000 n=10+9)
wuffs_zlib_decode_100k/gcc10                               282MB/s ± 1%   273MB/s ± 0%   -3.32%  (p=0.000 n=9+8)

wuffs_png_decode_image_19k_8bpp/clang9                     148MB/s ± 1%   138MB/s ± 2%   -6.79%  (p=0.000 n=9+10)
wuffs_png_decode_image_40k_24bpp/clang9                    170MB/s ± 1%   171MB/s ± 2%     ~     (p=0.133 n=9+10)
wuffs_png_decode_image_77k_8bpp/clang9                     525MB/s ± 1%   495MB/s ± 4%   -5.81%  (p=0.000 n=8+9)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/clang9   400MB/s ± 2%   457MB/s ± 4%  +14.16%  (p=0.000 n=9+9)
wuffs_png_decode_image_552k_32bpp_verify_checksum/clang9   388MB/s ± 1%   435MB/s ± 5%  +12.17%  (p=0.000 n=9+10)
wuffs_png_decode_image_4002k_24bpp/clang9                  175MB/s ± 1%   172MB/s ± 2%   -1.78%  (p=0.005 n=10+10)

wuffs_png_decode_image_19k_8bpp/gcc10                      154MB/s ± 3%   157MB/s ± 1%   +1.72%  (p=0.001 n=9+9)
wuffs_png_decode_image_40k_24bpp/gcc10                     182MB/s ± 1%   187MB/s ± 1%   +2.56%  (p=0.000 n=9+10)
wuffs_png_decode_image_77k_8bpp/gcc10                      552MB/s ± 3%   554MB/s ± 1%     ~     (p=0.863 n=9+9)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/gcc10    426MB/s ± 2%   502MB/s ± 1%  +17.85%  (p=0.000 n=10+10)
wuffs_png_decode_image_552k_32bpp_verify_checksum/gcc10    410MB/s ± 2%   481MB/s ± 1%  +17.17%  (p=0.000 n=9+8)
wuffs_png_decode_image_4002k_24bpp/gcc10                   186MB/s ± 1%   189MB/s ± 2%   +1.33%  (p=0.001 n=9+10)
diff --git a/internal/cgen/base/io-private.h b/internal/cgen/base/io-private.h
index bc180d0..c275fee 100644
--- a/internal/cgen/base/io-private.h
+++ b/internal/cgen/base/io-private.h
@@ -228,6 +228,46 @@
   return length;
 }
 
+// wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_distance_1_fast
+// copies the previous byte (the one immediately before *ptr_iop_w), copying 8
+// byte chunks at a time. Each chunk contains 8 repetitions of the same byte.
+//
+// In terms of number of bytes copied, length is rounded up to a multiple of 8.
+// As a special case, a zero length rounds up to 8 (even though 0 is already a
+// multiple of 8), since there is always at least one 8 byte chunk copied.
+//
+// In terms of advancing *ptr_iop_w, length is not rounded up.
+//
+// The caller needs to prove that:
+//  - (length + 8) <= (io2_w      - *ptr_iop_w)
+//  - distance     == 1
+//  - distance     <= (*ptr_iop_w - io1_w)
+static inline uint32_t  //
+wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_distance_1_fast(
+    uint8_t** ptr_iop_w,
+    uint8_t* io1_w,
+    uint8_t* io2_w,
+    uint32_t length,
+    uint32_t distance) {
+  uint8_t* p = *ptr_iop_w;
+  uint64_t x = p[-1];
+  x |= x << 8;
+  x |= x << 16;
+  x |= x << 32;
+  uint32_t n = length;
+  while (1) {
+    wuffs_base__poke_u64le__no_bounds_check(p, x);
+    if (n <= 8) {
+      p += n;
+      break;
+    }
+    p += 8;
+    n -= 8;
+  }
+  *ptr_iop_w = p;
+  return length;
+}
+
 // wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast is
 // like the wuffs_base__io_writer__limited_copy_u32_from_history_fast function
 // above, but copies 8 byte chunks at a time.
diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go
index c1c4300..b0e4d1b 100644
--- a/internal/cgen/builtin.go
+++ b/internal/cgen/builtin.go
@@ -308,6 +308,7 @@
 
 	switch method {
 	case t.IDLimitedCopyU32FromHistory,
+		t.IDLimitedCopyU32FromHistory8ByteChunksDistance1Fast,
 		t.IDLimitedCopyU32FromHistory8ByteChunksFast,
 		t.IDLimitedCopyU32FromHistoryFast:
 		b.printf("wuffs_base__io_writer__%s(\n&%s%s, %s%s, %s%s",
diff --git a/internal/cgen/data/data.go b/internal/cgen/data/data.go
index 8c03c4c..ada5cde 100644
--- a/internal/cgen/data/data.go
+++ b/internal/cgen/data/data.go
@@ -223,10 +223,12 @@
 	"" +
 	"// --------\n\nstatic inline uint64_t  //\nwuffs_base__io_writer__copy_from_slice(uint8_t** ptr_iop_w,\n                                       uint8_t* io2_w,\n                                       wuffs_base__slice_u8 src) {\n  uint8_t* iop_w = *ptr_iop_w;\n  size_t n = src.len;\n  if (n > ((size_t)(io2_w - iop_w))) {\n    n = (size_t)(io2_w - iop_w);\n  }\n  if (n > 0) {\n    memmove(iop_w, src.ptr, n);\n    *ptr_iop_w += n;\n  }\n  return (uint64_t)(n);\n}\n\nstatic inline void  //\nwuffs_base__io_writer__limit(uint8_t** ptr_io2_w,\n                             uint8_t* iop_w,\n                             uint64_t limit) {\n  if (((uint64_t)(*ptr_io2_w - iop_w)) > limit) {\n    *ptr_io2_w = iop_w + limit;\n  }\n}\n\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_history(uint8_t** ptr_iop_w,\n                                                     uint8_t* io1_w,\n                                                     uint8_t* io2_w,\n                                                     uint32_t length,\n           " +
 	"                                          uint32_t distance) {\n  if (!distance) {\n    return 0;\n  }\n  uint8_t* p = *ptr_iop_w;\n  if ((size_t)(p - io1_w) < (size_t)(distance)) {\n    return 0;\n  }\n  uint8_t* q = p - distance;\n  size_t n = (size_t)(io2_w - p);\n  if ((size_t)(length) > n) {\n    length = (uint32_t)(n);\n  } else {\n    n = (size_t)(length);\n  }\n  // TODO: unrolling by 3 seems best for the std/deflate benchmarks, but that\n  // is mostly because 3 is the minimum length for the deflate format. This\n  // function implementation shouldn't overfit to that one format. Perhaps the\n  // limited_copy_u32_from_history Wuffs method should also take an unroll hint\n  // argument, and the cgen can look if that argument is the constant\n  // expression '3'.\n  //\n  // See also wuffs_base__io_writer__limited_copy_u32_from_history_fast below.\n  for (; n >= 3; n -= 3) {\n    *p++ = *q++;\n    *p++ = *q++;\n    *p++ = *q++;\n  }\n  for (; n; n--) {\n    *p++ = *q++;\n  }\n  *ptr_iop_w = p;\n  return length;\n}\n\n// wuffs_base__io_w" +
-	"riter__limited_copy_u32_from_history_fast is like the\n// wuffs_base__io_writer__limited_copy_u32_from_history function above, but has\n// stronger pre-conditions.\n//\n// The caller needs to prove that:\n//  - length   <= (io2_w      - *ptr_iop_w)\n//  - distance >= 1\n//  - distance <= (*ptr_iop_w - io1_w)\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_history_fast(uint8_t** ptr_iop_w,\n                                                          uint8_t* io1_w,\n                                                          uint8_t* io2_w,\n                                                          uint32_t length,\n                                                          uint32_t distance) {\n  uint8_t* p = *ptr_iop_w;\n  uint8_t* q = p - distance;\n  uint32_t n = length;\n  for (; n >= 3; n -= 3) {\n    *p++ = *q++;\n    *p++ = *q++;\n    *p++ = *q++;\n  }\n  for (; n; n--) {\n    *p++ = *q++;\n  }\n  *ptr_iop_w = p;\n  return length;\n}\n\n// wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast" +
-	" is\n// like the wuffs_base__io_writer__limited_copy_u32_from_history_fast function\n// above, but copies 8 byte chunks at a time.\n//\n// In terms of number of bytes copied, length is rounded up to a multiple of 8.\n// As a special case, a zero length rounds up to 8 (even though 0 is already a\n// multiple of 8), since there is always at least one 8 byte chunk copied.\n//\n// In terms of advancing *ptr_iop_w, length is not rounded up.\n//\n// The caller needs to prove that:\n//  - (length + 8) <= (io2_w      - *ptr_iop_w)\n//  - distance     >= 8\n//  - distance     <= (*ptr_iop_w - io1_w)\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast(\n    uint8_t** ptr_iop_w,\n    uint8_t* io1_w,\n    uint8_t* io2_w,\n    uint32_t length,\n    uint32_t distance) {\n  uint8_t* p = *ptr_iop_w;\n  uint8_t* q = p - distance;\n  uint32_t n = length;\n  while (1) {\n    memcpy(p, q, 8);\n    if (n <= 8) {\n      p += n;\n      break;\n    }\n    p += 8;\n    q += 8;\n    n -= 8;\n  }\n  *ptr_iop_w = p;\n  ret" +
-	"urn length;\n}\n\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_reader(uint8_t** ptr_iop_w,\n                                                    uint8_t* io2_w,\n                                                    uint32_t length,\n                                                    const uint8_t** ptr_iop_r,\n                                                    const uint8_t* io2_r) {\n  uint8_t* iop_w = *ptr_iop_w;\n  size_t n = length;\n  if (n > ((size_t)(io2_w - iop_w))) {\n    n = (size_t)(io2_w - iop_w);\n  }\n  const uint8_t* iop_r = *ptr_iop_r;\n  if (n > ((size_t)(io2_r - iop_r))) {\n    n = (size_t)(io2_r - iop_r);\n  }\n  if (n > 0) {\n    memmove(iop_w, iop_r, n);\n    *ptr_iop_w += n;\n    *ptr_iop_r += n;\n  }\n  return (uint32_t)(n);\n}\n\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_slice(uint8_t** ptr_iop_w,\n                                                   uint8_t* io2_w,\n                                                   uint32_t length,\n                        " +
-	"                           wuffs_base__slice_u8 src) {\n  uint8_t* iop_w = *ptr_iop_w;\n  size_t n = src.len;\n  if (n > length) {\n    n = length;\n  }\n  if (n > ((size_t)(io2_w - iop_w))) {\n    n = (size_t)(io2_w - iop_w);\n  }\n  if (n > 0) {\n    memmove(iop_w, src.ptr, n);\n    *ptr_iop_w += n;\n  }\n  return (uint32_t)(n);\n}\n\nstatic inline wuffs_base__io_buffer*  //\nwuffs_base__io_writer__set(wuffs_base__io_buffer* b,\n                           uint8_t** ptr_iop_w,\n                           uint8_t** ptr_io0_w,\n                           uint8_t** ptr_io1_w,\n                           uint8_t** ptr_io2_w,\n                           wuffs_base__slice_u8 data) {\n  b->data = data;\n  b->meta.wi = 0;\n  b->meta.ri = 0;\n  b->meta.pos = 0;\n  b->meta.closed = false;\n\n  *ptr_iop_w = data.ptr;\n  *ptr_io0_w = data.ptr;\n  *ptr_io1_w = data.ptr;\n  *ptr_io2_w = data.ptr + data.len;\n\n  return b;\n}\n\n" +
+	"riter__limited_copy_u32_from_history_fast is like the\n// wuffs_base__io_writer__limited_copy_u32_from_history function above, but has\n// stronger pre-conditions.\n//\n// The caller needs to prove that:\n//  - length   <= (io2_w      - *ptr_iop_w)\n//  - distance >= 1\n//  - distance <= (*ptr_iop_w - io1_w)\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_history_fast(uint8_t** ptr_iop_w,\n                                                          uint8_t* io1_w,\n                                                          uint8_t* io2_w,\n                                                          uint32_t length,\n                                                          uint32_t distance) {\n  uint8_t* p = *ptr_iop_w;\n  uint8_t* q = p - distance;\n  uint32_t n = length;\n  for (; n >= 3; n -= 3) {\n    *p++ = *q++;\n    *p++ = *q++;\n    *p++ = *q++;\n  }\n  for (; n; n--) {\n    *p++ = *q++;\n  }\n  *ptr_iop_w = p;\n  return length;\n}\n\n// wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_dist" +
+	"ance_1_fast\n// copies the previous byte (the one immediately before *ptr_iop_w), copying 8\n// byte chunks at a time. Each chunk contains 8 repetitions of the same byte.\n//\n// In terms of number of bytes copied, length is rounded up to a multiple of 8.\n// As a special case, a zero length rounds up to 8 (even though 0 is already a\n// multiple of 8), since there is always at least one 8 byte chunk copied.\n//\n// In terms of advancing *ptr_iop_w, length is not rounded up.\n//\n// The caller needs to prove that:\n//  - (length + 8) <= (io2_w      - *ptr_iop_w)\n//  - distance     == 1\n//  - distance     <= (*ptr_iop_w - io1_w)\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_distance_1_fast(\n    uint8_t** ptr_iop_w,\n    uint8_t* io1_w,\n    uint8_t* io2_w,\n    uint32_t length,\n    uint32_t distance) {\n  uint8_t* p = *ptr_iop_w;\n  uint64_t x = p[-1];\n  x |= x << 8;\n  x |= x << 16;\n  x |= x << 32;\n  uint32_t n = length;\n  while (1) {\n    wuffs_base__poke_u64le__no_bounds_check(" +
+	"p, x);\n    if (n <= 8) {\n      p += n;\n      break;\n    }\n    p += 8;\n    n -= 8;\n  }\n  *ptr_iop_w = p;\n  return length;\n}\n\n// wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast is\n// like the wuffs_base__io_writer__limited_copy_u32_from_history_fast function\n// above, but copies 8 byte chunks at a time.\n//\n// In terms of number of bytes copied, length is rounded up to a multiple of 8.\n// As a special case, a zero length rounds up to 8 (even though 0 is already a\n// multiple of 8), since there is always at least one 8 byte chunk copied.\n//\n// In terms of advancing *ptr_iop_w, length is not rounded up.\n//\n// The caller needs to prove that:\n//  - (length + 8) <= (io2_w      - *ptr_iop_w)\n//  - distance     >= 8\n//  - distance     <= (*ptr_iop_w - io1_w)\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast(\n    uint8_t** ptr_iop_w,\n    uint8_t* io1_w,\n    uint8_t* io2_w,\n    uint32_t length,\n    uint32_t distance) {\n  uint8_t* p = *ptr_iop_w;\n  u" +
+	"int8_t* q = p - distance;\n  uint32_t n = length;\n  while (1) {\n    memcpy(p, q, 8);\n    if (n <= 8) {\n      p += n;\n      break;\n    }\n    p += 8;\n    q += 8;\n    n -= 8;\n  }\n  *ptr_iop_w = p;\n  return length;\n}\n\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_reader(uint8_t** ptr_iop_w,\n                                                    uint8_t* io2_w,\n                                                    uint32_t length,\n                                                    const uint8_t** ptr_iop_r,\n                                                    const uint8_t* io2_r) {\n  uint8_t* iop_w = *ptr_iop_w;\n  size_t n = length;\n  if (n > ((size_t)(io2_w - iop_w))) {\n    n = (size_t)(io2_w - iop_w);\n  }\n  const uint8_t* iop_r = *ptr_iop_r;\n  if (n > ((size_t)(io2_r - iop_r))) {\n    n = (size_t)(io2_r - iop_r);\n  }\n  if (n > 0) {\n    memmove(iop_w, iop_r, n);\n    *ptr_iop_w += n;\n    *ptr_iop_r += n;\n  }\n  return (uint32_t)(n);\n}\n\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_co" +
+	"py_u32_from_slice(uint8_t** ptr_iop_w,\n                                                   uint8_t* io2_w,\n                                                   uint32_t length,\n                                                   wuffs_base__slice_u8 src) {\n  uint8_t* iop_w = *ptr_iop_w;\n  size_t n = src.len;\n  if (n > length) {\n    n = length;\n  }\n  if (n > ((size_t)(io2_w - iop_w))) {\n    n = (size_t)(io2_w - iop_w);\n  }\n  if (n > 0) {\n    memmove(iop_w, src.ptr, n);\n    *ptr_iop_w += n;\n  }\n  return (uint32_t)(n);\n}\n\nstatic inline wuffs_base__io_buffer*  //\nwuffs_base__io_writer__set(wuffs_base__io_buffer* b,\n                           uint8_t** ptr_iop_w,\n                           uint8_t** ptr_io0_w,\n                           uint8_t** ptr_io1_w,\n                           uint8_t** ptr_io2_w,\n                           wuffs_base__slice_u8 data) {\n  b->data = data;\n  b->meta.wi = 0;\n  b->meta.ri = 0;\n  b->meta.pos = 0;\n  b->meta.closed = false;\n\n  *ptr_iop_w = data.ptr;\n  *ptr_io0_w = data.ptr;\n  *ptr_io1_" +
+	"w = data.ptr;\n  *ptr_io2_w = data.ptr + data.len;\n\n  return b;\n}\n\n" +
 	"" +
 	"// ---------------- I/O (Utility)\n\n#define wuffs_base__utility__empty_io_reader wuffs_base__empty_io_reader\n#define wuffs_base__utility__empty_io_writer wuffs_base__empty_io_writer\n" +
 	""
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index 4bc325b..dc4efdb 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -549,6 +549,13 @@
 	// For now, that's all implicitly checked (i.e. hard coded).
 	"io_writer.limited_copy_u32_from_history_8_byte_chunks_fast!(up_to: u32, distance: u32) u32",
 
+	// TODO: this should have explicit pre-conditions:
+	//  - (up_to + 8) <= this.length()
+	//  - distance == 1
+	//  - distance <= this.history_length()
+	// For now, that's all implicitly checked (i.e. hard coded).
+	"io_writer.limited_copy_u32_from_history_8_byte_chunks_distance_1_fast!(up_to: u32, distance: u32) u32",
+
 	// ---- token_writer
 
 	"token_writer.write_simple_token_fast!(" +
diff --git a/lang/check/bounds.go b/lang/check/bounds.go
index d718e4d..390f33b 100644
--- a/lang/check/bounds.go
+++ b/lang/check/bounds.go
@@ -1160,13 +1160,18 @@
 				return bounds{}, err
 			}
 
+		} else if method == t.IDLimitedCopyU32FromHistory8ByteChunksDistance1Fast {
+			if err := q.canLimitedCopyU32FromHistoryFast(recv, n.Args(), eight, nil, one); err != nil {
+				return bounds{}, err
+			}
+
 		} else if method == t.IDLimitedCopyU32FromHistory8ByteChunksFast {
-			if err := q.canLimitedCopyU32FromHistoryFast(recv, n.Args(), eight, eight); err != nil {
+			if err := q.canLimitedCopyU32FromHistoryFast(recv, n.Args(), eight, eight, nil); err != nil {
 				return bounds{}, err
 			}
 
 		} else if method == t.IDLimitedCopyU32FromHistoryFast {
-			if err := q.canLimitedCopyU32FromHistoryFast(recv, n.Args(), nil, one); err != nil {
+			if err := q.canLimitedCopyU32FromHistoryFast(recv, n.Args(), nil, one, nil); err != nil {
 				return bounds{}, err
 			}
 
@@ -1272,10 +1277,11 @@
 	return fmt.Errorf("check: could not prove %s.can_undo_byte()", recv.Str(q.tm))
 }
 
-func (q *checker) canLimitedCopyU32FromHistoryFast(recv *a.Expr, args []*a.Node, adj *big.Int, minDistance *big.Int) error {
+func (q *checker) canLimitedCopyU32FromHistoryFast(recv *a.Expr, args []*a.Node, adj *big.Int, minDistance *big.Int, exactDistance *big.Int) error {
 	// As per cgen's io-private.h, there are three pre-conditions:
 	//  - (upTo + adj) <= this.length()
-	//  - distance >= minDistance
+	//  - either (distance >= minDistance) or (distance == exactDistance),
+	//    depending on which of minDistance and exactDistance is non-nil.
 	//  - distance <= this.history_length()
 	//
 	// adj may be nil, in which case (upTo + adj) is just upTo.
@@ -1335,8 +1341,8 @@
 	}
 
 	// Check "distance >= minDistance".
-check1:
-	for {
+check1a:
+	for minDistance != nil {
 		for _, x := range q.facts {
 			if x.Operator() != t.IDXBinaryGreaterEq {
 				continue
@@ -1347,11 +1353,29 @@
 			if rcv := x.RHS().AsExpr().ConstValue(); (rcv == nil) || (rcv.Cmp(minDistance) < 0) {
 				continue
 			}
-			break check1
+			break check1a
 		}
 		return fmt.Errorf("check: could not prove %s >= %v", distance.Str(q.tm), minDistance)
 	}
 
+	// Check "distance == exactDistance".
+check1b:
+	for exactDistance != nil {
+		for _, x := range q.facts {
+			if x.Operator() != t.IDXBinaryEqEq {
+				continue
+			}
+			if lhs := x.LHS().AsExpr(); !lhs.Eq(distance) {
+				continue
+			}
+			if rcv := x.RHS().AsExpr().ConstValue(); (rcv == nil) || (rcv.Cmp(exactDistance) != 0) {
+				continue
+			}
+			break check1b
+		}
+		return fmt.Errorf("check: could not prove %s == %v", distance.Str(q.tm), exactDistance)
+	}
+
 	// Check "distance <= this.history_length()".
 check2:
 	for {
diff --git a/lang/token/list.go b/lang/token/list.go
index c046b42..b8f33e3 100644
--- a/lang/token/list.go
+++ b/lang/token/list.go
@@ -504,13 +504,14 @@
 	IDSkipU32       = ID(0x16B)
 	IDSkipU32Fast   = ID(0x16C)
 
-	IDCopyFromSlice                            = ID(0x170)
-	IDLimitedCopyU32FromHistory                = ID(0x171)
-	IDLimitedCopyU32FromHistory8ByteChunksFast = ID(0x172)
-	IDLimitedCopyU32FromHistoryFast            = ID(0x173)
-	IDLimitedCopyU32FromReader                 = ID(0x174)
-	IDLimitedCopyU32FromSlice                  = ID(0x175)
-	IDLimitedCopyU32ToSlice                    = ID(0x176)
+	IDCopyFromSlice                                     = ID(0x170)
+	IDLimitedCopyU32FromHistory                         = ID(0x171)
+	IDLimitedCopyU32FromHistory8ByteChunksDistance1Fast = ID(0x172)
+	IDLimitedCopyU32FromHistory8ByteChunksFast          = ID(0x173)
+	IDLimitedCopyU32FromHistoryFast                     = ID(0x174)
+	IDLimitedCopyU32FromReader                          = ID(0x175)
+	IDLimitedCopyU32FromSlice                           = ID(0x176)
+	IDLimitedCopyU32ToSlice                             = ID(0x177)
 
 	// -------- 0x180 block.
 
@@ -929,13 +930,14 @@
 	IDSkipU32:       "skip_u32",
 	IDSkipU32Fast:   "skip_u32_fast",
 
-	IDCopyFromSlice:                            "copy_from_slice",
-	IDLimitedCopyU32FromHistory:                "limited_copy_u32_from_history",
-	IDLimitedCopyU32FromHistory8ByteChunksFast: "limited_copy_u32_from_history_8_byte_chunks_fast",
-	IDLimitedCopyU32FromHistoryFast:            "limited_copy_u32_from_history_fast",
-	IDLimitedCopyU32FromReader:                 "limited_copy_u32_from_reader",
-	IDLimitedCopyU32FromSlice:                  "limited_copy_u32_from_slice",
-	IDLimitedCopyU32ToSlice:                    "limited_copy_u32_to_slice",
+	IDCopyFromSlice:             "copy_from_slice",
+	IDLimitedCopyU32FromHistory: "limited_copy_u32_from_history",
+	IDLimitedCopyU32FromHistory8ByteChunksDistance1Fast: "limited_copy_u32_from_history_8_byte_chunks_distance_1_fast",
+	IDLimitedCopyU32FromHistory8ByteChunksFast:          "limited_copy_u32_from_history_8_byte_chunks_fast",
+	IDLimitedCopyU32FromHistoryFast:                     "limited_copy_u32_from_history_fast",
+	IDLimitedCopyU32FromReader:                          "limited_copy_u32_from_reader",
+	IDLimitedCopyU32FromSlice:                           "limited_copy_u32_from_slice",
+	IDLimitedCopyU32ToSlice:                             "limited_copy_u32_to_slice",
 
 	// -------- 0x180 block.
 
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 543fda8..d9756cc 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -10466,6 +10466,46 @@
   return length;
 }
 
+// wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_distance_1_fast
+// copies the previous byte (the one immediately before *ptr_iop_w), copying 8
+// byte chunks at a time. Each chunk contains 8 repetitions of the same byte.
+//
+// In terms of number of bytes copied, length is rounded up to a multiple of 8.
+// As a special case, a zero length rounds up to 8 (even though 0 is already a
+// multiple of 8), since there is always at least one 8 byte chunk copied.
+//
+// In terms of advancing *ptr_iop_w, length is not rounded up.
+//
+// The caller needs to prove that:
+//  - (length + 8) <= (io2_w      - *ptr_iop_w)
+//  - distance     == 1
+//  - distance     <= (*ptr_iop_w - io1_w)
+static inline uint32_t  //
+wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_distance_1_fast(
+    uint8_t** ptr_iop_w,
+    uint8_t* io1_w,
+    uint8_t* io2_w,
+    uint32_t length,
+    uint32_t distance) {
+  uint8_t* p = *ptr_iop_w;
+  uint64_t x = p[-1];
+  x |= x << 8;
+  x |= x << 16;
+  x |= x << 32;
+  uint32_t n = length;
+  while (1) {
+    wuffs_base__poke_u64le__no_bounds_check(p, x);
+    if (n <= 8) {
+      p += n;
+      break;
+    }
+    p += 8;
+    n -= 8;
+  }
+  *ptr_iop_w = p;
+  return length;
+}
+
 // wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast is
 // like the wuffs_base__io_writer__limited_copy_u32_from_history_fast function
 // above, but copies 8 byte chunks at a time.
@@ -26485,6 +26525,9 @@
       if ((v_dist_minus_1 + 1) >= 8) {
         wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast(
             &iop_a_dst, io0_a_dst, io2_a_dst, v_length, (v_dist_minus_1 + 1));
+      } else if ((v_dist_minus_1 + 1) == 1) {
+        wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_distance_1_fast(
+            &iop_a_dst, io0_a_dst, io2_a_dst, v_length, (v_dist_minus_1 + 1));
       } else {
         wuffs_base__io_writer__limited_copy_u32_from_history_fast(
             &iop_a_dst, io0_a_dst, io2_a_dst, v_length, (v_dist_minus_1 + 1));
@@ -26947,6 +26990,9 @@
       if ((v_dist_minus_1 + 1) >= 8) {
         wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast(
             &iop_a_dst, io0_a_dst, io2_a_dst, v_length, (v_dist_minus_1 + 1));
+      } else if ((v_dist_minus_1 + 1) == 1) {
+        wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_distance_1_fast(
+            &iop_a_dst, io0_a_dst, io2_a_dst, v_length, (v_dist_minus_1 + 1));
       } else {
         wuffs_base__io_writer__limited_copy_u32_from_history_fast(
             &iop_a_dst, io0_a_dst, io2_a_dst, v_length, (v_dist_minus_1 + 1));
diff --git a/std/deflate/decode_huffman_bmi2.wuffs b/std/deflate/decode_huffman_bmi2.wuffs
index cad7dc1..991cee8 100644
--- a/std/deflate/decode_huffman_bmi2.wuffs
+++ b/std/deflate/decode_huffman_bmi2.wuffs
@@ -253,6 +253,11 @@
 			if (dist_minus_1 + 1) >= 8 {
 				args.dst.limited_copy_u32_from_history_8_byte_chunks_fast!(
 					up_to: length, distance: (dist_minus_1 + 1))
+			} else if (dist_minus_1 + 1) == 1 {
+				// (distance == 1) is essentially RLE (Run Length Encoding). It
+				// happens often enough that it's worth special-casing.
+				args.dst.limited_copy_u32_from_history_8_byte_chunks_distance_1_fast!(
+					up_to: length, distance: (dist_minus_1 + 1))
 			} else {
 				args.dst.limited_copy_u32_from_history_fast!(
 					up_to: length, distance: (dist_minus_1 + 1))
diff --git a/std/deflate/decode_huffman_fast64.wuffs b/std/deflate/decode_huffman_fast64.wuffs
index 8290b3c..a7dc731 100644
--- a/std/deflate/decode_huffman_fast64.wuffs
+++ b/std/deflate/decode_huffman_fast64.wuffs
@@ -252,6 +252,11 @@
 			if (dist_minus_1 + 1) >= 8 {
 				args.dst.limited_copy_u32_from_history_8_byte_chunks_fast!(
 					up_to: length, distance: (dist_minus_1 + 1))
+			} else if (dist_minus_1 + 1) == 1 {
+				// (distance == 1) is essentially RLE (Run Length Encoding). It
+				// happens often enough that it's worth special-casing.
+				args.dst.limited_copy_u32_from_history_8_byte_chunks_distance_1_fast!(
+					up_to: length, distance: (dist_minus_1 + 1))
 			} else {
 				args.dst.limited_copy_u32_from_history_fast!(
 					up_to: length, distance: (dist_minus_1 + 1))