Check 8-byte-chunk copy distance precondition
diff --git a/internal/cgen/base/io-private.h b/internal/cgen/base/io-private.h
index 11b6274..492d010 100644
--- a/internal/cgen/base/io-private.h
+++ b/internal/cgen/base/io-private.h
@@ -207,9 +207,9 @@
 // stronger pre-conditions.
 //
 // The caller needs to prove that:
-//  - distance >  0
-//  - distance <= (*ptr_iop_w - io1_w)
 //  - length   <= (io2_w      - *ptr_iop_w)
+//  - distance >= 1
+//  - distance <= (*ptr_iop_w - io1_w)
 static inline uint32_t  //
 wuffs_base__io_writer__limited_copy_u32_from_history_fast(uint8_t** ptr_iop_w,
                                                           uint8_t* io1_w,
@@ -242,9 +242,9 @@
 // In terms of advancing *ptr_iop_w, length is not rounded up.
 //
 // The caller needs to prove that:
-//  - distance     >  0
-//  - distance     <= (*ptr_iop_w - io1_w)
 //  - (length + 8) <= (io2_w      - *ptr_iop_w)
+//  - distance     >= 8
+//  - distance     <= (*ptr_iop_w - io1_w)
 static inline uint32_t  //
 wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast(
     uint8_t** ptr_iop_w,
diff --git a/internal/cgen/data/data.go b/internal/cgen/data/data.go
index d3df782..a617b6b 100644
--- a/internal/cgen/data/data.go
+++ b/internal/cgen/data/data.go
@@ -213,8 +213,8 @@
 	"" +
 	"// --------\n\nstatic inline uint64_t  //\nwuffs_base__io_writer__copy_from_slice(uint8_t** ptr_iop_w,\n                                       uint8_t* io2_w,\n                                       wuffs_base__slice_u8 src) {\n  uint8_t* iop_w = *ptr_iop_w;\n  size_t n = src.len;\n  if (n > ((size_t)(io2_w - iop_w))) {\n    n = (size_t)(io2_w - iop_w);\n  }\n  if (n > 0) {\n    memmove(iop_w, src.ptr, n);\n    *ptr_iop_w += n;\n  }\n  return (uint64_t)(n);\n}\n\nstatic inline void  //\nwuffs_base__io_writer__limit(uint8_t** ptr_io2_w,\n                             uint8_t* iop_w,\n                             uint64_t limit) {\n  if (((uint64_t)(*ptr_io2_w - iop_w)) > limit) {\n    *ptr_io2_w = iop_w + limit;\n  }\n}\n\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_history(uint8_t** ptr_iop_w,\n                                                     uint8_t* io1_w,\n                                                     uint8_t* io2_w,\n                                                     uint32_t length,\n           " +
 	"                                          uint32_t distance) {\n  if (!distance) {\n    return 0;\n  }\n  uint8_t* p = *ptr_iop_w;\n  if ((size_t)(p - io1_w) < (size_t)(distance)) {\n    return 0;\n  }\n  uint8_t* q = p - distance;\n  size_t n = (size_t)(io2_w - p);\n  if ((size_t)(length) > n) {\n    length = (uint32_t)(n);\n  } else {\n    n = (size_t)(length);\n  }\n  // TODO: unrolling by 3 seems best for the std/deflate benchmarks, but that\n  // is mostly because 3 is the minimum length for the deflate format. This\n  // function implementation shouldn't overfit to that one format. Perhaps the\n  // limited_copy_u32_from_history Wuffs method should also take an unroll hint\n  // argument, and the cgen can look if that argument is the constant\n  // expression '3'.\n  //\n  // See also wuffs_base__io_writer__limited_copy_u32_from_history_fast below.\n  //\n  // Alternatively or additionally, have a sloppy_limited_copy_u32_from_history\n  // method that copies 8 bytes at a time, which can more than length bytes?\n  for (; n >= 3; " +
-	"n -= 3) {\n    *p++ = *q++;\n    *p++ = *q++;\n    *p++ = *q++;\n  }\n  for (; n; n--) {\n    *p++ = *q++;\n  }\n  *ptr_iop_w = p;\n  return length;\n}\n\n// wuffs_base__io_writer__limited_copy_u32_from_history_fast is like the\n// wuffs_base__io_writer__limited_copy_u32_from_history function above, but has\n// stronger pre-conditions.\n//\n// The caller needs to prove that:\n//  - distance >  0\n//  - distance <= (*ptr_iop_w - io1_w)\n//  - length   <= (io2_w      - *ptr_iop_w)\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_history_fast(uint8_t** ptr_iop_w,\n                                                          uint8_t* io1_w,\n                                                          uint8_t* io2_w,\n                                                          uint32_t length,\n                                                          uint32_t distance) {\n  uint8_t* p = *ptr_iop_w;\n  uint8_t* q = p - distance;\n  uint32_t n = length;\n  for (; n >= 3; n -= 3) {\n    *p++ = *q++;\n    *p++ = *q++;\n    *p++ = " +
-	"*q++;\n  }\n  for (; n; n--) {\n    *p++ = *q++;\n  }\n  *ptr_iop_w = p;\n  return length;\n}\n\n// wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast is\n// like the wuffs_base__io_writer__limited_copy_u32_from_history_fast function\n// above, but copies 8 byte chunks at a time.\n//\n// In terms of number of bytes copied, length is rounded up to a multiple of 8.\n// As a special case, a zero length rounds up to 8 (even though 0 is already a\n// multiple of 8), since there is always at least one 8 byte chunk copied.\n//\n// In terms of advancing *ptr_iop_w, length is not rounded up.\n//\n// The caller needs to prove that:\n//  - distance     >  0\n//  - distance     <= (*ptr_iop_w - io1_w)\n//  - (length + 8) <= (io2_w      - *ptr_iop_w)\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast(\n    uint8_t** ptr_iop_w,\n    uint8_t* io1_w,\n    uint8_t* io2_w,\n    uint32_t length,\n    uint32_t distance) {\n  uint8_t* p = *ptr_iop_w;\n  uint8_t* q = p - distance;\n  uint32_t" +
+	"n -= 3) {\n    *p++ = *q++;\n    *p++ = *q++;\n    *p++ = *q++;\n  }\n  for (; n; n--) {\n    *p++ = *q++;\n  }\n  *ptr_iop_w = p;\n  return length;\n}\n\n// wuffs_base__io_writer__limited_copy_u32_from_history_fast is like the\n// wuffs_base__io_writer__limited_copy_u32_from_history function above, but has\n// stronger pre-conditions.\n//\n// The caller needs to prove that:\n//  - length   <= (io2_w      - *ptr_iop_w)\n//  - distance >= 1\n//  - distance <= (*ptr_iop_w - io1_w)\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_history_fast(uint8_t** ptr_iop_w,\n                                                          uint8_t* io1_w,\n                                                          uint8_t* io2_w,\n                                                          uint32_t length,\n                                                          uint32_t distance) {\n  uint8_t* p = *ptr_iop_w;\n  uint8_t* q = p - distance;\n  uint32_t n = length;\n  for (; n >= 3; n -= 3) {\n    *p++ = *q++;\n    *p++ = *q++;\n    *p++ = " +
+	"*q++;\n  }\n  for (; n; n--) {\n    *p++ = *q++;\n  }\n  *ptr_iop_w = p;\n  return length;\n}\n\n// wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast is\n// like the wuffs_base__io_writer__limited_copy_u32_from_history_fast function\n// above, but copies 8 byte chunks at a time.\n//\n// In terms of number of bytes copied, length is rounded up to a multiple of 8.\n// As a special case, a zero length rounds up to 8 (even though 0 is already a\n// multiple of 8), since there is always at least one 8 byte chunk copied.\n//\n// In terms of advancing *ptr_iop_w, length is not rounded up.\n//\n// The caller needs to prove that:\n//  - (length + 8) <= (io2_w      - *ptr_iop_w)\n//  - distance     >= 8\n//  - distance     <= (*ptr_iop_w - io1_w)\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast(\n    uint8_t** ptr_iop_w,\n    uint8_t* io1_w,\n    uint8_t* io2_w,\n    uint32_t length,\n    uint32_t distance) {\n  uint8_t* p = *ptr_iop_w;\n  uint8_t* q = p - distance;\n  uint32_t" +
 	" n = length;\n  while (1) {\n    memcpy(p, q, 8);\n    if (n <= 8) {\n      p += n;\n      break;\n    }\n    p += 8;\n    q += 8;\n    n -= 8;\n  }\n  *ptr_iop_w = p;\n  return length;\n}\n\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_reader(uint8_t** ptr_iop_w,\n                                                    uint8_t* io2_w,\n                                                    uint32_t length,\n                                                    const uint8_t** ptr_iop_r,\n                                                    const uint8_t* io2_r) {\n  uint8_t* iop_w = *ptr_iop_w;\n  size_t n = length;\n  if (n > ((size_t)(io2_w - iop_w))) {\n    n = (size_t)(io2_w - iop_w);\n  }\n  const uint8_t* iop_r = *ptr_iop_r;\n  if (n > ((size_t)(io2_r - iop_r))) {\n    n = (size_t)(io2_r - iop_r);\n  }\n  if (n > 0) {\n    memmove(iop_w, iop_r, n);\n    *ptr_iop_w += n;\n    *ptr_iop_r += n;\n  }\n  return (uint32_t)(n);\n}\n\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_slice(uint8_t** ptr_iop_" +
 	"w,\n                                                   uint8_t* io2_w,\n                                                   uint32_t length,\n                                                   wuffs_base__slice_u8 src) {\n  uint8_t* iop_w = *ptr_iop_w;\n  size_t n = src.len;\n  if (n > length) {\n    n = length;\n  }\n  if (n > ((size_t)(io2_w - iop_w))) {\n    n = (size_t)(io2_w - iop_w);\n  }\n  if (n > 0) {\n    memmove(iop_w, src.ptr, n);\n    *ptr_iop_w += n;\n  }\n  return (uint32_t)(n);\n}\n\nstatic inline wuffs_base__io_buffer*  //\nwuffs_base__io_writer__set(wuffs_base__io_buffer* b,\n                           uint8_t** ptr_iop_w,\n                           uint8_t** ptr_io0_w,\n                           uint8_t** ptr_io1_w,\n                           uint8_t** ptr_io2_w,\n                           wuffs_base__slice_u8 data) {\n  b->data = data;\n  b->meta.wi = 0;\n  b->meta.ri = 0;\n  b->meta.pos = 0;\n  b->meta.closed = false;\n\n  *ptr_iop_w = data.ptr;\n  *ptr_io0_w = data.ptr;\n  *ptr_io1_w = data.ptr;\n  *ptr_io2_w = data.pt" +
 	"r + data.len;\n\n  return b;\n}\n\n" +
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index ef4a241..4bc325b 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -537,14 +537,14 @@
 
 	// TODO: this should have explicit pre-conditions:
 	//  - up_to <= this.length()
-	//  - distance > 0
+	//  - distance >= 1
 	//  - distance <= this.history_length()
 	// For now, that's all implicitly checked (i.e. hard coded).
 	"io_writer.limited_copy_u32_from_history_fast!(up_to: u32, distance: u32) u32",
 
 	// TODO: this should have explicit pre-conditions:
 	//  - (up_to + 8) <= this.length()
-	//  - distance > 0
+	//  - distance >= 8
 	//  - distance <= this.history_length()
 	// For now, that's all implicitly checked (i.e. hard coded).
 	"io_writer.limited_copy_u32_from_history_8_byte_chunks_fast!(up_to: u32, distance: u32) u32",
diff --git a/lang/check/bounds.go b/lang/check/bounds.go
index 2fa3044..46815e7 100644
--- a/lang/check/bounds.go
+++ b/lang/check/bounds.go
@@ -1168,12 +1168,12 @@
 			}
 
 		} else if method == t.IDLimitedCopyU32FromHistory8ByteChunksFast {
-			if err := q.canLimitedCopyU32FromHistoryFast(recv, n.Args(), eight); err != nil {
+			if err := q.canLimitedCopyU32FromHistoryFast(recv, n.Args(), eight, eight); err != nil {
 				return bounds{}, err
 			}
 
 		} else if method == t.IDLimitedCopyU32FromHistoryFast {
-			if err := q.canLimitedCopyU32FromHistoryFast(recv, n.Args(), nil); err != nil {
+			if err := q.canLimitedCopyU32FromHistoryFast(recv, n.Args(), nil, one); err != nil {
 				return bounds{}, err
 			}
 
@@ -1286,10 +1286,10 @@
 	return fmt.Errorf("check: could not prove %s.can_undo_byte()", recv.Str(q.tm))
 }
 
-func (q *checker) canLimitedCopyU32FromHistoryFast(recv *a.Expr, args []*a.Node, adj *big.Int) error {
+func (q *checker) canLimitedCopyU32FromHistoryFast(recv *a.Expr, args []*a.Node, adj *big.Int, minDistance *big.Int) error {
 	// As per cgen's io-private.h, there are three pre-conditions:
 	//  - (upTo + adj) <= this.length()
-	//  - distance > 0
+	//  - distance >= minDistance
 	//  - distance <= this.history_length()
 	//
 	// adj may be nil, in which case (upTo + adj) is just upTo.
@@ -1348,22 +1348,22 @@
 			upTo.Str(q.tm), adj, recv.Str(q.tm))
 	}
 
-	// Check "distance > 0".
+	// Check "distance >= minDistance".
 check1:
 	for {
 		for _, x := range q.facts {
-			if x.Operator() != t.IDXBinaryGreaterThan {
+			if x.Operator() != t.IDXBinaryGreaterEq {
 				continue
 			}
 			if lhs := x.LHS().AsExpr(); !lhs.Eq(distance) {
 				continue
 			}
-			if rcv := x.RHS().AsExpr().ConstValue(); rcv == nil || rcv.Sign() != 0 {
+			if rcv := x.RHS().AsExpr().ConstValue(); (rcv == nil) || (rcv.Cmp(minDistance) < 0) {
 				continue
 			}
 			break check1
 		}
-		return fmt.Errorf("check: could not prove %s > 0", distance.Str(q.tm))
+		return fmt.Errorf("check: could not prove %s >= %v", distance.Str(q.tm), minDistance)
 	}
 
 	// Check "distance <= this.history_length()".
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index d12ea43..604f212 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -10191,9 +10191,9 @@
 // stronger pre-conditions.
 //
 // The caller needs to prove that:
-//  - distance >  0
-//  - distance <= (*ptr_iop_w - io1_w)
 //  - length   <= (io2_w      - *ptr_iop_w)
+//  - distance >= 1
+//  - distance <= (*ptr_iop_w - io1_w)
 static inline uint32_t  //
 wuffs_base__io_writer__limited_copy_u32_from_history_fast(uint8_t** ptr_iop_w,
                                                           uint8_t* io1_w,
@@ -10226,9 +10226,9 @@
 // In terms of advancing *ptr_iop_w, length is not rounded up.
 //
 // The caller needs to prove that:
-//  - distance     >  0
-//  - distance     <= (*ptr_iop_w - io1_w)
 //  - (length + 8) <= (io2_w      - *ptr_iop_w)
+//  - distance     >= 8
+//  - distance     <= (*ptr_iop_w - io1_w)
 static inline uint32_t  //
 wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast(
     uint8_t** ptr_iop_w,
diff --git a/std/deflate/decode_huffman_fast32.wuffs b/std/deflate/decode_huffman_fast32.wuffs
index 75c229e..543b074 100644
--- a/std/deflate/decode_huffman_fast32.wuffs
+++ b/std/deflate/decode_huffman_fast32.wuffs
@@ -341,7 +341,7 @@
 				}
 			}
 			// Once again, redundant but explicit assertions.
-			assert (dist_minus_1 + 1) > 0
+			assert (dist_minus_1 + 1) >= 1
 			assert ((dist_minus_1 + 1) as base.u64) <= args.dst.history_length()
 			assert (length as base.u64) <= args.dst.length()
 			assert ((length + 8) as base.u64) <= args.dst.length()
diff --git a/std/deflate/decode_huffman_fast64.wuffs b/std/deflate/decode_huffman_fast64.wuffs
index 9e0e8e2..55e72dc 100644
--- a/std/deflate/decode_huffman_fast64.wuffs
+++ b/std/deflate/decode_huffman_fast64.wuffs
@@ -233,7 +233,7 @@
 				}
 			}
 			// Once again, redundant but explicit assertions.
-			assert (dist_minus_1 + 1) > 0
+			assert (dist_minus_1 + 1) >= 1
 			assert ((dist_minus_1 + 1) as base.u64) <= args.dst.history_length()
 			assert (length as base.u64) <= args.dst.length()
 			assert ((length + 8) as base.u64) <= args.dst.length()