std/lzma: use 8-byte-copy if applicable

name                            old speed      new speed      delta
wuffs_lzma_decode_100k/clang14  56.2MB/s ± 0%  60.7MB/s ± 0%  +7.96%  (p=0.008 n=5+5)
wuffs_lzma_decode_100k/gcc12    56.4MB/s ± 1%  61.7MB/s ± 1%  +9.44%  (p=0.008 n=5+5)

$ time example-mzcat < linux-6.8.2.tar.xz > /dev/null
Before: user 0m8.396s
After:  user 0m8.392s
diff --git a/internal/cgen/base/io-private.h b/internal/cgen/base/io-private.h
index e132c24..b1e4e31 100644
--- a/internal/cgen/base/io-private.h
+++ b/internal/cgen/base/io-private.h
@@ -308,6 +308,52 @@
   return length;
 }
 
+// wuffs_private_impl__io_writer__limited_copy_u32_from_history_8_byte_chunks_distance_1_fast_return_cusp
+// copies the previous byte (the one immediately before *ptr_iop_w), copying 8
+// byte chunks at a time. Each chunk contains 8 repetitions of the same byte.
+// It also returns the cusp: a byte pair (as a u16le) being the last byte of
+// and next byte after the copied history.
+//
+// In terms of number of bytes copied, length is rounded up to a multiple of 8.
+// As a special case, a zero length rounds up to 8 (even though 0 is already a
+// multiple of 8), since there is always at least one 8 byte chunk copied.
+//
+// In terms of advancing *ptr_iop_w, length is not rounded up.
+//
+// The caller needs to prove that:
+//  - length       >= 1
+//  - (length + 8) <= (io2_w      - *ptr_iop_w)
+//  - distance     == 1
+//  - distance     <= (*ptr_iop_w - io0_w)
+static inline uint32_t  //
+wuffs_private_impl__io_writer__limited_copy_u32_from_history_8_byte_chunks_distance_1_fast_return_cusp(
+    uint8_t** ptr_iop_w,
+    uint8_t* io0_w,
+    uint8_t* io2_w,
+    uint32_t length,
+    uint32_t distance) {
+  uint8_t* p = *ptr_iop_w;
+  uint8_t* q = p - distance;
+  uint64_t x = p[-1];
+  x |= x << 8;
+  x |= x << 16;
+  x |= x << 32;
+  uint32_t n = length;
+  while (1) {
+    wuffs_base__poke_u64le__no_bounds_check(p, x);
+    if (n <= 8) {
+      p += n;
+      q += n;
+      break;
+    }
+    p += 8;
+    q += 8;
+    n -= 8;
+  }
+  *ptr_iop_w = p;
+  return (uint32_t)wuffs_base__peek_u16le__no_bounds_check(q - 1);
+}
+
 // wuffs_private_impl__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast
 // is like the
 // wuffs_private_impl__io_writer__limited_copy_u32_from_history_fast function
@@ -348,6 +394,49 @@
   return length;
 }
 
+// wuffs_private_impl__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast_return_cusp
+// is like the
+// wuffs_private_impl__io_writer__limited_copy_u32_from_history_fast function
+// above, but copies 8 byte chunks at a time. It also returns the cusp: a byte
+// pair (as a u16le) being the last byte of and next byte after the copied
+// history.
+//
+// In terms of number of bytes copied, length is rounded up to a multiple of 8.
+// As a special case, a zero length rounds up to 8 (even though 0 is already a
+// multiple of 8), since there is always at least one 8 byte chunk copied.
+//
+// In terms of advancing *ptr_iop_w, length is not rounded up.
+//
+// The caller needs to prove that:
+//  - length       >= 1
+//  - (length + 8) <= (io2_w      - *ptr_iop_w)
+//  - distance     >= 8
+//  - distance     <= (*ptr_iop_w - io0_w)
+static inline uint32_t  //
+wuffs_private_impl__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast_return_cusp(
+    uint8_t** ptr_iop_w,
+    uint8_t* io0_w,
+    uint8_t* io2_w,
+    uint32_t length,
+    uint32_t distance) {
+  uint8_t* p = *ptr_iop_w;
+  uint8_t* q = p - distance;
+  uint32_t n = length;
+  while (1) {
+    memcpy(p, q, 8);
+    if (n <= 8) {
+      p += n;
+      q += n;
+      break;
+    }
+    p += 8;
+    q += 8;
+    n -= 8;
+  }
+  *ptr_iop_w = p;
+  return (uint32_t)wuffs_base__peek_u16le__no_bounds_check(q - 1);
+}
+
 static inline uint32_t  //
 wuffs_private_impl__io_writer__limited_copy_u32_from_reader(
     uint8_t** ptr_iop_w,
diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go
index 555624e..b7aec79 100644
--- a/internal/cgen/builtin.go
+++ b/internal/cgen/builtin.go
@@ -321,7 +321,9 @@
 	switch method {
 	case t.IDLimitedCopyU32FromHistory,
 		t.IDLimitedCopyU32FromHistory8ByteChunksDistance1Fast,
+		t.IDLimitedCopyU32FromHistory8ByteChunksDistance1FastReturnCusp,
 		t.IDLimitedCopyU32FromHistory8ByteChunksFast,
+		t.IDLimitedCopyU32FromHistory8ByteChunksFastReturnCusp,
 		t.IDLimitedCopyU32FromHistoryFast,
 		t.IDLimitedCopyU32FromHistoryFastReturnCusp:
 		b.printf("wuffs_private_impl__io_writer__%s(\n&%s%s, %s%s, %s%s",
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index 8e113c6..4f5d0f5 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -613,11 +613,27 @@
 	// TODO: this should have explicit pre-conditions:
 	//  - up_to >= 1
 	//  - (up_to + 8) <= this.length()
+	//  - distance >= 8
+	//  - distance <= this.history_length()
+	// For now, that's all implicitly checked (i.e. hard coded).
+	"io_writer.limited_copy_u32_from_history_8_byte_chunks_fast_return_cusp!(up_to: u32, distance: u32) u32[..= 0xFFFF]",
+
+	// TODO: this should have explicit pre-conditions:
+	//  - up_to >= 1
+	//  - (up_to + 8) <= this.length()
 	//  - distance == 1
 	//  - distance <= this.history_length()
 	// For now, that's all implicitly checked (i.e. hard coded).
 	"io_writer.limited_copy_u32_from_history_8_byte_chunks_distance_1_fast!(up_to: u32, distance: u32) u32",
 
+	// TODO: this should have explicit pre-conditions:
+	//  - up_to >= 1
+	//  - (up_to + 8) <= this.length()
+	//  - distance == 1
+	//  - distance <= this.history_length()
+	// For now, that's all implicitly checked (i.e. hard coded).
+	"io_writer.limited_copy_u32_from_history_8_byte_chunks_distance_1_fast_return_cusp!(up_to: u32, distance: u32) u32[..= 0xFFFF]",
+
 	// ---- token_writer
 
 	"token_writer.write_simple_token_fast!(" +
diff --git a/lang/check/bounds.go b/lang/check/bounds.go
index 89d21f1..51b64e7 100644
--- a/lang/check/bounds.go
+++ b/lang/check/bounds.go
@@ -1198,12 +1198,14 @@
 				return bounds{}, err
 			}
 
-		} else if method == t.IDLimitedCopyU32FromHistory8ByteChunksDistance1Fast {
+		} else if (method == t.IDLimitedCopyU32FromHistory8ByteChunksDistance1Fast) ||
+			(method == t.IDLimitedCopyU32FromHistory8ByteChunksDistance1FastReturnCusp) {
 			if err := q.canLimitedCopyU32FromHistoryFast(recv, n.Args(), eight, nil, one); err != nil {
 				return bounds{}, err
 			}
 
-		} else if method == t.IDLimitedCopyU32FromHistory8ByteChunksFast {
+		} else if (method == t.IDLimitedCopyU32FromHistory8ByteChunksFast) ||
+			(method == t.IDLimitedCopyU32FromHistory8ByteChunksFastReturnCusp) {
 			if err := q.canLimitedCopyU32FromHistoryFast(recv, n.Args(), eight, eight, nil); err != nil {
 				return bounds{}, err
 			}
diff --git a/lang/token/list.go b/lang/token/list.go
index 4c071ad..404df85 100644
--- a/lang/token/list.go
+++ b/lang/token/list.go
@@ -522,15 +522,17 @@
 	IDSkipU32         = ID(0x16C)
 	IDSkipU32Fast     = ID(0x16D)
 
-	IDCopyFromSlice                                     = ID(0x170)
-	IDLimitedCopyU32FromHistory                         = ID(0x171)
-	IDLimitedCopyU32FromHistory8ByteChunksDistance1Fast = ID(0x172)
-	IDLimitedCopyU32FromHistory8ByteChunksFast          = ID(0x173)
-	IDLimitedCopyU32FromHistoryFast                     = ID(0x174)
-	IDLimitedCopyU32FromHistoryFastReturnCusp           = ID(0x175)
-	IDLimitedCopyU32FromReader                          = ID(0x176)
-	IDLimitedCopyU32FromSlice                           = ID(0x177)
-	IDLimitedCopyU32ToSlice                             = ID(0x178)
+	IDCopyFromSlice                                               = ID(0x170)
+	IDLimitedCopyU32FromHistory                                   = ID(0x171)
+	IDLimitedCopyU32FromHistory8ByteChunksDistance1Fast           = ID(0x172)
+	IDLimitedCopyU32FromHistory8ByteChunksDistance1FastReturnCusp = ID(0x173)
+	IDLimitedCopyU32FromHistory8ByteChunksFast                    = ID(0x174)
+	IDLimitedCopyU32FromHistory8ByteChunksFastReturnCusp          = ID(0x175)
+	IDLimitedCopyU32FromHistoryFast                               = ID(0x176)
+	IDLimitedCopyU32FromHistoryFastReturnCusp                     = ID(0x177)
+	IDLimitedCopyU32FromReader                                    = ID(0x178)
+	IDLimitedCopyU32FromSlice                                     = ID(0x179)
+	IDLimitedCopyU32ToSlice                                       = ID(0x17A)
 
 	// -------- 0x180 block.
 
@@ -973,13 +975,15 @@
 
 	IDCopyFromSlice:             "copy_from_slice",
 	IDLimitedCopyU32FromHistory: "limited_copy_u32_from_history",
-	IDLimitedCopyU32FromHistory8ByteChunksDistance1Fast: "limited_copy_u32_from_history_8_byte_chunks_distance_1_fast",
-	IDLimitedCopyU32FromHistory8ByteChunksFast:          "limited_copy_u32_from_history_8_byte_chunks_fast",
-	IDLimitedCopyU32FromHistoryFast:                     "limited_copy_u32_from_history_fast",
-	IDLimitedCopyU32FromHistoryFastReturnCusp:           "limited_copy_u32_from_history_fast_return_cusp",
-	IDLimitedCopyU32FromReader:                          "limited_copy_u32_from_reader",
-	IDLimitedCopyU32FromSlice:                           "limited_copy_u32_from_slice",
-	IDLimitedCopyU32ToSlice:                             "limited_copy_u32_to_slice",
+	IDLimitedCopyU32FromHistory8ByteChunksDistance1Fast:           "limited_copy_u32_from_history_8_byte_chunks_distance_1_fast",
+	IDLimitedCopyU32FromHistory8ByteChunksDistance1FastReturnCusp: "limited_copy_u32_from_history_8_byte_chunks_distance_1_fast_return_cusp",
+	IDLimitedCopyU32FromHistory8ByteChunksFast:                    "limited_copy_u32_from_history_8_byte_chunks_fast",
+	IDLimitedCopyU32FromHistory8ByteChunksFastReturnCusp:          "limited_copy_u32_from_history_8_byte_chunks_fast_return_cusp",
+	IDLimitedCopyU32FromHistoryFast:                               "limited_copy_u32_from_history_fast",
+	IDLimitedCopyU32FromHistoryFastReturnCusp:                     "limited_copy_u32_from_history_fast_return_cusp",
+	IDLimitedCopyU32FromReader:                                    "limited_copy_u32_from_reader",
+	IDLimitedCopyU32FromSlice:                                     "limited_copy_u32_from_slice",
+	IDLimitedCopyU32ToSlice:                                       "limited_copy_u32_to_slice",
 
 	// -------- 0x180 block.
 
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 5ae002d..ba174db 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -15705,6 +15705,52 @@
   return length;
 }
 
+// wuffs_private_impl__io_writer__limited_copy_u32_from_history_8_byte_chunks_distance_1_fast_return_cusp
+// copies the previous byte (the one immediately before *ptr_iop_w), copying 8
+// byte chunks at a time. Each chunk contains 8 repetitions of the same byte.
+// It also returns the cusp: a byte pair (as a u16le) being the last byte of
+// and next byte after the copied history.
+//
+// In terms of number of bytes copied, length is rounded up to a multiple of 8.
+// As a special case, a zero length rounds up to 8 (even though 0 is already a
+// multiple of 8), since there is always at least one 8 byte chunk copied.
+//
+// In terms of advancing *ptr_iop_w, length is not rounded up.
+//
+// The caller needs to prove that:
+//  - length       >= 1
+//  - (length + 8) <= (io2_w      - *ptr_iop_w)
+//  - distance     == 1
+//  - distance     <= (*ptr_iop_w - io0_w)
+static inline uint32_t  //
+wuffs_private_impl__io_writer__limited_copy_u32_from_history_8_byte_chunks_distance_1_fast_return_cusp(
+    uint8_t** ptr_iop_w,
+    uint8_t* io0_w,
+    uint8_t* io2_w,
+    uint32_t length,
+    uint32_t distance) {
+  uint8_t* p = *ptr_iop_w;
+  uint8_t* q = p - distance;
+  uint64_t x = p[-1];
+  x |= x << 8;
+  x |= x << 16;
+  x |= x << 32;
+  uint32_t n = length;
+  while (1) {
+    wuffs_base__poke_u64le__no_bounds_check(p, x);
+    if (n <= 8) {
+      p += n;
+      q += n;
+      break;
+    }
+    p += 8;
+    q += 8;
+    n -= 8;
+  }
+  *ptr_iop_w = p;
+  return (uint32_t)wuffs_base__peek_u16le__no_bounds_check(q - 1);
+}
+
 // wuffs_private_impl__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast
 // is like the
 // wuffs_private_impl__io_writer__limited_copy_u32_from_history_fast function
@@ -15745,6 +15791,49 @@
   return length;
 }
 
+// wuffs_private_impl__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast_return_cusp
+// is like the
+// wuffs_private_impl__io_writer__limited_copy_u32_from_history_fast function
+// above, but copies 8 byte chunks at a time. It also returns the cusp: a byte
+// pair (as a u16le) being the last byte of and next byte after the copied
+// history.
+//
+// In terms of number of bytes copied, length is rounded up to a multiple of 8.
+// As a special case, a zero length rounds up to 8 (even though 0 is already a
+// multiple of 8), since there is always at least one 8 byte chunk copied.
+//
+// In terms of advancing *ptr_iop_w, length is not rounded up.
+//
+// The caller needs to prove that:
+//  - length       >= 1
+//  - (length + 8) <= (io2_w      - *ptr_iop_w)
+//  - distance     >= 8
+//  - distance     <= (*ptr_iop_w - io0_w)
+static inline uint32_t  //
+wuffs_private_impl__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast_return_cusp(
+    uint8_t** ptr_iop_w,
+    uint8_t* io0_w,
+    uint8_t* io2_w,
+    uint32_t length,
+    uint32_t distance) {
+  uint8_t* p = *ptr_iop_w;
+  uint8_t* q = p - distance;
+  uint32_t n = length;
+  while (1) {
+    memcpy(p, q, 8);
+    if (n <= 8) {
+      p += n;
+      q += n;
+      break;
+    }
+    p += 8;
+    q += 8;
+    n -= 8;
+  }
+  *ptr_iop_w = p;
+  return (uint32_t)wuffs_base__peek_u16le__no_bounds_check(q - 1);
+}
+
 static inline uint32_t  //
 wuffs_private_impl__io_writer__limited_copy_u32_from_reader(
     uint8_t** ptr_iop_w,
@@ -52817,15 +52906,22 @@
       wuffs_private_impl__io_writer__limited_copy_u32_from_slice(
           &iop_a_dst, io2_a_dst,v_adj_dist, wuffs_base__slice_u8__subslice_i(a_workbuf, v_wb_index));
       v_len -= v_adj_dist;
-      if ((((uint64_t)(v_len)) > ((uint64_t)(io2_a_dst - iop_a_dst))) || (((uint64_t)(v_dist)) > ((uint64_t)(iop_a_dst - io0_a_dst)))) {
+      if ((((uint64_t)(v_len)) > ((uint64_t)(io2_a_dst - iop_a_dst))) || (((uint64_t)((v_len + 8u))) > ((uint64_t)(io2_a_dst - iop_a_dst))) || (((uint64_t)(v_dist)) > ((uint64_t)(iop_a_dst - io0_a_dst)))) {
         status = wuffs_base__make_status(wuffs_lzma__error__internal_error_inconsistent_dictionary_state);
         goto exit;
       }
     }
-    v_match_cusp = wuffs_private_impl__io_writer__limited_copy_u32_from_history_fast_return_cusp(
-        &iop_a_dst, io0_a_dst, io2_a_dst, v_len, v_dist);
-    v_match_byte = (v_match_cusp >> 8u);
-    v_prev_byte = ((uint8_t)(v_match_cusp));
+    if (v_dist >= 8u) {
+      v_match_cusp = wuffs_private_impl__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast_return_cusp(
+          &iop_a_dst, io0_a_dst, io2_a_dst, v_len, v_dist);
+      v_match_byte = (v_match_cusp >> 8u);
+      v_prev_byte = ((uint8_t)(v_match_cusp));
+    } else {
+      v_match_cusp = wuffs_private_impl__io_writer__limited_copy_u32_from_history_fast_return_cusp(
+          &iop_a_dst, io0_a_dst, io2_a_dst, v_len, v_dist);
+      v_match_byte = (v_match_cusp >> 8u);
+      v_prev_byte = ((uint8_t)(v_match_cusp));
+    }
   }
   label__outer__break:;
   self->private_impl.f_stashed_bytes[0u] = v_prev_byte;
diff --git a/std/lzma/decode_bitstream_fast.wuffs b/std/lzma/decode_bitstream_fast.wuffs
index 5f2d213..03fa8f4 100644
--- a/std/lzma/decode_bitstream_fast.wuffs
+++ b/std/lzma/decode_bitstream_fast.wuffs
@@ -932,6 +932,7 @@
         }
         pos ~mod+= len as base.u64
         assert (len as base.u64) <= args.dst.length() via "a <= b: a <= c; c <= b"(c: 282)
+        assert ((len + 8) as base.u64) <= args.dst.length() via "a <= b: a <= c; c <= b"(c: 282)
 
         // Copy from the args.workbuf history, if necessary.
         if (dist as base.u64) > args.dst.history_length() {
@@ -984,6 +985,7 @@
             len -= adj_dist
             assert len >= 1
             if ((len as base.u64) > args.dst.length()) or
+                    (((len + 8) as base.u64) > args.dst.length()) or
                     ((dist as base.u64) > args.dst.history_length()) {
                 return "#internal error: inconsistent dictionary state"
             }
@@ -993,11 +995,19 @@
         assert len >= 1
         assert dist >= 1
         assert (len as base.u64) <= args.dst.length()
+        assert ((len + 8) as base.u64) <= args.dst.length()
         assert (dist as base.u64) <= args.dst.history_length()
-        match_cusp = args.dst.limited_copy_u32_from_history_fast_return_cusp!(
-                up_to: len, distance: dist)
-        match_byte = match_cusp >> 8
-        prev_byte = (match_cusp & 0xFF) as base.u8
+        if dist >= 8 {
+            match_cusp = args.dst.limited_copy_u32_from_history_8_byte_chunks_fast_return_cusp!(
+                    up_to: len, distance: dist)
+            match_byte = match_cusp >> 8
+            prev_byte = (match_cusp & 0xFF) as base.u8
+        } else {
+            match_cusp = args.dst.limited_copy_u32_from_history_fast_return_cusp!(
+                    up_to: len, distance: dist)
+            match_byte = match_cusp >> 8
+            prev_byte = (match_cusp & 0xFF) as base.u8
+        }
     } endwhile.outer
 
     this.stashed_bytes[0] = prev_byte