Add io_reader.limited_copy_u32_to_slice
diff --git a/internal/cgen/base/io-private.h b/internal/cgen/base/io-private.h
index 4bd89f2..02ed2c7 100644
--- a/internal/cgen/base/io-private.h
+++ b/internal/cgen/base/io-private.h
@@ -34,6 +34,26 @@
 
 // --------
 
+static inline uint32_t  //
+wuffs_base__io_reader__limited_copy_u32_to_slice(const uint8_t** ptr_iop_r,
+                                                 const uint8_t* io2_r,
+                                                 uint32_t length,
+                                                 wuffs_base__slice_u8 dst) {
+  const uint8_t* iop_r = *ptr_iop_r;
+  size_t n = dst.len;
+  if (n > length) {
+    n = length;
+  }
+  if (n > ((size_t)(io2_r - iop_r))) {
+    n = (size_t)(io2_r - iop_r);
+  }
+  if (n > 0) {
+    memmove(dst.ptr, iop_r, n);
+    *ptr_iop_r += n;
+  }
+  return (uint32_t)(n);
+}
+
 // wuffs_base__io_reader__match7 returns whether the io_reader's upcoming bytes
 // start with the given prefix (up to 7 bytes long). It is peek-like, not
 // read-like, in that there are no side-effects.
diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go
index f028724..989706c 100644
--- a/internal/cgen/builtin.go
+++ b/internal/cgen/builtin.go
@@ -152,6 +152,11 @@
 		b.printf("(%s%s > %s%s)", iopPrefix, name, io1Prefix, name)
 		return nil
 
+	case t.IDLimitedCopyU32ToSlice:
+		b.printf("wuffs_base__io_reader__limited_copy_u32_to_slice(&%s%s, %s%s,",
+			iopPrefix, name, io2Prefix, name)
+		return g.writeArgs(b, args, depth)
+
 	case t.IDCountSince:
 		b.printf("wuffs_base__io__count_since(")
 		if err := g.writeExpr(b, args[0].AsArg().Value(), depth); err != nil {
diff --git a/internal/cgen/data.go b/internal/cgen/data.go
index 4b871f2..4e6f069 100644
--- a/internal/cgen/data.go
+++ b/internal/cgen/data.go
@@ -327,9 +327,10 @@
 const baseIOPrivateH = "" +
 	"// ---------------- I/O\n\nstatic inline uint64_t  //\nwuffs_base__io__count_since(uint64_t mark, uint64_t index) {\n  if (index >= mark) {\n    return index - mark;\n  }\n  return 0;\n}\n\nstatic inline wuffs_base__slice_u8  //\nwuffs_base__io__since(uint64_t mark, uint64_t index, uint8_t* ptr) {\n  if (index >= mark) {\n    return wuffs_base__make_slice_u8(ptr + mark, index - mark);\n  }\n  return wuffs_base__make_slice_u8(NULL, 0);\n}\n\n" +
 	"" +
-	"// --------\n\n// wuffs_base__io_reader__match7 returns whether the io_reader's upcoming bytes\n// start with the given prefix (up to 7 bytes long). It is peek-like, not\n// read-like, in that there are no side-effects.\n//\n// The low 3 bits of a hold the prefix length, n.\n//\n// The high 56 bits of a hold the prefix itself, in little-endian order. The\n// first prefix byte is in bits 8..=15, the second prefix byte is in bits\n// 16..=23, etc. The high (8 * (7 - n)) bits are ignored.\n//\n// There are three possible return values:\n//  - 0 means success.\n//  - 1 means inconclusive, equivalent to \"$short read\".\n//  - 2 means failure.\nstatic inline uint32_t  //\nwuffs_base__io_reader__match7(const uint8_t* iop_r,\n                              const uint8_t* io2_r,\n                              wuffs_base__io_buffer* r,\n                              uint64_t a) {\n  uint32_t n = a & 7;\n  a >>= 8;\n  if ((io2_r - iop_r) >= 8) {\n    uint64_t x = wuffs_base__load_u64le__no_bounds_check(iop_r);\n    uint32_t shift = 8 * (8 - n);\n " +
-	"   return ((a << shift) == (x << shift)) ? 0 : 2;\n  }\n  for (; n > 0; n--) {\n    if (iop_r >= io2_r) {\n      return (r && r->meta.closed) ? 2 : 1;\n    } else if (*iop_r != ((uint8_t)(a))) {\n      return 2;\n    }\n    iop_r++;\n    a >>= 8;\n  }\n  return 0;\n}\n\nstatic inline wuffs_base__io_buffer*  //\nwuffs_base__io_reader__set(wuffs_base__io_buffer* b,\n                           const uint8_t** ptr_iop_r,\n                           const uint8_t** ptr_io0_r,\n                           const uint8_t** ptr_io1_r,\n                           const uint8_t** ptr_io2_r,\n                           wuffs_base__slice_u8 data) {\n  b->data = data;\n  b->meta.wi = data.len;\n  b->meta.ri = 0;\n  b->meta.pos = 0;\n  b->meta.closed = false;\n\n  *ptr_iop_r = data.ptr;\n  *ptr_io0_r = data.ptr;\n  *ptr_io1_r = data.ptr;\n  *ptr_io2_r = data.ptr + data.len;\n\n  return b;\n}\n\n#pragma GCC diagnostic push\n#pragma GCC diagnostic ignored \"-Wcast-qual\"\n// TODO: can we avoid the const_cast (by deleting this function)? This might\n// involve conver" +
-	"ting the call sites to take an io_reader instead of a slice u8\n// (the result of io_reader.take).\nstatic inline wuffs_base__slice_u8  //\nwuffs_base__io_reader__take(const uint8_t** ptr_iop_r,\n                            const uint8_t* io2_r,\n                            uint64_t n) {\n  if (n <= ((size_t)(io2_r - *ptr_iop_r))) {\n    const uint8_t* p = *ptr_iop_r;\n    *ptr_iop_r += n;\n    // The arg is what C calls C++'s \"const_cast<uint8_t*>(p)\".\n    return wuffs_base__make_slice_u8((uint8_t*)(p), n);\n  }\n  return wuffs_base__make_slice_u8(NULL, 0);\n}\n#pragma GCC diagnostic pop\n\n" +
+	"// --------\n\nstatic inline uint32_t  //\nwuffs_base__io_reader__limited_copy_u32_to_slice(const uint8_t** ptr_iop_r,\n                                                 const uint8_t* io2_r,\n                                                 uint32_t length,\n                                                 wuffs_base__slice_u8 dst) {\n  const uint8_t* iop_r = *ptr_iop_r;\n  size_t n = dst.len;\n  if (n > length) {\n    n = length;\n  }\n  if (n > ((size_t)(io2_r - iop_r))) {\n    n = (size_t)(io2_r - iop_r);\n  }\n  if (n > 0) {\n    memmove(dst.ptr, iop_r, n);\n    *ptr_iop_r += n;\n  }\n  return (uint32_t)(n);\n}\n\n// wuffs_base__io_reader__match7 returns whether the io_reader's upcoming bytes\n// start with the given prefix (up to 7 bytes long). It is peek-like, not\n// read-like, in that there are no side-effects.\n//\n// The low 3 bits of a hold the prefix length, n.\n//\n// The high 56 bits of a hold the prefix itself, in little-endian order. The\n// first prefix byte is in bits 8..=15, the second prefix byte is in bits\n// 16..=23" +
+	", etc. The high (8 * (7 - n)) bits are ignored.\n//\n// There are three possible return values:\n//  - 0 means success.\n//  - 1 means inconclusive, equivalent to \"$short read\".\n//  - 2 means failure.\nstatic inline uint32_t  //\nwuffs_base__io_reader__match7(const uint8_t* iop_r,\n                              const uint8_t* io2_r,\n                              wuffs_base__io_buffer* r,\n                              uint64_t a) {\n  uint32_t n = a & 7;\n  a >>= 8;\n  if ((io2_r - iop_r) >= 8) {\n    uint64_t x = wuffs_base__load_u64le__no_bounds_check(iop_r);\n    uint32_t shift = 8 * (8 - n);\n    return ((a << shift) == (x << shift)) ? 0 : 2;\n  }\n  for (; n > 0; n--) {\n    if (iop_r >= io2_r) {\n      return (r && r->meta.closed) ? 2 : 1;\n    } else if (*iop_r != ((uint8_t)(a))) {\n      return 2;\n    }\n    iop_r++;\n    a >>= 8;\n  }\n  return 0;\n}\n\nstatic inline wuffs_base__io_buffer*  //\nwuffs_base__io_reader__set(wuffs_base__io_buffer* b,\n                           const uint8_t** ptr_iop_r,\n                           c" +
+	"onst uint8_t** ptr_io0_r,\n                           const uint8_t** ptr_io1_r,\n                           const uint8_t** ptr_io2_r,\n                           wuffs_base__slice_u8 data) {\n  b->data = data;\n  b->meta.wi = data.len;\n  b->meta.ri = 0;\n  b->meta.pos = 0;\n  b->meta.closed = false;\n\n  *ptr_iop_r = data.ptr;\n  *ptr_io0_r = data.ptr;\n  *ptr_io1_r = data.ptr;\n  *ptr_io2_r = data.ptr + data.len;\n\n  return b;\n}\n\n#pragma GCC diagnostic push\n#pragma GCC diagnostic ignored \"-Wcast-qual\"\n// TODO: can we avoid the const_cast (by deleting this function)? This might\n// involve converting the call sites to take an io_reader instead of a slice u8\n// (the result of io_reader.take).\nstatic inline wuffs_base__slice_u8  //\nwuffs_base__io_reader__take(const uint8_t** ptr_iop_r,\n                            const uint8_t* io2_r,\n                            uint64_t n) {\n  if (n <= ((size_t)(io2_r - *ptr_iop_r))) {\n    const uint8_t* p = *ptr_iop_r;\n    *ptr_iop_r += n;\n    // The arg is what C calls C++'s \"const_cast" +
+	"<uint8_t*>(p)\".\n    return wuffs_base__make_slice_u8((uint8_t*)(p), n);\n  }\n  return wuffs_base__make_slice_u8(NULL, 0);\n}\n#pragma GCC diagnostic pop\n\n" +
 	"" +
 	"// --------\n\nstatic inline uint64_t  //\nwuffs_base__io_writer__copy_from_slice(uint8_t** ptr_iop_w,\n                                       uint8_t* io2_w,\n                                       wuffs_base__slice_u8 src) {\n  uint8_t* iop_w = *ptr_iop_w;\n  size_t n = src.len;\n  if (n > ((size_t)(io2_w - iop_w))) {\n    n = (size_t)(io2_w - iop_w);\n  }\n  if (n > 0) {\n    memmove(iop_w, src.ptr, n);\n    *ptr_iop_w += n;\n  }\n  return (uint64_t)(n);\n}\n\nstatic inline uint32_t  //\nwuffs_base__io_writer__limited_copy_u32_from_history(uint8_t** ptr_iop_w,\n                                                     uint8_t* io1_w,\n                                                     uint8_t* io2_w,\n                                                     uint32_t length,\n                                                     uint32_t distance) {\n  if (!distance) {\n    return 0;\n  }\n  uint8_t* p = *ptr_iop_w;\n  if ((size_t)(p - io1_w) < (size_t)(distance)) {\n    return 0;\n  }\n  uint8_t* q = p - distance;\n  size_t n = (size_t)(io2_w - " +
 	"p);\n  if ((size_t)(length) > n) {\n    length = (uint32_t)(n);\n  } else {\n    n = (size_t)(length);\n  }\n  // TODO: unrolling by 3 seems best for the std/deflate benchmarks, but that\n  // is mostly because 3 is the minimum length for the deflate format. This\n  // function implementation shouldn't overfit to that one format. Perhaps the\n  // limited_copy_u32_from_history Wuffs method should also take an unroll hint\n  // argument, and the cgen can look if that argument is the constant\n  // expression '3'.\n  //\n  // See also wuffs_base__io_writer__limited_copy_u32_from_history_fast below.\n  //\n  // Alternatively or additionally, have a sloppy_limited_copy_u32_from_history\n  // method that copies 8 bytes at a time, which can more than length bytes?\n  for (; n >= 3; n -= 3) {\n    *p++ = *q++;\n    *p++ = *q++;\n    *p++ = *q++;\n  }\n  for (; n; n--) {\n    *p++ = *q++;\n  }\n  *ptr_iop_w = p;\n  return length;\n}\n\n// wuffs_base__io_writer__limited_copy_u32_from_history_fast is like the\n// wuffs_base__io_writer__limited_copy" +
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index efce8f3..5f7c32e 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -275,6 +275,8 @@
 	"io_reader.since(mark: u64) slice u8",
 	"io_reader.take!(n: u64) slice u8",
 
+	"io_reader.limited_copy_u32_to_slice!(up_to: u32, s: slice u8) u32",
+
 	"io_reader.skip?(n: u64)",
 	"io_reader.skip_u32?(n: u32)",
 
diff --git a/lang/token/list.go b/lang/token/list.go
index 5343025..4e5ea4f 100644
--- a/lang/token/list.go
+++ b/lang/token/list.go
@@ -522,6 +522,7 @@
 	IDLimitedCopyU32FromHistoryFast = ID(0x172)
 	IDLimitedCopyU32FromReader      = ID(0x173)
 	IDLimitedCopyU32FromSlice       = ID(0x174)
+	IDLimitedCopyU32ToSlice         = ID(0x175)
 
 	// -------- 0x180 block.
 
@@ -883,6 +884,7 @@
 	IDLimitedCopyU32FromHistoryFast: "limited_copy_u32_from_history_fast",
 	IDLimitedCopyU32FromReader:      "limited_copy_u32_from_reader",
 	IDLimitedCopyU32FromSlice:       "limited_copy_u32_from_slice",
+	IDLimitedCopyU32ToSlice:         "limited_copy_u32_to_slice",
 
 	// -------- 0x180 block.
 
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 4d68520..9415def 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -7641,6 +7641,26 @@
 
 // --------
 
+static inline uint32_t  //
+wuffs_base__io_reader__limited_copy_u32_to_slice(const uint8_t** ptr_iop_r,
+                                                 const uint8_t* io2_r,
+                                                 uint32_t length,
+                                                 wuffs_base__slice_u8 dst) {
+  const uint8_t* iop_r = *ptr_iop_r;
+  size_t n = dst.len;
+  if (n > length) {
+    n = length;
+  }
+  if (n > ((size_t)(io2_r - iop_r))) {
+    n = (size_t)(io2_r - iop_r);
+  }
+  if (n > 0) {
+    memmove(dst.ptr, iop_r, n);
+    *ptr_iop_r += n;
+  }
+  return (uint32_t)(n);
+}
+
 // wuffs_base__io_reader__match7 returns whether the io_reader's upcoming bytes
 // start with the given prefix (up to 7 bytes long). It is peek-like, not
 // read-like, in that there are no side-effects.
@@ -21277,8 +21297,8 @@
 
   uint64_t v_block_size = 0;
   bool v_need_block_size = false;
+  uint32_t v_n_copied = 0;
   uint64_t v_n_compressed = 0;
-  wuffs_base__slice_u8 v_compressed = {0};
   wuffs_base__io_buffer u_r = wuffs_base__empty_io_buffer();
   wuffs_base__io_buffer* v_r = &u_r;
   const uint8_t* iop_v_r WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
@@ -21345,17 +21365,16 @@
         if (v_n_compressed <= 0) {
           goto label__0__break;
         }
-        v_compressed =
-            wuffs_base__io_reader__take(&iop_a_src, io2_a_src, v_n_compressed);
-        wuffs_base__slice_u8__copy_from_slice(
+        v_n_copied = wuffs_base__io_reader__limited_copy_u32_to_slice(
+            &iop_a_src, io2_a_src, ((uint32_t)((v_n_compressed & 4294967295))),
             wuffs_base__slice_u8__subslice_i(
                 wuffs_base__make_slice_u8(self->private_data.f_compressed,
                                           4096),
-                self->private_impl.f_compressed_wi),
-            v_compressed);
+                self->private_impl.f_compressed_wi));
         wuffs_base__u64__sat_add_indirect(&self->private_impl.f_compressed_wi,
-                                          v_n_compressed);
-        wuffs_base__u64__sat_sub_indirect(&v_block_size, v_n_compressed);
+                                          ((uint64_t)(v_n_copied)));
+        wuffs_base__u64__sat_sub_indirect(&v_block_size,
+                                          ((uint64_t)(v_n_copied)));
         if (v_block_size > 0) {
           goto label__0__break;
         }
diff --git a/std/gif/decode_gif.wuffs b/std/gif/decode_gif.wuffs
index 4b049cc..5bc27a0 100644
--- a/std/gif/decode_gif.wuffs
+++ b/std/gif/decode_gif.wuffs
@@ -924,8 +924,8 @@
 pri func decoder.decode_id_part2?(dst: ptr base.pixel_buffer, src: base.io_reader, workbuf: slice base.u8) {
 	var block_size      : base.u64[..= 255]
 	var need_block_size : base.bool
+	var n_copied        : base.u32
 	var n_compressed    : base.u64
-	var compressed      : slice base.u8
 	var r               : base.io_reader
 	var mark            : base.u64
 	var lzw_status      : base.status
@@ -954,10 +954,11 @@
 			if n_compressed <= 0 {
 				break
 			}
-			compressed = args.src.take!(n: n_compressed)
-			this.compressed[this.compressed_wi ..].copy_from_slice!(s: compressed)
-			this.compressed_wi ~sat+= n_compressed
-			block_size ~sat-= n_compressed
+			n_copied = args.src.limited_copy_u32_to_slice!(
+				up_to: (n_compressed & 0xFFFF_FFFF) as base.u32,
+				s: this.compressed[this.compressed_wi ..])
+			this.compressed_wi ~sat+= n_copied as base.u64
+			block_size ~sat-= n_copied as base.u64
 			if block_size > 0 {
 				break
 			}