Roll dst_palette into the pixel_swizzler struct

This commit will soon be followed by a rollback, but it is committed
anyway so that the benchmark numbers below (roughly 6-12% slower with
clang5, within a few percent either way with gcc7) can be referred to
in the git log.

name                                        old speed      new speed      delta

wuffs_gif_decode_1k_bw/clang5                286MB/s ± 1%   251MB/s ± 0%  -12.11%  (p=0.000 n=10+9)
wuffs_gif_decode_1k_color_full_init/clang5  96.0MB/s ± 0%  88.9MB/s ± 0%   -7.35%  (p=0.000 n=10+10)
wuffs_gif_decode_1k_color_part_init/clang5   119MB/s ± 0%   108MB/s ± 0%   -9.06%  (p=0.000 n=10+8)
wuffs_gif_decode_10k_bgra/clang5             469MB/s ± 1%   438MB/s ± 0%   -6.50%  (p=0.000 n=10+8)
wuffs_gif_decode_10k_indexed/clang5          125MB/s ± 1%   117MB/s ± 1%   -6.78%  (p=0.000 n=9+9)
wuffs_gif_decode_20k/clang5                  153MB/s ± 0%   139MB/s ± 0%   -9.40%  (p=0.000 n=9+9)
wuffs_gif_decode_100k_artificial/clang5      324MB/s ± 0%   304MB/s ± 0%   -6.22%  (p=0.000 n=10+9)
wuffs_gif_decode_100k_realistic/clang5       139MB/s ± 1%   125MB/s ± 1%   -9.69%  (p=0.000 n=9+10)
wuffs_gif_decode_1000k_full_init/clang5      141MB/s ± 0%   126MB/s ± 0%  -10.40%  (p=0.000 n=10+10)
wuffs_gif_decode_1000k_part_init/clang5      141MB/s ± 1%   126MB/s ± 0%  -10.50%  (p=0.000 n=10+8)
wuffs_gif_decode_anim_screencap/clang5       655MB/s ± 0%   617MB/s ± 1%   -5.91%  (p=0.000 n=9+9)

wuffs_gif_decode_1k_bw/gcc7                  299MB/s ± 0%   295MB/s ± 0%   -1.18%  (p=0.000 n=10+8)
wuffs_gif_decode_1k_color_full_init/gcc7    85.2MB/s ± 1%  88.2MB/s ± 0%   +3.58%  (p=0.000 n=10+10)
wuffs_gif_decode_1k_color_part_init/gcc7     104MB/s ± 0%   103MB/s ± 0%   -0.39%  (p=0.001 n=9+10)
wuffs_gif_decode_10k_bgra/gcc7               349MB/s ± 2%   352MB/s ± 0%   +0.77%  (p=0.000 n=9+10)
wuffs_gif_decode_10k_indexed/gcc7            106MB/s ± 0%   106MB/s ± 2%     ~     (p=0.762 n=8+10)
wuffs_gif_decode_20k/gcc7                    133MB/s ± 0%   133MB/s ± 0%     ~     (p=0.258 n=9+9)
wuffs_gif_decode_100k_artificial/gcc7        302MB/s ± 0%   301MB/s ± 1%   -0.41%  (p=0.035 n=10+10)
wuffs_gif_decode_100k_realistic/gcc7         118MB/s ± 1%   118MB/s ± 1%     ~     (p=0.095 n=9+10)
wuffs_gif_decode_1000k_full_init/gcc7        120MB/s ± 1%   119MB/s ± 0%   -0.34%  (p=0.007 n=10+10)
wuffs_gif_decode_1000k_part_init/gcc7        119MB/s ± 1%   119MB/s ± 1%     ~     (p=0.138 n=10+10)
wuffs_gif_decode_anim_screencap/gcc7         640MB/s ± 0%   640MB/s ± 0%     ~     (p=0.905 n=9+10)
diff --git a/internal/cgen/base/image-public.h b/internal/cgen/base/image-public.h
index 39c62ed..52c7bad 100644
--- a/internal/cgen/base/image-public.h
+++ b/internal/cgen/base/image-public.h
@@ -1287,16 +1287,16 @@
 // --------
 
 // TODO: should the func type take restrict pointers?
-typedef uint64_t (*wuffs_base__pixel_swizzler__func)(
-    wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
-    wuffs_base__slice_u8 src);
+typedef uint64_t (*wuffs_base__pixel_swizzler__func)(const uint8_t* scratch1024,
+                                                     wuffs_base__slice_u8 dst,
+                                                     wuffs_base__slice_u8 src);
 
 typedef struct {
   // Do not access the private_impl's fields directly. There is no API/ABI
   // compatibility or safety guarantee if you do so.
   struct {
     wuffs_base__pixel_swizzler__func func;
+    uint8_t scratch1024[1024];
   } private_impl;
 
 #ifdef __cplusplus
@@ -1306,7 +1306,6 @@
                                     wuffs_base__slice_u8 src_palette,
                                     wuffs_base__pixel_blend blend);
   inline uint64_t swizzle_interleaved(wuffs_base__slice_u8 dst,
-                                      wuffs_base__slice_u8 dst_palette,
                                       wuffs_base__slice_u8 src) const;
 #endif  // __cplusplus
 
@@ -1336,7 +1335,6 @@
 wuffs_base__pixel_swizzler__swizzle_interleaved(
     const wuffs_base__pixel_swizzler* p,
     wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src);
 
 #ifdef __cplusplus
@@ -1354,10 +1352,8 @@
 uint64_t  //
 wuffs_base__pixel_swizzler::swizzle_interleaved(
     wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src) const {
-  return wuffs_base__pixel_swizzler__swizzle_interleaved(this, dst, dst_palette,
-                                                         src);
+  return wuffs_base__pixel_swizzler__swizzle_interleaved(this, dst, src);
 }
 
 #endif  // __cplusplus
diff --git a/internal/cgen/base/pixconv-submodule.c b/internal/cgen/base/pixconv-submodule.c
index e041e5c..2479a19 100644
--- a/internal/cgen/base/pixconv-submodule.c
+++ b/internal/cgen/base/pixconv-submodule.c
@@ -446,15 +446,15 @@
 // --------
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__copy_1_1(wuffs_base__slice_u8 dst,
-                                     wuffs_base__slice_u8 dst_palette,
+wuffs_base__pixel_swizzler__copy_1_1(const uint8_t* scratch1024,
+                                     wuffs_base__slice_u8 dst,
                                      wuffs_base__slice_u8 src) {
   return wuffs_base__slice_u8__copy_from_slice(dst, src);
 }
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__copy_3_3(wuffs_base__slice_u8 dst,
-                                     wuffs_base__slice_u8 dst_palette,
+wuffs_base__pixel_swizzler__copy_3_3(const uint8_t* scratch1024,
+                                     wuffs_base__slice_u8 dst,
                                      wuffs_base__slice_u8 src) {
   size_t dst_len3 = dst.len / 3;
   size_t src_len3 = src.len / 3;
@@ -466,8 +466,8 @@
 }
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__copy_4_4(wuffs_base__slice_u8 dst,
-                                     wuffs_base__slice_u8 dst_palette,
+wuffs_base__pixel_swizzler__copy_4_4(const uint8_t* scratch1024,
+                                     wuffs_base__slice_u8 dst,
                                      wuffs_base__slice_u8 src) {
   size_t dst_len4 = dst.len / 4;
   size_t src_len4 = src.len / 4;
@@ -481,8 +481,8 @@
 // --------
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__bgr_565__bgr(wuffs_base__slice_u8 dst,
-                                         wuffs_base__slice_u8 dst_palette,
+wuffs_base__pixel_swizzler__bgr_565__bgr(const uint8_t* scratch1024,
+                                         wuffs_base__slice_u8 dst,
                                          wuffs_base__slice_u8 src) {
   size_t dst_len2 = dst.len / 2;
   size_t src_len3 = src.len / 3;
@@ -510,8 +510,8 @@
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src(
+    const uint8_t* scratch1024,
     wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src) {
   size_t dst_len2 = dst.len / 2;
   size_t src_len4 = src.len / 4;
@@ -539,8 +539,8 @@
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src_over(
+    const uint8_t* scratch1024,
     wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src) {
   size_t dst_len2 = dst.len / 2;
   size_t src_len4 = src.len / 4;
@@ -592,8 +592,8 @@
 }
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__bgr_565__y(wuffs_base__slice_u8 dst,
-                                       wuffs_base__slice_u8 dst_palette,
+wuffs_base__pixel_swizzler__bgr_565__y(const uint8_t* scratch1024,
+                                       wuffs_base__slice_u8 dst,
                                        wuffs_base__slice_u8 src) {
   size_t dst_len2 = dst.len / 2;
   size_t len = dst_len2 < src.len ? dst_len2 : src.len;
@@ -618,13 +618,9 @@
 }
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__bgr_565__index__src(
-    wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
-    wuffs_base__slice_u8 src) {
-  if (dst_palette.len != 1024) {
-    return 0;
-  }
+wuffs_base__pixel_swizzler__bgr_565__index__src(const uint8_t* scratch1024,
+                                                wuffs_base__slice_u8 dst,
+                                                wuffs_base__slice_u8 src) {
   size_t dst_len2 = dst.len / 2;
   size_t len = dst_len2 < src.len ? dst_len2 : src.len;
   uint8_t* d = dst.ptr;
@@ -636,16 +632,16 @@
   while (n >= loop_unroll_count) {
     wuffs_base__store_u16le__no_bounds_check(
         d + (0 * 2), wuffs_base__load_u16le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[0] * 4)));
+                         scratch1024 + ((size_t)s[0] * 4)));
     wuffs_base__store_u16le__no_bounds_check(
         d + (1 * 2), wuffs_base__load_u16le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[1] * 4)));
+                         scratch1024 + ((size_t)s[1] * 4)));
     wuffs_base__store_u16le__no_bounds_check(
         d + (2 * 2), wuffs_base__load_u16le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[2] * 4)));
+                         scratch1024 + ((size_t)s[2] * 4)));
     wuffs_base__store_u16le__no_bounds_check(
         d + (3 * 2), wuffs_base__load_u16le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[3] * 4)));
+                         scratch1024 + ((size_t)s[3] * 4)));
 
     s += loop_unroll_count * 1;
     d += loop_unroll_count * 2;
@@ -655,7 +651,7 @@
   while (n >= 1) {
     wuffs_base__store_u16le__no_bounds_check(
         d + (0 * 2), wuffs_base__load_u16le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[0] * 4)));
+                         scratch1024 + ((size_t)s[0] * 4)));
 
     s += 1 * 1;
     d += 1 * 2;
@@ -667,12 +663,9 @@
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__bgr_565__index_binary_alpha__src_over(
+    const uint8_t* scratch1024,
     wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src) {
-  if (dst_palette.len != 1024) {
-    return 0;
-  }
   size_t dst_len2 = dst.len / 2;
   size_t len = dst_len2 < src.len ? dst_len2 : src.len;
   uint8_t* d = dst.ptr;
@@ -682,7 +675,7 @@
   // TODO: unroll.
 
   while (n >= 1) {
-    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +
                                                           ((size_t)s[0] * 4));
     if (s0) {
       wuffs_base__store_u16le__no_bounds_check(d + (0 * 2), (uint16_t)s0);
@@ -699,10 +692,9 @@
 // --------
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src(
-    wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
-    wuffs_base__slice_u8 src) {
+wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src(const uint8_t* scratch1024,
+                                                     wuffs_base__slice_u8 dst,
+                                                     wuffs_base__slice_u8 src) {
   size_t dst_len3 = dst.len / 3;
   size_t src_len4 = src.len / 4;
   size_t len = dst_len3 < src_len4 ? dst_len3 : src_len4;
@@ -728,8 +720,8 @@
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src_over(
+    const uint8_t* scratch1024,
     wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src) {
   size_t dst_len3 = dst.len / 3;
   size_t src_len4 = src.len / 4;
@@ -775,8 +767,8 @@
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul__src_over(
+    const uint8_t* scratch1024,
     wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src) {
   size_t dst_len4 = dst.len / 4;
   size_t src_len4 = src.len / 4;
@@ -806,8 +798,8 @@
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src(
+    const uint8_t* scratch1024,
     wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src) {
   size_t dst_len4 = dst.len / 4;
   size_t src_len4 = src.len / 4;
@@ -834,8 +826,8 @@
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over(
+    const uint8_t* scratch1024,
     wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src) {
   size_t dst_len4 = dst.len / 4;
   size_t src_len4 = src.len / 4;
@@ -863,12 +855,9 @@
 // --------
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__xxx__index__src(wuffs_base__slice_u8 dst,
-                                            wuffs_base__slice_u8 dst_palette,
+wuffs_base__pixel_swizzler__xxx__index__src(const uint8_t* scratch1024,
+                                            wuffs_base__slice_u8 dst,
                                             wuffs_base__slice_u8 src) {
-  if (dst_palette.len != 1024) {
-    return 0;
-  }
   size_t dst_len3 = dst.len / 3;
   size_t len = dst_len3 < src.len ? dst_len3 : src.len;
   uint8_t* d = dst.ptr;
@@ -887,16 +876,16 @@
   while (n > loop_unroll_count) {
     wuffs_base__store_u32le__no_bounds_check(
         d + (0 * 3), wuffs_base__load_u32le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[0] * 4)));
+                         scratch1024 + ((size_t)s[0] * 4)));
     wuffs_base__store_u32le__no_bounds_check(
         d + (1 * 3), wuffs_base__load_u32le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[1] * 4)));
+                         scratch1024 + ((size_t)s[1] * 4)));
     wuffs_base__store_u32le__no_bounds_check(
         d + (2 * 3), wuffs_base__load_u32le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[2] * 4)));
+                         scratch1024 + ((size_t)s[2] * 4)));
     wuffs_base__store_u32le__no_bounds_check(
         d + (3 * 3), wuffs_base__load_u32le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[3] * 4)));
+                         scratch1024 + ((size_t)s[3] * 4)));
 
     s += loop_unroll_count * 1;
     d += loop_unroll_count * 3;
@@ -904,7 +893,7 @@
   }
 
   while (n >= 1) {
-    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +
                                                           ((size_t)s[0] * 4));
     wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);
 
@@ -918,12 +907,9 @@
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over(
+    const uint8_t* scratch1024,
     wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src) {
-  if (dst_palette.len != 1024) {
-    return 0;
-  }
   size_t dst_len3 = dst.len / 3;
   size_t len = dst_len3 < src.len ? dst_len3 : src.len;
   uint8_t* d = dst.ptr;
@@ -933,22 +919,22 @@
   const size_t loop_unroll_count = 4;
 
   while (n >= loop_unroll_count) {
-    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +
                                                           ((size_t)s[0] * 4));
     if (s0) {
       wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);
     }
-    uint32_t s1 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+    uint32_t s1 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +
                                                           ((size_t)s[1] * 4));
     if (s1) {
       wuffs_base__store_u24le__no_bounds_check(d + (1 * 3), s1);
     }
-    uint32_t s2 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+    uint32_t s2 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +
                                                           ((size_t)s[2] * 4));
     if (s2) {
       wuffs_base__store_u24le__no_bounds_check(d + (2 * 3), s2);
     }
-    uint32_t s3 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+    uint32_t s3 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +
                                                           ((size_t)s[3] * 4));
     if (s3) {
       wuffs_base__store_u24le__no_bounds_check(d + (3 * 3), s3);
@@ -960,7 +946,7 @@
   }
 
   while (n >= 1) {
-    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +
                                                           ((size_t)s[0] * 4));
     if (s0) {
       wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);
@@ -975,8 +961,8 @@
 }
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__xxx__y(wuffs_base__slice_u8 dst,
-                                   wuffs_base__slice_u8 dst_palette,
+wuffs_base__pixel_swizzler__xxx__y(const uint8_t* scratch1024,
+                                   wuffs_base__slice_u8 dst,
                                    wuffs_base__slice_u8 src) {
   size_t dst_len3 = dst.len / 3;
   size_t len = dst_len3 < src.len ? dst_len3 : src.len;
@@ -1003,12 +989,9 @@
 // --------
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__xxxx__index__src(wuffs_base__slice_u8 dst,
-                                             wuffs_base__slice_u8 dst_palette,
+wuffs_base__pixel_swizzler__xxxx__index__src(const uint8_t* scratch1024,
+                                             wuffs_base__slice_u8 dst,
                                              wuffs_base__slice_u8 src) {
-  if (dst_palette.len != 1024) {
-    return 0;
-  }
   size_t dst_len4 = dst.len / 4;
   size_t len = dst_len4 < src.len ? dst_len4 : src.len;
   uint8_t* d = dst.ptr;
@@ -1020,16 +1003,16 @@
   while (n >= loop_unroll_count) {
     wuffs_base__store_u32le__no_bounds_check(
         d + (0 * 4), wuffs_base__load_u32le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[0] * 4)));
+                         scratch1024 + ((size_t)s[0] * 4)));
     wuffs_base__store_u32le__no_bounds_check(
         d + (1 * 4), wuffs_base__load_u32le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[1] * 4)));
+                         scratch1024 + ((size_t)s[1] * 4)));
     wuffs_base__store_u32le__no_bounds_check(
         d + (2 * 4), wuffs_base__load_u32le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[2] * 4)));
+                         scratch1024 + ((size_t)s[2] * 4)));
     wuffs_base__store_u32le__no_bounds_check(
         d + (3 * 4), wuffs_base__load_u32le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[3] * 4)));
+                         scratch1024 + ((size_t)s[3] * 4)));
 
     s += loop_unroll_count * 1;
     d += loop_unroll_count * 4;
@@ -1039,7 +1022,7 @@
   while (n >= 1) {
     wuffs_base__store_u32le__no_bounds_check(
         d + (0 * 4), wuffs_base__load_u32le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[0] * 4)));
+                         scratch1024 + ((size_t)s[0] * 4)));
 
     s += 1 * 1;
     d += 1 * 4;
@@ -1051,12 +1034,9 @@
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over(
+    const uint8_t* scratch1024,
     wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src) {
-  if (dst_palette.len != 1024) {
-    return 0;
-  }
   size_t dst_len4 = dst.len / 4;
   size_t len = dst_len4 < src.len ? dst_len4 : src.len;
   uint8_t* d = dst.ptr;
@@ -1066,22 +1046,22 @@
   const size_t loop_unroll_count = 4;
 
   while (n >= loop_unroll_count) {
-    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +
                                                           ((size_t)s[0] * 4));
     if (s0) {
       wuffs_base__store_u32le__no_bounds_check(d + (0 * 4), s0);
     }
-    uint32_t s1 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+    uint32_t s1 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +
                                                           ((size_t)s[1] * 4));
     if (s1) {
       wuffs_base__store_u32le__no_bounds_check(d + (1 * 4), s1);
     }
-    uint32_t s2 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+    uint32_t s2 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +
                                                           ((size_t)s[2] * 4));
     if (s2) {
       wuffs_base__store_u32le__no_bounds_check(d + (2 * 4), s2);
     }
-    uint32_t s3 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+    uint32_t s3 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +
                                                           ((size_t)s[3] * 4));
     if (s3) {
       wuffs_base__store_u32le__no_bounds_check(d + (3 * 4), s3);
@@ -1093,7 +1073,7 @@
   }
 
   while (n >= 1) {
-    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +
                                                           ((size_t)s[0] * 4));
     if (s0) {
       wuffs_base__store_u32le__no_bounds_check(d + (0 * 4), s0);
@@ -1108,8 +1088,8 @@
 }
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__xxxx__xxx(wuffs_base__slice_u8 dst,
-                                      wuffs_base__slice_u8 dst_palette,
+wuffs_base__pixel_swizzler__xxxx__xxx(const uint8_t* scratch1024,
+                                      wuffs_base__slice_u8 dst,
                                       wuffs_base__slice_u8 src) {
   size_t dst_len4 = dst.len / 4;
   size_t src_len3 = src.len / 3;
@@ -1134,8 +1114,8 @@
 }
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__xxxx__y(wuffs_base__slice_u8 dst,
-                                    wuffs_base__slice_u8 dst_palette,
+wuffs_base__pixel_swizzler__xxxx__y(const uint8_t* scratch1024,
+                                    wuffs_base__slice_u8 dst,
                                     wuffs_base__slice_u8 src) {
   size_t dst_len4 = dst.len / 4;
   size_t len = dst_len4 < src.len ? dst_len4 : src.len;
@@ -1162,7 +1142,6 @@
 static wuffs_base__pixel_swizzler__func  //
 wuffs_base__pixel_swizzler__prepare__y(wuffs_base__pixel_swizzler* p,
                                        wuffs_base__pixel_format dst_format,
-                                       wuffs_base__slice_u8 dst_palette,
                                        wuffs_base__slice_u8 src_palette,
                                        wuffs_base__pixel_blend blend) {
   switch (dst_format.repr) {
@@ -1190,15 +1169,15 @@
 wuffs_base__pixel_swizzler__prepare__indexed__bgra_binary(
     wuffs_base__pixel_swizzler* p,
     wuffs_base__pixel_format dst_format,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src_palette,
     wuffs_base__pixel_blend blend) {
+  wuffs_base__slice_u8 scratch =
+      wuffs_base__make_slice_u8(&(p->private_impl.scratch1024[0]), 1024);
   switch (dst_format.repr) {
     case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL:
     case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_PREMUL:
     case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:
-      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=
-          1024) {
+      if (wuffs_base__slice_u8__copy_from_slice(scratch, src_palette) != 1024) {
         return NULL;
       }
       switch (blend) {
@@ -1208,7 +1187,7 @@
       return NULL;
 
     case WUFFS_BASE__PIXEL_FORMAT__BGR_565:
-      if (wuffs_base__pixel_swizzler__squash_bgr_565_888(dst_palette,
+      if (wuffs_base__pixel_swizzler__squash_bgr_565_888(scratch,
                                                          src_palette) != 1024) {
         return NULL;
       }
@@ -1221,8 +1200,7 @@
       return NULL;
 
     case WUFFS_BASE__PIXEL_FORMAT__BGR:
-      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=
-          1024) {
+      if (wuffs_base__slice_u8__copy_from_slice(scratch, src_palette) != 1024) {
         return NULL;
       }
       switch (blend) {
@@ -1236,8 +1214,7 @@
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:
-      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=
-          1024) {
+      if (wuffs_base__slice_u8__copy_from_slice(scratch, src_palette) != 1024) {
         return NULL;
       }
       switch (blend) {
@@ -1249,8 +1226,8 @@
       return NULL;
 
     case WUFFS_BASE__PIXEL_FORMAT__RGB:
-      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(dst_palette,
-                                                     src_palette) != 1024) {
+      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(scratch, src_palette) !=
+          1024) {
         return NULL;
       }
       switch (blend) {
@@ -1264,8 +1241,8 @@
     case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:
     case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:
     case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:
-      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(dst_palette,
-                                                     src_palette) != 1024) {
+      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(scratch, src_palette) !=
+          1024) {
         return NULL;
       }
       switch (blend) {
@@ -1282,7 +1259,6 @@
 static wuffs_base__pixel_swizzler__func  //
 wuffs_base__pixel_swizzler__prepare__bgr(wuffs_base__pixel_swizzler* p,
                                          wuffs_base__pixel_format dst_format,
-                                         wuffs_base__slice_u8 dst_palette,
                                          wuffs_base__slice_u8 src_palette,
                                          wuffs_base__pixel_blend blend) {
   switch (dst_format.repr) {
@@ -1313,7 +1289,6 @@
 wuffs_base__pixel_swizzler__prepare__bgra_nonpremul(
     wuffs_base__pixel_swizzler* p,
     wuffs_base__pixel_format dst_format,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src_palette,
     wuffs_base__pixel_blend blend) {
   switch (dst_format.repr) {
@@ -1388,39 +1363,46 @@
 
   switch (src_format.repr) {
     case WUFFS_BASE__PIXEL_FORMAT__Y:
-      func = wuffs_base__pixel_swizzler__prepare__y(p, dst_format, dst_palette,
-                                                    src_palette, blend);
+      func = wuffs_base__pixel_swizzler__prepare__y(p, dst_format, src_palette,
+                                                    blend);
       break;
 
     case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:
       func = wuffs_base__pixel_swizzler__prepare__indexed__bgra_binary(
-          p, dst_format, dst_palette, src_palette, blend);
+          p, dst_format, src_palette, blend);
       break;
 
     case WUFFS_BASE__PIXEL_FORMAT__BGR:
-      func = wuffs_base__pixel_swizzler__prepare__bgr(
-          p, dst_format, dst_palette, src_palette, blend);
+      func = wuffs_base__pixel_swizzler__prepare__bgr(p, dst_format,
+                                                      src_palette, blend);
       break;
 
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:
       func = wuffs_base__pixel_swizzler__prepare__bgra_nonpremul(
-          p, dst_format, dst_palette, src_palette, blend);
+          p, dst_format, src_palette, blend);
       break;
   }
 
   p->private_impl.func = func;
-  return wuffs_base__make_status(
-      func ? NULL : wuffs_base__error__unsupported_pixel_swizzler_option);
+  if (!func) {
+    return wuffs_base__make_status(
+        wuffs_base__error__unsupported_pixel_swizzler_option);
+  }
+  if (dst_palette.len == 1024) {
+    const uint8_t* scratch1024 = &(p->private_impl.scratch1024[0]);
+    memcpy(dst_palette.ptr, scratch1024, 1024);
+  }
+  return wuffs_base__make_status(NULL);
 }
 
 WUFFS_BASE__MAYBE_STATIC uint64_t  //
 wuffs_base__pixel_swizzler__swizzle_interleaved(
     const wuffs_base__pixel_swizzler* p,
     wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src) {
   if (p && p->private_impl.func) {
-    return (*p->private_impl.func)(dst, dst_palette, src);
+    const uint8_t* scratch1024 = &(p->private_impl.scratch1024[0]);
+    return (*p->private_impl.func)(scratch1024, dst, src);
   }
   return 0;
 }
diff --git a/internal/cgen/data.go b/internal/cgen/data.go
index 6be82bf..7e96e44 100644
--- a/internal/cgen/data.go
+++ b/internal/cgen/data.go
@@ -135,48 +135,50 @@
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__squash_bgr_565_888(wuffs_base__slice_u8 dst,\n                                               wuffs_base__slice_u8 src) {\n  size_t len4 = (dst.len < src.len ? dst.len : src.len) / 4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n\n  size_t n = len4;\n  while (n--) {\n    uint32_t argb = wuffs_base__load_u32le__no_bounds_check(s);\n    uint32_t b5 = 0x1F & (argb >> (8 - 5));\n    uint32_t g6 = 0x3F & (argb >> (16 - 6));\n    uint32_t r5 = 0x1F & (argb >> (24 - 5));\n    uint32_t alpha = argb & 0xFF000000;\n    wuffs_base__store_u32le__no_bounds_check(\n        d, alpha | (r5 << 11) | (g6 << 5) | (b5 << 0));\n    s += 4;\n    d += 4;\n  }\n  return len4 * 4;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__swap_rgbx_bgrx(wuffs_base__slice_u8 dst,\n                                           wuffs_base__slice_u8 src) {\n  size_t len4 = (dst.len < src.len ? dst.len : src.len) / 4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n\n  size_t n = len4;\n  while (n--) {\n   " +
 	" uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    uint8_t b3 = s[3];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = b3;\n    s += 4;\n    d += 4;\n  }\n  return len4 * 4;\n}\n\n" +
 	"" +
-	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__copy_1_1(wuffs_base__slice_u8 dst,\n                                     wuffs_base__slice_u8 dst_palette,\n                                     wuffs_base__slice_u8 src) {\n  return wuffs_base__slice_u8__copy_from_slice(dst, src);\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__copy_3_3(wuffs_base__slice_u8 dst,\n                                     wuffs_base__slice_u8 dst_palette,\n                                     wuffs_base__slice_u8 src) {\n  size_t dst_len3 = dst.len / 3;\n  size_t src_len3 = src.len / 3;\n  size_t len = dst_len3 < src_len3 ? dst_len3 : src_len3;\n  if (len > 0) {\n    memmove(dst.ptr, src.ptr, len * 3);\n  }\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__copy_4_4(wuffs_base__slice_u8 dst,\n                                     wuffs_base__slice_u8 dst_palette,\n                                     wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len4 <" +
-	" src_len4 ? dst_len4 : src_len4;\n  if (len > 0) {\n    memmove(dst.ptr, src.ptr, len * 4);\n  }\n  return len;\n}\n\n" +
+	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__copy_1_1(const uint8_t* scratch1024,\n                                     wuffs_base__slice_u8 dst,\n                                     wuffs_base__slice_u8 src) {\n  return wuffs_base__slice_u8__copy_from_slice(dst, src);\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__copy_3_3(const uint8_t* scratch1024,\n                                     wuffs_base__slice_u8 dst,\n                                     wuffs_base__slice_u8 src) {\n  size_t dst_len3 = dst.len / 3;\n  size_t src_len3 = src.len / 3;\n  size_t len = dst_len3 < src_len3 ? dst_len3 : src_len3;\n  if (len > 0) {\n    memmove(dst.ptr, src.ptr, len * 3);\n  }\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__copy_4_4(const uint8_t* scratch1024,\n                                     wuffs_base__slice_u8 dst,\n                                     wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len4 < src_len4 ? dst_le" +
+	"n4 : src_len4;\n  if (len > 0) {\n    memmove(dst.ptr, src.ptr, len * 4);\n  }\n  return len;\n}\n\n" +
 	"" +
-	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__bgr(wuffs_base__slice_u8 dst,\n                                         wuffs_base__slice_u8 dst_palette,\n                                         wuffs_base__slice_u8 src) {\n  size_t dst_len2 = dst.len / 2;\n  size_t src_len3 = src.len / 3;\n  size_t len = dst_len2 < src_len3 ? dst_len2 : src_len3;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t b5 = s[0] >> 3;\n    uint32_t g6 = s[1] >> 2;\n    uint32_t r5 = s[2] >> 3;\n    uint32_t rgb_565 = (r5 << 11) | (g6 << 5) | (b5 << 0);\n    wuffs_base__store_u16le__no_bounds_check(d + (0 * 2), (uint16_t)rgb_565);\n\n    s += 1 * 3;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len2 = dst.len / 2;\n  size_t src_len4 = src.len / 4;\n  size_t len = " +
-	"dst_len2 < src_len4 ? dst_len2 : src_len4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (0 * 2),\n        wuffs_base__color_u32_argb_premul__as__color_u16_rgb_565(\n            wuffs_base__color_u32_argb_nonpremul__as__color_u32_argb_premul(\n                wuffs_base__load_u32le__no_bounds_check(s + (0 * 4)))));\n\n    s += 1 * 4;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src_over(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len2 = dst.len / 2;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len2 < src_len4 ? dst_len2 : src_len4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    // Convert from 8-bit color to 16-bit color.\n    uint32_t sa = 0x101 * ((uint32_t)s[3]);\n    uint32_t sr = 0" +
-	"x101 * ((uint32_t)s[2]);\n    uint32_t sg = 0x101 * ((uint32_t)s[1]);\n    uint32_t sb = 0x101 * ((uint32_t)s[0]);\n\n    // Convert from 565 color to 16-bit color.\n    uint32_t old_rgb_565 = wuffs_base__load_u16le__no_bounds_check(d + (0 * 2));\n    uint32_t old_r5 = 0x1F & (old_rgb_565 >> 11);\n    uint32_t dr = (0x8421 * old_r5) >> 4;\n    uint32_t old_g6 = 0x3F & (old_rgb_565 >> 5);\n    uint32_t dg = (0x1041 * old_g6) >> 2;\n    uint32_t old_b5 = 0x1F & (old_rgb_565 >> 0);\n    uint32_t db = (0x8421 * old_b5) >> 4;\n\n    // Calculate the inverse of the src-alpha: how much of the dst to keep.\n    uint32_t ia = 0xFFFF - sa;\n\n    // Composite src (nonpremul) over dst (premul).\n    dr = ((sr * sa) + (dr * ia)) / 0xFFFF;\n    dg = ((sg * sa) + (dg * ia)) / 0xFFFF;\n    db = ((sb * sa) + (db * ia)) / 0xFFFF;\n\n    // Convert from 16-bit color to 565 color and combine the components.\n    uint32_t new_r5 = 0x1F & (dr >> 11);\n    uint32_t new_g6 = 0x3F & (dg >> 10);\n    uint32_t new_b5 = 0x1F & (db >> 11);\n    uint32_t new_rgb" +
-	"_565 = (new_r5 << 11) | (new_g6 << 5) | (new_b5 << 0);\n    wuffs_base__store_u16le__no_bounds_check(d + (0 * 2),\n                                             (uint16_t)new_rgb_565);\n\n    s += 1 * 4;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__y(wuffs_base__slice_u8 dst,\n                                       wuffs_base__slice_u8 dst_palette,\n                                       wuffs_base__slice_u8 src) {\n  size_t dst_len2 = dst.len / 2;\n  size_t len = dst_len2 < src.len ? dst_len2 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t y5 = s[0] >> 3;\n    uint32_t y6 = s[0] >> 2;\n    uint32_t rgb_565 = (y5 << 11) | (y6 << 5) | (y5 << 0);\n    wuffs_base__store_u16le__no_bounds_check(d + (0 * 2), (uint16_t)rgb_565);\n\n    s += 1 * 1;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__index__src(\n    wuffs_base__slice_u8" +
-	" dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  if (dst_palette.len != 1024) {\n    return 0;\n  }\n  size_t dst_len2 = dst.len / 2;\n  size_t len = dst_len2 < src.len ? dst_len2 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (0 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[0] * 4)));\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (1 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[1] * 4)));\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (2 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[2] * 4)));\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (3 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette.ptr" +
-	" + ((size_t)s[3] * 4)));\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 2;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (0 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[0] * 4)));\n\n    s += 1 * 1;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__index_binary_alpha__src_over(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  if (dst_palette.len != 1024) {\n    return 0;\n  }\n  size_t dst_len2 = dst.len / 2;\n  size_t len = dst_len2 < src.len ? dst_len2 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +\n                                                          ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u16le__no_bounds" +
-	"_check(d + (0 * 2), (uint16_t)s0);\n    }\n\n    s += 1 * 1;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
+	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__bgr(const uint8_t* scratch1024,\n                                         wuffs_base__slice_u8 dst,\n                                         wuffs_base__slice_u8 src) {\n  size_t dst_len2 = dst.len / 2;\n  size_t src_len3 = src.len / 3;\n  size_t len = dst_len2 < src_len3 ? dst_len2 : src_len3;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t b5 = s[0] >> 3;\n    uint32_t g6 = s[1] >> 2;\n    uint32_t r5 = s[2] >> 3;\n    uint32_t rgb_565 = (r5 << 11) | (g6 << 5) | (b5 << 0);\n    wuffs_base__store_u16le__no_bounds_check(d + (0 * 2), (uint16_t)rgb_565);\n\n    s += 1 * 3;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src(\n    const uint8_t* scratch1024,\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len2 = dst.len / 2;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len2 < s" +
+	"rc_len4 ? dst_len2 : src_len4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (0 * 2),\n        wuffs_base__color_u32_argb_premul__as__color_u16_rgb_565(\n            wuffs_base__color_u32_argb_nonpremul__as__color_u32_argb_premul(\n                wuffs_base__load_u32le__no_bounds_check(s + (0 * 4)))));\n\n    s += 1 * 4;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src_over(\n    const uint8_t* scratch1024,\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len2 = dst.len / 2;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len2 < src_len4 ? dst_len2 : src_len4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    // Convert from 8-bit color to 16-bit color.\n    uint32_t sa = 0x101 * ((uint32_t)s[3]);\n    uint32_t sr = 0x101 * ((uint32_t)" +
+	"s[2]);\n    uint32_t sg = 0x101 * ((uint32_t)s[1]);\n    uint32_t sb = 0x101 * ((uint32_t)s[0]);\n\n    // Convert from 565 color to 16-bit color.\n    uint32_t old_rgb_565 = wuffs_base__load_u16le__no_bounds_check(d + (0 * 2));\n    uint32_t old_r5 = 0x1F & (old_rgb_565 >> 11);\n    uint32_t dr = (0x8421 * old_r5) >> 4;\n    uint32_t old_g6 = 0x3F & (old_rgb_565 >> 5);\n    uint32_t dg = (0x1041 * old_g6) >> 2;\n    uint32_t old_b5 = 0x1F & (old_rgb_565 >> 0);\n    uint32_t db = (0x8421 * old_b5) >> 4;\n\n    // Calculate the inverse of the src-alpha: how much of the dst to keep.\n    uint32_t ia = 0xFFFF - sa;\n\n    // Composite src (nonpremul) over dst (premul).\n    dr = ((sr * sa) + (dr * ia)) / 0xFFFF;\n    dg = ((sg * sa) + (dg * ia)) / 0xFFFF;\n    db = ((sb * sa) + (db * ia)) / 0xFFFF;\n\n    // Convert from 16-bit color to 565 color and combine the components.\n    uint32_t new_r5 = 0x1F & (dr >> 11);\n    uint32_t new_g6 = 0x3F & (dg >> 10);\n    uint32_t new_b5 = 0x1F & (db >> 11);\n    uint32_t new_rgb_565 = (new_r5 << " +
+	"11) | (new_g6 << 5) | (new_b5 << 0);\n    wuffs_base__store_u16le__no_bounds_check(d + (0 * 2),\n                                             (uint16_t)new_rgb_565);\n\n    s += 1 * 4;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__y(const uint8_t* scratch1024,\n                                       wuffs_base__slice_u8 dst,\n                                       wuffs_base__slice_u8 src) {\n  size_t dst_len2 = dst.len / 2;\n  size_t len = dst_len2 < src.len ? dst_len2 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t y5 = s[0] >> 3;\n    uint32_t y6 = s[0] >> 2;\n    uint32_t rgb_565 = (y5 << 11) | (y6 << 5) | (y5 << 0);\n    wuffs_base__store_u16le__no_bounds_check(d + (0 * 2), (uint16_t)rgb_565);\n\n    s += 1 * 1;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__index__src(const uint8_t* scratch1024,\n                     " +
+	"                           wuffs_base__slice_u8 dst,\n                                                wuffs_base__slice_u8 src) {\n  size_t dst_len2 = dst.len / 2;\n  size_t len = dst_len2 < src.len ? dst_len2 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (0 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         scratch1024 + ((size_t)s[0] * 4)));\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (1 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         scratch1024 + ((size_t)s[1] * 4)));\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (2 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         scratch1024 + ((size_t)s[2] * 4)));\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (3 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         scratch1024 + ((size_t)s[" +
+	"3] * 4)));\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 2;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (0 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         scratch1024 + ((size_t)s[0] * 4)));\n\n    s += 1 * 1;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__index_binary_alpha__src_over(\n    const uint8_t* scratch1024,\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len2 = dst.len / 2;\n  size_t len = dst_len2 < src.len ? dst_len2 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +\n                                                          ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u16le__no_bounds_check(d + (0 * 2), (uint16_t)s0);\n    }\n\n    s += 1 * 1;\n    d += 1 * 2;\n    n" +
+	" -= 1;\n  }\n\n  return len;\n}\n\n" +
 	"" +
-	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len3 = dst.len / 3;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len3 < src_len4 ? dst_len3 : src_len4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t s0 =\n        wuffs_base__color_u32_argb_nonpremul__as__color_u32_argb_premul(\n            wuffs_base__load_u32le__no_bounds_check(s + (0 * 4)));\n    wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);\n\n    s += 1 * 4;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src_over(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len3 = dst.len / 3;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len3 < src_len4 ? dst_len3 : src_len4;\n  uint8_t* d = d" +
-	"st.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    // Convert from 8-bit color to 16-bit color.\n    uint32_t sa = 0x101 * ((uint32_t)s[3]);\n    uint32_t sr = 0x101 * ((uint32_t)s[2]);\n    uint32_t sg = 0x101 * ((uint32_t)s[1]);\n    uint32_t sb = 0x101 * ((uint32_t)s[0]);\n    uint32_t dr = 0x101 * ((uint32_t)d[2]);\n    uint32_t dg = 0x101 * ((uint32_t)d[1]);\n    uint32_t db = 0x101 * ((uint32_t)d[0]);\n\n    // Calculate the inverse of the src-alpha: how much of the dst to keep.\n    uint32_t ia = 0xFFFF - sa;\n\n    // Composite src (nonpremul) over dst (premul).\n    dr = ((sr * sa) + (dr * ia)) / 0xFFFF;\n    dg = ((sg * sa) + (dg * ia)) / 0xFFFF;\n    db = ((sb * sa) + (db * ia)) / 0xFFFF;\n\n    // Convert from 16-bit color to 8-bit color.\n    d[0] = (uint8_t)(db >> 8);\n    d[1] = (uint8_t)(dg >> 8);\n    d[2] = (uint8_t)(dr >> 8);\n\n    s += 1 * 4;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
+	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src(const uint8_t* scratch1024,\n                                                     wuffs_base__slice_u8 dst,\n                                                     wuffs_base__slice_u8 src) {\n  size_t dst_len3 = dst.len / 3;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len3 < src_len4 ? dst_len3 : src_len4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t s0 =\n        wuffs_base__color_u32_argb_nonpremul__as__color_u32_argb_premul(\n            wuffs_base__load_u32le__no_bounds_check(s + (0 * 4)));\n    wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);\n\n    s += 1 * 4;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src_over(\n    const uint8_t* scratch1024,\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len3 = dst.len / 3;\n  size_t src_len4 = src.le" +
+	"n / 4;\n  size_t len = dst_len3 < src_len4 ? dst_len3 : src_len4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    // Convert from 8-bit color to 16-bit color.\n    uint32_t sa = 0x101 * ((uint32_t)s[3]);\n    uint32_t sr = 0x101 * ((uint32_t)s[2]);\n    uint32_t sg = 0x101 * ((uint32_t)s[1]);\n    uint32_t sb = 0x101 * ((uint32_t)s[0]);\n    uint32_t dr = 0x101 * ((uint32_t)d[2]);\n    uint32_t dg = 0x101 * ((uint32_t)d[1]);\n    uint32_t db = 0x101 * ((uint32_t)d[0]);\n\n    // Calculate the inverse of the src-alpha: how much of the dst to keep.\n    uint32_t ia = 0xFFFF - sa;\n\n    // Composite src (nonpremul) over dst (premul).\n    dr = ((sr * sa) + (dr * ia)) / 0xFFFF;\n    dg = ((sg * sa) + (dg * ia)) / 0xFFFF;\n    db = ((sb * sa) + (db * ia)) / 0xFFFF;\n\n    // Convert from 16-bit color to 8-bit color.\n    d[0] = (uint8_t)(db >> 8);\n    d[1] = (uint8_t)(dg >> 8);\n    d[2] = (uint8_t)(dr >> 8);\n\n    s += 1 * 4;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;" +
+	"\n}\n\n" +
 	"" +
-	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul__src_over(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len4 < src_len4 ? dst_len4 : src_len4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t d0 = wuffs_base__load_u32le__no_bounds_check(d + (0 * 4));\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(s + (0 * 4));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4),\n        wuffs_base__composite_nonpremul_nonpremul_u32_axxx(d0, s0));\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
+	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul__src_over(\n    const uint8_t* scratch1024,\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len4 < src_len4 ? dst_len4 : src_len4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t d0 = wuffs_base__load_u32le__no_bounds_check(d + (0 * 4));\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(s + (0 * 4));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4),\n        wuffs_base__composite_nonpremul_nonpremul_u32_axxx(d0, s0));\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
 	"" +
-	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len4 < src_len4 ? dst_len4 : src_len4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(s + (0 * 4));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4),\n        wuffs_base__color_u32_argb_nonpremul__as__color_u32_argb_premul(s0));\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len4 < src_len4 ? dst_len4 : src_len4;\n  ui" +
-	"nt8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t d0 = wuffs_base__load_u32le__no_bounds_check(d + (0 * 4));\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(s + (0 * 4));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), wuffs_base__composite_premul_nonpremul_u32_axxx(d0, s0));\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
+	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src(\n    const uint8_t* scratch1024,\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len4 < src_len4 ? dst_len4 : src_len4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(s + (0 * 4));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4),\n        wuffs_base__color_u32_argb_nonpremul__as__color_u32_argb_premul(s0));\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over(\n    const uint8_t* scratch1024,\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len4 < src_len4 ? dst_len4 : src_len4;\n  uint8_t* d = d" +
+	"st.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t d0 = wuffs_base__load_u32le__no_bounds_check(d + (0 * 4));\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(s + (0 * 4));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), wuffs_base__composite_premul_nonpremul_u32_axxx(d0, s0));\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
 	"" +
-	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxx__index__src(wuffs_base__slice_u8 dst,\n                                            wuffs_base__slice_u8 dst_palette,\n                                            wuffs_base__slice_u8 src) {\n  if (dst_palette.len != 1024) {\n    return 0;\n  }\n  size_t dst_len3 = dst.len / 3;\n  size_t len = dst_len3 < src.len ? dst_len3 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  // The comparison in the while condition is \">\", not \">=\", because with\n  // \">=\", the last 4-byte store could write past the end of the dst slice.\n  //\n  // Each 4-byte store writes one too many bytes, but a subsequent store\n  // will overwrite that with the correct byte. There is always another\n  // store, whether a 4-byte store in this loop or a 1-byte store in the\n  // next loop.\n  while (n > loop_unroll_count) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 3), wuffs_base__load_u32le__no_bounds_c" +
-	"heck(\n                         dst_palette.ptr + ((size_t)s[0] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (1 * 3), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[1] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (2 * 3), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[2] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (3 * 3), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[3] * 4)));\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 3;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +\n                                                          ((size_t)s[0] * 4));\n    wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);\n\n    s += 1 * 1;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__" +
-	"pixel_swizzler__xxx__index_binary_alpha__src_over(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  if (dst_palette.len != 1024) {\n    return 0;\n  }\n  size_t dst_len3 = dst.len / 3;\n  size_t len = dst_len3 < src.len ? dst_len3 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +\n                                                          ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);\n    }\n    uint32_t s1 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +\n                                                          ((size_t)s[1] * 4));\n    if (s1) {\n      wuffs_base__store_u24le__no_bounds_check(d + (1 * 3), s1);\n    }\n    uint32_t s2 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +\n                                    " +
-	"                      ((size_t)s[2] * 4));\n    if (s2) {\n      wuffs_base__store_u24le__no_bounds_check(d + (2 * 3), s2);\n    }\n    uint32_t s3 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +\n                                                          ((size_t)s[3] * 4));\n    if (s3) {\n      wuffs_base__store_u24le__no_bounds_check(d + (3 * 3), s3);\n    }\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 3;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +\n                                                          ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);\n    }\n\n    s += 1 * 1;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxx__y(wuffs_base__slice_u8 dst,\n                                   wuffs_base__slice_u8 dst_palette,\n                                   wuffs_base__slice_u8 src) {\n  size_t dst_len3 =" +
-	" dst.len / 3;\n  size_t len = dst_len3 < src.len ? dst_len3 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint8_t s0 = s[0];\n    d[0] = s0;\n    d[1] = s0;\n    d[2] = s0;\n\n    s += 1 * 1;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
+	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxx__index__src(const uint8_t* scratch1024,\n                                            wuffs_base__slice_u8 dst,\n                                            wuffs_base__slice_u8 src) {\n  size_t dst_len3 = dst.len / 3;\n  size_t len = dst_len3 < src.len ? dst_len3 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  // The comparison in the while condition is \">\", not \">=\", because with\n  // \">=\", the last 4-byte store could write past the end of the dst slice.\n  //\n  // Each 4-byte store writes one too many bytes, but a subsequent store\n  // will overwrite that with the correct byte. There is always another\n  // store, whether a 4-byte store in this loop or a 1-byte store in the\n  // next loop.\n  while (n > loop_unroll_count) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 3), wuffs_base__load_u32le__no_bounds_check(\n                         scratch1024 + ((size_t)s[0" +
+	"] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (1 * 3), wuffs_base__load_u32le__no_bounds_check(\n                         scratch1024 + ((size_t)s[1] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (2 * 3), wuffs_base__load_u32le__no_bounds_check(\n                         scratch1024 + ((size_t)s[2] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (3 * 3), wuffs_base__load_u32le__no_bounds_check(\n                         scratch1024 + ((size_t)s[3] * 4)));\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 3;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +\n                                                          ((size_t)s[0] * 4));\n    wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);\n\n    s += 1 * 1;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over(\n    const uint8_t* scratch" +
+	"1024,\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len3 = dst.len / 3;\n  size_t len = dst_len3 < src.len ? dst_len3 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +\n                                                          ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);\n    }\n    uint32_t s1 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +\n                                                          ((size_t)s[1] * 4));\n    if (s1) {\n      wuffs_base__store_u24le__no_bounds_check(d + (1 * 3), s1);\n    }\n    uint32_t s2 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +\n                                                          ((size_t)s[2] * 4));\n    if (s2) {\n      wuffs_base__store_u24le__no_bounds_check(d + (2 * 3), s2);\n    }\n    uint32_t s3 = " +
+	"wuffs_base__load_u32le__no_bounds_check(scratch1024 +\n                                                          ((size_t)s[3] * 4));\n    if (s3) {\n      wuffs_base__store_u24le__no_bounds_check(d + (3 * 3), s3);\n    }\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 3;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +\n                                                          ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);\n    }\n\n    s += 1 * 1;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxx__y(const uint8_t* scratch1024,\n                                   wuffs_base__slice_u8 dst,\n                                   wuffs_base__slice_u8 src) {\n  size_t dst_len3 = dst.len / 3;\n  size_t len = dst_len3 < src.len ? dst_len3 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  w" +
+	"hile (n >= 1) {\n    uint8_t s0 = s[0];\n    d[0] = s0;\n    d[1] = s0;\n    d[2] = s0;\n\n    s += 1 * 1;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
 	"" +
-	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__index__src(wuffs_base__slice_u8 dst,\n                                             wuffs_base__slice_u8 dst_palette,\n                                             wuffs_base__slice_u8 src) {\n  if (dst_palette.len != 1024) {\n    return 0;\n  }\n  size_t dst_len4 = dst.len / 4;\n  size_t len = dst_len4 < src.len ? dst_len4 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[0] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (1 * 4), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[1] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (2 * 4), wuffs_base__load_u32le__no_bounds_check(\n                         dst_pale" +
-	"tte.ptr + ((size_t)s[2] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (3 * 4), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[3] * 4)));\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 4;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[0] * 4)));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  if (dst_palette.len != 1024) {\n    return 0;\n  }\n  size_t dst_len4 = dst.len / 4;\n  size_t len = dst_len4 < src.len ? dst_len4 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count)" +
-	" {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +\n                                                          ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u32le__no_bounds_check(d + (0 * 4), s0);\n    }\n    uint32_t s1 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +\n                                                          ((size_t)s[1] * 4));\n    if (s1) {\n      wuffs_base__store_u32le__no_bounds_check(d + (1 * 4), s1);\n    }\n    uint32_t s2 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +\n                                                          ((size_t)s[2] * 4));\n    if (s2) {\n      wuffs_base__store_u32le__no_bounds_check(d + (2 * 4), s2);\n    }\n    uint32_t s3 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +\n                                                          ((size_t)s[3] * 4));\n    if (s3) {\n      wuffs_base__store_u32le__no_bounds_check(d + (3 * 4), s3);\n    }\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count *" +
-	" 4;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +\n                                                          ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u32le__no_bounds_check(d + (0 * 4), s0);\n    }\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__xxx(wuffs_base__slice_u8 dst,\n                                      wuffs_base__slice_u8 dst_palette,\n                                      wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len3 = src.len / 3;\n  size_t len = dst_len4 < src_len3 ? dst_len4 : src_len3;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4),\n        0xFF000000 | wuffs_base__load_u24le__no_bounds_check(s + (0 * 3)));\n\n    s += 1 * 3;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n" +
-	"\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__y(wuffs_base__slice_u8 dst,\n                                    wuffs_base__slice_u8 dst_palette,\n                                    wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t len = dst_len4 < src.len ? dst_len4 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
+	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__index__src(const uint8_t* scratch1024,\n                                             wuffs_base__slice_u8 dst,\n                                             wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t len = dst_len4 < src.len ? dst_len4 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), wuffs_base__load_u32le__no_bounds_check(\n                         scratch1024 + ((size_t)s[0] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (1 * 4), wuffs_base__load_u32le__no_bounds_check(\n                         scratch1024 + ((size_t)s[1] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (2 * 4), wuffs_base__load_u32le__no_bounds_check(\n                         scratch1024 + ((size_t)s[2] * 4)));\n    wuffs_base__store_u32le__no_bound" +
+	"s_check(\n        d + (3 * 4), wuffs_base__load_u32le__no_bounds_check(\n                         scratch1024 + ((size_t)s[3] * 4)));\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 4;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), wuffs_base__load_u32le__no_bounds_check(\n                         scratch1024 + ((size_t)s[0] * 4)));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over(\n    const uint8_t* scratch1024,\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t len = dst_len4 < src.len ? dst_len4 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +\n                                                          (" +
+	"(size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u32le__no_bounds_check(d + (0 * 4), s0);\n    }\n    uint32_t s1 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +\n                                                          ((size_t)s[1] * 4));\n    if (s1) {\n      wuffs_base__store_u32le__no_bounds_check(d + (1 * 4), s1);\n    }\n    uint32_t s2 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +\n                                                          ((size_t)s[2] * 4));\n    if (s2) {\n      wuffs_base__store_u32le__no_bounds_check(d + (2 * 4), s2);\n    }\n    uint32_t s3 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +\n                                                          ((size_t)s[3] * 4));\n    if (s3) {\n      wuffs_base__store_u32le__no_bounds_check(d + (3 * 4), s3);\n    }\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 4;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +\n                      " +
+	"                                    ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u32le__no_bounds_check(d + (0 * 4), s0);\n    }\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__xxx(const uint8_t* scratch1024,\n                                      wuffs_base__slice_u8 dst,\n                                      wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len3 = src.len / 3;\n  size_t len = dst_len4 < src_len3 ? dst_len4 : src_len3;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4),\n        0xFF000000 | wuffs_base__load_u24le__no_bounds_check(s + (0 * 3)));\n\n    s += 1 * 3;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__y(const uint8_t* scratch1024,\n                                    wuffs_base__slice_u8 dst,\n             " +
+	"                       wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t len = dst_len4 < src.len ? dst_len4 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
 	"" +
-	"// --------\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__y(wuffs_base__pixel_swizzler* p,\n                                       wuffs_base__pixel_format dst_format,\n                                       wuffs_base__slice_u8 dst_palette,\n                                       wuffs_base__slice_u8 src_palette,\n                                       wuffs_base__pixel_blend blend) {\n  switch (dst_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__y;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      return wuffs_base__pixel_swizzler__xxx__y;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WU" +
-	"FFS_BASE__PIXEL_FORMAT__RGBX:\n      return wuffs_base__pixel_swizzler__xxxx__y;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__indexed__bgra_binary(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_format,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__copy_1_1;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      if (wuffs_base__pixel_swizzler__squash_bgr_565_888(dst_palette,\n                                        " +
-	"                 src_palette) != 1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_pa" +
-	"lette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(dst_palette,\n                                                     src_palette) != 1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n      if (wuffs_base__pixel_swizzler__" +
-	"swap_rgbx_bgrx(dst_palette,\n                                                     src_palette) != 1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgr(wuffs_base__pixel_swizzler* p,\n                                         wuffs_base__pixel_format dst_format,\n                                         wuffs_base__slice_u8 dst_palette,\n                                         wuffs_base__slice_u8 src_palette,\n                                         wuffs_base__pixel_blend blend) {\n  switch (dst_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__bgr;\n\n    case WUFFS_BASE__PIXEL_FORMAT__B" +
-	"GR:\n      return wuffs_base__pixel_swizzler__copy_3_3;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      return wuffs_base__pixel_swizzler__xxxx__xxx;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgra_nonpremul(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_format,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_b" +
-	"ase__pixel_swizzler__bgr_565__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__copy_4_4;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swiz" +
-	"zler__bgra_premul__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\n" +
+	"// --------\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__y(wuffs_base__pixel_swizzler* p,\n                                       wuffs_base__pixel_format dst_format,\n                                       wuffs_base__slice_u8 src_palette,\n                                       wuffs_base__pixel_blend blend) {\n  switch (dst_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__y;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      return wuffs_base__pixel_swizzler__xxx__y;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      return wuffs_base__pixel_swizzler__xx" +
+	"xx__y;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__indexed__bgra_binary(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_format,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  wuffs_base__slice_u8 scratch =\n      wuffs_base__make_slice_u8(&(p->private_impl.scratch1024[0]), 1024);\n  switch (dst_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:\n      if (wuffs_base__slice_u8__copy_from_slice(scratch, src_palette) != 1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__copy_1_1;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      if (wuffs_base__pixel_swizzler__squash_bgr_565_888(scratch,\n                                                         src_p" +
+	"alette) != 1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      if (wuffs_base__slice_u8__copy_from_slice(scratch, src_palette) != 1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      if (wuffs_base__slice_u8__copy_from_slice(scratch, src_palette) != 1024) {\n        return NULL;\n " +
+	"     }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(scratch, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(scratch, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      swit" +
+	"ch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgr(wuffs_base__pixel_swizzler* p,\n                                         wuffs_base__pixel_format dst_format,\n                                         wuffs_base__slice_u8 src_palette,\n                                         wuffs_base__pixel_blend blend) {\n  switch (dst_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__bgr;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      return wuffs_base__pixel_swizzler__copy_3_3;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_B" +
+	"ASE__PIXEL_FORMAT__BGRX:\n      return wuffs_base__pixel_swizzler__xxxx__xxx;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgra_nonpremul(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_format,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      swit" +
+	"ch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__copy_4_4;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case " +
+	"WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\n" +
 	"" +
-	"// --------\n\nWUFFS_BASE__MAYBE_STATIC wuffs_base__status  //\nwuffs_base__pixel_swizzler__prepare(wuffs_base__pixel_swizzler* p,\n                                    wuffs_base__pixel_format dst_format,\n                                    wuffs_base__slice_u8 dst_palette,\n                                    wuffs_base__pixel_format src_format,\n                                    wuffs_base__slice_u8 src_palette,\n                                    wuffs_base__pixel_blend blend) {\n  if (!p) {\n    return wuffs_base__make_status(wuffs_base__error__bad_receiver);\n  }\n\n  // TODO: support many more formats.\n\n  wuffs_base__pixel_swizzler__func func = NULL;\n\n  switch (src_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__Y:\n      func = wuffs_base__pixel_swizzler__prepare__y(p, dst_format, dst_palette,\n                                                    src_palette, blend);\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:\n      func = wuffs_base__pixel_swizzler__prepare__indexed__bgra_binary(\n    " +
-	"      p, dst_format, dst_palette, src_palette, blend);\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      func = wuffs_base__pixel_swizzler__prepare__bgr(\n          p, dst_format, dst_palette, src_palette, blend);\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      func = wuffs_base__pixel_swizzler__prepare__bgra_nonpremul(\n          p, dst_format, dst_palette, src_palette, blend);\n      break;\n  }\n\n  p->private_impl.func = func;\n  return wuffs_base__make_status(\n      func ? NULL : wuffs_base__error__unsupported_pixel_swizzler_option);\n}\n\nWUFFS_BASE__MAYBE_STATIC uint64_t  //\nwuffs_base__pixel_swizzler__swizzle_interleaved(\n    const wuffs_base__pixel_swizzler* p,\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  if (p && p->private_impl.func) {\n    return (*p->private_impl.func)(dst, dst_palette, src);\n  }\n  return 0;\n}\n" +
+	"// --------\n\nWUFFS_BASE__MAYBE_STATIC wuffs_base__status  //\nwuffs_base__pixel_swizzler__prepare(wuffs_base__pixel_swizzler* p,\n                                    wuffs_base__pixel_format dst_format,\n                                    wuffs_base__slice_u8 dst_palette,\n                                    wuffs_base__pixel_format src_format,\n                                    wuffs_base__slice_u8 src_palette,\n                                    wuffs_base__pixel_blend blend) {\n  if (!p) {\n    return wuffs_base__make_status(wuffs_base__error__bad_receiver);\n  }\n\n  // TODO: support many more formats.\n\n  wuffs_base__pixel_swizzler__func func = NULL;\n\n  switch (src_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__Y:\n      func = wuffs_base__pixel_swizzler__prepare__y(p, dst_format, src_palette,\n                                                    blend);\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:\n      func = wuffs_base__pixel_swizzler__prepare__indexed__bgra_binary(\n          p, dst_" +
+	"format, src_palette, blend);\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      func = wuffs_base__pixel_swizzler__prepare__bgr(p, dst_format,\n                                                      src_palette, blend);\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      func = wuffs_base__pixel_swizzler__prepare__bgra_nonpremul(\n          p, dst_format, src_palette, blend);\n      break;\n  }\n\n  p->private_impl.func = func;\n  if (!func) {\n    return wuffs_base__make_status(\n        wuffs_base__error__unsupported_pixel_swizzler_option);\n  }\n  if (dst_palette.len == 1024) {\n    const uint8_t* scratch1024 = &(p->private_impl.scratch1024[0]);\n    memcpy(dst_palette.ptr, scratch1024, 1024);\n  }\n  return wuffs_base__make_status(NULL);\n}\n\nWUFFS_BASE__MAYBE_STATIC uint64_t  //\nwuffs_base__pixel_swizzler__swizzle_interleaved(\n    const wuffs_base__pixel_swizzler* p,\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 src) {\n  if (p && p->private_impl.func) {\n    const uint8_t* scratch1024 =" +
+	" &(p->private_impl.scratch1024[0]);\n    return (*p->private_impl.func)(scratch1024, dst, src);\n  }\n  return 0;\n}\n" +
 	""
 
 const baseFundamentalPrivateH = "" +
@@ -318,10 +320,10 @@
 	"" +
 	"// --------\n\n// wuffs_base__pixel_palette__closest_element returns the index of the palette\n// element that minimizes the sum of squared differences of the four ARGB\n// channels, working in premultiplied alpha. Ties favor the smaller index.\n//\n// The palette_slice.len may equal (N*4), for N less than 256, which means that\n// only the first N palette elements are considered. It returns 0 when N is 0.\n//\n// Applying this function on a per-pixel basis will not produce whole-of-image\n// dithering.\nWUFFS_BASE__MAYBE_STATIC uint8_t  //\nwuffs_base__pixel_palette__closest_element(\n    wuffs_base__slice_u8 palette_slice,\n    wuffs_base__pixel_format palette_format,\n    wuffs_base__color_u32_argb_premul c);\n\n" +
 	"" +
-	"// --------\n\n// TODO: should the func type take restrict pointers?\ntypedef uint64_t (*wuffs_base__pixel_swizzler__func)(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src);\n\ntypedef struct {\n  // Do not access the private_impl's fields directly. There is no API/ABI\n  // compatibility or safety guarantee if you do so.\n  struct {\n    wuffs_base__pixel_swizzler__func func;\n  } private_impl;\n\n#ifdef __cplusplus\n  inline wuffs_base__status prepare(wuffs_base__pixel_format dst_format,\n                                    wuffs_base__slice_u8 dst_palette,\n                                    wuffs_base__pixel_format src_format,\n                                    wuffs_base__slice_u8 src_palette,\n                                    wuffs_base__pixel_blend blend);\n  inline uint64_t swizzle_interleaved(wuffs_base__slice_u8 dst,\n                                      wuffs_base__slice_u8 dst_palette,\n                                      wuffs_base__slice_u8 src) const;\n#endi" +
-	"f  // __cplusplus\n\n} wuffs_base__pixel_swizzler;\n\n// wuffs_base__pixel_swizzler__prepare readies the pixel swizzler so that its\n// other methods may be called.\n//\n// For modular builds that divide the base module into sub-modules, using this\n// function requires the WUFFS_CONFIG__MODULE__BASE__PIXCONV sub-module, not\n// just WUFFS_CONFIG__MODULE__BASE__CORE.\nWUFFS_BASE__MAYBE_STATIC wuffs_base__status  //\nwuffs_base__pixel_swizzler__prepare(wuffs_base__pixel_swizzler* p,\n                                    wuffs_base__pixel_format dst_format,\n                                    wuffs_base__slice_u8 dst_palette,\n                                    wuffs_base__pixel_format src_format,\n                                    wuffs_base__slice_u8 src_palette,\n                                    wuffs_base__pixel_blend blend);\n\n// wuffs_base__pixel_swizzler__swizzle_interleaved converts pixels from a\n// source format to a destination format.\n//\n// For modular builds that divide the base module into sub-modules, using " +
-	"this\n// function requires the WUFFS_CONFIG__MODULE__BASE__PIXCONV sub-module, not\n// just WUFFS_CONFIG__MODULE__BASE__CORE.\nWUFFS_BASE__MAYBE_STATIC uint64_t  //\nwuffs_base__pixel_swizzler__swizzle_interleaved(\n    const wuffs_base__pixel_swizzler* p,\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src);\n\n#ifdef __cplusplus\n\ninline wuffs_base__status  //\nwuffs_base__pixel_swizzler::prepare(wuffs_base__pixel_format dst_format,\n                                    wuffs_base__slice_u8 dst_palette,\n                                    wuffs_base__pixel_format src_format,\n                                    wuffs_base__slice_u8 src_palette,\n                                    wuffs_base__pixel_blend blend) {\n  return wuffs_base__pixel_swizzler__prepare(this, dst_format, dst_palette,\n                                             src_format, src_palette, blend);\n}\n\nuint64_t  //\nwuffs_base__pixel_swizzler::swizzle_interleaved(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__sl" +
-	"ice_u8 dst_palette,\n    wuffs_base__slice_u8 src) const {\n  return wuffs_base__pixel_swizzler__swizzle_interleaved(this, dst, dst_palette,\n                                                         src);\n}\n\n#endif  // __cplusplus\n" +
+	"// --------\n\n// TODO: should the func type take restrict pointers?\ntypedef uint64_t (*wuffs_base__pixel_swizzler__func)(const uint8_t* scratch1024,\n                                                     wuffs_base__slice_u8 dst,\n                                                     wuffs_base__slice_u8 src);\n\ntypedef struct {\n  // Do not access the private_impl's fields directly. There is no API/ABI\n  // compatibility or safety guarantee if you do so.\n  struct {\n    wuffs_base__pixel_swizzler__func func;\n    uint8_t scratch1024[1024];\n  } private_impl;\n\n#ifdef __cplusplus\n  inline wuffs_base__status prepare(wuffs_base__pixel_format dst_format,\n                                    wuffs_base__slice_u8 dst_palette,\n                                    wuffs_base__pixel_format src_format,\n                                    wuffs_base__slice_u8 src_palette,\n                                    wuffs_base__pixel_blend blend);\n  inline uint64_t swizzle_interleaved(wuffs_base__slice_u8 dst,\n                              " +
+	"        wuffs_base__slice_u8 src) const;\n#endif  // __cplusplus\n\n} wuffs_base__pixel_swizzler;\n\n// wuffs_base__pixel_swizzler__prepare readies the pixel swizzler so that its\n// other methods may be called.\n//\n// For modular builds that divide the base module into sub-modules, using this\n// function requires the WUFFS_CONFIG__MODULE__BASE__PIXCONV sub-module, not\n// just WUFFS_CONFIG__MODULE__BASE__CORE.\nWUFFS_BASE__MAYBE_STATIC wuffs_base__status  //\nwuffs_base__pixel_swizzler__prepare(wuffs_base__pixel_swizzler* p,\n                                    wuffs_base__pixel_format dst_format,\n                                    wuffs_base__slice_u8 dst_palette,\n                                    wuffs_base__pixel_format src_format,\n                                    wuffs_base__slice_u8 src_palette,\n                                    wuffs_base__pixel_blend blend);\n\n// wuffs_base__pixel_swizzler__swizzle_interleaved converts pixels from a\n// source format to a destination format.\n//\n// For modular builds that d" +
+	"ivide the base module into sub-modules, using this\n// function requires the WUFFS_CONFIG__MODULE__BASE__PIXCONV sub-module, not\n// just WUFFS_CONFIG__MODULE__BASE__CORE.\nWUFFS_BASE__MAYBE_STATIC uint64_t  //\nwuffs_base__pixel_swizzler__swizzle_interleaved(\n    const wuffs_base__pixel_swizzler* p,\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 src);\n\n#ifdef __cplusplus\n\ninline wuffs_base__status  //\nwuffs_base__pixel_swizzler::prepare(wuffs_base__pixel_format dst_format,\n                                    wuffs_base__slice_u8 dst_palette,\n                                    wuffs_base__pixel_format src_format,\n                                    wuffs_base__slice_u8 src_palette,\n                                    wuffs_base__pixel_blend blend) {\n  return wuffs_base__pixel_swizzler__prepare(this, dst_format, dst_palette,\n                                             src_format, src_palette, blend);\n}\n\nuint64_t  //\nwuffs_base__pixel_swizzler::swizzle_interleaved(\n    wuffs_base__slice_u8 dst,\n    wuffs_" +
+	"base__slice_u8 src) const {\n  return wuffs_base__pixel_swizzler__swizzle_interleaved(this, dst, src);\n}\n\n#endif  // __cplusplus\n" +
 	""
 
 const baseIOPrivateH = "" +
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index cab8fd8..d2e2d35 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -381,7 +381,7 @@
 		"dst_pixfmt: pixel_format, dst_palette: slice u8," +
 		"src_pixfmt: pixel_format, src_palette: slice u8, blend: pixel_blend) status",
 	"pixel_swizzler.swizzle_interleaved!(" +
-		"dst: slice u8, dst_palette: slice u8, src: slice u8) u64",
+		"dst: slice u8, src: slice u8) u64",
 }
 
 var Interfaces = []string{
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index fecdc96..4562596 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -3773,16 +3773,16 @@
 // --------
 
 // TODO: should the func type take restrict pointers?
-typedef uint64_t (*wuffs_base__pixel_swizzler__func)(
-    wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
-    wuffs_base__slice_u8 src);
+typedef uint64_t (*wuffs_base__pixel_swizzler__func)(const uint8_t* scratch1024,
+                                                     wuffs_base__slice_u8 dst,
+                                                     wuffs_base__slice_u8 src);
 
 typedef struct {
   // Do not access the private_impl's fields directly. There is no API/ABI
   // compatibility or safety guarantee if you do so.
   struct {
     wuffs_base__pixel_swizzler__func func;
+    uint8_t scratch1024[1024];
   } private_impl;
 
 #ifdef __cplusplus
@@ -3792,7 +3792,6 @@
                                     wuffs_base__slice_u8 src_palette,
                                     wuffs_base__pixel_blend blend);
   inline uint64_t swizzle_interleaved(wuffs_base__slice_u8 dst,
-                                      wuffs_base__slice_u8 dst_palette,
                                       wuffs_base__slice_u8 src) const;
 #endif  // __cplusplus
 
@@ -3822,7 +3821,6 @@
 wuffs_base__pixel_swizzler__swizzle_interleaved(
     const wuffs_base__pixel_swizzler* p,
     wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src);
 
 #ifdef __cplusplus
@@ -3840,10 +3838,8 @@
 uint64_t  //
 wuffs_base__pixel_swizzler::swizzle_interleaved(
     wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src) const {
-  return wuffs_base__pixel_swizzler__swizzle_interleaved(this, dst, dst_palette,
-                                                         src);
+  return wuffs_base__pixel_swizzler__swizzle_interleaved(this, dst, src);
 }
 
 #endif  // __cplusplus
@@ -6193,7 +6189,6 @@
   struct {
     uint8_t f_compressed[4096];
     uint8_t f_palettes[2][1024];
-    uint8_t f_dst_palette[1024];
     wuffs_lzw__decoder f_lzw;
 
     struct {
@@ -10759,15 +10754,15 @@
 // --------
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__copy_1_1(wuffs_base__slice_u8 dst,
-                                     wuffs_base__slice_u8 dst_palette,
+wuffs_base__pixel_swizzler__copy_1_1(const uint8_t* scratch1024,
+                                     wuffs_base__slice_u8 dst,
                                      wuffs_base__slice_u8 src) {
   return wuffs_base__slice_u8__copy_from_slice(dst, src);
 }
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__copy_3_3(wuffs_base__slice_u8 dst,
-                                     wuffs_base__slice_u8 dst_palette,
+wuffs_base__pixel_swizzler__copy_3_3(const uint8_t* scratch1024,
+                                     wuffs_base__slice_u8 dst,
                                      wuffs_base__slice_u8 src) {
   size_t dst_len3 = dst.len / 3;
   size_t src_len3 = src.len / 3;
@@ -10779,8 +10774,8 @@
 }
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__copy_4_4(wuffs_base__slice_u8 dst,
-                                     wuffs_base__slice_u8 dst_palette,
+wuffs_base__pixel_swizzler__copy_4_4(const uint8_t* scratch1024,
+                                     wuffs_base__slice_u8 dst,
                                      wuffs_base__slice_u8 src) {
   size_t dst_len4 = dst.len / 4;
   size_t src_len4 = src.len / 4;
@@ -10794,8 +10789,8 @@
 // --------
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__bgr_565__bgr(wuffs_base__slice_u8 dst,
-                                         wuffs_base__slice_u8 dst_palette,
+wuffs_base__pixel_swizzler__bgr_565__bgr(const uint8_t* scratch1024,
+                                         wuffs_base__slice_u8 dst,
                                          wuffs_base__slice_u8 src) {
   size_t dst_len2 = dst.len / 2;
   size_t src_len3 = src.len / 3;
@@ -10823,8 +10818,8 @@
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src(
+    const uint8_t* scratch1024,
     wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src) {
   size_t dst_len2 = dst.len / 2;
   size_t src_len4 = src.len / 4;
@@ -10852,8 +10847,8 @@
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src_over(
+    const uint8_t* scratch1024,
     wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src) {
   size_t dst_len2 = dst.len / 2;
   size_t src_len4 = src.len / 4;
@@ -10905,8 +10900,8 @@
 }
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__bgr_565__y(wuffs_base__slice_u8 dst,
-                                       wuffs_base__slice_u8 dst_palette,
+wuffs_base__pixel_swizzler__bgr_565__y(const uint8_t* scratch1024,
+                                       wuffs_base__slice_u8 dst,
                                        wuffs_base__slice_u8 src) {
   size_t dst_len2 = dst.len / 2;
   size_t len = dst_len2 < src.len ? dst_len2 : src.len;
@@ -10931,13 +10926,9 @@
 }
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__bgr_565__index__src(
-    wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
-    wuffs_base__slice_u8 src) {
-  if (dst_palette.len != 1024) {
-    return 0;
-  }
+wuffs_base__pixel_swizzler__bgr_565__index__src(const uint8_t* scratch1024,
+                                                wuffs_base__slice_u8 dst,
+                                                wuffs_base__slice_u8 src) {
   size_t dst_len2 = dst.len / 2;
   size_t len = dst_len2 < src.len ? dst_len2 : src.len;
   uint8_t* d = dst.ptr;
@@ -10949,16 +10940,16 @@
   while (n >= loop_unroll_count) {
     wuffs_base__store_u16le__no_bounds_check(
         d + (0 * 2), wuffs_base__load_u16le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[0] * 4)));
+                         scratch1024 + ((size_t)s[0] * 4)));
     wuffs_base__store_u16le__no_bounds_check(
         d + (1 * 2), wuffs_base__load_u16le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[1] * 4)));
+                         scratch1024 + ((size_t)s[1] * 4)));
     wuffs_base__store_u16le__no_bounds_check(
         d + (2 * 2), wuffs_base__load_u16le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[2] * 4)));
+                         scratch1024 + ((size_t)s[2] * 4)));
     wuffs_base__store_u16le__no_bounds_check(
         d + (3 * 2), wuffs_base__load_u16le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[3] * 4)));
+                         scratch1024 + ((size_t)s[3] * 4)));
 
     s += loop_unroll_count * 1;
     d += loop_unroll_count * 2;
@@ -10968,7 +10959,7 @@
   while (n >= 1) {
     wuffs_base__store_u16le__no_bounds_check(
         d + (0 * 2), wuffs_base__load_u16le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[0] * 4)));
+                         scratch1024 + ((size_t)s[0] * 4)));
 
     s += 1 * 1;
     d += 1 * 2;
@@ -10980,12 +10971,9 @@
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__bgr_565__index_binary_alpha__src_over(
+    const uint8_t* scratch1024,
     wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src) {
-  if (dst_palette.len != 1024) {
-    return 0;
-  }
   size_t dst_len2 = dst.len / 2;
   size_t len = dst_len2 < src.len ? dst_len2 : src.len;
   uint8_t* d = dst.ptr;
@@ -10995,7 +10983,7 @@
   // TODO: unroll.
 
   while (n >= 1) {
-    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +
                                                           ((size_t)s[0] * 4));
     if (s0) {
       wuffs_base__store_u16le__no_bounds_check(d + (0 * 2), (uint16_t)s0);
@@ -11012,10 +11000,9 @@
 // --------
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src(
-    wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
-    wuffs_base__slice_u8 src) {
+wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src(const uint8_t* scratch1024,
+                                                     wuffs_base__slice_u8 dst,
+                                                     wuffs_base__slice_u8 src) {
   size_t dst_len3 = dst.len / 3;
   size_t src_len4 = src.len / 4;
   size_t len = dst_len3 < src_len4 ? dst_len3 : src_len4;
@@ -11041,8 +11028,8 @@
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src_over(
+    const uint8_t* scratch1024,
     wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src) {
   size_t dst_len3 = dst.len / 3;
   size_t src_len4 = src.len / 4;
@@ -11088,8 +11075,8 @@
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul__src_over(
+    const uint8_t* scratch1024,
     wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src) {
   size_t dst_len4 = dst.len / 4;
   size_t src_len4 = src.len / 4;
@@ -11119,8 +11106,8 @@
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src(
+    const uint8_t* scratch1024,
     wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src) {
   size_t dst_len4 = dst.len / 4;
   size_t src_len4 = src.len / 4;
@@ -11147,8 +11134,8 @@
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over(
+    const uint8_t* scratch1024,
     wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src) {
   size_t dst_len4 = dst.len / 4;
   size_t src_len4 = src.len / 4;
@@ -11176,12 +11163,9 @@
 // --------
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__xxx__index__src(wuffs_base__slice_u8 dst,
-                                            wuffs_base__slice_u8 dst_palette,
+wuffs_base__pixel_swizzler__xxx__index__src(const uint8_t* scratch1024,
+                                            wuffs_base__slice_u8 dst,
                                             wuffs_base__slice_u8 src) {
-  if (dst_palette.len != 1024) {
-    return 0;
-  }
   size_t dst_len3 = dst.len / 3;
   size_t len = dst_len3 < src.len ? dst_len3 : src.len;
   uint8_t* d = dst.ptr;
@@ -11200,16 +11184,16 @@
   while (n > loop_unroll_count) {
     wuffs_base__store_u32le__no_bounds_check(
         d + (0 * 3), wuffs_base__load_u32le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[0] * 4)));
+                         scratch1024 + ((size_t)s[0] * 4)));
     wuffs_base__store_u32le__no_bounds_check(
         d + (1 * 3), wuffs_base__load_u32le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[1] * 4)));
+                         scratch1024 + ((size_t)s[1] * 4)));
     wuffs_base__store_u32le__no_bounds_check(
         d + (2 * 3), wuffs_base__load_u32le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[2] * 4)));
+                         scratch1024 + ((size_t)s[2] * 4)));
     wuffs_base__store_u32le__no_bounds_check(
         d + (3 * 3), wuffs_base__load_u32le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[3] * 4)));
+                         scratch1024 + ((size_t)s[3] * 4)));
 
     s += loop_unroll_count * 1;
     d += loop_unroll_count * 3;
@@ -11217,7 +11201,7 @@
   }
 
   while (n >= 1) {
-    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +
                                                           ((size_t)s[0] * 4));
     wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);
 
@@ -11231,12 +11215,9 @@
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over(
+    const uint8_t* scratch1024,
     wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src) {
-  if (dst_palette.len != 1024) {
-    return 0;
-  }
   size_t dst_len3 = dst.len / 3;
   size_t len = dst_len3 < src.len ? dst_len3 : src.len;
   uint8_t* d = dst.ptr;
@@ -11246,22 +11227,22 @@
   const size_t loop_unroll_count = 4;
 
   while (n >= loop_unroll_count) {
-    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +
                                                           ((size_t)s[0] * 4));
     if (s0) {
       wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);
     }
-    uint32_t s1 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+    uint32_t s1 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +
                                                           ((size_t)s[1] * 4));
     if (s1) {
       wuffs_base__store_u24le__no_bounds_check(d + (1 * 3), s1);
     }
-    uint32_t s2 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+    uint32_t s2 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +
                                                           ((size_t)s[2] * 4));
     if (s2) {
       wuffs_base__store_u24le__no_bounds_check(d + (2 * 3), s2);
     }
-    uint32_t s3 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+    uint32_t s3 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +
                                                           ((size_t)s[3] * 4));
     if (s3) {
       wuffs_base__store_u24le__no_bounds_check(d + (3 * 3), s3);
@@ -11273,7 +11254,7 @@
   }
 
   while (n >= 1) {
-    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +
                                                           ((size_t)s[0] * 4));
     if (s0) {
       wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);
@@ -11288,8 +11269,8 @@
 }
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__xxx__y(wuffs_base__slice_u8 dst,
-                                   wuffs_base__slice_u8 dst_palette,
+wuffs_base__pixel_swizzler__xxx__y(const uint8_t* scratch1024,
+                                   wuffs_base__slice_u8 dst,
                                    wuffs_base__slice_u8 src) {
   size_t dst_len3 = dst.len / 3;
   size_t len = dst_len3 < src.len ? dst_len3 : src.len;
@@ -11316,12 +11297,9 @@
 // --------
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__xxxx__index__src(wuffs_base__slice_u8 dst,
-                                             wuffs_base__slice_u8 dst_palette,
+wuffs_base__pixel_swizzler__xxxx__index__src(const uint8_t* scratch1024,
+                                             wuffs_base__slice_u8 dst,
                                              wuffs_base__slice_u8 src) {
-  if (dst_palette.len != 1024) {
-    return 0;
-  }
   size_t dst_len4 = dst.len / 4;
   size_t len = dst_len4 < src.len ? dst_len4 : src.len;
   uint8_t* d = dst.ptr;
@@ -11333,16 +11311,16 @@
   while (n >= loop_unroll_count) {
     wuffs_base__store_u32le__no_bounds_check(
         d + (0 * 4), wuffs_base__load_u32le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[0] * 4)));
+                         scratch1024 + ((size_t)s[0] * 4)));
     wuffs_base__store_u32le__no_bounds_check(
         d + (1 * 4), wuffs_base__load_u32le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[1] * 4)));
+                         scratch1024 + ((size_t)s[1] * 4)));
     wuffs_base__store_u32le__no_bounds_check(
         d + (2 * 4), wuffs_base__load_u32le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[2] * 4)));
+                         scratch1024 + ((size_t)s[2] * 4)));
     wuffs_base__store_u32le__no_bounds_check(
         d + (3 * 4), wuffs_base__load_u32le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[3] * 4)));
+                         scratch1024 + ((size_t)s[3] * 4)));
 
     s += loop_unroll_count * 1;
     d += loop_unroll_count * 4;
@@ -11352,7 +11330,7 @@
   while (n >= 1) {
     wuffs_base__store_u32le__no_bounds_check(
         d + (0 * 4), wuffs_base__load_u32le__no_bounds_check(
-                         dst_palette.ptr + ((size_t)s[0] * 4)));
+                         scratch1024 + ((size_t)s[0] * 4)));
 
     s += 1 * 1;
     d += 1 * 4;
@@ -11364,12 +11342,9 @@
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over(
+    const uint8_t* scratch1024,
     wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src) {
-  if (dst_palette.len != 1024) {
-    return 0;
-  }
   size_t dst_len4 = dst.len / 4;
   size_t len = dst_len4 < src.len ? dst_len4 : src.len;
   uint8_t* d = dst.ptr;
@@ -11379,22 +11354,22 @@
   const size_t loop_unroll_count = 4;
 
   while (n >= loop_unroll_count) {
-    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +
                                                           ((size_t)s[0] * 4));
     if (s0) {
       wuffs_base__store_u32le__no_bounds_check(d + (0 * 4), s0);
     }
-    uint32_t s1 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+    uint32_t s1 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +
                                                           ((size_t)s[1] * 4));
     if (s1) {
       wuffs_base__store_u32le__no_bounds_check(d + (1 * 4), s1);
     }
-    uint32_t s2 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+    uint32_t s2 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +
                                                           ((size_t)s[2] * 4));
     if (s2) {
       wuffs_base__store_u32le__no_bounds_check(d + (2 * 4), s2);
     }
-    uint32_t s3 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+    uint32_t s3 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +
                                                           ((size_t)s[3] * 4));
     if (s3) {
       wuffs_base__store_u32le__no_bounds_check(d + (3 * 4), s3);
@@ -11406,7 +11381,7 @@
   }
 
   while (n >= 1) {
-    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(scratch1024 +
                                                           ((size_t)s[0] * 4));
     if (s0) {
       wuffs_base__store_u32le__no_bounds_check(d + (0 * 4), s0);
@@ -11421,8 +11396,8 @@
 }
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__xxxx__xxx(wuffs_base__slice_u8 dst,
-                                      wuffs_base__slice_u8 dst_palette,
+wuffs_base__pixel_swizzler__xxxx__xxx(const uint8_t* scratch1024,
+                                      wuffs_base__slice_u8 dst,
                                       wuffs_base__slice_u8 src) {
   size_t dst_len4 = dst.len / 4;
   size_t src_len3 = src.len / 3;
@@ -11447,8 +11422,8 @@
 }
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__xxxx__y(wuffs_base__slice_u8 dst,
-                                    wuffs_base__slice_u8 dst_palette,
+wuffs_base__pixel_swizzler__xxxx__y(const uint8_t* scratch1024,
+                                    wuffs_base__slice_u8 dst,
                                     wuffs_base__slice_u8 src) {
   size_t dst_len4 = dst.len / 4;
   size_t len = dst_len4 < src.len ? dst_len4 : src.len;
@@ -11475,7 +11450,6 @@
 static wuffs_base__pixel_swizzler__func  //
 wuffs_base__pixel_swizzler__prepare__y(wuffs_base__pixel_swizzler* p,
                                        wuffs_base__pixel_format dst_format,
-                                       wuffs_base__slice_u8 dst_palette,
                                        wuffs_base__slice_u8 src_palette,
                                        wuffs_base__pixel_blend blend) {
   switch (dst_format.repr) {
@@ -11503,15 +11477,15 @@
 wuffs_base__pixel_swizzler__prepare__indexed__bgra_binary(
     wuffs_base__pixel_swizzler* p,
     wuffs_base__pixel_format dst_format,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src_palette,
     wuffs_base__pixel_blend blend) {
+  wuffs_base__slice_u8 scratch =
+      wuffs_base__make_slice_u8(&(p->private_impl.scratch1024[0]), 1024);
   switch (dst_format.repr) {
     case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL:
     case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_PREMUL:
     case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:
-      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=
-          1024) {
+      if (wuffs_base__slice_u8__copy_from_slice(scratch, src_palette) != 1024) {
         return NULL;
       }
       switch (blend) {
@@ -11521,7 +11495,7 @@
       return NULL;
 
     case WUFFS_BASE__PIXEL_FORMAT__BGR_565:
-      if (wuffs_base__pixel_swizzler__squash_bgr_565_888(dst_palette,
+      if (wuffs_base__pixel_swizzler__squash_bgr_565_888(scratch,
                                                          src_palette) != 1024) {
         return NULL;
       }
@@ -11534,8 +11508,7 @@
       return NULL;
 
     case WUFFS_BASE__PIXEL_FORMAT__BGR:
-      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=
-          1024) {
+      if (wuffs_base__slice_u8__copy_from_slice(scratch, src_palette) != 1024) {
         return NULL;
       }
       switch (blend) {
@@ -11549,8 +11522,7 @@
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:
-      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=
-          1024) {
+      if (wuffs_base__slice_u8__copy_from_slice(scratch, src_palette) != 1024) {
         return NULL;
       }
       switch (blend) {
@@ -11562,8 +11534,8 @@
       return NULL;
 
     case WUFFS_BASE__PIXEL_FORMAT__RGB:
-      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(dst_palette,
-                                                     src_palette) != 1024) {
+      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(scratch, src_palette) !=
+          1024) {
         return NULL;
       }
       switch (blend) {
@@ -11577,8 +11549,8 @@
     case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:
     case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:
     case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:
-      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(dst_palette,
-                                                     src_palette) != 1024) {
+      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(scratch, src_palette) !=
+          1024) {
         return NULL;
       }
       switch (blend) {
@@ -11595,7 +11567,6 @@
 static wuffs_base__pixel_swizzler__func  //
 wuffs_base__pixel_swizzler__prepare__bgr(wuffs_base__pixel_swizzler* p,
                                          wuffs_base__pixel_format dst_format,
-                                         wuffs_base__slice_u8 dst_palette,
                                          wuffs_base__slice_u8 src_palette,
                                          wuffs_base__pixel_blend blend) {
   switch (dst_format.repr) {
@@ -11626,7 +11597,6 @@
 wuffs_base__pixel_swizzler__prepare__bgra_nonpremul(
     wuffs_base__pixel_swizzler* p,
     wuffs_base__pixel_format dst_format,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src_palette,
     wuffs_base__pixel_blend blend) {
   switch (dst_format.repr) {
@@ -11701,39 +11671,46 @@
 
   switch (src_format.repr) {
     case WUFFS_BASE__PIXEL_FORMAT__Y:
-      func = wuffs_base__pixel_swizzler__prepare__y(p, dst_format, dst_palette,
-                                                    src_palette, blend);
+      func = wuffs_base__pixel_swizzler__prepare__y(p, dst_format, src_palette,
+                                                    blend);
       break;
 
     case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:
       func = wuffs_base__pixel_swizzler__prepare__indexed__bgra_binary(
-          p, dst_format, dst_palette, src_palette, blend);
+          p, dst_format, src_palette, blend);
       break;
 
     case WUFFS_BASE__PIXEL_FORMAT__BGR:
-      func = wuffs_base__pixel_swizzler__prepare__bgr(
-          p, dst_format, dst_palette, src_palette, blend);
+      func = wuffs_base__pixel_swizzler__prepare__bgr(p, dst_format,
+                                                      src_palette, blend);
       break;
 
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:
       func = wuffs_base__pixel_swizzler__prepare__bgra_nonpremul(
-          p, dst_format, dst_palette, src_palette, blend);
+          p, dst_format, src_palette, blend);
       break;
   }
 
   p->private_impl.func = func;
-  return wuffs_base__make_status(
-      func ? NULL : wuffs_base__error__unsupported_pixel_swizzler_option);
+  if (!func) {
+    return wuffs_base__make_status(
+        wuffs_base__error__unsupported_pixel_swizzler_option);
+  }
+  if (dst_palette.len == 1024) {
+    const uint8_t* scratch1024 = &(p->private_impl.scratch1024[0]);
+    memcpy(dst_palette.ptr, scratch1024, 1024);
+  }
+  return wuffs_base__make_status(NULL);
 }
 
 WUFFS_BASE__MAYBE_STATIC uint64_t  //
 wuffs_base__pixel_swizzler__swizzle_interleaved(
     const wuffs_base__pixel_swizzler* p,
     wuffs_base__slice_u8 dst,
-    wuffs_base__slice_u8 dst_palette,
     wuffs_base__slice_u8 src) {
   if (p && p->private_impl.func) {
-    return (*p->private_impl.func)(dst, dst_palette, src);
+    const uint8_t* scratch1024 = &(p->private_impl.scratch1024[0]);
+    return (*p->private_impl.func)(scratch1024, dst, src);
   }
   return 0;
 }
@@ -13001,7 +12978,6 @@
       wuffs_base__pixel_swizzler__swizzle_interleaved(
           &self->private_impl.f_swizzler,
           wuffs_base__slice_u8__subslice_i(v_dst, v_i),
-          wuffs_base__utility__empty_slice_u8(),
           wuffs_base__slice_u8__subslice_j(
               wuffs_base__make_slice_u8(self->private_impl.f_stash, 4),
               self->private_impl.f_num_stashed));
@@ -13035,8 +13011,7 @@
     if (v_i < ((uint64_t)(v_dst.len))) {
       v_n = wuffs_base__pixel_swizzler__swizzle_interleaved(
           &self->private_impl.f_swizzler,
-          wuffs_base__slice_u8__subslice_i(v_dst, v_i),
-          wuffs_base__utility__empty_slice_u8(), a_src);
+          wuffs_base__slice_u8__subslice_i(v_dst, v_i), a_src);
       wuffs_base__u32__sat_add_indirect(&self->private_impl.f_dst_x,
                                         ((uint32_t)((v_n & 4294967295))));
       v_n = ((v_n & 4294967295) * ((uint64_t)(v_src_bytes_per_pixel)));
@@ -21019,7 +20994,6 @@
   uint32_t v_num_palette_entries = 0;
   uint32_t v_i = 0;
   uint32_t v_argb = 0;
-  wuffs_base__slice_u8 v_dst_palette = {0};
   wuffs_base__status v_status = wuffs_base__make_status(NULL);
   uint8_t v_lw = 0;
 
@@ -21138,14 +21112,10 @@
           (4 * ((uint32_t)(self->private_impl.f_gc_transparent_index))) + 3)] =
           0;
     }
-    v_dst_palette = wuffs_base__pixel_buffer__palette(a_dst);
-    if (((uint64_t)(v_dst_palette.len)) == 0) {
-      v_dst_palette =
-          wuffs_base__make_slice_u8(self->private_data.f_dst_palette, 1024);
-    }
     v_status = wuffs_base__pixel_swizzler__prepare(
         &self->private_impl.f_swizzler,
-        wuffs_base__pixel_buffer__pixel_format(a_dst), v_dst_palette,
+        wuffs_base__pixel_buffer__pixel_format(a_dst),
+        wuffs_base__pixel_buffer__palette(a_dst),
         wuffs_base__utility__make_pixel_format(2198077448),
         wuffs_base__make_slice_u8(
             self->private_data.f_palettes[v_which_palette], 1024),
@@ -21502,9 +21472,7 @@
         v_dst = wuffs_base__slice_u8__subslice_i(v_dst, v_i);
       }
       v_n = wuffs_base__pixel_swizzler__swizzle_interleaved(
-          &self->private_impl.f_swizzler, v_dst,
-          wuffs_base__make_slice_u8(self->private_data.f_dst_palette, 1024),
-          v_src);
+          &self->private_impl.f_swizzler, v_dst, v_src);
       wuffs_base__u64__sat_add_indirect(&v_src_ri, v_n);
       wuffs_base__u32__sat_add_indirect(&self->private_impl.f_dst_x,
                                         ((uint32_t)((v_n & 4294967295))));
@@ -24977,7 +24945,6 @@
           v_c = ((uint8_t)(((((uint32_t)(v_c)) << 1) & 255)));
           wuffs_base__pixel_swizzler__swizzle_interleaved(
               &self->private_impl.f_swizzler, v_dst,
-              wuffs_base__utility__empty_slice_u8(),
               wuffs_base__make_slice_u8(v_src, 1));
           if (v_dst_bytes_per_pixel <= ((uint64_t)(v_dst.len))) {
             v_dst =
diff --git a/std/bmp/decode_bmp.wuffs b/std/bmp/decode_bmp.wuffs
index 5af01c6..106340e 100644
--- a/std/bmp/decode_bmp.wuffs
+++ b/std/bmp/decode_bmp.wuffs
@@ -341,7 +341,6 @@
 		if i < dst.length() {
 			this.swizzler.swizzle_interleaved!(
 				dst: dst[i ..],
-				dst_palette: this.util.empty_slice_u8(),
 				src: this.stash[.. this.num_stashed])
 			this.dst_x ~sat+= 1
 		}
@@ -377,7 +376,6 @@
 		if i < dst.length() {
 			n = this.swizzler.swizzle_interleaved!(
 				dst: dst[i ..],
-				dst_palette: this.util.empty_slice_u8(),
 				src: args.src)
 			this.dst_x ~sat+= (n & 0xFFFF_FFFF) as base.u32
 			n = (n & 0xFFFF_FFFF) * (src_bytes_per_pixel as base.u64)
diff --git a/std/gif/decode_gif.wuffs b/std/gif/decode_gif.wuffs
index 235a167..717012f 100644
--- a/std/gif/decode_gif.wuffs
+++ b/std/gif/decode_gif.wuffs
@@ -137,8 +137,6 @@
 
 	// palettes[0] and palettes[1] are the Global and Local Color Table.
 	palettes : array[2] array[4 * 256] base.u8,
-	// dst_palette is the swizzled color table.
-	dst_palette : array[4 * 256] base.u8,
 
 	lzw : lzw.decoder,
 	//#WHEN PREPROC101 decode_config.wuffs
@@ -821,7 +819,6 @@
 	var num_palette_entries : base.u32[..= 256]
 	var i                   : base.u32
 	var argb                : base.u32
-	var dst_palette         : slice base.u8
 	var status              : base.status
 	var lw                  : base.u8
 
@@ -873,16 +870,11 @@
 		this.palettes[1][(4 * (this.gc_transparent_index as base.u32)) + 3] = 0x00
 	}
 
-	dst_palette = args.dst.palette()
-	if dst_palette.length() == 0 {
-		dst_palette = this.dst_palette[..]
-	}
-
 	// TODO: a Wuffs (not just C) name for the
 	// WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY magic pixfmt constant.
 	status = this.swizzler.prepare!(
 		dst_pixfmt: args.dst.pixel_format(),
-		dst_palette: dst_palette,
+		dst_palette: args.dst.palette(),
 		src_pixfmt: this.util.make_pixel_format(repr: 0x8304_0008),
 		src_palette: this.palettes[which_palette][..],
 		blend: args.blend)
@@ -1074,8 +1066,7 @@
 			} else {
 				dst = dst[i ..]
 			}
-			n = this.swizzler.swizzle_interleaved!(
-				dst: dst, dst_palette: this.dst_palette[..], src: src)
+			n = this.swizzler.swizzle_interleaved!(dst: dst, src: src)
 
 			src_ri ~sat+= n
 			this.dst_x ~sat+= (n & 0xFFFF_FFFF) as base.u32
diff --git a/std/wbmp/decode_wbmp.wuffs b/std/wbmp/decode_wbmp.wuffs
index b0be49c..d0a74c6 100644
--- a/std/wbmp/decode_wbmp.wuffs
+++ b/std/wbmp/decode_wbmp.wuffs
@@ -212,8 +212,7 @@
 				//     v_c <<= 1;
 				c = (((c as base.u32) << 1) & 0xFF) as base.u8
 
-				this.swizzler.swizzle_interleaved!(
-					dst: dst, dst_palette: this.util.empty_slice_u8(), src: src[..])
+				this.swizzler.swizzle_interleaved!(dst: dst, src: src[..])
 
 				if dst_bytes_per_pixel <= dst.length() {
 					dst = dst[dst_bytes_per_pixel ..]
diff --git a/test/c/std/wbmp.c b/test/c/std/wbmp.c
index 0cb7ad4..bf65c11 100644
--- a/test/c/std/wbmp.c
+++ b/test/c/std/wbmp.c
@@ -107,7 +107,6 @@
 
   const uint32_t width = 5;
   const uint32_t height = 5;
-  uint8_t dummy_palette_array[1024];
   wuffs_base__pixel_swizzler swizzler;
 
   const struct {
@@ -201,14 +200,6 @@
       wuffs_base__pixel_alpha_transparency dst_transparency =
           wuffs_base__pixel_format__transparency(&dst_pixfmt);
 
-      wuffs_base__slice_u8 dst_palette =
-          wuffs_base__pixel_buffer__palette(&dst_pixbuf);
-      if (dst_palette.len == 0) {
-        dst_palette = wuffs_base__make_slice_u8(
-            &dummy_palette_array[0],
-            WUFFS_TESTLIB_ARRAY_SIZE(dummy_palette_array));
-      }
-
       int b;
       for (b = 0; b < WUFFS_TESTLIB_ARRAY_SIZE(blends); b++) {
         // Set the middle dst pixel.
@@ -220,14 +211,14 @@
         CHECK_STATUS(
             "prepare",
             wuffs_base__pixel_swizzler__prepare(
-                &swizzler, dst_pixfmt, dst_palette,
+                &swizzler, dst_pixfmt,
+                wuffs_base__pixel_buffer__palette(&dst_pixbuf),
                 wuffs_base__make_pixel_format(srcs[s].pixfmt_repr),
                 wuffs_base__pixel_buffer__palette(&src_pixbuf), blends[b]));
         wuffs_base__pixel_swizzler__swizzle_interleaved(
             &swizzler,
             wuffs_base__table_u8__row(
                 wuffs_base__pixel_buffer__plane(&dst_pixbuf, 0), height / 2),
-            dst_palette,
             wuffs_base__table_u8__row(
                 wuffs_base__pixel_buffer__plane(&src_pixbuf, 0), height / 2));