Add PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE

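Also define the BGRA_PREMUL_4X16LE, RGBA_NONPREMUL_4X16LE and
RGBA_PREMUL_4X16LE pixel format constants, and make the palette squash/swap
helpers (squash_bgr_565_888, now renamed squash_align4_bgr_565_888, and
swap_rgbx_bgrx) return a count of 4-byte entries (256 for a full palette)
instead of a byte count (1024).

Binary size, before: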
14272 gen/lib/c/clang-9-dynamic/wuffs-base-pixconv.lo
14760 gen/lib/c/clang-9-static/wuffs-base-pixconv.o
22336 gen/lib/c/gcc-dynamic/wuffs-base-pixconv.lo
22280 gen/lib/c/gcc-static/wuffs-base-pixconv.o

After:
18712 gen/lib/c/clang-9-dynamic/wuffs-base-pixconv.lo
19392 gen/lib/c/clang-9-static/wuffs-base-pixconv.o
36880 gen/lib/c/gcc-dynamic/wuffs-base-pixconv.lo
36824 gen/lib/c/gcc-static/wuffs-base-pixconv.o
diff --git a/internal/cgen/base/image-public.h b/internal/cgen/base/image-public.h
index 4ca47de..002bf2b 100644
--- a/internal/cgen/base/image-public.h
+++ b/internal/cgen/base/image-public.h
@@ -109,6 +109,68 @@
   return (a << 24) | (r << 16) | (g << 8) | (b << 0);
 }
 
+// wuffs_base__color_u64_argb_nonpremul__as__color_u32_argb_premul converts
+// from 4x16LE non-premultiplied alpha to 4x8 premultiplied alpha.
+static inline wuffs_base__color_u32_argb_premul  //
+wuffs_base__color_u64_argb_nonpremul__as__color_u32_argb_premul(
+    uint64_t argb_nonpremul) {
+  uint32_t a16 = 0xFFFF & (argb_nonpremul >> 48);
+
+  uint32_t r16 = 0xFFFF & (argb_nonpremul >> 32);
+  r16 = (r16 * a16) / 0xFFFF;
+  uint32_t g16 = 0xFFFF & (argb_nonpremul >> 16);
+  g16 = (g16 * a16) / 0xFFFF;
+  uint32_t b16 = 0xFFFF & (argb_nonpremul >> 0);
+  b16 = (b16 * a16) / 0xFFFF;
+
+  return ((a16 >> 8) << 24) | ((r16 >> 8) << 16) | ((g16 >> 8) << 8) |
+         ((b16 >> 8) << 0);
+}
+
+// wuffs_base__color_u32_argb_premul__as__color_u64_argb_nonpremul converts
+// from 4x8 premultiplied alpha to 4x16LE non-premultiplied alpha.
+static inline uint64_t  //
+wuffs_base__color_u32_argb_premul__as__color_u64_argb_nonpremul(
+    wuffs_base__color_u32_argb_premul c) {
+  uint32_t a = 0xFF & (c >> 24);
+  if (a == 0xFF) {
+    uint64_t r16 = 0x101 * (0xFF & (c >> 16));
+    uint64_t g16 = 0x101 * (0xFF & (c >> 8));
+    uint64_t b16 = 0x101 * (0xFF & (c >> 0));
+    return 0xFFFF000000000000u | (r16 << 32) | (g16 << 16) | (b16 << 0);
+  } else if (a == 0) {
+    return 0;
+  }
+  uint64_t a16 = a * 0x101;
+
+  uint64_t r = 0xFF & (c >> 16);
+  uint64_t r16 = (r * (0x101 * 0xFFFF)) / a16;
+  uint64_t g = 0xFF & (c >> 8);
+  uint64_t g16 = (g * (0x101 * 0xFFFF)) / a16;
+  uint64_t b = 0xFF & (c >> 0);
+  uint64_t b16 = (b * (0x101 * 0xFFFF)) / a16;
+
+  return (a16 << 48) | (r16 << 32) | (g16 << 16) | (b16 << 0);
+}
+
+static inline uint64_t  //
+wuffs_base__color_u32__as__color_u64(uint32_t c) {
+  uint64_t a16 = 0x101 * (0xFF & (c >> 24));
+  uint64_t r16 = 0x101 * (0xFF & (c >> 16));
+  uint64_t g16 = 0x101 * (0xFF & (c >> 8));
+  uint64_t b16 = 0x101 * (0xFF & (c >> 0));
+  return (a16 << 48) | (r16 << 32) | (g16 << 16) | (b16 << 0);
+}
+
+static inline uint32_t  //
+wuffs_base__color_u64__as__color_u32(uint64_t c) {
+  uint32_t a = ((uint32_t)(0xFF & (c >> 56)));
+  uint32_t r = ((uint32_t)(0xFF & (c >> 40)));
+  uint32_t g = ((uint32_t)(0xFF & (c >> 24)));
+  uint32_t b = ((uint32_t)(0xFF & (c >> 8)));
+  return (a << 24) | (r << 16) | (g << 8) | (b << 0);
+}
+
 // --------
 
 typedef uint8_t wuffs_base__pixel_blend;
@@ -196,13 +258,17 @@
 #define WUFFS_BASE__PIXEL_FORMAT__BGR_565 0x80000565
 #define WUFFS_BASE__PIXEL_FORMAT__BGR 0x80000888
 #define WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL 0x81008888
+#define WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE 0x8100BBBB
 #define WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL 0x82008888
+#define WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL_4X16LE 0x8200BBBB
 #define WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY 0x83008888
 #define WUFFS_BASE__PIXEL_FORMAT__BGRX 0x90008888
 
 #define WUFFS_BASE__PIXEL_FORMAT__RGB 0xA0000888
 #define WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL 0xA1008888
+#define WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL_4X16LE 0xA100BBBB
 #define WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL 0xA2008888
+#define WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL_4X16LE 0xA200BBBB
 #define WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY 0xA3008888
 #define WUFFS_BASE__PIXEL_FORMAT__RGBX 0xB0008888
 
diff --git a/internal/cgen/base/pixconv-submodule.c b/internal/cgen/base/pixconv-submodule.c
index bde2517..edcd535 100644
--- a/internal/cgen/base/pixconv-submodule.c
+++ b/internal/cgen/base/pixconv-submodule.c
@@ -76,6 +76,9 @@
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:
       return wuffs_base__color_u32_argb_nonpremul__as__color_u32_argb_premul(
           wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x))));
+    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE:
+      return wuffs_base__color_u64_argb_nonpremul__as__color_u32_argb_premul(
+          wuffs_base__load_u64le__no_bounds_check(row + (8 * ((size_t)x))));
     case WUFFS_BASE__PIXEL_FORMAT__BGRX:
       return 0xFF000000 |
              wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x)));
@@ -163,6 +166,12 @@
           wuffs_base__color_u32_argb_premul__as__color_u32_argb_nonpremul(
               color));
       break;
+    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE:
+      wuffs_base__store_u64le__no_bounds_check(
+          row + (8 * ((size_t)x)),
+          wuffs_base__color_u32_argb_premul__as__color_u64_argb_nonpremul(
+              color));
+      break;
 
     case WUFFS_BASE__PIXEL_FORMAT__RGB:
       wuffs_base__store_u24le__no_bounds_check(
@@ -255,7 +264,7 @@
 static inline uint32_t  //
 wuffs_base__composite_nonpremul_nonpremul_u32_axxx(uint32_t dst_nonpremul,
                                                    uint32_t src_nonpremul) {
-  // Convert from 8-bit color to 16-bit color.
+  // Extract 16-bit color components.
   uint32_t sa = 0x101 * (0xFF & (src_nonpremul >> 24));
   uint32_t sr = 0x101 * (0xFF & (src_nonpremul >> 16));
   uint32_t sg = 0x101 * (0xFF & (src_nonpremul >> 8));
@@ -286,18 +295,58 @@
     db = (db * 0xFFFF) / da;
   }
 
-  // Convert from 16-bit color to 8-bit color and combine the components.
+  // Convert from 16-bit color to 8-bit color.
   da >>= 8;
   dr >>= 8;
   dg >>= 8;
   db >>= 8;
+
+  // Combine components.
   return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);
 }
 
+static inline uint64_t  //
+wuffs_base__composite_nonpremul_nonpremul_u64_axxx(uint64_t dst_nonpremul,
+                                                   uint64_t src_nonpremul) {
+  // Extract components.
+  uint64_t sa = 0xFFFF & (src_nonpremul >> 48);
+  uint64_t sr = 0xFFFF & (src_nonpremul >> 32);
+  uint64_t sg = 0xFFFF & (src_nonpremul >> 16);
+  uint64_t sb = 0xFFFF & (src_nonpremul >> 0);
+  uint64_t da = 0xFFFF & (dst_nonpremul >> 48);
+  uint64_t dr = 0xFFFF & (dst_nonpremul >> 32);
+  uint64_t dg = 0xFFFF & (dst_nonpremul >> 16);
+  uint64_t db = 0xFFFF & (dst_nonpremul >> 0);
+
+  // Convert dst from nonpremul to premul.
+  dr = (dr * da) / 0xFFFF;
+  dg = (dg * da) / 0xFFFF;
+  db = (db * da) / 0xFFFF;
+
+  // Calculate the inverse of the src-alpha: how much of the dst to keep.
+  uint64_t ia = 0xFFFF - sa;
+
+  // Composite src (nonpremul) over dst (premul).
+  da = sa + ((da * ia) / 0xFFFF);
+  dr = ((sr * sa) + (dr * ia)) / 0xFFFF;
+  dg = ((sg * sa) + (dg * ia)) / 0xFFFF;
+  db = ((sb * sa) + (db * ia)) / 0xFFFF;
+
+  // Convert dst from premul to nonpremul.
+  if (da != 0) {
+    dr = (dr * 0xFFFF) / da;
+    dg = (dg * 0xFFFF) / da;
+    db = (db * 0xFFFF) / da;
+  }
+
+  // Combine components.
+  return (db << 0) | (dg << 16) | (dr << 32) | (da << 48);
+}
+
 static inline uint32_t  //
 wuffs_base__composite_nonpremul_premul_u32_axxx(uint32_t dst_nonpremul,
                                                 uint32_t src_premul) {
-  // Convert from 8-bit color to 16-bit color.
+  // Extract 16-bit color components.
   uint32_t sa = 0x101 * (0xFF & (src_premul >> 24));
   uint32_t sr = 0x101 * (0xFF & (src_premul >> 16));
   uint32_t sg = 0x101 * (0xFF & (src_premul >> 8));
@@ -328,18 +377,20 @@
     db = (db * 0xFFFF) / da;
   }
 
-  // Convert from 16-bit color to 8-bit color and combine the components.
+  // Convert from 16-bit color to 8-bit color.
   da >>= 8;
   dr >>= 8;
   dg >>= 8;
   db >>= 8;
+
+  // Combine components.
   return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);
 }
 
 static inline uint32_t  //
 wuffs_base__composite_premul_nonpremul_u32_axxx(uint32_t dst_premul,
                                                 uint32_t src_nonpremul) {
-  // Convert from 8-bit color to 16-bit color.
+  // Extract 16-bit color components.
   uint32_t sa = 0x101 * (0xFF & (src_nonpremul >> 24));
   uint32_t sr = 0x101 * (0xFF & (src_nonpremul >> 16));
   uint32_t sg = 0x101 * (0xFF & (src_nonpremul >> 8));
@@ -358,18 +409,46 @@
   dg = ((sg * sa) + (dg * ia)) / 0xFFFF;
   db = ((sb * sa) + (db * ia)) / 0xFFFF;
 
-  // Convert from 16-bit color to 8-bit color and combine the components.
+  // Convert from 16-bit color to 8-bit color.
   da >>= 8;
   dr >>= 8;
   dg >>= 8;
   db >>= 8;
+
+  // Combine components.
   return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);
 }
 
+static inline uint64_t  //
+wuffs_base__composite_premul_nonpremul_u64_axxx(uint64_t dst_premul,
+                                                uint64_t src_nonpremul) {
+  // Extract components.
+  uint64_t sa = 0xFFFF & (src_nonpremul >> 48);
+  uint64_t sr = 0xFFFF & (src_nonpremul >> 32);
+  uint64_t sg = 0xFFFF & (src_nonpremul >> 16);
+  uint64_t sb = 0xFFFF & (src_nonpremul >> 0);
+  uint64_t da = 0xFFFF & (dst_premul >> 48);
+  uint64_t dr = 0xFFFF & (dst_premul >> 32);
+  uint64_t dg = 0xFFFF & (dst_premul >> 16);
+  uint64_t db = 0xFFFF & (dst_premul >> 0);
+
+  // Calculate the inverse of the src-alpha: how much of the dst to keep.
+  uint64_t ia = 0xFFFF - sa;
+
+  // Composite src (nonpremul) over dst (premul).
+  da = sa + ((da * ia) / 0xFFFF);
+  dr = ((sr * sa) + (dr * ia)) / 0xFFFF;
+  dg = ((sg * sa) + (dg * ia)) / 0xFFFF;
+  db = ((sb * sa) + (db * ia)) / 0xFFFF;
+
+  // Combine components.
+  return (db << 0) | (dg << 16) | (dr << 32) | (da << 48);
+}
+
 static inline uint32_t  //
 wuffs_base__composite_premul_premul_u32_axxx(uint32_t dst_premul,
                                              uint32_t src_premul) {
-  // Convert from 8-bit color to 16-bit color.
+  // Extract 16-bit color components.
   uint32_t sa = 0x101 * (0xFF & (src_premul >> 24));
   uint32_t sr = 0x101 * (0xFF & (src_premul >> 16));
   uint32_t sg = 0x101 * (0xFF & (src_premul >> 8));
@@ -388,24 +467,27 @@
   dg = sg + ((dg * ia) / 0xFFFF);
   db = sb + ((db * ia) / 0xFFFF);
 
-  // Convert from 16-bit color to 8-bit color and combine the components.
+  // Convert from 16-bit color to 8-bit color.
   da >>= 8;
   dr >>= 8;
   dg >>= 8;
   db >>= 8;
+
+  // Combine components.
   return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);
 }
 
 // --------
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__squash_bgr_565_888(wuffs_base__slice_u8 dst,
-                                               wuffs_base__slice_u8 src) {
-  size_t len4 = (dst.len < src.len ? dst.len : src.len) / 4;
+wuffs_base__pixel_swizzler__squash_align4_bgr_565_888(
+    wuffs_base__slice_u8 dst,
+    wuffs_base__slice_u8 src) {
+  size_t len = (dst.len < src.len ? dst.len : src.len) / 4;
   uint8_t* d = dst.ptr;
   const uint8_t* s = src.ptr;
 
-  size_t n = len4;
+  size_t n = len;
   while (n--) {
     uint32_t argb = wuffs_base__load_u32le__no_bounds_check(s);
     uint32_t b5 = 0x1F & (argb >> (8 - 5));
@@ -417,17 +499,17 @@
     s += 4;
     d += 4;
   }
-  return len4 * 4;
+  return len;
 }
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__swap_rgbx_bgrx(wuffs_base__slice_u8 dst,
                                            wuffs_base__slice_u8 src) {
-  size_t len4 = (dst.len < src.len ? dst.len : src.len) / 4;
+  size_t len = (dst.len < src.len ? dst.len : src.len) / 4;
   uint8_t* d = dst.ptr;
   const uint8_t* s = src.ptr;
 
-  size_t n = len4;
+  size_t n = len;
   while (n--) {
     uint8_t b0 = s[0];
     uint8_t b1 = s[1];
@@ -440,7 +522,56 @@
     s += 4;
     d += 4;
   }
-  return len4 * 4;
+  return len;
+}
+
+// --------
+
+static uint64_t  //
+wuffs_base__pixel_swizzler__squash_tight_4x8_4x16le(uint8_t* dst_ptr,
+                                                    size_t dst_len,
+                                                    uint8_t* dst_palette_ptr,
+                                                    size_t dst_palette_len,
+                                                    const uint8_t* src_ptr,
+                                                    size_t src_len) {
+  size_t dst_len4 = dst_len / 4;
+  size_t src_len8 = src_len / 8;
+  size_t len = (dst_len4 < src_len8) ? dst_len4 : src_len8;
+  uint8_t* d = dst_ptr;
+  const uint8_t* s = src_ptr;
+
+  const size_t loop_unroll_count = 4;
+
+  size_t n = len;
+  while (n >= loop_unroll_count) {
+    wuffs_base__store_u32le__no_bounds_check(
+        d + (0 * 4), wuffs_base__color_u64__as__color_u32(
+                         wuffs_base__load_u64le__no_bounds_check(s + (0 * 8))));
+    wuffs_base__store_u32le__no_bounds_check(
+        d + (1 * 4), wuffs_base__color_u64__as__color_u32(
+                         wuffs_base__load_u64le__no_bounds_check(s + (1 * 8))));
+    wuffs_base__store_u32le__no_bounds_check(
+        d + (2 * 4), wuffs_base__color_u64__as__color_u32(
+                         wuffs_base__load_u64le__no_bounds_check(s + (2 * 8))));
+    wuffs_base__store_u32le__no_bounds_check(
+        d + (3 * 4), wuffs_base__color_u64__as__color_u32(
+                         wuffs_base__load_u64le__no_bounds_check(s + (3 * 8))));
+
+    s += loop_unroll_count * 8;
+    d += loop_unroll_count * 4;
+    n -= loop_unroll_count;
+  }
+
+  while (n >= 1) {
+    wuffs_base__store_u32le__no_bounds_check(
+        d + (0 * 4), wuffs_base__color_u64__as__color_u32(
+                         wuffs_base__load_u64le__no_bounds_check(s + (0 * 8))));
+
+    s += 1 * 8;
+    d += 1 * 4;
+    n -= 1;
+  }
+  return len;
 }
 
 // --------
@@ -557,6 +688,38 @@
 }
 
 static uint64_t  //
+wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul_4x16le__src(
+    uint8_t* dst_ptr,
+    size_t dst_len,
+    uint8_t* dst_palette_ptr,
+    size_t dst_palette_len,
+    const uint8_t* src_ptr,
+    size_t src_len) {
+  size_t dst_len2 = dst_len / 2;
+  size_t src_len8 = src_len / 8;
+  size_t len = (dst_len2 < src_len8) ? dst_len2 : src_len8;
+  uint8_t* d = dst_ptr;
+  const uint8_t* s = src_ptr;
+  size_t n = len;
+
+  // TODO: unroll.
+
+  while (n >= 1) {
+    wuffs_base__store_u16le__no_bounds_check(
+        d + (0 * 2),
+        wuffs_base__color_u32_argb_premul__as__color_u16_rgb_565(
+            wuffs_base__color_u64_argb_nonpremul__as__color_u32_argb_premul(
+                wuffs_base__load_u64le__no_bounds_check(s + (0 * 8)))));
+
+    s += 1 * 8;
+    d += 1 * 2;
+    n -= 1;
+  }
+
+  return len;
+}
+
+static uint64_t  //
 wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src_over(
     uint8_t* dst_ptr,
     size_t dst_len,
@@ -574,7 +737,7 @@
   // TODO: unroll.
 
   while (n >= 1) {
-    // Convert from 8-bit color to 16-bit color.
+    // Extract 16-bit color components.
     uint32_t sa = 0x101 * ((uint32_t)s[3]);
     uint32_t sr = 0x101 * ((uint32_t)s[2]);
     uint32_t sg = 0x101 * ((uint32_t)s[1]);
@@ -614,6 +777,63 @@
 }
 
 static uint64_t  //
+wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul_4x16le__src_over(
+    uint8_t* dst_ptr,
+    size_t dst_len,
+    uint8_t* dst_palette_ptr,
+    size_t dst_palette_len,
+    const uint8_t* src_ptr,
+    size_t src_len) {
+  size_t dst_len2 = dst_len / 2;
+  size_t src_len8 = src_len / 8;
+  size_t len = (dst_len2 < src_len8) ? dst_len2 : src_len8;
+  uint8_t* d = dst_ptr;
+  const uint8_t* s = src_ptr;
+  size_t n = len;
+
+  // TODO: unroll.
+
+  while (n >= 1) {
+    // Extract 16-bit color components.
+    uint32_t sa = ((uint32_t)wuffs_base__load_u16le__no_bounds_check(s + 6));
+    uint32_t sr = ((uint32_t)wuffs_base__load_u16le__no_bounds_check(s + 4));
+    uint32_t sg = ((uint32_t)wuffs_base__load_u16le__no_bounds_check(s + 2));
+    uint32_t sb = ((uint32_t)wuffs_base__load_u16le__no_bounds_check(s + 0));
+
+    // Convert from 565 color to 16-bit color.
+    uint32_t old_rgb_565 = wuffs_base__load_u16le__no_bounds_check(d + (0 * 2));
+    uint32_t old_r5 = 0x1F & (old_rgb_565 >> 11);
+    uint32_t dr = (0x8421 * old_r5) >> 4;
+    uint32_t old_g6 = 0x3F & (old_rgb_565 >> 5);
+    uint32_t dg = (0x1041 * old_g6) >> 2;
+    uint32_t old_b5 = 0x1F & (old_rgb_565 >> 0);
+    uint32_t db = (0x8421 * old_b5) >> 4;
+
+    // Calculate the inverse of the src-alpha: how much of the dst to keep.
+    uint32_t ia = 0xFFFF - sa;
+
+    // Composite src (nonpremul) over dst (premul).
+    dr = ((sr * sa) + (dr * ia)) / 0xFFFF;
+    dg = ((sg * sa) + (dg * ia)) / 0xFFFF;
+    db = ((sb * sa) + (db * ia)) / 0xFFFF;
+
+    // Convert from 16-bit color to 565 color and combine the components.
+    uint32_t new_r5 = 0x1F & (dr >> 11);
+    uint32_t new_g6 = 0x3F & (dg >> 10);
+    uint32_t new_b5 = 0x1F & (db >> 11);
+    uint32_t new_rgb_565 = (new_r5 << 11) | (new_g6 << 5) | (new_b5 << 0);
+    wuffs_base__store_u16le__no_bounds_check(d + (0 * 2),
+                                             (uint16_t)new_rgb_565);
+
+    s += 1 * 8;
+    d += 1 * 2;
+    n -= 1;
+  }
+
+  return len;
+}
+
+static uint64_t  //
 wuffs_base__pixel_swizzler__bgr_565__y(uint8_t* dst_ptr,
                                        size_t dst_len,
                                        uint8_t* dst_palette_ptr,
@@ -759,6 +979,37 @@
 }
 
 static uint64_t  //
+wuffs_base__pixel_swizzler__bgr__bgra_nonpremul_4x16le__src(
+    uint8_t* dst_ptr,
+    size_t dst_len,
+    uint8_t* dst_palette_ptr,
+    size_t dst_palette_len,
+    const uint8_t* src_ptr,
+    size_t src_len) {
+  size_t dst_len3 = dst_len / 3;
+  size_t src_len8 = src_len / 8;
+  size_t len = (dst_len3 < src_len8) ? dst_len3 : src_len8;
+  uint8_t* d = dst_ptr;
+  const uint8_t* s = src_ptr;
+  size_t n = len;
+
+  // TODO: unroll.
+
+  while (n >= 1) {
+    uint32_t s0 =
+        wuffs_base__color_u64_argb_nonpremul__as__color_u32_argb_premul(
+            wuffs_base__load_u64le__no_bounds_check(s + (0 * 8)));
+    wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);
+
+    s += 1 * 8;
+    d += 1 * 3;
+    n -= 1;
+  }
+
+  return len;
+}
+
+static uint64_t  //
 wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src_over(
     uint8_t* dst_ptr,
     size_t dst_len,
@@ -776,7 +1027,7 @@
   // TODO: unroll.
 
   while (n >= 1) {
-    // Convert from 8-bit color to 16-bit color.
+    // Extract 16-bit color components.
     uint32_t sa = 0x101 * ((uint32_t)s[3]);
     uint32_t sr = 0x101 * ((uint32_t)s[2]);
     uint32_t sg = 0x101 * ((uint32_t)s[1]);
@@ -806,6 +1057,54 @@
   return len;
 }
 
+static uint64_t  //
+wuffs_base__pixel_swizzler__bgr__bgra_nonpremul_4x16le__src_over(
+    uint8_t* dst_ptr,
+    size_t dst_len,
+    uint8_t* dst_palette_ptr,
+    size_t dst_palette_len,
+    const uint8_t* src_ptr,
+    size_t src_len) {
+  size_t dst_len3 = dst_len / 3;
+  size_t src_len8 = src_len / 8;
+  size_t len = (dst_len3 < src_len8) ? dst_len3 : src_len8;
+  uint8_t* d = dst_ptr;
+  const uint8_t* s = src_ptr;
+  size_t n = len;
+
+  // TODO: unroll.
+
+  while (n >= 1) {
+    // Extract 16-bit color components.
+    uint32_t sa = ((uint32_t)wuffs_base__load_u16le__no_bounds_check(s + 6));
+    uint32_t sr = ((uint32_t)wuffs_base__load_u16le__no_bounds_check(s + 4));
+    uint32_t sg = ((uint32_t)wuffs_base__load_u16le__no_bounds_check(s + 2));
+    uint32_t sb = ((uint32_t)wuffs_base__load_u16le__no_bounds_check(s + 0));
+    uint32_t dr = 0x101 * ((uint32_t)d[2]);
+    uint32_t dg = 0x101 * ((uint32_t)d[1]);
+    uint32_t db = 0x101 * ((uint32_t)d[0]);
+
+    // Calculate the inverse of the src-alpha: how much of the dst to keep.
+    uint32_t ia = 0xFFFF - sa;
+
+    // Composite src (nonpremul) over dst (premul).
+    dr = ((sr * sa) + (dr * ia)) / 0xFFFF;
+    dg = ((sg * sa) + (dg * ia)) / 0xFFFF;
+    db = ((sb * sa) + (db * ia)) / 0xFFFF;
+
+    // Convert from 16-bit color to 8-bit color.
+    d[0] = (uint8_t)(db >> 8);
+    d[1] = (uint8_t)(dg >> 8);
+    d[2] = (uint8_t)(dr >> 8);
+
+    s += 1 * 8;
+    d += 1 * 3;
+    n -= 1;
+  }
+
+  return len;
+}
+
 // --------
 
 static uint64_t  //
@@ -823,8 +1122,6 @@
   const uint8_t* s = src_ptr;
   size_t n = len;
 
-  // TODO: unroll.
-
   while (n >= 1) {
     uint32_t d0 = wuffs_base__load_u32le__no_bounds_check(d + (0 * 4));
     uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(s + (0 * 4));
@@ -840,6 +1137,38 @@
   return len;
 }
 
+static uint64_t  //
+wuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul_4x16le__src_over(
+    uint8_t* dst_ptr,
+    size_t dst_len,
+    uint8_t* dst_palette_ptr,
+    size_t dst_palette_len,
+    const uint8_t* src_ptr,
+    size_t src_len) {
+  size_t dst_len4 = dst_len / 4;
+  size_t src_len8 = src_len / 8;
+  size_t len = (dst_len4 < src_len8) ? dst_len4 : src_len8;
+  uint8_t* d = dst_ptr;
+  const uint8_t* s = src_ptr;
+  size_t n = len;
+
+  while (n >= 1) {
+    uint64_t d0 = wuffs_base__color_u32__as__color_u64(
+        wuffs_base__load_u32le__no_bounds_check(d + (0 * 4)));
+    uint64_t s0 = wuffs_base__load_u64le__no_bounds_check(s + (0 * 8));
+    wuffs_base__store_u32le__no_bounds_check(
+        d + (0 * 4),
+        wuffs_base__color_u64__as__color_u32(
+            wuffs_base__composite_nonpremul_nonpremul_u64_axxx(d0, s0)));
+
+    s += 1 * 8;
+    d += 1 * 4;
+    n -= 1;
+  }
+
+  return len;
+}
+
 // --------
 
 static uint64_t  //
@@ -874,6 +1203,37 @@
 }
 
 static uint64_t  //
+wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul_4x16le__src(
+    uint8_t* dst_ptr,
+    size_t dst_len,
+    uint8_t* dst_palette_ptr,
+    size_t dst_palette_len,
+    const uint8_t* src_ptr,
+    size_t src_len) {
+  size_t dst_len4 = dst_len / 4;
+  size_t src_len8 = src_len / 8;
+  size_t len = (dst_len4 < src_len8) ? dst_len4 : src_len8;
+  uint8_t* d = dst_ptr;
+  const uint8_t* s = src_ptr;
+  size_t n = len;
+
+  // TODO: unroll.
+
+  while (n >= 1) {
+    uint64_t s0 = wuffs_base__load_u64le__no_bounds_check(s + (0 * 8));
+    wuffs_base__store_u32le__no_bounds_check(
+        d + (0 * 4),
+        wuffs_base__color_u64_argb_nonpremul__as__color_u32_argb_premul(s0));
+
+    s += 1 * 8;
+    d += 1 * 4;
+    n -= 1;
+  }
+
+  return len;
+}
+
+static uint64_t  //
 wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over(
     uint8_t* dst_ptr,
     size_t dst_len,
@@ -904,6 +1264,40 @@
   return len;
 }
 
+static uint64_t  //
+wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul_4x16le__src_over(
+    uint8_t* dst_ptr,
+    size_t dst_len,
+    uint8_t* dst_palette_ptr,
+    size_t dst_palette_len,
+    const uint8_t* src_ptr,
+    size_t src_len) {
+  size_t dst_len4 = dst_len / 4;
+  size_t src_len8 = src_len / 8;
+  size_t len = (dst_len4 < src_len8) ? dst_len4 : src_len8;
+  uint8_t* d = dst_ptr;
+  const uint8_t* s = src_ptr;
+  size_t n = len;
+
+  // TODO: unroll.
+
+  while (n >= 1) {
+    uint64_t d0 = wuffs_base__color_u32__as__color_u64(
+        wuffs_base__load_u32le__no_bounds_check(d + (0 * 4)));
+    uint64_t s0 = wuffs_base__load_u64le__no_bounds_check(s + (0 * 8));
+    wuffs_base__store_u32le__no_bounds_check(
+        d + (0 * 4),
+        wuffs_base__color_u64__as__color_u32(
+            wuffs_base__composite_premul_nonpremul_u64_axxx(d0, s0)));
+
+    s += 1 * 8;
+    d += 1 * 4;
+    n -= 1;
+  }
+
+  return len;
+}
+
 // --------
 
 static uint64_t  //
@@ -1273,8 +1667,8 @@
       return NULL;
 
     case WUFFS_BASE__PIXEL_FORMAT__BGR_565:
-      if (wuffs_base__pixel_swizzler__squash_bgr_565_888(dst_palette,
-                                                         src_palette) != 1024) {
+      if (wuffs_base__pixel_swizzler__squash_align4_bgr_565_888(
+              dst_palette, src_palette) != 256) {
         return NULL;
       }
       switch (blend) {
@@ -1315,7 +1709,7 @@
 
     case WUFFS_BASE__PIXEL_FORMAT__RGB:
       if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(dst_palette,
-                                                     src_palette) != 1024) {
+                                                     src_palette) != 256) {
         return NULL;
       }
       switch (blend) {
@@ -1330,7 +1724,7 @@
     case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:
     case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:
       if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(dst_palette,
-                                                     src_palette) != 1024) {
+                                                     src_palette) != 256) {
         return NULL;
       }
       switch (blend) {
@@ -1434,6 +1828,66 @@
   return NULL;
 }
 
+static wuffs_base__pixel_swizzler__func  //
+wuffs_base__pixel_swizzler__prepare__bgra_nonpremul_4x16le(
+    wuffs_base__pixel_swizzler* p,
+    wuffs_base__pixel_format dst_pixfmt,
+    wuffs_base__slice_u8 dst_palette,
+    wuffs_base__slice_u8 src_palette,
+    wuffs_base__pixel_blend blend) {
+  switch (dst_pixfmt.repr) {
+    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:
+      switch (blend) {
+        case WUFFS_BASE__PIXEL_BLEND__SRC:
+          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul_4x16le__src;
+        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:
+          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul_4x16le__src_over;
+      }
+      return NULL;
+
+    case WUFFS_BASE__PIXEL_FORMAT__BGR:
+      switch (blend) {
+        case WUFFS_BASE__PIXEL_BLEND__SRC:
+          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul_4x16le__src;
+        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:
+          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul_4x16le__src_over;
+      }
+      return NULL;
+
+    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:
+      switch (blend) {
+        case WUFFS_BASE__PIXEL_BLEND__SRC:
+          return wuffs_base__pixel_swizzler__squash_tight_4x8_4x16le;
+        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:
+          return wuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul_4x16le__src_over;
+      }
+      return NULL;
+
+    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:
+      switch (blend) {
+        case WUFFS_BASE__PIXEL_BLEND__SRC:
+          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul_4x16le__src;
+        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:
+          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul_4x16le__src_over;
+      }
+      return NULL;
+
+    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:
+    case WUFFS_BASE__PIXEL_FORMAT__BGRX:
+      // TODO.
+      break;
+
+    case WUFFS_BASE__PIXEL_FORMAT__RGB:
+    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:
+    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:
+    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:
+    case WUFFS_BASE__PIXEL_FORMAT__RGBX:
+      // TODO.
+      break;
+  }
+  return NULL;
+}
+
 // --------
 
 WUFFS_BASE__MAYBE_STATIC wuffs_base__status  //
@@ -1480,6 +1934,11 @@
       func = wuffs_base__pixel_swizzler__prepare__bgra_nonpremul(
           p, dst_pixfmt, dst_palette, src_palette, blend);
       break;
+
+    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE:
+      func = wuffs_base__pixel_swizzler__prepare__bgra_nonpremul_4x16le(
+          p, dst_pixfmt, dst_palette, src_palette, blend);
+      break;
   }
 
   p->private_impl.func = func;
diff --git a/internal/cgen/data/data.go b/internal/cgen/data/data.go
index d4a5cbb..a113893 100644
--- a/internal/cgen/data/data.go
+++ b/internal/cgen/data/data.go
@@ -118,7 +118,9 @@
 	"// ---------------- Images\n\n// wuffs_base__color_u32_argb_premul is an 8 bit per channel premultiplied\n// Alpha, Red, Green, Blue color, as a uint32_t value. Its value is always\n// 0xAARRGGBB (Alpha most significant, Blue least), regardless of endianness.\ntypedef uint32_t wuffs_base__color_u32_argb_premul;\n\nstatic inline uint16_t  //\nwuffs_base__color_u32_argb_premul__as__color_u16_rgb_565(\n    wuffs_base__color_u32_argb_premul c) {\n  uint32_t r5 = 0xF800 & (c >> 8);\n  uint32_t g6 = 0x07E0 & (c >> 5);\n  uint32_t b5 = 0x001F & (c >> 3);\n  return (uint16_t)(r5 | g6 | b5);\n}\n\nstatic inline wuffs_base__color_u32_argb_premul  //\nwuffs_base__color_u16_rgb_565__as__color_u32_argb_premul(uint16_t rgb_565) {\n  uint32_t b5 = 0x1F & (rgb_565 >> 0);\n  uint32_t b = (b5 << 3) | (b5 >> 2);\n  uint32_t g6 = 0x3F & (rgb_565 >> 5);\n  uint32_t g = (g6 << 2) | (g6 >> 4);\n  uint32_t r5 = 0x1F & (rgb_565 >> 11);\n  uint32_t r = (r5 << 3) | (r5 >> 2);\n  return 0xFF000000 | (r << 16) | (g << 8) | (b << 0);\n}\n\nstatic inline uint8_t  //" +
 	"\nwuffs_base__color_u32_argb_premul__as__color_u8_gray(\n    wuffs_base__color_u32_argb_premul c) {\n  // Work in 16-bit color.\n  uint32_t cr = 0x101 * (0xFF & (c >> 16));\n  uint32_t cg = 0x101 * (0xFF & (c >> 8));\n  uint32_t cb = 0x101 * (0xFF & (c >> 0));\n\n  // These coefficients (the fractions 0.299, 0.587 and 0.114) are the same\n  // as those given by the JFIF specification.\n  //\n  // Note that 19595 + 38470 + 7471 equals 65536, also known as (1 << 16). We\n  // shift by 24, not just by 16, because the return value is 8-bit color, not\n  // 16-bit color.\n  uint32_t weighted_average = (19595 * cr) + (38470 * cg) + (7471 * cb) + 32768;\n  return (uint8_t)(weighted_average >> 24);\n}\n\n// wuffs_base__color_u32_argb_nonpremul__as__color_u32_argb_premul converts\n// from non-premultiplied alpha to premultiplied alpha.\nstatic inline wuffs_base__color_u32_argb_premul  //\nwuffs_base__color_u32_argb_nonpremul__as__color_u32_argb_premul(\n    uint32_t argb_nonpremul) {\n  // Multiplying by 0x101 (twice, once for alpha and onc" +
 	"e for color) converts\n  // from 8-bit to 16-bit color. Shifting right by 8 undoes that.\n  //\n  // Working in the higher bit depth can produce slightly different (and\n  // arguably slightly more accurate) results. For example, given 8-bit blue\n  // and alpha of 0x80 and 0x81:\n  //\n  //  - ((0x80   * 0x81  ) / 0xFF  )      = 0x40        = 0x40\n  //  - ((0x8080 * 0x8181) / 0xFFFF) >> 8 = 0x4101 >> 8 = 0x41\n  uint32_t a = 0xFF & (argb_nonpremul >> 24);\n  uint32_t a16 = a * (0x101 * 0x101);\n\n  uint32_t r = 0xFF & (argb_nonpremul >> 16);\n  r = ((r * a16) / 0xFFFF) >> 8;\n  uint32_t g = 0xFF & (argb_nonpremul >> 8);\n  g = ((g * a16) / 0xFFFF) >> 8;\n  uint32_t b = 0xFF & (argb_nonpremul >> 0);\n  b = ((b * a16) / 0xFFFF) >> 8;\n\n  return (a << 24) | (r << 16) | (g << 8) | (b << 0);\n}\n\n// wuffs_base__color_u32_argb_premul__as__color_u32_argb_nonpremul converts\n// from premultiplied alpha to non-premultiplied alpha.\nstatic inline uint32_t  //\nwuffs_base__color_u32_argb_premul__as__color_u32_argb_nonpremul(\n    wuffs_base_" +
-	"_color_u32_argb_premul c) {\n  uint32_t a = 0xFF & (c >> 24);\n  if (a == 0xFF) {\n    return c;\n  } else if (a == 0) {\n    return 0;\n  }\n  uint32_t a16 = a * 0x101;\n\n  uint32_t r = 0xFF & (c >> 16);\n  r = ((r * (0x101 * 0xFFFF)) / a16) >> 8;\n  uint32_t g = 0xFF & (c >> 8);\n  g = ((g * (0x101 * 0xFFFF)) / a16) >> 8;\n  uint32_t b = 0xFF & (c >> 0);\n  b = ((b * (0x101 * 0xFFFF)) / a16) >> 8;\n\n  return (a << 24) | (r << 16) | (g << 8) | (b << 0);\n}\n\n" +
+	"_color_u32_argb_premul c) {\n  uint32_t a = 0xFF & (c >> 24);\n  if (a == 0xFF) {\n    return c;\n  } else if (a == 0) {\n    return 0;\n  }\n  uint32_t a16 = a * 0x101;\n\n  uint32_t r = 0xFF & (c >> 16);\n  r = ((r * (0x101 * 0xFFFF)) / a16) >> 8;\n  uint32_t g = 0xFF & (c >> 8);\n  g = ((g * (0x101 * 0xFFFF)) / a16) >> 8;\n  uint32_t b = 0xFF & (c >> 0);\n  b = ((b * (0x101 * 0xFFFF)) / a16) >> 8;\n\n  return (a << 24) | (r << 16) | (g << 8) | (b << 0);\n}\n\n// wuffs_base__color_u64_argb_nonpremul__as__color_u32_argb_premul converts\n// from 4x16LE non-premultiplied alpha to 4x8 premultiplied alpha.\nstatic inline wuffs_base__color_u32_argb_premul  //\nwuffs_base__color_u64_argb_nonpremul__as__color_u32_argb_premul(\n    uint64_t argb_nonpremul) {\n  uint32_t a16 = 0xFFFF & (argb_nonpremul >> 48);\n\n  uint32_t r16 = 0xFFFF & (argb_nonpremul >> 32);\n  r16 = (r16 * a16) / 0xFFFF;\n  uint32_t g16 = 0xFFFF & (argb_nonpremul >> 16);\n  g16 = (g16 * a16) / 0xFFFF;\n  uint32_t b16 = 0xFFFF & (argb_nonpremul >> 0);\n  b16 = (b16 * a16) / 0xF" +
+	"FFF;\n\n  return ((a16 >> 8) << 24) | ((r16 >> 8) << 16) | ((g16 >> 8) << 8) |\n         ((b16 >> 8) << 0);\n}\n\n// wuffs_base__color_u32_argb_premul__as__color_u64_argb_nonpremul converts\n// from 4x8 premultiplied alpha to 4x16LE non-premultiplied alpha.\nstatic inline uint64_t  //\nwuffs_base__color_u32_argb_premul__as__color_u64_argb_nonpremul(\n    wuffs_base__color_u32_argb_premul c) {\n  uint32_t a = 0xFF & (c >> 24);\n  if (a == 0xFF) {\n    uint64_t r16 = 0x101 * (0xFF & (c >> 16));\n    uint64_t g16 = 0x101 * (0xFF & (c >> 8));\n    uint64_t b16 = 0x101 * (0xFF & (c >> 0));\n    return 0xFFFF000000000000u | (r16 << 32) | (g16 << 16) | (b16 << 0);\n  } else if (a == 0) {\n    return 0;\n  }\n  uint64_t a16 = a * 0x101;\n\n  uint64_t r = 0xFF & (c >> 16);\n  uint64_t r16 = (r * (0x101 * 0xFFFF)) / a16;\n  uint64_t g = 0xFF & (c >> 8);\n  uint64_t g16 = (g * (0x101 * 0xFFFF)) / a16;\n  uint64_t b = 0xFF & (c >> 0);\n  uint64_t b16 = (b * (0x101 * 0xFFFF)) / a16;\n\n  return (a16 << 48) | (r16 << 32) | (g16 << 16) | (b16 << 0);\n}\n" +
+	"\nstatic inline uint64_t  //\nwuffs_base__color_u32__as__color_u64(uint32_t c) {\n  uint64_t a16 = 0x101 * (0xFF & (c >> 24));\n  uint64_t r16 = 0x101 * (0xFF & (c >> 16));\n  uint64_t g16 = 0x101 * (0xFF & (c >> 8));\n  uint64_t b16 = 0x101 * (0xFF & (c >> 0));\n  return (a16 << 48) | (r16 << 32) | (g16 << 16) | (b16 << 0);\n}\n\nstatic inline uint32_t  //\nwuffs_base__color_u64__as__color_u32(uint64_t c) {\n  uint32_t a = ((uint32_t)(0xFF & (c >> 56)));\n  uint32_t r = ((uint32_t)(0xFF & (c >> 40)));\n  uint32_t g = ((uint32_t)(0xFF & (c >> 24)));\n  uint32_t b = ((uint32_t)(0xFF & (c >> 8)));\n  return (a << 24) | (r << 16) | (g << 8) | (b << 0);\n}\n\n" +
 	"" +
 	"// --------\n\ntypedef uint8_t wuffs_base__pixel_blend;\n\n// wuffs_base__pixel_blend encodes how to blend source and destination pixels,\n// accounting for transparency. It encompasses the Porter-Duff compositing\n// operators as well as the other blending modes defined by PDF.\n//\n// TODO: implement the other modes.\n#define WUFFS_BASE__PIXEL_BLEND__SRC ((wuffs_base__pixel_blend)0)\n#define WUFFS_BASE__PIXEL_BLEND__SRC_OVER ((wuffs_base__pixel_blend)1)\n\n" +
 	"" +
@@ -126,10 +128,10 @@
 	"" +
 	"// --------\n\n#define WUFFS_BASE__PIXEL_FORMAT__NUM_PLANES_MAX 4\n\n#define WUFFS_BASE__PIXEL_FORMAT__INDEXED__INDEX_PLANE 0\n#define WUFFS_BASE__PIXEL_FORMAT__INDEXED__COLOR_PLANE 3\n\n// wuffs_base__pixel_format encodes the format of the bytes that constitute an\n// image frame's pixel data.\n//\n// See https://github.com/google/wuffs/blob/master/doc/note/pixel-formats.md\n//\n// Do not manipulate its bits directly; they are private implementation\n// details. Use methods such as wuffs_base__pixel_format__num_planes instead.\ntypedef struct wuffs_base__pixel_format__struct {\n  uint32_t repr;\n\n#ifdef __cplusplus\n  inline bool is_valid() const;\n  inline uint32_t bits_per_pixel() const;\n  inline bool is_direct() const;\n  inline bool is_indexed() const;\n  inline bool is_interleaved() const;\n  inline bool is_planar() const;\n  inline uint32_t num_planes() const;\n  inline wuffs_base__pixel_alpha_transparency transparency() const;\n#endif  // __cplusplus\n\n} wuffs_base__pixel_format;\n\nstatic inline wuffs_base__pixel_format  //\nwu" +
 	"ffs_base__make_pixel_format(uint32_t repr) {\n  wuffs_base__pixel_format f;\n  f.repr = repr;\n  return f;\n}\n\n// Common 8-bit-depth pixel formats. This list is not exhaustive; not all valid\n// wuffs_base__pixel_format values are present.\n\n#define WUFFS_BASE__PIXEL_FORMAT__INVALID 0x00000000\n\n#define WUFFS_BASE__PIXEL_FORMAT__A 0x02000008\n\n#define WUFFS_BASE__PIXEL_FORMAT__Y 0x20000008\n#define WUFFS_BASE__PIXEL_FORMAT__YA_NONPREMUL 0x21000008\n#define WUFFS_BASE__PIXEL_FORMAT__YA_PREMUL 0x22000008\n\n#define WUFFS_BASE__PIXEL_FORMAT__YCBCR 0x40020888\n#define WUFFS_BASE__PIXEL_FORMAT__YCBCRA_NONPREMUL 0x41038888\n#define WUFFS_BASE__PIXEL_FORMAT__YCBCRK 0x50038888\n\n#define WUFFS_BASE__PIXEL_FORMAT__YCOCG 0x60020888\n#define WUFFS_BASE__PIXEL_FORMAT__YCOCGA_NONPREMUL 0x61038888\n#define WUFFS_BASE__PIXEL_FORMAT__YCOCGK 0x70038888\n\n#define WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL 0x81040008\n#define WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_PREMUL 0x82040008\n#define WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY 0x8" +
-	"3040008\n\n#define WUFFS_BASE__PIXEL_FORMAT__BGR_565 0x80000565\n#define WUFFS_BASE__PIXEL_FORMAT__BGR 0x80000888\n#define WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL 0x81008888\n#define WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL 0x82008888\n#define WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY 0x83008888\n#define WUFFS_BASE__PIXEL_FORMAT__BGRX 0x90008888\n\n#define WUFFS_BASE__PIXEL_FORMAT__RGB 0xA0000888\n#define WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL 0xA1008888\n#define WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL 0xA2008888\n#define WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY 0xA3008888\n#define WUFFS_BASE__PIXEL_FORMAT__RGBX 0xB0008888\n\n#define WUFFS_BASE__PIXEL_FORMAT__CMY 0xC0020888\n#define WUFFS_BASE__PIXEL_FORMAT__CMYK 0xD0038888\n\nextern const uint32_t wuffs_base__pixel_format__bits_per_channel[16];\n\nstatic inline bool  //\nwuffs_base__pixel_format__is_valid(const wuffs_base__pixel_format* f) {\n  return f->repr != 0;\n}\n\n// wuffs_base__pixel_format__bits_per_pixel returns the number of bits per\n// pixel for interleaved pixel formats, and ret" +
-	"urns 0 for planar pixel formats.\nstatic inline uint32_t  //\nwuffs_base__pixel_format__bits_per_pixel(const wuffs_base__pixel_format* f) {\n  if (((f->repr >> 16) & 0x03) != 0) {\n    return 0;\n  }\n  return wuffs_base__pixel_format__bits_per_channel[0x0F & (f->repr >> 0)] +\n         wuffs_base__pixel_format__bits_per_channel[0x0F & (f->repr >> 4)] +\n         wuffs_base__pixel_format__bits_per_channel[0x0F & (f->repr >> 8)] +\n         wuffs_base__pixel_format__bits_per_channel[0x0F & (f->repr >> 12)];\n}\n\nstatic inline bool  //\nwuffs_base__pixel_format__is_direct(const wuffs_base__pixel_format* f) {\n  return ((f->repr >> 18) & 0x01) == 0;\n}\n\nstatic inline bool  //\nwuffs_base__pixel_format__is_indexed(const wuffs_base__pixel_format* f) {\n  return ((f->repr >> 18) & 0x01) != 0;\n}\n\nstatic inline bool  //\nwuffs_base__pixel_format__is_interleaved(const wuffs_base__pixel_format* f) {\n  return ((f->repr >> 16) & 0x03) == 0;\n}\n\nstatic inline bool  //\nwuffs_base__pixel_format__is_planar(const wuffs_base__pixel_format* f) {" +
-	"\n  return ((f->repr >> 16) & 0x03) != 0;\n}\n\nstatic inline uint32_t  //\nwuffs_base__pixel_format__num_planes(const wuffs_base__pixel_format* f) {\n  return ((f->repr >> 16) & 0x03) + 1;\n}\n\nstatic inline wuffs_base__pixel_alpha_transparency  //\nwuffs_base__pixel_format__transparency(const wuffs_base__pixel_format* f) {\n  return (wuffs_base__pixel_alpha_transparency)((f->repr >> 24) & 0x03);\n}\n\n#ifdef __cplusplus\n\ninline bool  //\nwuffs_base__pixel_format::is_valid() const {\n  return wuffs_base__pixel_format__is_valid(this);\n}\n\ninline uint32_t  //\nwuffs_base__pixel_format::bits_per_pixel() const {\n  return wuffs_base__pixel_format__bits_per_pixel(this);\n}\n\ninline bool  //\nwuffs_base__pixel_format::is_direct() const {\n  return wuffs_base__pixel_format__is_direct(this);\n}\n\ninline bool  //\nwuffs_base__pixel_format::is_indexed() const {\n  return wuffs_base__pixel_format__is_indexed(this);\n}\n\ninline bool  //\nwuffs_base__pixel_format::is_interleaved() const {\n  return wuffs_base__pixel_format__is_interleaved(this);\n}\n\ni" +
-	"nline bool  //\nwuffs_base__pixel_format::is_planar() const {\n  return wuffs_base__pixel_format__is_planar(this);\n}\n\ninline uint32_t  //\nwuffs_base__pixel_format::num_planes() const {\n  return wuffs_base__pixel_format__num_planes(this);\n}\n\ninline wuffs_base__pixel_alpha_transparency  //\nwuffs_base__pixel_format::transparency() const {\n  return wuffs_base__pixel_format__transparency(this);\n}\n\n#endif  // __cplusplus\n\n" +
+	"3040008\n\n#define WUFFS_BASE__PIXEL_FORMAT__BGR_565 0x80000565\n#define WUFFS_BASE__PIXEL_FORMAT__BGR 0x80000888\n#define WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL 0x81008888\n#define WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE 0x8100BBBB\n#define WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL 0x82008888\n#define WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL_4X16LE 0x8200BBBB\n#define WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY 0x83008888\n#define WUFFS_BASE__PIXEL_FORMAT__BGRX 0x90008888\n\n#define WUFFS_BASE__PIXEL_FORMAT__RGB 0xA0000888\n#define WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL 0xA1008888\n#define WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL_4X16LE 0xA100BBBB\n#define WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL 0xA2008888\n#define WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL_4X16LE 0xA200BBBB\n#define WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY 0xA3008888\n#define WUFFS_BASE__PIXEL_FORMAT__RGBX 0xB0008888\n\n#define WUFFS_BASE__PIXEL_FORMAT__CMY 0xC0020888\n#define WUFFS_BASE__PIXEL_FORMAT__CMYK 0xD0038888\n\nextern const uint32_t wuffs_base__pixel_format__bits" +
+	"_per_channel[16];\n\nstatic inline bool  //\nwuffs_base__pixel_format__is_valid(const wuffs_base__pixel_format* f) {\n  return f->repr != 0;\n}\n\n// wuffs_base__pixel_format__bits_per_pixel returns the number of bits per\n// pixel for interleaved pixel formats, and returns 0 for planar pixel formats.\nstatic inline uint32_t  //\nwuffs_base__pixel_format__bits_per_pixel(const wuffs_base__pixel_format* f) {\n  if (((f->repr >> 16) & 0x03) != 0) {\n    return 0;\n  }\n  return wuffs_base__pixel_format__bits_per_channel[0x0F & (f->repr >> 0)] +\n         wuffs_base__pixel_format__bits_per_channel[0x0F & (f->repr >> 4)] +\n         wuffs_base__pixel_format__bits_per_channel[0x0F & (f->repr >> 8)] +\n         wuffs_base__pixel_format__bits_per_channel[0x0F & (f->repr >> 12)];\n}\n\nstatic inline bool  //\nwuffs_base__pixel_format__is_direct(const wuffs_base__pixel_format* f) {\n  return ((f->repr >> 18) & 0x01) == 0;\n}\n\nstatic inline bool  //\nwuffs_base__pixel_format__is_indexed(const wuffs_base__pixel_format* f) {\n  return ((f->repr >" +
+	"> 18) & 0x01) != 0;\n}\n\nstatic inline bool  //\nwuffs_base__pixel_format__is_interleaved(const wuffs_base__pixel_format* f) {\n  return ((f->repr >> 16) & 0x03) == 0;\n}\n\nstatic inline bool  //\nwuffs_base__pixel_format__is_planar(const wuffs_base__pixel_format* f) {\n  return ((f->repr >> 16) & 0x03) != 0;\n}\n\nstatic inline uint32_t  //\nwuffs_base__pixel_format__num_planes(const wuffs_base__pixel_format* f) {\n  return ((f->repr >> 16) & 0x03) + 1;\n}\n\nstatic inline wuffs_base__pixel_alpha_transparency  //\nwuffs_base__pixel_format__transparency(const wuffs_base__pixel_format* f) {\n  return (wuffs_base__pixel_alpha_transparency)((f->repr >> 24) & 0x03);\n}\n\n#ifdef __cplusplus\n\ninline bool  //\nwuffs_base__pixel_format::is_valid() const {\n  return wuffs_base__pixel_format__is_valid(this);\n}\n\ninline uint32_t  //\nwuffs_base__pixel_format::bits_per_pixel() const {\n  return wuffs_base__pixel_format__bits_per_pixel(this);\n}\n\ninline bool  //\nwuffs_base__pixel_format::is_direct() const {\n  return wuffs_base__pixel_format__is_di" +
+	"rect(this);\n}\n\ninline bool  //\nwuffs_base__pixel_format::is_indexed() const {\n  return wuffs_base__pixel_format__is_indexed(this);\n}\n\ninline bool  //\nwuffs_base__pixel_format::is_interleaved() const {\n  return wuffs_base__pixel_format__is_interleaved(this);\n}\n\ninline bool  //\nwuffs_base__pixel_format::is_planar() const {\n  return wuffs_base__pixel_format__is_planar(this);\n}\n\ninline uint32_t  //\nwuffs_base__pixel_format::num_planes() const {\n  return wuffs_base__pixel_format__num_planes(this);\n}\n\ninline wuffs_base__pixel_alpha_transparency  //\nwuffs_base__pixel_format::transparency() const {\n  return wuffs_base__pixel_format__transparency(this);\n}\n\n#endif  // __cplusplus\n\n" +
 	"" +
 	"// --------\n\n// wuffs_base__pixel_subsampling encodes whether sample values cover one pixel\n// or cover multiple pixels.\n//\n// See https://github.com/google/wuffs/blob/master/doc/note/pixel-subsampling.md\n//\n// Do not manipulate its bits directly; they are private implementation\n// details. Use methods such as wuffs_base__pixel_subsampling__bias_x instead.\ntypedef struct wuffs_base__pixel_subsampling__struct {\n  uint32_t repr;\n\n#ifdef __cplusplus\n  inline uint32_t bias_x(uint32_t plane) const;\n  inline uint32_t denominator_x(uint32_t plane) const;\n  inline uint32_t bias_y(uint32_t plane) const;\n  inline uint32_t denominator_y(uint32_t plane) const;\n#endif  // __cplusplus\n\n} wuffs_base__pixel_subsampling;\n\nstatic inline wuffs_base__pixel_subsampling  //\nwuffs_base__make_pixel_subsampling(uint32_t repr) {\n  wuffs_base__pixel_subsampling s;\n  s.repr = repr;\n  return s;\n}\n\n#define WUFFS_BASE__PIXEL_SUBSAMPLING__NONE 0x00000000\n\n#define WUFFS_BASE__PIXEL_SUBSAMPLING__444 0x000000\n#define WUFFS_BASE__PIXEL_SUBSAMPL" +
 	"ING__440 0x010100\n#define WUFFS_BASE__PIXEL_SUBSAMPLING__422 0x101000\n#define WUFFS_BASE__PIXEL_SUBSAMPLING__420 0x111100\n#define WUFFS_BASE__PIXEL_SUBSAMPLING__411 0x303000\n#define WUFFS_BASE__PIXEL_SUBSAMPLING__410 0x313100\n\nstatic inline uint32_t  //\nwuffs_base__pixel_subsampling__bias_x(const wuffs_base__pixel_subsampling* s,\n                                      uint32_t plane) {\n  uint32_t shift = ((plane & 0x03) * 8) + 6;\n  return (s->repr >> shift) & 0x03;\n}\n\nstatic inline uint32_t  //\nwuffs_base__pixel_subsampling__denominator_x(\n    const wuffs_base__pixel_subsampling* s,\n    uint32_t plane) {\n  uint32_t shift = ((plane & 0x03) * 8) + 4;\n  return ((s->repr >> shift) & 0x03) + 1;\n}\n\nstatic inline uint32_t  //\nwuffs_base__pixel_subsampling__bias_y(const wuffs_base__pixel_subsampling* s,\n                                      uint32_t plane) {\n  uint32_t shift = ((plane & 0x03) * 8) + 2;\n  return (s->repr >> shift) & 0x03;\n}\n\nstatic inline uint32_t  //\nwuffs_base__pixel_subsampling__denominator_y(\n    c" +
@@ -517,43 +519,57 @@
 	"" +
 	"// --------\n\nWUFFS_BASE__MAYBE_STATIC wuffs_base__color_u32_argb_premul  //\nwuffs_base__pixel_buffer__color_u32_at(const wuffs_base__pixel_buffer* pb,\n                                       uint32_t x,\n                                       uint32_t y) {\n  if (!pb || (x >= pb->pixcfg.private_impl.width) ||\n      (y >= pb->pixcfg.private_impl.height)) {\n    return 0;\n  }\n\n  if (wuffs_base__pixel_format__is_planar(&pb->pixcfg.private_impl.pixfmt)) {\n    // TODO: support planar formats.\n    return 0;\n  }\n\n  size_t stride = pb->private_impl.planes[0].stride;\n  const uint8_t* row = pb->private_impl.planes[0].ptr + (stride * ((size_t)y));\n\n  switch (pb->pixcfg.private_impl.pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      return wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x)));\n\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY: {\n      uint8_t* palette = pb->private_impl" +
 	".planes[3].ptr;\n      return wuffs_base__load_u32le__no_bounds_check(palette +\n                                                     (4 * ((size_t)row[x])));\n    }\n\n      // Common formats above. Rarer formats below.\n\n    case WUFFS_BASE__PIXEL_FORMAT__Y:\n      return 0xFF000000 | (0x00010101 * ((uint32_t)(row[x])));\n\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL: {\n      uint8_t* palette = pb->private_impl.planes[3].ptr;\n      return wuffs_base__color_u32_argb_nonpremul__as__color_u32_argb_premul(\n          wuffs_base__load_u32le__no_bounds_check(palette +\n                                                  (4 * ((size_t)row[x]))));\n    }\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__color_u16_rgb_565__as__color_u32_argb_premul(\n          wuffs_base__load_u16le__no_bounds_check(row + (2 * ((size_t)x))));\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      return 0xFF000000 |\n             wuffs_base__load_u24le__no_bounds_check(row + (3 * ((size_t)x)));\n    case WUFFS_BASE__PIXEL_F" +
-	"ORMAT__BGRA_NONPREMUL:\n      return wuffs_base__color_u32_argb_nonpremul__as__color_u32_argb_premul(\n          wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x))));\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      return 0xFF000000 |\n             wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x)));\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      return wuffs_base__swap_u32_argb_abgr(\n          0xFF000000 |\n          wuffs_base__load_u24le__no_bounds_check(row + (3 * ((size_t)x))));\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n      return wuffs_base__swap_u32_argb_abgr(\n          wuffs_base__color_u32_argb_nonpremul__as__color_u32_argb_premul(\n              wuffs_base__load_u32le__no_bounds_check(row +\n                                                      (4 * ((size_t)x)))));\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n      return wuffs_base__swap_u32_argb_abgr(\n          wuffs_base__load_u32le__no_bounds_check(row + (4 * ((siz" +
-	"e_t)x))));\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      return wuffs_base__swap_u32_argb_abgr(\n          0xFF000000 |\n          wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x))));\n\n    default:\n      // TODO: support more formats.\n      break;\n  }\n\n  return 0;\n}\n\nWUFFS_BASE__MAYBE_STATIC wuffs_base__status  //\nwuffs_base__pixel_buffer__set_color_u32_at(\n    wuffs_base__pixel_buffer* pb,\n    uint32_t x,\n    uint32_t y,\n    wuffs_base__color_u32_argb_premul color) {\n  if (!pb) {\n    return wuffs_base__make_status(wuffs_base__error__bad_receiver);\n  }\n  if ((x >= pb->pixcfg.private_impl.width) ||\n      (y >= pb->pixcfg.private_impl.height)) {\n    return wuffs_base__make_status(wuffs_base__error__bad_argument);\n  }\n\n  if (wuffs_base__pixel_format__is_planar(&pb->pixcfg.private_impl.pixfmt)) {\n    // TODO: support planar formats.\n    return wuffs_base__make_status(wuffs_base__error__unsupported_option);\n  }\n\n  size_t stride = pb->private_impl.planes[0].stride;\n  uint8_t* row = pb->private_impl.pla" +
-	"nes[0].ptr + (stride * ((size_t)y));\n\n  switch (pb->pixcfg.private_impl.pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      wuffs_base__store_u32le__no_bounds_check(row + (4 * ((size_t)x)), color);\n      break;\n\n      // Common formats above. Rarer formats below.\n\n    case WUFFS_BASE__PIXEL_FORMAT__Y:\n      wuffs_base__store_u8__no_bounds_check(\n          row + ((size_t)x),\n          wuffs_base__color_u32_argb_premul__as__color_u8_gray(color));\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:\n      wuffs_base__store_u8__no_bounds_check(\n          row + ((size_t)x), wuffs_base__pixel_palette__closest_element(\n                                 wuffs_base__pixel_buffer__palette(pb),\n                                 pb->pixcfg.private_impl.pixfmt, color));\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      wuffs_base__store_u16le__no_bounds_check(\n          row + (2 * ((size_t)x)),\n          wuffs_base__color_u32_argb_prem" +
-	"ul__as__color_u16_rgb_565(color));\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      wuffs_base__store_u24le__no_bounds_check(row + (3 * ((size_t)x)), color);\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      wuffs_base__store_u32le__no_bounds_check(\n          row + (4 * ((size_t)x)),\n          wuffs_base__color_u32_argb_premul__as__color_u32_argb_nonpremul(\n              color));\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      wuffs_base__store_u24le__no_bounds_check(\n          row + (3 * ((size_t)x)), wuffs_base__swap_u32_argb_abgr(color));\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n      wuffs_base__store_u32le__no_bounds_check(\n          row + (4 * ((size_t)x)),\n          wuffs_base__color_u32_argb_premul__as__color_u32_argb_nonpremul(\n              wuffs_base__swap_u32_argb_abgr(color)));\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      wuffs_base__store_u32le__no_bounds_check(\n       " +
-	"   row + (4 * ((size_t)x)), wuffs_base__swap_u32_argb_abgr(color));\n      break;\n\n    default:\n      // TODO: support more formats.\n      return wuffs_base__make_status(wuffs_base__error__unsupported_option);\n  }\n\n  return wuffs_base__make_status(NULL);\n}\n\n" +
+	"ORMAT__BGRA_NONPREMUL:\n      return wuffs_base__color_u32_argb_nonpremul__as__color_u32_argb_premul(\n          wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x))));\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE:\n      return wuffs_base__color_u64_argb_nonpremul__as__color_u32_argb_premul(\n          wuffs_base__load_u64le__no_bounds_check(row + (8 * ((size_t)x))));\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      return 0xFF000000 |\n             wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x)));\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      return wuffs_base__swap_u32_argb_abgr(\n          0xFF000000 |\n          wuffs_base__load_u24le__no_bounds_check(row + (3 * ((size_t)x))));\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n      return wuffs_base__swap_u32_argb_abgr(\n          wuffs_base__color_u32_argb_nonpremul__as__color_u32_argb_premul(\n              wuffs_base__load_u32le__no_bounds_check(row +\n                                                      (4 * ((size_t)x)" +
+	"))));\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n      return wuffs_base__swap_u32_argb_abgr(\n          wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x))));\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      return wuffs_base__swap_u32_argb_abgr(\n          0xFF000000 |\n          wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x))));\n\n    default:\n      // TODO: support more formats.\n      break;\n  }\n\n  return 0;\n}\n\nWUFFS_BASE__MAYBE_STATIC wuffs_base__status  //\nwuffs_base__pixel_buffer__set_color_u32_at(\n    wuffs_base__pixel_buffer* pb,\n    uint32_t x,\n    uint32_t y,\n    wuffs_base__color_u32_argb_premul color) {\n  if (!pb) {\n    return wuffs_base__make_status(wuffs_base__error__bad_receiver);\n  }\n  if ((x >= pb->pixcfg.private_impl.width) ||\n      (y >= pb->pixcfg.private_impl.height)) {\n    return wuffs_base__make_status(wuffs_base__error__bad_argument);\n  }\n\n  if (wuffs_base__pixel_format__is_planar(&pb->pixcfg.private_impl.pixfm" +
+	"t)) {\n    // TODO: support planar formats.\n    return wuffs_base__make_status(wuffs_base__error__unsupported_option);\n  }\n\n  size_t stride = pb->private_impl.planes[0].stride;\n  uint8_t* row = pb->private_impl.planes[0].ptr + (stride * ((size_t)y));\n\n  switch (pb->pixcfg.private_impl.pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      wuffs_base__store_u32le__no_bounds_check(row + (4 * ((size_t)x)), color);\n      break;\n\n      // Common formats above. Rarer formats below.\n\n    case WUFFS_BASE__PIXEL_FORMAT__Y:\n      wuffs_base__store_u8__no_bounds_check(\n          row + ((size_t)x),\n          wuffs_base__color_u32_argb_premul__as__color_u8_gray(color));\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:\n      wuffs_base__store_u8__no_bounds_check(\n          row + ((size_t)x), wuffs_base__pixel_palette__closest_element(\n                                 wuffs_base__pixel_buffer__palette(pb),\n                                 pb->pixcfg" +
+	".private_impl.pixfmt, color));\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      wuffs_base__store_u16le__no_bounds_check(\n          row + (2 * ((size_t)x)),\n          wuffs_base__color_u32_argb_premul__as__color_u16_rgb_565(color));\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      wuffs_base__store_u24le__no_bounds_check(row + (3 * ((size_t)x)), color);\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      wuffs_base__store_u32le__no_bounds_check(\n          row + (4 * ((size_t)x)),\n          wuffs_base__color_u32_argb_premul__as__color_u32_argb_nonpremul(\n              color));\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE:\n      wuffs_base__store_u64le__no_bounds_check(\n          row + (8 * ((size_t)x)),\n          wuffs_base__color_u32_argb_premul__as__color_u64_argb_nonpremul(\n              color));\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      wuffs_base__store_u24le__no_bounds_check(\n          row + (3 * ((size_t)x)), wuffs_bas" +
+	"e__swap_u32_argb_abgr(color));\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n      wuffs_base__store_u32le__no_bounds_check(\n          row + (4 * ((size_t)x)),\n          wuffs_base__color_u32_argb_premul__as__color_u32_argb_nonpremul(\n              wuffs_base__swap_u32_argb_abgr(color)));\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      wuffs_base__store_u32le__no_bounds_check(\n          row + (4 * ((size_t)x)), wuffs_base__swap_u32_argb_abgr(color));\n      break;\n\n    default:\n      // TODO: support more formats.\n      return wuffs_base__make_status(wuffs_base__error__unsupported_option);\n  }\n\n  return wuffs_base__make_status(NULL);\n}\n\n" +
 	"" +
 	"// --------\n\nWUFFS_BASE__MAYBE_STATIC uint8_t  //\nwuffs_base__pixel_palette__closest_element(\n    wuffs_base__slice_u8 palette_slice,\n    wuffs_base__pixel_format palette_format,\n    wuffs_base__color_u32_argb_premul c) {\n  size_t n = palette_slice.len / 4;\n  if (n > 256) {\n    n = 256;\n  }\n  size_t best_index = 0;\n  uint64_t best_score = 0xFFFFFFFFFFFFFFFF;\n\n  // Work in 16-bit color.\n  uint32_t ca = 0x101 * (0xFF & (c >> 24));\n  uint32_t cr = 0x101 * (0xFF & (c >> 16));\n  uint32_t cg = 0x101 * (0xFF & (c >> 8));\n  uint32_t cb = 0x101 * (0xFF & (c >> 0));\n\n  switch (palette_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY: {\n      bool nonpremul = palette_format.repr ==\n                       WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL;\n\n      size_t i;\n      for (i = 0; i < n; i++) {\n        // Work in 16-bit color.\n        uint32_t pb = 0x101 * ((uint32_t)(palette" +
 	"_slice.ptr[(4 * i) + 0]));\n        uint32_t pg = 0x101 * ((uint32_t)(palette_slice.ptr[(4 * i) + 1]));\n        uint32_t pr = 0x101 * ((uint32_t)(palette_slice.ptr[(4 * i) + 2]));\n        uint32_t pa = 0x101 * ((uint32_t)(palette_slice.ptr[(4 * i) + 3]));\n\n        // Convert to premultiplied alpha.\n        if (nonpremul && (pa != 0xFFFF)) {\n          pb = (pb * pa) / 0xFFFF;\n          pg = (pg * pa) / 0xFFFF;\n          pr = (pr * pa) / 0xFFFF;\n        }\n\n        // These deltas are conceptually int32_t (signed) but after squaring,\n        // it's equivalent to work in uint32_t (unsigned).\n        pb -= cb;\n        pg -= cg;\n        pr -= cr;\n        pa -= ca;\n        uint64_t score = ((uint64_t)(pb * pb)) + ((uint64_t)(pg * pg)) +\n                         ((uint64_t)(pr * pr)) + ((uint64_t)(pa * pa));\n        if (best_score > score) {\n          best_score = score;\n          best_index = i;\n        }\n      }\n      break;\n    }\n  }\n\n  return (uint8_t)best_index;\n}\n\n" +
 	"" +
-	"// --------\n\nstatic inline uint32_t  //\nwuffs_base__composite_nonpremul_nonpremul_u32_axxx(uint32_t dst_nonpremul,\n                                                   uint32_t src_nonpremul) {\n  // Convert from 8-bit color to 16-bit color.\n  uint32_t sa = 0x101 * (0xFF & (src_nonpremul >> 24));\n  uint32_t sr = 0x101 * (0xFF & (src_nonpremul >> 16));\n  uint32_t sg = 0x101 * (0xFF & (src_nonpremul >> 8));\n  uint32_t sb = 0x101 * (0xFF & (src_nonpremul >> 0));\n  uint32_t da = 0x101 * (0xFF & (dst_nonpremul >> 24));\n  uint32_t dr = 0x101 * (0xFF & (dst_nonpremul >> 16));\n  uint32_t dg = 0x101 * (0xFF & (dst_nonpremul >> 8));\n  uint32_t db = 0x101 * (0xFF & (dst_nonpremul >> 0));\n\n  // Convert dst from nonpremul to premul.\n  dr = (dr * da) / 0xFFFF;\n  dg = (dg * da) / 0xFFFF;\n  db = (db * da) / 0xFFFF;\n\n  // Calculate the inverse of the src-alpha: how much of the dst to keep.\n  uint32_t ia = 0xFFFF - sa;\n\n  // Composite src (nonpremul) over dst (premul).\n  da = sa + ((da * ia) / 0xFFFF);\n  dr = ((sr * sa) + (dr * i" +
-	"a)) / 0xFFFF;\n  dg = ((sg * sa) + (dg * ia)) / 0xFFFF;\n  db = ((sb * sa) + (db * ia)) / 0xFFFF;\n\n  // Convert dst from premul to nonpremul.\n  if (da != 0) {\n    dr = (dr * 0xFFFF) / da;\n    dg = (dg * 0xFFFF) / da;\n    db = (db * 0xFFFF) / da;\n  }\n\n  // Convert from 16-bit color to 8-bit color and combine the components.\n  da >>= 8;\n  dr >>= 8;\n  dg >>= 8;\n  db >>= 8;\n  return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);\n}\n\nstatic inline uint32_t  //\nwuffs_base__composite_nonpremul_premul_u32_axxx(uint32_t dst_nonpremul,\n                                                uint32_t src_premul) {\n  // Convert from 8-bit color to 16-bit color.\n  uint32_t sa = 0x101 * (0xFF & (src_premul >> 24));\n  uint32_t sr = 0x101 * (0xFF & (src_premul >> 16));\n  uint32_t sg = 0x101 * (0xFF & (src_premul >> 8));\n  uint32_t sb = 0x101 * (0xFF & (src_premul >> 0));\n  uint32_t da = 0x101 * (0xFF & (dst_nonpremul >> 24));\n  uint32_t dr = 0x101 * (0xFF & (dst_nonpremul >> 16));\n  uint32_t dg = 0x101 * (0xFF & (dst_nonpremul >> 8))" +
-	";\n  uint32_t db = 0x101 * (0xFF & (dst_nonpremul >> 0));\n\n  // Convert dst from nonpremul to premul.\n  dr = (dr * da) / 0xFFFF;\n  dg = (dg * da) / 0xFFFF;\n  db = (db * da) / 0xFFFF;\n\n  // Calculate the inverse of the src-alpha: how much of the dst to keep.\n  uint32_t ia = 0xFFFF - sa;\n\n  // Composite src (premul) over dst (premul).\n  da = sa + ((da * ia) / 0xFFFF);\n  dr = sr + ((dr * ia) / 0xFFFF);\n  dg = sg + ((dg * ia) / 0xFFFF);\n  db = sb + ((db * ia) / 0xFFFF);\n\n  // Convert dst from premul to nonpremul.\n  if (da != 0) {\n    dr = (dr * 0xFFFF) / da;\n    dg = (dg * 0xFFFF) / da;\n    db = (db * 0xFFFF) / da;\n  }\n\n  // Convert from 16-bit color to 8-bit color and combine the components.\n  da >>= 8;\n  dr >>= 8;\n  dg >>= 8;\n  db >>= 8;\n  return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);\n}\n\nstatic inline uint32_t  //\nwuffs_base__composite_premul_nonpremul_u32_axxx(uint32_t dst_premul,\n                                                uint32_t src_nonpremul) {\n  // Convert from 8-bit color to 16-bit color.\n " +
-	" uint32_t sa = 0x101 * (0xFF & (src_nonpremul >> 24));\n  uint32_t sr = 0x101 * (0xFF & (src_nonpremul >> 16));\n  uint32_t sg = 0x101 * (0xFF & (src_nonpremul >> 8));\n  uint32_t sb = 0x101 * (0xFF & (src_nonpremul >> 0));\n  uint32_t da = 0x101 * (0xFF & (dst_premul >> 24));\n  uint32_t dr = 0x101 * (0xFF & (dst_premul >> 16));\n  uint32_t dg = 0x101 * (0xFF & (dst_premul >> 8));\n  uint32_t db = 0x101 * (0xFF & (dst_premul >> 0));\n\n  // Calculate the inverse of the src-alpha: how much of the dst to keep.\n  uint32_t ia = 0xFFFF - sa;\n\n  // Composite src (nonpremul) over dst (premul).\n  da = sa + ((da * ia) / 0xFFFF);\n  dr = ((sr * sa) + (dr * ia)) / 0xFFFF;\n  dg = ((sg * sa) + (dg * ia)) / 0xFFFF;\n  db = ((sb * sa) + (db * ia)) / 0xFFFF;\n\n  // Convert from 16-bit color to 8-bit color and combine the components.\n  da >>= 8;\n  dr >>= 8;\n  dg >>= 8;\n  db >>= 8;\n  return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);\n}\n\nstatic inline uint32_t  //\nwuffs_base__composite_premul_premul_u32_axxx(uint32_t dst_premul,\n    " +
-	"                                         uint32_t src_premul) {\n  // Convert from 8-bit color to 16-bit color.\n  uint32_t sa = 0x101 * (0xFF & (src_premul >> 24));\n  uint32_t sr = 0x101 * (0xFF & (src_premul >> 16));\n  uint32_t sg = 0x101 * (0xFF & (src_premul >> 8));\n  uint32_t sb = 0x101 * (0xFF & (src_premul >> 0));\n  uint32_t da = 0x101 * (0xFF & (dst_premul >> 24));\n  uint32_t dr = 0x101 * (0xFF & (dst_premul >> 16));\n  uint32_t dg = 0x101 * (0xFF & (dst_premul >> 8));\n  uint32_t db = 0x101 * (0xFF & (dst_premul >> 0));\n\n  // Calculate the inverse of the src-alpha: how much of the dst to keep.\n  uint32_t ia = 0xFFFF - sa;\n\n  // Composite src (premul) over dst (premul).\n  da = sa + ((da * ia) / 0xFFFF);\n  dr = sr + ((dr * ia) / 0xFFFF);\n  dg = sg + ((dg * ia) / 0xFFFF);\n  db = sb + ((db * ia) / 0xFFFF);\n\n  // Convert from 16-bit color to 8-bit color and combine the components.\n  da >>= 8;\n  dr >>= 8;\n  dg >>= 8;\n  db >>= 8;\n  return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);\n}\n\n" +
+	"// --------\n\nstatic inline uint32_t  //\nwuffs_base__composite_nonpremul_nonpremul_u32_axxx(uint32_t dst_nonpremul,\n                                                   uint32_t src_nonpremul) {\n  // Extract 16-bit color components.\n  uint32_t sa = 0x101 * (0xFF & (src_nonpremul >> 24));\n  uint32_t sr = 0x101 * (0xFF & (src_nonpremul >> 16));\n  uint32_t sg = 0x101 * (0xFF & (src_nonpremul >> 8));\n  uint32_t sb = 0x101 * (0xFF & (src_nonpremul >> 0));\n  uint32_t da = 0x101 * (0xFF & (dst_nonpremul >> 24));\n  uint32_t dr = 0x101 * (0xFF & (dst_nonpremul >> 16));\n  uint32_t dg = 0x101 * (0xFF & (dst_nonpremul >> 8));\n  uint32_t db = 0x101 * (0xFF & (dst_nonpremul >> 0));\n\n  // Convert dst from nonpremul to premul.\n  dr = (dr * da) / 0xFFFF;\n  dg = (dg * da) / 0xFFFF;\n  db = (db * da) / 0xFFFF;\n\n  // Calculate the inverse of the src-alpha: how much of the dst to keep.\n  uint32_t ia = 0xFFFF - sa;\n\n  // Composite src (nonpremul) over dst (premul).\n  da = sa + ((da * ia) / 0xFFFF);\n  dr = ((sr * sa) + (dr * ia)) / 0xF" +
+	"FFF;\n  dg = ((sg * sa) + (dg * ia)) / 0xFFFF;\n  db = ((sb * sa) + (db * ia)) / 0xFFFF;\n\n  // Convert dst from premul to nonpremul.\n  if (da != 0) {\n    dr = (dr * 0xFFFF) / da;\n    dg = (dg * 0xFFFF) / da;\n    db = (db * 0xFFFF) / da;\n  }\n\n  // Convert from 16-bit color to 8-bit color.\n  da >>= 8;\n  dr >>= 8;\n  dg >>= 8;\n  db >>= 8;\n\n  // Combine components.\n  return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);\n}\n\nstatic inline uint64_t  //\nwuffs_base__composite_nonpremul_nonpremul_u64_axxx(uint64_t dst_nonpremul,\n                                                   uint64_t src_nonpremul) {\n  // Extract components.\n  uint64_t sa = 0xFFFF & (src_nonpremul >> 48);\n  uint64_t sr = 0xFFFF & (src_nonpremul >> 32);\n  uint64_t sg = 0xFFFF & (src_nonpremul >> 16);\n  uint64_t sb = 0xFFFF & (src_nonpremul >> 0);\n  uint64_t da = 0xFFFF & (dst_nonpremul >> 48);\n  uint64_t dr = 0xFFFF & (dst_nonpremul >> 32);\n  uint64_t dg = 0xFFFF & (dst_nonpremul >> 16);\n  uint64_t db = 0xFFFF & (dst_nonpremul >> 0);\n\n  // Convert ds" +
+	"t from nonpremul to premul.\n  dr = (dr * da) / 0xFFFF;\n  dg = (dg * da) / 0xFFFF;\n  db = (db * da) / 0xFFFF;\n\n  // Calculate the inverse of the src-alpha: how much of the dst to keep.\n  uint64_t ia = 0xFFFF - sa;\n\n  // Composite src (nonpremul) over dst (premul).\n  da = sa + ((da * ia) / 0xFFFF);\n  dr = ((sr * sa) + (dr * ia)) / 0xFFFF;\n  dg = ((sg * sa) + (dg * ia)) / 0xFFFF;\n  db = ((sb * sa) + (db * ia)) / 0xFFFF;\n\n  // Convert dst from premul to nonpremul.\n  if (da != 0) {\n    dr = (dr * 0xFFFF) / da;\n    dg = (dg * 0xFFFF) / da;\n    db = (db * 0xFFFF) / da;\n  }\n\n  // Combine components.\n  return (db << 0) | (dg << 16) | (dr << 32) | (da << 48);\n}\n\nstatic inline uint32_t  //\nwuffs_base__composite_nonpremul_premul_u32_axxx(uint32_t dst_nonpremul,\n                                                uint32_t src_premul) {\n  // Extract 16-bit color components.\n  uint32_t sa = 0x101 * (0xFF & (src_premul >> 24));\n  uint32_t sr = 0x101 * (0xFF & (src_premul >> 16));\n  uint32_t sg = 0x101 * (0xFF & (src_premul >> 8)" +
+	");\n  uint32_t sb = 0x101 * (0xFF & (src_premul >> 0));\n  uint32_t da = 0x101 * (0xFF & (dst_nonpremul >> 24));\n  uint32_t dr = 0x101 * (0xFF & (dst_nonpremul >> 16));\n  uint32_t dg = 0x101 * (0xFF & (dst_nonpremul >> 8));\n  uint32_t db = 0x101 * (0xFF & (dst_nonpremul >> 0));\n\n  // Convert dst from nonpremul to premul.\n  dr = (dr * da) / 0xFFFF;\n  dg = (dg * da) / 0xFFFF;\n  db = (db * da) / 0xFFFF;\n\n  // Calculate the inverse of the src-alpha: how much of the dst to keep.\n  uint32_t ia = 0xFFFF - sa;\n\n  // Composite src (premul) over dst (premul).\n  da = sa + ((da * ia) / 0xFFFF);\n  dr = sr + ((dr * ia) / 0xFFFF);\n  dg = sg + ((dg * ia) / 0xFFFF);\n  db = sb + ((db * ia) / 0xFFFF);\n\n  // Convert dst from premul to nonpremul.\n  if (da != 0) {\n    dr = (dr * 0xFFFF) / da;\n    dg = (dg * 0xFFFF) / da;\n    db = (db * 0xFFFF) / da;\n  }\n\n  // Convert from 16-bit color to 8-bit color.\n  da >>= 8;\n  dr >>= 8;\n  dg >>= 8;\n  db >>= 8;\n\n  // Combine components.\n  return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);\n}\n" +
+	"\nstatic inline uint32_t  //\nwuffs_base__composite_premul_nonpremul_u32_axxx(uint32_t dst_premul,\n                                                uint32_t src_nonpremul) {\n  // Extract 16-bit color components.\n  uint32_t sa = 0x101 * (0xFF & (src_nonpremul >> 24));\n  uint32_t sr = 0x101 * (0xFF & (src_nonpremul >> 16));\n  uint32_t sg = 0x101 * (0xFF & (src_nonpremul >> 8));\n  uint32_t sb = 0x101 * (0xFF & (src_nonpremul >> 0));\n  uint32_t da = 0x101 * (0xFF & (dst_premul >> 24));\n  uint32_t dr = 0x101 * (0xFF & (dst_premul >> 16));\n  uint32_t dg = 0x101 * (0xFF & (dst_premul >> 8));\n  uint32_t db = 0x101 * (0xFF & (dst_premul >> 0));\n\n  // Calculate the inverse of the src-alpha: how much of the dst to keep.\n  uint32_t ia = 0xFFFF - sa;\n\n  // Composite src (nonpremul) over dst (premul).\n  da = sa + ((da * ia) / 0xFFFF);\n  dr = ((sr * sa) + (dr * ia)) / 0xFFFF;\n  dg = ((sg * sa) + (dg * ia)) / 0xFFFF;\n  db = ((sb * sa) + (db * ia)) / 0xFFFF;\n\n  // Convert from 16-bit color to 8-bit color.\n  da >>= 8;\n  dr >>= 8;" +
+	"\n  dg >>= 8;\n  db >>= 8;\n\n  // Combine components.\n  return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);\n}\n\nstatic inline uint64_t  //\nwuffs_base__composite_premul_nonpremul_u64_axxx(uint64_t dst_premul,\n                                                uint64_t src_nonpremul) {\n  // Extract components.\n  uint64_t sa = 0xFFFF & (src_nonpremul >> 48);\n  uint64_t sr = 0xFFFF & (src_nonpremul >> 32);\n  uint64_t sg = 0xFFFF & (src_nonpremul >> 16);\n  uint64_t sb = 0xFFFF & (src_nonpremul >> 0);\n  uint64_t da = 0xFFFF & (dst_premul >> 48);\n  uint64_t dr = 0xFFFF & (dst_premul >> 32);\n  uint64_t dg = 0xFFFF & (dst_premul >> 16);\n  uint64_t db = 0xFFFF & (dst_premul >> 0);\n\n  // Calculate the inverse of the src-alpha: how much of the dst to keep.\n  uint64_t ia = 0xFFFF - sa;\n\n  // Composite src (nonpremul) over dst (premul).\n  da = sa + ((da * ia) / 0xFFFF);\n  dr = ((sr * sa) + (dr * ia)) / 0xFFFF;\n  dg = ((sg * sa) + (dg * ia)) / 0xFFFF;\n  db = ((sb * sa) + (db * ia)) / 0xFFFF;\n\n  // Combine components.\n  return " +
+	"(db << 0) | (dg << 16) | (dr << 32) | (da << 48);\n}\n\nstatic inline uint32_t  //\nwuffs_base__composite_premul_premul_u32_axxx(uint32_t dst_premul,\n                                             uint32_t src_premul) {\n  // Extract 16-bit color components.\n  uint32_t sa = 0x101 * (0xFF & (src_premul >> 24));\n  uint32_t sr = 0x101 * (0xFF & (src_premul >> 16));\n  uint32_t sg = 0x101 * (0xFF & (src_premul >> 8));\n  uint32_t sb = 0x101 * (0xFF & (src_premul >> 0));\n  uint32_t da = 0x101 * (0xFF & (dst_premul >> 24));\n  uint32_t dr = 0x101 * (0xFF & (dst_premul >> 16));\n  uint32_t dg = 0x101 * (0xFF & (dst_premul >> 8));\n  uint32_t db = 0x101 * (0xFF & (dst_premul >> 0));\n\n  // Calculate the inverse of the src-alpha: how much of the dst to keep.\n  uint32_t ia = 0xFFFF - sa;\n\n  // Composite src (premul) over dst (premul).\n  da = sa + ((da * ia) / 0xFFFF);\n  dr = sr + ((dr * ia) / 0xFFFF);\n  dg = sg + ((dg * ia) / 0xFFFF);\n  db = sb + ((db * ia) / 0xFFFF);\n\n  // Convert from 16-bit color to 8-bit color.\n  da >>= 8;\n  dr" +
+	" >>= 8;\n  dg >>= 8;\n  db >>= 8;\n\n  // Combine components.\n  return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);\n}\n\n" +
 	"" +
-	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__squash_bgr_565_888(wuffs_base__slice_u8 dst,\n                                               wuffs_base__slice_u8 src) {\n  size_t len4 = (dst.len < src.len ? dst.len : src.len) / 4;\n  uint8_t* d = dst.ptr;\n  const uint8_t* s = src.ptr;\n\n  size_t n = len4;\n  while (n--) {\n    uint32_t argb = wuffs_base__load_u32le__no_bounds_check(s);\n    uint32_t b5 = 0x1F & (argb >> (8 - 5));\n    uint32_t g6 = 0x3F & (argb >> (16 - 6));\n    uint32_t r5 = 0x1F & (argb >> (24 - 5));\n    uint32_t alpha = argb & 0xFF000000;\n    wuffs_base__store_u32le__no_bounds_check(\n        d, alpha | (r5 << 11) | (g6 << 5) | (b5 << 0));\n    s += 4;\n    d += 4;\n  }\n  return len4 * 4;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__swap_rgbx_bgrx(wuffs_base__slice_u8 dst,\n                                           wuffs_base__slice_u8 src) {\n  size_t len4 = (dst.len < src.len ? dst.len : src.len) / 4;\n  uint8_t* d = dst.ptr;\n  const uint8_t* s = src.ptr;\n\n  size_t n = len4;\n  while" +
-	" (n--) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    uint8_t b3 = s[3];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = b3;\n    s += 4;\n    d += 4;\n  }\n  return len4 * 4;\n}\n\n" +
+	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__squash_align4_bgr_565_888(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 src) {\n  size_t len = (dst.len < src.len ? dst.len : src.len) / 4;\n  uint8_t* d = dst.ptr;\n  const uint8_t* s = src.ptr;\n\n  size_t n = len;\n  while (n--) {\n    uint32_t argb = wuffs_base__load_u32le__no_bounds_check(s);\n    uint32_t b5 = 0x1F & (argb >> (8 - 5));\n    uint32_t g6 = 0x3F & (argb >> (16 - 6));\n    uint32_t r5 = 0x1F & (argb >> (24 - 5));\n    uint32_t alpha = argb & 0xFF000000;\n    wuffs_base__store_u32le__no_bounds_check(\n        d, alpha | (r5 << 11) | (g6 << 5) | (b5 << 0));\n    s += 4;\n    d += 4;\n  }\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__swap_rgbx_bgrx(wuffs_base__slice_u8 dst,\n                                           wuffs_base__slice_u8 src) {\n  size_t len = (dst.len < src.len ? dst.len : src.len) / 4;\n  uint8_t* d = dst.ptr;\n  const uint8_t* s = src.ptr;\n\n  size_t n = len;\n  while (n--) {\n    uint8_t b0 = s[0];\n    uint" +
+	"8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    uint8_t b3 = s[3];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = b3;\n    s += 4;\n    d += 4;\n  }\n  return len;\n}\n\n" +
+	"" +
+	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__squash_tight_4x8_4x16le(uint8_t* dst_ptr,\n                                                    size_t dst_len,\n                                                    uint8_t* dst_palette_ptr,\n                                                    size_t dst_palette_len,\n                                                    const uint8_t* src_ptr,\n                                                    size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len8 = src_len / 8;\n  size_t len = (dst_len4 < src_len8) ? dst_len4 : src_len8;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n\n  const size_t loop_unroll_count = 4;\n\n  size_t n = len;\n  while (n >= loop_unroll_count) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), wuffs_base__color_u64__as__color_u32(\n                         wuffs_base__load_u64le__no_bounds_check(s + (0 * 8))));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (1 * 4), wuffs_base__color_u64__as__" +
+	"color_u32(\n                         wuffs_base__load_u64le__no_bounds_check(s + (1 * 8))));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (2 * 4), wuffs_base__color_u64__as__color_u32(\n                         wuffs_base__load_u64le__no_bounds_check(s + (2 * 8))));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (3 * 4), wuffs_base__color_u64__as__color_u32(\n                         wuffs_base__load_u64le__no_bounds_check(s + (3 * 8))));\n\n    s += loop_unroll_count * 8;\n    d += loop_unroll_count * 4;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), wuffs_base__color_u64__as__color_u32(\n                         wuffs_base__load_u64le__no_bounds_check(s + (0 * 8))));\n\n    s += 1 * 8;\n    d += 1 * 4;\n    n -= 1;\n  }\n  return len;\n}\n\n" +
 	"" +
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__copy_1_1(uint8_t* dst_ptr,\n                                     size_t dst_len,\n                                     uint8_t* dst_palette_ptr,\n                                     size_t dst_palette_len,\n                                     const uint8_t* src_ptr,\n                                     size_t src_len) {\n  size_t len = (dst_len < src_len) ? dst_len : src_len;\n  if (len > 0) {\n    memmove(dst_ptr, src_ptr, len);\n  }\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__copy_3_3(uint8_t* dst_ptr,\n                                     size_t dst_len,\n                                     uint8_t* dst_palette_ptr,\n                                     size_t dst_palette_len,\n                                     const uint8_t* src_ptr,\n                                     size_t src_len) {\n  size_t dst_len3 = dst_len / 3;\n  size_t src_len3 = src_len / 3;\n  size_t len = (dst_len3 < src_len3) ? dst_len3 : src_len3;\n  if (len > 0) {\n  " +
 	"  memmove(dst_ptr, src_ptr, len * 3);\n  }\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__copy_4_4(uint8_t* dst_ptr,\n                                     size_t dst_len,\n                                     uint8_t* dst_palette_ptr,\n                                     size_t dst_palette_len,\n                                     const uint8_t* src_ptr,\n                                     size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len4 = src_len / 4;\n  size_t len = (dst_len4 < src_len4) ? dst_len4 : src_len4;\n  if (len > 0) {\n    memmove(dst_ptr, src_ptr, len * 4);\n  }\n  return len;\n}\n\n" +
 	"" +
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__bgr(uint8_t* dst_ptr,\n                                         size_t dst_len,\n                                         uint8_t* dst_palette_ptr,\n                                         size_t dst_palette_len,\n                                         const uint8_t* src_ptr,\n                                         size_t src_len) {\n  size_t dst_len2 = dst_len / 2;\n  size_t src_len3 = src_len / 3;\n  size_t len = (dst_len2 < src_len3) ? dst_len2 : src_len3;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t b5 = s[0] >> 3;\n    uint32_t g6 = s[1] >> 2;\n    uint32_t r5 = s[2] >> 3;\n    uint32_t rgb_565 = (r5 << 11) | (g6 << 5) | (b5 << 0);\n    wuffs_base__store_u16le__no_bounds_check(d + (0 * 2), (uint16_t)rgb_565);\n\n    s += 1 * 3;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src(\n    uint8_t*" +
-	" dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    const uint8_t* src_ptr,\n    size_t src_len) {\n  size_t dst_len2 = dst_len / 2;\n  size_t src_len4 = src_len / 4;\n  size_t len = (dst_len2 < src_len4) ? dst_len2 : src_len4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (0 * 2),\n        wuffs_base__color_u32_argb_premul__as__color_u16_rgb_565(\n            wuffs_base__color_u32_argb_nonpremul__as__color_u32_argb_premul(\n                wuffs_base__load_u32le__no_bounds_check(s + (0 * 4)))));\n\n    s += 1 * 4;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src_over(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    const uint8_t* src_ptr,\n    size_t src_len) {\n  size_t dst_len2 = dst_len / 2;\n  size_t src_len4 = src_len / 4;" +
-	"\n  size_t len = (dst_len2 < src_len4) ? dst_len2 : src_len4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    // Convert from 8-bit color to 16-bit color.\n    uint32_t sa = 0x101 * ((uint32_t)s[3]);\n    uint32_t sr = 0x101 * ((uint32_t)s[2]);\n    uint32_t sg = 0x101 * ((uint32_t)s[1]);\n    uint32_t sb = 0x101 * ((uint32_t)s[0]);\n\n    // Convert from 565 color to 16-bit color.\n    uint32_t old_rgb_565 = wuffs_base__load_u16le__no_bounds_check(d + (0 * 2));\n    uint32_t old_r5 = 0x1F & (old_rgb_565 >> 11);\n    uint32_t dr = (0x8421 * old_r5) >> 4;\n    uint32_t old_g6 = 0x3F & (old_rgb_565 >> 5);\n    uint32_t dg = (0x1041 * old_g6) >> 2;\n    uint32_t old_b5 = 0x1F & (old_rgb_565 >> 0);\n    uint32_t db = (0x8421 * old_b5) >> 4;\n\n    // Calculate the inverse of the src-alpha: how much of the dst to keep.\n    uint32_t ia = 0xFFFF - sa;\n\n    // Composite src (nonpremul) over dst (premul).\n    dr = ((sr * sa) + (dr * ia)) / 0xFFFF;\n    dg = ((sg * sa)" +
-	" + (dg * ia)) / 0xFFFF;\n    db = ((sb * sa) + (db * ia)) / 0xFFFF;\n\n    // Convert from 16-bit color to 565 color and combine the components.\n    uint32_t new_r5 = 0x1F & (dr >> 11);\n    uint32_t new_g6 = 0x3F & (dg >> 10);\n    uint32_t new_b5 = 0x1F & (db >> 11);\n    uint32_t new_rgb_565 = (new_r5 << 11) | (new_g6 << 5) | (new_b5 << 0);\n    wuffs_base__store_u16le__no_bounds_check(d + (0 * 2),\n                                             (uint16_t)new_rgb_565);\n\n    s += 1 * 4;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__y(uint8_t* dst_ptr,\n                                       size_t dst_len,\n                                       uint8_t* dst_palette_ptr,\n                                       size_t dst_palette_len,\n                                       const uint8_t* src_ptr,\n                                       size_t src_len) {\n  size_t dst_len2 = dst_len / 2;\n  size_t len = (dst_len2 < src_len) ? dst_len2 : src_len;\n  uint8_t* d = dst_" +
-	"ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t y5 = s[0] >> 3;\n    uint32_t y6 = s[0] >> 2;\n    uint32_t rgb_565 = (y5 << 11) | (y6 << 5) | (y5 << 0);\n    wuffs_base__store_u16le__no_bounds_check(d + (0 * 2), (uint16_t)rgb_565);\n\n    s += 1 * 1;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__index__src(uint8_t* dst_ptr,\n                                                size_t dst_len,\n                                                uint8_t* dst_palette_ptr,\n                                                size_t dst_palette_len,\n                                                const uint8_t* src_ptr,\n                                                size_t src_len) {\n  if (dst_palette_len != 1024) {\n    return 0;\n  }\n  size_t dst_len2 = dst_len / 2;\n  size_t len = (dst_len2 < src_len) ? dst_len2 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  const size_t loop_" +
-	"unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (0 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[0] * 4)));\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (1 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[1] * 4)));\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (2 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[2] * 4)));\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (3 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[3] * 4)));\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 2;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (0 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s" +
-	"[0] * 4)));\n\n    s += 1 * 1;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__index_binary_alpha__src_over(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    const uint8_t* src_ptr,\n    size_t src_len) {\n  if (dst_palette_len != 1024) {\n    return 0;\n  }\n  size_t dst_len2 = dst_len / 2;\n  size_t len = (dst_len2 < src_len) ? dst_len2 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette_ptr +\n                                                          ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u16le__no_bounds_check(d + (0 * 2), (uint16_t)s0);\n    }\n\n    s += 1 * 1;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
+	" dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    const uint8_t* src_ptr,\n    size_t src_len) {\n  size_t dst_len2 = dst_len / 2;\n  size_t src_len4 = src_len / 4;\n  size_t len = (dst_len2 < src_len4) ? dst_len2 : src_len4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (0 * 2),\n        wuffs_base__color_u32_argb_premul__as__color_u16_rgb_565(\n            wuffs_base__color_u32_argb_nonpremul__as__color_u32_argb_premul(\n                wuffs_base__load_u32le__no_bounds_check(s + (0 * 4)))));\n\n    s += 1 * 4;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul_4x16le__src(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    const uint8_t* src_ptr,\n    size_t src_len) {\n  size_t dst_len2 = dst_len / 2;\n  size_t src_len8 = src_len / " +
+	"8;\n  size_t len = (dst_len2 < src_len8) ? dst_len2 : src_len8;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (0 * 2),\n        wuffs_base__color_u32_argb_premul__as__color_u16_rgb_565(\n            wuffs_base__color_u64_argb_nonpremul__as__color_u32_argb_premul(\n                wuffs_base__load_u64le__no_bounds_check(s + (0 * 8)))));\n\n    s += 1 * 8;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src_over(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    const uint8_t* src_ptr,\n    size_t src_len) {\n  size_t dst_len2 = dst_len / 2;\n  size_t src_len4 = src_len / 4;\n  size_t len = (dst_len2 < src_len4) ? dst_len2 : src_len4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    // Extract 16-bit color " +
+	"components.\n    uint32_t sa = 0x101 * ((uint32_t)s[3]);\n    uint32_t sr = 0x101 * ((uint32_t)s[2]);\n    uint32_t sg = 0x101 * ((uint32_t)s[1]);\n    uint32_t sb = 0x101 * ((uint32_t)s[0]);\n\n    // Convert from 565 color to 16-bit color.\n    uint32_t old_rgb_565 = wuffs_base__load_u16le__no_bounds_check(d + (0 * 2));\n    uint32_t old_r5 = 0x1F & (old_rgb_565 >> 11);\n    uint32_t dr = (0x8421 * old_r5) >> 4;\n    uint32_t old_g6 = 0x3F & (old_rgb_565 >> 5);\n    uint32_t dg = (0x1041 * old_g6) >> 2;\n    uint32_t old_b5 = 0x1F & (old_rgb_565 >> 0);\n    uint32_t db = (0x8421 * old_b5) >> 4;\n\n    // Calculate the inverse of the src-alpha: how much of the dst to keep.\n    uint32_t ia = 0xFFFF - sa;\n\n    // Composite src (nonpremul) over dst (premul).\n    dr = ((sr * sa) + (dr * ia)) / 0xFFFF;\n    dg = ((sg * sa) + (dg * ia)) / 0xFFFF;\n    db = ((sb * sa) + (db * ia)) / 0xFFFF;\n\n    // Convert from 16-bit color to 565 color and combine the components.\n    uint32_t new_r5 = 0x1F & (dr >> 11);\n    uint32_t new_g6 = 0x3F " +
+	"& (dg >> 10);\n    uint32_t new_b5 = 0x1F & (db >> 11);\n    uint32_t new_rgb_565 = (new_r5 << 11) | (new_g6 << 5) | (new_b5 << 0);\n    wuffs_base__store_u16le__no_bounds_check(d + (0 * 2),\n                                             (uint16_t)new_rgb_565);\n\n    s += 1 * 4;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul_4x16le__src_over(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    const uint8_t* src_ptr,\n    size_t src_len) {\n  size_t dst_len2 = dst_len / 2;\n  size_t src_len8 = src_len / 8;\n  size_t len = (dst_len2 < src_len8) ? dst_len2 : src_len8;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    // Extract 16-bit color components.\n    uint32_t sa = ((uint32_t)wuffs_base__load_u16le__no_bounds_check(s + 6));\n    uint32_t sr = ((uint32_t)wuffs_base__load_u16le__no_bounds_check(s + 4));\n    uint32_t sg = ((uint32" +
+	"_t)wuffs_base__load_u16le__no_bounds_check(s + 2));\n    uint32_t sb = ((uint32_t)wuffs_base__load_u16le__no_bounds_check(s + 0));\n\n    // Convert from 565 color to 16-bit color.\n    uint32_t old_rgb_565 = wuffs_base__load_u16le__no_bounds_check(d + (0 * 2));\n    uint32_t old_r5 = 0x1F & (old_rgb_565 >> 11);\n    uint32_t dr = (0x8421 * old_r5) >> 4;\n    uint32_t old_g6 = 0x3F & (old_rgb_565 >> 5);\n    uint32_t dg = (0x1041 * old_g6) >> 2;\n    uint32_t old_b5 = 0x1F & (old_rgb_565 >> 0);\n    uint32_t db = (0x8421 * old_b5) >> 4;\n\n    // Calculate the inverse of the src-alpha: how much of the dst to keep.\n    uint32_t ia = 0xFFFF - sa;\n\n    // Composite src (nonpremul) over dst (premul).\n    dr = ((sr * sa) + (dr * ia)) / 0xFFFF;\n    dg = ((sg * sa) + (dg * ia)) / 0xFFFF;\n    db = ((sb * sa) + (db * ia)) / 0xFFFF;\n\n    // Convert from 16-bit color to 565 color and combine the components.\n    uint32_t new_r5 = 0x1F & (dr >> 11);\n    uint32_t new_g6 = 0x3F & (dg >> 10);\n    uint32_t new_b5 = 0x1F & (db >> 11);\n   " +
+	" uint32_t new_rgb_565 = (new_r5 << 11) | (new_g6 << 5) | (new_b5 << 0);\n    wuffs_base__store_u16le__no_bounds_check(d + (0 * 2),\n                                             (uint16_t)new_rgb_565);\n\n    s += 1 * 8;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__y(uint8_t* dst_ptr,\n                                       size_t dst_len,\n                                       uint8_t* dst_palette_ptr,\n                                       size_t dst_palette_len,\n                                       const uint8_t* src_ptr,\n                                       size_t src_len) {\n  size_t dst_len2 = dst_len / 2;\n  size_t len = (dst_len2 < src_len) ? dst_len2 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t y5 = s[0] >> 3;\n    uint32_t y6 = s[0] >> 2;\n    uint32_t rgb_565 = (y5 << 11) | (y6 << 5) | (y5 << 0);\n    wuffs_base__store_u16le__no_bounds_check(d + (0 * 2), " +
+	"(uint16_t)rgb_565);\n\n    s += 1 * 1;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__index__src(uint8_t* dst_ptr,\n                                                size_t dst_len,\n                                                uint8_t* dst_palette_ptr,\n                                                size_t dst_palette_len,\n                                                const uint8_t* src_ptr,\n                                                size_t src_len) {\n  if (dst_palette_len != 1024) {\n    return 0;\n  }\n  size_t dst_len2 = dst_len / 2;\n  size_t len = (dst_len2 < src_len) ? dst_len2 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (0 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[0] * 4)));\n    wuffs_base__store_u16le__no_bounds_ch" +
+	"eck(\n        d + (1 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[1] * 4)));\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (2 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[2] * 4)));\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (3 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[3] * 4)));\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 2;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (0 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[0] * 4)));\n\n    s += 1 * 1;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__index_binary_alpha__src_over(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n  " +
+	"  const uint8_t* src_ptr,\n    size_t src_len) {\n  if (dst_palette_len != 1024) {\n    return 0;\n  }\n  size_t dst_len2 = dst_len / 2;\n  size_t len = (dst_len2 < src_len) ? dst_len2 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette_ptr +\n                                                          ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u16le__no_bounds_check(d + (0 * 2), (uint16_t)s0);\n    }\n\n    s += 1 * 1;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
 	"" +
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src(uint8_t* dst_ptr,\n                                                     size_t dst_len,\n                                                     uint8_t* dst_palette_ptr,\n                                                     size_t dst_palette_len,\n                                                     const uint8_t* src_ptr,\n                                                     size_t src_len) {\n  size_t dst_len3 = dst_len / 3;\n  size_t src_len4 = src_len / 4;\n  size_t len = (dst_len3 < src_len4) ? dst_len3 : src_len4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t s0 =\n        wuffs_base__color_u32_argb_nonpremul__as__color_u32_argb_premul(\n            wuffs_base__load_u32le__no_bounds_check(s + (0 * 4)));\n    wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);\n\n    s += 1 * 4;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nw" +
-	"uffs_base__pixel_swizzler__bgr__bgra_nonpremul__src_over(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    const uint8_t* src_ptr,\n    size_t src_len) {\n  size_t dst_len3 = dst_len / 3;\n  size_t src_len4 = src_len / 4;\n  size_t len = (dst_len3 < src_len4) ? dst_len3 : src_len4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    // Convert from 8-bit color to 16-bit color.\n    uint32_t sa = 0x101 * ((uint32_t)s[3]);\n    uint32_t sr = 0x101 * ((uint32_t)s[2]);\n    uint32_t sg = 0x101 * ((uint32_t)s[1]);\n    uint32_t sb = 0x101 * ((uint32_t)s[0]);\n    uint32_t dr = 0x101 * ((uint32_t)d[2]);\n    uint32_t dg = 0x101 * ((uint32_t)d[1]);\n    uint32_t db = 0x101 * ((uint32_t)d[0]);\n\n    // Calculate the inverse of the src-alpha: how much of the dst to keep.\n    uint32_t ia = 0xFFFF - sa;\n\n    // Composite src (nonpremul) over dst (premul).\n    dr = ((sr * sa) + (dr * ia)) / 0xFFFF;\n    dg = ((sg * s" +
-	"a) + (dg * ia)) / 0xFFFF;\n    db = ((sb * sa) + (db * ia)) / 0xFFFF;\n\n    // Convert from 16-bit color to 8-bit color.\n    d[0] = (uint8_t)(db >> 8);\n    d[1] = (uint8_t)(dg >> 8);\n    d[2] = (uint8_t)(dr >> 8);\n\n    s += 1 * 4;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
+	"uffs_base__pixel_swizzler__bgr__bgra_nonpremul_4x16le__src(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    const uint8_t* src_ptr,\n    size_t src_len) {\n  size_t dst_len3 = dst_len / 3;\n  size_t src_len8 = src_len / 8;\n  size_t len = (dst_len3 < src_len8) ? dst_len3 : src_len8;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t s0 =\n        wuffs_base__color_u64_argb_nonpremul__as__color_u32_argb_premul(\n            wuffs_base__load_u64le__no_bounds_check(s + (0 * 8)));\n    wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);\n\n    s += 1 * 8;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src_over(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    const uint8_t* src_ptr,\n    size_t src_len) {\n  size_t dst_len3 = dst_len / 3;\n  size_t src_len4 = src_le" +
+	"n / 4;\n  size_t len = (dst_len3 < src_len4) ? dst_len3 : src_len4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    // Extract 16-bit color components.\n    uint32_t sa = 0x101 * ((uint32_t)s[3]);\n    uint32_t sr = 0x101 * ((uint32_t)s[2]);\n    uint32_t sg = 0x101 * ((uint32_t)s[1]);\n    uint32_t sb = 0x101 * ((uint32_t)s[0]);\n    uint32_t dr = 0x101 * ((uint32_t)d[2]);\n    uint32_t dg = 0x101 * ((uint32_t)d[1]);\n    uint32_t db = 0x101 * ((uint32_t)d[0]);\n\n    // Calculate the inverse of the src-alpha: how much of the dst to keep.\n    uint32_t ia = 0xFFFF - sa;\n\n    // Composite src (nonpremul) over dst (premul).\n    dr = ((sr * sa) + (dr * ia)) / 0xFFFF;\n    dg = ((sg * sa) + (dg * ia)) / 0xFFFF;\n    db = ((sb * sa) + (db * ia)) / 0xFFFF;\n\n    // Convert from 16-bit color to 8-bit color.\n    d[0] = (uint8_t)(db >> 8);\n    d[1] = (uint8_t)(dg >> 8);\n    d[2] = (uint8_t)(dr >> 8);\n\n    s += 1 * 4;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;\n" +
+	"}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr__bgra_nonpremul_4x16le__src_over(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    const uint8_t* src_ptr,\n    size_t src_len) {\n  size_t dst_len3 = dst_len / 3;\n  size_t src_len8 = src_len / 8;\n  size_t len = (dst_len3 < src_len8) ? dst_len3 : src_len8;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    // Extract 16-bit color components.\n    uint32_t sa = ((uint32_t)wuffs_base__load_u16le__no_bounds_check(s + 6));\n    uint32_t sr = ((uint32_t)wuffs_base__load_u16le__no_bounds_check(s + 4));\n    uint32_t sg = ((uint32_t)wuffs_base__load_u16le__no_bounds_check(s + 2));\n    uint32_t sb = ((uint32_t)wuffs_base__load_u16le__no_bounds_check(s + 0));\n    uint32_t dr = 0x101 * ((uint32_t)d[2]);\n    uint32_t dg = 0x101 * ((uint32_t)d[1]);\n    uint32_t db = 0x101 * ((uint32_t)d[0]);\n\n    // Calculate the inverse of the src-alpha: how much of the" +
+	" dst to keep.\n    uint32_t ia = 0xFFFF - sa;\n\n    // Composite src (nonpremul) over dst (premul).\n    dr = ((sr * sa) + (dr * ia)) / 0xFFFF;\n    dg = ((sg * sa) + (dg * ia)) / 0xFFFF;\n    db = ((sb * sa) + (db * ia)) / 0xFFFF;\n\n    // Convert from 16-bit color to 8-bit color.\n    d[0] = (uint8_t)(db >> 8);\n    d[1] = (uint8_t)(dg >> 8);\n    d[2] = (uint8_t)(dr >> 8);\n\n    s += 1 * 8;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
 	"" +
-	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul__src_over(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    const uint8_t* src_ptr,\n    size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len4 = src_len / 4;\n  size_t len = (dst_len4 < src_len4) ? dst_len4 : src_len4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t d0 = wuffs_base__load_u32le__no_bounds_check(d + (0 * 4));\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(s + (0 * 4));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4),\n        wuffs_base__composite_nonpremul_nonpremul_u32_axxx(d0, s0));\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
+	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul__src_over(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    const uint8_t* src_ptr,\n    size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len4 = src_len / 4;\n  size_t len = (dst_len4 < src_len4) ? dst_len4 : src_len4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  while (n >= 1) {\n    uint32_t d0 = wuffs_base__load_u32le__no_bounds_check(d + (0 * 4));\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(s + (0 * 4));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4),\n        wuffs_base__composite_nonpremul_nonpremul_u32_axxx(d0, s0));\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul_4x16le__src_over(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    const" +
+	" uint8_t* src_ptr,\n    size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len8 = src_len / 8;\n  size_t len = (dst_len4 < src_len8) ? dst_len4 : src_len8;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  while (n >= 1) {\n    uint64_t d0 = wuffs_base__color_u32__as__color_u64(\n        wuffs_base__load_u32le__no_bounds_check(d + (0 * 4)));\n    uint64_t s0 = wuffs_base__load_u64le__no_bounds_check(s + (0 * 8));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4),\n        wuffs_base__color_u64__as__color_u32(\n            wuffs_base__composite_nonpremul_nonpremul_u64_axxx(d0, s0)));\n\n    s += 1 * 8;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
 	"" +
-	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    const uint8_t* src_ptr,\n    size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len4 = src_len / 4;\n  size_t len = (dst_len4 < src_len4) ? dst_len4 : src_len4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(s + (0 * 4));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4),\n        wuffs_base__color_u32_argb_nonpremul__as__color_u32_argb_premul(s0));\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    const uint8_t* src_ptr,\n    size_t src_len) {\n  size_t dst_len4 = " +
-	"dst_len / 4;\n  size_t src_len4 = src_len / 4;\n  size_t len = (dst_len4 < src_len4) ? dst_len4 : src_len4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t d0 = wuffs_base__load_u32le__no_bounds_check(d + (0 * 4));\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(s + (0 * 4));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), wuffs_base__composite_premul_nonpremul_u32_axxx(d0, s0));\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
+	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    const uint8_t* src_ptr,\n    size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len4 = src_len / 4;\n  size_t len = (dst_len4 < src_len4) ? dst_len4 : src_len4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(s + (0 * 4));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4),\n        wuffs_base__color_u32_argb_nonpremul__as__color_u32_argb_premul(s0));\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul_4x16le__src(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    const uint8_t* src_ptr,\n    size_t src_len) {\n  size_t dst_len4 " +
+	"= dst_len / 4;\n  size_t src_len8 = src_len / 8;\n  size_t len = (dst_len4 < src_len8) ? dst_len4 : src_len8;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint64_t s0 = wuffs_base__load_u64le__no_bounds_check(s + (0 * 8));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4),\n        wuffs_base__color_u64_argb_nonpremul__as__color_u32_argb_premul(s0));\n\n    s += 1 * 8;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    const uint8_t* src_ptr,\n    size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len4 = src_len / 4;\n  size_t len = (dst_len4 < src_len4) ? dst_len4 : src_len4;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t d0 = wuffs_base__load_u32le__no_" +
+	"bounds_check(d + (0 * 4));\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(s + (0 * 4));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), wuffs_base__composite_premul_nonpremul_u32_axxx(d0, s0));\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul_4x16le__src_over(\n    uint8_t* dst_ptr,\n    size_t dst_len,\n    uint8_t* dst_palette_ptr,\n    size_t dst_palette_len,\n    const uint8_t* src_ptr,\n    size_t src_len) {\n  size_t dst_len4 = dst_len / 4;\n  size_t src_len8 = src_len / 8;\n  size_t len = (dst_len4 < src_len8) ? dst_len4 : src_len8;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint64_t d0 = wuffs_base__color_u32__as__color_u64(\n        wuffs_base__load_u32le__no_bounds_check(d + (0 * 4)));\n    uint64_t s0 = wuffs_base__load_u64le__no_bounds_check(s + (0 * 8));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0" +
+	" * 4),\n        wuffs_base__color_u64__as__color_u32(\n            wuffs_base__composite_premul_nonpremul_u64_axxx(d0, s0)));\n\n    s += 1 * 8;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
 	"" +
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxx__index__src(uint8_t* dst_ptr,\n                                            size_t dst_len,\n                                            uint8_t* dst_palette_ptr,\n                                            size_t dst_palette_len,\n                                            const uint8_t* src_ptr,\n                                            size_t src_len) {\n  if (dst_palette_len != 1024) {\n    return 0;\n  }\n  size_t dst_len3 = dst_len / 3;\n  size_t len = (dst_len3 < src_len) ? dst_len3 : src_len;\n  uint8_t* d = dst_ptr;\n  const uint8_t* s = src_ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  // The comparison in the while condition is \">\", not \">=\", because with\n  // \">=\", the last 4-byte store could write past the end of the dst slice.\n  //\n  // Each 4-byte store writes one too many bytes, but a subsequent store\n  // will overwrite that with the correct byte. There is always another\n  // store, whether a 4-byte store in this loop" +
 	" or a 1-byte store in the\n  // next loop.\n  while (n > loop_unroll_count) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 3), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[0] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (1 * 3), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[1] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (2 * 3), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[2] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (3 * 3), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette_ptr + ((size_t)s[3] * 4)));\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 3;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette_ptr +\n                                                          ((siz" +
@@ -569,18 +585,20 @@
 	"0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
 	"" +
 	"// --------\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__y(wuffs_base__pixel_swizzler* p,\n                                       wuffs_base__pixel_format dst_pixfmt,\n                                       wuffs_base__slice_u8 dst_palette,\n                                       wuffs_base__slice_u8 src_palette,\n                                       wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__y;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      return wuffs_base__pixel_swizzler__xxx__y;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WU" +
-	"FFS_BASE__PIXEL_FORMAT__RGBX:\n      return wuffs_base__pixel_swizzler__xxxx__y;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__indexed__bgra_binary(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_pixfmt,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__copy_1_1;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      if (wuffs_base__pixel_swizzler__squash_bgr_565_888(dst_palette,\n                                        " +
-	"                 src_palette) != 1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_pa" +
-	"lette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(dst_palette,\n                                                     src_palette) != 1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n      if (wuffs_base__pixel_swizzler__" +
-	"swap_rgbx_bgrx(dst_palette,\n                                                     src_palette) != 1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgr(wuffs_base__pixel_swizzler* p,\n                                         wuffs_base__pixel_format dst_pixfmt,\n                                         wuffs_base__slice_u8 dst_palette,\n                                         wuffs_base__slice_u8 src_palette,\n                                         wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__bgr;\n\n    case WUFFS_BASE__PIXEL_FORMAT__B" +
-	"GR:\n      return wuffs_base__pixel_swizzler__copy_3_3;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      return wuffs_base__pixel_swizzler__xxxx__xxx;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgra_nonpremul(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_pixfmt,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_b" +
-	"ase__pixel_swizzler__bgr_565__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__copy_4_4;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swiz" +
-	"zler__bgra_premul__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\n" +
+	"FFS_BASE__PIXEL_FORMAT__RGBX:\n      return wuffs_base__pixel_swizzler__xxxx__y;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__indexed__bgra_binary(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_pixfmt,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__copy_1_1;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      if (wuffs_base__pixel_swizzler__squash_align4_bgr_565_888(\n              dst_palette, src_palette) != 25" +
+	"6) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        " +
+	"return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(dst_palette,\n                                                     src_palette) != 256) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(dst_palette,\n         " +
+	"                                            src_palette) != 256) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgr(wuffs_base__pixel_swizzler* p,\n                                         wuffs_base__pixel_format dst_pixfmt,\n                                         wuffs_base__slice_u8 dst_palette,\n                                         wuffs_base__slice_u8 src_palette,\n                                         wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__bgr;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      return wuffs_base__pixel_swi" +
+	"zzler__copy_3_3;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      return wuffs_base__pixel_swizzler__xxxx__xxx;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgra_nonpremul(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_pixfmt,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__bgra_non" +
+	"premul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__copy_4_4;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src" +
+	";\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgra_nonpremul_4x16le(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_pixfmt,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel" +
+	"_swizzler__bgr_565__bgra_nonpremul_4x16le__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul_4x16le__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul_4x16le__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul_4x16le__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__squash_tight_4x8_4x16le;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul_4x16le__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:" +
+	"\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul_4x16le__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul_4x16le__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\n" +
 	"" +
 	"// --------\n\nWUFFS_BASE__MAYBE_STATIC wuffs_base__status  //\nwuffs_base__pixel_swizzler__prepare(wuffs_base__pixel_swizzler* p,\n                                    wuffs_base__pixel_format dst_pixfmt,\n                                    wuffs_base__slice_u8 dst_palette,\n                                    wuffs_base__pixel_format src_pixfmt,\n                                    wuffs_base__slice_u8 src_palette,\n                                    wuffs_base__pixel_blend blend) {\n  if (!p) {\n    return wuffs_base__make_status(wuffs_base__error__bad_receiver);\n  }\n  p->private_impl.func = NULL;\n  p->private_impl.src_pixfmt_bytes_per_pixel = 0;\n\n  wuffs_base__pixel_swizzler__func func = NULL;\n  uint32_t src_pixfmt_bits_per_pixel =\n      wuffs_base__pixel_format__bits_per_pixel(&src_pixfmt);\n  if ((src_pixfmt_bits_per_pixel == 0) ||\n      ((src_pixfmt_bits_per_pixel & 7) != 0)) {\n    return wuffs_base__make_status(\n        wuffs_base__error__unsupported_pixel_swizzler_option);\n  }\n\n  // TODO: support many more for" +
-	"mats.\n\n  switch (src_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__Y:\n      func = wuffs_base__pixel_swizzler__prepare__y(p, dst_pixfmt, dst_palette,\n                                                    src_palette, blend);\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:\n      func = wuffs_base__pixel_swizzler__prepare__indexed__bgra_binary(\n          p, dst_pixfmt, dst_palette, src_palette, blend);\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      func = wuffs_base__pixel_swizzler__prepare__bgr(\n          p, dst_pixfmt, dst_palette, src_palette, blend);\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      func = wuffs_base__pixel_swizzler__prepare__bgra_nonpremul(\n          p, dst_pixfmt, dst_palette, src_palette, blend);\n      break;\n  }\n\n  p->private_impl.func = func;\n  p->private_impl.src_pixfmt_bytes_per_pixel = src_pixfmt_bits_per_pixel / 8;\n  return wuffs_base__make_status(\n      func ? NULL : wuffs_base__error__unsupported_pixel_swizzler_option);\n}" +
-	"\n\nWUFFS_BASE__MAYBE_STATIC uint64_t  //\nwuffs_base__pixel_swizzler__swizzle_interleaved_from_reader(\n    const wuffs_base__pixel_swizzler* p,\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    const uint8_t** ptr_iop_r,\n    const uint8_t* io2_r) {\n  if (p && p->private_impl.func) {\n    const uint8_t* iop_r = *ptr_iop_r;\n    uint64_t n = (*p->private_impl.func)(dst.ptr, dst.len, dst_palette.ptr,\n                                         dst_palette.len, iop_r,\n                                         (size_t)(io2_r - iop_r));\n    *ptr_iop_r += n * p->private_impl.src_pixfmt_bytes_per_pixel;\n    return n;\n  }\n  return 0;\n}\n\nWUFFS_BASE__MAYBE_STATIC uint64_t  //\nwuffs_base__pixel_swizzler__swizzle_interleaved_from_slice(\n    const wuffs_base__pixel_swizzler* p,\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  if (p && p->private_impl.func) {\n    return (*p->private_impl.func)(dst.ptr, dst.len, dst_palette.ptr,\n                              " +
-	"     dst_palette.len, src.ptr, src.len);\n  }\n  return 0;\n}\n" +
+	"mats.\n\n  switch (src_pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__Y:\n      func = wuffs_base__pixel_swizzler__prepare__y(p, dst_pixfmt, dst_palette,\n                                                    src_palette, blend);\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:\n      func = wuffs_base__pixel_swizzler__prepare__indexed__bgra_binary(\n          p, dst_pixfmt, dst_palette, src_palette, blend);\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      func = wuffs_base__pixel_swizzler__prepare__bgr(\n          p, dst_pixfmt, dst_palette, src_palette, blend);\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      func = wuffs_base__pixel_swizzler__prepare__bgra_nonpremul(\n          p, dst_pixfmt, dst_palette, src_palette, blend);\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE:\n      func = wuffs_base__pixel_swizzler__prepare__bgra_nonpremul_4x16le(\n          p, dst_pixfmt, dst_palette, src_palette, blend);\n      break;\n  }\n\n  p->private_im" +
+	"pl.func = func;\n  p->private_impl.src_pixfmt_bytes_per_pixel = src_pixfmt_bits_per_pixel / 8;\n  return wuffs_base__make_status(\n      func ? NULL : wuffs_base__error__unsupported_pixel_swizzler_option);\n}\n\nWUFFS_BASE__MAYBE_STATIC uint64_t  //\nwuffs_base__pixel_swizzler__swizzle_interleaved_from_reader(\n    const wuffs_base__pixel_swizzler* p,\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    const uint8_t** ptr_iop_r,\n    const uint8_t* io2_r) {\n  if (p && p->private_impl.func) {\n    const uint8_t* iop_r = *ptr_iop_r;\n    uint64_t n = (*p->private_impl.func)(dst.ptr, dst.len, dst_palette.ptr,\n                                         dst_palette.len, iop_r,\n                                         (size_t)(io2_r - iop_r));\n    *ptr_iop_r += n * p->private_impl.src_pixfmt_bytes_per_pixel;\n    return n;\n  }\n  return 0;\n}\n\nWUFFS_BASE__MAYBE_STATIC uint64_t  //\nwuffs_base__pixel_swizzler__swizzle_interleaved_from_slice(\n    const wuffs_base__pixel_swizzler* p,\n    wuffs_base__slice_u8 dst,\n " +
+	"   wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  if (p && p->private_impl.func) {\n    return (*p->private_impl.func)(dst.ptr, dst.len, dst_palette.ptr,\n                                   dst_palette.len, src.ptr, src.len);\n  }\n  return 0;\n}\n" +
 	""
 
 const BaseUTF8SubmoduleC = "" +
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index 6ff5a12..b414577 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -60,13 +60,17 @@
 	{t.IDU32, "0x80000565", "PIXEL_FORMAT__BGR_565"},
 	{t.IDU32, "0x80000888", "PIXEL_FORMAT__BGR"},
 	{t.IDU32, "0x81008888", "PIXEL_FORMAT__BGRA_NONPREMUL"},
+	{t.IDU32, "0x8100BBBB", "PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE"},
 	{t.IDU32, "0x82008888", "PIXEL_FORMAT__BGRA_PREMUL"},
+	{t.IDU32, "0x8200BBBB", "PIXEL_FORMAT__BGRA_PREMUL_4X16LE"},
 	{t.IDU32, "0x83008888", "PIXEL_FORMAT__BGRA_BINARY"},
 	{t.IDU32, "0x90008888", "PIXEL_FORMAT__BGRX"},
 
 	{t.IDU32, "0xA0000888", "PIXEL_FORMAT__RGB"},
 	{t.IDU32, "0xA1008888", "PIXEL_FORMAT__RGBA_NONPREMUL"},
+	{t.IDU32, "0xA100BBBB", "PIXEL_FORMAT__RGBA_NONPREMUL_4X16LE"},
 	{t.IDU32, "0xA2008888", "PIXEL_FORMAT__RGBA_PREMUL"},
+	{t.IDU32, "0xA200BBBB", "PIXEL_FORMAT__RGBA_PREMUL_4X16LE"},
 	{t.IDU32, "0xA3008888", "PIXEL_FORMAT__RGBA_BINARY"},
 	{t.IDU32, "0xB0008888", "PIXEL_FORMAT__RGBX"},
 
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 8c6dfa4..3987662 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -2866,6 +2866,68 @@
   return (a << 24) | (r << 16) | (g << 8) | (b << 0);
 }
 
+// wuffs_base__color_u64_argb_nonpremul__as__color_u32_argb_premul converts
+// from 4x16LE non-premultiplied alpha to 4x8 premultiplied alpha.
+static inline wuffs_base__color_u32_argb_premul  //
+wuffs_base__color_u64_argb_nonpremul__as__color_u32_argb_premul(
+    uint64_t argb_nonpremul) {
+  uint32_t a16 = 0xFFFF & (argb_nonpremul >> 48);  // Alpha: bits 48..63.
+  // Scale each 16-bit channel by a16 / 0xFFFF; the products fit in 32 bits.
+  uint32_t r16 = 0xFFFF & (argb_nonpremul >> 32);
+  r16 = (r16 * a16) / 0xFFFF;
+  uint32_t g16 = 0xFFFF & (argb_nonpremul >> 16);
+  g16 = (g16 * a16) / 0xFFFF;
+  uint32_t b16 = 0xFFFF & (argb_nonpremul >> 0);
+  b16 = (b16 * a16) / 0xFFFF;
+  // Keep the high 8 bits of every channel and pack them as 0xAARRGGBB.
+  return ((a16 >> 8) << 24) | ((r16 >> 8) << 16) | ((g16 >> 8) << 8) |
+         ((b16 >> 8) << 0);
+}
+
+// wuffs_base__color_u32_argb_premul__as__color_u64_argb_nonpremul converts
+// from 4x8 premultiplied alpha to 4x16LE non-premultiplied alpha.
+static inline uint64_t  //
+wuffs_base__color_u32_argb_premul__as__color_u64_argb_nonpremul(
+    wuffs_base__color_u32_argb_premul c) {
+  uint32_t a = 0xFF & (c >> 24);
+  if (a == 0xFF) {  // Opaque: no division needed, widen 0xNN to 0xNNNN.
+    uint64_t r16 = 0x101 * (0xFF & (c >> 16));
+    uint64_t g16 = 0x101 * (0xFF & (c >> 8));
+    uint64_t b16 = 0x101 * (0xFF & (c >> 0));
+    return 0xFFFF000000000000u | (r16 << 32) | (g16 << 16) | (b16 << 0);
+  } else if (a == 0) {  // Zero alpha: return all zeroes, skip dividing by 0.
+    return 0;
+  }
+  uint64_t a16 = a * 0x101;
+  // Un-premultiply: (channel * 0x101) * 0xFFFF / a16, a 16-bit-scale quotient.
+  uint64_t r = 0xFF & (c >> 16);
+  uint64_t r16 = (r * (0x101 * 0xFFFF)) / a16;
+  uint64_t g = 0xFF & (c >> 8);
+  uint64_t g16 = (g * (0x101 * 0xFFFF)) / a16;
+  uint64_t b = 0xFF & (c >> 0);
+  uint64_t b16 = (b * (0x101 * 0xFFFF)) / a16;
+  // NOTE(review): a channel > a (invalid premul) overflows its 16-bit field.
+  return (a16 << 48) | (r16 << 32) | (g16 << 16) | (b16 << 0);
+}
+
+static inline uint64_t  // Widens 4x8 to 4x16: each byte 0xNN -> 0xNNNN.
+wuffs_base__color_u32__as__color_u64(uint32_t c) {
+  uint64_t a16 = 0x101 * (0xFF & (c >> 24));  // From bits 24..31 of c.
+  uint64_t r16 = 0x101 * (0xFF & (c >> 16));  // From bits 16..23.
+  uint64_t g16 = 0x101 * (0xFF & (c >> 8));   // From bits 8..15.
+  uint64_t b16 = 0x101 * (0xFF & (c >> 0));   // From bits 0..7.
+  return (a16 << 48) | (r16 << 32) | (g16 << 16) | (b16 << 0);
+}
+
+static inline uint32_t  // Narrows 4x16 to 4x8: keeps each field's high byte.
+wuffs_base__color_u64__as__color_u32(uint64_t c) {
+  uint32_t a = ((uint32_t)(0xFF & (c >> 56)));  // High byte of bits 48..63.
+  uint32_t r = ((uint32_t)(0xFF & (c >> 40)));  // High byte of bits 32..47.
+  uint32_t g = ((uint32_t)(0xFF & (c >> 24)));  // High byte of bits 16..31.
+  uint32_t b = ((uint32_t)(0xFF & (c >> 8)));   // High byte of bits 0..15.
+  return (a << 24) | (r << 16) | (g << 8) | (b << 0);
+}
+
 // --------
 
 typedef uint8_t wuffs_base__pixel_blend;
@@ -2953,13 +3015,17 @@
 #define WUFFS_BASE__PIXEL_FORMAT__BGR_565 0x80000565
 #define WUFFS_BASE__PIXEL_FORMAT__BGR 0x80000888
 #define WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL 0x81008888
+#define WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE 0x8100BBBB
 #define WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL 0x82008888
+#define WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL_4X16LE 0x8200BBBB
 #define WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY 0x83008888
 #define WUFFS_BASE__PIXEL_FORMAT__BGRX 0x90008888
 
 #define WUFFS_BASE__PIXEL_FORMAT__RGB 0xA0000888
 #define WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL 0xA1008888
+#define WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL_4X16LE 0xA100BBBB
 #define WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL 0xA2008888
+#define WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL_4X16LE 0xA200BBBB
 #define WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY 0xA3008888
 #define WUFFS_BASE__PIXEL_FORMAT__RGBX 0xB0008888
 
@@ -13466,6 +13532,9 @@
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:
       return wuffs_base__color_u32_argb_nonpremul__as__color_u32_argb_premul(
           wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x))));
+    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE:
+      return wuffs_base__color_u64_argb_nonpremul__as__color_u32_argb_premul(
+          wuffs_base__load_u64le__no_bounds_check(row + (8 * ((size_t)x))));
     case WUFFS_BASE__PIXEL_FORMAT__BGRX:
       return 0xFF000000 |
              wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x)));
@@ -13553,6 +13622,12 @@
           wuffs_base__color_u32_argb_premul__as__color_u32_argb_nonpremul(
               color));
       break;
+    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE:
+      wuffs_base__store_u64le__no_bounds_check(
+          row + (8 * ((size_t)x)),
+          wuffs_base__color_u32_argb_premul__as__color_u64_argb_nonpremul(
+              color));
+      break;
 
     case WUFFS_BASE__PIXEL_FORMAT__RGB:
       wuffs_base__store_u24le__no_bounds_check(
@@ -13645,7 +13720,7 @@
 static inline uint32_t  //
 wuffs_base__composite_nonpremul_nonpremul_u32_axxx(uint32_t dst_nonpremul,
                                                    uint32_t src_nonpremul) {
-  // Convert from 8-bit color to 16-bit color.
+  // Extract 16-bit color components.
   uint32_t sa = 0x101 * (0xFF & (src_nonpremul >> 24));
   uint32_t sr = 0x101 * (0xFF & (src_nonpremul >> 16));
   uint32_t sg = 0x101 * (0xFF & (src_nonpremul >> 8));
@@ -13676,18 +13751,58 @@
     db = (db * 0xFFFF) / da;
   }
 
-  // Convert from 16-bit color to 8-bit color and combine the components.
+  // Convert from 16-bit color to 8-bit color.
   da >>= 8;
   dr >>= 8;
   dg >>= 8;
   db >>= 8;
+
+  // Combine components.
   return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);
 }
 
+static inline uint64_t  // Composites src over dst; both (and result) nonpremul.
+wuffs_base__composite_nonpremul_nonpremul_u64_axxx(uint64_t dst_nonpremul,
+                                                   uint64_t src_nonpremul) {
+  // Extract the 16-bit components: a at bits 48..63, then r, g, b below it.
+  uint64_t sa = 0xFFFF & (src_nonpremul >> 48);
+  uint64_t sr = 0xFFFF & (src_nonpremul >> 32);
+  uint64_t sg = 0xFFFF & (src_nonpremul >> 16);
+  uint64_t sb = 0xFFFF & (src_nonpremul >> 0);
+  uint64_t da = 0xFFFF & (dst_nonpremul >> 48);
+  uint64_t dr = 0xFFFF & (dst_nonpremul >> 32);
+  uint64_t dg = 0xFFFF & (dst_nonpremul >> 16);
+  uint64_t db = 0xFFFF & (dst_nonpremul >> 0);
+
+  // Convert dst from nonpremul to premul (scale each channel by da / 0xFFFF).
+  dr = (dr * da) / 0xFFFF;
+  dg = (dg * da) / 0xFFFF;
+  db = (db * da) / 0xFFFF;
+
+  // Calculate the inverse of the src-alpha: how much of the dst to keep.
+  uint64_t ia = 0xFFFF - sa;
+
+  // Composite src (nonpremul) over dst (premul); src is weighted by sa here.
+  da = sa + ((da * ia) / 0xFFFF);
+  dr = ((sr * sa) + (dr * ia)) / 0xFFFF;
+  dg = ((sg * sa) + (dg * ia)) / 0xFFFF;
+  db = ((sb * sa) + (db * ia)) / 0xFFFF;
+
+  // Convert dst from premul to nonpremul (divide each channel by the new da).
+  if (da != 0) {  // A zero result alpha would otherwise be a zero divisor.
+    dr = (dr * 0xFFFF) / da;
+    dg = (dg * 0xFFFF) / da;
+    db = (db * 0xFFFF) / da;
+  }
+
+  // Combine components, using the same bit layout as the inputs.
+  return (db << 0) | (dg << 16) | (dr << 32) | (da << 48);
+}
+
 static inline uint32_t  //
 wuffs_base__composite_nonpremul_premul_u32_axxx(uint32_t dst_nonpremul,
                                                 uint32_t src_premul) {
-  // Convert from 8-bit color to 16-bit color.
+  // Extract 16-bit color components.
   uint32_t sa = 0x101 * (0xFF & (src_premul >> 24));
   uint32_t sr = 0x101 * (0xFF & (src_premul >> 16));
   uint32_t sg = 0x101 * (0xFF & (src_premul >> 8));
@@ -13718,18 +13833,20 @@
     db = (db * 0xFFFF) / da;
   }
 
-  // Convert from 16-bit color to 8-bit color and combine the components.
+  // Convert from 16-bit color to 8-bit color.
   da >>= 8;
   dr >>= 8;
   dg >>= 8;
   db >>= 8;
+
+  // Combine components.
   return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);
 }
 
 static inline uint32_t  //
 wuffs_base__composite_premul_nonpremul_u32_axxx(uint32_t dst_premul,
                                                 uint32_t src_nonpremul) {
-  // Convert from 8-bit color to 16-bit color.
+  // Extract 16-bit color components.
   uint32_t sa = 0x101 * (0xFF & (src_nonpremul >> 24));
   uint32_t sr = 0x101 * (0xFF & (src_nonpremul >> 16));
   uint32_t sg = 0x101 * (0xFF & (src_nonpremul >> 8));
@@ -13748,18 +13865,46 @@
   dg = ((sg * sa) + (dg * ia)) / 0xFFFF;
   db = ((sb * sa) + (db * ia)) / 0xFFFF;
 
-  // Convert from 16-bit color to 8-bit color and combine the components.
+  // Convert from 16-bit color to 8-bit color.
   da >>= 8;
   dr >>= 8;
   dg >>= 8;
   db >>= 8;
+
+  // Combine components.
   return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);
 }
 
+static inline uint64_t  //
+wuffs_base__composite_premul_nonpremul_u64_axxx(uint64_t dst_premul,
+                                                uint64_t src_nonpremul) {
+  // Extract the 16-bit components: A, R, G, B at bits 48, 32, 16 and 0.
+  uint64_t sa = 0xFFFF & (src_nonpremul >> 48);
+  uint64_t sr = 0xFFFF & (src_nonpremul >> 32);
+  uint64_t sg = 0xFFFF & (src_nonpremul >> 16);
+  uint64_t sb = 0xFFFF & (src_nonpremul >> 0);
+  uint64_t da = 0xFFFF & (dst_premul >> 48);
+  uint64_t dr = 0xFFFF & (dst_premul >> 32);
+  uint64_t dg = 0xFFFF & (dst_premul >> 16);
+  uint64_t db = 0xFFFF & (dst_premul >> 0);
+
+  // Calculate the inverse of the src-alpha: how much of the dst to keep.
+  uint64_t ia = 0xFFFF - sa;
+
+  // Composite src (nonpremul) over dst (premul); the result is premul.
+  da = sa + ((da * ia) / 0xFFFF);
+  dr = ((sr * sa) + (dr * ia)) / 0xFFFF;  // src is scaled by its own alpha.
+  dg = ((sg * sa) + (dg * ia)) / 0xFFFF;
+  db = ((sb * sa) + (db * ia)) / 0xFFFF;
+
+  // Combine components into one 4x16-bit value, alpha in the top 16 bits.
+  return (db << 0) | (dg << 16) | (dr << 32) | (da << 48);
+}
+
 static inline uint32_t  //
 wuffs_base__composite_premul_premul_u32_axxx(uint32_t dst_premul,
                                              uint32_t src_premul) {
-  // Convert from 8-bit color to 16-bit color.
+  // Extract 16-bit color components.
   uint32_t sa = 0x101 * (0xFF & (src_premul >> 24));
   uint32_t sr = 0x101 * (0xFF & (src_premul >> 16));
   uint32_t sg = 0x101 * (0xFF & (src_premul >> 8));
@@ -13778,24 +13923,27 @@
   dg = sg + ((dg * ia) / 0xFFFF);
   db = sb + ((db * ia) / 0xFFFF);
 
-  // Convert from 16-bit color to 8-bit color and combine the components.
+  // Convert from 16-bit color to 8-bit color.
   da >>= 8;
   dr >>= 8;
   dg >>= 8;
   db >>= 8;
+
+  // Combine components.
   return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);
 }
 
 // --------
 
 static uint64_t  //
-wuffs_base__pixel_swizzler__squash_bgr_565_888(wuffs_base__slice_u8 dst,
-                                               wuffs_base__slice_u8 src) {
-  size_t len4 = (dst.len < src.len ? dst.len : src.len) / 4;
+wuffs_base__pixel_swizzler__squash_align4_bgr_565_888(
+    wuffs_base__slice_u8 dst,
+    wuffs_base__slice_u8 src) {
+  size_t len = (dst.len < src.len ? dst.len : src.len) / 4;
   uint8_t* d = dst.ptr;
   const uint8_t* s = src.ptr;
 
-  size_t n = len4;
+  size_t n = len;
   while (n--) {
     uint32_t argb = wuffs_base__load_u32le__no_bounds_check(s);
     uint32_t b5 = 0x1F & (argb >> (8 - 5));
@@ -13807,17 +13955,17 @@
     s += 4;
     d += 4;
   }
-  return len4 * 4;
+  return len;
 }
 
 static uint64_t  //
 wuffs_base__pixel_swizzler__swap_rgbx_bgrx(wuffs_base__slice_u8 dst,
                                            wuffs_base__slice_u8 src) {
-  size_t len4 = (dst.len < src.len ? dst.len : src.len) / 4;
+  size_t len = (dst.len < src.len ? dst.len : src.len) / 4;
   uint8_t* d = dst.ptr;
   const uint8_t* s = src.ptr;
 
-  size_t n = len4;
+  size_t n = len;
   while (n--) {
     uint8_t b0 = s[0];
     uint8_t b1 = s[1];
@@ -13830,7 +13978,56 @@
     s += 4;
     d += 4;
   }
-  return len4 * 4;
+  return len;
+}
+
+// --------
+
+static uint64_t  //
+wuffs_base__pixel_swizzler__squash_tight_4x8_4x16le(uint8_t* dst_ptr,
+                                                    size_t dst_len,
+                                                    uint8_t* dst_palette_ptr,
+                                                    size_t dst_palette_len,
+                                                    const uint8_t* src_ptr,
+                                                    size_t src_len) {
+  size_t dst_len4 = dst_len / 4;  // Whole 4-byte dst pixels that fit.
+  size_t src_len8 = src_len / 8;  // Whole 8-byte src pixels available.
+  size_t len = (dst_len4 < src_len8) ? dst_len4 : src_len8;  // Pixel count.
+  uint8_t* d = dst_ptr;
+  const uint8_t* s = src_ptr;
+
+  const size_t loop_unroll_count = 4;
+
+  size_t n = len;
+  while (n >= loop_unroll_count) {  // Unrolled: four pixels per iteration.
+    wuffs_base__store_u32le__no_bounds_check(
+        d + (0 * 4), wuffs_base__color_u64__as__color_u32(
+                         wuffs_base__load_u64le__no_bounds_check(s + (0 * 8))));
+    wuffs_base__store_u32le__no_bounds_check(
+        d + (1 * 4), wuffs_base__color_u64__as__color_u32(
+                         wuffs_base__load_u64le__no_bounds_check(s + (1 * 8))));
+    wuffs_base__store_u32le__no_bounds_check(
+        d + (2 * 4), wuffs_base__color_u64__as__color_u32(
+                         wuffs_base__load_u64le__no_bounds_check(s + (2 * 8))));
+    wuffs_base__store_u32le__no_bounds_check(
+        d + (3 * 4), wuffs_base__color_u64__as__color_u32(
+                         wuffs_base__load_u64le__no_bounds_check(s + (3 * 8))));
+
+    s += loop_unroll_count * 8;
+    d += loop_unroll_count * 4;
+    n -= loop_unroll_count;
+  }
+
+  while (n >= 1) {  // Tail: the remaining pixels, one per iteration.
+    wuffs_base__store_u32le__no_bounds_check(
+        d + (0 * 4), wuffs_base__color_u64__as__color_u32(
+                         wuffs_base__load_u64le__no_bounds_check(s + (0 * 8))));
+
+    s += 1 * 8;
+    d += 1 * 4;
+    n -= 1;
+  }
+  return len;  // Pixels (not bytes) processed; the palette args are unused.
 }
 
 // --------
@@ -13947,6 +14144,38 @@
 }
 
 static uint64_t  //
+wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul_4x16le__src(
+    uint8_t* dst_ptr,
+    size_t dst_len,
+    uint8_t* dst_palette_ptr,
+    size_t dst_palette_len,
+    const uint8_t* src_ptr,
+    size_t src_len) {
+  size_t dst_len2 = dst_len / 2;  // Whole 2-byte dst pixels that fit.
+  size_t src_len8 = src_len / 8;  // Whole 8-byte src pixels available.
+  size_t len = (dst_len2 < src_len8) ? dst_len2 : src_len8;  // Pixel count.
+  uint8_t* d = dst_ptr;
+  const uint8_t* s = src_ptr;
+  size_t n = len;
+
+  // TODO: unroll. For now, each iteration overwrites exactly one dst pixel.
+
+  while (n >= 1) {
+    wuffs_base__store_u16le__no_bounds_check(
+        d + (0 * 2),
+        wuffs_base__color_u32_argb_premul__as__color_u16_rgb_565(
+            wuffs_base__color_u64_argb_nonpremul__as__color_u32_argb_premul(
+                wuffs_base__load_u64le__no_bounds_check(s + (0 * 8)))));
+
+    s += 1 * 8;
+    d += 1 * 2;
+    n -= 1;
+  }
+
+  return len;  // Pixels (not bytes) processed; the palette args are unused.
+}
+
+static uint64_t  //
 wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src_over(
     uint8_t* dst_ptr,
     size_t dst_len,
@@ -13964,7 +14193,7 @@
   // TODO: unroll.
 
   while (n >= 1) {
-    // Convert from 8-bit color to 16-bit color.
+    // Extract 16-bit color components.
     uint32_t sa = 0x101 * ((uint32_t)s[3]);
     uint32_t sr = 0x101 * ((uint32_t)s[2]);
     uint32_t sg = 0x101 * ((uint32_t)s[1]);
@@ -14004,6 +14233,63 @@
 }
 
 static uint64_t  //
+wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul_4x16le__src_over(
+    uint8_t* dst_ptr,
+    size_t dst_len,
+    uint8_t* dst_palette_ptr,
+    size_t dst_palette_len,
+    const uint8_t* src_ptr,
+    size_t src_len) {
+  size_t dst_len2 = dst_len / 2;  // Whole 2-byte dst pixels that fit.
+  size_t src_len8 = src_len / 8;  // Whole 8-byte src pixels available.
+  size_t len = (dst_len2 < src_len8) ? dst_len2 : src_len8;  // Pixel count.
+  uint8_t* d = dst_ptr;
+  const uint8_t* s = src_ptr;
+  size_t n = len;
+
+  // TODO: unroll. For now, each iteration blends exactly one pixel.
+
+  while (n >= 1) {
+    // Extract 16-bit src components: B, G, R, A at byte offsets 0, 2, 4, 6.
+    uint32_t sa = ((uint32_t)wuffs_base__load_u16le__no_bounds_check(s + 6));
+    uint32_t sr = ((uint32_t)wuffs_base__load_u16le__no_bounds_check(s + 4));
+    uint32_t sg = ((uint32_t)wuffs_base__load_u16le__no_bounds_check(s + 2));
+    uint32_t sb = ((uint32_t)wuffs_base__load_u16le__no_bounds_check(s + 0));
+
+    // Convert from 565 color to 16-bit color.
+    uint32_t old_rgb_565 = wuffs_base__load_u16le__no_bounds_check(d + (0 * 2));
+    uint32_t old_r5 = 0x1F & (old_rgb_565 >> 11);
+    uint32_t dr = (0x8421 * old_r5) >> 4;  // 0x1F maps to 0xFFFF.
+    uint32_t old_g6 = 0x3F & (old_rgb_565 >> 5);
+    uint32_t dg = (0x1041 * old_g6) >> 2;  // 0x3F maps to 0xFFFF.
+    uint32_t old_b5 = 0x1F & (old_rgb_565 >> 0);
+    uint32_t db = (0x8421 * old_b5) >> 4;  // 0x1F maps to 0xFFFF.
+
+    // Calculate the inverse of the src-alpha: how much of the dst to keep.
+    uint32_t ia = 0xFFFF - sa;
+
+    // Composite src (nonpremul) over dst (premul).
+    dr = ((sr * sa) + (dr * ia)) / 0xFFFF;  // Sum fits: sa + ia == 0xFFFF.
+    dg = ((sg * sa) + (dg * ia)) / 0xFFFF;
+    db = ((sb * sa) + (db * ia)) / 0xFFFF;
+
+    // Convert from 16-bit color to 565 color and combine the components.
+    uint32_t new_r5 = 0x1F & (dr >> 11);  // Keep the top 5 bits.
+    uint32_t new_g6 = 0x3F & (dg >> 10);  // Keep the top 6 bits.
+    uint32_t new_b5 = 0x1F & (db >> 11);  // Keep the top 5 bits.
+    uint32_t new_rgb_565 = (new_r5 << 11) | (new_g6 << 5) | (new_b5 << 0);
+    wuffs_base__store_u16le__no_bounds_check(d + (0 * 2),
+                                             (uint16_t)new_rgb_565);
+
+    s += 1 * 8;
+    d += 1 * 2;
+    n -= 1;
+  }
+
+  return len;  // Pixels (not bytes) processed; the palette args are unused.
+}
+
+static uint64_t  //
 wuffs_base__pixel_swizzler__bgr_565__y(uint8_t* dst_ptr,
                                        size_t dst_len,
                                        uint8_t* dst_palette_ptr,
@@ -14149,6 +14435,37 @@
 }
 
 static uint64_t  //
+wuffs_base__pixel_swizzler__bgr__bgra_nonpremul_4x16le__src(
+    uint8_t* dst_ptr,
+    size_t dst_len,
+    uint8_t* dst_palette_ptr,
+    size_t dst_palette_len,
+    const uint8_t* src_ptr,
+    size_t src_len) {
+  size_t dst_len3 = dst_len / 3;  // Whole 3-byte dst pixels that fit.
+  size_t src_len8 = src_len / 8;  // Whole 8-byte src pixels available.
+  size_t len = (dst_len3 < src_len8) ? dst_len3 : src_len8;  // Pixel count.
+  uint8_t* d = dst_ptr;
+  const uint8_t* s = src_ptr;
+  size_t n = len;
+
+  // TODO: unroll. For now, each iteration overwrites exactly one dst pixel.
+
+  while (n >= 1) {
+    uint32_t s0 =
+        wuffs_base__color_u64_argb_nonpremul__as__color_u32_argb_premul(
+            wuffs_base__load_u64le__no_bounds_check(s + (0 * 8)));
+    wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);  // 3 dst bytes.
+
+    s += 1 * 8;
+    d += 1 * 3;
+    n -= 1;
+  }
+
+  return len;  // Pixels (not bytes) processed; the palette args are unused.
+}
+
+static uint64_t  //
 wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src_over(
     uint8_t* dst_ptr,
     size_t dst_len,
@@ -14166,7 +14483,7 @@
   // TODO: unroll.
 
   while (n >= 1) {
-    // Convert from 8-bit color to 16-bit color.
+    // Extract 16-bit color components.
     uint32_t sa = 0x101 * ((uint32_t)s[3]);
     uint32_t sr = 0x101 * ((uint32_t)s[2]);
     uint32_t sg = 0x101 * ((uint32_t)s[1]);
@@ -14196,6 +14513,54 @@
   return len;
 }
 
+static uint64_t  //
+wuffs_base__pixel_swizzler__bgr__bgra_nonpremul_4x16le__src_over(
+    uint8_t* dst_ptr,
+    size_t dst_len,
+    uint8_t* dst_palette_ptr,
+    size_t dst_palette_len,
+    const uint8_t* src_ptr,
+    size_t src_len) {
+  size_t dst_len3 = dst_len / 3;  // Whole 3-byte dst pixels that fit.
+  size_t src_len8 = src_len / 8;  // Whole 8-byte src pixels available.
+  size_t len = (dst_len3 < src_len8) ? dst_len3 : src_len8;  // Pixel count.
+  uint8_t* d = dst_ptr;
+  const uint8_t* s = src_ptr;
+  size_t n = len;
+
+  // TODO: unroll. For now, each iteration blends exactly one pixel.
+
+  while (n >= 1) {
+    // Extract 16-bit components; src B, G, R, A at byte offsets 0, 2, 4, 6.
+    uint32_t sa = ((uint32_t)wuffs_base__load_u16le__no_bounds_check(s + 6));
+    uint32_t sr = ((uint32_t)wuffs_base__load_u16le__no_bounds_check(s + 4));
+    uint32_t sg = ((uint32_t)wuffs_base__load_u16le__no_bounds_check(s + 2));
+    uint32_t sb = ((uint32_t)wuffs_base__load_u16le__no_bounds_check(s + 0));
+    uint32_t dr = 0x101 * ((uint32_t)d[2]);  // 8-bit to 16-bit: 0xFF -> 0xFFFF.
+    uint32_t dg = 0x101 * ((uint32_t)d[1]);
+    uint32_t db = 0x101 * ((uint32_t)d[0]);
+
+    // Calculate the inverse of the src-alpha: how much of the dst to keep.
+    uint32_t ia = 0xFFFF - sa;
+
+    // Composite src (nonpremul) over dst (premul).
+    dr = ((sr * sa) + (dr * ia)) / 0xFFFF;  // Sum fits: sa + ia == 0xFFFF.
+    dg = ((sg * sa) + (dg * ia)) / 0xFFFF;
+    db = ((sb * sa) + (db * ia)) / 0xFFFF;
+
+    // Convert from 16-bit color to 8-bit color, keeping the high byte.
+    d[0] = (uint8_t)(db >> 8);
+    d[1] = (uint8_t)(dg >> 8);
+    d[2] = (uint8_t)(dr >> 8);
+
+    s += 1 * 8;
+    d += 1 * 3;
+    n -= 1;
+  }
+
+  return len;  // Pixels (not bytes) processed; the palette args are unused.
+}
+
 // --------
 
 static uint64_t  //
@@ -14213,8 +14578,6 @@
   const uint8_t* s = src_ptr;
   size_t n = len;
 
-  // TODO: unroll.
-
   while (n >= 1) {
     uint32_t d0 = wuffs_base__load_u32le__no_bounds_check(d + (0 * 4));
     uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(s + (0 * 4));
@@ -14230,6 +14593,38 @@
   return len;
 }
 
+static uint64_t  //
+wuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul_4x16le__src_over(
+    uint8_t* dst_ptr,
+    size_t dst_len,
+    uint8_t* dst_palette_ptr,
+    size_t dst_palette_len,
+    const uint8_t* src_ptr,
+    size_t src_len) {
+  size_t dst_len4 = dst_len / 4;  // Whole 4-byte dst pixels that fit.
+  size_t src_len8 = src_len / 8;  // Whole 8-byte src pixels available.
+  size_t len = (dst_len4 < src_len8) ? dst_len4 : src_len8;  // Pixel count.
+  uint8_t* d = dst_ptr;
+  const uint8_t* s = src_ptr;
+  size_t n = len;
+
+  while (n >= 1) {  // One pixel per iteration.
+    uint64_t d0 = wuffs_base__color_u32__as__color_u64(  // dst as a u64 color.
+        wuffs_base__load_u32le__no_bounds_check(d + (0 * 4)));
+    uint64_t s0 = wuffs_base__load_u64le__no_bounds_check(s + (0 * 8));
+    wuffs_base__store_u32le__no_bounds_check(
+        d + (0 * 4),
+        wuffs_base__color_u64__as__color_u32(  // Back to a u32 color.
+            wuffs_base__composite_nonpremul_nonpremul_u64_axxx(d0, s0)));
+
+    s += 1 * 8;
+    d += 1 * 4;
+    n -= 1;
+  }
+
+  return len;  // Pixels (not bytes) processed; the palette args are unused.
+}
+
 // --------
 
 static uint64_t  //
@@ -14264,6 +14659,37 @@
 }
 
 static uint64_t  //
+wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul_4x16le__src(
+    uint8_t* dst_ptr,
+    size_t dst_len,
+    uint8_t* dst_palette_ptr,
+    size_t dst_palette_len,
+    const uint8_t* src_ptr,
+    size_t src_len) {
+  size_t dst_len4 = dst_len / 4;  // Whole 4-byte dst pixels that fit.
+  size_t src_len8 = src_len / 8;  // Whole 8-byte src pixels available.
+  size_t len = (dst_len4 < src_len8) ? dst_len4 : src_len8;  // Pixel count.
+  uint8_t* d = dst_ptr;
+  const uint8_t* s = src_ptr;
+  size_t n = len;
+
+  // TODO: unroll. For now, each iteration overwrites exactly one dst pixel.
+
+  while (n >= 1) {
+    uint64_t s0 = wuffs_base__load_u64le__no_bounds_check(s + (0 * 8));
+    wuffs_base__store_u32le__no_bounds_check(
+        d + (0 * 4),
+        wuffs_base__color_u64_argb_nonpremul__as__color_u32_argb_premul(s0));
+
+    s += 1 * 8;
+    d += 1 * 4;
+    n -= 1;
+  }
+
+  return len;  // Pixels (not bytes) processed; the palette args are unused.
+}
+
+static uint64_t  //
 wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over(
     uint8_t* dst_ptr,
     size_t dst_len,
@@ -14294,6 +14720,40 @@
   return len;
 }
 
+static uint64_t  //
+wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul_4x16le__src_over(
+    uint8_t* dst_ptr,
+    size_t dst_len,
+    uint8_t* dst_palette_ptr,
+    size_t dst_palette_len,
+    const uint8_t* src_ptr,
+    size_t src_len) {
+  size_t dst_len4 = dst_len / 4;  // Whole 4-byte dst pixels that fit.
+  size_t src_len8 = src_len / 8;  // Whole 8-byte src pixels available.
+  size_t len = (dst_len4 < src_len8) ? dst_len4 : src_len8;  // Pixel count.
+  uint8_t* d = dst_ptr;
+  const uint8_t* s = src_ptr;
+  size_t n = len;
+
+  // TODO: unroll. For now, each iteration blends exactly one pixel.
+
+  while (n >= 1) {
+    uint64_t d0 = wuffs_base__color_u32__as__color_u64(  // dst as a u64 color.
+        wuffs_base__load_u32le__no_bounds_check(d + (0 * 4)));
+    uint64_t s0 = wuffs_base__load_u64le__no_bounds_check(s + (0 * 8));
+    wuffs_base__store_u32le__no_bounds_check(
+        d + (0 * 4),
+        wuffs_base__color_u64__as__color_u32(  // Back to a u32 color.
+            wuffs_base__composite_premul_nonpremul_u64_axxx(d0, s0)));
+
+    s += 1 * 8;
+    d += 1 * 4;
+    n -= 1;
+  }
+
+  return len;  // Pixels (not bytes) processed; the palette args are unused.
+}
+
 // --------
 
 static uint64_t  //
@@ -14663,8 +15123,8 @@
       return NULL;
 
     case WUFFS_BASE__PIXEL_FORMAT__BGR_565:
-      if (wuffs_base__pixel_swizzler__squash_bgr_565_888(dst_palette,
-                                                         src_palette) != 1024) {
+      if (wuffs_base__pixel_swizzler__squash_align4_bgr_565_888(
+              dst_palette, src_palette) != 256) {
         return NULL;
       }
       switch (blend) {
@@ -14705,7 +15165,7 @@
 
     case WUFFS_BASE__PIXEL_FORMAT__RGB:
       if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(dst_palette,
-                                                     src_palette) != 1024) {
+                                                     src_palette) != 256) {
         return NULL;
       }
       switch (blend) {
@@ -14720,7 +15180,7 @@
     case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:
     case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:
       if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(dst_palette,
-                                                     src_palette) != 1024) {
+                                                     src_palette) != 256) {
         return NULL;
       }
       switch (blend) {
@@ -14824,6 +15284,66 @@
   return NULL;
 }
 
+static wuffs_base__pixel_swizzler__func  //
+wuffs_base__pixel_swizzler__prepare__bgra_nonpremul_4x16le(
+    wuffs_base__pixel_swizzler* p,
+    wuffs_base__pixel_format dst_pixfmt,
+    wuffs_base__slice_u8 dst_palette,
+    wuffs_base__slice_u8 src_palette,
+    wuffs_base__pixel_blend blend) {
+  switch (dst_pixfmt.repr) {  // Pick a swizzle func by dst format, then blend.
+    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:
+      switch (blend) {
+        case WUFFS_BASE__PIXEL_BLEND__SRC:
+          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul_4x16le__src;
+        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:
+          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul_4x16le__src_over;
+      }
+      return NULL;  // Any other blend value is unsupported.
+
+    case WUFFS_BASE__PIXEL_FORMAT__BGR:
+      switch (blend) {
+        case WUFFS_BASE__PIXEL_BLEND__SRC:
+          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul_4x16le__src;
+        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:
+          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul_4x16le__src_over;
+      }
+      return NULL;  // Any other blend value is unsupported.
+
+    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:
+      switch (blend) {
+        case WUFFS_BASE__PIXEL_BLEND__SRC:
+          return wuffs_base__pixel_swizzler__squash_tight_4x8_4x16le;
+        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:
+          return wuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul_4x16le__src_over;
+      }
+      return NULL;  // Any other blend value is unsupported.
+
+    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:
+      switch (blend) {
+        case WUFFS_BASE__PIXEL_BLEND__SRC:
+          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul_4x16le__src;
+        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:
+          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul_4x16le__src_over;
+      }
+      return NULL;  // Any other blend value is unsupported.
+
+    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:
+    case WUFFS_BASE__PIXEL_FORMAT__BGRX:
+      // TODO: not implemented yet; falls out of the switch and returns NULL.
+      break;
+
+    case WUFFS_BASE__PIXEL_FORMAT__RGB:
+    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:
+    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:
+    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:
+    case WUFFS_BASE__PIXEL_FORMAT__RGBX:
+      // TODO: not implemented yet; falls out of the switch and returns NULL.
+      break;
+  }
+  return NULL;  // No func for this dst; p and both palettes are unused here.
+}
+
 // --------
 
 WUFFS_BASE__MAYBE_STATIC wuffs_base__status  //
@@ -14870,6 +15390,11 @@
       func = wuffs_base__pixel_swizzler__prepare__bgra_nonpremul(
           p, dst_pixfmt, dst_palette, src_palette, blend);
       break;
+
+    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE:
+      func = wuffs_base__pixel_swizzler__prepare__bgra_nonpremul_4x16le(
+          p, dst_pixfmt, dst_palette, src_palette, blend);
+      break;
   }
 
   p->private_impl.func = func;
diff --git a/test/c/std/wbmp.c b/test/c/std/wbmp.c
index eb9d362..3308add 100644
--- a/test/c/std/wbmp.c
+++ b/test/c/std/wbmp.c
@@ -130,6 +130,10 @@
           .pixel = 0x55443300,
           .pixfmt_repr = WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL,
       },
+      {
+          .pixel = 0x55443300,
+          .pixfmt_repr = WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE,
+      },
   };
 
   const struct {