Expand supported pixel_swizzler dst+src pixfmts
diff --git a/internal/cgen/base/image-impl.c b/internal/cgen/base/image-impl.c
index 2e688f8..3c69d3b 100644
--- a/internal/cgen/base/image-impl.c
+++ b/internal/cgen/base/image-impl.c
@@ -36,8 +36,9 @@
     uint32_t b5 = 0x1F & (argb >> (8 - 5));
     uint32_t g6 = 0x3F & (argb >> (16 - 6));
     uint32_t r5 = 0x1F & (argb >> (24 - 5));
+    uint32_t alpha = argb & 0xFF000000;
     wuffs_base__store_u32le__no_bounds_check(
-        d, (r5 << 11) | (g6 << 5) | (b5 << 0));
+        d, alpha | (r5 << 11) | (g6 << 5) | (b5 << 0));
     s += 4;
     d += 4;
   }
@@ -451,6 +452,19 @@
 }
 
 static uint64_t  //
+wuffs_base__pixel_swizzler__copy_3_3(wuffs_base__slice_u8 dst,
+                                     wuffs_base__slice_u8 dst_palette,
+                                     wuffs_base__slice_u8 src) {
+  size_t dst_len3 = dst.len / 3;  // Whole 3-byte pixels that fit in dst.
+  size_t src_len3 = src.len / 3;  // Whole 3-byte pixels held by src.
+  size_t len = dst_len3 < src_len3 ? dst_len3 : src_len3;
+  if (len > 0) {  // dst_palette is never read by this function.
+    memmove(dst.ptr, src.ptr, len * 3);  // memmove tolerates overlap.
+  }
+  return len;  // A count of 3-byte pixels, not of bytes.
+}
+
+static uint64_t  //
 wuffs_base__pixel_swizzler__copy_4_4(wuffs_base__slice_u8 dst,
                                      wuffs_base__slice_u8 dst_palette,
                                      wuffs_base__slice_u8 src) {
@@ -466,6 +480,143 @@
 // --------
 
 static uint64_t  //
+wuffs_base__pixel_swizzler__bgr_565__bgr(wuffs_base__slice_u8 dst,
+                                         wuffs_base__slice_u8 dst_palette,
+                                         wuffs_base__slice_u8 src) {
+  size_t dst_len2 = dst.len / 2;  // Whole 2-byte pixels that fit in dst.
+  size_t src_len3 = src.len / 3;  // Whole 3-byte pixels held by src.
+  size_t len = dst_len2 < src_len3 ? dst_len2 : src_len3;
+  uint8_t* d = dst.ptr;
+  uint8_t* s = src.ptr;
+  size_t n = len;
+  // Packs each 3-byte src pixel into 2 dst bytes. dst_palette is unused.
+  // TODO: unroll.
+
+  while (n >= 1) {
+    uint32_t b5 = s[0] >> 3;  // Keep the top 5 of 8 bits.
+    uint32_t g6 = s[1] >> 2;  // Keep the top 6 of 8 bits.
+    uint32_t r5 = s[2] >> 3;  // Keep the top 5 of 8 bits.
+    uint32_t rgb_565 = (r5 << 11) | (g6 << 5) | (b5 << 0);
+    wuffs_base__store_u16le__no_bounds_check(d + (0 * 2), (uint16_t)rgb_565);
+
+    s += 1 * 3;
+    d += 1 * 2;
+    n -= 1;
+  }
+
+  return len;  // A count of pixels, not of bytes.
+}
+
+static uint64_t  //
+wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src(
+    wuffs_base__slice_u8 dst,
+    wuffs_base__slice_u8 dst_palette,  // Unused.
+    wuffs_base__slice_u8 src) {
+  size_t dst_len2 = dst.len / 2;  // Whole 2-byte pixels that fit in dst.
+  size_t src_len4 = src.len / 4;  // Whole 4-byte pixels held by src.
+  size_t len = dst_len2 < src_len4 ? dst_len2 : src_len4;
+  uint8_t* d = dst.ptr;  // Written only; old dst bytes are never read.
+  uint8_t* s = src.ptr;
+  size_t n = len;
+  // Each pixel: load u32le, premul (per helper name), squash to 565, store.
+  // TODO: unroll.
+
+  while (n >= 1) {
+    wuffs_base__store_u16le__no_bounds_check(
+        d + (0 * 2),
+        wuffs_base__color_u32_argb_premul__as__color_u16_rgb_565(
+            wuffs_base__premul_u32_axxx(
+                wuffs_base__load_u32le__no_bounds_check(s + (0 * 4)))));
+
+    s += 1 * 4;
+    d += 1 * 2;
+    n -= 1;
+  }
+
+  return len;  // A count of pixels, not of bytes.
+}
+
+static uint64_t  //
+wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src_over(
+    wuffs_base__slice_u8 dst,
+    wuffs_base__slice_u8 dst_palette,  // Unused.
+    wuffs_base__slice_u8 src) {
+  size_t dst_len2 = dst.len / 2;  // Whole 2-byte pixels that fit in dst.
+  size_t src_len4 = src.len / 4;  // Whole 4-byte pixels held by src.
+  size_t len = dst_len2 < src_len4 ? dst_len2 : src_len4;
+  uint8_t* d = dst.ptr;
+  uint8_t* s = src.ptr;
+  size_t n = len;
+  // Blends each src pixel onto the dst pixel already there, in 16-bit math.
+  // TODO: unroll.
+
+  while (n >= 1) {
+    // Convert from 8-bit color to 16-bit color: 0x101 * 0xAB == 0xABAB.
+    uint32_t sa = 0x101 * ((uint32_t)s[3]);
+    uint32_t sr = 0x101 * ((uint32_t)s[2]);
+    uint32_t sg = 0x101 * ((uint32_t)s[1]);
+    uint32_t sb = 0x101 * ((uint32_t)s[0]);
+
+    // Convert from 565 color to 16-bit color.
+    uint32_t old_rgb_565 = wuffs_base__load_u16le__no_bounds_check(d + (0 * 2));
+    uint32_t old_r5 = 0x1F & (old_rgb_565 >> 11);
+    uint32_t dr = (0x8421 * old_r5) >> 4;  // Repeats the 5 bits to fill 16.
+    uint32_t old_g6 = 0x3F & (old_rgb_565 >> 5);
+    uint32_t dg = (0x1041 * old_g6) >> 2;  // Repeats the 6 bits to fill 16.
+    uint32_t old_b5 = 0x1F & (old_rgb_565 >> 0);
+    uint32_t db = (0x8421 * old_b5) >> 4;  // Repeats the 5 bits to fill 16.
+
+    // Calculate the inverse of the src-alpha: how much of the dst to keep.
+    uint32_t ia = 0xFFFF - sa;
+
+    // Composite src (nonpremul) over dst (premul).
+    dr = ((sr * sa) + (dr * ia)) / 0xFFFF;  // sa+ia == 0xFFFF: fits in u32.
+    dg = ((sg * sa) + (dg * ia)) / 0xFFFF;
+    db = ((sb * sa) + (db * ia)) / 0xFFFF;
+
+    // Convert from 16-bit color to 565 color and combine the components.
+    uint32_t new_r5 = 0x1F & (dr >> 11);  // Top 5 of 16 bits.
+    uint32_t new_g6 = 0x3F & (dg >> 10);  // Top 6 of 16 bits.
+    uint32_t new_b5 = 0x1F & (db >> 11);  // Top 5 of 16 bits.
+    uint32_t new_rgb_565 = (new_r5 << 11) | (new_g6 << 5) | (new_b5 << 0);
+    wuffs_base__store_u16le__no_bounds_check(d + (0 * 2),
+                                             (uint16_t)new_rgb_565);
+
+    s += 1 * 4;
+    d += 1 * 2;
+    n -= 1;
+  }
+
+  return len;  // A count of pixels, not of bytes.
+}
+
+static uint64_t  //
+wuffs_base__pixel_swizzler__bgr_565__y(wuffs_base__slice_u8 dst,
+                                       wuffs_base__slice_u8 dst_palette,
+                                       wuffs_base__slice_u8 src) {
+  size_t dst_len2 = dst.len / 2;  // Whole 2-byte pixels that fit in dst.
+  size_t len = dst_len2 < src.len ? dst_len2 : src.len;  // 1 src byte each.
+  uint8_t* d = dst.ptr;
+  uint8_t* s = src.ptr;
+  size_t n = len;
+  // Writes each src byte into all three 565 fields. dst_palette is unused.
+  // TODO: unroll.
+
+  while (n >= 1) {
+    uint32_t y5 = s[0] >> 3;  // Top 5 bits, for the two 5-bit fields.
+    uint32_t y6 = s[0] >> 2;  // Top 6 bits, for the 6-bit field.
+    uint32_t rgb_565 = (y5 << 11) | (y6 << 5) | (y5 << 0);
+    wuffs_base__store_u16le__no_bounds_check(d + (0 * 2), (uint16_t)rgb_565);
+
+    s += 1 * 1;
+    d += 1 * 2;
+    n -= 1;
+  }
+
+  return len;  // A count of pixels, not of bytes.
+}
+
+static uint64_t  //
 wuffs_base__pixel_swizzler__bgr_565__index__src(
     wuffs_base__slice_u8 dst,
     wuffs_base__slice_u8 dst_palette,
@@ -513,6 +664,111 @@
   return len;
 }
 
+static uint64_t  //
+wuffs_base__pixel_swizzler__bgr_565__index_binary_alpha__src_over(
+    wuffs_base__slice_u8 dst,
+    wuffs_base__slice_u8 dst_palette,
+    wuffs_base__slice_u8 src) {
+  if (dst_palette.len != 1024) {  // 256 entries (one per uint8_t) * 4 bytes.
+    return 0;
+  }
+  size_t dst_len2 = dst.len / 2;  // Whole 2-byte pixels that fit in dst.
+  size_t len = dst_len2 < src.len ? dst_len2 : src.len;  // 1 src byte each.
+  uint8_t* d = dst.ptr;
+  uint8_t* s = src.ptr;
+  size_t n = len;
+  // Stores the low 16 bits of each non-zero 4-byte palette entry into dst.
+  // TODO: unroll.
+
+  while (n >= 1) {
+    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+                                                          ((size_t)s[0] * 4));
+    if (s0) {  // All-zero entry (presumably transparent): dst is kept as is.
+      wuffs_base__store_u16le__no_bounds_check(d + (0 * 2), (uint16_t)s0);
+    }
+
+    s += 1 * 1;
+    d += 1 * 2;
+    n -= 1;
+  }
+
+  return len;  // Pixels visited, including those left unchanged.
+}
+
+// --------
+
+static uint64_t  //
+wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src(
+    wuffs_base__slice_u8 dst,
+    wuffs_base__slice_u8 dst_palette,  // Unused.
+    wuffs_base__slice_u8 src) {
+  size_t dst_len3 = dst.len / 3;  // Whole 3-byte pixels that fit in dst.
+  size_t src_len4 = src.len / 4;  // Whole 4-byte pixels held by src.
+  size_t len = dst_len3 < src_len4 ? dst_len3 : src_len4;
+  uint8_t* d = dst.ptr;  // Written only; old dst bytes are never read.
+  uint8_t* s = src.ptr;
+  size_t n = len;
+  // Each pixel: load u32le, premultiply (per the helper's name), store u24le.
+  // TODO: unroll.
+
+  while (n >= 1) {
+    uint32_t s0 = wuffs_base__premul_u32_axxx(
+        wuffs_base__load_u32le__no_bounds_check(s + (0 * 4)));
+    wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);
+
+    s += 1 * 4;
+    d += 1 * 3;
+    n -= 1;
+  }
+
+  return len;  // A count of pixels, not of bytes.
+}
+
+static uint64_t  //
+wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src_over(
+    wuffs_base__slice_u8 dst,
+    wuffs_base__slice_u8 dst_palette,  // Unused.
+    wuffs_base__slice_u8 src) {
+  size_t dst_len3 = dst.len / 3;  // Whole 3-byte pixels that fit in dst.
+  size_t src_len4 = src.len / 4;  // Whole 4-byte pixels held by src.
+  size_t len = dst_len3 < src_len4 ? dst_len3 : src_len4;
+  uint8_t* d = dst.ptr;
+  uint8_t* s = src.ptr;
+  size_t n = len;
+  // Blends each src pixel onto the dst pixel already there, in 16-bit math.
+  // TODO: unroll.
+
+  while (n >= 1) {
+    // Convert from 8-bit color to 16-bit color: 0x101 * 0xAB == 0xABAB.
+    uint32_t sa = 0x101 * ((uint32_t)s[3]);
+    uint32_t sr = 0x101 * ((uint32_t)s[2]);
+    uint32_t sg = 0x101 * ((uint32_t)s[1]);
+    uint32_t sb = 0x101 * ((uint32_t)s[0]);
+    uint32_t dr = 0x101 * ((uint32_t)d[2]);
+    uint32_t dg = 0x101 * ((uint32_t)d[1]);
+    uint32_t db = 0x101 * ((uint32_t)d[0]);
+
+    // Calculate the inverse of the src-alpha: how much of the dst to keep.
+    uint32_t ia = 0xFFFF - sa;
+
+    // Composite src (nonpremul) over dst (premul).
+    dr = ((sr * sa) + (dr * ia)) / 0xFFFF;  // sa+ia == 0xFFFF: fits in u32.
+    dg = ((sg * sa) + (dg * ia)) / 0xFFFF;
+    db = ((sb * sa) + (db * ia)) / 0xFFFF;
+
+    // Convert from 16-bit color to 8-bit color.
+    d[0] = (uint8_t)(db >> 8);  // Keep the top 8 of 16 bits.
+    d[1] = (uint8_t)(dg >> 8);
+    d[2] = (uint8_t)(dr >> 8);
+
+    s += 1 * 4;
+    d += 1 * 3;
+    n -= 1;
+  }
+
+  return len;  // A count of pixels, not of bytes.
+}
+
 // --------
 
 static uint64_t  //
@@ -715,6 +971,32 @@
   return len;
 }
 
+static uint64_t  //
+wuffs_base__pixel_swizzler__xxx__y(wuffs_base__slice_u8 dst,
+                                   wuffs_base__slice_u8 dst_palette,
+                                   wuffs_base__slice_u8 src) {
+  size_t dst_len3 = dst.len / 3;  // Whole 3-byte pixels that fit in dst.
+  size_t len = dst_len3 < src.len ? dst_len3 : src.len;  // 1 src byte each.
+  uint8_t* d = dst.ptr;
+  uint8_t* s = src.ptr;
+  size_t n = len;
+  // Copies each src byte to 3 consecutive dst bytes. dst_palette is unused.
+  // TODO: unroll.
+
+  while (n >= 1) {
+    uint8_t s0 = s[0];  // One src value feeds all three dst bytes.
+    d[0] = s0;
+    d[1] = s0;
+    d[2] = s0;
+
+    s += 1 * 1;
+    d += 1 * 3;
+    n -= 1;
+  }
+
+  return len;  // A count of pixels, not of bytes.
+}
+
 // --------
 
 static uint64_t  //
@@ -882,13 +1164,11 @@
                                        wuffs_base__pixel_blend blend) {
   switch (dst_format.repr) {
     case WUFFS_BASE__PIXEL_FORMAT__BGR_565:
-      // TODO.
-      break;
+      return wuffs_base__pixel_swizzler__bgr_565__y;
 
     case WUFFS_BASE__PIXEL_FORMAT__BGR:
     case WUFFS_BASE__PIXEL_FORMAT__RGB:
-      // TODO.
-      break;
+      return wuffs_base__pixel_swizzler__xxx__y;
 
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:
@@ -932,6 +1212,8 @@
       switch (blend) {
         case WUFFS_BASE__PIXEL_BLEND__SRC:
           return wuffs_base__pixel_swizzler__bgr_565__index__src;
+        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:
+          return wuffs_base__pixel_swizzler__bgr_565__index_binary_alpha__src_over;
       }
       return NULL;
 
@@ -1002,12 +1284,10 @@
                                          wuffs_base__pixel_blend blend) {
   switch (dst_format.repr) {
     case WUFFS_BASE__PIXEL_FORMAT__BGR_565:
-      // TODO.
-      break;
+      return wuffs_base__pixel_swizzler__bgr_565__bgr;
 
     case WUFFS_BASE__PIXEL_FORMAT__BGR:
-      // TODO.
-      break;
+      return wuffs_base__pixel_swizzler__copy_3_3;
 
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:
@@ -1035,12 +1315,22 @@
     wuffs_base__pixel_blend blend) {
   switch (dst_format.repr) {
     case WUFFS_BASE__PIXEL_FORMAT__BGR_565:
-      // TODO.
-      break;
+      switch (blend) {
+        case WUFFS_BASE__PIXEL_BLEND__SRC:
+          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src;
+        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:
+          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src_over;
+      }
+      return NULL;
 
     case WUFFS_BASE__PIXEL_FORMAT__BGR:
-      // TODO.
-      break;
+      switch (blend) {
+        case WUFFS_BASE__PIXEL_BLEND__SRC:
+          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src;
+        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:
+          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src_over;
+      }
+      return NULL;
 
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:
       switch (blend) {
diff --git a/internal/cgen/data.go b/internal/cgen/data.go
index 3a3a10d..fb9ca00 100644
--- a/internal/cgen/data.go
+++ b/internal/cgen/data.go
@@ -29,8 +29,8 @@
 const baseImageImplC = "" +
 	"// ---------------- Images\n\nconst uint32_t wuffs_base__pixel_format__bits_per_channel[16] = {\n    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,\n    0x08, 0x0A, 0x0C, 0x10, 0x18, 0x20, 0x30, 0x40,\n};\n\n" +
 	"" +
-	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__squash_bgr_565_888(wuffs_base__slice_u8 dst,\n                                               wuffs_base__slice_u8 src) {\n  size_t len4 = (dst.len < src.len ? dst.len : src.len) / 4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n\n  size_t n = len4;\n  while (n--) {\n    uint32_t argb = wuffs_base__load_u32le__no_bounds_check(s);\n    uint32_t b5 = 0x1F & (argb >> (8 - 5));\n    uint32_t g6 = 0x3F & (argb >> (16 - 6));\n    uint32_t r5 = 0x1F & (argb >> (24 - 5));\n    wuffs_base__store_u32le__no_bounds_check(\n        d, (r5 << 11) | (g6 << 5) | (b5 << 0));\n    s += 4;\n    d += 4;\n  }\n  return len4 * 4;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__swap_rgbx_bgrx(wuffs_base__slice_u8 dst,\n                                           wuffs_base__slice_u8 src) {\n  size_t len4 = (dst.len < src.len ? dst.len : src.len) / 4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n\n  size_t n = len4;\n  while (n--) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    u" +
-	"int8_t b2 = s[2];\n    uint8_t b3 = s[3];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = b3;\n    s += 4;\n    d += 4;\n  }\n  return len4 * 4;\n}\n\nstatic inline uint32_t  //\nwuffs_base__swap_u32_argb_abgr(uint32_t u) {\n  uint32_t o = u & 0xFF00FF00;\n  uint32_t r = u & 0x00FF0000;\n  uint32_t b = u & 0x000000FF;\n  return o | (r >> 16) | (b << 16);\n}\n\n" +
+	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__squash_bgr_565_888(wuffs_base__slice_u8 dst,\n                                               wuffs_base__slice_u8 src) {\n  size_t len4 = (dst.len < src.len ? dst.len : src.len) / 4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n\n  size_t n = len4;\n  while (n--) {\n    uint32_t argb = wuffs_base__load_u32le__no_bounds_check(s);\n    uint32_t b5 = 0x1F & (argb >> (8 - 5));\n    uint32_t g6 = 0x3F & (argb >> (16 - 6));\n    uint32_t r5 = 0x1F & (argb >> (24 - 5));\n    uint32_t alpha = argb & 0xFF000000;\n    wuffs_base__store_u32le__no_bounds_check(\n        d, alpha | (r5 << 11) | (g6 << 5) | (b5 << 0));\n    s += 4;\n    d += 4;\n  }\n  return len4 * 4;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__swap_rgbx_bgrx(wuffs_base__slice_u8 dst,\n                                           wuffs_base__slice_u8 src) {\n  size_t len4 = (dst.len < src.len ? dst.len : src.len) / 4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n\n  size_t n = len4;\n  while (n--) {\n   " +
+	" uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    uint8_t b3 = s[3];\n    d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = b3;\n    s += 4;\n    d += 4;\n  }\n  return len4 * 4;\n}\n\nstatic inline uint32_t  //\nwuffs_base__swap_u32_argb_abgr(uint32_t u) {\n  uint32_t o = u & 0xFF00FF00;\n  uint32_t r = u & 0x00FF0000;\n  uint32_t b = u & 0x000000FF;\n  return o | (r >> 16) | (b << 16);\n}\n\n" +
 	"" +
 	"// --------\n\nstatic inline uint32_t  //\nwuffs_base__composite_nonpremul_nonpremul_u32_axxx(uint32_t dst_nonpremul,\n                                                   uint32_t src_nonpremul) {\n  // Convert from 8-bit color to 16-bit color.\n  uint32_t sa = 0x101 * (0xFF & (src_nonpremul >> 24));\n  uint32_t sr = 0x101 * (0xFF & (src_nonpremul >> 16));\n  uint32_t sg = 0x101 * (0xFF & (src_nonpremul >> 8));\n  uint32_t sb = 0x101 * (0xFF & (src_nonpremul >> 0));\n  uint32_t da = 0x101 * (0xFF & (dst_nonpremul >> 24));\n  uint32_t dr = 0x101 * (0xFF & (dst_nonpremul >> 16));\n  uint32_t dg = 0x101 * (0xFF & (dst_nonpremul >> 8));\n  uint32_t db = 0x101 * (0xFF & (dst_nonpremul >> 0));\n\n  // Convert dst from nonpremul to premul.\n  dr = (dr * da) / 0xFFFF;\n  dg = (dg * da) / 0xFFFF;\n  db = (db * da) / 0xFFFF;\n\n  // Calculate the inverse of the src-alpha: how much of the dst to keep.\n  uint32_t ia = 0xFFFF - sa;\n\n  // Composite src (nonpremul) over dst (premul).\n  da = sa + ((da * ia) / 0xFFFF);\n  dr = ((sr * sa) + (dr * i" +
 	"a)) / 0xFFFF;\n  dg = ((sg * sa) + (dg * ia)) / 0xFFFF;\n  db = ((sb * sa) + (db * ia)) / 0xFFFF;\n\n  // Convert dst from premul to nonpremul.\n  if (da != 0) {\n    dr = (dr * 0xFFFF) / da;\n    dg = (dg * 0xFFFF) / da;\n    db = (db * 0xFFFF) / da;\n  }\n\n  // Convert from 16-bit color to 8-bit color and combine the components.\n  da >>= 8;\n  dr >>= 8;\n  dg >>= 8;\n  db >>= 8;\n  return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);\n}\n\nstatic inline uint32_t  //\nwuffs_base__composite_nonpremul_premul_u32_axxx(uint32_t dst_nonpremul,\n                                                uint32_t src_premul) {\n  // Convert from 8-bit color to 16-bit color.\n  uint32_t sa = 0x101 * (0xFF & (src_premul >> 24));\n  uint32_t sr = 0x101 * (0xFF & (src_premul >> 16));\n  uint32_t sg = 0x101 * (0xFF & (src_premul >> 8));\n  uint32_t sb = 0x101 * (0xFF & (src_premul >> 0));\n  uint32_t da = 0x101 * (0xFF & (dst_nonpremul >> 24));\n  uint32_t dr = 0x101 * (0xFF & (dst_nonpremul >> 16));\n  uint32_t dg = 0x101 * (0xFF & (dst_nonpremul >> 8))" +
@@ -48,10 +48,19 @@
 	"// --------\n\nuint8_t  //\nwuffs_base__pixel_palette__closest_element(\n    wuffs_base__slice_u8 palette_slice,\n    wuffs_base__pixel_format palette_format,\n    wuffs_base__color_u32_argb_premul c) {\n  size_t n = palette_slice.len / 4;\n  if (n > 256) {\n    n = 256;\n  }\n  size_t best_index = 0;\n  uint64_t best_score = 0xFFFFFFFFFFFFFFFF;\n\n  // Work in 16-bit color.\n  uint32_t ca = 0x101 * (0xFF & (c >> 24));\n  uint32_t cr = 0x101 * (0xFF & (c >> 16));\n  uint32_t cg = 0x101 * (0xFF & (c >> 8));\n  uint32_t cb = 0x101 * (0xFF & (c >> 0));\n\n  switch (palette_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY: {\n      bool nonpremul = palette_format.repr ==\n                       WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL;\n\n      size_t i;\n      for (i = 0; i < n; i++) {\n        // Work in 16-bit color.\n        uint32_t pb = 0x101 * ((uint32_t)(palette_slice.ptr[(4 * i) + 0]))" +
 	";\n        uint32_t pg = 0x101 * ((uint32_t)(palette_slice.ptr[(4 * i) + 1]));\n        uint32_t pr = 0x101 * ((uint32_t)(palette_slice.ptr[(4 * i) + 2]));\n        uint32_t pa = 0x101 * ((uint32_t)(palette_slice.ptr[(4 * i) + 3]));\n\n        // Convert to premultiplied alpha.\n        if (nonpremul && (pa != 0xFFFF)) {\n          pb = (pb * pa) / 0xFFFF;\n          pg = (pg * pa) / 0xFFFF;\n          pr = (pr * pa) / 0xFFFF;\n        }\n\n        // These deltas are conceptually int32_t (signed) but after squaring,\n        // it's equivalent to work in uint32_t (unsigned).\n        pb -= cb;\n        pg -= cg;\n        pr -= cr;\n        pa -= ca;\n        uint64_t score = ((uint64_t)(pb * pb)) + ((uint64_t)(pg * pg)) +\n                         ((uint64_t)(pr * pr)) + ((uint64_t)(pa * pa));\n        if (best_score > score) {\n          best_score = score;\n          best_index = i;\n        }\n      }\n      break;\n    }\n  }\n\n  return (uint8_t)best_index;\n}\n\n" +
 	"" +
-	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__copy_1_1(wuffs_base__slice_u8 dst,\n                                     wuffs_base__slice_u8 dst_palette,\n                                     wuffs_base__slice_u8 src) {\n  return wuffs_base__slice_u8__copy_from_slice(dst, src);\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__copy_4_4(wuffs_base__slice_u8 dst,\n                                     wuffs_base__slice_u8 dst_palette,\n                                     wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len4 < src_len4 ? dst_len4 : src_len4;\n  if (len > 0) {\n    memmove(dst.ptr, src.ptr, len * 4);\n  }\n  return len;\n}\n\n" +
+	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__copy_1_1(wuffs_base__slice_u8 dst,\n                                     wuffs_base__slice_u8 dst_palette,\n                                     wuffs_base__slice_u8 src) {\n  return wuffs_base__slice_u8__copy_from_slice(dst, src);\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__copy_3_3(wuffs_base__slice_u8 dst,\n                                     wuffs_base__slice_u8 dst_palette,\n                                     wuffs_base__slice_u8 src) {\n  size_t dst_len3 = dst.len / 3;\n  size_t src_len3 = src.len / 3;\n  size_t len = dst_len3 < src_len3 ? dst_len3 : src_len3;\n  if (len > 0) {\n    memmove(dst.ptr, src.ptr, len * 3);\n  }\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__copy_4_4(wuffs_base__slice_u8 dst,\n                                     wuffs_base__slice_u8 dst_palette,\n                                     wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len4 <" +
+	" src_len4 ? dst_len4 : src_len4;\n  if (len > 0) {\n    memmove(dst.ptr, src.ptr, len * 4);\n  }\n  return len;\n}\n\n" +
 	"" +
-	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__index__src(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  if (dst_palette.len != 1024) {\n    return 0;\n  }\n  size_t dst_len2 = dst.len / 2;\n  size_t len = dst_len2 < src.len ? dst_len2 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (0 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[0] * 4)));\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (1 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[1] * 4)));\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (2 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[2] * 4)));\n    wuffs_base__store_u16le__no_bounds_che" +
-	"ck(\n        d + (3 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[3] * 4)));\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 2;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (0 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[0] * 4)));\n\n    s += 1 * 1;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
+	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__bgr(wuffs_base__slice_u8 dst,\n                                         wuffs_base__slice_u8 dst_palette,\n                                         wuffs_base__slice_u8 src) {\n  size_t dst_len2 = dst.len / 2;\n  size_t src_len3 = src.len / 3;\n  size_t len = dst_len2 < src_len3 ? dst_len2 : src_len3;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t b5 = s[0] >> 3;\n    uint32_t g6 = s[1] >> 2;\n    uint32_t r5 = s[2] >> 3;\n    uint32_t rgb_565 = (r5 << 11) | (g6 << 5) | (b5 << 0);\n    wuffs_base__store_u16le__no_bounds_check(d + (0 * 2), (uint16_t)rgb_565);\n\n    s += 1 * 3;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len2 = dst.len / 2;\n  size_t src_len4 = src.len / 4;\n  size_t len = " +
+	"dst_len2 < src_len4 ? dst_len2 : src_len4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (0 * 2),\n        wuffs_base__color_u32_argb_premul__as__color_u16_rgb_565(\n            wuffs_base__premul_u32_axxx(\n                wuffs_base__load_u32le__no_bounds_check(s + (0 * 4)))));\n\n    s += 1 * 4;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src_over(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len2 = dst.len / 2;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len2 < src_len4 ? dst_len2 : src_len4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    // Convert from 8-bit color to 16-bit color.\n    uint32_t sa = 0x101 * ((uint32_t)s[3]);\n    uint32_t sr = 0x101 * ((uint32_t)s[2]);\n    uint32_" +
+	"t sg = 0x101 * ((uint32_t)s[1]);\n    uint32_t sb = 0x101 * ((uint32_t)s[0]);\n\n    // Convert from 565 color to 16-bit color.\n    uint32_t old_rgb_565 = wuffs_base__load_u16le__no_bounds_check(d + (0 * 2));\n    uint32_t old_r5 = 0x1F & (old_rgb_565 >> 11);\n    uint32_t dr = (0x8421 * old_r5) >> 4;\n    uint32_t old_g6 = 0x3F & (old_rgb_565 >> 5);\n    uint32_t dg = (0x1041 * old_g6) >> 2;\n    uint32_t old_b5 = 0x1F & (old_rgb_565 >> 0);\n    uint32_t db = (0x8421 * old_b5) >> 4;\n\n    // Calculate the inverse of the src-alpha: how much of the dst to keep.\n    uint32_t ia = 0xFFFF - sa;\n\n    // Composite src (nonpremul) over dst (premul).\n    dr = ((sr * sa) + (dr * ia)) / 0xFFFF;\n    dg = ((sg * sa) + (dg * ia)) / 0xFFFF;\n    db = ((sb * sa) + (db * ia)) / 0xFFFF;\n\n    // Convert from 16-bit color to 565 color and combine the components.\n    uint32_t new_r5 = 0x1F & (dr >> 11);\n    uint32_t new_g6 = 0x3F & (dg >> 10);\n    uint32_t new_b5 = 0x1F & (db >> 11);\n    uint32_t new_rgb_565 = (new_r5 << 11) | (new_g6 << 5" +
+	") | (new_b5 << 0);\n    wuffs_base__store_u16le__no_bounds_check(d + (0 * 2),\n                                             (uint16_t)new_rgb_565);\n\n    s += 1 * 4;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__y(wuffs_base__slice_u8 dst,\n                                       wuffs_base__slice_u8 dst_palette,\n                                       wuffs_base__slice_u8 src) {\n  size_t dst_len2 = dst.len / 2;\n  size_t len = dst_len2 < src.len ? dst_len2 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t y5 = s[0] >> 3;\n    uint32_t y6 = s[0] >> 2;\n    uint32_t rgb_565 = (y5 << 11) | (y6 << 5) | (y5 << 0);\n    wuffs_base__store_u16le__no_bounds_check(d + (0 * 2), (uint16_t)rgb_565);\n\n    s += 1 * 1;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__index__src(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_p" +
+	"alette,\n    wuffs_base__slice_u8 src) {\n  if (dst_palette.len != 1024) {\n    return 0;\n  }\n  size_t dst_len2 = dst.len / 2;\n  size_t len = dst_len2 < src.len ? dst_len2 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (0 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[0] * 4)));\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (1 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[1] * 4)));\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (2 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[2] * 4)));\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (3 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[3] * 4)));\n\n    s += l" +
+	"oop_unroll_count * 1;\n    d += loop_unroll_count * 2;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    wuffs_base__store_u16le__no_bounds_check(\n        d + (0 * 2), wuffs_base__load_u16le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[0] * 4)));\n\n    s += 1 * 1;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr_565__index_binary_alpha__src_over(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  if (dst_palette.len != 1024) {\n    return 0;\n  }\n  size_t dst_len2 = dst.len / 2;\n  size_t len = dst_len2 < src.len ? dst_len2 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +\n                                                          ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u16le__no_bounds_check(d + (0 * 2), (uint16_t)s0);\n " +
+	"   }\n\n    s += 1 * 1;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
+	"" +
+	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len3 = dst.len / 3;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len3 < src_len4 ? dst_len3 : src_len4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__premul_u32_axxx(\n        wuffs_base__load_u32le__no_bounds_check(s + (0 * 4)));\n    wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);\n\n    s += 1 * 4;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src_over(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len3 = dst.len / 3;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len3 < src_len4 ? dst_len3 : src_len4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len" +
+	";\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    // Convert from 8-bit color to 16-bit color.\n    uint32_t sa = 0x101 * ((uint32_t)s[3]);\n    uint32_t sr = 0x101 * ((uint32_t)s[2]);\n    uint32_t sg = 0x101 * ((uint32_t)s[1]);\n    uint32_t sb = 0x101 * ((uint32_t)s[0]);\n    uint32_t dr = 0x101 * ((uint32_t)d[2]);\n    uint32_t dg = 0x101 * ((uint32_t)d[1]);\n    uint32_t db = 0x101 * ((uint32_t)d[0]);\n\n    // Calculate the inverse of the src-alpha: how much of the dst to keep.\n    uint32_t ia = 0xFFFF - sa;\n\n    // Composite src (nonpremul) over dst (premul).\n    dr = ((sr * sa) + (dr * ia)) / 0xFFFF;\n    dg = ((sg * sa) + (dg * ia)) / 0xFFFF;\n    db = ((sb * sa) + (db * ia)) / 0xFFFF;\n\n    // Convert from 16-bit color to 8-bit color.\n    d[0] = (uint8_t)(db >> 8);\n    d[1] = (uint8_t)(dg >> 8);\n    d[2] = (uint8_t)(dr >> 8);\n\n    s += 1 * 4;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
 	"" +
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul__src_over(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len4 < src_len4 ? dst_len4 : src_len4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t d0 = wuffs_base__load_u32le__no_bounds_check(d + (0 * 4));\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(s + (0 * 4));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4),\n        wuffs_base__composite_nonpremul_nonpremul_u32_axxx(d0, s0));\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
 	"" +
@@ -61,7 +70,8 @@
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxx__index__src(wuffs_base__slice_u8 dst,\n                                            wuffs_base__slice_u8 dst_palette,\n                                            wuffs_base__slice_u8 src) {\n  if (dst_palette.len != 1024) {\n    return 0;\n  }\n  size_t dst_len3 = dst.len / 3;\n  size_t len = dst_len3 < src.len ? dst_len3 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  // The comparison in the while condition is \">\", not \">=\", because with\n  // \">=\", the last 4-byte store could write past the end of the dst slice.\n  //\n  // Each 4-byte store writes one too many bytes, but a subsequent store\n  // will overwrite that with the correct byte. There is always another\n  // store, whether a 4-byte store in this loop or a 1-byte store in the\n  // next loop.\n  while (n > loop_unroll_count) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 3), wuffs_base__load_u32le__no_bounds_c" +
 	"heck(\n                         dst_palette.ptr + ((size_t)s[0] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (1 * 3), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[1] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (2 * 3), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[2] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (3 * 3), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[3] * 4)));\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 3;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +\n                                                          ((size_t)s[0] * 4));\n    wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);\n\n    s += 1 * 1;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__" +
 	"pixel_swizzler__xxx__index_binary_alpha__src_over(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  if (dst_palette.len != 1024) {\n    return 0;\n  }\n  size_t dst_len3 = dst.len / 3;\n  size_t len = dst_len3 < src.len ? dst_len3 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +\n                                                          ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);\n    }\n    uint32_t s1 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +\n                                                          ((size_t)s[1] * 4));\n    if (s1) {\n      wuffs_base__store_u24le__no_bounds_check(d + (1 * 3), s1);\n    }\n    uint32_t s2 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +\n                                    " +
-	"                      ((size_t)s[2] * 4));\n    if (s2) {\n      wuffs_base__store_u24le__no_bounds_check(d + (2 * 3), s2);\n    }\n    uint32_t s3 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +\n                                                          ((size_t)s[3] * 4));\n    if (s3) {\n      wuffs_base__store_u24le__no_bounds_check(d + (3 * 3), s3);\n    }\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 3;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +\n                                                          ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);\n    }\n\n    s += 1 * 1;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
+	"                      ((size_t)s[2] * 4));\n    if (s2) {\n      wuffs_base__store_u24le__no_bounds_check(d + (2 * 3), s2);\n    }\n    uint32_t s3 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +\n                                                          ((size_t)s[3] * 4));\n    if (s3) {\n      wuffs_base__store_u24le__no_bounds_check(d + (3 * 3), s3);\n    }\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 3;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +\n                                                          ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);\n    }\n\n    s += 1 * 1;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxx__y(wuffs_base__slice_u8 dst,\n                                   wuffs_base__slice_u8 dst_palette,\n                                   wuffs_base__slice_u8 src) {\n  size_t dst_len3 =" +
+	" dst.len / 3;\n  size_t len = dst_len3 < src.len ? dst_len3 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint8_t s0 = s[0];\n    d[0] = s0;\n    d[1] = s0;\n    d[2] = s0;\n\n    s += 1 * 1;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
 	"" +
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__index__src(wuffs_base__slice_u8 dst,\n                                             wuffs_base__slice_u8 dst_palette,\n                                             wuffs_base__slice_u8 src) {\n  if (dst_palette.len != 1024) {\n    return 0;\n  }\n  size_t dst_len4 = dst.len / 4;\n  size_t len = dst_len4 < src.len ? dst_len4 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[0] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (1 * 4), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[1] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (2 * 4), wuffs_base__load_u32le__no_bounds_check(\n                         dst_pale" +
 	"tte.ptr + ((size_t)s[2] * 4)));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (3 * 4), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[3] * 4)));\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 4;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), wuffs_base__load_u32le__no_bounds_check(\n                         dst_palette.ptr + ((size_t)s[0] * 4)));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  if (dst_palette.len != 1024) {\n    return 0;\n  }\n  size_t dst_len4 = dst.len / 4;\n  size_t len = dst_len4 < src.len ? dst_len4 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count)" +
@@ -69,13 +79,14 @@
 	" 4;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +\n                                                          ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u32le__no_bounds_check(d + (0 * 4), s0);\n    }\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__xxx(wuffs_base__slice_u8 dst,\n                                      wuffs_base__slice_u8 dst_palette,\n                                      wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len3 = src.len / 3;\n  size_t len = dst_len4 < src_len3 ? dst_len4 : src_len3;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4),\n        0xFF000000 | wuffs_base__load_u24le__no_bounds_check(s + (0 * 3)));\n\n    s += 1 * 3;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n" +
 	"\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__y(wuffs_base__slice_u8 dst,\n                                    wuffs_base__slice_u8 dst_palette,\n                                    wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t len = dst_len4 < src.len ? dst_len4 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), 0xFF000000 | (0x010101 * (uint32_t)s[0]));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
 	"" +
-	"// --------\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__y(wuffs_base__pixel_swizzler* p,\n                                       wuffs_base__pixel_format dst_format,\n                                       wuffs_base__slice_u8 dst_palette,\n                                       wuffs_base__slice_u8 src_palette,\n                                       wuffs_base__pixel_blend blend) {\n  switch (dst_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      return wuf" +
-	"fs_base__pixel_swizzler__xxxx__y;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__indexed__bgra_binary(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_format,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__copy_1_1;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      if (wuffs_base__pixel_swizzler__squash_bgr_565_888(dst_palette,\n                                                         src_palette) != 1024) {\n     " +
-	"   return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__index__src;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxxx__inde" +
-	"x__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(dst_palette,\n                                                     src_palette) != 1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(dst_palette,\n                                                     src_palette) != 1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BA" +
-	"SE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgr(wuffs_base__pixel_swizzler* p,\n                                         wuffs_base__pixel_format dst_format,\n                                         wuffs_base__slice_u8 dst_palette,\n                                         wuffs_base__slice_u8 src_palette,\n                                         wuffs_base__pixel_blend blend) {\n  switch (dst_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXE" +
-	"L_FORMAT__BGRX:\n      return wuffs_base__pixel_swizzler__xxxx__xxx;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgra_nonpremul(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_format,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__copy_4_4;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n" +
-	"          return wuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\n" +
+	"// --------\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__y(wuffs_base__pixel_swizzler* p,\n                                       wuffs_base__pixel_format dst_format,\n                                       wuffs_base__slice_u8 dst_palette,\n                                       wuffs_base__slice_u8 src_palette,\n                                       wuffs_base__pixel_blend blend) {\n  switch (dst_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__y;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      return wuffs_base__pixel_swizzler__xxx__y;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WU" +
+	"FFS_BASE__PIXEL_FORMAT__RGBX:\n      return wuffs_base__pixel_swizzler__xxxx__y;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__indexed__bgra_binary(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_format,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__copy_1_1;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      if (wuffs_base__pixel_swizzler__squash_bgr_565_888(dst_palette,\n                                        " +
+	"                 src_palette) != 1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr_565__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_pa" +
+	"lette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(dst_palette,\n                                                     src_palette) != 1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n      if (wuffs_base__pixel_swizzler__" +
+	"swap_rgbx_bgrx(dst_palette,\n                                                     src_palette) != 1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgr(wuffs_base__pixel_swizzler* p,\n                                         wuffs_base__pixel_format dst_format,\n                                         wuffs_base__slice_u8 dst_palette,\n                                         wuffs_base__slice_u8 src_palette,\n                                         wuffs_base__pixel_blend blend) {\n  switch (dst_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      return wuffs_base__pixel_swizzler__bgr_565__bgr;\n\n    case WUFFS_BASE__PIXEL_FORMAT__B" +
+	"GR:\n      return wuffs_base__pixel_swizzler__copy_3_3;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      return wuffs_base__pixel_swizzler__xxxx__xxx;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgra_nonpremul(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_format,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_b" +
+	"ase__pixel_swizzler__bgr_565__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__copy_4_4;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_nonpremul__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swiz" +
+	"zler__bgra_premul__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\n" +
 	"" +
 	"// --------\n\nwuffs_base__status  //\nwuffs_base__pixel_swizzler__prepare(wuffs_base__pixel_swizzler* p,\n                                    wuffs_base__pixel_format dst_format,\n                                    wuffs_base__slice_u8 dst_palette,\n                                    wuffs_base__pixel_format src_format,\n                                    wuffs_base__slice_u8 src_palette,\n                                    wuffs_base__pixel_blend blend) {\n  if (!p) {\n    return wuffs_base__make_status(wuffs_base__error__bad_receiver);\n  }\n\n  // TODO: support many more formats.\n\n  wuffs_base__pixel_swizzler__func func = NULL;\n\n  switch (src_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__Y:\n      func = wuffs_base__pixel_swizzler__prepare__y(p, dst_format, dst_palette,\n                                                    src_palette, blend);\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:\n      func = wuffs_base__pixel_swizzler__prepare__indexed__bgra_binary(\n          p, dst_format, dst_" +
 	"palette, src_palette, blend);\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      func = wuffs_base__pixel_swizzler__prepare__bgr(\n          p, dst_format, dst_palette, src_palette, blend);\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      func = wuffs_base__pixel_swizzler__prepare__bgra_nonpremul(\n          p, dst_format, dst_palette, src_palette, blend);\n      break;\n  }\n\n  p->private_impl.func = func;\n  return wuffs_base__make_status(\n      func ? NULL : wuffs_base__error__unsupported_pixel_swizzler_option);\n}\n\nuint64_t  //\nwuffs_base__pixel_swizzler__swizzle_interleaved(\n    const wuffs_base__pixel_swizzler* p,\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  if (p && p->private_impl.func) {\n    return (*p->private_impl.func)(dst, dst_palette, src);\n  }\n  return 0;\n}\n" +
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 73d07eb..b522e68 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -8542,8 +8542,9 @@
     uint32_t b5 = 0x1F & (argb >> (8 - 5));
     uint32_t g6 = 0x3F & (argb >> (16 - 6));
     uint32_t r5 = 0x1F & (argb >> (24 - 5));
+    uint32_t alpha = argb & 0xFF000000;
     wuffs_base__store_u32le__no_bounds_check(
-        d, (r5 << 11) | (g6 << 5) | (b5 << 0));
+        d, alpha | (r5 << 11) | (g6 << 5) | (b5 << 0));
     s += 4;
     d += 4;
   }
@@ -8957,6 +8958,19 @@
 }
 
 static uint64_t  //
+wuffs_base__pixel_swizzler__copy_3_3(wuffs_base__slice_u8 dst,
+                                     wuffs_base__slice_u8 dst_palette,  // not read here
+                                     wuffs_base__slice_u8 src) {  // copies whole 3-byte pixels, unchanged
+  size_t dst_len3 = dst.len / 3;  // dst capacity in 3-byte pixels
+  size_t src_len3 = src.len / 3;  // src supply in 3-byte pixels
+  size_t len = dst_len3 < src_len3 ? dst_len3 : src_len3;  // the smaller of the two
+  if (len > 0) {
+    memmove(dst.ptr, src.ptr, len * 3);  // memmove, so overlapping slices are tolerated
+  }
+  return len;  // pixel count, not byte count
+}
+
+static uint64_t  //
 wuffs_base__pixel_swizzler__copy_4_4(wuffs_base__slice_u8 dst,
                                      wuffs_base__slice_u8 dst_palette,
                                      wuffs_base__slice_u8 src) {
@@ -8972,6 +8986,143 @@
 // --------
 
 static uint64_t  //
+wuffs_base__pixel_swizzler__bgr_565__bgr(wuffs_base__slice_u8 dst,
+                                         wuffs_base__slice_u8 dst_palette,  // not read here
+                                         wuffs_base__slice_u8 src) {
+  size_t dst_len2 = dst.len / 2;  // dst capacity in 2-byte pixels
+  size_t src_len3 = src.len / 3;  // src supply in 3-byte pixels
+  size_t len = dst_len2 < src_len3 ? dst_len2 : src_len3;
+  uint8_t* d = dst.ptr;
+  uint8_t* s = src.ptr;
+  size_t n = len;
+  // Each iteration packs one 3-byte src pixel into one 16-bit 565 dst pixel.
+  // TODO: unroll.
+
+  while (n >= 1) {
+    uint32_t b5 = s[0] >> 3;  // top 5 bits of the first src byte (blue)
+    uint32_t g6 = s[1] >> 2;  // top 6 bits of the second src byte (green)
+    uint32_t r5 = s[2] >> 3;  // top 5 bits of the third src byte (red)
+    uint32_t rgb_565 = (r5 << 11) | (g6 << 5) | (b5 << 0);  // r: bits 11-15, g: 5-10, b: 0-4
+    wuffs_base__store_u16le__no_bounds_check(d + (0 * 2), (uint16_t)rgb_565);
+
+    s += 1 * 3;
+    d += 1 * 2;
+    n -= 1;
+  }
+
+  return len;  // pixel count, not byte count
+}
+
+static uint64_t  //
+wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src(
+    wuffs_base__slice_u8 dst,
+    wuffs_base__slice_u8 dst_palette,  // not read here
+    wuffs_base__slice_u8 src) {
+  size_t dst_len2 = dst.len / 2;  // dst capacity in 2-byte pixels
+  size_t src_len4 = src.len / 4;  // src supply in 4-byte pixels
+  size_t len = dst_len2 < src_len4 ? dst_len2 : src_len4;
+  uint8_t* d = dst.ptr;
+  uint8_t* s = src.ptr;
+  size_t n = len;
+  // Each iteration overwrites one 2-byte dst pixel from one 4-byte src pixel.
+  // TODO: unroll.
+
+  while (n >= 1) {
+    wuffs_base__store_u16le__no_bounds_check(  // innermost call runs first: load, premul, narrow, store
+        d + (0 * 2),
+        wuffs_base__color_u32_argb_premul__as__color_u16_rgb_565(
+            wuffs_base__premul_u32_axxx(  // NOTE(review): helper body not visible here; presumably premultiplies color by alpha
+                wuffs_base__load_u32le__no_bounds_check(s + (0 * 4)))));
+
+    s += 1 * 4;
+    d += 1 * 2;
+    n -= 1;
+  }
+
+  return len;  // pixel count, not byte count
+}
+
+static uint64_t  //
+wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src_over(
+    wuffs_base__slice_u8 dst,
+    wuffs_base__slice_u8 dst_palette,  // not read here
+    wuffs_base__slice_u8 src) {
+  size_t dst_len2 = dst.len / 2;  // dst capacity in 2-byte pixels
+  size_t src_len4 = src.len / 4;  // src supply in 4-byte pixels
+  size_t len = dst_len2 < src_len4 ? dst_len2 : src_len4;
+  uint8_t* d = dst.ptr;
+  uint8_t* s = src.ptr;
+  size_t n = len;
+  // Each iteration blends one 4-byte src pixel onto the existing 2-byte dst pixel.
+  // TODO: unroll.
+
+  while (n >= 1) {
+    // Convert from 8-bit color to 16-bit color.
+    uint32_t sa = 0x101 * ((uint32_t)s[3]);  // 0x101 * 0xFF == 0xFFFF
+    uint32_t sr = 0x101 * ((uint32_t)s[2]);
+    uint32_t sg = 0x101 * ((uint32_t)s[1]);
+    uint32_t sb = 0x101 * ((uint32_t)s[0]);
+
+    // Convert from 565 color to 16-bit color.
+    uint32_t old_rgb_565 = wuffs_base__load_u16le__no_bounds_check(d + (0 * 2));
+    uint32_t old_r5 = 0x1F & (old_rgb_565 >> 11);
+    uint32_t dr = (0x8421 * old_r5) >> 4;  // maps 0x1F to 0xFFFF
+    uint32_t old_g6 = 0x3F & (old_rgb_565 >> 5);
+    uint32_t dg = (0x1041 * old_g6) >> 2;  // maps 0x3F to 0xFFFF
+    uint32_t old_b5 = 0x1F & (old_rgb_565 >> 0);
+    uint32_t db = (0x8421 * old_b5) >> 4;  // maps 0x1F to 0xFFFF
+
+    // Calculate the inverse of the src-alpha: how much of the dst to keep.
+    uint32_t ia = 0xFFFF - sa;  // sa + ia == 0xFFFF, so the sums below fit in 32 bits
+
+    // Composite src (nonpremul) over dst (premul).
+    dr = ((sr * sa) + (dr * ia)) / 0xFFFF;
+    dg = ((sg * sa) + (dg * ia)) / 0xFFFF;
+    db = ((sb * sa) + (db * ia)) / 0xFFFF;
+
+    // Convert from 16-bit color to 565 color and combine the components.
+    uint32_t new_r5 = 0x1F & (dr >> 11);  // top 5 of 16 bits
+    uint32_t new_g6 = 0x3F & (dg >> 10);  // top 6 of 16 bits
+    uint32_t new_b5 = 0x1F & (db >> 11);  // top 5 of 16 bits
+    uint32_t new_rgb_565 = (new_r5 << 11) | (new_g6 << 5) | (new_b5 << 0);
+    wuffs_base__store_u16le__no_bounds_check(d + (0 * 2),
+                                             (uint16_t)new_rgb_565);
+
+    s += 1 * 4;
+    d += 1 * 2;
+    n -= 1;
+  }
+
+  return len;  // pixel count, not byte count
+}
+
+static uint64_t  //
+wuffs_base__pixel_swizzler__bgr_565__y(wuffs_base__slice_u8 dst,
+                                       wuffs_base__slice_u8 dst_palette,  // not read here
+                                       wuffs_base__slice_u8 src) {
+  size_t dst_len2 = dst.len / 2;  // dst capacity in 2-byte pixels
+  size_t len = dst_len2 < src.len ? dst_len2 : src.len;  // src supplies one byte per pixel
+  uint8_t* d = dst.ptr;
+  uint8_t* s = src.ptr;
+  size_t n = len;
+  // Each iteration expands one src byte into one 16-bit 565 dst pixel.
+  // TODO: unroll.
+
+  while (n >= 1) {
+    uint32_t y5 = s[0] >> 3;  // top 5 bits, used for both 5-bit fields
+    uint32_t y6 = s[0] >> 2;  // top 6 bits, used for the 6-bit field
+    uint32_t rgb_565 = (y5 << 11) | (y6 << 5) | (y5 << 0);  // same src byte feeds all three fields
+    wuffs_base__store_u16le__no_bounds_check(d + (0 * 2), (uint16_t)rgb_565);
+
+    s += 1 * 1;
+    d += 1 * 2;
+    n -= 1;
+  }
+
+  return len;  // pixel count, not byte count
+}
+
+static uint64_t  //
 wuffs_base__pixel_swizzler__bgr_565__index__src(
     wuffs_base__slice_u8 dst,
     wuffs_base__slice_u8 dst_palette,
@@ -9019,6 +9170,111 @@
   return len;
 }
 
+static uint64_t  // Returns the pixel count processed, or 0 on a bad palette.
+wuffs_base__pixel_swizzler__bgr_565__index_binary_alpha__src_over(
+    wuffs_base__slice_u8 dst,
+    wuffs_base__slice_u8 dst_palette,
+    wuffs_base__slice_u8 src) {
+  if (dst_palette.len != 1024) {  // Need 256 entries of 4 bytes each.
+    return 0;
+  }
+  size_t dst_len2 = dst.len / 2;  // dst pixels are 2 bytes each.
+  size_t len = dst_len2 < src.len ? dst_len2 : src.len;  // src: 1 byte each.
+  uint8_t* d = dst.ptr;
+  uint8_t* s = src.ptr;
+  size_t n = len;
+  // Looks up each 1-byte src index in dst_palette (4 bytes per entry).
+  // TODO: unroll.
+  // NOTE(review): a zero entry presumably means transparent -- confirm.
+  while (n >= 1) {
+    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(dst_palette.ptr +
+                                                          ((size_t)s[0] * 4));
+    if (s0) {  // Zero entries are skipped, leaving the dst pixel as is.
+      wuffs_base__store_u16le__no_bounds_check(d + (0 * 2), (uint16_t)s0);
+    }
+    // Advance one src index byte and one 2-byte dst pixel.
+    s += 1 * 1;
+    d += 1 * 2;
+    n -= 1;
+  }
+
+  return len;
+}
+
+// --------
+
+static uint64_t  // Returns the number of pixels written to dst.
+wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src(
+    wuffs_base__slice_u8 dst,
+    wuffs_base__slice_u8 dst_palette,
+    wuffs_base__slice_u8 src) {
+  size_t dst_len3 = dst.len / 3;  // dst pixels are 3 bytes each.
+  size_t src_len4 = src.len / 4;  // src pixels are 4 bytes each.
+  size_t len = dst_len3 < src_len4 ? dst_len3 : src_len4;
+  uint8_t* d = dst.ptr;
+  uint8_t* s = src.ptr;
+  size_t n = len;
+  // Overwrites dst without reading its old contents; dst_palette is unused.
+  // TODO: unroll.
+  // NOTE(review): store_u24le presumably drops the top (alpha) byte -- confirm.
+  while (n >= 1) {
+    uint32_t s0 = wuffs_base__premul_u32_axxx(
+        wuffs_base__load_u32le__no_bounds_check(s + (0 * 4)));
+    wuffs_base__store_u24le__no_bounds_check(d + (0 * 3), s0);
+    // Advance one 4-byte src pixel and one 3-byte dst pixel.
+    s += 1 * 4;
+    d += 1 * 3;
+    n -= 1;
+  }
+
+  return len;
+}
+
+static uint64_t  // Returns the number of dst pixels blended.
+wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src_over(
+    wuffs_base__slice_u8 dst,
+    wuffs_base__slice_u8 dst_palette,
+    wuffs_base__slice_u8 src) {
+  size_t dst_len3 = dst.len / 3;  // dst pixels are 3 bytes each.
+  size_t src_len4 = src.len / 4;  // src pixels are 4 bytes each.
+  size_t len = dst_len3 < src_len4 ? dst_len3 : src_len4;
+  uint8_t* d = dst.ptr;
+  uint8_t* s = src.ptr;
+  size_t n = len;
+  // Blends each src pixel onto the existing dst pixel, updating dst in place.
+  // TODO: unroll.
+  // dst_palette is not read. Bytes 0, 1, 2 are treated as B, G, R; s[3] as A.
+  while (n >= 1) {
+    // Convert from 8-bit color to 16-bit color (0x101 * 0xFF == 0xFFFF).
+    uint32_t sa = 0x101 * ((uint32_t)s[3]);
+    uint32_t sr = 0x101 * ((uint32_t)s[2]);
+    uint32_t sg = 0x101 * ((uint32_t)s[1]);
+    uint32_t sb = 0x101 * ((uint32_t)s[0]);
+    uint32_t dr = 0x101 * ((uint32_t)d[2]);
+    uint32_t dg = 0x101 * ((uint32_t)d[1]);
+    uint32_t db = 0x101 * ((uint32_t)d[0]);
+
+    // Calculate the inverse of the src-alpha: how much of the dst to keep.
+    uint32_t ia = 0xFFFF - sa;
+
+    // Composite src (nonpremul) over dst (premul): a weighted average by sa.
+    dr = ((sr * sa) + (dr * ia)) / 0xFFFF;
+    dg = ((sg * sa) + (dg * ia)) / 0xFFFF;
+    db = ((sb * sa) + (db * ia)) / 0xFFFF;
+
+    // Convert from 16-bit color to 8-bit color by keeping the high byte.
+    d[0] = (uint8_t)(db >> 8);
+    d[1] = (uint8_t)(dg >> 8);
+    d[2] = (uint8_t)(dr >> 8);
+
+    s += 1 * 4;
+    d += 1 * 3;
+    n -= 1;
+  }
+
+  return len;
+}
+
 // --------
 
 static uint64_t  //
@@ -9221,6 +9477,32 @@
   return len;
 }
 
+static uint64_t  // Returns the number of pixels written to dst.
+wuffs_base__pixel_swizzler__xxx__y(wuffs_base__slice_u8 dst,
+                                   wuffs_base__slice_u8 dst_palette,
+                                   wuffs_base__slice_u8 src) {
+  size_t dst_len3 = dst.len / 3;  // dst pixels are 3 bytes each.
+  size_t len = dst_len3 < src.len ? dst_len3 : src.len;  // src: 1 byte each.
+  uint8_t* d = dst.ptr;
+  uint8_t* s = src.ptr;
+  size_t n = len;
+  // Copies each src byte into all 3 bytes of a dst pixel; dst_palette unused.
+  // TODO: unroll.
+  // All 3 bytes are equal, so the byte order of the dst channels is irrelevant.
+  while (n >= 1) {
+    uint8_t s0 = s[0];
+    d[0] = s0;
+    d[1] = s0;
+    d[2] = s0;
+    // Advance one src byte and one 3-byte dst pixel.
+    s += 1 * 1;
+    d += 1 * 3;
+    n -= 1;
+  }
+
+  return len;
+}
+
 // --------
 
 static uint64_t  //
@@ -9388,13 +9670,11 @@
                                        wuffs_base__pixel_blend blend) {
   switch (dst_format.repr) {
     case WUFFS_BASE__PIXEL_FORMAT__BGR_565:
-      // TODO.
-      break;
+      return wuffs_base__pixel_swizzler__bgr_565__y;
 
     case WUFFS_BASE__PIXEL_FORMAT__BGR:
     case WUFFS_BASE__PIXEL_FORMAT__RGB:
-      // TODO.
-      break;
+      return wuffs_base__pixel_swizzler__xxx__y;
 
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:
@@ -9438,6 +9718,8 @@
       switch (blend) {
         case WUFFS_BASE__PIXEL_BLEND__SRC:
           return wuffs_base__pixel_swizzler__bgr_565__index__src;
+        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:
+          return wuffs_base__pixel_swizzler__bgr_565__index_binary_alpha__src_over;
       }
       return NULL;
 
@@ -9508,12 +9790,10 @@
                                          wuffs_base__pixel_blend blend) {
   switch (dst_format.repr) {
     case WUFFS_BASE__PIXEL_FORMAT__BGR_565:
-      // TODO.
-      break;
+      return wuffs_base__pixel_swizzler__bgr_565__bgr;
 
     case WUFFS_BASE__PIXEL_FORMAT__BGR:
-      // TODO.
-      break;
+      return wuffs_base__pixel_swizzler__copy_3_3;
 
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:
@@ -9541,12 +9821,22 @@
     wuffs_base__pixel_blend blend) {
   switch (dst_format.repr) {
     case WUFFS_BASE__PIXEL_FORMAT__BGR_565:
-      // TODO.
-      break;
+      switch (blend) {
+        case WUFFS_BASE__PIXEL_BLEND__SRC:
+          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src;
+        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:
+          return wuffs_base__pixel_swizzler__bgr_565__bgra_nonpremul__src_over;
+      }
+      return NULL;
 
     case WUFFS_BASE__PIXEL_FORMAT__BGR:
-      // TODO.
-      break;
+      switch (blend) {
+        case WUFFS_BASE__PIXEL_BLEND__SRC:
+          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src;
+        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:
+          return wuffs_base__pixel_swizzler__bgr__bgra_nonpremul__src_over;
+      }
+      return NULL;
 
     case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:
       switch (blend) {
diff --git a/test/c/std/wbmp.c b/test/c/std/wbmp.c
index 3470524..0cb7ad4 100644
--- a/test/c/std/wbmp.c
+++ b/test/c/std/wbmp.c
@@ -137,9 +137,21 @@
     uint32_t pixfmt_repr;
   } dsts[] = {
       {
+          .pixel = 0xFF000010,
+          .pixfmt_repr = WUFFS_BASE__PIXEL_FORMAT__BGR_565,
+      },
+      {
+          .pixel = 0xFF000040,
+          .pixfmt_repr = WUFFS_BASE__PIXEL_FORMAT__BGR,
+      },
+      {
           .pixel = 0x80000040,
           .pixfmt_repr = WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL,
       },
+      {
+          .pixel = 0x80000040,
+          .pixfmt_repr = WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL,
+      },
   };
 
   const wuffs_base__pixel_blend blends[] = {
@@ -184,6 +196,10 @@
                    wuffs_base__pixel_buffer__set_from_slice(
                        &dst_pixbuf, &dst_pixcfg, g_have_slice_u8));
       fill_palette_with_grays(&dst_pixbuf);
+      wuffs_base__pixel_format dst_pixfmt =
+          wuffs_base__make_pixel_format(dsts[d].pixfmt_repr);
+      wuffs_base__pixel_alpha_transparency dst_transparency =
+          wuffs_base__pixel_format__transparency(&dst_pixfmt);
 
       wuffs_base__slice_u8 dst_palette =
           wuffs_base__pixel_buffer__palette(&dst_pixbuf);
@@ -204,8 +220,8 @@
         CHECK_STATUS(
             "prepare",
             wuffs_base__pixel_swizzler__prepare(
-                &swizzler, wuffs_base__make_pixel_format(dsts[d].pixfmt_repr),
-                dst_palette, wuffs_base__make_pixel_format(srcs[s].pixfmt_repr),
+                &swizzler, dst_pixfmt, dst_palette,
+                wuffs_base__make_pixel_format(srcs[s].pixfmt_repr),
                 wuffs_base__pixel_buffer__palette(&src_pixbuf), blends[b]));
         wuffs_base__pixel_swizzler__swizzle_interleaved(
             &swizzler,
@@ -216,17 +232,21 @@
                 wuffs_base__pixel_buffer__plane(&src_pixbuf, 0), height / 2));
 
         // Check the middle dst pixel.
-        uint32_t tolerance = 0;
+        uint32_t tolerance =
+            (dsts[d].pixfmt_repr == WUFFS_BASE__PIXEL_FORMAT__BGR_565) ? 4 : 0;
         wuffs_base__color_u32_argb_premul want_dst_pixel = 0;
         if (blends[b] == WUFFS_BASE__PIXEL_BLEND__SRC) {
           want_dst_pixel = srcs[s].pixel;
         } else if (blends[b] == WUFFS_BASE__PIXEL_BLEND__SRC_OVER) {
-          tolerance = 1;
+          tolerance += 1;
           want_dst_pixel = wuffs_base__composite_premul_premul_u32_axxx(
               dsts[d].pixel, srcs[s].pixel);
         } else {
           return "unsupported blend";
         }
+        if (dst_transparency == WUFFS_BASE__PIXEL_ALPHA_TRANSPARENCY__OPAQUE) {
+          want_dst_pixel |= 0xFF000000;
+        }
         wuffs_base__color_u32_argb_premul have_dst_pixel =
             wuffs_base__pixel_buffer__color_u32_at(&dst_pixbuf, width / 2,
                                                    height / 2);