Have std/bmp support 32 bits per pixel color
diff --git a/internal/cgen/base/image-impl.c b/internal/cgen/base/image-impl.c
index 5ca466b..2dd0339 100644
--- a/internal/cgen/base/image-impl.c
+++ b/internal/cgen/base/image-impl.c
@@ -30,6 +30,36 @@
 }
 
 static inline uint32_t  //
+wuffs_base__composite_premul_nonpremul_u32_axxx(uint32_t dst_premul,
+                                                uint32_t src_nonpremul) {
+  // Convert from 8-bit color to 16-bit color.
+  uint32_t sa = 0x101 * (0xFF & (src_nonpremul >> 24));
+  uint32_t sr = 0x101 * (0xFF & (src_nonpremul >> 16));
+  uint32_t sg = 0x101 * (0xFF & (src_nonpremul >> 8));
+  uint32_t sb = 0x101 * (0xFF & (src_nonpremul >> 0));
+  uint32_t da = 0x101 * (0xFF & (dst_premul >> 24));
+  uint32_t dr = 0x101 * (0xFF & (dst_premul >> 16));
+  uint32_t dg = 0x101 * (0xFF & (dst_premul >> 8));
+  uint32_t db = 0x101 * (0xFF & (dst_premul >> 0));
+
+  // Calculate the inverse of the src-alpha: how much of the dst to keep.
+  uint32_t ia = 0xFFFF - sa;
+
+  // Composite src (nonpremul) over dst (premul).
+  da = sa + ((da * ia) / 0xFFFF);
+  dr = ((sr * sa) + (dr * ia)) / 0xFFFF;
+  dg = ((sg * sa) + (dg * ia)) / 0xFFFF;
+  db = ((sb * sa) + (db * ia)) / 0xFFFF;
+
+  // Convert from 16-bit color to 8-bit color and combine the components.
+  da >>= 8;
+  dr >>= 8;
+  dg >>= 8;
+  db >>= 8;
+  return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);
+}
+
+static inline uint32_t  //
 wuffs_base__premul_u32_axxx(uint32_t nonpremul) {
   // Multiplying by 0x101 (twice, once for alpha and once for color) converts
   // from 8-bit to 16-bit color. Shifting right by 8 undoes that.
@@ -229,6 +259,61 @@
 // --------
 
 static uint64_t  //
+wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src(
+    wuffs_base__slice_u8 dst,
+    wuffs_base__slice_u8 dst_palette,
+    wuffs_base__slice_u8 src) {
+  size_t dst_len4 = dst.len / 4;
+  size_t src_len4 = src.len / 4;
+  size_t len = dst_len4 < src_len4 ? dst_len4 : src_len4;
+  uint8_t* d = dst.ptr;
+  uint8_t* s = src.ptr;
+  size_t n = len;
+
+  // TODO: unroll.
+
+  while (n >= 1) {
+    uint32_t s0 =
+        wuffs_base__premul_u32_axxx(wuffs_base__load_u32le(s + (0 * 4)));
+    wuffs_base__store_u32le(d + (0 * 4), s0);
+
+    s += 1 * 4;
+    d += 1 * 4;
+    n -= 1;
+  }
+
+  return len;
+}
+
+static uint64_t  //
+wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over(
+    wuffs_base__slice_u8 dst,
+    wuffs_base__slice_u8 dst_palette,
+    wuffs_base__slice_u8 src) {
+  size_t dst_len4 = dst.len / 4;
+  size_t src_len4 = src.len / 4;
+  size_t len = dst_len4 < src_len4 ? dst_len4 : src_len4;
+  uint8_t* d = dst.ptr;
+  uint8_t* s = src.ptr;
+  size_t n = len;
+
+  // TODO: unroll.
+
+  while (n >= 1) {
+    uint32_t d0 = wuffs_base__load_u32le(d + (0 * 4));
+    uint32_t s0 = wuffs_base__load_u32le(s + (0 * 4));
+    wuffs_base__store_u32le(
+        d + (0 * 4), wuffs_base__composite_premul_nonpremul_u32_axxx(d0, s0));
+
+    s += 1 * 4;
+    d += 1 * 4;
+    n -= 1;
+  }
+
+  return len;
+}
+
+static uint64_t  //
 wuffs_base__pixel_swizzler__copy_1_1(wuffs_base__slice_u8 dst,
                                      wuffs_base__slice_u8 dst_palette,
                                      wuffs_base__slice_u8 src) {
@@ -236,6 +321,19 @@
 }
 
 static uint64_t  //
+wuffs_base__pixel_swizzler__copy_4_4(wuffs_base__slice_u8 dst,
+                                     wuffs_base__slice_u8 dst_palette,
+                                     wuffs_base__slice_u8 src) {
+  size_t dst_len4 = dst.len / 4;
+  size_t src_len4 = src.len / 4;
+  size_t len = dst_len4 < src_len4 ? dst_len4 : src_len4;
+  if (len > 0) {
+    memmove(dst.ptr, src.ptr, len * 4);
+  }
+  return len;
+}
+
+static uint64_t  //
 wuffs_base__pixel_swizzler__xx__index__src(wuffs_base__slice_u8 dst,
                                            wuffs_base__slice_u8 dst_palette,
                                            wuffs_base__slice_u8 src) {
@@ -737,6 +835,54 @@
   return NULL;
 }
 
+static wuffs_base__pixel_swizzler__func  //
+wuffs_base__pixel_swizzler__prepare__bgra_nonpremul(
+    wuffs_base__pixel_swizzler* p,
+    wuffs_base__pixel_format dst_format,
+    wuffs_base__slice_u8 dst_palette,
+    wuffs_base__slice_u8 src_palette,
+    wuffs_base__pixel_blend blend) {
+  switch (dst_format.repr) {
+    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:
+      // TODO.
+      break;
+
+    case WUFFS_BASE__PIXEL_FORMAT__BGR:
+    case WUFFS_BASE__PIXEL_FORMAT__RGB:
+      // TODO.
+      break;
+
+    case WUFFS_BASE__PIXEL_FORMAT__BGRX:
+    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:
+      switch (blend) {
+        case WUFFS_BASE__PIXEL_BLEND__SRC:
+          return wuffs_base__pixel_swizzler__copy_4_4;
+      }
+      return NULL;
+
+    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:
+      switch (blend) {
+        case WUFFS_BASE__PIXEL_BLEND__SRC:
+          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src;
+        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:
+          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over;
+      }
+      return NULL;
+
+    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:
+      // TODO.
+      break;
+
+    case WUFFS_BASE__PIXEL_FORMAT__RGBX:
+    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:
+    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:
+    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:
+      // TODO.
+      break;
+  }
+  return NULL;
+}
+
 // --------
 
 wuffs_base__status  //
@@ -769,6 +915,11 @@
       func = wuffs_base__pixel_swizzler__prepare__bgr(
           p, dst_format, dst_palette, src_palette, blend);
       break;
+
+    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:
+      func = wuffs_base__pixel_swizzler__prepare__bgra_nonpremul(
+          p, dst_format, dst_palette, src_palette, blend);
+      break;
   }
 
   p->private_impl.func = func;
diff --git a/internal/cgen/data.go b/internal/cgen/data.go
index 8eec91a..5f87923 100644
--- a/internal/cgen/data.go
+++ b/internal/cgen/data.go
@@ -26,24 +26,27 @@
 	""
 
 const baseImageImplC = "" +
-	"// ---------------- Images\n\nconst uint32_t wuffs_base__pixel_format__bits_per_channel[16] = {\n    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,\n    0x08, 0x0A, 0x0C, 0x10, 0x18, 0x20, 0x30, 0x40,\n};\n\nstatic inline uint32_t  //\nwuffs_base__swap_u32_argb_abgr(uint32_t u) {\n  uint32_t o = u & 0xFF00FF00;\n  uint32_t r = u & 0x00FF0000;\n  uint32_t b = u & 0x000000FF;\n  return o | (r >> 16) | (b << 16);\n}\n\nstatic inline uint32_t  //\nwuffs_base__premul_u32_axxx(uint32_t nonpremul) {\n  // Multiplying by 0x101 (twice, once for alpha and once for color) converts\n  // from 8-bit to 16-bit color. Shifting right by 8 undoes that.\n  //\n  // Working in the higher bit depth can produce slightly different (and\n  // arguably slightly more accurate) results. For example, given 8-bit blue\n  // and alpha of 0x80 and 0x81:\n  //\n  //  - ((0x80   * 0x81  ) / 0xFF  )      = 0x40        = 0x40\n  //  - ((0x8080 * 0x8181) / 0xFFFF) >> 8 = 0x4101 >> 8 = 0x41\n  uint32_t a = 0xFF & (nonpremul >> 24);\n  uint32_t a16 = a * (0x101 * 0x101);" +
-	"\n\n  uint32_t r = 0xFF & (nonpremul >> 16);\n  r = ((r * a16) / 0xFFFF) >> 8;\n  uint32_t g = 0xFF & (nonpremul >> 8);\n  g = ((g * a16) / 0xFFFF) >> 8;\n  uint32_t b = 0xFF & (nonpremul >> 0);\n  b = ((b * a16) / 0xFFFF) >> 8;\n\n  return (a << 24) | (r << 16) | (g << 8) | (b << 0);\n}\n\nstatic inline uint32_t  //\nwuffs_base__nonpremul_u32_axxx(uint32_t premul) {\n  uint32_t a = 0xFF & (premul >> 24);\n  if (a == 0xFF) {\n    return premul;\n  } else if (a == 0) {\n    return 0;\n  }\n  uint32_t a16 = a * 0x101;\n\n  uint32_t r = 0xFF & (premul >> 16);\n  r = ((r * (0x101 * 0xFFFF)) / a16) >> 8;\n  uint32_t g = 0xFF & (premul >> 8);\n  g = ((g * (0x101 * 0xFFFF)) / a16) >> 8;\n  uint32_t b = 0xFF & (premul >> 0);\n  b = ((b * (0x101 * 0xFFFF)) / a16) >> 8;\n\n  return (a << 24) | (r << 16) | (g << 8) | (b << 0);\n}\n\nwuffs_base__color_u32_argb_premul  //\nwuffs_base__pixel_buffer__color_u32_at(const wuffs_base__pixel_buffer* b,\n                                       uint32_t x,\n                                       uint32_t y) {\n  if (" +
-	"!b || (x >= b->pixcfg.private_impl.width) ||\n      (y >= b->pixcfg.private_impl.height)) {\n    return 0;\n  }\n\n  if (wuffs_base__pixel_format__is_planar(&b->pixcfg.private_impl.pixfmt)) {\n    // TODO: support planar formats.\n    return 0;\n  }\n\n  size_t stride = b->private_impl.planes[0].stride;\n  uint8_t* row = b->private_impl.planes[0].ptr + (stride * ((size_t)y));\n\n  switch (b->pixcfg.private_impl.pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      WUFFS_BASE__FALLTHROUGH;\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      return wuffs_base__load_u32le(row + (4 * ((size_t)x)));\n\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_PREMUL:\n      WUFFS_BASE__FALLTHROUGH;\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY: {\n      uint8_t* palette = b->private_impl.planes[3].ptr;\n      return wuffs_base__load_u32le(palette + (4 * ((size_t)row[x])));\n    }\n\n      // Common formats above. Rarer formats below.\n\n    case WUFFS_BASE__PIXEL_FORMAT__Y:\n      return 0xFF000000 | (0x00010101 * ((uin" +
-	"t32_t)(row[x])));\n\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL: {\n      uint8_t* palette = b->private_impl.planes[3].ptr;\n      return wuffs_base__premul_u32_axxx(\n          wuffs_base__load_u32le(palette + (4 * ((size_t)row[x]))));\n    }\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565: {\n      uint16_t bgr = wuffs_base__load_u16le(row + (2 * ((size_t)x)));\n      uint32_t b5 = 0x1F & (bgr >> 0);\n      uint32_t b = (b5 << 3) | (b5 >> 2);\n      uint32_t g6 = 0x3F & (bgr >> 5);\n      uint32_t g = (g6 << 2) | (g6 >> 4);\n      uint32_t r5 = 0x1F & (bgr >> 11);\n      uint32_t r = (r5 << 3) | (r5 >> 2);\n      return 0xFF000000 | (r << 16) | (g << 8) | (b << 0);\n    }\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      return 0xFF000000 | wuffs_base__load_u24le(row + (3 * ((size_t)x)));\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      return 0xFF000000 | wuffs_base__load_u32le(row + (4 * ((size_t)x)));\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      return wuffs_base__premul_u32_axxx(\n          wuffs_base" +
-	"__load_u32le(row + (4 * ((size_t)x))));\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      return wuffs_base__swap_u32_argb_abgr(\n          0xFF000000 | wuffs_base__load_u24le(row + (3 * ((size_t)x))));\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      return wuffs_base__swap_u32_argb_abgr(\n          0xFF000000 | wuffs_base__load_u32le(row + (4 * ((size_t)x))));\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n      return wuffs_base__swap_u32_argb_abgr(wuffs_base__premul_u32_axxx(\n          wuffs_base__load_u32le(row + (4 * ((size_t)x)))));\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n      WUFFS_BASE__FALLTHROUGH;\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n      return wuffs_base__swap_u32_argb_abgr(\n          wuffs_base__load_u32le(row + (4 * ((size_t)x))));\n\n    default:\n      // TODO: support more formats.\n      break;\n  }\n\n  return 0;\n}\n\nwuffs_base__status  //\nwuffs_base__pixel_buffer__set_color_u32_at(\n    wuffs_base__pixel_buffer* b,\n    uint32_t x,\n    uint32_t y,\n    wuffs_base__color_u32_argb_pr" +
-	"emul color) {\n  if (!b) {\n    return wuffs_base__make_status(wuffs_base__error__bad_receiver);\n  }\n  if ((x >= b->pixcfg.private_impl.width) ||\n      (y >= b->pixcfg.private_impl.height)) {\n    return wuffs_base__make_status(wuffs_base__error__bad_argument);\n  }\n\n  if (wuffs_base__pixel_format__is_planar(&b->pixcfg.private_impl.pixfmt)) {\n    // TODO: support planar formats.\n    return wuffs_base__make_status(wuffs_base__error__unsupported_option);\n  }\n\n  size_t stride = b->private_impl.planes[0].stride;\n  uint8_t* row = b->private_impl.planes[0].ptr + (stride * ((size_t)y));\n\n  switch (b->pixcfg.private_impl.pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      WUFFS_BASE__FALLTHROUGH;\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      wuffs_base__store_u32le(row + (4 * ((size_t)x)), color);\n      break;\n\n      // Common formats above. Rarer formats below.\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565: {\n      uint32_t b5 = 0x1F & (color >> (8 - 5));\n      uint32_t g6 = 0x3F & (color >> (16 - 6));\n " +
-	"     uint32_t r5 = 0x1F & (color >> (24 - 5));\n      uint32_t bgr565 = (b5 << 0) | (g6 << 5) | (r5 << 11);\n      wuffs_base__store_u16le(row + (2 * ((size_t)x)), (uint16_t)bgr565);\n      break;\n    }\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      wuffs_base__store_u24le(row + (3 * ((size_t)x)), color);\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      wuffs_base__store_u32le(row + (4 * ((size_t)x)),\n                              wuffs_base__nonpremul_u32_axxx(color));\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      wuffs_base__store_u24le(row + (3 * ((size_t)x)),\n                              wuffs_base__swap_u32_argb_abgr(color));\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n      wuffs_base__store_u32le(row + (4 * ((size_t)x)),\n                              wuffs_base__nonpremul_u32_axxx(\n                                  wuffs_base__swap_u32_argb_abgr(color)));\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n      WUFFS_BASE__FALLTHROUGH;\n    c" +
-	"ase WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      wuffs_base__store_u32le(row + (4 * ((size_t)x)),\n                              wuffs_base__swap_u32_argb_abgr(color));\n      break;\n\n    default:\n      // TODO: support more formats.\n      return wuffs_base__make_status(wuffs_base__error__unsupported_option);\n  }\n\n  return wuffs_base__make_status(NULL);\n}\n\n" +
+	"// ---------------- Images\n\nconst uint32_t wuffs_base__pixel_format__bits_per_channel[16] = {\n    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,\n    0x08, 0x0A, 0x0C, 0x10, 0x18, 0x20, 0x30, 0x40,\n};\n\nstatic inline uint32_t  //\nwuffs_base__swap_u32_argb_abgr(uint32_t u) {\n  uint32_t o = u & 0xFF00FF00;\n  uint32_t r = u & 0x00FF0000;\n  uint32_t b = u & 0x000000FF;\n  return o | (r >> 16) | (b << 16);\n}\n\nstatic inline uint32_t  //\nwuffs_base__composite_premul_nonpremul_u32_axxx(uint32_t dst_premul,\n                                                uint32_t src_nonpremul) {\n  // Convert from 8-bit color to 16-bit color.\n  uint32_t sa = 0x101 * (0xFF & (src_nonpremul >> 24));\n  uint32_t sr = 0x101 * (0xFF & (src_nonpremul >> 16));\n  uint32_t sg = 0x101 * (0xFF & (src_nonpremul >> 8));\n  uint32_t sb = 0x101 * (0xFF & (src_nonpremul >> 0));\n  uint32_t da = 0x101 * (0xFF & (dst_premul >> 24));\n  uint32_t dr = 0x101 * (0xFF & (dst_premul >> 16));\n  uint32_t dg = 0x101 * (0xFF & (dst_premul >> 8));\n  uint32_t db = 0x101" +
+	" * (0xFF & (dst_premul >> 0));\n\n  // Calculate the inverse of the src-alpha: how much of the dst to keep.\n  uint32_t ia = 0xFFFF - sa;\n\n  // Composite src (nonpremul) over dst (premul).\n  da = sa + ((da * ia) / 0xFFFF);\n  dr = ((sr * sa) + (dr * ia)) / 0xFFFF;\n  dg = ((sg * sa) + (dg * ia)) / 0xFFFF;\n  db = ((sb * sa) + (db * ia)) / 0xFFFF;\n\n  // Convert from 16-bit color to 8-bit color and combine the components.\n  da >>= 8;\n  dr >>= 8;\n  dg >>= 8;\n  db >>= 8;\n  return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);\n}\n\nstatic inline uint32_t  //\nwuffs_base__premul_u32_axxx(uint32_t nonpremul) {\n  // Multiplying by 0x101 (twice, once for alpha and once for color) converts\n  // from 8-bit to 16-bit color. Shifting right by 8 undoes that.\n  //\n  // Working in the higher bit depth can produce slightly different (and\n  // arguably slightly more accurate) results. For example, given 8-bit blue\n  // and alpha of 0x80 and 0x81:\n  //\n  //  - ((0x80   * 0x81  ) / 0xFF  )      = 0x40        = 0x40\n  //  - ((0x8080 * 0" +
+	"x8181) / 0xFFFF) >> 8 = 0x4101 >> 8 = 0x41\n  uint32_t a = 0xFF & (nonpremul >> 24);\n  uint32_t a16 = a * (0x101 * 0x101);\n\n  uint32_t r = 0xFF & (nonpremul >> 16);\n  r = ((r * a16) / 0xFFFF) >> 8;\n  uint32_t g = 0xFF & (nonpremul >> 8);\n  g = ((g * a16) / 0xFFFF) >> 8;\n  uint32_t b = 0xFF & (nonpremul >> 0);\n  b = ((b * a16) / 0xFFFF) >> 8;\n\n  return (a << 24) | (r << 16) | (g << 8) | (b << 0);\n}\n\nstatic inline uint32_t  //\nwuffs_base__nonpremul_u32_axxx(uint32_t premul) {\n  uint32_t a = 0xFF & (premul >> 24);\n  if (a == 0xFF) {\n    return premul;\n  } else if (a == 0) {\n    return 0;\n  }\n  uint32_t a16 = a * 0x101;\n\n  uint32_t r = 0xFF & (premul >> 16);\n  r = ((r * (0x101 * 0xFFFF)) / a16) >> 8;\n  uint32_t g = 0xFF & (premul >> 8);\n  g = ((g * (0x101 * 0xFFFF)) / a16) >> 8;\n  uint32_t b = 0xFF & (premul >> 0);\n  b = ((b * (0x101 * 0xFFFF)) / a16) >> 8;\n\n  return (a << 24) | (r << 16) | (g << 8) | (b << 0);\n}\n\nwuffs_base__color_u32_argb_premul  //\nwuffs_base__pixel_buffer__color_u32_at(const wuffs_base__pixel_" +
+	"buffer* b,\n                                       uint32_t x,\n                                       uint32_t y) {\n  if (!b || (x >= b->pixcfg.private_impl.width) ||\n      (y >= b->pixcfg.private_impl.height)) {\n    return 0;\n  }\n\n  if (wuffs_base__pixel_format__is_planar(&b->pixcfg.private_impl.pixfmt)) {\n    // TODO: support planar formats.\n    return 0;\n  }\n\n  size_t stride = b->private_impl.planes[0].stride;\n  uint8_t* row = b->private_impl.planes[0].ptr + (stride * ((size_t)y));\n\n  switch (b->pixcfg.private_impl.pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      WUFFS_BASE__FALLTHROUGH;\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      return wuffs_base__load_u32le(row + (4 * ((size_t)x)));\n\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_PREMUL:\n      WUFFS_BASE__FALLTHROUGH;\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY: {\n      uint8_t* palette = b->private_impl.planes[3].ptr;\n      return wuffs_base__load_u32le(palette + (4 * ((size_t)row[x])));\n    }\n\n      // Common" +
+	" formats above. Rarer formats below.\n\n    case WUFFS_BASE__PIXEL_FORMAT__Y:\n      return 0xFF000000 | (0x00010101 * ((uint32_t)(row[x])));\n\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL: {\n      uint8_t* palette = b->private_impl.planes[3].ptr;\n      return wuffs_base__premul_u32_axxx(\n          wuffs_base__load_u32le(palette + (4 * ((size_t)row[x]))));\n    }\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565: {\n      uint16_t bgr = wuffs_base__load_u16le(row + (2 * ((size_t)x)));\n      uint32_t b5 = 0x1F & (bgr >> 0);\n      uint32_t b = (b5 << 3) | (b5 >> 2);\n      uint32_t g6 = 0x3F & (bgr >> 5);\n      uint32_t g = (g6 << 2) | (g6 >> 4);\n      uint32_t r5 = 0x1F & (bgr >> 11);\n      uint32_t r = (r5 << 3) | (r5 >> 2);\n      return 0xFF000000 | (r << 16) | (g << 8) | (b << 0);\n    }\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      return 0xFF000000 | wuffs_base__load_u24le(row + (3 * ((size_t)x)));\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      return 0xFF000000 | wuffs_base__load_u32le(row + (4 * ((size_" +
+	"t)x)));\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      return wuffs_base__premul_u32_axxx(\n          wuffs_base__load_u32le(row + (4 * ((size_t)x))));\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      return wuffs_base__swap_u32_argb_abgr(\n          0xFF000000 | wuffs_base__load_u24le(row + (3 * ((size_t)x))));\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      return wuffs_base__swap_u32_argb_abgr(\n          0xFF000000 | wuffs_base__load_u32le(row + (4 * ((size_t)x))));\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n      return wuffs_base__swap_u32_argb_abgr(wuffs_base__premul_u32_axxx(\n          wuffs_base__load_u32le(row + (4 * ((size_t)x)))));\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n      WUFFS_BASE__FALLTHROUGH;\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n      return wuffs_base__swap_u32_argb_abgr(\n          wuffs_base__load_u32le(row + (4 * ((size_t)x))));\n\n    default:\n      // TODO: support more formats.\n      break;\n  }\n\n  return 0;\n}\n\nwuffs_base__status  //\nwuffs_base__pixel_buf" +
+	"fer__set_color_u32_at(\n    wuffs_base__pixel_buffer* b,\n    uint32_t x,\n    uint32_t y,\n    wuffs_base__color_u32_argb_premul color) {\n  if (!b) {\n    return wuffs_base__make_status(wuffs_base__error__bad_receiver);\n  }\n  if ((x >= b->pixcfg.private_impl.width) ||\n      (y >= b->pixcfg.private_impl.height)) {\n    return wuffs_base__make_status(wuffs_base__error__bad_argument);\n  }\n\n  if (wuffs_base__pixel_format__is_planar(&b->pixcfg.private_impl.pixfmt)) {\n    // TODO: support planar formats.\n    return wuffs_base__make_status(wuffs_base__error__unsupported_option);\n  }\n\n  size_t stride = b->private_impl.planes[0].stride;\n  uint8_t* row = b->private_impl.planes[0].ptr + (stride * ((size_t)y));\n\n  switch (b->pixcfg.private_impl.pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      WUFFS_BASE__FALLTHROUGH;\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      wuffs_base__store_u32le(row + (4 * ((size_t)x)), color);\n      break;\n\n      // Common formats above. Rarer formats below.\n\n    case WUFFS_BASE__" +
+	"PIXEL_FORMAT__BGR_565: {\n      uint32_t b5 = 0x1F & (color >> (8 - 5));\n      uint32_t g6 = 0x3F & (color >> (16 - 6));\n      uint32_t r5 = 0x1F & (color >> (24 - 5));\n      uint32_t bgr565 = (b5 << 0) | (g6 << 5) | (r5 << 11);\n      wuffs_base__store_u16le(row + (2 * ((size_t)x)), (uint16_t)bgr565);\n      break;\n    }\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      wuffs_base__store_u24le(row + (3 * ((size_t)x)), color);\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      wuffs_base__store_u32le(row + (4 * ((size_t)x)),\n                              wuffs_base__nonpremul_u32_axxx(color));\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      wuffs_base__store_u24le(row + (3 * ((size_t)x)),\n                              wuffs_base__swap_u32_argb_abgr(color));\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n      wuffs_base__store_u32le(row + (4 * ((size_t)x)),\n                              wuffs_base__nonpremul_u32_axxx(\n                                  wuffs_base__swap_" +
+	"u32_argb_abgr(color)));\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n      WUFFS_BASE__FALLTHROUGH;\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      wuffs_base__store_u32le(row + (4 * ((size_t)x)),\n                              wuffs_base__swap_u32_argb_abgr(color));\n      break;\n\n    default:\n      // TODO: support more formats.\n      return wuffs_base__make_status(wuffs_base__error__unsupported_option);\n  }\n\n  return wuffs_base__make_status(NULL);\n}\n\n" +
 	"" +
-	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__copy_1_1(wuffs_base__slice_u8 dst,\n                                     wuffs_base__slice_u8 dst_palette,\n                                     wuffs_base__slice_u8 src) {\n  return wuffs_base__slice_u8__copy_from_slice(dst, src);\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xx__index__src(wuffs_base__slice_u8 dst,\n                                           wuffs_base__slice_u8 dst_palette,\n                                           wuffs_base__slice_u8 src) {\n  if (dst_palette.len != 1024) {\n    return 0;\n  }\n  size_t dst_len2 = dst.len / 2;\n  size_t len = dst_len2 < src.len ? dst_len2 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    wuffs_base__store_u16le(\n        d + (0 * 2),\n        wuffs_base__load_u16le(dst_palette.ptr + ((size_t)s[0] * 4)));\n    wuffs_base__store_u16le(\n        d + (1 * 2),\n        wuffs_base__load_u16le(dst_palette" +
-	".ptr + ((size_t)s[1] * 4)));\n    wuffs_base__store_u16le(\n        d + (2 * 2),\n        wuffs_base__load_u16le(dst_palette.ptr + ((size_t)s[2] * 4)));\n    wuffs_base__store_u16le(\n        d + (3 * 2),\n        wuffs_base__load_u16le(dst_palette.ptr + ((size_t)s[3] * 4)));\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 2;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    wuffs_base__store_u16le(\n        d + (0 * 2),\n        wuffs_base__load_u16le(dst_palette.ptr + ((size_t)s[0] * 4)));\n\n    s += 1 * 1;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxx__index__src(wuffs_base__slice_u8 dst,\n                                            wuffs_base__slice_u8 dst_palette,\n                                            wuffs_base__slice_u8 src) {\n  if (dst_palette.len != 1024) {\n    return 0;\n  }\n  size_t dst_len3 = dst.len / 3;\n  size_t len = dst_len3 < src.len ? dst_len3 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n " +
-	" const size_t loop_unroll_count = 4;\n\n  // The comparison in the while condition is \">\", not \">=\", because with \">=\",\n  // the last 4-byte store could write past the end of the dst slice.\n  //\n  // Each 4-byte store writes one too many bytes, but a subsequent store will\n  // overwrite that with the correct byte. There is always another store,\n  // whether a 4-byte store in this loop or a 1-byte store in the next loop.\n  while (n > loop_unroll_count) {\n    wuffs_base__store_u32le(\n        d + (0 * 3),\n        wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[0] * 4)));\n    wuffs_base__store_u32le(\n        d + (1 * 3),\n        wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[1] * 4)));\n    wuffs_base__store_u32le(\n        d + (2 * 3),\n        wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[2] * 4)));\n    wuffs_base__store_u32le(\n        d + (3 * 3),\n        wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[3] * 4)));\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 3;\n    n -= loop_unrol" +
-	"l_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[0] * 4));\n    d[0] = (uint8_t)(s0 >> 0);\n    d[1] = (uint8_t)(s0 >> 8);\n    d[2] = (uint8_t)(s0 >> 16);\n\n    s += 1 * 1;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  if (dst_palette.len != 1024) {\n    return 0;\n  }\n  size_t dst_len3 = dst.len / 3;\n  size_t len = dst_len3 < src.len ? dst_len3 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    uint32_t s0 = wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u24le(d + (0 * 4), s0);\n    }\n    uint32_t s1 = wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[1] * 4));\n    if (s1) {\n      wuffs_base__store_u24le(d + (1 * 4), s" +
-	"1);\n    }\n    uint32_t s2 = wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[2] * 4));\n    if (s2) {\n      wuffs_base__store_u24le(d + (2 * 4), s2);\n    }\n    uint32_t s3 = wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[3] * 4));\n    if (s3) {\n      wuffs_base__store_u24le(d + (3 * 4), s3);\n    }\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 3;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u24le(d + (0 * 4), s0);\n    }\n\n    s += 1 * 1;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__index__src(wuffs_base__slice_u8 dst,\n                                             wuffs_base__slice_u8 dst_palette,\n                                             wuffs_base__slice_u8 src) {\n  if (dst_palette.len != 1024) {\n    return 0;\n  }\n  size_t dst_len4 = dst.len / 4;\n  size_t len = dst_len4 < src.len ? dst_len4 : src.len;\n  u" +
-	"int8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    wuffs_base__store_u32le(\n        d + (0 * 4),\n        wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[0] * 4)));\n    wuffs_base__store_u32le(\n        d + (1 * 4),\n        wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[1] * 4)));\n    wuffs_base__store_u32le(\n        d + (2 * 4),\n        wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[2] * 4)));\n    wuffs_base__store_u32le(\n        d + (3 * 4),\n        wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[3] * 4)));\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 4;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    wuffs_base__store_u32le(\n        d + (0 * 4),\n        wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[0] * 4)));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over(\n    wuffs_bas" +
-	"e__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  if (dst_palette.len != 1024) {\n    return 0;\n  }\n  size_t dst_len4 = dst.len / 4;\n  size_t len = dst_len4 < src.len ? dst_len4 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    uint32_t s0 = wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u32le(d + (0 * 4), s0);\n    }\n    uint32_t s1 = wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[1] * 4));\n    if (s1) {\n      wuffs_base__store_u32le(d + (1 * 4), s1);\n    }\n    uint32_t s2 = wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[2] * 4));\n    if (s2) {\n      wuffs_base__store_u32le(d + (2 * 4), s2);\n    }\n    uint32_t s3 = wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[3] * 4));\n    if (s3) {\n      wuffs_base__store_u32le(d + (3 * 4), s3);\n    }\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 4;" +
-	"\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u32le(d + (0 * 4), s0);\n    }\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__xxx(wuffs_base__slice_u8 dst,\n                                      wuffs_base__slice_u8 dst_palette,\n                                      wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len3 = src.len / 3;\n  size_t len = dst_len4 < src_len3 ? dst_len4 : src_len3;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base__store_u32le(d + (0 * 4),\n                            0xFF000000 | wuffs_base__load_u24le(s + (0 * 3)));\n\n    s += 1 * 3;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__y(wuffs_base__slice_u8 dst,\n                                   " +
-	" wuffs_base__slice_u8 dst_palette,\n                                    wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t len = dst_len4 < src.len ? dst_len4 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base__store_u32le(d + (0 * 4),\n                            0xFF000000 | (0x010101 * (uint32_t)s[0]));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
+	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len4 < src_len4 ? dst_len4 : src_len4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t s0 =\n        wuffs_base__premul_u32_axxx(wuffs_base__load_u32le(s + (0 * 4)));\n    wuffs_base__store_u32le(d + (0 * 4), s0);\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len4 < src_len4 ? dst_len4 : src_len4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unrol" +
+	"l.\n\n  while (n >= 1) {\n    uint32_t d0 = wuffs_base__load_u32le(d + (0 * 4));\n    uint32_t s0 = wuffs_base__load_u32le(s + (0 * 4));\n    wuffs_base__store_u32le(\n        d + (0 * 4), wuffs_base__composite_premul_nonpremul_u32_axxx(d0, s0));\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__copy_1_1(wuffs_base__slice_u8 dst,\n                                     wuffs_base__slice_u8 dst_palette,\n                                     wuffs_base__slice_u8 src) {\n  return wuffs_base__slice_u8__copy_from_slice(dst, src);\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__copy_4_4(wuffs_base__slice_u8 dst,\n                                     wuffs_base__slice_u8 dst_palette,\n                                     wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len4 < src_len4 ? dst_len4 : src_len4;\n  if (len > 0) {\n    memmove(dst.ptr, src.ptr, len * 4);\n  }\n  return len;\n}\n\nstatic uint64_t" +
+	"  //\nwuffs_base__pixel_swizzler__xx__index__src(wuffs_base__slice_u8 dst,\n                                           wuffs_base__slice_u8 dst_palette,\n                                           wuffs_base__slice_u8 src) {\n  if (dst_palette.len != 1024) {\n    return 0;\n  }\n  size_t dst_len2 = dst.len / 2;\n  size_t len = dst_len2 < src.len ? dst_len2 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    wuffs_base__store_u16le(\n        d + (0 * 2),\n        wuffs_base__load_u16le(dst_palette.ptr + ((size_t)s[0] * 4)));\n    wuffs_base__store_u16le(\n        d + (1 * 2),\n        wuffs_base__load_u16le(dst_palette.ptr + ((size_t)s[1] * 4)));\n    wuffs_base__store_u16le(\n        d + (2 * 2),\n        wuffs_base__load_u16le(dst_palette.ptr + ((size_t)s[2] * 4)));\n    wuffs_base__store_u16le(\n        d + (3 * 2),\n        wuffs_base__load_u16le(dst_palette.ptr + ((size_t)s[3] * 4)));\n\n    s += loop_unroll_count * 1;\n    " +
+	"d += loop_unroll_count * 2;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    wuffs_base__store_u16le(\n        d + (0 * 2),\n        wuffs_base__load_u16le(dst_palette.ptr + ((size_t)s[0] * 4)));\n\n    s += 1 * 1;\n    d += 1 * 2;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxx__index__src(wuffs_base__slice_u8 dst,\n                                            wuffs_base__slice_u8 dst_palette,\n                                            wuffs_base__slice_u8 src) {\n  if (dst_palette.len != 1024) {\n    return 0;\n  }\n  size_t dst_len3 = dst.len / 3;\n  size_t len = dst_len3 < src.len ? dst_len3 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  // The comparison in the while condition is \">\", not \">=\", because with \">=\",\n  // the last 4-byte store could write past the end of the dst slice.\n  //\n  // Each 4-byte store writes one too many bytes, but a subsequent store will\n  // overwrite that with the correct " +
+	"byte. There is always another store,\n  // whether a 4-byte store in this loop or a 1-byte store in the next loop.\n  while (n > loop_unroll_count) {\n    wuffs_base__store_u32le(\n        d + (0 * 3),\n        wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[0] * 4)));\n    wuffs_base__store_u32le(\n        d + (1 * 3),\n        wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[1] * 4)));\n    wuffs_base__store_u32le(\n        d + (2 * 3),\n        wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[2] * 4)));\n    wuffs_base__store_u32le(\n        d + (3 * 3),\n        wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[3] * 4)));\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 3;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[0] * 4));\n    d[0] = (uint8_t)(s0 >> 0);\n    d[1] = (uint8_t)(s0 >> 8);\n    d[2] = (uint8_t)(s0 >> 16);\n\n    s += 1 * 1;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pi" +
+	"xel_swizzler__xxx__index_binary_alpha__src_over(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  if (dst_palette.len != 1024) {\n    return 0;\n  }\n  size_t dst_len3 = dst.len / 3;\n  size_t len = dst_len3 < src.len ? dst_len3 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    uint32_t s0 = wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u24le(d + (0 * 4), s0);\n    }\n    uint32_t s1 = wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[1] * 4));\n    if (s1) {\n      wuffs_base__store_u24le(d + (1 * 4), s1);\n    }\n    uint32_t s2 = wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[2] * 4));\n    if (s2) {\n      wuffs_base__store_u24le(d + (2 * 4), s2);\n    }\n    uint32_t s3 = wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[3] * 4));\n    if (s3) {\n      wuffs_base__store_u24le(d + (3 * 4), s3);\n    }\n\n " +
+	"   s += loop_unroll_count * 1;\n    d += loop_unroll_count * 3;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u24le(d + (0 * 4), s0);\n    }\n\n    s += 1 * 1;\n    d += 1 * 3;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__index__src(wuffs_base__slice_u8 dst,\n                                             wuffs_base__slice_u8 dst_palette,\n                                             wuffs_base__slice_u8 src) {\n  if (dst_palette.len != 1024) {\n    return 0;\n  }\n  size_t dst_len4 = dst.len / 4;\n  size_t len = dst_len4 < src.len ? dst_len4 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    wuffs_base__store_u32le(\n        d + (0 * 4),\n        wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[0] * 4)));\n    wuffs_base__store_u32le(\n        d + (1 * 4)," +
+	"\n        wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[1] * 4)));\n    wuffs_base__store_u32le(\n        d + (2 * 4),\n        wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[2] * 4)));\n    wuffs_base__store_u32le(\n        d + (3 * 4),\n        wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[3] * 4)));\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 4;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    wuffs_base__store_u32le(\n        d + (0 * 4),\n        wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[0] * 4)));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  if (dst_palette.len != 1024) {\n    return 0;\n  }\n  size_t dst_len4 = dst.len / 4;\n  size_t len = dst_len4 < src.len ? dst_len4 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  const size_t" +
+	" loop_unroll_count = 4;\n\n  while (n >= loop_unroll_count) {\n    uint32_t s0 = wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u32le(d + (0 * 4), s0);\n    }\n    uint32_t s1 = wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[1] * 4));\n    if (s1) {\n      wuffs_base__store_u32le(d + (1 * 4), s1);\n    }\n    uint32_t s2 = wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[2] * 4));\n    if (s2) {\n      wuffs_base__store_u32le(d + (2 * 4), s2);\n    }\n    uint32_t s3 = wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[3] * 4));\n    if (s3) {\n      wuffs_base__store_u32le(d + (3 * 4), s3);\n    }\n\n    s += loop_unroll_count * 1;\n    d += loop_unroll_count * 4;\n    n -= loop_unroll_count;\n  }\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__load_u32le(dst_palette.ptr + ((size_t)s[0] * 4));\n    if (s0) {\n      wuffs_base__store_u32le(d + (0 * 4), s0);\n    }\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_sw" +
+	"izzler__xxxx__xxx(wuffs_base__slice_u8 dst,\n                                      wuffs_base__slice_u8 dst_palette,\n                                      wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len3 = src.len / 3;\n  size_t len = dst_len4 < src_len3 ? dst_len4 : src_len3;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base__store_u32le(d + (0 * 4),\n                            0xFF000000 | wuffs_base__load_u24le(s + (0 * 3)));\n\n    s += 1 * 3;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__xxxx__y(wuffs_base__slice_u8 dst,\n                                    wuffs_base__slice_u8 dst_palette,\n                                    wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t len = dst_len4 < src.len ? dst_len4 : src.len;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    wuffs_base" +
+	"__store_u32le(d + (0 * 4),\n                            0xFF000000 | (0x010101 * (uint32_t)s[0]));\n\n    s += 1 * 1;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\n" +
 	"" +
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__squash_bgr_565_888(wuffs_base__slice_u8 dst,\n                                               wuffs_base__slice_u8 src) {\n  size_t len4 = (dst.len < src.len ? dst.len : src.len) / 4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n\n  size_t n = len4;\n  while (n--) {\n    uint32_t argb = wuffs_base__load_u32le(s);\n    uint32_t b5 = 0x1F & (argb >> (8 - 5));\n    uint32_t g6 = 0x3F & (argb >> (16 - 6));\n    uint32_t r5 = 0x1F & (argb >> (24 - 5));\n    wuffs_base__store_u32le(d, (b5 << 0) | (g6 << 5) | (r5 << 11));\n    s += 4;\n    d += 4;\n  }\n  return len4 * 4;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__swap_rgbx_bgrx(wuffs_base__slice_u8 dst,\n                                           wuffs_base__slice_u8 src) {\n  size_t len4 = (dst.len < src.len ? dst.len : src.len) / 4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n\n  size_t n = len4;\n  while (n--) {\n    uint8_t b0 = s[0];\n    uint8_t b1 = s[1];\n    uint8_t b2 = s[2];\n    uint8_t b3 = s[3];\n  " +
 	"  d[0] = b2;\n    d[1] = b1;\n    d[2] = b0;\n    d[3] = b3;\n    s += 4;\n    d += 4;\n  }\n  return len4 * 4;\n}\n\n" +
@@ -53,10 +56,11 @@
 	"   return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xx__index__src;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      if (wuffs_base__slice_u8__copy_from_slice(dst_palette, src_palette) !=\n          1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxxx__index__sr" +
 	"c;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(dst_palette,\n                                                     src_palette) != 1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n      if (wuffs_base__pixel_swizzler__swap_rgbx_bgrx(dst_palette,\n                                                     src_palette) != 1024) {\n        return NULL;\n      }\n      switch (blend) {\n        case WUFFS_BASE__P" +
 	"IXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__xxxx__index__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__xxxx__index_binary_alpha__src_over;\n      }\n      return NULL;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgr(wuffs_base__pixel_swizzler* p,\n                                         wuffs_base__pixel_format dst_format,\n                                         wuffs_base__slice_u8 dst_palette,\n                                         wuffs_base__slice_u8 src_palette,\n                                         wuffs_base__pixel_blend blend) {\n  switch (dst_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMU" +
-	"L:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      return wuffs_base__pixel_swizzler__xxxx__xxx;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\n" +
+	"L:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      return wuffs_base__pixel_swizzler__xxxx__xxx;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\nstatic wuffs_base__pixel_swizzler__func  //\nwuffs_base__pixel_swizzler__prepare__bgra_nonpremul(\n    wuffs_base__pixel_swizzler* p,\n    wuffs_base__pixel_format dst_format,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src_palette,\n    wuffs_base__pixel_blend blend) {\n  switch (dst_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base" +
+	"__pixel_swizzler__copy_4_4;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      switch (blend) {\n        case WUFFS_BASE__PIXEL_BLEND__SRC:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src;\n        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:\n          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over;\n      }\n      return NULL;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      // TODO.\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n      // TODO.\n      break;\n  }\n  return NULL;\n}\n\n" +
 	"" +
 	"// --------\n\nwuffs_base__status  //\nwuffs_base__pixel_swizzler__prepare(wuffs_base__pixel_swizzler* p,\n                                    wuffs_base__pixel_format dst_format,\n                                    wuffs_base__slice_u8 dst_palette,\n                                    wuffs_base__pixel_format src_format,\n                                    wuffs_base__slice_u8 src_palette,\n                                    wuffs_base__pixel_blend blend) {\n  if (!p) {\n    return wuffs_base__make_status(wuffs_base__error__bad_receiver);\n  }\n\n  // TODO: support many more formats.\n\n  wuffs_base__pixel_swizzler__func func = NULL;\n\n  switch (src_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__Y:\n      func = wuffs_base__pixel_swizzler__prepare__y(p, dst_format, dst_palette,\n                                                    src_palette, blend);\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:\n      func = wuffs_base__pixel_swizzler__prepare__indexed__bgra_binary(\n          p, dst_format, dst_" +
-	"palette, src_palette, blend);\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      func = wuffs_base__pixel_swizzler__prepare__bgr(\n          p, dst_format, dst_palette, src_palette, blend);\n      break;\n  }\n\n  p->private_impl.func = func;\n  return wuffs_base__make_status(\n      func ? NULL : wuffs_base__error__unsupported_pixel_swizzler_option);\n}\n\nuint64_t  //\nwuffs_base__pixel_swizzler__swizzle_interleaved(\n    const wuffs_base__pixel_swizzler* p,\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  if (p && p->private_impl.func) {\n    return (*p->private_impl.func)(dst, dst_palette, src);\n  }\n  return 0;\n}\n" +
+	"palette, src_palette, blend);\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      func = wuffs_base__pixel_swizzler__prepare__bgr(\n          p, dst_format, dst_palette, src_palette, blend);\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      func = wuffs_base__pixel_swizzler__prepare__bgra_nonpremul(\n          p, dst_format, dst_palette, src_palette, blend);\n      break;\n  }\n\n  p->private_impl.func = func;\n  return wuffs_base__make_status(\n      func ? NULL : wuffs_base__error__unsupported_pixel_swizzler_option);\n}\n\nuint64_t  //\nwuffs_base__pixel_swizzler__swizzle_interleaved(\n    const wuffs_base__pixel_swizzler* p,\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  if (p && p->private_impl.func) {\n    return (*p->private_impl.func)(dst, dst_palette, src);\n  }\n  return 0;\n}\n" +
 	""
 
 const baseCorePrivateH = "" +
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 86495e7..2413996 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -3255,6 +3255,10 @@
     uint64_t f_bytes_total;
     wuffs_base__pixel_format f_pixfmt;
     uint32_t f_padding;
+    uint32_t f_mask_r;
+    uint32_t f_mask_g;
+    uint32_t f_mask_b;
+    uint32_t f_mask_a;
     uint64_t f_frame_config_io_position;
     uint32_t f_dst_x;
     uint32_t f_dst_y;
@@ -3273,7 +3277,8 @@
 
   struct {
     struct {
-      uint32_t v_bitmapInfoLen;
+      uint32_t v_bitmap_info_len;
+      uint32_t v_compression;
       uint64_t scratch;
     } s_decode_image_config[1];
     struct {
@@ -6516,6 +6521,36 @@
 }
 
 static inline uint32_t  //
+wuffs_base__composite_premul_nonpremul_u32_axxx(uint32_t dst_premul,
+                                                uint32_t src_nonpremul) {
+  // Convert from 8-bit color to 16-bit color.
+  uint32_t sa = 0x101 * (0xFF & (src_nonpremul >> 24));
+  uint32_t sr = 0x101 * (0xFF & (src_nonpremul >> 16));
+  uint32_t sg = 0x101 * (0xFF & (src_nonpremul >> 8));
+  uint32_t sb = 0x101 * (0xFF & (src_nonpremul >> 0));
+  uint32_t da = 0x101 * (0xFF & (dst_premul >> 24));
+  uint32_t dr = 0x101 * (0xFF & (dst_premul >> 16));
+  uint32_t dg = 0x101 * (0xFF & (dst_premul >> 8));
+  uint32_t db = 0x101 * (0xFF & (dst_premul >> 0));
+
+  // Calculate the inverse of the src-alpha: how much of the dst to keep.
+  uint32_t ia = 0xFFFF - sa;
+
+  // Composite src (nonpremul) over dst (premul).
+  da = sa + ((da * ia) / 0xFFFF);
+  dr = ((sr * sa) + (dr * ia)) / 0xFFFF;
+  dg = ((sg * sa) + (dg * ia)) / 0xFFFF;
+  db = ((sb * sa) + (db * ia)) / 0xFFFF;
+
+  // Convert from 16-bit color to 8-bit color and combine the components.
+  da >>= 8;
+  dr >>= 8;
+  dg >>= 8;
+  db >>= 8;
+  return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);
+}
+
+static inline uint32_t  //
 wuffs_base__premul_u32_axxx(uint32_t nonpremul) {
   // Multiplying by 0x101 (twice, once for alpha and once for color) converts
   // from 8-bit to 16-bit color. Shifting right by 8 undoes that.
@@ -6715,6 +6750,61 @@
 // --------
 
 static uint64_t  //
+wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src(
+    wuffs_base__slice_u8 dst,
+    wuffs_base__slice_u8 dst_palette,
+    wuffs_base__slice_u8 src) {
+  size_t dst_len4 = dst.len / 4;
+  size_t src_len4 = src.len / 4;
+  size_t len = dst_len4 < src_len4 ? dst_len4 : src_len4;
+  uint8_t* d = dst.ptr;
+  uint8_t* s = src.ptr;
+  size_t n = len;
+
+  // TODO: unroll.
+
+  while (n >= 1) {
+    uint32_t s0 =
+        wuffs_base__premul_u32_axxx(wuffs_base__load_u32le(s + (0 * 4)));
+    wuffs_base__store_u32le(d + (0 * 4), s0);
+
+    s += 1 * 4;
+    d += 1 * 4;
+    n -= 1;
+  }
+
+  return len;
+}
+
+static uint64_t  //
+wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over(
+    wuffs_base__slice_u8 dst,
+    wuffs_base__slice_u8 dst_palette,
+    wuffs_base__slice_u8 src) {
+  size_t dst_len4 = dst.len / 4;
+  size_t src_len4 = src.len / 4;
+  size_t len = dst_len4 < src_len4 ? dst_len4 : src_len4;
+  uint8_t* d = dst.ptr;
+  uint8_t* s = src.ptr;
+  size_t n = len;
+
+  // TODO: unroll.
+
+  while (n >= 1) {
+    uint32_t d0 = wuffs_base__load_u32le(d + (0 * 4));
+    uint32_t s0 = wuffs_base__load_u32le(s + (0 * 4));
+    wuffs_base__store_u32le(
+        d + (0 * 4), wuffs_base__composite_premul_nonpremul_u32_axxx(d0, s0));
+
+    s += 1 * 4;
+    d += 1 * 4;
+    n -= 1;
+  }
+
+  return len;
+}
+
+static uint64_t  //
 wuffs_base__pixel_swizzler__copy_1_1(wuffs_base__slice_u8 dst,
                                      wuffs_base__slice_u8 dst_palette,
                                      wuffs_base__slice_u8 src) {
@@ -6722,6 +6812,19 @@
 }
 
 static uint64_t  //
+wuffs_base__pixel_swizzler__copy_4_4(wuffs_base__slice_u8 dst,
+                                     wuffs_base__slice_u8 dst_palette,
+                                     wuffs_base__slice_u8 src) {
+  size_t dst_len4 = dst.len / 4;
+  size_t src_len4 = src.len / 4;
+  size_t len = dst_len4 < src_len4 ? dst_len4 : src_len4;
+  if (len > 0) {
+    memmove(dst.ptr, src.ptr, len * 4);
+  }
+  return len;
+}
+
+static uint64_t  //
 wuffs_base__pixel_swizzler__xx__index__src(wuffs_base__slice_u8 dst,
                                            wuffs_base__slice_u8 dst_palette,
                                            wuffs_base__slice_u8 src) {
@@ -7223,6 +7326,54 @@
   return NULL;
 }
 
+static wuffs_base__pixel_swizzler__func  //
+wuffs_base__pixel_swizzler__prepare__bgra_nonpremul(
+    wuffs_base__pixel_swizzler* p,
+    wuffs_base__pixel_format dst_format,
+    wuffs_base__slice_u8 dst_palette,
+    wuffs_base__slice_u8 src_palette,
+    wuffs_base__pixel_blend blend) {
+  switch (dst_format.repr) {
+    case WUFFS_BASE__PIXEL_FORMAT__BGR_565:
+      // TODO.
+      break;
+
+    case WUFFS_BASE__PIXEL_FORMAT__BGR:
+    case WUFFS_BASE__PIXEL_FORMAT__RGB:
+      // TODO.
+      break;
+
+    case WUFFS_BASE__PIXEL_FORMAT__BGRX:
+    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:
+      switch (blend) {
+        case WUFFS_BASE__PIXEL_BLEND__SRC:
+          return wuffs_base__pixel_swizzler__copy_4_4;
+      }
+      return NULL;
+
+    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:
+      switch (blend) {
+        case WUFFS_BASE__PIXEL_BLEND__SRC:
+          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src;
+        case WUFFS_BASE__PIXEL_BLEND__SRC_OVER:
+          return wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over;
+      }
+      return NULL;
+
+    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:
+      // TODO.
+      break;
+
+    case WUFFS_BASE__PIXEL_FORMAT__RGBX:
+    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:
+    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:
+    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:
+      // TODO.
+      break;
+  }
+  return NULL;
+}
+
 // --------
 
 wuffs_base__status  //
@@ -7255,6 +7406,11 @@
       func = wuffs_base__pixel_swizzler__prepare__bgr(
           p, dst_format, dst_palette, src_palette, blend);
       break;
+
+    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:
+      func = wuffs_base__pixel_swizzler__prepare__bgra_nonpremul(
+          p, dst_format, dst_palette, src_palette, blend);
+      break;
   }
 
   p->private_impl.func = func;
@@ -7587,7 +7743,7 @@
   wuffs_base__status status = wuffs_base__make_status(NULL);
 
   uint32_t v_magic = 0;
-  uint32_t v_bitmapInfoLen = 0;
+  uint32_t v_bitmap_info_len = 0;
   uint32_t v_width = 0;
   uint32_t v_height = 0;
   uint32_t v_planes = 0;
@@ -7607,8 +7763,9 @@
 
   uint32_t coro_susp_point = self->private_impl.p_decode_image_config[0];
   if (coro_susp_point) {
-    v_bitmapInfoLen =
-        self->private_data.s_decode_image_config[0].v_bitmapInfoLen;
+    v_bitmap_info_len =
+        self->private_data.s_decode_image_config[0].v_bitmap_info_len;
+    v_compression = self->private_data.s_decode_image_config[0].v_compression;
   }
   switch (coro_susp_point) {
     WUFFS_BASE__COROUTINE_SUSPENSION_POINT_0;
@@ -7728,17 +7885,18 @@
           *scratch |= ((uint64_t)(num_bits_2)) << 56;
         }
       }
-      v_bitmapInfoLen = t_2;
+      v_bitmap_info_len = t_2;
     }
-    if (v_bitmapInfoLen != 124) {
+    if ((v_bitmap_info_len != 40) && (v_bitmap_info_len != 108) &&
+        (v_bitmap_info_len != 124)) {
       status = wuffs_base__make_status(wuffs_bmp__error__unsupported_bmp_file);
       goto exit;
     }
-    if (self->private_impl.f_padding < v_bitmapInfoLen) {
+    if (self->private_impl.f_padding < v_bitmap_info_len) {
       status = wuffs_base__make_status(wuffs_bmp__error__bad_header);
       goto exit;
     }
-    self->private_impl.f_padding -= v_bitmapInfoLen;
+    self->private_impl.f_padding -= v_bitmap_info_len;
     {
       WUFFS_BASE__COROUTINE_SUSPENSION_POINT(8);
       uint32_t t_3;
@@ -7810,8 +7968,7 @@
       status = wuffs_base__make_status(wuffs_bmp__error__bad_header);
       goto exit;
     } else if (v_height >= 2147483648) {
-      self->private_impl.f_height =
-          ((0 - self->private_impl.f_height) & 2147483647);
+      self->private_impl.f_height = ((0 - v_height) & 2147483647);
       self->private_impl.f_top_down = true;
     } else {
       self->private_impl.f_height = v_height;
@@ -7889,6 +8046,13 @@
       self->private_impl.f_pad_per_row = (self->private_impl.f_width & 3);
       self->private_impl.f_pixfmt =
           wuffs_base__utility__make_pixel_format(1073744008);
+    } else if (v_bits_per_pixel == 32) {
+      self->private_impl.f_bits_per_pixel = 32;
+      self->private_impl.f_bytes_per_row =
+          (((uint64_t)(self->private_impl.f_width)) * 4);
+      self->private_impl.f_pad_per_row = 0;
+      self->private_impl.f_pixfmt =
+          wuffs_base__utility__make_pixel_format(1157662856);
     } else {
       status = wuffs_base__make_status(wuffs_bmp__error__unsupported_bmp_file);
       goto exit;
@@ -7927,12 +8091,7 @@
       }
       v_compression = t_7;
     }
-    if (v_compression != 0) {
-      status = wuffs_base__make_status(wuffs_bmp__error__unsupported_bmp_file);
-      goto exit;
-    }
-    self->private_data.s_decode_image_config[0].scratch =
-        (v_bitmapInfoLen - 20);
+    self->private_data.s_decode_image_config[0].scratch = 20;
     WUFFS_BASE__COROUTINE_SUSPENSION_POINT(18);
     if (self->private_data.s_decode_image_config[0].scratch >
         ((uint64_t)(io2_a_src - iop_a_src))) {
@@ -7943,6 +8102,154 @@
       goto suspend;
     }
     iop_a_src += self->private_data.s_decode_image_config[0].scratch;
+    if (v_bitmap_info_len >= 108) {
+      {
+        WUFFS_BASE__COROUTINE_SUSPENSION_POINT(19);
+        uint32_t t_8;
+        if (WUFFS_BASE__LIKELY(io2_a_src - iop_a_src >= 4)) {
+          t_8 = wuffs_base__load_u32le(iop_a_src);
+          iop_a_src += 4;
+        } else {
+          self->private_data.s_decode_image_config[0].scratch = 0;
+          WUFFS_BASE__COROUTINE_SUSPENSION_POINT(20);
+          while (true) {
+            if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) {
+              status =
+                  wuffs_base__make_status(wuffs_base__suspension__short_read);
+              goto suspend;
+            }
+            uint64_t* scratch =
+                &self->private_data.s_decode_image_config[0].scratch;
+            uint32_t num_bits_8 = ((uint32_t)(*scratch >> 56));
+            *scratch <<= 8;
+            *scratch >>= 8;
+            *scratch |= ((uint64_t)(*iop_a_src++)) << num_bits_8;
+            if (num_bits_8 == 24) {
+              t_8 = ((uint32_t)(*scratch));
+              break;
+            }
+            num_bits_8 += 8;
+            *scratch |= ((uint64_t)(num_bits_8)) << 56;
+          }
+        }
+        self->private_impl.f_mask_r = t_8;
+      }
+      {
+        WUFFS_BASE__COROUTINE_SUSPENSION_POINT(21);
+        uint32_t t_9;
+        if (WUFFS_BASE__LIKELY(io2_a_src - iop_a_src >= 4)) {
+          t_9 = wuffs_base__load_u32le(iop_a_src);
+          iop_a_src += 4;
+        } else {
+          self->private_data.s_decode_image_config[0].scratch = 0;
+          WUFFS_BASE__COROUTINE_SUSPENSION_POINT(22);
+          while (true) {
+            if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) {
+              status =
+                  wuffs_base__make_status(wuffs_base__suspension__short_read);
+              goto suspend;
+            }
+            uint64_t* scratch =
+                &self->private_data.s_decode_image_config[0].scratch;
+            uint32_t num_bits_9 = ((uint32_t)(*scratch >> 56));
+            *scratch <<= 8;
+            *scratch >>= 8;
+            *scratch |= ((uint64_t)(*iop_a_src++)) << num_bits_9;
+            if (num_bits_9 == 24) {
+              t_9 = ((uint32_t)(*scratch));
+              break;
+            }
+            num_bits_9 += 8;
+            *scratch |= ((uint64_t)(num_bits_9)) << 56;
+          }
+        }
+        self->private_impl.f_mask_g = t_9;
+      }
+      {
+        WUFFS_BASE__COROUTINE_SUSPENSION_POINT(23);
+        uint32_t t_10;
+        if (WUFFS_BASE__LIKELY(io2_a_src - iop_a_src >= 4)) {
+          t_10 = wuffs_base__load_u32le(iop_a_src);
+          iop_a_src += 4;
+        } else {
+          self->private_data.s_decode_image_config[0].scratch = 0;
+          WUFFS_BASE__COROUTINE_SUSPENSION_POINT(24);
+          while (true) {
+            if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) {
+              status =
+                  wuffs_base__make_status(wuffs_base__suspension__short_read);
+              goto suspend;
+            }
+            uint64_t* scratch =
+                &self->private_data.s_decode_image_config[0].scratch;
+            uint32_t num_bits_10 = ((uint32_t)(*scratch >> 56));
+            *scratch <<= 8;
+            *scratch >>= 8;
+            *scratch |= ((uint64_t)(*iop_a_src++)) << num_bits_10;
+            if (num_bits_10 == 24) {
+              t_10 = ((uint32_t)(*scratch));
+              break;
+            }
+            num_bits_10 += 8;
+            *scratch |= ((uint64_t)(num_bits_10)) << 56;
+          }
+        }
+        self->private_impl.f_mask_b = t_10;
+      }
+      {
+        WUFFS_BASE__COROUTINE_SUSPENSION_POINT(25);
+        uint32_t t_11;
+        if (WUFFS_BASE__LIKELY(io2_a_src - iop_a_src >= 4)) {
+          t_11 = wuffs_base__load_u32le(iop_a_src);
+          iop_a_src += 4;
+        } else {
+          self->private_data.s_decode_image_config[0].scratch = 0;
+          WUFFS_BASE__COROUTINE_SUSPENSION_POINT(26);
+          while (true) {
+            if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) {
+              status =
+                  wuffs_base__make_status(wuffs_base__suspension__short_read);
+              goto suspend;
+            }
+            uint64_t* scratch =
+                &self->private_data.s_decode_image_config[0].scratch;
+            uint32_t num_bits_11 = ((uint32_t)(*scratch >> 56));
+            *scratch <<= 8;
+            *scratch >>= 8;
+            *scratch |= ((uint64_t)(*iop_a_src++)) << num_bits_11;
+            if (num_bits_11 == 24) {
+              t_11 = ((uint32_t)(*scratch));
+              break;
+            }
+            num_bits_11 += 8;
+            *scratch |= ((uint64_t)(num_bits_11)) << 56;
+          }
+        }
+        self->private_impl.f_mask_a = t_11;
+      }
+      if ((v_compression == 3) && (self->private_impl.f_mask_r == 16711680) &&
+          (self->private_impl.f_mask_g == 65280) &&
+          (self->private_impl.f_mask_b == 255) &&
+          (self->private_impl.f_mask_a == 4278190080)) {
+        v_compression = 0;
+      }
+      self->private_data.s_decode_image_config[0].scratch =
+          (v_bitmap_info_len - 56);
+      WUFFS_BASE__COROUTINE_SUSPENSION_POINT(27);
+      if (self->private_data.s_decode_image_config[0].scratch >
+          ((uint64_t)(io2_a_src - iop_a_src))) {
+        self->private_data.s_decode_image_config[0].scratch -=
+            ((uint64_t)(io2_a_src - iop_a_src));
+        iop_a_src = io2_a_src;
+        status = wuffs_base__make_status(wuffs_base__suspension__short_read);
+        goto suspend;
+      }
+      iop_a_src += self->private_data.s_decode_image_config[0].scratch;
+    }
+    if (v_compression != 0) {
+      status = wuffs_base__make_status(wuffs_bmp__error__unsupported_bmp_file);
+      goto exit;
+    }
     self->private_impl.f_frame_config_io_position = wuffs_base__u64__sat_add(
         a_src->meta.pos, ((uint64_t)(iop_a_src - io0_a_src)));
     if (a_dst != NULL) {
@@ -7965,7 +8272,9 @@
       wuffs_base__status__is_suspension(&status) ? coro_susp_point : 0;
   self->private_impl.active_coroutine =
       wuffs_base__status__is_suspension(&status) ? 1 : 0;
-  self->private_data.s_decode_image_config[0].v_bitmapInfoLen = v_bitmapInfoLen;
+  self->private_data.s_decode_image_config[0].v_bitmap_info_len =
+      v_bitmap_info_len;
+  self->private_data.s_decode_image_config[0].v_compression = v_compression;
 
   goto exit;
 exit:
diff --git a/std/bmp/decode_bmp.wuffs b/std/bmp/decode_bmp.wuffs
index 96f0ad5..032f798 100644
--- a/std/bmp/decode_bmp.wuffs
+++ b/std/bmp/decode_bmp.wuffs
@@ -34,6 +34,11 @@
 
 	padding : base.u32,
 
+	mask_r : base.u32,
+	mask_g : base.u32,
+	mask_b : base.u32,
+	mask_a : base.u32,
+
 	frame_config_io_position : base.u64,
 
 	dst_x     : base.u32,
@@ -50,13 +55,13 @@
 )
 
 pub func decoder.decode_image_config?(dst: nptr base.image_config, src: base.io_reader) {
-	var magic          : base.u32
-	var bitmapInfoLen  : base.u32
-	var width          : base.u32
-	var height         : base.u32
-	var planes         : base.u32
-	var bits_per_pixel : base.u32
-	var compression    : base.u32
+	var magic           : base.u32
+	var bitmap_info_len : base.u32
+	var width           : base.u32
+	var height          : base.u32
+	var planes          : base.u32
+	var bits_per_pixel  : base.u32
+	var compression     : base.u32
 
 	if this.call_sequence <> 0 {
 		return base."#bad call sequence"
@@ -77,18 +82,16 @@
 	}
 	this.padding -= 14
 
-	// Read the BITMAPINFOHEADER (version 5 is 124 bytes).
-	//
-	// TODO: support other versions.
+	// Read the BITMAPINFOHEADER (version 3 / 4 / 5 is 40 / 108 / 124 bytes).
 
-	bitmapInfoLen = args.src.read_u32le?()
-	if bitmapInfoLen <> 124 {
+	bitmap_info_len = args.src.read_u32le?()
+	if (bitmap_info_len <> 40) and (bitmap_info_len <> 108) and (bitmap_info_len <> 124) {
 		return "#unsupported BMP file"
 	}
-	if this.padding < bitmapInfoLen {
+	if this.padding < bitmap_info_len {
 		return "#bad header"
 	}
-	this.padding -= bitmapInfoLen
+	this.padding -= bitmap_info_len
 
 	width = args.src.read_u32le?()
 	if width >= 0x8000_0000 {
@@ -102,7 +105,7 @@
 	} else if height >= 0x8000_0000 {
 		// The &0x7FFF_FFFF is redundant, but proves to the compiler that the
 		// result is within this.height's refined bounds.
-		this.height = (0 ~mod- this.height) & 0x7FFF_FFFF
+		this.height = (0 ~mod- height) & 0x7FFF_FFFF
 		this.top_down = true
 	} else {
 		this.height = height
@@ -123,22 +126,52 @@
 		// TODO: a Wuffs (not just C) name for the
 		// WUFFS_BASE__PIXEL_FORMAT__BGR magic pixfmt constant.
 		this.pixfmt = this.util.make_pixel_format(repr: 0x4000_0888)
+	} else if bits_per_pixel == 32 {
+		this.bits_per_pixel = 32
+		this.bytes_per_row = (this.width as base.u64) * 4
+		this.pad_per_row = 0
+		// TODO: a Wuffs (not just C) name for the
+		// WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL magic pixfmt constant.
+		this.pixfmt = this.util.make_pixel_format(repr: 0x4500_8888)
 	} else {
-		// TODO: support 8bpp, 32bpp, possibly more.
+		// TODO: support other bits_per_pixel's.
 		return "#unsupported BMP file"
 	}
 	this.bytes_total = this.bytes_per_row * (this.height as base.u64)
 
 	compression = args.src.read_u32le?()
+
+	// We've already read 20 bytes from the BITMAPINFOHEADER: size (4), width
+	// (4), height (4), planes (2), bpp (2), compression (4). Skip the rest of
+	// the version 3 BITMAPINFOHEADER (whose total size is 40).
+	args.src.skip32?(n: 40 - 20)
+
+	if bitmap_info_len >= 108 {
+		this.mask_r = args.src.read_u32le?()
+		this.mask_g = args.src.read_u32le?()
+		this.mask_b = args.src.read_u32le?()
+		this.mask_a = args.src.read_u32le?()
+
+		// If compression is 3 (BITFIELDS) but the explicit masks are what the
+		// implicit masks are for no compression, treat it as no compression.
+		if (compression == 3) and
+			(this.mask_r == 0x00FF_0000) and
+			(this.mask_g == 0x0000_FF00) and
+			(this.mask_b == 0x0000_00FF) and
+			(this.mask_a == 0xFF00_0000) {
+			compression = 0
+		}
+
+		// Skip the rest of the BITMAPINFOHEADER. We've already read (40 + (4 *
+		// 4)) bytes.
+		args.src.skip32?(n: bitmap_info_len - 56)
+	}
+
 	if compression <> 0 {
 		// TODO: support compression.
 		return "#unsupported BMP file"
 	}
 
-	// We've already read 20 bytes from the BITMAPINFOHEADER: size (4), width
-	// (4), height (4), planes (2), bpp (2), compression (4). Skip the rest.
-	args.src.skip32?(n: bitmapInfoLen - 20)
-
 	this.frame_config_io_position = args.src.position()
 
 	if args.dst <> nullptr {