Move premul/non-premul conversions to image-public
diff --git a/internal/cgen/base/image-impl.c b/internal/cgen/base/image-impl.c
index 6f552eb..a4dfb38 100644
--- a/internal/cgen/base/image-impl.c
+++ b/internal/cgen/base/image-impl.c
@@ -59,50 +59,6 @@
   return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);
 }
 
-static inline uint32_t  //
-wuffs_base__premul_u32_axxx(uint32_t nonpremul) {
-  // Multiplying by 0x101 (twice, once for alpha and once for color) converts
-  // from 8-bit to 16-bit color. Shifting right by 8 undoes that.
-  //
-  // Working in the higher bit depth can produce slightly different (and
-  // arguably slightly more accurate) results. For example, given 8-bit blue
-  // and alpha of 0x80 and 0x81:
-  //
-  //  - ((0x80   * 0x81  ) / 0xFF  )      = 0x40        = 0x40
-  //  - ((0x8080 * 0x8181) / 0xFFFF) >> 8 = 0x4101 >> 8 = 0x41
-  uint32_t a = 0xFF & (nonpremul >> 24);
-  uint32_t a16 = a * (0x101 * 0x101);
-
-  uint32_t r = 0xFF & (nonpremul >> 16);
-  r = ((r * a16) / 0xFFFF) >> 8;
-  uint32_t g = 0xFF & (nonpremul >> 8);
-  g = ((g * a16) / 0xFFFF) >> 8;
-  uint32_t b = 0xFF & (nonpremul >> 0);
-  b = ((b * a16) / 0xFFFF) >> 8;
-
-  return (a << 24) | (r << 16) | (g << 8) | (b << 0);
-}
-
-static inline uint32_t  //
-wuffs_base__nonpremul_u32_axxx(uint32_t premul) {
-  uint32_t a = 0xFF & (premul >> 24);
-  if (a == 0xFF) {
-    return premul;
-  } else if (a == 0) {
-    return 0;
-  }
-  uint32_t a16 = a * 0x101;
-
-  uint32_t r = 0xFF & (premul >> 16);
-  r = ((r * (0x101 * 0xFFFF)) / a16) >> 8;
-  uint32_t g = 0xFF & (premul >> 8);
-  g = ((g * (0x101 * 0xFFFF)) / a16) >> 8;
-  uint32_t b = 0xFF & (premul >> 0);
-  b = ((b * (0x101 * 0xFFFF)) / a16) >> 8;
-
-  return (a << 24) | (r << 16) | (g << 8) | (b << 0);
-}
-
 wuffs_base__color_u32_argb_premul  //
 wuffs_base__pixel_buffer__color_u32_at(const wuffs_base__pixel_buffer* pb,
                                        uint32_t x,
diff --git a/internal/cgen/base/image-public.h b/internal/cgen/base/image-public.h
index dc4008d..754320b 100644
--- a/internal/cgen/base/image-public.h
+++ b/internal/cgen/base/image-public.h
@@ -17,10 +17,58 @@
 // ---------------- Images
 
 // wuffs_base__color_u32_argb_premul is an 8 bit per channel premultiplied
-// Alpha, Red, Green, Blue color, as a uint32_t value. It is in word order, not
-// byte order: its value is always 0xAARRGGBB, regardless of endianness.
+// Alpha, Red, Green, Blue color, as a uint32_t value. Its value is always
+// 0xAARRGGBB (Alpha most significant, Blue least), regardless of endianness.
 typedef uint32_t wuffs_base__color_u32_argb_premul;
 
+// wuffs_base__premul_u32_axxx converts from non-premultiplied alpha to
+// premultiplied alpha. The "axxx" means either "argb" or "abgr".
+static inline uint32_t  //
+wuffs_base__premul_u32_axxx(uint32_t nonpremul) {
+  // Multiplying by 0x101 (twice, once for alpha and once for color) converts
+  // from 8-bit to 16-bit color. Shifting right by 8 undoes that.
+  //
+  // Working in the higher bit depth can produce slightly different (and
+  // arguably slightly more accurate) results. For example, given 8-bit blue
+  // and alpha of 0x80 and 0x81:
+  //
+  //  - ((0x80   * 0x81  ) / 0xFF  )      = 0x40        = 0x40
+  //  - ((0x8080 * 0x8181) / 0xFFFF) >> 8 = 0x4101 >> 8 = 0x41
+  uint32_t a = 0xFF & (nonpremul >> 24);
+  uint32_t a16 = a * (0x101 * 0x101);
+
+  uint32_t r = 0xFF & (nonpremul >> 16);
+  r = ((r * a16) / 0xFFFF) >> 8;
+  uint32_t g = 0xFF & (nonpremul >> 8);
+  g = ((g * a16) / 0xFFFF) >> 8;
+  uint32_t b = 0xFF & (nonpremul >> 0);
+  b = ((b * a16) / 0xFFFF) >> 8;
+
+  return (a << 24) | (r << 16) | (g << 8) | (b << 0);
+}
+
+// wuffs_base__nonpremul_u32_axxx converts from premultiplied alpha to
+// non-premultiplied alpha. The "axxx" means either "argb" or "abgr".
+static inline uint32_t  //
+wuffs_base__nonpremul_u32_axxx(uint32_t premul) {
+  uint32_t a = 0xFF & (premul >> 24);
+  if (a == 0xFF) {
+    return premul;
+  } else if (a == 0) {
+    return 0;
+  }
+  uint32_t a16 = a * 0x101;
+
+  uint32_t r = 0xFF & (premul >> 16);
+  r = ((r * (0x101 * 0xFFFF)) / a16) >> 8;
+  uint32_t g = 0xFF & (premul >> 8);
+  g = ((g * (0x101 * 0xFFFF)) / a16) >> 8;
+  uint32_t b = 0xFF & (premul >> 0);
+  b = ((b * (0x101 * 0xFFFF)) / a16) >> 8;
+
+  return (a << 24) | (r << 16) | (g << 8) | (b << 0);
+}
+
 // --------
 
 typedef uint8_t wuffs_base__pixel_blend;
diff --git a/internal/cgen/data.go b/internal/cgen/data.go
index 6795243..f5d45d7 100644
--- a/internal/cgen/data.go
+++ b/internal/cgen/data.go
@@ -28,14 +28,13 @@
 
 const baseImageImplC = "" +
 	"// ---------------- Images\n\nconst uint32_t wuffs_base__pixel_format__bits_per_channel[16] = {\n    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,\n    0x08, 0x0A, 0x0C, 0x10, 0x18, 0x20, 0x30, 0x40,\n};\n\nstatic inline uint32_t  //\nwuffs_base__swap_u32_argb_abgr(uint32_t u) {\n  uint32_t o = u & 0xFF00FF00;\n  uint32_t r = u & 0x00FF0000;\n  uint32_t b = u & 0x000000FF;\n  return o | (r >> 16) | (b << 16);\n}\n\nstatic inline uint32_t  //\nwuffs_base__composite_premul_nonpremul_u32_axxx(uint32_t dst_premul,\n                                                uint32_t src_nonpremul) {\n  // Convert from 8-bit color to 16-bit color.\n  uint32_t sa = 0x101 * (0xFF & (src_nonpremul >> 24));\n  uint32_t sr = 0x101 * (0xFF & (src_nonpremul >> 16));\n  uint32_t sg = 0x101 * (0xFF & (src_nonpremul >> 8));\n  uint32_t sb = 0x101 * (0xFF & (src_nonpremul >> 0));\n  uint32_t da = 0x101 * (0xFF & (dst_premul >> 24));\n  uint32_t dr = 0x101 * (0xFF & (dst_premul >> 16));\n  uint32_t dg = 0x101 * (0xFF & (dst_premul >> 8));\n  uint32_t db = 0x101" +
-	" * (0xFF & (dst_premul >> 0));\n\n  // Calculate the inverse of the src-alpha: how much of the dst to keep.\n  uint32_t ia = 0xFFFF - sa;\n\n  // Composite src (nonpremul) over dst (premul).\n  da = sa + ((da * ia) / 0xFFFF);\n  dr = ((sr * sa) + (dr * ia)) / 0xFFFF;\n  dg = ((sg * sa) + (dg * ia)) / 0xFFFF;\n  db = ((sb * sa) + (db * ia)) / 0xFFFF;\n\n  // Convert from 16-bit color to 8-bit color and combine the components.\n  da >>= 8;\n  dr >>= 8;\n  dg >>= 8;\n  db >>= 8;\n  return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);\n}\n\nstatic inline uint32_t  //\nwuffs_base__premul_u32_axxx(uint32_t nonpremul) {\n  // Multiplying by 0x101 (twice, once for alpha and once for color) converts\n  // from 8-bit to 16-bit color. Shifting right by 8 undoes that.\n  //\n  // Working in the higher bit depth can produce slightly different (and\n  // arguably slightly more accurate) results. For example, given 8-bit blue\n  // and alpha of 0x80 and 0x81:\n  //\n  //  - ((0x80   * 0x81  ) / 0xFF  )      = 0x40        = 0x40\n  //  - ((0x8080 * 0" +
-	"x8181) / 0xFFFF) >> 8 = 0x4101 >> 8 = 0x41\n  uint32_t a = 0xFF & (nonpremul >> 24);\n  uint32_t a16 = a * (0x101 * 0x101);\n\n  uint32_t r = 0xFF & (nonpremul >> 16);\n  r = ((r * a16) / 0xFFFF) >> 8;\n  uint32_t g = 0xFF & (nonpremul >> 8);\n  g = ((g * a16) / 0xFFFF) >> 8;\n  uint32_t b = 0xFF & (nonpremul >> 0);\n  b = ((b * a16) / 0xFFFF) >> 8;\n\n  return (a << 24) | (r << 16) | (g << 8) | (b << 0);\n}\n\nstatic inline uint32_t  //\nwuffs_base__nonpremul_u32_axxx(uint32_t premul) {\n  uint32_t a = 0xFF & (premul >> 24);\n  if (a == 0xFF) {\n    return premul;\n  } else if (a == 0) {\n    return 0;\n  }\n  uint32_t a16 = a * 0x101;\n\n  uint32_t r = 0xFF & (premul >> 16);\n  r = ((r * (0x101 * 0xFFFF)) / a16) >> 8;\n  uint32_t g = 0xFF & (premul >> 8);\n  g = ((g * (0x101 * 0xFFFF)) / a16) >> 8;\n  uint32_t b = 0xFF & (premul >> 0);\n  b = ((b * (0x101 * 0xFFFF)) / a16) >> 8;\n\n  return (a << 24) | (r << 16) | (g << 8) | (b << 0);\n}\n\nwuffs_base__color_u32_argb_premul  //\nwuffs_base__pixel_buffer__color_u32_at(const wuffs_base__pixel_" +
-	"buffer* pb,\n                                       uint32_t x,\n                                       uint32_t y) {\n  if (!pb || (x >= pb->pixcfg.private_impl.width) ||\n      (y >= pb->pixcfg.private_impl.height)) {\n    return 0;\n  }\n\n  if (wuffs_base__pixel_format__is_planar(&pb->pixcfg.private_impl.pixfmt)) {\n    // TODO: support planar formats.\n    return 0;\n  }\n\n  size_t stride = pb->private_impl.planes[0].stride;\n  uint8_t* row = pb->private_impl.planes[0].ptr + (stride * ((size_t)y));\n\n  switch (pb->pixcfg.private_impl.pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      WUFFS_BASE__FALLTHROUGH;\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      return wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x)));\n\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_PREMUL:\n      WUFFS_BASE__FALLTHROUGH;\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY: {\n      uint8_t* palette = pb->private_impl.planes[3].ptr;\n      return wuffs_base__load_u32le__no_bounds_check(palette +\n    " +
-	"                                                 (4 * ((size_t)row[x])));\n    }\n\n      // Common formats above. Rarer formats below.\n\n    case WUFFS_BASE__PIXEL_FORMAT__Y:\n      return 0xFF000000 | (0x00010101 * ((uint32_t)(row[x])));\n\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL: {\n      uint8_t* palette = pb->private_impl.planes[3].ptr;\n      return wuffs_base__premul_u32_axxx(\n          wuffs_base__load_u32le__no_bounds_check(palette +\n                                                  (4 * ((size_t)row[x]))));\n    }\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565: {\n      uint16_t bgr =\n          wuffs_base__load_u16le__no_bounds_check(row + (2 * ((size_t)x)));\n      uint32_t b5 = 0x1F & (bgr >> 0);\n      uint32_t b = (b5 << 3) | (b5 >> 2);\n      uint32_t g6 = 0x3F & (bgr >> 5);\n      uint32_t g = (g6 << 2) | (g6 >> 4);\n      uint32_t r5 = 0x1F & (bgr >> 11);\n      uint32_t r = (r5 << 3) | (r5 >> 2);\n      return 0xFF000000 | (r << 16) | (g << 8) | (b << 0);\n    }\n    case WUFFS_BASE__PIXEL_FORMA" +
-	"T__BGR:\n      return 0xFF000000 |\n             wuffs_base__load_u24le__no_bounds_check(row + (3 * ((size_t)x)));\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      return 0xFF000000 |\n             wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x)));\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      return wuffs_base__premul_u32_axxx(\n          wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x))));\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      return wuffs_base__swap_u32_argb_abgr(\n          0xFF000000 |\n          wuffs_base__load_u24le__no_bounds_check(row + (3 * ((size_t)x))));\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      return wuffs_base__swap_u32_argb_abgr(\n          0xFF000000 |\n          wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x))));\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n      return wuffs_base__swap_u32_argb_abgr(wuffs_base__premul_u32_axxx(\n          wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x)))));\n    case WUFFS_BASE__" +
-	"PIXEL_FORMAT__RGBA_PREMUL:\n      WUFFS_BASE__FALLTHROUGH;\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n      return wuffs_base__swap_u32_argb_abgr(\n          wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x))));\n\n    default:\n      // TODO: support more formats.\n      break;\n  }\n\n  return 0;\n}\n\nwuffs_base__status  //\nwuffs_base__pixel_buffer__set_color_u32_at(\n    wuffs_base__pixel_buffer* pb,\n    uint32_t x,\n    uint32_t y,\n    wuffs_base__color_u32_argb_premul color) {\n  if (!pb) {\n    return wuffs_base__make_status(wuffs_base__error__bad_receiver);\n  }\n  if ((x >= pb->pixcfg.private_impl.width) ||\n      (y >= pb->pixcfg.private_impl.height)) {\n    return wuffs_base__make_status(wuffs_base__error__bad_argument);\n  }\n\n  if (wuffs_base__pixel_format__is_planar(&pb->pixcfg.private_impl.pixfmt)) {\n    // TODO: support planar formats.\n    return wuffs_base__make_status(wuffs_base__error__unsupported_option);\n  }\n\n  size_t stride = pb->private_impl.planes[0].stride;\n  uint8_t* row = pb->private_im" +
-	"pl.planes[0].ptr + (stride * ((size_t)y));\n\n  switch (pb->pixcfg.private_impl.pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      WUFFS_BASE__FALLTHROUGH;\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      wuffs_base__store_u32le__no_bounds_check(row + (4 * ((size_t)x)), color);\n      break;\n\n      // Common formats above. Rarer formats below.\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565: {\n      uint32_t b5 = 0x1F & (color >> (8 - 5));\n      uint32_t g6 = 0x3F & (color >> (16 - 6));\n      uint32_t r5 = 0x1F & (color >> (24 - 5));\n      uint32_t bgr565 = (b5 << 0) | (g6 << 5) | (r5 << 11);\n      wuffs_base__store_u16le__no_bounds_check(row + (2 * ((size_t)x)),\n                                               (uint16_t)bgr565);\n      break;\n    }\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      wuffs_base__store_u24le__no_bounds_check(row + (3 * ((size_t)x)), color);\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      wuffs_base__store_u32le__no_bounds_check(\n          row + (4 * ((siz" +
-	"e_t)x)), wuffs_base__nonpremul_u32_axxx(color));\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      wuffs_base__store_u24le__no_bounds_check(\n          row + (3 * ((size_t)x)), wuffs_base__swap_u32_argb_abgr(color));\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n      wuffs_base__store_u32le__no_bounds_check(\n          row + (4 * ((size_t)x)), wuffs_base__nonpremul_u32_axxx(\n                                       wuffs_base__swap_u32_argb_abgr(color)));\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n      WUFFS_BASE__FALLTHROUGH;\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      wuffs_base__store_u32le__no_bounds_check(\n          row + (4 * ((size_t)x)), wuffs_base__swap_u32_argb_abgr(color));\n      break;\n\n    default:\n      // TODO: support more formats.\n      return wuffs_base__make_status(wuffs_base__error__unsupported_option);\n  }\n\n  return wuffs_base__make_status(NULL);\n}\n\n" +
+	" * (0xFF & (dst_premul >> 0));\n\n  // Calculate the inverse of the src-alpha: how much of the dst to keep.\n  uint32_t ia = 0xFFFF - sa;\n\n  // Composite src (nonpremul) over dst (premul).\n  da = sa + ((da * ia) / 0xFFFF);\n  dr = ((sr * sa) + (dr * ia)) / 0xFFFF;\n  dg = ((sg * sa) + (dg * ia)) / 0xFFFF;\n  db = ((sb * sa) + (db * ia)) / 0xFFFF;\n\n  // Convert from 16-bit color to 8-bit color and combine the components.\n  da >>= 8;\n  dr >>= 8;\n  dg >>= 8;\n  db >>= 8;\n  return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);\n}\n\nwuffs_base__color_u32_argb_premul  //\nwuffs_base__pixel_buffer__color_u32_at(const wuffs_base__pixel_buffer* pb,\n                                       uint32_t x,\n                                       uint32_t y) {\n  if (!pb || (x >= pb->pixcfg.private_impl.width) ||\n      (y >= pb->pixcfg.private_impl.height)) {\n    return 0;\n  }\n\n  if (wuffs_base__pixel_format__is_planar(&pb->pixcfg.private_impl.pixfmt)) {\n    // TODO: support planar formats.\n    return 0;\n  }\n\n  size_t stride = pb->priva" +
+	"te_impl.planes[0].stride;\n  uint8_t* row = pb->private_impl.planes[0].ptr + (stride * ((size_t)y));\n\n  switch (pb->pixcfg.private_impl.pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      WUFFS_BASE__FALLTHROUGH;\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      return wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x)));\n\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_PREMUL:\n      WUFFS_BASE__FALLTHROUGH;\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY: {\n      uint8_t* palette = pb->private_impl.planes[3].ptr;\n      return wuffs_base__load_u32le__no_bounds_check(palette +\n                                                     (4 * ((size_t)row[x])));\n    }\n\n      // Common formats above. Rarer formats below.\n\n    case WUFFS_BASE__PIXEL_FORMAT__Y:\n      return 0xFF000000 | (0x00010101 * ((uint32_t)(row[x])));\n\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL: {\n      uint8_t* palette = pb->private_impl.planes[3].ptr;\n      return wuffs_base__premul_u32_axxx(" +
+	"\n          wuffs_base__load_u32le__no_bounds_check(palette +\n                                                  (4 * ((size_t)row[x]))));\n    }\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565: {\n      uint16_t bgr =\n          wuffs_base__load_u16le__no_bounds_check(row + (2 * ((size_t)x)));\n      uint32_t b5 = 0x1F & (bgr >> 0);\n      uint32_t b = (b5 << 3) | (b5 >> 2);\n      uint32_t g6 = 0x3F & (bgr >> 5);\n      uint32_t g = (g6 << 2) | (g6 >> 4);\n      uint32_t r5 = 0x1F & (bgr >> 11);\n      uint32_t r = (r5 << 3) | (r5 >> 2);\n      return 0xFF000000 | (r << 16) | (g << 8) | (b << 0);\n    }\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      return 0xFF000000 |\n             wuffs_base__load_u24le__no_bounds_check(row + (3 * ((size_t)x)));\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      return 0xFF000000 |\n             wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x)));\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      return wuffs_base__premul_u32_axxx(\n          wuffs_base__load_u32le__no_bounds_" +
+	"check(row + (4 * ((size_t)x))));\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      return wuffs_base__swap_u32_argb_abgr(\n          0xFF000000 |\n          wuffs_base__load_u24le__no_bounds_check(row + (3 * ((size_t)x))));\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      return wuffs_base__swap_u32_argb_abgr(\n          0xFF000000 |\n          wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x))));\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n      return wuffs_base__swap_u32_argb_abgr(wuffs_base__premul_u32_axxx(\n          wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x)))));\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n      WUFFS_BASE__FALLTHROUGH;\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n      return wuffs_base__swap_u32_argb_abgr(\n          wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x))));\n\n    default:\n      // TODO: support more formats.\n      break;\n  }\n\n  return 0;\n}\n\nwuffs_base__status  //\nwuffs_base__pixel_buffer__set_color_u32_at(\n    wuffs_base__p" +
+	"ixel_buffer* pb,\n    uint32_t x,\n    uint32_t y,\n    wuffs_base__color_u32_argb_premul color) {\n  if (!pb) {\n    return wuffs_base__make_status(wuffs_base__error__bad_receiver);\n  }\n  if ((x >= pb->pixcfg.private_impl.width) ||\n      (y >= pb->pixcfg.private_impl.height)) {\n    return wuffs_base__make_status(wuffs_base__error__bad_argument);\n  }\n\n  if (wuffs_base__pixel_format__is_planar(&pb->pixcfg.private_impl.pixfmt)) {\n    // TODO: support planar formats.\n    return wuffs_base__make_status(wuffs_base__error__unsupported_option);\n  }\n\n  size_t stride = pb->private_impl.planes[0].stride;\n  uint8_t* row = pb->private_impl.planes[0].ptr + (stride * ((size_t)y));\n\n  switch (pb->pixcfg.private_impl.pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n      WUFFS_BASE__FALLTHROUGH;\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      wuffs_base__store_u32le__no_bounds_check(row + (4 * ((size_t)x)), color);\n      break;\n\n      // Common formats above. Rarer formats below.\n\n    case WUFFS_BASE__PIXEL_FORMAT__B" +
+	"GR_565: {\n      uint32_t b5 = 0x1F & (color >> (8 - 5));\n      uint32_t g6 = 0x3F & (color >> (16 - 6));\n      uint32_t r5 = 0x1F & (color >> (24 - 5));\n      uint32_t bgr565 = (b5 << 0) | (g6 << 5) | (r5 << 11);\n      wuffs_base__store_u16le__no_bounds_check(row + (2 * ((size_t)x)),\n                                               (uint16_t)bgr565);\n      break;\n    }\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      wuffs_base__store_u24le__no_bounds_check(row + (3 * ((size_t)x)), color);\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      wuffs_base__store_u32le__no_bounds_check(\n          row + (4 * ((size_t)x)), wuffs_base__nonpremul_u32_axxx(color));\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      wuffs_base__store_u24le__no_bounds_check(\n          row + (3 * ((size_t)x)), wuffs_base__swap_u32_argb_abgr(color));\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n      wuffs_base__store_u32le__no_bounds_check(\n          row + (4 * ((size_t)x)), wuffs_base__nonpremul_u" +
+	"32_axxx(\n                                       wuffs_base__swap_u32_argb_abgr(color)));\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n      WUFFS_BASE__FALLTHROUGH;\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      wuffs_base__store_u32le__no_bounds_check(\n          row + (4 * ((size_t)x)), wuffs_base__swap_u32_argb_abgr(color));\n      break;\n\n    default:\n      // TODO: support more formats.\n      return wuffs_base__make_status(wuffs_base__error__unsupported_option);\n  }\n\n  return wuffs_base__make_status(NULL);\n}\n\n" +
 	"" +
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len4 < src_len4 ? dst_len4 : src_len4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__premul_u32_axxx(\n        wuffs_base__load_u32le__no_bounds_check(s + (0 * 4)));\n    wuffs_base__store_u32le__no_bounds_check(d + (0 * 4), s0);\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len4 < src_len4 ? dst_len4 : src_len4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n" +
 	"  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t d0 = wuffs_base__load_u32le__no_bounds_check(d + (0 * 4));\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(s + (0 * 4));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), wuffs_base__composite_premul_nonpremul_u32_axxx(d0, s0));\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__copy_1_1(wuffs_base__slice_u8 dst,\n                                     wuffs_base__slice_u8 dst_palette,\n                                     wuffs_base__slice_u8 src) {\n  return wuffs_base__slice_u8__copy_from_slice(dst, src);\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__copy_4_4(wuffs_base__slice_u8 dst,\n                                     wuffs_base__slice_u8 dst_palette,\n                                     wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len4 < src_len4 ? dst_len4 : src_len4;\n  if (l" +
@@ -228,7 +227,8 @@
 	""
 
 const baseImagePublicH = "" +
-	"// ---------------- Images\n\n// wuffs_base__color_u32_argb_premul is an 8 bit per channel premultiplied\n// Alpha, Red, Green, Blue color, as a uint32_t value. It is in word order, not\n// byte order: its value is always 0xAARRGGBB, regardless of endianness.\ntypedef uint32_t wuffs_base__color_u32_argb_premul;\n\n" +
+	"// ---------------- Images\n\n// wuffs_base__color_u32_argb_premul is an 8 bit per channel premultiplied\n// Alpha, Red, Green, Blue color, as a uint32_t value. Its value is always\n// 0xAARRGGBB (Alpha most significant, Blue least), regardless of endianness.\ntypedef uint32_t wuffs_base__color_u32_argb_premul;\n\n// wuffs_base__premul_u32_axxx converts from non-premultiplied alpha to\n// premultiplied alpha. The \"axxx\" means either \"argb\" or \"abgr\".\nstatic inline uint32_t  //\nwuffs_base__premul_u32_axxx(uint32_t nonpremul) {\n  // Multiplying by 0x101 (twice, once for alpha and once for color) converts\n  // from 8-bit to 16-bit color. Shifting right by 8 undoes that.\n  //\n  // Working in the higher bit depth can produce slightly different (and\n  // arguably slightly more accurate) results. For example, given 8-bit blue\n  // and alpha of 0x80 and 0x81:\n  //\n  //  - ((0x80   * 0x81  ) / 0xFF  )      = 0x40        = 0x40\n  //  - ((0x8080 * 0x8181) / 0xFFFF) >> 8 = 0x4101 >> 8 = 0x41\n  uint32_t a = 0xFF & (nonpremul >> 2" +
+	"4);\n  uint32_t a16 = a * (0x101 * 0x101);\n\n  uint32_t r = 0xFF & (nonpremul >> 16);\n  r = ((r * a16) / 0xFFFF) >> 8;\n  uint32_t g = 0xFF & (nonpremul >> 8);\n  g = ((g * a16) / 0xFFFF) >> 8;\n  uint32_t b = 0xFF & (nonpremul >> 0);\n  b = ((b * a16) / 0xFFFF) >> 8;\n\n  return (a << 24) | (r << 16) | (g << 8) | (b << 0);\n}\n\n// wuffs_base__nonpremul_u32_axxx converts from premultiplied alpha to\n// non-premultiplied alpha. The \"axxx\" means either \"argb\" or \"abgr\".\nstatic inline uint32_t  //\nwuffs_base__nonpremul_u32_axxx(uint32_t premul) {\n  uint32_t a = 0xFF & (premul >> 24);\n  if (a == 0xFF) {\n    return premul;\n  } else if (a == 0) {\n    return 0;\n  }\n  uint32_t a16 = a * 0x101;\n\n  uint32_t r = 0xFF & (premul >> 16);\n  r = ((r * (0x101 * 0xFFFF)) / a16) >> 8;\n  uint32_t g = 0xFF & (premul >> 8);\n  g = ((g * (0x101 * 0xFFFF)) / a16) >> 8;\n  uint32_t b = 0xFF & (premul >> 0);\n  b = ((b * (0x101 * 0xFFFF)) / a16) >> 8;\n\n  return (a << 24) | (r << 16) | (g << 8) | (b << 0);\n}\n\n" +
 	"" +
 	"// --------\n\ntypedef uint8_t wuffs_base__pixel_blend;\n\n// wuffs_base__pixel_blend encodes how to blend source and destination pixels,\n// accounting for transparency. It encompasses the Porter-Duff compositing\n// operators as well as the other blending modes defined by PDF.\n//\n// TODO: implement the other modes.\n#define WUFFS_BASE__PIXEL_BLEND__SRC ((wuffs_base__pixel_blend)0)\n#define WUFFS_BASE__PIXEL_BLEND__SRC_OVER ((wuffs_base__pixel_blend)1)\n\n" +
 	"" +
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index cceb627..61fc598 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -2503,10 +2503,58 @@
 // ---------------- Images
 
 // wuffs_base__color_u32_argb_premul is an 8 bit per channel premultiplied
-// Alpha, Red, Green, Blue color, as a uint32_t value. It is in word order, not
-// byte order: its value is always 0xAARRGGBB, regardless of endianness.
+// Alpha, Red, Green, Blue color, as a uint32_t value. Its value is always
+// 0xAARRGGBB (Alpha most significant, Blue least), regardless of endianness.
 typedef uint32_t wuffs_base__color_u32_argb_premul;
 
+// wuffs_base__premul_u32_axxx converts from non-premultiplied alpha to
+// premultiplied alpha. The "axxx" means either "argb" or "abgr".
+static inline uint32_t  //
+wuffs_base__premul_u32_axxx(uint32_t nonpremul) {
+  // Multiplying by 0x101 (twice, once for alpha and once for color) converts
+  // from 8-bit to 16-bit color. Shifting right by 8 undoes that.
+  //
+  // Working in the higher bit depth can produce slightly different (and
+  // arguably slightly more accurate) results. For example, given 8-bit blue
+  // and alpha of 0x80 and 0x81:
+  //
+  //  - ((0x80   * 0x81  ) / 0xFF  )      = 0x40        = 0x40
+  //  - ((0x8080 * 0x8181) / 0xFFFF) >> 8 = 0x4101 >> 8 = 0x41
+  uint32_t a = 0xFF & (nonpremul >> 24);
+  uint32_t a16 = a * (0x101 * 0x101);
+
+  uint32_t r = 0xFF & (nonpremul >> 16);
+  r = ((r * a16) / 0xFFFF) >> 8;
+  uint32_t g = 0xFF & (nonpremul >> 8);
+  g = ((g * a16) / 0xFFFF) >> 8;
+  uint32_t b = 0xFF & (nonpremul >> 0);
+  b = ((b * a16) / 0xFFFF) >> 8;
+
+  return (a << 24) | (r << 16) | (g << 8) | (b << 0);
+}
+
+// wuffs_base__nonpremul_u32_axxx converts from premultiplied alpha to
+// non-premultiplied alpha. The "axxx" means either "argb" or "abgr".
+static inline uint32_t  //
+wuffs_base__nonpremul_u32_axxx(uint32_t premul) {
+  uint32_t a = 0xFF & (premul >> 24);
+  if (a == 0xFF) {
+    return premul;
+  } else if (a == 0) {
+    return 0;
+  }
+  uint32_t a16 = a * 0x101;
+
+  uint32_t r = 0xFF & (premul >> 16);
+  r = ((r * (0x101 * 0xFFFF)) / a16) >> 8;
+  uint32_t g = 0xFF & (premul >> 8);
+  g = ((g * (0x101 * 0xFFFF)) / a16) >> 8;
+  uint32_t b = 0xFF & (premul >> 0);
+  b = ((b * (0x101 * 0xFFFF)) / a16) >> 8;
+
+  return (a << 24) | (r << 16) | (g << 8) | (b << 0);
+}
+
 // --------
 
 typedef uint8_t wuffs_base__pixel_blend;
@@ -8428,50 +8476,6 @@
   return (db << 0) | (dg << 8) | (dr << 16) | (da << 24);
 }
 
-static inline uint32_t  //
-wuffs_base__premul_u32_axxx(uint32_t nonpremul) {
-  // Multiplying by 0x101 (twice, once for alpha and once for color) converts
-  // from 8-bit to 16-bit color. Shifting right by 8 undoes that.
-  //
-  // Working in the higher bit depth can produce slightly different (and
-  // arguably slightly more accurate) results. For example, given 8-bit blue
-  // and alpha of 0x80 and 0x81:
-  //
-  //  - ((0x80   * 0x81  ) / 0xFF  )      = 0x40        = 0x40
-  //  - ((0x8080 * 0x8181) / 0xFFFF) >> 8 = 0x4101 >> 8 = 0x41
-  uint32_t a = 0xFF & (nonpremul >> 24);
-  uint32_t a16 = a * (0x101 * 0x101);
-
-  uint32_t r = 0xFF & (nonpremul >> 16);
-  r = ((r * a16) / 0xFFFF) >> 8;
-  uint32_t g = 0xFF & (nonpremul >> 8);
-  g = ((g * a16) / 0xFFFF) >> 8;
-  uint32_t b = 0xFF & (nonpremul >> 0);
-  b = ((b * a16) / 0xFFFF) >> 8;
-
-  return (a << 24) | (r << 16) | (g << 8) | (b << 0);
-}
-
-static inline uint32_t  //
-wuffs_base__nonpremul_u32_axxx(uint32_t premul) {
-  uint32_t a = 0xFF & (premul >> 24);
-  if (a == 0xFF) {
-    return premul;
-  } else if (a == 0) {
-    return 0;
-  }
-  uint32_t a16 = a * 0x101;
-
-  uint32_t r = 0xFF & (premul >> 16);
-  r = ((r * (0x101 * 0xFFFF)) / a16) >> 8;
-  uint32_t g = 0xFF & (premul >> 8);
-  g = ((g * (0x101 * 0xFFFF)) / a16) >> 8;
-  uint32_t b = 0xFF & (premul >> 0);
-  b = ((b * (0x101 * 0xFFFF)) / a16) >> 8;
-
-  return (a << 24) | (r << 16) | (g << 8) | (b << 0);
-}
-
 wuffs_base__color_u32_argb_premul  //
 wuffs_base__pixel_buffer__color_u32_at(const wuffs_base__pixel_buffer* pb,
                                        uint32_t x,