Add more pixel formats to set_color_u32_at
diff --git a/internal/cgen/base/image-impl.c b/internal/cgen/base/image-impl.c
index 00ef363..9fdf7ca 100644
--- a/internal/cgen/base/image-impl.c
+++ b/internal/cgen/base/image-impl.c
@@ -175,6 +175,18 @@
 
       // Common formats above. Rarer formats below.
 
+    case WUFFS_BASE__PIXEL_FORMAT__Y:
+      wuffs_base__store_u8__no_bounds_check(
+          row + ((size_t)x), wuffs_base__color_u32_argb_premul__as_gray(color));
+      break;
+
+    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:
+      wuffs_base__store_u8__no_bounds_check(
+          row + ((size_t)x), wuffs_base__pixel_palette__closest_element(
+                                 wuffs_base__pixel_buffer__palette(pb),
+                                 pb->pixcfg.private_impl.pixfmt, color));
+      break;
+
     case WUFFS_BASE__PIXEL_FORMAT__BGR_565: {
       uint32_t b5 = 0x1F & (color >> (8 - 5));
       uint32_t g6 = 0x3F & (color >> (16 - 6));
@@ -218,6 +230,68 @@
 
 // --------
 
+uint8_t  //
+wuffs_base__pixel_palette__closest_element(
+    wuffs_base__slice_u8 palette_slice,
+    wuffs_base__pixel_format palette_format,
+    wuffs_base__color_u32_argb_premul c) {
+  size_t n = palette_slice.len / 4;
+  if (n > 256) {
+    n = 256;
+  }
+  size_t best_index = 0;
+  uint64_t best_score = 0xFFFFFFFFFFFFFFFF;
+
+  // Work in 16-bit color.
+  uint32_t ca = 0x101 * (0xFF & (c >> 24));
+  uint32_t cr = 0x101 * (0xFF & (c >> 16));
+  uint32_t cg = 0x101 * (0xFF & (c >> 8));
+  uint32_t cb = 0x101 * (0xFF & (c >> 0));
+
+  switch (palette_format.repr) {
+    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL:
+    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_PREMUL:
+    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY: {
+      bool nonpremul = palette_format.repr ==
+                       WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL;
+
+      size_t i;
+      for (i = 0; i < n; i++) {
+        // Work in 16-bit color.
+        uint32_t pb = 0x101 * ((uint32_t)(palette_slice.ptr[(4 * i) + 0]));
+        uint32_t pg = 0x101 * ((uint32_t)(palette_slice.ptr[(4 * i) + 1]));
+        uint32_t pr = 0x101 * ((uint32_t)(palette_slice.ptr[(4 * i) + 2]));
+        uint32_t pa = 0x101 * ((uint32_t)(palette_slice.ptr[(4 * i) + 3]));
+
+        // Convert to premultiplied alpha.
+        if (nonpremul && (pa != 0xFFFF)) {
+          pb = (pb * pa) / 0xFFFF;
+          pg = (pg * pa) / 0xFFFF;
+          pr = (pr * pa) / 0xFFFF;
+        }
+
+        // These deltas are conceptually int32_t (signed) but after squaring,
+        // it's equivalent to work in uint32_t (unsigned).
+        pb -= cb;
+        pg -= cg;
+        pr -= cr;
+        pa -= ca;
+        uint64_t score = ((uint64_t)(pb * pb)) + ((uint64_t)(pg * pg)) +
+                         ((uint64_t)(pr * pr)) + ((uint64_t)(pa * pa));
+        if (best_score > score) {
+          best_score = score;
+          best_index = i;
+        }
+      }
+      break;
+    }
+  }
+
+  return best_index;
+}
+
+// --------
+
 static uint64_t  //
 wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src(
     wuffs_base__slice_u8 dst,
diff --git a/internal/cgen/base/image-public.h b/internal/cgen/base/image-public.h
index 1fd3f28..b235f2a 100644
--- a/internal/cgen/base/image-public.h
+++ b/internal/cgen/base/image-public.h
@@ -21,6 +21,23 @@
 // 0xAARRGGBB (Alpha most significant, Blue least), regardless of endianness.
 typedef uint32_t wuffs_base__color_u32_argb_premul;
 
+static inline uint8_t  //
+wuffs_base__color_u32_argb_premul__as_gray(
+    wuffs_base__color_u32_argb_premul c) {
+  // Work in 16-bit color.
+  uint32_t cr = 0x101 * (0xFF & (c >> 16));
+  uint32_t cg = 0x101 * (0xFF & (c >> 8));
+  uint32_t cb = 0x101 * (0xFF & (c >> 0));
+
+  // These coefficients (the fractions 0.299, 0.587 and 0.114) are the same
+  // as those given by the JFIF specification.
+  //
+  // Note that 19595 + 38470 + 7471 equals 65536, also known as (1 << 16). We
+  // shift by 24, not just by 16, because the return value is 8-bit color, not
+  // 16-bit color.
+  return ((19595 * cr) + (38470 * cg) + (7471 * cb) + 32768) >> 24;
+}
+
 // wuffs_base__premul_u32_axxx converts from non-premultiplied alpha to
 // premultiplied alpha. The "axxx" means either "argb" or "abgr".
 static inline uint32_t  //
@@ -1229,6 +1246,23 @@
 
 // --------
 
+// wuffs_base__pixel_palette__closest_element returns the index of the palette
+// element that minimizes the sum of squared differences of the four ARGB
+// channels, working in premultiplied alpha. Ties favor the smaller index.
+//
+// The palette_slice.len may equal (N*4), for N less than 256, which means that
+// only the first N palette elements are considered. It returns 0 when N is 0.
+//
+// Applying this function on a per-pixel basis will not produce whole-of-image
+// dithering.
+uint8_t  //
+wuffs_base__pixel_palette__closest_element(
+    wuffs_base__slice_u8 palette_slice,
+    wuffs_base__pixel_format palette_format,
+    wuffs_base__color_u32_argb_premul c);
+
+// --------
+
 // TODO: should the func type take restrict pointers?
 typedef uint64_t (*wuffs_base__pixel_swizzler__func)(
     wuffs_base__slice_u8 dst,
diff --git a/internal/cgen/data.go b/internal/cgen/data.go
index 2285cdf..de410ef 100644
--- a/internal/cgen/data.go
+++ b/internal/cgen/data.go
@@ -32,9 +32,12 @@
 	"te_impl.planes[0].stride;\n  uint8_t* row = pb->private_impl.planes[0].ptr + (stride * ((size_t)y));\n\n  switch (pb->pixcfg.private_impl.pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_BINARY:\n      return wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x)));\n\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY: {\n      uint8_t* palette = pb->private_impl.planes[3].ptr;\n      return wuffs_base__load_u32le__no_bounds_check(palette +\n                                                     (4 * ((size_t)row[x])));\n    }\n\n      // Common formats above. Rarer formats below.\n\n    case WUFFS_BASE__PIXEL_FORMAT__Y:\n      return 0xFF000000 | (0x00010101 * ((uint32_t)(row[x])));\n\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL: {\n      uint8_t* palette = pb->private_impl.planes[3].ptr;\n      return wuffs_base__premul_u32_axxx(\n          wuffs_base__load_u32le__no_bounds_check(palette +\n " +
 	"                                                 (4 * ((size_t)row[x]))));\n    }\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565: {\n      uint16_t bgr =\n          wuffs_base__load_u16le__no_bounds_check(row + (2 * ((size_t)x)));\n      uint32_t b5 = 0x1F & (bgr >> 0);\n      uint32_t b = (b5 << 3) | (b5 >> 2);\n      uint32_t g6 = 0x3F & (bgr >> 5);\n      uint32_t g = (g6 << 2) | (g6 >> 4);\n      uint32_t r5 = 0x1F & (bgr >> 11);\n      uint32_t r = (r5 << 3) | (r5 >> 2);\n      return 0xFF000000 | (r << 16) | (g << 8) | (b << 0);\n    }\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      return 0xFF000000 |\n             wuffs_base__load_u24le__no_bounds_check(row + (3 * ((size_t)x)));\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      return wuffs_base__premul_u32_axxx(\n          wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x))));\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      return 0xFF000000 |\n             wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x)));\n\n    case WUFFS_BASE__PIXEL_F" +
 	"ORMAT__RGB:\n      return wuffs_base__swap_u32_argb_abgr(\n          0xFF000000 |\n          wuffs_base__load_u24le__no_bounds_check(row + (3 * ((size_t)x))));\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n      return wuffs_base__swap_u32_argb_abgr(wuffs_base__premul_u32_axxx(\n          wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x)))));\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_BINARY:\n      return wuffs_base__swap_u32_argb_abgr(\n          wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x))));\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      return wuffs_base__swap_u32_argb_abgr(\n          0xFF000000 |\n          wuffs_base__load_u32le__no_bounds_check(row + (4 * ((size_t)x))));\n\n    default:\n      // TODO: support more formats.\n      break;\n  }\n\n  return 0;\n}\n\nwuffs_base__status  //\nwuffs_base__pixel_buffer__set_color_u32_at(\n    wuffs_base__pixel_buffer* pb,\n    uint32_t x,\n    uint32_t y,\n    wuffs_base__color_u32_argb_premul color)" +
-	" {\n  if (!pb) {\n    return wuffs_base__make_status(wuffs_base__error__bad_receiver);\n  }\n  if ((x >= pb->pixcfg.private_impl.width) ||\n      (y >= pb->pixcfg.private_impl.height)) {\n    return wuffs_base__make_status(wuffs_base__error__bad_argument);\n  }\n\n  if (wuffs_base__pixel_format__is_planar(&pb->pixcfg.private_impl.pixfmt)) {\n    // TODO: support planar formats.\n    return wuffs_base__make_status(wuffs_base__error__unsupported_option);\n  }\n\n  size_t stride = pb->private_impl.planes[0].stride;\n  uint8_t* row = pb->private_impl.planes[0].ptr + (stride * ((size_t)y));\n\n  switch (pb->pixcfg.private_impl.pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      wuffs_base__store_u32le__no_bounds_check(row + (4 * ((size_t)x)), color);\n      break;\n\n      // Common formats above. Rarer formats below.\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565: {\n      uint32_t b5 = 0x1F & (color >> (8 - 5));\n      uint32_t g6 = 0x3F & (color >> (16 - 6));\n      uint32_t r5 =" +
-	" 0x1F & (color >> (24 - 5));\n      uint32_t bgr565 = (b5 << 0) | (g6 << 5) | (r5 << 11);\n      wuffs_base__store_u16le__no_bounds_check(row + (2 * ((size_t)x)),\n                                               (uint16_t)bgr565);\n      break;\n    }\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      wuffs_base__store_u24le__no_bounds_check(row + (3 * ((size_t)x)), color);\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      wuffs_base__store_u32le__no_bounds_check(\n          row + (4 * ((size_t)x)), wuffs_base__nonpremul_u32_axxx(color));\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      wuffs_base__store_u24le__no_bounds_check(\n          row + (3 * ((size_t)x)), wuffs_base__swap_u32_argb_abgr(color));\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n      wuffs_base__store_u32le__no_bounds_check(\n          row + (4 * ((size_t)x)), wuffs_base__nonpremul_u32_axxx(\n                                       wuffs_base__swap_u32_argb_abgr(color)));\n      break;\n    case WUFFS_BASE__P" +
-	"IXEL_FORMAT__RGBA_PREMUL:\n      WUFFS_BASE__FALLTHROUGH;\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      wuffs_base__store_u32le__no_bounds_check(\n          row + (4 * ((size_t)x)), wuffs_base__swap_u32_argb_abgr(color));\n      break;\n\n    default:\n      // TODO: support more formats.\n      return wuffs_base__make_status(wuffs_base__error__unsupported_option);\n  }\n\n  return wuffs_base__make_status(NULL);\n}\n\n" +
+	" {\n  if (!pb) {\n    return wuffs_base__make_status(wuffs_base__error__bad_receiver);\n  }\n  if ((x >= pb->pixcfg.private_impl.width) ||\n      (y >= pb->pixcfg.private_impl.height)) {\n    return wuffs_base__make_status(wuffs_base__error__bad_argument);\n  }\n\n  if (wuffs_base__pixel_format__is_planar(&pb->pixcfg.private_impl.pixfmt)) {\n    // TODO: support planar formats.\n    return wuffs_base__make_status(wuffs_base__error__unsupported_option);\n  }\n\n  size_t stride = pb->private_impl.planes[0].stride;\n  uint8_t* row = pb->private_impl.planes[0].ptr + (stride * ((size_t)y));\n\n  switch (pb->pixcfg.private_impl.pixfmt.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__BGRX:\n      wuffs_base__store_u32le__no_bounds_check(row + (4 * ((size_t)x)), color);\n      break;\n\n      // Common formats above. Rarer formats below.\n\n    case WUFFS_BASE__PIXEL_FORMAT__Y:\n      wuffs_base__store_u8__no_bounds_check(\n          row + ((size_t)x), wuffs_base__color_u32_argb_premul__as_gray(color" +
+	"));\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:\n      wuffs_base__store_u8__no_bounds_check(\n          row + ((size_t)x), wuffs_base__pixel_palette__closest_element(\n                                 wuffs_base__pixel_buffer__palette(pb),\n                                 pb->pixcfg.private_impl.pixfmt, color));\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__BGR_565: {\n      uint32_t b5 = 0x1F & (color >> (8 - 5));\n      uint32_t g6 = 0x3F & (color >> (16 - 6));\n      uint32_t r5 = 0x1F & (color >> (24 - 5));\n      uint32_t bgr565 = (b5 << 0) | (g6 << 5) | (r5 << 11);\n      wuffs_base__store_u16le__no_bounds_check(row + (2 * ((size_t)x)),\n                                               (uint16_t)bgr565);\n      break;\n    }\n    case WUFFS_BASE__PIXEL_FORMAT__BGR:\n      wuffs_base__store_u24le__no_bounds_check(row + (3 * ((size_t)x)), color);\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL:\n      wuffs_base__store_u32le__no_bounds_check(\n          row + (4 * ((size_t)x))" +
+	", wuffs_base__nonpremul_u32_axxx(color));\n      break;\n\n    case WUFFS_BASE__PIXEL_FORMAT__RGB:\n      wuffs_base__store_u24le__no_bounds_check(\n          row + (3 * ((size_t)x)), wuffs_base__swap_u32_argb_abgr(color));\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL:\n      wuffs_base__store_u32le__no_bounds_check(\n          row + (4 * ((size_t)x)), wuffs_base__nonpremul_u32_axxx(\n                                       wuffs_base__swap_u32_argb_abgr(color)));\n      break;\n    case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL:\n      WUFFS_BASE__FALLTHROUGH;\n    case WUFFS_BASE__PIXEL_FORMAT__RGBX:\n      wuffs_base__store_u32le__no_bounds_check(\n          row + (4 * ((size_t)x)), wuffs_base__swap_u32_argb_abgr(color));\n      break;\n\n    default:\n      // TODO: support more formats.\n      return wuffs_base__make_status(wuffs_base__error__unsupported_option);\n  }\n\n  return wuffs_base__make_status(NULL);\n}\n\n" +
+	"" +
+	"// --------\n\nuint8_t  //\nwuffs_base__pixel_palette__closest_element(\n    wuffs_base__slice_u8 palette_slice,\n    wuffs_base__pixel_format palette_format,\n    wuffs_base__color_u32_argb_premul c) {\n  size_t n = palette_slice.len / 4;\n  if (n > 256) {\n    n = 256;\n  }\n  size_t best_index = 0;\n  uint64_t best_score = 0xFFFFFFFFFFFFFFFF;\n\n  // Work in 16-bit color.\n  uint32_t ca = 0x101 * (0xFF & (c >> 24));\n  uint32_t cr = 0x101 * (0xFF & (c >> 16));\n  uint32_t cg = 0x101 * (0xFF & (c >> 8));\n  uint32_t cb = 0x101 * (0xFF & (c >> 0));\n\n  switch (palette_format.repr) {\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_PREMUL:\n    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY: {\n      bool nonpremul = palette_format.repr ==\n                       WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL;\n\n      size_t i;\n      for (i = 0; i < n; i++) {\n        // Work in 16-bit color.\n        uint32_t pb = 0x101 * ((uint32_t)(palette_slice.ptr[(4 * i) + 0]))" +
+	";\n        uint32_t pg = 0x101 * ((uint32_t)(palette_slice.ptr[(4 * i) + 1]));\n        uint32_t pr = 0x101 * ((uint32_t)(palette_slice.ptr[(4 * i) + 2]));\n        uint32_t pa = 0x101 * ((uint32_t)(palette_slice.ptr[(4 * i) + 3]));\n\n        // Convert to premultiplied alpha.\n        if (nonpremul && (pa != 0xFFFF)) {\n          pb = (pb * pa) / 0xFFFF;\n          pg = (pg * pa) / 0xFFFF;\n          pr = (pr * pa) / 0xFFFF;\n        }\n\n        // These deltas are conceptually int32_t (signed) but after squaring,\n        // it's equivalent to work in uint32_t (unsigned).\n        pb -= cb;\n        pg -= cg;\n        pr -= cr;\n        pa -= ca;\n        uint64_t score = ((uint64_t)(pb * pb)) + ((uint64_t)(pg * pg)) +\n                         ((uint64_t)(pr * pr)) + ((uint64_t)(pa * pa));\n        if (best_score > score) {\n          best_score = score;\n          best_index = i;\n        }\n      }\n      break;\n    }\n  }\n\n  return best_index;\n}\n\n" +
 	"" +
 	"// --------\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len4 < src_len4 ? dst_len4 : src_len4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t s0 = wuffs_base__premul_u32_axxx(\n        wuffs_base__load_u32le__no_bounds_check(s + (0 * 4)));\n    wuffs_base__store_u32le__no_bounds_check(d + (0 * 4), s0);\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src_over(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len4 < src_len4 ? dst_len4 : src_len4;\n  uint8_t* d = dst.ptr;\n  uint8_t* s = src.ptr;\n" +
 	"  size_t n = len;\n\n  // TODO: unroll.\n\n  while (n >= 1) {\n    uint32_t d0 = wuffs_base__load_u32le__no_bounds_check(d + (0 * 4));\n    uint32_t s0 = wuffs_base__load_u32le__no_bounds_check(s + (0 * 4));\n    wuffs_base__store_u32le__no_bounds_check(\n        d + (0 * 4), wuffs_base__composite_premul_nonpremul_u32_axxx(d0, s0));\n\n    s += 1 * 4;\n    d += 1 * 4;\n    n -= 1;\n  }\n\n  return len;\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__copy_1_1(wuffs_base__slice_u8 dst,\n                                     wuffs_base__slice_u8 dst_palette,\n                                     wuffs_base__slice_u8 src) {\n  return wuffs_base__slice_u8__copy_from_slice(dst, src);\n}\n\nstatic uint64_t  //\nwuffs_base__pixel_swizzler__copy_4_4(wuffs_base__slice_u8 dst,\n                                     wuffs_base__slice_u8 dst_palette,\n                                     wuffs_base__slice_u8 src) {\n  size_t dst_len4 = dst.len / 4;\n  size_t src_len4 = src.len / 4;\n  size_t len = dst_len4 < src_len4 ? dst_len4 : src_len4;\n  if (l" +
@@ -227,8 +230,9 @@
 	""
 
 const baseImagePublicH = "" +
-	"// ---------------- Images\n\n// wuffs_base__color_u32_argb_premul is an 8 bit per channel premultiplied\n// Alpha, Red, Green, Blue color, as a uint32_t value. Its value is always\n// 0xAARRGGBB (Alpha most significant, Blue least), regardless of endianness.\ntypedef uint32_t wuffs_base__color_u32_argb_premul;\n\n// wuffs_base__premul_u32_axxx converts from non-premultiplied alpha to\n// premultiplied alpha. The \"axxx\" means either \"argb\" or \"abgr\".\nstatic inline uint32_t  //\nwuffs_base__premul_u32_axxx(uint32_t nonpremul) {\n  // Multiplying by 0x101 (twice, once for alpha and once for color) converts\n  // from 8-bit to 16-bit color. Shifting right by 8 undoes that.\n  //\n  // Working in the higher bit depth can produce slightly different (and\n  // arguably slightly more accurate) results. For example, given 8-bit blue\n  // and alpha of 0x80 and 0x81:\n  //\n  //  - ((0x80   * 0x81  ) / 0xFF  )      = 0x40        = 0x40\n  //  - ((0x8080 * 0x8181) / 0xFFFF) >> 8 = 0x4101 >> 8 = 0x41\n  uint32_t a = 0xFF & (nonpremul >> 2" +
-	"4);\n  uint32_t a16 = a * (0x101 * 0x101);\n\n  uint32_t r = 0xFF & (nonpremul >> 16);\n  r = ((r * a16) / 0xFFFF) >> 8;\n  uint32_t g = 0xFF & (nonpremul >> 8);\n  g = ((g * a16) / 0xFFFF) >> 8;\n  uint32_t b = 0xFF & (nonpremul >> 0);\n  b = ((b * a16) / 0xFFFF) >> 8;\n\n  return (a << 24) | (r << 16) | (g << 8) | (b << 0);\n}\n\n// wuffs_base__nonpremul_u32_axxx converts from premultiplied alpha to\n// non-premultiplied alpha. The \"axxx\" means either \"argb\" or \"abgr\".\nstatic inline uint32_t  //\nwuffs_base__nonpremul_u32_axxx(uint32_t premul) {\n  uint32_t a = 0xFF & (premul >> 24);\n  if (a == 0xFF) {\n    return premul;\n  } else if (a == 0) {\n    return 0;\n  }\n  uint32_t a16 = a * 0x101;\n\n  uint32_t r = 0xFF & (premul >> 16);\n  r = ((r * (0x101 * 0xFFFF)) / a16) >> 8;\n  uint32_t g = 0xFF & (premul >> 8);\n  g = ((g * (0x101 * 0xFFFF)) / a16) >> 8;\n  uint32_t b = 0xFF & (premul >> 0);\n  b = ((b * (0x101 * 0xFFFF)) / a16) >> 8;\n\n  return (a << 24) | (r << 16) | (g << 8) | (b << 0);\n}\n\n" +
+	"// ---------------- Images\n\n// wuffs_base__color_u32_argb_premul is an 8 bit per channel premultiplied\n// Alpha, Red, Green, Blue color, as a uint32_t value. Its value is always\n// 0xAARRGGBB (Alpha most significant, Blue least), regardless of endianness.\ntypedef uint32_t wuffs_base__color_u32_argb_premul;\n\nstatic inline uint8_t  //\nwuffs_base__color_u32_argb_premul__as_gray(\n    wuffs_base__color_u32_argb_premul c) {\n  // Work in 16-bit color.\n  uint32_t cr = 0x101 * (0xFF & (c >> 16));\n  uint32_t cg = 0x101 * (0xFF & (c >> 8));\n  uint32_t cb = 0x101 * (0xFF & (c >> 0));\n\n  // These coefficients (the fractions 0.299, 0.587 and 0.114) are the same\n  // as those given by the JFIF specification.\n  //\n  // Note that 19595 + 38470 + 7471 equals 65536, also known as (1 << 16). We\n  // shift by 24, not just by 16, because the return value is 8-bit color, not\n  // 16-bit color.\n  return ((19595 * cr) + (38470 * cg) + (7471 * cb) + 32768) >> 24;\n}\n\n// wuffs_base__premul_u32_axxx converts from non-premultiplied alpha " +
+	"to\n// premultiplied alpha. The \"axxx\" means either \"argb\" or \"abgr\".\nstatic inline uint32_t  //\nwuffs_base__premul_u32_axxx(uint32_t nonpremul) {\n  // Multiplying by 0x101 (twice, once for alpha and once for color) converts\n  // from 8-bit to 16-bit color. Shifting right by 8 undoes that.\n  //\n  // Working in the higher bit depth can produce slightly different (and\n  // arguably slightly more accurate) results. For example, given 8-bit blue\n  // and alpha of 0x80 and 0x81:\n  //\n  //  - ((0x80   * 0x81  ) / 0xFF  )      = 0x40        = 0x40\n  //  - ((0x8080 * 0x8181) / 0xFFFF) >> 8 = 0x4101 >> 8 = 0x41\n  uint32_t a = 0xFF & (nonpremul >> 24);\n  uint32_t a16 = a * (0x101 * 0x101);\n\n  uint32_t r = 0xFF & (nonpremul >> 16);\n  r = ((r * a16) / 0xFFFF) >> 8;\n  uint32_t g = 0xFF & (nonpremul >> 8);\n  g = ((g * a16) / 0xFFFF) >> 8;\n  uint32_t b = 0xFF & (nonpremul >> 0);\n  b = ((b * a16) / 0xFFFF) >> 8;\n\n  return (a << 24) | (r << 16) | (g << 8) | (b << 0);\n}\n\n// wuffs_base__nonpremul_u32_axxx converts from premultip" +
+	"lied alpha to\n// non-premultiplied alpha. The \"axxx\" means either \"argb\" or \"abgr\".\nstatic inline uint32_t  //\nwuffs_base__nonpremul_u32_axxx(uint32_t premul) {\n  uint32_t a = 0xFF & (premul >> 24);\n  if (a == 0xFF) {\n    return premul;\n  } else if (a == 0) {\n    return 0;\n  }\n  uint32_t a16 = a * 0x101;\n\n  uint32_t r = 0xFF & (premul >> 16);\n  r = ((r * (0x101 * 0xFFFF)) / a16) >> 8;\n  uint32_t g = 0xFF & (premul >> 8);\n  g = ((g * (0x101 * 0xFFFF)) / a16) >> 8;\n  uint32_t b = 0xFF & (premul >> 0);\n  b = ((b * (0x101 * 0xFFFF)) / a16) >> 8;\n\n  return (a << 24) | (r << 16) | (g << 8) | (b << 0);\n}\n\n" +
 	"" +
 	"// --------\n\ntypedef uint8_t wuffs_base__pixel_blend;\n\n// wuffs_base__pixel_blend encodes how to blend source and destination pixels,\n// accounting for transparency. It encompasses the Porter-Duff compositing\n// operators as well as the other blending modes defined by PDF.\n//\n// TODO: implement the other modes.\n#define WUFFS_BASE__PIXEL_BLEND__SRC ((wuffs_base__pixel_blend)0)\n#define WUFFS_BASE__PIXEL_BLEND__SRC_OVER ((wuffs_base__pixel_blend)1)\n\n" +
 	"" +
@@ -282,6 +286,8 @@
 	"" +
 	"// --------\n\ntypedef struct {\n  // Do not access the private_impl's fields directly. There is no API/ABI\n  // compatibility or safety guarantee if you do so.\n  struct {\n    uint8_t TODO;\n  } private_impl;\n\n#ifdef __cplusplus\n#endif  // __cplusplus\n\n} wuffs_base__decode_frame_options;\n\n#ifdef __cplusplus\n\n#endif  // __cplusplus\n\n" +
 	"" +
+	"// --------\n\n// wuffs_base__pixel_palette__closest_element returns the index of the palette\n// element that minimizes the sum of squared differences of the four ARGB\n// channels, working in premultiplied alpha. Ties favor the smaller index.\n//\n// The palette_slice.len may equal (N*4), for N less than 256, which means that\n// only the first N palette elements are considered. It returns 0 when N is 0.\n//\n// Applying this function on a per-pixel basis will not produce whole-of-image\n// dithering.\nuint8_t  //\nwuffs_base__pixel_palette__closest_element(\n    wuffs_base__slice_u8 palette_slice,\n    wuffs_base__pixel_format palette_format,\n    wuffs_base__color_u32_argb_premul c);\n\n" +
+	"" +
 	"// --------\n\n// TODO: should the func type take restrict pointers?\ntypedef uint64_t (*wuffs_base__pixel_swizzler__func)(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src);\n\ntypedef struct {\n  // Do not access the private_impl's fields directly. There is no API/ABI\n  // compatibility or safety guarantee if you do so.\n  struct {\n    wuffs_base__pixel_swizzler__func func;\n  } private_impl;\n\n#ifdef __cplusplus\n  inline wuffs_base__status prepare(wuffs_base__pixel_format dst_format,\n                                    wuffs_base__slice_u8 dst_palette,\n                                    wuffs_base__pixel_format src_format,\n                                    wuffs_base__slice_u8 src_palette,\n                                    wuffs_base__pixel_blend blend);\n  inline uint64_t swizzle_interleaved(wuffs_base__slice_u8 dst,\n                                      wuffs_base__slice_u8 dst_palette,\n                                      wuffs_base__slice_u8 src) const;\n#endi" +
 	"f  // __cplusplus\n\n} wuffs_base__pixel_swizzler;\n\nwuffs_base__status  //\nwuffs_base__pixel_swizzler__prepare(wuffs_base__pixel_swizzler* p,\n                                    wuffs_base__pixel_format dst_format,\n                                    wuffs_base__slice_u8 dst_palette,\n                                    wuffs_base__pixel_format src_format,\n                                    wuffs_base__slice_u8 src_palette,\n                                    wuffs_base__pixel_blend blend);\n\nuint64_t  //\nwuffs_base__pixel_swizzler__swizzle_interleaved(\n    const wuffs_base__pixel_swizzler* p,\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src);\n\n#ifdef __cplusplus\n\ninline wuffs_base__status  //\nwuffs_base__pixel_swizzler::prepare(wuffs_base__pixel_format dst_format,\n                                    wuffs_base__slice_u8 dst_palette,\n                                    wuffs_base__pixel_format src_format,\n                                    wuffs_base__slice_u8 src" +
 	"_palette,\n                                    wuffs_base__pixel_blend blend) {\n  return wuffs_base__pixel_swizzler__prepare(this, dst_format, dst_palette,\n                                             src_format, src_palette, blend);\n}\n\nuint64_t  //\nwuffs_base__pixel_swizzler::swizzle_interleaved(\n    wuffs_base__slice_u8 dst,\n    wuffs_base__slice_u8 dst_palette,\n    wuffs_base__slice_u8 src) const {\n  return wuffs_base__pixel_swizzler__swizzle_interleaved(this, dst, dst_palette,\n                                                         src);\n}\n\n#endif  // __cplusplus\n" +
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index d86140e..ff2e2fe 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -2507,6 +2507,23 @@
 // 0xAARRGGBB (Alpha most significant, Blue least), regardless of endianness.
 typedef uint32_t wuffs_base__color_u32_argb_premul;
 
+static inline uint8_t  //
+wuffs_base__color_u32_argb_premul__as_gray(
+    wuffs_base__color_u32_argb_premul c) {
+  // Work in 16-bit color.
+  uint32_t cr = 0x101 * (0xFF & (c >> 16));
+  uint32_t cg = 0x101 * (0xFF & (c >> 8));
+  uint32_t cb = 0x101 * (0xFF & (c >> 0));
+
+  // These coefficients (the fractions 0.299, 0.587 and 0.114) are the same
+  // as those given by the JFIF specification.
+  //
+  // Note that 19595 + 38470 + 7471 equals 65536, also known as (1 << 16). We
+  // shift by 24, not just by 16, because the return value is 8-bit color, not
+  // 16-bit color.
+  return ((19595 * cr) + (38470 * cg) + (7471 * cb) + 32768) >> 24;
+}
+
 // wuffs_base__premul_u32_axxx converts from non-premultiplied alpha to
 // premultiplied alpha. The "axxx" means either "argb" or "abgr".
 static inline uint32_t  //
@@ -3715,6 +3732,23 @@
 
 // --------
 
+// wuffs_base__pixel_palette__closest_element returns the index of the palette
+// element that minimizes the sum of squared differences of the four ARGB
+// channels, working in premultiplied alpha. Ties favor the smaller index.
+//
+// The palette_slice.len may equal (N*4), for N less than 256, which means that
+// only the first N palette elements are considered. It returns 0 when N is 0.
+//
+// Applying this function on a per-pixel basis will not produce whole-of-image
+// dithering.
+uint8_t  //
+wuffs_base__pixel_palette__closest_element(
+    wuffs_base__slice_u8 palette_slice,
+    wuffs_base__pixel_format palette_format,
+    wuffs_base__color_u32_argb_premul c);
+
+// --------
+
 // TODO: should the func type take restrict pointers?
 typedef uint64_t (*wuffs_base__pixel_swizzler__func)(
     wuffs_base__slice_u8 dst,
@@ -8626,6 +8660,18 @@
 
       // Common formats above. Rarer formats below.
 
+    case WUFFS_BASE__PIXEL_FORMAT__Y:
+      wuffs_base__store_u8__no_bounds_check(
+          row + ((size_t)x), wuffs_base__color_u32_argb_premul__as_gray(color));
+      break;
+
+    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY:
+      wuffs_base__store_u8__no_bounds_check(
+          row + ((size_t)x), wuffs_base__pixel_palette__closest_element(
+                                 wuffs_base__pixel_buffer__palette(pb),
+                                 pb->pixcfg.private_impl.pixfmt, color));
+      break;
+
     case WUFFS_BASE__PIXEL_FORMAT__BGR_565: {
       uint32_t b5 = 0x1F & (color >> (8 - 5));
       uint32_t g6 = 0x3F & (color >> (16 - 6));
@@ -8669,6 +8715,68 @@
 
 // --------
 
+uint8_t  //
+wuffs_base__pixel_palette__closest_element(
+    wuffs_base__slice_u8 palette_slice,
+    wuffs_base__pixel_format palette_format,
+    wuffs_base__color_u32_argb_premul c) {
+  size_t n = palette_slice.len / 4;
+  if (n > 256) {
+    n = 256;
+  }
+  size_t best_index = 0;
+  uint64_t best_score = 0xFFFFFFFFFFFFFFFF;
+
+  // Work in 16-bit color.
+  uint32_t ca = 0x101 * (0xFF & (c >> 24));
+  uint32_t cr = 0x101 * (0xFF & (c >> 16));
+  uint32_t cg = 0x101 * (0xFF & (c >> 8));
+  uint32_t cb = 0x101 * (0xFF & (c >> 0));
+
+  switch (palette_format.repr) {
+    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL:
+    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_PREMUL:
+    case WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY: {
+      bool nonpremul = palette_format.repr ==
+                       WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_NONPREMUL;
+
+      size_t i;
+      for (i = 0; i < n; i++) {
+        // Work in 16-bit color.
+        uint32_t pb = 0x101 * ((uint32_t)(palette_slice.ptr[(4 * i) + 0]));
+        uint32_t pg = 0x101 * ((uint32_t)(palette_slice.ptr[(4 * i) + 1]));
+        uint32_t pr = 0x101 * ((uint32_t)(palette_slice.ptr[(4 * i) + 2]));
+        uint32_t pa = 0x101 * ((uint32_t)(palette_slice.ptr[(4 * i) + 3]));
+
+        // Convert to premultiplied alpha.
+        if (nonpremul && (pa != 0xFFFF)) {
+          pb = (pb * pa) / 0xFFFF;
+          pg = (pg * pa) / 0xFFFF;
+          pr = (pr * pa) / 0xFFFF;
+        }
+
+        // These deltas are conceptually int32_t (signed) but after squaring,
+        // it's equivalent to work in uint32_t (unsigned).
+        pb -= cb;
+        pg -= cg;
+        pr -= cr;
+        pa -= ca;
+        uint64_t score = ((uint64_t)(pb * pb)) + ((uint64_t)(pg * pg)) +
+                         ((uint64_t)(pr * pr)) + ((uint64_t)(pa * pa));
+        if (best_score > score) {
+          best_score = score;
+          best_index = i;
+        }
+      }
+      break;
+    }
+  }
+
+  return best_index;
+}
+
+// --------
+
 static uint64_t  //
 wuffs_base__pixel_swizzler__bgra_premul__bgra_nonpremul__src(
     wuffs_base__slice_u8 dst,
diff --git a/test/c/std/wbmp.c b/test/c/std/wbmp.c
index f5d9f7c..11c5162 100644
--- a/test/c/std/wbmp.c
+++ b/test/c/std/wbmp.c
@@ -84,14 +84,6 @@
   }
 }
 
-void  //
-fill_table(wuffs_base__table_u8 t, uint8_t c) {
-  size_t y;
-  for (y = 0; y < t.height; y++) {
-    memset(t.ptr + (y * t.stride), c, t.width);
-  }
-}
-
 const char*  //
 test_wuffs_pixel_swizzler_swizzle() {
   CHECK_FOCUS(__func__);
@@ -150,19 +142,10 @@
                      &src_pixbuf, &src_pixcfg, g_src_slice_u8));
     fill_palette_with_grays(&src_pixbuf);
 
-    // Set the middle src pixel.
-    if (srcs[s].pixfmt_repr == WUFFS_BASE__PIXEL_FORMAT__Y) {
-      fill_table(wuffs_base__pixel_buffer__plane(&src_pixbuf, 0), 0x44);
-    } else if (srcs[s].pixfmt_repr ==
-               WUFFS_BASE__PIXEL_FORMAT__INDEXED__BGRA_BINARY) {
-      fill_table(wuffs_base__pixel_buffer__plane(&src_pixbuf, 0), 0x44);
-    } else {
-      CHECK_STATUS("set_color_u32_at",
-                   wuffs_base__pixel_buffer__set_color_u32_at(
-                       &src_pixbuf, width / 2, height / 2, srcs[s].pixel));
-    }
-
-    // Check the middle src pixel.
+    // Set and check the middle src pixel.
+    CHECK_STATUS("set_color_u32_at",
+                 wuffs_base__pixel_buffer__set_color_u32_at(
+                     &src_pixbuf, width / 2, height / 2, srcs[s].pixel));
     wuffs_base__color_u32_argb_premul have_src_pixel =
         wuffs_base__pixel_buffer__color_u32_at(&src_pixbuf, width / 2,
                                                height / 2);