Have std/bmp decode 8-bit paletted images
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 5434e83..de88496 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -5567,6 +5567,7 @@
     uint32_t f_io_redirect_fourcc;
     uint64_t f_io_redirect_pos;
     uint64_t f_frame_config_io_position;
+    uint32_t f_bitmap_info_len;
     uint32_t f_padding;
     uint32_t f_bits_per_pixel;
     uint32_t f_compression;
@@ -5584,13 +5585,14 @@
     uint32_t p_decode_frame_config[1];
     uint32_t p_decode_frame[1];
     uint32_t p_skip_frame[1];
+    uint32_t p_read_palette[1];
   } private_impl;
 
   struct {
     uint8_t f_scratch[2048];
+    uint8_t f_src_palette[1024];
 
     struct {
-      uint32_t v_bitmap_info_len;
       uint64_t scratch;
     } s_decode_image_config[1];
     struct {
@@ -5599,6 +5601,10 @@
     struct {
       uint64_t scratch;
     } s_skip_frame[1];
+    struct {
+      uint32_t v_i;
+      uint64_t scratch;
+    } s_read_palette[1];
   } private_data;
 
 #ifdef __cplusplus
@@ -15983,6 +15989,11 @@
     wuffs_base__io_buffer* a_src);
 
 static wuffs_base__status
+wuffs_bmp__decoder__read_palette(
+    wuffs_bmp__decoder* self,
+    wuffs_base__io_buffer* a_src);
+
+static wuffs_base__status
 wuffs_bmp__decoder__process_masks(
     wuffs_bmp__decoder* self);
 
@@ -16132,7 +16143,6 @@
   wuffs_base__status status = wuffs_base__make_status(NULL);
 
   uint32_t v_magic = 0;
-  uint32_t v_bitmap_info_len = 0;
   uint32_t v_width = 0;
   uint32_t v_height = 0;
   uint32_t v_planes = 0;
@@ -16150,9 +16160,6 @@
   }
 
   uint32_t coro_susp_point = self->private_impl.p_decode_image_config[0];
-  if (coro_susp_point) {
-    v_bitmap_info_len = self->private_data.s_decode_image_config[0].v_bitmap_info_len;
-  }
   switch (coro_susp_point) {
     WUFFS_BASE__COROUTINE_SUSPENSION_POINT_0;
 
@@ -16267,25 +16274,27 @@
           *scratch |= ((uint64_t)(num_bits_2)) << 56;
         }
       }
-      v_bitmap_info_len = t_2;
+      self->private_impl.f_bitmap_info_len = t_2;
     }
-    if (self->private_impl.f_padding < v_bitmap_info_len) {
+    if (self->private_impl.f_padding < self->private_impl.f_bitmap_info_len) {
       status = wuffs_base__make_status(wuffs_bmp__error__bad_header);
       goto exit;
     }
-    self->private_impl.f_padding -= v_bitmap_info_len;
-    if (v_bitmap_info_len == 40) {
-      if (self->private_impl.f_padding >= 16) {
-        v_bitmap_info_len = 56;
-        self->private_impl.f_padding -= 16;
-      } else if (self->private_impl.f_padding >= 12) {
-        v_bitmap_info_len = 52;
-        self->private_impl.f_padding -= 12;
+    self->private_impl.f_padding -= self->private_impl.f_bitmap_info_len;
+    if (self->private_impl.f_bitmap_info_len == 40) {
+      if (self->private_impl.f_bits_per_pixel >= 16) {
+        if (self->private_impl.f_padding >= 16) {
+          self->private_impl.f_bitmap_info_len = 56;
+          self->private_impl.f_padding -= 16;
+        } else if (self->private_impl.f_padding >= 12) {
+          self->private_impl.f_bitmap_info_len = 52;
+          self->private_impl.f_padding -= 12;
+        }
       }
-    } else if ((v_bitmap_info_len != 52) &&
-        (v_bitmap_info_len != 56) &&
-        (v_bitmap_info_len != 108) &&
-        (v_bitmap_info_len != 124)) {
+    } else if ((self->private_impl.f_bitmap_info_len != 52) &&
+        (self->private_impl.f_bitmap_info_len != 56) &&
+        (self->private_impl.f_bitmap_info_len != 108) &&
+        (self->private_impl.f_bitmap_info_len != 124)) {
       status = wuffs_base__make_status(wuffs_bmp__error__unsupported_bmp_file);
       goto exit;
     }
@@ -16478,7 +16487,7 @@
       self->private_impl.f_compression = 3;
     }
     if (self->private_impl.f_compression == 3) {
-      if (v_bitmap_info_len >= 52) {
+      if (self->private_impl.f_bitmap_info_len >= 52) {
         {
           WUFFS_BASE__COROUTINE_SUSPENSION_POINT(19);
           uint32_t t_8;
@@ -16566,7 +16575,7 @@
           }
           self->private_impl.f_channel_masks[0] = t_10;
         }
-        if (v_bitmap_info_len >= 56) {
+        if (self->private_impl.f_bitmap_info_len >= 56) {
           {
             WUFFS_BASE__COROUTINE_SUSPENSION_POINT(25);
             uint32_t t_11;
@@ -16596,7 +16605,7 @@
             }
             self->private_impl.f_channel_masks[3] = t_11;
           }
-          self->private_data.s_decode_image_config[0].scratch = (v_bitmap_info_len - 56);
+          self->private_data.s_decode_image_config[0].scratch = (self->private_impl.f_bitmap_info_len - 56);
           WUFFS_BASE__COROUTINE_SUSPENSION_POINT(27);
           if (self->private_data.s_decode_image_config[0].scratch > ((uint64_t)(io2_a_src - iop_a_src))) {
             self->private_data.s_decode_image_config[0].scratch -= ((uint64_t)(io2_a_src - iop_a_src));
@@ -16621,8 +16630,8 @@
           goto suspend;
         }
       }
-    } else if (v_bitmap_info_len >= 40) {
-      self->private_data.s_decode_image_config[0].scratch = (v_bitmap_info_len - 40);
+    } else if (self->private_impl.f_bitmap_info_len >= 40) {
+      self->private_data.s_decode_image_config[0].scratch = (self->private_impl.f_bitmap_info_len - 40);
       WUFFS_BASE__COROUTINE_SUSPENSION_POINT(29);
       if (self->private_data.s_decode_image_config[0].scratch > ((uint64_t)(io2_a_src - iop_a_src))) {
         self->private_data.s_decode_image_config[0].scratch -= ((uint64_t)(io2_a_src - iop_a_src));
@@ -16631,18 +16640,33 @@
         goto suspend;
       }
       iop_a_src += self->private_data.s_decode_image_config[0].scratch;
+      if (self->private_impl.f_bits_per_pixel < 16) {
+        if (a_src) {
+          a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr));
+        }
+        WUFFS_BASE__COROUTINE_SUSPENSION_POINT(30);
+        status = wuffs_bmp__decoder__read_palette(self, a_src);
+        if (a_src) {
+          iop_a_src = a_src->data.ptr + a_src->meta.ri;
+        }
+        if (status.repr) {
+          goto suspend;
+        }
+      }
     } else {
       status = wuffs_base__make_status(wuffs_bmp__error__unsupported_bmp_file);
       goto exit;
     }
     if (self->private_impl.f_compression == 0) {
-      if (self->private_impl.f_bits_per_pixel == 16) {
+      if (self->private_impl.f_bits_per_pixel == 8) {
+        self->private_impl.f_src_pixfmt = 2198077448;
+      } else if (self->private_impl.f_bits_per_pixel == 16) {
         self->private_impl.f_compression = 3;
         self->private_impl.f_channel_masks[0] = 31;
         self->private_impl.f_channel_masks[1] = 992;
         self->private_impl.f_channel_masks[2] = 31744;
         self->private_impl.f_channel_masks[3] = 0;
-        WUFFS_BASE__COROUTINE_SUSPENSION_POINT(30);
+        WUFFS_BASE__COROUTINE_SUSPENSION_POINT(31);
         status = wuffs_bmp__decoder__process_masks(self);
         if (status.repr) {
           goto suspend;
@@ -16671,7 +16695,10 @@
       status = wuffs_base__make_status(wuffs_bmp__error__unsupported_bmp_file);
       goto exit;
     }
-    if (self->private_impl.f_bits_per_pixel == 16) {
+    if (self->private_impl.f_bits_per_pixel == 8) {
+      self->private_impl.f_bytes_per_row = ((((((uint64_t)(self->private_impl.f_width)) * 1) + 3) >> 2) << 2);
+      self->private_impl.f_pad_per_row = ((4 - (self->private_impl.f_width & 3)) & 3);
+    } else if (self->private_impl.f_bits_per_pixel == 16) {
       self->private_impl.f_bytes_per_row = ((((((uint64_t)(self->private_impl.f_width)) * 2) + 3) >> 2) << 2);
       self->private_impl.f_pad_per_row = ((self->private_impl.f_width & 1) * 2);
     } else if (self->private_impl.f_bits_per_pixel == 24) {
@@ -16711,7 +16738,6 @@
   suspend:
   self->private_impl.p_decode_image_config[0] = wuffs_base__status__is_suspension(&status) ? coro_susp_point : 0;
   self->private_impl.active_coroutine = wuffs_base__status__is_suspension(&status) ? 1 : 0;
-  self->private_data.s_decode_image_config[0].v_bitmap_info_len = v_bitmap_info_len;
 
   goto exit;
   exit:
@@ -16875,6 +16901,7 @@
   self->private_impl.active_coroutine = 0;
   wuffs_base__status status = wuffs_base__make_status(NULL);
 
+  wuffs_base__slice_u8 v_dst_palette = {0};
   wuffs_base__status v_status = wuffs_base__make_status(NULL);
 
   const uint8_t* iop_a_src = NULL;
@@ -16929,11 +16956,15 @@
         self->private_impl.f_dst_y_end = 4294967295;
         self->private_impl.f_dst_y_inc = 4294967295;
       }
+      v_dst_palette = wuffs_base__pixel_buffer__palette(a_dst);
+      if (((uint64_t)(v_dst_palette.len)) == 0) {
+        v_dst_palette = wuffs_base__slice_u8__subslice_i(wuffs_base__make_slice_u8(self->private_data.f_scratch, 2048), 1024);
+      }
       v_status = wuffs_base__pixel_swizzler__prepare(&self->private_impl.f_swizzler,
           wuffs_base__pixel_buffer__pixel_format(a_dst),
-          wuffs_base__pixel_buffer__palette(a_dst),
+          v_dst_palette,
           wuffs_base__utility__make_pixel_format(self->private_impl.f_src_pixfmt),
-          wuffs_base__utility__empty_slice_u8(),
+          wuffs_base__make_slice_u8(self->private_data.f_src_palette, 1024),
           a_blend);
       if ( ! wuffs_base__status__is_ok(&v_status)) {
         status = v_status;
@@ -17018,6 +17049,7 @@
   uint32_t v_dst_bits_per_pixel = 0;
   uint64_t v_dst_bytes_per_pixel = 0;
   uint64_t v_dst_bytes_per_row = 0;
+  wuffs_base__slice_u8 v_dst_palette = {0};
   wuffs_base__table_u8 v_tab = {0};
   wuffs_base__slice_u8 v_dst = {0};
   uint64_t v_i = 0;
@@ -17042,6 +17074,10 @@
   }
   v_dst_bytes_per_pixel = ((uint64_t)((v_dst_bits_per_pixel / 8)));
   v_dst_bytes_per_row = (((uint64_t)(self->private_impl.f_width)) * v_dst_bytes_per_pixel);
+  v_dst_palette = wuffs_base__pixel_buffer__palette(a_dst);
+  if (((uint64_t)(v_dst_palette.len)) == 0) {
+    v_dst_palette = wuffs_base__slice_u8__subslice_i(wuffs_base__make_slice_u8(self->private_data.f_scratch, 2048), 1024);
+  }
   v_tab = wuffs_base__pixel_buffer__plane(a_dst, 0);
   label__outer__continue:;
   while (true) {
@@ -17076,7 +17112,7 @@
       v_n = wuffs_base__pixel_swizzler__swizzle_interleaved_from_reader(
           &self->private_impl.f_swizzler,
           wuffs_base__slice_u8__subslice_i(v_dst, v_i),
-          wuffs_base__utility__empty_slice_u8(),
+          v_dst_palette,
           &iop_a_src,
           io2_a_src);
       if (v_n == 0) {
@@ -17114,6 +17150,7 @@
   uint32_t v_dst_bits_per_pixel = 0;
   uint64_t v_dst_bytes_per_pixel = 0;
   uint64_t v_dst_bytes_per_row = 0;
+  wuffs_base__slice_u8 v_dst_palette = {0};
   wuffs_base__table_u8 v_tab = {0};
   wuffs_base__slice_u8 v_dst = {0};
   uint64_t v_i = 0;
@@ -17145,6 +17182,10 @@
   }
   v_dst_bytes_per_pixel = ((uint64_t)((v_dst_bits_per_pixel / 8)));
   v_dst_bytes_per_row = (((uint64_t)(self->private_impl.f_width)) * v_dst_bytes_per_pixel);
+  v_dst_palette = wuffs_base__pixel_buffer__palette(a_dst);
+  if (((uint64_t)(v_dst_palette.len)) == 0) {
+    v_dst_palette = wuffs_base__slice_u8__subslice_i(wuffs_base__make_slice_u8(self->private_data.f_scratch, 2048), 1024);
+  }
   v_tab = wuffs_base__pixel_buffer__plane(a_dst, 0);
   label__outer__continue:;
   while (true) {
@@ -17214,7 +17255,7 @@
       if (v_i >= ((uint64_t)(v_dst.len))) {
         goto label__inner__continue;
       }
-      v_n = wuffs_base__pixel_swizzler__swizzle_interleaved_from_slice(&self->private_impl.f_swizzler, wuffs_base__slice_u8__subslice_i(v_dst, v_i), wuffs_base__utility__empty_slice_u8(), wuffs_base__slice_u8__subslice_j(wuffs_base__make_slice_u8(self->private_data.f_scratch, 2048), (8 * v_p0)));
+      v_n = wuffs_base__pixel_swizzler__swizzle_interleaved_from_slice(&self->private_impl.f_swizzler, wuffs_base__slice_u8__subslice_i(v_dst, v_i), v_dst_palette, wuffs_base__slice_u8__subslice_j(wuffs_base__make_slice_u8(self->private_data.f_scratch, 2048), (8 * v_p0)));
       if (v_n == 0) {
         status = wuffs_base__make_status(wuffs_bmp__note__internal_note_short_read);
         goto ok;
@@ -17480,6 +17521,142 @@
   return wuffs_base__utility__make_range_ii_u64(0, 0);
 }
 
+// -------- func bmp.decoder.read_palette
+// Reads palette entries from a_src into f_src_palette (256 entries of 4 bytes), forcing byte 3 of each to 0xFF.
+static wuffs_base__status
+wuffs_bmp__decoder__read_palette(
+    wuffs_bmp__decoder* self,
+    wuffs_base__io_buffer* a_src) {
+  wuffs_base__status status = wuffs_base__make_status(NULL);
+
+  uint32_t v_i = 0;  // Index of the next palette entry to fill; saved and restored across suspensions.
+  uint32_t v_argb = 0;  // The entry just read, assembled little-endian.
+
+  const uint8_t* iop_a_src = NULL;  // Read cursor into a_src's data.
+  const uint8_t* io0_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+  const uint8_t* io1_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+  const uint8_t* io2_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;  // Upper bound for iop_a_src (see the short-read checks below).
+  if (a_src) {
+    io0_a_src = a_src->data.ptr;
+    io1_a_src = io0_a_src + a_src->meta.ri;
+    iop_a_src = io1_a_src;
+    io2_a_src = io0_a_src + a_src->meta.wi;
+  }
+
+  uint32_t coro_susp_point = self->private_impl.p_read_palette[0];  // Nonzero only when resuming after a suspension.
+  if (coro_susp_point) {
+    v_i = self->private_data.s_read_palette[0].v_i;
+  }
+  switch (coro_susp_point) {
+    WUFFS_BASE__COROUTINE_SUSPENSION_POINT_0;
+    // NOTE(review): the one caller visible in this patch requires f_bitmap_info_len >= 40; confirm the "< 40" arm is reachable.
+    if (self->private_impl.f_bitmap_info_len < 40) {  // 3 source bytes per entry.
+      while ((v_i < 256) && (self->private_impl.f_padding >= 3)) {
+        self->private_impl.f_padding -= 3;  // Charge this entry against f_padding before reading it.
+        {
+          WUFFS_BASE__COROUTINE_SUSPENSION_POINT(1);
+          uint32_t t_0;
+          if (WUFFS_BASE__LIKELY(io2_a_src - iop_a_src >= 3)) {  // Fast path: all 3 bytes are available.
+            t_0 = ((uint32_t)(wuffs_base__load_u24le__no_bounds_check(iop_a_src)));
+            iop_a_src += 3;
+          } else {
+            self->private_data.s_read_palette[0].scratch = 0;  // Slow path: accumulate one byte at a time in scratch.
+            WUFFS_BASE__COROUTINE_SUSPENSION_POINT(2);
+            while (true) {
+              if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) {  // Out of input: suspend; the partial value stays in scratch.
+                status = wuffs_base__make_status(wuffs_base__suspension__short_read);
+                goto suspend;
+              }
+              uint64_t* scratch = &self->private_data.s_read_palette[0].scratch;
+              uint32_t num_bits_0 = ((uint32_t)(*scratch >> 56));  // Top 8 bits of scratch hold the bit offset reached so far.
+              *scratch <<= 8;
+              *scratch >>= 8;
+              *scratch |= ((uint64_t)(*iop_a_src++)) << num_bits_0;
+              if (num_bits_0 == 16) {  // Third byte just placed: the 24-bit value is complete.
+                t_0 = ((uint32_t)(*scratch));
+                break;
+              }
+              num_bits_0 += 8;
+              *scratch |= ((uint64_t)(num_bits_0)) << 56;
+            }
+          }
+          v_argb = t_0;
+        }
+        v_argb |= 4278190080;  // 0xFF000000: set the top byte to 0xFF.
+        self->private_data.f_src_palette[((4 * v_i) + 0)] = ((uint8_t)(((v_argb >> 0) & 255)));  // Store low byte first.
+        self->private_data.f_src_palette[((4 * v_i) + 1)] = ((uint8_t)(((v_argb >> 8) & 255)));
+        self->private_data.f_src_palette[((4 * v_i) + 2)] = ((uint8_t)(((v_argb >> 16) & 255)));
+        self->private_data.f_src_palette[((4 * v_i) + 3)] = ((uint8_t)(((v_argb >> 24) & 255)));
+        v_i += 1;
+      }
+    } else {  // 4 source bytes per entry.
+      while ((v_i < 256) && (self->private_impl.f_padding >= 4)) {
+        self->private_impl.f_padding -= 4;  // Charge this entry against f_padding before reading it.
+        {
+          WUFFS_BASE__COROUTINE_SUSPENSION_POINT(3);
+          uint32_t t_1;
+          if (WUFFS_BASE__LIKELY(io2_a_src - iop_a_src >= 4)) {  // Fast path: all 4 bytes are available.
+            t_1 = wuffs_base__load_u32le__no_bounds_check(iop_a_src);
+            iop_a_src += 4;
+          } else {
+            self->private_data.s_read_palette[0].scratch = 0;  // Slow path: accumulate one byte at a time in scratch.
+            WUFFS_BASE__COROUTINE_SUSPENSION_POINT(4);
+            while (true) {
+              if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) {  // Out of input: suspend; the partial value stays in scratch.
+                status = wuffs_base__make_status(wuffs_base__suspension__short_read);
+                goto suspend;
+              }
+              uint64_t* scratch = &self->private_data.s_read_palette[0].scratch;
+              uint32_t num_bits_1 = ((uint32_t)(*scratch >> 56));  // Top 8 bits of scratch hold the bit offset reached so far.
+              *scratch <<= 8;
+              *scratch >>= 8;
+              *scratch |= ((uint64_t)(*iop_a_src++)) << num_bits_1;
+              if (num_bits_1 == 24) {  // Fourth byte just placed: the 32-bit value is complete.
+                t_1 = ((uint32_t)(*scratch));
+                break;
+              }
+              num_bits_1 += 8;
+              *scratch |= ((uint64_t)(num_bits_1)) << 56;
+            }
+          }
+          v_argb = t_1;
+        }
+        v_argb |= 4278190080;  // 0xFF000000: overwrite the entry's fourth byte with 0xFF.
+        self->private_data.f_src_palette[((4 * v_i) + 0)] = ((uint8_t)(((v_argb >> 0) & 255)));  // Store low byte first.
+        self->private_data.f_src_palette[((4 * v_i) + 1)] = ((uint8_t)(((v_argb >> 8) & 255)));
+        self->private_data.f_src_palette[((4 * v_i) + 2)] = ((uint8_t)(((v_argb >> 16) & 255)));
+        self->private_data.f_src_palette[((4 * v_i) + 3)] = ((uint8_t)(((v_argb >> 24) & 255)));
+        v_i += 1;
+      }
+    }
+    while (v_i < 256) {  // Entries not supplied by the file become 00 00 00 FF.
+      self->private_data.f_src_palette[((4 * v_i) + 0)] = 0;
+      self->private_data.f_src_palette[((4 * v_i) + 1)] = 0;
+      self->private_data.f_src_palette[((4 * v_i) + 2)] = 0;
+      self->private_data.f_src_palette[((4 * v_i) + 3)] = 255;
+      v_i += 1;
+    }
+
+    goto ok;
+    ok:
+    self->private_impl.p_read_palette[0] = 0;  // Completed: the next call starts from the beginning.
+    goto exit;
+  }
+
+  goto suspend;
+  suspend:
+  self->private_impl.p_read_palette[0] = wuffs_base__status__is_suspension(&status) ? coro_susp_point : 0;  // Remember where to resume.
+  self->private_data.s_read_palette[0].v_i = v_i;  // Keep the entry index for the resumed call.
+
+  goto exit;
+  exit:
+  if (a_src) {
+    a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr));  // Write the advanced read position back to a_src.
+  }
+
+  return status;
+}
+
 // -------- func bmp.decoder.process_masks
 
 static wuffs_base__status
diff --git a/std/bmp/decode_bmp.wuffs b/std/bmp/decode_bmp.wuffs
index ae7683e..8a671fb 100644
--- a/std/bmp/decode_bmp.wuffs
+++ b/std/bmp/decode_bmp.wuffs
@@ -35,7 +35,8 @@
 
 	frame_config_io_position : base.u64,
 
-	padding : base.u32,
+	bitmap_info_len : base.u32,
+	padding         : base.u32,
 
 	bits_per_pixel : base.u32,
 	compression    : base.u32,
@@ -56,19 +57,24 @@
 	swizzler : base.pixel_swizzler,
 	util     : base.utility,
 )(
-	scratch : array[2048] base.u8,  // 2048 = 256 * (8 bytes per BGRA_NONPREMUL_4X16LE).
+	// scratch is one of:
+	//  - 2048 bytes = 256  * (8 bytes per BGRA_NONPREMUL_4X16LE).
+	//  - 1024 bytes = 1024 * (1 byte  per INDEXED__BGRA_BINARY), plus
+	//    1024 bytes = 256  * (4 bytes per dst_palette entry).
+	scratch : array[2048] base.u8,
+
+	src_palette : array[4 * 256] base.u8,
 )
 
 pub func decoder.set_quirk_enabled!(quirk: base.u32, enabled: base.bool) {
 }
 
 pub func decoder.decode_image_config?(dst: nptr base.image_config, src: base.io_reader) {
-	var magic           : base.u32
-	var bitmap_info_len : base.u32
-	var width           : base.u32
-	var height          : base.u32
-	var planes          : base.u32
-	var dst_pixfmt      : base.u32
+	var magic      : base.u32
+	var width      : base.u32
+	var height     : base.u32
+	var planes     : base.u32
+	var dst_pixfmt : base.u32
 
 	if (this.call_sequence <> 0) or (this.io_redirect_fourcc == 1) {
 		return base."#bad call sequence"
@@ -94,24 +100,26 @@
 
 	// Read the BITMAPINFOHEADER (version 3 / 4 / 5 is 40 / 108 / 124 bytes).
 
-	bitmap_info_len = args.src.read_u32le?()
-	if this.padding < bitmap_info_len {
+	this.bitmap_info_len = args.src.read_u32le?()
+	if this.padding < this.bitmap_info_len {
 		return "#bad header"
 	}
-	this.padding -= bitmap_info_len
+	this.padding -= this.bitmap_info_len
 
-	if bitmap_info_len == 40 {
-		// It's poorly documented, but some "length 40" data can be silently
-		// augmented with RGB or RGBA channel_masks (12 or 16 bytes).
-		if this.padding >= 16 {
-			bitmap_info_len = 56
-			this.padding -= 16
-		} else if this.padding >= 12 {
-			bitmap_info_len = 52
-			this.padding -= 12
+	if this.bitmap_info_len == 40 {
+		if this.bits_per_pixel >= 16 {
+			// It's poorly documented, but "length 40" data can be silently
+			// augmented with RGB or RGBA channel_masks (12 or 16 bytes).
+			if this.padding >= 16 {
+				this.bitmap_info_len = 56
+				this.padding -= 16
+			} else if this.padding >= 12 {
+				this.bitmap_info_len = 52
+				this.padding -= 12
+			}
 		}
-	} else if (bitmap_info_len <> 52) and (bitmap_info_len <> 56) and
-		(bitmap_info_len <> 108) and (bitmap_info_len <> 124) {
+	} else if (this.bitmap_info_len <> 52) and (this.bitmap_info_len <> 56) and
+		(this.bitmap_info_len <> 108) and (this.bitmap_info_len <> 124) {
 		return "#unsupported BMP file"
 	}
 
@@ -163,14 +171,14 @@
 
 	// Read the channel_masks when this.compression is 3 (BITFIELDS).
 	if this.compression == 3 {
-		if bitmap_info_len >= 52 {
+		if this.bitmap_info_len >= 52 {
 			this.channel_masks[2] = args.src.read_u32le?()
 			this.channel_masks[1] = args.src.read_u32le?()
 			this.channel_masks[0] = args.src.read_u32le?()
-			if bitmap_info_len >= 56 {
+			if this.bitmap_info_len >= 56 {
 				this.channel_masks[3] = args.src.read_u32le?()
 				// Skip the rest of the BITMAPINFOHEADER.
-				args.src.skip_u32?(n: bitmap_info_len - 56)
+				args.src.skip_u32?(n: this.bitmap_info_len ~mod- 56)
 			}
 			// If the explicit channel_masks are what the implicit ones would
 			// be for no compression, treat it as no compression.
@@ -188,15 +196,21 @@
 			}
 			this.process_masks?()
 		}
-	} else if bitmap_info_len >= 40 {
+	} else if this.bitmap_info_len >= 40 {
 		// Skip the rest of the BITMAPINFOHEADER.
-		args.src.skip_u32?(n: bitmap_info_len - 40)
+		args.src.skip_u32?(n: this.bitmap_info_len - 40)
+		// The palette follows the BITMAPINFOHEADER.
+		if this.bits_per_pixel < 16 {
+			this.read_palette?(src: args.src)
+		}
 	} else {
 		return "#unsupported BMP file"
 	}
 
 	if this.compression == 0 {  // 0 means no compression.
-		if this.bits_per_pixel == 16 {
+		if this.bits_per_pixel == 8 {
+			this.src_pixfmt = base.PIXEL_FORMAT__INDEXED__BGRA_BINARY
+		} else if this.bits_per_pixel == 16 {
 			// Implement BMP's 16-bit default (BGRX_5551) as BITFIELDS.
 			this.compression = 3
 			this.channel_masks[0] = 0x001F
@@ -229,7 +243,10 @@
 	}
 
 	// The "((x + 3) >> 2) << 2" dance rounds x up to a multiple of 4.
-	if this.bits_per_pixel == 16 {
+	if this.bits_per_pixel == 8 {
+		this.bytes_per_row = ((((this.width as base.u64) * 1) + 3) >> 2) << 2
+		this.pad_per_row = (4 - (this.width & 3)) & 3
+	} else if this.bits_per_pixel == 16 {
 		this.bytes_per_row = ((((this.width as base.u64) * 2) + 3) >> 2) << 2
 		this.pad_per_row = (this.width & 1) * 2
 	} else if this.bits_per_pixel == 24 {
@@ -296,7 +313,8 @@
 }
 
 pub func decoder.decode_frame?(dst: ptr base.pixel_buffer, src: base.io_reader, blend: base.pixel_blend, workbuf: slice base.u8, opts: nptr base.decode_frame_options) {
-	var status : base.status
+	var dst_palette : slice base.u8
+	var status      : base.status
 
 	if this.call_sequence < 2 {
 		this.decode_frame_config?(dst: nullptr, src: args.src)
@@ -320,11 +338,16 @@
 			this.dst_y_inc = 0xFFFF_FFFF  // -1 as a base.u32.
 		}
 
+		dst_palette = args.dst.palette()
+		if dst_palette.length() == 0 {
+			dst_palette = this.scratch[1024 ..]
+		}
+
 		status = this.swizzler.prepare!(
 			dst_pixfmt: args.dst.pixel_format(),
-			dst_palette: args.dst.palette(),
+			dst_palette: dst_palette,
 			src_pixfmt: this.util.make_pixel_format(repr: this.src_pixfmt),
-			src_palette: this.util.empty_slice_u8(),
+			src_palette: this.src_palette[..],
 			blend: args.blend)
 		if not status.is_ok() {
 			return status
@@ -354,6 +377,7 @@
 	var dst_bits_per_pixel  : base.u32[..= 256]
 	var dst_bytes_per_pixel : base.u64[..= 32]
 	var dst_bytes_per_row   : base.u64
+	var dst_palette         : slice base.u8
 	var tab                 : table base.u8
 	var dst                 : slice base.u8
 	var i                   : base.u64
@@ -368,6 +392,10 @@
 	}
 	dst_bytes_per_pixel = (dst_bits_per_pixel / 8) as base.u64
 	dst_bytes_per_row = (this.width as base.u64) * dst_bytes_per_pixel
+	dst_palette = args.dst.palette()
+	if dst_palette.length() == 0 {
+		dst_palette = this.scratch[1024 ..]
+	}
 	tab = args.dst.plane(p: 0)
 
 	while.outer true {
@@ -403,7 +431,7 @@
 			}
 			n = this.swizzler.swizzle_interleaved_from_reader!(
 				dst: dst[i ..],
-				dst_palette: this.util.empty_slice_u8(),
+				dst_palette: dst_palette,
 				src: args.src)
 			if n == 0 {
 				return "@internal note: short read"
@@ -420,6 +448,7 @@
 	var dst_bits_per_pixel  : base.u32[..= 256]
 	var dst_bytes_per_pixel : base.u64[..= 32]
 	var dst_bytes_per_row   : base.u64
+	var dst_palette         : slice base.u8
 	var tab                 : table base.u8
 	var dst                 : slice base.u8
 	var i                   : base.u64
@@ -443,6 +472,10 @@
 	}
 	dst_bytes_per_pixel = (dst_bits_per_pixel / 8) as base.u64
 	dst_bytes_per_row = (this.width as base.u64) * dst_bytes_per_pixel
+	dst_palette = args.dst.palette()
+	if dst_palette.length() == 0 {
+		dst_palette = this.scratch[1024 ..]
+	}
 	tab = args.dst.plane(p: 0)
 
 	while.outer true {
@@ -529,7 +562,7 @@
 
 			n = this.swizzler.swizzle_interleaved_from_slice!(
 				dst: dst[i ..],
-				dst_palette: this.util.empty_slice_u8(),
+				dst_palette: dst_palette,
 				src: this.scratch[.. 8 * p0])
 			if n == 0 {
 				return "@internal note: short read"
@@ -611,6 +644,43 @@
 	return this.util.make_range_ii_u64(min_incl: 0, max_incl: 0)
 }
 
+pri func decoder.read_palette?(src: base.io_reader) {  // Fills this.src_palette (256 entries of 4 bytes) from args.src.
+	var i    : base.u32  // Index of the next palette entry to fill.
+	var argb : base.u32  // The entry just read, assembled little-endian.
+	// NOTE(review): the one caller visible in this patch requires bitmap_info_len >= 40; confirm the "< 40" arm is reachable.
+	if this.bitmap_info_len < 40 {  // 3 source bytes per entry.
+		while (i < 256) and (this.padding >= 3) {
+			this.padding -= 3  // Charge this entry against this.padding before reading it.
+			argb = args.src.read_u24le_as_u32?()
+			argb |= 0xFF00_0000  // Set the top byte (stored at index 3 below) to 0xFF.
+			this.src_palette[(4 * i) + 0] = ((argb >> 0) & 0xFF) as base.u8
+			this.src_palette[(4 * i) + 1] = ((argb >> 8) & 0xFF) as base.u8
+			this.src_palette[(4 * i) + 2] = ((argb >> 16) & 0xFF) as base.u8
+			this.src_palette[(4 * i) + 3] = ((argb >> 24) & 0xFF) as base.u8
+			i += 1
+		} endwhile
+	} else {  // 4 source bytes per entry.
+		while (i < 256) and (this.padding >= 4) {
+			this.padding -= 4  // Charge this entry against this.padding before reading it.
+			argb = args.src.read_u32le?()
+			argb |= 0xFF00_0000  // Overwrite the entry's fourth byte with 0xFF.
+			this.src_palette[(4 * i) + 0] = ((argb >> 0) & 0xFF) as base.u8
+			this.src_palette[(4 * i) + 1] = ((argb >> 8) & 0xFF) as base.u8
+			this.src_palette[(4 * i) + 2] = ((argb >> 16) & 0xFF) as base.u8
+			this.src_palette[(4 * i) + 3] = ((argb >> 24) & 0xFF) as base.u8
+			i += 1
+		} endwhile
+	}
+
+	while i < 256 {  // Entries not supplied by the file become 00 00 00 FF.
+		this.src_palette[(4 * i) + 0] = 0x00
+		this.src_palette[(4 * i) + 1] = 0x00
+		this.src_palette[(4 * i) + 2] = 0x00
+		this.src_palette[(4 * i) + 3] = 0xFF
+		i += 1
+	} endwhile
+}
+
 pri func decoder.process_masks?() {
 	var i    : base.u32
 	var mask : base.u32