Have std/bmp decode BITFIELDS compression
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 8006981..8533d46 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -5563,12 +5563,16 @@
     bool f_top_down;
     uint32_t f_pad_per_row;
     uint64_t f_bytes_per_row;
-    wuffs_base__pixel_format f_pixfmt;
+    uint32_t f_src_pixfmt;
     uint32_t f_io_redirect_fourcc;
     uint64_t f_io_redirect_pos;
     uint64_t f_frame_config_io_position;
     uint32_t f_padding;
+    uint32_t f_bits_per_pixel;
+    uint32_t f_compression;
     uint32_t f_channel_masks[4];
+    uint8_t f_channel_shifts[4];
+    uint8_t f_channel_num_bits[4];
     uint32_t f_dst_x;
     uint32_t f_dst_y;
     uint32_t f_dst_y_end;
@@ -5583,10 +5587,10 @@
   } private_impl;
 
   struct {
+    uint8_t f_scratch[2048];
+
     struct {
       uint32_t v_bitmap_info_len;
-      uint32_t v_bits_per_pixel;
-      uint32_t v_compression;
       uint64_t scratch;
     } s_decode_image_config[1];
     struct {
@@ -15962,7 +15966,13 @@
 // ---------------- Private Function Prototypes
 
 static wuffs_base__status
-wuffs_bmp__decoder__swizzle(
+wuffs_bmp__decoder__swizzle_compress0(
+    wuffs_bmp__decoder* self,
+    wuffs_base__pixel_buffer* a_dst,
+    wuffs_base__io_buffer* a_src);
+
+static wuffs_base__status
+wuffs_bmp__decoder__swizzle_compress3(
     wuffs_bmp__decoder* self,
     wuffs_base__pixel_buffer* a_dst,
     wuffs_base__io_buffer* a_src);
@@ -15972,6 +15982,10 @@
     wuffs_bmp__decoder* self,
     wuffs_base__io_buffer* a_src);
 
+static wuffs_base__status
+wuffs_bmp__decoder__process_masks(
+    wuffs_bmp__decoder* self);
+
 // ---------------- VTables
 
 const wuffs_base__image_decoder__func_ptrs
@@ -16122,8 +16136,7 @@
   uint32_t v_width = 0;
   uint32_t v_height = 0;
   uint32_t v_planes = 0;
-  uint32_t v_bits_per_pixel = 0;
-  uint32_t v_compression = 0;
+  uint32_t v_dst_pixfmt = 0;
 
   const uint8_t* iop_a_src = NULL;
   const uint8_t* io0_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
@@ -16139,8 +16152,6 @@
   uint32_t coro_susp_point = self->private_impl.p_decode_image_config[0];
   if (coro_susp_point) {
     v_bitmap_info_len = self->private_data.s_decode_image_config[0].v_bitmap_info_len;
-    v_bits_per_pixel = self->private_data.s_decode_image_config[0].v_bits_per_pixel;
-    v_compression = self->private_data.s_decode_image_config[0].v_compression;
   }
   switch (coro_susp_point) {
     WUFFS_BASE__COROUTINE_SUSPENSION_POINT_0;
@@ -16410,7 +16421,7 @@
           *scratch |= ((uint64_t)(num_bits_6)) << 56;
         }
       }
-      v_bits_per_pixel = t_6;
+      self->private_impl.f_bits_per_pixel = t_6;
     }
     {
       WUFFS_BASE__COROUTINE_SUSPENSION_POINT(16);
@@ -16439,14 +16450,14 @@
           *scratch |= ((uint64_t)(num_bits_7)) << 56;
         }
       }
-      v_compression = t_7;
+      self->private_impl.f_compression = t_7;
     }
-    if (v_bits_per_pixel == 0) {
-      if (v_compression == 4) {
+    if (self->private_impl.f_bits_per_pixel == 0) {
+      if (self->private_impl.f_compression == 4) {
         self->private_impl.f_io_redirect_fourcc = 1246774599;
         status = wuffs_base__make_status(wuffs_base__note__i_o_redirect);
         goto ok;
-      } else if (v_compression == 5) {
+      } else if (self->private_impl.f_compression == 5) {
         self->private_impl.f_io_redirect_fourcc = 1347307296;
         status = wuffs_base__make_status(wuffs_base__note__i_o_redirect);
         goto ok;
@@ -16463,7 +16474,10 @@
       goto suspend;
     }
     iop_a_src += self->private_data.s_decode_image_config[0].scratch;
-    if ((v_compression == 3) || (v_compression == 6)) {
+    if (self->private_impl.f_compression == 6) {
+      self->private_impl.f_compression = 3;
+    }
+    if (self->private_impl.f_compression == 3) {
       if (v_bitmap_info_len >= 52) {
         {
           WUFFS_BASE__COROUTINE_SUSPENSION_POINT(19);
@@ -16593,18 +16607,23 @@
           iop_a_src += self->private_data.s_decode_image_config[0].scratch;
         }
         if ((self->private_impl.f_channel_masks[0] == 255) && (self->private_impl.f_channel_masks[1] == 65280) && (self->private_impl.f_channel_masks[2] == 16711680)) {
-          if (v_bits_per_pixel == 24) {
-            v_compression = 0;
-          } else if (v_bits_per_pixel == 32) {
+          if (self->private_impl.f_bits_per_pixel == 24) {
+            self->private_impl.f_compression = 0;
+          } else if (self->private_impl.f_bits_per_pixel == 32) {
             if ((self->private_impl.f_channel_masks[3] == 0) || (self->private_impl.f_channel_masks[3] == 4278190080)) {
-              v_compression = 0;
+              self->private_impl.f_compression = 0;
             }
           }
         }
+        WUFFS_BASE__COROUTINE_SUSPENSION_POINT(28);
+        status = wuffs_bmp__decoder__process_masks(self);
+        if (status.repr) {
+          goto suspend;
+        }
       }
     } else if (v_bitmap_info_len >= 40) {
       self->private_data.s_decode_image_config[0].scratch = (v_bitmap_info_len - 40);
-      WUFFS_BASE__COROUTINE_SUSPENSION_POINT(28);
+      WUFFS_BASE__COROUTINE_SUSPENSION_POINT(29);
       if (self->private_data.s_decode_image_config[0].scratch > ((uint64_t)(io2_a_src - iop_a_src))) {
         self->private_data.s_decode_image_config[0].scratch -= ((uint64_t)(io2_a_src - iop_a_src));
         iop_a_src = io2_a_src;
@@ -16616,36 +16635,69 @@
       status = wuffs_base__make_status(wuffs_bmp__error__unsupported_bmp_file);
       goto exit;
     }
-    if (v_bits_per_pixel == 24) {
-      self->private_impl.f_bytes_per_row = ((((((uint64_t)(self->private_impl.f_width)) * 3) + 3) >> 2) << 2);
-      self->private_impl.f_pad_per_row = (self->private_impl.f_width & 3);
-      self->private_impl.f_pixfmt = wuffs_base__utility__make_pixel_format(2147485832);
-    } else if (v_bits_per_pixel == 32) {
-      self->private_impl.f_bytes_per_row = (((uint64_t)(self->private_impl.f_width)) * 4);
-      self->private_impl.f_pad_per_row = 0;
-      if (self->private_impl.f_channel_masks[3] == 0) {
-        self->private_impl.f_pixfmt = wuffs_base__utility__make_pixel_format(2415954056);
+    if (self->private_impl.f_compression == 0) {
+      if (self->private_impl.f_bits_per_pixel == 16) {
+        self->private_impl.f_compression = 3;
+        self->private_impl.f_channel_masks[0] = 31;
+        self->private_impl.f_channel_masks[1] = 992;
+        self->private_impl.f_channel_masks[2] = 31744;
+        self->private_impl.f_channel_masks[3] = 0;
+        WUFFS_BASE__COROUTINE_SUSPENSION_POINT(30);
+        status = wuffs_bmp__decoder__process_masks(self);
+        if (status.repr) {
+          goto suspend;
+        }
+        self->private_impl.f_src_pixfmt = 2164308923;
+      } else if (self->private_impl.f_bits_per_pixel == 24) {
+        self->private_impl.f_src_pixfmt = 2147485832;
+      } else if (self->private_impl.f_bits_per_pixel == 32) {
+        if (self->private_impl.f_channel_masks[3] == 0) {
+          self->private_impl.f_src_pixfmt = 2415954056;
+        } else {
+          self->private_impl.f_src_pixfmt = 2164295816;
+        }
       } else {
-        self->private_impl.f_pixfmt = wuffs_base__utility__make_pixel_format(2164295816);
+        status = wuffs_base__make_status(wuffs_bmp__error__unsupported_bmp_file);
+        goto exit;
+      }
+    } else if (self->private_impl.f_compression == 3) {
+      if ((self->private_impl.f_bits_per_pixel == 16) || (self->private_impl.f_bits_per_pixel == 32)) {
+        self->private_impl.f_src_pixfmt = 2164308923;
+      } else {
+        status = wuffs_base__make_status(wuffs_bmp__error__unsupported_bmp_file);
+        goto exit;
       }
     } else {
       status = wuffs_base__make_status(wuffs_bmp__error__unsupported_bmp_file);
       goto exit;
     }
-    if (v_compression != 0) {
-      status = wuffs_base__make_status(wuffs_bmp__error__unsupported_bmp_file);
-      goto exit;
+    if (self->private_impl.f_bits_per_pixel == 16) {
+      self->private_impl.f_bytes_per_row = ((((((uint64_t)(self->private_impl.f_width)) * 2) + 3) >> 2) << 2);
+      self->private_impl.f_pad_per_row = ((self->private_impl.f_width & 1) * 2);
+    } else if (self->private_impl.f_bits_per_pixel == 24) {
+      self->private_impl.f_bytes_per_row = ((((((uint64_t)(self->private_impl.f_width)) * 3) + 3) >> 2) << 2);
+      self->private_impl.f_pad_per_row = (self->private_impl.f_width & 3);
+    } else if (self->private_impl.f_bits_per_pixel == 32) {
+      self->private_impl.f_bytes_per_row = (((uint64_t)(self->private_impl.f_width)) * 4);
+      self->private_impl.f_pad_per_row = 0;
     }
     self->private_impl.f_frame_config_io_position = wuffs_base__u64__sat_add(a_src->meta.pos, ((uint64_t)(iop_a_src - io0_a_src)));
     if (a_dst != NULL) {
+      v_dst_pixfmt = 2164295816;
+      if ((self->private_impl.f_channel_num_bits[0] > 8) ||
+          (self->private_impl.f_channel_num_bits[1] > 8) ||
+          (self->private_impl.f_channel_num_bits[2] > 8) ||
+          (self->private_impl.f_channel_num_bits[3] > 8)) {
+        v_dst_pixfmt = 2164308923;
+      }
       wuffs_base__image_config__set(
           a_dst,
-          2164295816,
+          v_dst_pixfmt,
           0,
           self->private_impl.f_width,
           self->private_impl.f_height,
           self->private_impl.f_frame_config_io_position,
-          true);
+          (self->private_impl.f_channel_masks[3] == 0));
     }
     self->private_impl.f_call_sequence = 1;
 
@@ -16660,8 +16712,6 @@
   self->private_impl.p_decode_image_config[0] = wuffs_base__status__is_suspension(&status) ? coro_susp_point : 0;
   self->private_impl.active_coroutine = wuffs_base__status__is_suspension(&status) ? 1 : 0;
   self->private_data.s_decode_image_config[0].v_bitmap_info_len = v_bitmap_info_len;
-  self->private_data.s_decode_image_config[0].v_bits_per_pixel = v_bits_per_pixel;
-  self->private_data.s_decode_image_config[0].v_compression = v_compression;
 
   goto exit;
   exit:
@@ -16882,7 +16932,7 @@
       v_status = wuffs_base__pixel_swizzler__prepare(&self->private_impl.f_swizzler,
           wuffs_base__pixel_buffer__pixel_format(a_dst),
           wuffs_base__pixel_buffer__palette(a_dst),
-          self->private_impl.f_pixfmt,
+          wuffs_base__utility__make_pixel_format(self->private_impl.f_src_pixfmt),
           wuffs_base__utility__empty_slice_u8(),
           a_blend);
       if ( ! wuffs_base__status__is_ok(&v_status)) {
@@ -16896,12 +16946,22 @@
         goto ok;
       }
       while (true) {
-        if (a_src) {
-          a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr));
-        }
-        v_status = wuffs_bmp__decoder__swizzle(self, a_dst, a_src);
-        if (a_src) {
-          iop_a_src = a_src->data.ptr + a_src->meta.ri;
+        if (self->private_impl.f_compression == 0) {
+          if (a_src) {
+            a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr));
+          }
+          v_status = wuffs_bmp__decoder__swizzle_compress0(self, a_dst, a_src);
+          if (a_src) {
+            iop_a_src = a_src->data.ptr + a_src->meta.ri;
+          }
+        } else {
+          if (a_src) {
+            a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr));
+          }
+          v_status = wuffs_bmp__decoder__swizzle_compress3(self, a_dst, a_src);
+          if (a_src) {
+            iop_a_src = a_src->data.ptr + a_src->meta.ri;
+          }
         }
         if (wuffs_base__status__is_ok(&v_status)) {
           goto label__0__break;
@@ -16945,10 +17005,10 @@
   return status;
 }
 
-// -------- func bmp.decoder.swizzle
+// -------- func bmp.decoder.swizzle_compress0
 
 static wuffs_base__status
-wuffs_bmp__decoder__swizzle(
+wuffs_bmp__decoder__swizzle_compress0(
     wuffs_bmp__decoder* self,
     wuffs_base__pixel_buffer* a_dst,
     wuffs_base__io_buffer* a_src) {
@@ -17041,6 +17101,142 @@
   return status;
 }
 
+// -------- func bmp.decoder.swizzle_compress3
+// Unpacks bit-field source pixels into f_scratch (4 channels x 16 bits each per pixel), then swizzles f_scratch into a_dst.
+static wuffs_base__status
+wuffs_bmp__decoder__swizzle_compress3(
+    wuffs_bmp__decoder* self,  // Decoder state: masks, shifts, scratch and the dst_x/dst_y cursor.
+    wuffs_base__pixel_buffer* a_dst,  // Destination; only plane 0 is written.
+    wuffs_base__io_buffer* a_src) {  // Source bytes; meta.ri is updated on exit.
+  wuffs_base__status status = wuffs_base__make_status(NULL);
+  // Returns ok once f_dst_y reaches f_dst_y_end, the short-read note when padding bytes are unavailable or nothing was swizzled, or unsupported_option.
+  wuffs_base__pixel_format v_dst_pixfmt = {0};
+  uint32_t v_dst_bits_per_pixel = 0;
+  uint64_t v_dst_bytes_per_pixel = 0;
+  uint64_t v_dst_bytes_per_row = 0;
+  wuffs_base__table_u8 v_tab = {0};
+  wuffs_base__slice_u8 v_dst = {0};
+  uint64_t v_i = 0;
+  uint64_t v_n = 0;
+  uint32_t v_p0 = 0;
+  uint32_t v_p1 = 0;
+  uint32_t v_p1_temp = 0;
+  uint32_t v_num_bits = 0;
+  uint32_t v_c = 0;
+  uint32_t v_c32 = 0;
+  uint32_t v_channel = 0;
+
+  const uint8_t* iop_a_src = NULL;
+  const uint8_t* io0_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+  const uint8_t* io1_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+  const uint8_t* io2_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+  if (a_src) {
+    io0_a_src = a_src->data.ptr;
+    io1_a_src = io0_a_src + a_src->meta.ri;
+    iop_a_src = io1_a_src;
+    io2_a_src = io0_a_src + a_src->meta.wi;
+  }
+
+  v_dst_pixfmt = wuffs_base__pixel_buffer__pixel_format(a_dst);
+  v_dst_bits_per_pixel = wuffs_base__pixel_format__bits_per_pixel(&v_dst_pixfmt);
+  if ((v_dst_bits_per_pixel & 7) != 0) {  // Only whole-byte destination pixels are handled.
+    status = wuffs_base__make_status(wuffs_base__error__unsupported_option);
+    goto exit;
+  }
+  v_dst_bytes_per_pixel = ((uint64_t)((v_dst_bits_per_pixel / 8)));
+  v_dst_bytes_per_row = (((uint64_t)(self->private_impl.f_width)) * v_dst_bytes_per_pixel);
+  v_tab = wuffs_base__pixel_buffer__plane(a_dst, 0);
+  label__outer__continue:;
+  while (true) {
+    while (self->private_impl.f_pending_pad > 0) {  // Skip pending row padding, one source byte at a time.
+      if (((uint64_t)(io2_a_src - iop_a_src)) <= 0) {
+        status = wuffs_base__make_status(wuffs_bmp__note__internal_note_short_read);
+        goto ok;
+      }
+      self->private_impl.f_pending_pad -= 1;
+      (iop_a_src += 1, wuffs_base__make_empty_struct());
+    }
+    label__inner__continue:;
+    while (true) {
+      if (self->private_impl.f_dst_x == self->private_impl.f_width) {  // Row complete: step to the next row.
+        self->private_impl.f_dst_x = 0;
+        self->private_impl.f_dst_y += self->private_impl.f_dst_y_inc;
+        if (self->private_impl.f_dst_y == self->private_impl.f_dst_y_end) {
+          goto label__outer__break;
+        } else if (self->private_impl.f_pad_per_row != 0) {
+          self->private_impl.f_pending_pad = self->private_impl.f_pad_per_row;
+          goto label__outer__continue;
+        }
+      }
+      v_p1_temp = (self->private_impl.f_width - self->private_impl.f_dst_x);
+      v_p1 = wuffs_base__u32__min(v_p1_temp, 256);  // At most 256 pixels per batch: 256 * 8 bytes fills the 2048-byte f_scratch.
+      v_p0 = 0;
+      while ((v_p0 < v_p1) && (((uint64_t)(io2_a_src - iop_a_src)) >= 2)) {
+        if (self->private_impl.f_bits_per_pixel == 16) {  // 2 source bytes per pixel.
+          if (((uint64_t)(io2_a_src - iop_a_src)) < 2) {
+            goto label__0__break;
+          }
+          v_c32 = ((uint32_t)(wuffs_base__load_u16le__no_bounds_check(iop_a_src)));
+          (iop_a_src += 2, wuffs_base__make_empty_struct());
+        } else {  // Otherwise 4 source bytes per pixel.
+          if (((uint64_t)(io2_a_src - iop_a_src)) < 4) {
+            goto label__0__break;
+          }
+          v_c32 = wuffs_base__load_u32le__no_bounds_check(iop_a_src);
+          (iop_a_src += 4, wuffs_base__make_empty_struct());
+        }
+        v_channel = 0;
+        while (v_channel < 4) {
+          if (self->private_impl.f_channel_num_bits[v_channel] == 0) {  // Channel has no bits: emit 0xFFFF.
+            self->private_data.f_scratch[((8 * v_p0) + (2 * v_channel) + 0)] = 255;
+            self->private_data.f_scratch[((8 * v_p0) + (2 * v_channel) + 1)] = 255;
+          } else {
+            v_c = ((v_c32 & self->private_impl.f_channel_masks[v_channel]) >> self->private_impl.f_channel_shifts[v_channel]);
+            v_num_bits = ((uint32_t)(self->private_impl.f_channel_num_bits[v_channel]));
+            while (v_num_bits < 16) {  // Repeat the bit pattern until it is at least 16 bits wide.
+              v_c |= (v_c << v_num_bits);
+              v_num_bits *= 2;
+            }
+            v_c >>= (v_num_bits - 16);  // Keep the top 16 of the v_num_bits bits.
+            self->private_data.f_scratch[((8 * v_p0) + (2 * v_channel) + 0)] = ((uint8_t)((255 & (v_c >> 0))));  // Low byte first.
+            self->private_data.f_scratch[((8 * v_p0) + (2 * v_channel) + 1)] = ((uint8_t)((255 & (v_c >> 8))));  // Then high byte.
+          }
+          v_channel += 1;
+        }
+        v_p0 += 1;
+      }
+      label__0__break:;
+      v_dst = wuffs_base__table_u8__row(v_tab, self->private_impl.f_dst_y);
+      if (v_dst_bytes_per_row < ((uint64_t)(v_dst.len))) {  // Clip the row to f_width pixels' worth of bytes.
+        v_dst = wuffs_base__slice_u8__subslice_j(v_dst, v_dst_bytes_per_row);
+      }
+      v_i = (((uint64_t)(self->private_impl.f_dst_x)) * v_dst_bytes_per_pixel);
+      if (v_i >= ((uint64_t)(v_dst.len))) {  // NOTE(review): f_dst_x is not advanced on this path; confirm dst rows are never this short.
+        goto label__inner__continue;
+      }
+      v_n = wuffs_base__pixel_swizzler__swizzle_interleaved_from_slice(&self->private_impl.f_swizzler, wuffs_base__slice_u8__subslice_i(v_dst, v_i), wuffs_base__utility__empty_slice_u8(), wuffs_base__slice_u8__subslice_j(wuffs_base__make_slice_u8(self->private_data.f_scratch, 2048), (8 * v_p0)));
+      if (v_n == 0) {  // Nothing was swizzled: report a short read.
+        status = wuffs_base__make_status(wuffs_bmp__note__internal_note_short_read);
+        goto ok;
+      }
+      wuffs_base__u32__sat_add_indirect(&self->private_impl.f_dst_x, ((uint32_t)((v_n & 4294967295))));
+    }
+  }
+  label__outer__break:;
+  status = wuffs_base__make_status(NULL);
+  goto ok;
+
+  goto ok;
+  ok:
+  goto exit;
+  exit:
+  if (a_src) {
+    a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr));
+  }
+
+  return status;
+}
+
 // -------- func bmp.decoder.skip_frame
 
 static wuffs_base__status
@@ -17284,6 +17480,50 @@
   return wuffs_base__utility__make_range_ii_u64(0, 0);
 }
 
+// -------- func bmp.decoder.process_masks
+// For each of the 4 channel masks, records the index of its lowest set bit (f_channel_shifts) and the length of its run of set bits (f_channel_num_bits).
+static wuffs_base__status
+wuffs_bmp__decoder__process_masks(
+    wuffs_bmp__decoder* self) {  // Reads f_channel_masks; writes f_channel_shifts and f_channel_num_bits.
+  wuffs_base__status status = wuffs_base__make_status(NULL);  // Stays ok unless a mask is rejected with bad_header below.
+
+  uint32_t v_i = 0;
+  uint32_t v_mask = 0;
+  uint32_t v_n = 0;
+
+  while (v_i < 4) {
+    v_mask = self->private_impl.f_channel_masks[v_i];
+    if (v_mask != 0) {
+      v_n = 0;
+      while ((v_mask & 1) == 0) {  // Count trailing zero bits; terminates because v_mask is non-zero.
+        v_n += 1;
+        v_mask >>= 1;
+      }
+      self->private_impl.f_channel_shifts[v_i] = ((uint8_t)((v_n & 31)));
+      v_n = 0;
+      while ((v_mask & 1) == 1) {  // Count the run of one bits that starts at the lowest set bit.
+        v_n += 1;
+        v_mask >>= 1;
+      }
+      if ((v_mask != 0) || (v_n > 32)) {  // Leftover set bits mean the mask is not a single contiguous run.
+        status = wuffs_base__make_status(wuffs_bmp__error__bad_header);
+        goto exit;
+      }
+      self->private_impl.f_channel_num_bits[v_i] = ((uint8_t)(v_n));
+    } else if (v_i != 3) {  // Only channel 3 may have a zero mask; its shift and num_bits are then left unchanged.
+      status = wuffs_base__make_status(wuffs_bmp__error__bad_header);
+      goto exit;
+    }
+    v_i += 1;
+  }
+
+  goto ok;
+  ok:
+  goto exit;
+  exit:
+  return status;
+}
+
 #endif  // !defined(WUFFS_CONFIG__MODULES) || defined(WUFFS_CONFIG__MODULE__BMP)
 
 #if !defined(WUFFS_CONFIG__MODULES) || defined(WUFFS_CONFIG__MODULE__CBOR)
diff --git a/std/bmp/decode_bmp.wuffs b/std/bmp/decode_bmp.wuffs
index 5851cf8..ae7683e 100644
--- a/std/bmp/decode_bmp.wuffs
+++ b/std/bmp/decode_bmp.wuffs
@@ -28,7 +28,7 @@
 	top_down      : base.bool,
 	pad_per_row   : base.u32[..= 3],
 	bytes_per_row : base.u64[..= 0x0000_0001_FFFF_FFFC],  // 4 * 0x7FFF_FFFF
-	pixfmt        : base.pixel_format,
+	src_pixfmt    : base.u32,
 
 	io_redirect_fourcc : base.u32,
 	io_redirect_pos    : base.u64,
@@ -37,9 +37,14 @@
 
 	padding : base.u32,
 
+	bits_per_pixel : base.u32,
+	compression    : base.u32,
+
 	// channel_etc's indexes are: B, G, R, A. This is in Wuffs' default order,
 	// which isn't the RGBA order they're listed in the wire format.
-	channel_masks : array[4] base.u32,
+	channel_masks    : array[4] base.u32,
+	channel_shifts   : array[4] base.u8[..= 31],
+	channel_num_bits : array[4] base.u8[..= 32],
 
 	dst_x     : base.u32,
 	dst_y     : base.u32,
@@ -50,6 +55,8 @@
 
 	swizzler : base.pixel_swizzler,
 	util     : base.utility,
+)(
+	scratch : array[2048] base.u8,  // 2048 = 256 * (8 bytes per BGRA_NONPREMUL_4X16LE).
 )
 
 pub func decoder.set_quirk_enabled!(quirk: base.u32, enabled: base.bool) {
@@ -61,8 +68,7 @@
 	var width           : base.u32
 	var height          : base.u32
 	var planes          : base.u32
-	var bits_per_pixel  : base.u32
-	var compression     : base.u32
+	var dst_pixfmt      : base.u32
 
 	if (this.call_sequence <> 0) or (this.io_redirect_fourcc == 1) {
 		return base."#bad call sequence"
@@ -132,13 +138,13 @@
 		return "#unsupported BMP file"
 	}
 
-	bits_per_pixel = args.src.read_u16le_as_u32?()
-	compression = args.src.read_u32le?()
-	if bits_per_pixel == 0 {
-		if compression == 4 {
+	this.bits_per_pixel = args.src.read_u16le_as_u32?()
+	this.compression = args.src.read_u32le?()
+	if this.bits_per_pixel == 0 {
+		if this.compression == 4 {
 			this.io_redirect_fourcc = 'JPEG'be
 			return base."@I/O redirect"
-		} else if compression == 5 {
+		} else if this.compression == 5 {
 			this.io_redirect_fourcc = 'PNG 'be
 			return base."@I/O redirect"
 		}
@@ -150,9 +156,13 @@
 	// the version 3 BITMAPINFOHEADER (whose total size is 40).
 	args.src.skip_u32?(n: 40 - 20)
 
-	// Read the channel_masks when compression is 3 (BITFIELDS) or 6
-	// (ALPHABITFIELDS).
-	if (compression == 3) or (compression == 6) {
+	// Treat 6 (ALPHABITFIELDS) the same as 3 (BITFIELDS).
+	if this.compression == 6 {
+		this.compression = 3
+	}
+
+	// Read the channel_masks when this.compression is 3 (BITFIELDS).
+	if this.compression == 3 {
 		if bitmap_info_len >= 52 {
 			this.channel_masks[2] = args.src.read_u32le?()
 			this.channel_masks[1] = args.src.read_u32le?()
@@ -167,15 +177,16 @@
 			if (this.channel_masks[0] == 0x0000_00FF) and
 				(this.channel_masks[1] == 0x0000_FF00) and
 				(this.channel_masks[2] == 0x00FF_0000) {
-				if bits_per_pixel == 24 {
-					compression = 0
-				} else if bits_per_pixel == 32 {
+				if this.bits_per_pixel == 24 {
+					this.compression = 0
+				} else if this.bits_per_pixel == 32 {
 					if (this.channel_masks[3] == 0) or
 						(this.channel_masks[3] == 0xFF00_0000) {
-						compression = 0
+						this.compression = 0
 					}
 				}
 			}
+			this.process_masks?()
 		}
 	} else if bitmap_info_len >= 40 {
 		// Skip the rest of the BITMAPINFOHEADER.
@@ -184,40 +195,69 @@
 		return "#unsupported BMP file"
 	}
 
-	if bits_per_pixel == 24 {
-		// 3 bytes per pixel, but row lengths are rounded up to multiples of 4.
-		// The "((x + 3) >> 2) << 2" dance rounds x up.
-		this.bytes_per_row = ((((this.width as base.u64) * 3) + 3) >> 2) << 2
-		this.pad_per_row = this.width & 3
-		this.pixfmt = this.util.make_pixel_format(repr: base.PIXEL_FORMAT__BGR)
-	} else if bits_per_pixel == 32 {
-		this.bytes_per_row = (this.width as base.u64) * 4
-		this.pad_per_row = 0
-		if this.channel_masks[3] == 0 {
-			this.pixfmt = this.util.make_pixel_format(repr: base.PIXEL_FORMAT__BGRX)
+	if this.compression == 0 {  // 0 means no compression.
+		if this.bits_per_pixel == 16 {
+			// Implement BMP's 16-bit default (BGRX_5551) as BITFIELDS.
+			this.compression = 3
+			this.channel_masks[0] = 0x001F
+			this.channel_masks[1] = 0x03E0
+			this.channel_masks[2] = 0x7C00
+			this.channel_masks[3] = 0x0000
+			this.process_masks?()
+			this.src_pixfmt = base.PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE
+		} else if this.bits_per_pixel == 24 {
+			this.src_pixfmt = base.PIXEL_FORMAT__BGR
+		} else if this.bits_per_pixel == 32 {
+			if this.channel_masks[3] == 0 {
+				this.src_pixfmt = base.PIXEL_FORMAT__BGRX
+			} else {
+				this.src_pixfmt = base.PIXEL_FORMAT__BGRA_NONPREMUL
+			}
 		} else {
-			this.pixfmt = this.util.make_pixel_format(repr: base.PIXEL_FORMAT__BGRA_NONPREMUL)
+			return "#unsupported BMP file"
 		}
+
+	} else if this.compression == 3 {  // 3 means BITFIELDS.
+		if (this.bits_per_pixel == 16) or (this.bits_per_pixel == 32) {
+			this.src_pixfmt = base.PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE
+		} else {
+			return "#unsupported BMP file"
+		}
+
 	} else {
-		// TODO: support other bits_per_pixel's.
 		return "#unsupported BMP file"
 	}
 
-	if compression <> 0 {
-		// TODO: support compression.
-		return "#unsupported BMP file"
+	// The "((x + 3) >> 2) << 2" dance rounds x up to a multiple of 4.
+	if this.bits_per_pixel == 16 {
+		this.bytes_per_row = ((((this.width as base.u64) * 2) + 3) >> 2) << 2
+		this.pad_per_row = (this.width & 1) * 2
+	} else if this.bits_per_pixel == 24 {
+		this.bytes_per_row = ((((this.width as base.u64) * 3) + 3) >> 2) << 2
+		this.pad_per_row = this.width & 3
+	} else if this.bits_per_pixel == 32 {
+		this.bytes_per_row = (this.width as base.u64) * 4
+		this.pad_per_row = 0
 	}
 
 	this.frame_config_io_position = args.src.position()
 
 	if args.dst <> nullptr {
+		dst_pixfmt = base.PIXEL_FORMAT__BGRA_NONPREMUL
+		if (this.channel_num_bits[0] > 8) or
+			(this.channel_num_bits[1] > 8) or
+			(this.channel_num_bits[2] > 8) or
+			(this.channel_num_bits[3] > 8) {
+			dst_pixfmt = base.PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE
+		}
+
 		args.dst.set!(
-			pixfmt: base.PIXEL_FORMAT__BGRA_NONPREMUL,  // TODO: this.pixfmt instead?
+			pixfmt: dst_pixfmt,
 			pixsub: 0,
 			width: this.width,
 			height: this.height,
 			first_frame_io_position: this.frame_config_io_position,
-			first_frame_is_opaque: true)
+			first_frame_is_opaque: this.channel_masks[3] == 0)
 	}
 
 	this.call_sequence = 1
@@ -283,7 +323,7 @@
 		status = this.swizzler.prepare!(
 			dst_pixfmt: args.dst.pixel_format(),
 			dst_palette: args.dst.palette(),
-			src_pixfmt: this.pixfmt,
+			src_pixfmt: this.util.make_pixel_format(repr: this.src_pixfmt),
 			src_palette: this.util.empty_slice_u8(),
 			blend: args.blend)
 		if not status.is_ok() {
@@ -291,7 +331,12 @@
 		}
 
 		while true {
-			status = this.swizzle!(dst: args.dst, src: args.src)
+			if this.compression == 0 {
+				status = this.swizzle_compress0!(dst: args.dst, src: args.src)
+			} else {
+				status = this.swizzle_compress3!(dst: args.dst, src: args.src)
+			}
+
 			if status.is_ok() {
 				break
 			} else if status <> "@internal note: short read" {
@@ -304,7 +349,7 @@
 	this.call_sequence = 3
 }
 
-pri func decoder.swizzle!(dst: ptr base.pixel_buffer, src: base.io_reader) base.status {
+pri func decoder.swizzle_compress0!(dst: ptr base.pixel_buffer, src: base.io_reader) base.status {
 	var dst_pixfmt          : base.pixel_format
 	var dst_bits_per_pixel  : base.u32[..= 256]
 	var dst_bytes_per_pixel : base.u64[..= 32]
@@ -370,6 +415,132 @@
 	return ok
 }
 
+pri func decoder.swizzle_compress3!(dst: ptr base.pixel_buffer, src: base.io_reader) base.status {  // Unpacks bit-field pixels into this.scratch, then swizzles them into dst.
+	var dst_pixfmt          : base.pixel_format
+	var dst_bits_per_pixel  : base.u32[..= 256]
+	var dst_bytes_per_pixel : base.u64[..= 32]
+	var dst_bytes_per_row   : base.u64
+	var tab                 : table base.u8
+	var dst                 : slice base.u8
+	var i                   : base.u64
+	var n                   : base.u64
+
+	var p0      : base.u32[..= 256]
+	var p1      : base.u32[..= 256]
+	var p1_temp : base.u32
+
+	var num_bits : base.u32[..= 32]
+	var c        : base.u32
+	var c32      : base.u32
+	var channel  : base.u32[..= 4]
+
+	// TODO: the dst_pixfmt variable shouldn't be necessary. We should be able
+	// to chain the two calls: "args.dst.pixel_format().bits_per_pixel()".
+	dst_pixfmt = args.dst.pixel_format()
+	dst_bits_per_pixel = dst_pixfmt.bits_per_pixel()
+	if (dst_bits_per_pixel & 7) <> 0 {  // Only whole-byte dst pixels are handled.
+		return base."#unsupported option"
+	}
+	dst_bytes_per_pixel = (dst_bits_per_pixel / 8) as base.u64
+	dst_bytes_per_row = (this.width as base.u64) * dst_bytes_per_pixel
+	tab = args.dst.plane(p: 0)
+	// Each outer iteration first skips pending row padding, then decodes pixels until a row end needs padding or the last row is done.
+	while.outer true {
+		while this.pending_pad > 0 {  // Skip padding one src byte at a time.
+			if args.src.length() <= 0 {
+				return "@internal note: short read"
+			}
+			this.pending_pad -= 1
+			args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+		} endwhile
+
+		while.inner true {
+			if this.dst_x == this.width {  // Row complete: step to the next row.
+				this.dst_x = 0
+				this.dst_y ~mod+= this.dst_y_inc
+				if this.dst_y == this.dst_y_end {
+					break.outer
+				} else if this.pad_per_row <> 0 {
+					this.pending_pad = this.pad_per_row
+					continue.outer
+				}
+			}
+
+			// -------- BEGIN convert to PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE.
+			p1_temp = this.width ~mod- this.dst_x
+			p1 = p1_temp.min(a: 256)  // this.scratch holds at most 256 pixels of 8 bytes each.
+			p0 = 0
+			while (p0 < p1) and (args.src.length() >= 2) {
+				assert p0 < 256 via "a < b: a < c; c <= b"(c: p1)
+				if this.bits_per_pixel == 16 {  // 2 src bytes per pixel.
+					if args.src.length() < 2 {
+						break
+					}
+					c32 = args.src.peek_u16le_as_u32()
+					args.src.skip_u32_fast!(actual: 2, worst_case: 2)
+				} else {  // Otherwise 4 src bytes per pixel.
+					if args.src.length() < 4 {
+						break
+					}
+					c32 = args.src.peek_u32le()
+					args.src.skip_u32_fast!(actual: 4, worst_case: 4)
+				}
+
+				channel = 0
+				while channel < 4,
+					inv p0 < 256,
+				{
+					if this.channel_num_bits[channel] == 0 {  // Channel has no bits: emit 0xFFFF.
+						this.scratch[(8 * p0) + (2 * channel) + 0] = 0xFF
+						this.scratch[(8 * p0) + (2 * channel) + 1] = 0xFF
+					} else {
+						c = (c32 & this.channel_masks[channel]) >> this.channel_shifts[channel]
+						num_bits = this.channel_num_bits[channel] as base.u32
+						while num_bits < 16,
+							inv p0 < 256,
+							inv channel < 4,
+							post num_bits >= 16,
+						{
+							c |= c ~mod<< num_bits  // Repeat the bit pattern, doubling its width.
+							num_bits *= 2
+						} endwhile
+						c >>= num_bits - 16  // Keep the top 16 of the num_bits bits.
+						this.scratch[(8 * p0) + (2 * channel) + 0] = (0xFF & (c >> 0)) as base.u8
+						this.scratch[(8 * p0) + (2 * channel) + 1] = (0xFF & (c >> 8)) as base.u8
+					}
+
+					channel += 1
+				} endwhile
+
+				p0 += 1
+			} endwhile
+			// -------- END   convert to PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE.
+
+			dst = tab.row(y: this.dst_y)
+			if dst_bytes_per_row < dst.length() {  // Clip the row to this.width pixels' worth of bytes.
+				dst = dst[.. dst_bytes_per_row]
+			}
+			i = (this.dst_x as base.u64) * dst_bytes_per_pixel
+			if i >= dst.length() {  // NOTE(review): this.dst_x is not advanced on this path; confirm dst rows are never this short.
+				// TODO: advance args.src if the dst pixel_buffer bounds is
+				// smaller than this BMP's image bounds?
+				continue.inner
+			}
+
+			n = this.swizzler.swizzle_interleaved_from_slice!(
+				dst: dst[i ..],
+				dst_palette: this.util.empty_slice_u8(),
+				src: this.scratch[.. 8 * p0])
+			if n == 0 {  // Nothing was swizzled: report a short read.
+				return "@internal note: short read"
+			}
+			this.dst_x ~sat+= (n & 0xFFFF_FFFF) as base.u32
+		} endwhile.inner
+	} endwhile.outer
+
+	return ok
+}
+
 pri func decoder.skip_frame?(src: base.io_reader) {
 	args.src.skip_u32?(n: this.padding)
 	args.src.skip?(n: this.bytes_per_row * (this.height as base.u64))
@@ -439,3 +610,39 @@
 pub func decoder.workbuf_len() base.range_ii_u64 {
 	return this.util.make_range_ii_u64(min_incl: 0, max_incl: 0)
 }
+
+pri func decoder.process_masks?() {  // Derives channel_shifts and channel_num_bits from channel_masks.
+	var i    : base.u32
+	var mask : base.u32
+	var n    : base.u32
+
+	while i < 4 {
+		mask = this.channel_masks[i]
+		if mask <> 0 {
+			n = 0
+			while (mask & 1) == 0,
+				inv i < 4,
+			{
+				n ~mod+= 1  // Counting trailing zero bits; ends because mask is non-zero.
+				mask >>= 1
+			} endwhile
+			this.channel_shifts[i] = (n & 31) as base.u8
+
+			n = 0
+			while (mask & 1) == 1,
+				inv i < 4,
+			{
+				n ~mod+= 1  // Counting the run of one bits that starts at the lowest set bit.
+				mask >>= 1
+			} endwhile
+			if (mask <> 0) or (n > 32) {  // Leftover set bits mean the mask is not a single contiguous run.
+				return "#bad header"
+			}
+			this.channel_num_bits[i] = n as base.u8
+		} else if i <> 3 {  // Only index 3 (A, in B, G, R, A order) may have a zero mask; its shift and num_bits are then left unchanged.
+			return "#bad header"
+		}
+
+		i += 1
+	} endwhile
+}