Have std/bmp decode 1, 2 or 4 bits per pixel
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index e87e854..ec528c8 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -16129,6 +16129,8 @@
 
 #define WUFFS_BMP__COMPRESSION_ALPHABITFIELDS 6
 
+#define WUFFS_BMP__COMPRESSION_LOW_BIT_DEPTH 256  // Assigned by the decoder itself when an uncompressed image has 1, 2 or 4 bits per pixel.
+
 #define WUFFS_BMP__RLE_STATE_NEUTRAL 0
 
 #define WUFFS_BMP__RLE_STATE_RUN 1
@@ -16164,6 +16166,12 @@
     wuffs_base__io_buffer* a_src);
 
 static wuffs_base__status
+wuffs_bmp__decoder__swizzle_low_bit_depth(
+    wuffs_bmp__decoder* self,
+    wuffs_base__pixel_buffer* a_dst,
+    wuffs_base__io_buffer* a_src);
+
+static wuffs_base__status
 wuffs_bmp__decoder__skip_frame(
     wuffs_bmp__decoder* self,
     wuffs_base__io_buffer* a_src);
@@ -16327,6 +16335,7 @@
   uint32_t v_height = 0;
   uint32_t v_planes = 0;
   uint32_t v_dst_pixfmt = 0;
+  uint32_t v_byte_width = 0;
 
   const uint8_t* iop_a_src = NULL;
   const uint8_t* io0_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
@@ -16838,7 +16847,10 @@
       goto exit;
     }
     if (self->private_impl.f_compression == 0) {
-      if (self->private_impl.f_bits_per_pixel == 8) {
+      if ((self->private_impl.f_bits_per_pixel == 1) || (self->private_impl.f_bits_per_pixel == 2) || (self->private_impl.f_bits_per_pixel == 4)) {
+        self->private_impl.f_src_pixfmt = 2198077448;
+        self->private_impl.f_compression = 256;
+      } else if (self->private_impl.f_bits_per_pixel == 8) {
         self->private_impl.f_src_pixfmt = 2198077448;
       } else if (self->private_impl.f_bits_per_pixel == 16) {
         self->private_impl.f_compression = 3;
@@ -16871,6 +16883,13 @@
         status = wuffs_base__make_status(wuffs_bmp__error__unsupported_bmp_file);
         goto exit;
       }
+    } else if (self->private_impl.f_compression == 2) {
+      if (self->private_impl.f_bits_per_pixel == 4) {
+        self->private_impl.f_src_pixfmt = 2198077448;
+      } else {
+        status = wuffs_base__make_status(wuffs_bmp__error__unsupported_bmp_file);
+        goto exit;
+      }
     } else if (self->private_impl.f_compression == 3) {
       if ((self->private_impl.f_bits_per_pixel == 16) || (self->private_impl.f_bits_per_pixel == 32)) {
         self->private_impl.f_src_pixfmt = 2164308923;
@@ -16882,7 +16901,19 @@
       status = wuffs_base__make_status(wuffs_bmp__error__unsupported_bmp_file);
       goto exit;
     }
-    if (self->private_impl.f_bits_per_pixel == 8) {
+    if (self->private_impl.f_bits_per_pixel == 1) {
+      v_byte_width = ((self->private_impl.f_width >> 3) + (((self->private_impl.f_width & 7) + 7) >> 3));
+      self->private_impl.f_bytes_per_row = ((((((uint64_t)(v_byte_width)) * 1) + 3) >> 2) << 2);
+      self->private_impl.f_pad_per_row = ((4 - (v_byte_width & 3)) & 3);
+    } else if (self->private_impl.f_bits_per_pixel == 2) {
+      v_byte_width = ((self->private_impl.f_width >> 2) + (((self->private_impl.f_width & 3) + 3) >> 2));
+      self->private_impl.f_bytes_per_row = ((((((uint64_t)(v_byte_width)) * 1) + 3) >> 2) << 2);
+      self->private_impl.f_pad_per_row = ((4 - (v_byte_width & 3)) & 3);
+    } else if (self->private_impl.f_bits_per_pixel == 4) {
+      v_byte_width = ((self->private_impl.f_width >> 1) + (self->private_impl.f_width & 1));
+      self->private_impl.f_bytes_per_row = ((((((uint64_t)(v_byte_width)) * 1) + 3) >> 2) << 2);
+      self->private_impl.f_pad_per_row = ((4 - (v_byte_width & 3)) & 3);
+    } else if (self->private_impl.f_bits_per_pixel == 8) {
       self->private_impl.f_bytes_per_row = ((((((uint64_t)(self->private_impl.f_width)) * 1) + 3) >> 2) << 2);
       self->private_impl.f_pad_per_row = ((4 - (self->private_impl.f_width & 3)) & 3);
     } else if (self->private_impl.f_bits_per_pixel == 16) {
@@ -17170,7 +17201,7 @@
           if (a_src) {
             iop_a_src = a_src->data.ptr + a_src->meta.ri;
           }
-        } else if (self->private_impl.f_compression == 1) {
+        } else if (self->private_impl.f_compression < 3) {
           if (a_src) {
             a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr));
           }
@@ -17178,7 +17209,7 @@
           if (a_src) {
             iop_a_src = a_src->data.ptr + a_src->meta.ri;
           }
-        } else {
+        } else if (self->private_impl.f_compression == 3) {
           if (a_src) {
             a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr));
           }
@@ -17186,6 +17217,14 @@
           if (a_src) {
             iop_a_src = a_src->data.ptr + a_src->meta.ri;
           }
+        } else {
+          if (a_src) {
+            a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr));
+          }
+          v_status = wuffs_bmp__decoder__swizzle_low_bit_depth(self, a_dst, a_src);
+          if (a_src) {
+            iop_a_src = a_src->data.ptr + a_src->meta.ri;
+          }
         }
         if (wuffs_base__status__is_ok(&v_status)) {
           goto label__0__break;
@@ -17349,7 +17388,10 @@
   uint64_t v_n = 0;
   uint32_t v_p0 = 0;
   uint8_t v_code = 0;
+  uint8_t v_indexes[2] = {0};
   uint32_t v_rle_state = 0;
+  uint32_t v_chunk_bits = 0;
+  uint32_t v_chunk_count = 0;
 
   const uint8_t* iop_a_src = NULL;
   const uint8_t* io0_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
@@ -17409,10 +17451,21 @@
             }
             v_code = wuffs_base__load_u8be__no_bounds_check(iop_a_src);
             (iop_a_src += 1, wuffs_base__make_empty_struct());
-            v_p0 = 0;
-            while (v_p0 < self->private_impl.f_rle_length) {
-              self->private_data.f_scratch[v_p0] = v_code;
-              v_p0 += 1;
+            if (self->private_impl.f_bits_per_pixel == 8) {
+              v_p0 = 0;
+              while (v_p0 < self->private_impl.f_rle_length) {
+                self->private_data.f_scratch[v_p0] = v_code;
+                v_p0 += 1;
+              }
+            } else {
+              v_indexes[0] = (v_code >> 4);
+              v_indexes[1] = (v_code & 15);
+              v_p0 = 0;
+              while (v_p0 < self->private_impl.f_rle_length) {
+                self->private_data.f_scratch[(v_p0 + 0)] = v_indexes[0];
+                self->private_data.f_scratch[(v_p0 + 1)] = v_indexes[1];
+                v_p0 += 2;
+              }
             }
             wuffs_base__pixel_swizzler__swizzle_interleaved_from_slice(&self->private_impl.f_swizzler, v_dst, v_dst_palette, wuffs_base__slice_u8__subslice_j(wuffs_base__make_slice_u8(self->private_data.f_scratch, 2048), self->private_impl.f_rle_length));
             wuffs_base__u32__sat_add_indirect(&self->private_impl.f_dst_x, self->private_impl.f_rle_length);
@@ -17438,19 +17491,38 @@
               goto label__inner__continue;
             }
             self->private_impl.f_rle_length = ((uint32_t)(v_code));
-            self->private_impl.f_rle_padded = ((v_code & 1) != 0);
+            self->private_impl.f_rle_padded = ((self->private_impl.f_bits_per_pixel == 8) && ((v_code & 1) != 0));
             v_rle_state = 3;
             goto label__inner__continue;
           } else if (v_rle_state == 3) {
-            v_n = wuffs_base__pixel_swizzler__limited_swizzle_u32_interleaved_from_reader(
-                &self->private_impl.f_swizzler,
-                self->private_impl.f_rle_length,
-                v_dst,
-                v_dst_palette,
-                &iop_a_src,
-                io2_a_src);
-            wuffs_base__u32__sat_add_indirect(&self->private_impl.f_dst_x, ((uint32_t)((v_n & 4294967295))));
-            wuffs_base__u32__sat_sub_indirect(&self->private_impl.f_rle_length, ((uint32_t)((v_n & 4294967295))));
+            if (self->private_impl.f_bits_per_pixel == 8) {
+              v_n = wuffs_base__pixel_swizzler__limited_swizzle_u32_interleaved_from_reader(
+                  &self->private_impl.f_swizzler,
+                  self->private_impl.f_rle_length,
+                  v_dst,
+                  v_dst_palette,
+                  &iop_a_src,
+                  io2_a_src);
+              wuffs_base__u32__sat_add_indirect(&self->private_impl.f_dst_x, ((uint32_t)((v_n & 4294967295))));
+              wuffs_base__u32__sat_sub_indirect(&self->private_impl.f_rle_length, ((uint32_t)((v_n & 4294967295))));
+            } else {
+              v_chunk_count = ((self->private_impl.f_rle_length + 3) / 4);
+              v_p0 = 0;
+              while ((v_chunk_count > 0) && (((uint64_t)(io2_a_src - iop_a_src)) >= 2)) {
+                v_chunk_bits = ((uint32_t)(wuffs_base__load_u16be__no_bounds_check(iop_a_src)));
+                (iop_a_src += 2, wuffs_base__make_empty_struct());
+                self->private_data.f_scratch[(v_p0 + 0)] = ((uint8_t)((15 & (v_chunk_bits >> 12))));
+                self->private_data.f_scratch[(v_p0 + 1)] = ((uint8_t)((15 & (v_chunk_bits >> 8))));
+                self->private_data.f_scratch[(v_p0 + 2)] = ((uint8_t)((15 & (v_chunk_bits >> 4))));
+                self->private_data.f_scratch[(v_p0 + 3)] = ((uint8_t)((15 & (v_chunk_bits >> 0))));
+                v_p0 = ((v_p0 & 255) + 4);
+                v_chunk_count -= 1;
+              }
+              v_p0 = wuffs_base__u32__min(v_p0, self->private_impl.f_rle_length);
+              wuffs_base__pixel_swizzler__swizzle_interleaved_from_slice(&self->private_impl.f_swizzler, v_dst, v_dst_palette, wuffs_base__slice_u8__subslice_j(wuffs_base__make_slice_u8(self->private_data.f_scratch, 2048), v_p0));
+              wuffs_base__u32__sat_add_indirect(&self->private_impl.f_dst_x, v_p0);
+              wuffs_base__u32__sat_sub_indirect(&self->private_impl.f_rle_length, v_p0);
+            }
             if (self->private_impl.f_rle_length > 0) {
               goto label__goto_suspend__break;
             }
@@ -17684,6 +17756,175 @@
   return status;
 }
 
+// -------- func bmp.decoder.swizzle_low_bit_depth
+
+static wuffs_base__status
+wuffs_bmp__decoder__swizzle_low_bit_depth(
+    wuffs_bmp__decoder* self,
+    wuffs_base__pixel_buffer* a_dst,
+    wuffs_base__io_buffer* a_src) {  // Expands packed 1, 2 or 4 bits-per-pixel palette indexes from a_src to one byte each in f_scratch, then swizzles them into a_dst.
+  wuffs_base__status status = wuffs_base__make_status(NULL);  // NULL is the "ok" status, returned once the last row is done.
+
+  wuffs_base__pixel_format v_dst_pixfmt = {0};
+  uint32_t v_dst_bits_per_pixel = 0;
+  uint64_t v_dst_bytes_per_pixel = 0;
+  uint64_t v_dst_bytes_per_row = 0;
+  wuffs_base__slice_u8 v_dst_palette = {0};
+  wuffs_base__table_u8 v_tab = {0};
+  wuffs_base__slice_u8 v_dst = {0};
+  uint64_t v_i = 0;
+  uint64_t v_n = 0;
+  uint32_t v_p0 = 0;  // Number of unpacked indexes currently held at the start of f_scratch.
+  uint32_t v_chunk_bits = 0;  // The 32-bit big-endian chunk being unpacked.
+  uint32_t v_chunk_count = 0;  // Chunks still to read in the current pass.
+
+  const uint8_t* iop_a_src = NULL;
+  const uint8_t* io0_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+  const uint8_t* io1_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+  const uint8_t* io2_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+  if (a_src) {  // Cache the reader's bounds: iop_a_src starts at the read index, io2_a_src is the write index.
+    io0_a_src = a_src->data.ptr;
+    io1_a_src = io0_a_src + a_src->meta.ri;
+    iop_a_src = io1_a_src;
+    io2_a_src = io0_a_src + a_src->meta.wi;
+  }
+
+  v_dst_pixfmt = wuffs_base__pixel_buffer__pixel_format(a_dst);
+  v_dst_bits_per_pixel = wuffs_base__pixel_format__bits_per_pixel(&v_dst_pixfmt);
+  if ((v_dst_bits_per_pixel & 7) != 0) {  // Only destinations with a whole number of bytes per pixel are handled.
+    status = wuffs_base__make_status(wuffs_base__error__unsupported_option);
+    goto exit;
+  }
+  v_dst_bytes_per_pixel = ((uint64_t)((v_dst_bits_per_pixel / 8)));
+  v_dst_bytes_per_row = (((uint64_t)(self->private_impl.f_width)) * v_dst_bytes_per_pixel);  // Byte length of one image-width destination row.
+  v_dst_palette = wuffs_base__pixel_buffer__palette_or_else(a_dst, wuffs_base__make_slice_u8((self->private_data.f_scratch) + 1024, 1024));  // The fallback palette slice is f_scratch[1024 .. 2048].
+  v_tab = wuffs_base__pixel_buffer__plane(a_dst, 0);
+  label__loop__continue:;
+  while (true) {  // Each pass unpacks and swizzles up to 512 pixels of the current row.
+    if (self->private_impl.f_dst_x == self->private_impl.f_width) {  // Row finished: reset x and step y by f_dst_y_inc.
+      self->private_impl.f_dst_x = 0;
+      self->private_impl.f_dst_y += self->private_impl.f_dst_y_inc;
+      if (self->private_impl.f_dst_y == self->private_impl.f_dst_y_end) {  // Every row has been written.
+        goto label__loop__break;
+      }
+    }
+    v_dst = wuffs_base__table_u8__row(v_tab, self->private_impl.f_dst_y);
+    if (v_dst_bytes_per_row < ((uint64_t)(v_dst.len))) {  // Clamp the row slice to the image width.
+      v_dst = wuffs_base__slice_u8__subslice_j(v_dst, v_dst_bytes_per_row);
+    }
+    v_i = (((uint64_t)(self->private_impl.f_dst_x)) * v_dst_bytes_per_pixel);  // Byte offset of the current x position within the row.
+    if (v_i >= ((uint64_t)(v_dst.len))) {  // NOTE(review): no state visible here changes before looping again; confirm this cannot spin when a_dst is smaller than the image.
+      goto label__loop__continue;
+    }
+    v_dst = wuffs_base__slice_u8__subslice_i(v_dst, v_i);
+    v_p0 = 0;
+    if (self->private_impl.f_bits_per_pixel == 1) {
+      v_chunk_count = ((wuffs_base__u32__sat_sub(self->private_impl.f_width, self->private_impl.f_dst_x) + 31) / 32);  // Chunks left in this row at 32 pixels per chunk, rounded up.
+      v_chunk_count = wuffs_base__u32__min(v_chunk_count, 16);  // 16 chunks * 32 pixels keeps v_p0 <= 512.
+      while ((v_chunk_count > 0) && (((uint64_t)(io2_a_src - iop_a_src)) >= 4)) {  // Only consume whole 4-byte chunks that are already available.
+        v_chunk_bits = wuffs_base__load_u32be__no_bounds_check(iop_a_src);
+        (iop_a_src += 4, wuffs_base__make_empty_struct());
+        self->private_data.f_scratch[(v_p0 + 0)] = ((uint8_t)((1 & (v_chunk_bits >> 31))));  // The most significant bit becomes the first index.
+        self->private_data.f_scratch[(v_p0 + 1)] = ((uint8_t)((1 & (v_chunk_bits >> 30))));
+        self->private_data.f_scratch[(v_p0 + 2)] = ((uint8_t)((1 & (v_chunk_bits >> 29))));
+        self->private_data.f_scratch[(v_p0 + 3)] = ((uint8_t)((1 & (v_chunk_bits >> 28))));
+        self->private_data.f_scratch[(v_p0 + 4)] = ((uint8_t)((1 & (v_chunk_bits >> 27))));
+        self->private_data.f_scratch[(v_p0 + 5)] = ((uint8_t)((1 & (v_chunk_bits >> 26))));
+        self->private_data.f_scratch[(v_p0 + 6)] = ((uint8_t)((1 & (v_chunk_bits >> 25))));
+        self->private_data.f_scratch[(v_p0 + 7)] = ((uint8_t)((1 & (v_chunk_bits >> 24))));
+        self->private_data.f_scratch[(v_p0 + 8)] = ((uint8_t)((1 & (v_chunk_bits >> 23))));
+        self->private_data.f_scratch[(v_p0 + 9)] = ((uint8_t)((1 & (v_chunk_bits >> 22))));
+        self->private_data.f_scratch[(v_p0 + 10)] = ((uint8_t)((1 & (v_chunk_bits >> 21))));
+        self->private_data.f_scratch[(v_p0 + 11)] = ((uint8_t)((1 & (v_chunk_bits >> 20))));
+        self->private_data.f_scratch[(v_p0 + 12)] = ((uint8_t)((1 & (v_chunk_bits >> 19))));
+        self->private_data.f_scratch[(v_p0 + 13)] = ((uint8_t)((1 & (v_chunk_bits >> 18))));
+        self->private_data.f_scratch[(v_p0 + 14)] = ((uint8_t)((1 & (v_chunk_bits >> 17))));
+        self->private_data.f_scratch[(v_p0 + 15)] = ((uint8_t)((1 & (v_chunk_bits >> 16))));
+        self->private_data.f_scratch[(v_p0 + 16)] = ((uint8_t)((1 & (v_chunk_bits >> 15))));
+        self->private_data.f_scratch[(v_p0 + 17)] = ((uint8_t)((1 & (v_chunk_bits >> 14))));
+        self->private_data.f_scratch[(v_p0 + 18)] = ((uint8_t)((1 & (v_chunk_bits >> 13))));
+        self->private_data.f_scratch[(v_p0 + 19)] = ((uint8_t)((1 & (v_chunk_bits >> 12))));
+        self->private_data.f_scratch[(v_p0 + 20)] = ((uint8_t)((1 & (v_chunk_bits >> 11))));
+        self->private_data.f_scratch[(v_p0 + 21)] = ((uint8_t)((1 & (v_chunk_bits >> 10))));
+        self->private_data.f_scratch[(v_p0 + 22)] = ((uint8_t)((1 & (v_chunk_bits >> 9))));
+        self->private_data.f_scratch[(v_p0 + 23)] = ((uint8_t)((1 & (v_chunk_bits >> 8))));
+        self->private_data.f_scratch[(v_p0 + 24)] = ((uint8_t)((1 & (v_chunk_bits >> 7))));
+        self->private_data.f_scratch[(v_p0 + 25)] = ((uint8_t)((1 & (v_chunk_bits >> 6))));
+        self->private_data.f_scratch[(v_p0 + 26)] = ((uint8_t)((1 & (v_chunk_bits >> 5))));
+        self->private_data.f_scratch[(v_p0 + 27)] = ((uint8_t)((1 & (v_chunk_bits >> 4))));
+        self->private_data.f_scratch[(v_p0 + 28)] = ((uint8_t)((1 & (v_chunk_bits >> 3))));
+        self->private_data.f_scratch[(v_p0 + 29)] = ((uint8_t)((1 & (v_chunk_bits >> 2))));
+        self->private_data.f_scratch[(v_p0 + 30)] = ((uint8_t)((1 & (v_chunk_bits >> 1))));
+        self->private_data.f_scratch[(v_p0 + 31)] = ((uint8_t)((1 & (v_chunk_bits >> 0))));
+        v_p0 = ((v_p0 & 511) + 32);  // The mask leaves v_p0 unchanged here (it is a multiple of 32 below 512); it only bounds the sum at 543.
+        v_chunk_count -= 1;
+      }
+    } else if (self->private_impl.f_bits_per_pixel == 2) {
+      v_chunk_count = ((wuffs_base__u32__sat_sub(self->private_impl.f_width, self->private_impl.f_dst_x) + 15) / 16);  // Chunks left in this row at 16 pixels per chunk, rounded up.
+      v_chunk_count = wuffs_base__u32__min(v_chunk_count, 32);  // 32 chunks * 16 pixels keeps v_p0 <= 512.
+      while ((v_chunk_count > 0) && (((uint64_t)(io2_a_src - iop_a_src)) >= 4)) {  // Only consume whole 4-byte chunks that are already available.
+        v_chunk_bits = wuffs_base__load_u32be__no_bounds_check(iop_a_src);
+        (iop_a_src += 4, wuffs_base__make_empty_struct());
+        self->private_data.f_scratch[(v_p0 + 0)] = ((uint8_t)((3 & (v_chunk_bits >> 30))));  // The top two bits become the first index.
+        self->private_data.f_scratch[(v_p0 + 1)] = ((uint8_t)((3 & (v_chunk_bits >> 28))));
+        self->private_data.f_scratch[(v_p0 + 2)] = ((uint8_t)((3 & (v_chunk_bits >> 26))));
+        self->private_data.f_scratch[(v_p0 + 3)] = ((uint8_t)((3 & (v_chunk_bits >> 24))));
+        self->private_data.f_scratch[(v_p0 + 4)] = ((uint8_t)((3 & (v_chunk_bits >> 22))));
+        self->private_data.f_scratch[(v_p0 + 5)] = ((uint8_t)((3 & (v_chunk_bits >> 20))));
+        self->private_data.f_scratch[(v_p0 + 6)] = ((uint8_t)((3 & (v_chunk_bits >> 18))));
+        self->private_data.f_scratch[(v_p0 + 7)] = ((uint8_t)((3 & (v_chunk_bits >> 16))));
+        self->private_data.f_scratch[(v_p0 + 8)] = ((uint8_t)((3 & (v_chunk_bits >> 14))));
+        self->private_data.f_scratch[(v_p0 + 9)] = ((uint8_t)((3 & (v_chunk_bits >> 12))));
+        self->private_data.f_scratch[(v_p0 + 10)] = ((uint8_t)((3 & (v_chunk_bits >> 10))));
+        self->private_data.f_scratch[(v_p0 + 11)] = ((uint8_t)((3 & (v_chunk_bits >> 8))));
+        self->private_data.f_scratch[(v_p0 + 12)] = ((uint8_t)((3 & (v_chunk_bits >> 6))));
+        self->private_data.f_scratch[(v_p0 + 13)] = ((uint8_t)((3 & (v_chunk_bits >> 4))));
+        self->private_data.f_scratch[(v_p0 + 14)] = ((uint8_t)((3 & (v_chunk_bits >> 2))));
+        self->private_data.f_scratch[(v_p0 + 15)] = ((uint8_t)((3 & (v_chunk_bits >> 0))));
+        v_p0 = ((v_p0 & 511) + 16);  // The mask leaves v_p0 unchanged here (it is a multiple of 16 below 512); it only bounds the sum.
+        v_chunk_count -= 1;
+      }
+    } else if (self->private_impl.f_bits_per_pixel == 4) {
+      v_chunk_count = ((wuffs_base__u32__sat_sub(self->private_impl.f_width, self->private_impl.f_dst_x) + 7) / 8);  // Chunks left in this row at 8 pixels per chunk, rounded up.
+      v_chunk_count = wuffs_base__u32__min(v_chunk_count, 64);  // 64 chunks * 8 pixels keeps v_p0 <= 512.
+      while ((v_chunk_count > 0) && (((uint64_t)(io2_a_src - iop_a_src)) >= 4)) {  // Only consume whole 4-byte chunks that are already available.
+        v_chunk_bits = wuffs_base__load_u32be__no_bounds_check(iop_a_src);
+        (iop_a_src += 4, wuffs_base__make_empty_struct());
+        self->private_data.f_scratch[(v_p0 + 0)] = ((uint8_t)((15 & (v_chunk_bits >> 28))));  // The top nibble becomes the first index.
+        self->private_data.f_scratch[(v_p0 + 1)] = ((uint8_t)((15 & (v_chunk_bits >> 24))));
+        self->private_data.f_scratch[(v_p0 + 2)] = ((uint8_t)((15 & (v_chunk_bits >> 20))));
+        self->private_data.f_scratch[(v_p0 + 3)] = ((uint8_t)((15 & (v_chunk_bits >> 16))));
+        self->private_data.f_scratch[(v_p0 + 4)] = ((uint8_t)((15 & (v_chunk_bits >> 12))));
+        self->private_data.f_scratch[(v_p0 + 5)] = ((uint8_t)((15 & (v_chunk_bits >> 8))));
+        self->private_data.f_scratch[(v_p0 + 6)] = ((uint8_t)((15 & (v_chunk_bits >> 4))));
+        self->private_data.f_scratch[(v_p0 + 7)] = ((uint8_t)((15 & (v_chunk_bits >> 0))));
+        v_p0 = ((v_p0 & 511) + 8);  // The mask leaves v_p0 unchanged here (it is a multiple of 8 below 512); it only bounds the sum.
+        v_chunk_count -= 1;
+      }
+    }
+    v_p0 = wuffs_base__u32__min(v_p0, wuffs_base__u32__sat_sub(self->private_impl.f_width, self->private_impl.f_dst_x));  // Drop unpacked values past the end of the row; the last chunk can hold more values than pixels remain.
+    v_n = wuffs_base__pixel_swizzler__swizzle_interleaved_from_slice(&self->private_impl.f_swizzler, v_dst, v_dst_palette, wuffs_base__slice_u8__subslice_j(wuffs_base__make_slice_u8(self->private_data.f_scratch, 2048), v_p0));
+    if (v_n == 0) {  // Nothing was swizzled (e.g. no complete chunk was available): return the short-read note.
+      status = wuffs_base__make_status(wuffs_bmp__note__internal_note_short_read);
+      goto ok;
+    }
+    wuffs_base__u32__sat_add_indirect(&self->private_impl.f_dst_x, ((uint32_t)((v_n & 4294967295))));  // Advance x by the swizzler's return value.
+  }
+  label__loop__break:;
+  status = wuffs_base__make_status(NULL);
+  goto ok;
+
+  goto ok;
+  ok:
+  goto exit;
+  exit:
+  if (a_src) {  // Write the advanced read cursor back to the reader.
+    a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr));
+  }
+
+  return status;
+}
+
 // -------- func bmp.decoder.skip_frame
 
 static wuffs_base__status
diff --git a/std/bmp/decode_bmp.wuffs b/std/bmp/decode_bmp.wuffs
index 942fde5..de2b816 100644
--- a/std/bmp/decode_bmp.wuffs
+++ b/std/bmp/decode_bmp.wuffs
@@ -26,6 +26,7 @@
 pri const COMPRESSION_JPEG           : base.u32 = 4
 pri const COMPRESSION_PNG            : base.u32 = 5
 pri const COMPRESSION_ALPHABITFIELDS : base.u32 = 6
+pri const COMPRESSION_LOW_BIT_DEPTH  : base.u32 = 0x100  // Assigned by the decoder itself when an uncompressed image has 1, 2 or 4 bits per pixel.
 
 pub struct decoder? implements base.image_decoder(
 	width  : base.u32[..= 0x7FFF_FFFF],
@@ -88,6 +89,7 @@
 	var height     : base.u32
 	var planes     : base.u32
 	var dst_pixfmt : base.u32
+	var byte_width : base.u32
 
 	if (this.call_sequence <> 0) or (this.io_redirect_fourcc == 1) {
 		return base."#bad call sequence"
@@ -221,7 +223,12 @@
 	}
 
 	if this.compression == COMPRESSION_NONE {
-		if this.bits_per_pixel == 8 {
+		if (this.bits_per_pixel == 1) or
+			(this.bits_per_pixel == 2) or
+			(this.bits_per_pixel == 4) {
+			this.src_pixfmt = base.PIXEL_FORMAT__INDEXED__BGRA_BINARY
+			this.compression = COMPRESSION_LOW_BIT_DEPTH
+		} else if this.bits_per_pixel == 8 {
 			this.src_pixfmt = base.PIXEL_FORMAT__INDEXED__BGRA_BINARY
 		} else if this.bits_per_pixel == 16 {
 			// BMP's 16-bit default is BGRX_5551.
@@ -251,6 +258,13 @@
 			return "#unsupported BMP file"
 		}
 
+	} else if this.compression == COMPRESSION_RLE4 {
+		if this.bits_per_pixel == 4 {
+			this.src_pixfmt = base.PIXEL_FORMAT__INDEXED__BGRA_BINARY
+		} else {
+			return "#unsupported BMP file"
+		}
+
 	} else if this.compression == COMPRESSION_BITFIELDS {
 		if (this.bits_per_pixel == 16) or (this.bits_per_pixel == 32) {
 			this.src_pixfmt = base.PIXEL_FORMAT__BGRA_NONPREMUL_4X16LE
@@ -263,7 +277,22 @@
 	}
 
 	// The "((x + 3) >> 2) << 2" dance rounds x up to a multiple of 4.
-	if this.bits_per_pixel == 8 {
+	if this.bits_per_pixel == 1 {
+		// byte_width is this.width divided by 8, rounding up.
+		byte_width = (this.width >> 3) + (((this.width & 7) + 7) >> 3)
+		this.bytes_per_row = ((((byte_width as base.u64) * 1) + 3) >> 2) << 2
+		this.pad_per_row = (4 - (byte_width & 3)) & 3
+	} else if this.bits_per_pixel == 2 {
+		// byte_width is this.width divided by 4, rounding up.
+		byte_width = (this.width >> 2) + (((this.width & 3) + 3) >> 2)
+		this.bytes_per_row = ((((byte_width as base.u64) * 1) + 3) >> 2) << 2
+		this.pad_per_row = (4 - (byte_width & 3)) & 3
+	} else if this.bits_per_pixel == 4 {
+		// byte_width is this.width divided by 2, rounding up.
+		byte_width = (this.width >> 1) + (this.width & 1)
+		this.bytes_per_row = ((((byte_width as base.u64) * 1) + 3) >> 2) << 2
+		this.pad_per_row = (4 - (byte_width & 3)) & 3
+	} else if this.bits_per_pixel == 8 {
 		this.bytes_per_row = ((((this.width as base.u64) * 1) + 3) >> 2) << 2
 		this.pad_per_row = (4 - (this.width & 3)) & 3
 	} else if this.bits_per_pixel == 16 {
@@ -370,10 +399,12 @@
 		while true {
 			if this.compression == COMPRESSION_NONE {
 				status = this.swizzle_none!(dst: args.dst, src: args.src)
-			} else if this.compression == COMPRESSION_RLE8 {
+			} else if this.compression < COMPRESSION_BITFIELDS {
 				status = this.swizzle_rle!(dst: args.dst, src: args.src)
-			} else {
+			} else if this.compression == COMPRESSION_BITFIELDS {
 				status = this.swizzle_bitfields!(dst: args.dst, src: args.src)
+			} else {
+				status = this.swizzle_low_bit_depth!(dst: args.dst, src: args.src)
 			}
 
 			if status.is_ok() {
@@ -475,11 +506,15 @@
 	var i                   : base.u64
 	var n                   : base.u64
 
-	var p0   : base.u32[..= 255]
-	var code : base.u8
+	var p0      : base.u32[..= 259]
+	var code    : base.u8
+	var indexes : array[2] base.u8
 
 	var rle_state : base.u32
 
+	var chunk_bits  : base.u32
+	var chunk_count : base.u32[..= 64]
+
 	// TODO: the dst_pixfmt variable shouldn't be necessary. We should be able
 	// to chain the two calls: "args.dst.pixel_format().bits_per_pixel()".
 	dst_pixfmt = args.dst.pixel_format()
@@ -530,12 +565,24 @@
 					}
 					code = args.src.peek_u8()
 					args.src.skip_u32_fast!(actual: 1, worst_case: 1)
-					p0 = 0
-					while p0 < this.rle_length {
-						assert p0 < 255 via "a < b: a < c; c <= b"(c: this.rle_length)
-						this.scratch[p0] = code
-						p0 += 1
-					} endwhile
+					if this.bits_per_pixel == 8 {
+						p0 = 0
+						while p0 < this.rle_length {
+							assert p0 < 255 via "a < b: a < c; c <= b"(c: this.rle_length)
+							this.scratch[p0] = code
+							p0 += 1
+						} endwhile
+					} else {
+						indexes[0] = code >> 4
+						indexes[1] = code & 0x0F
+						p0 = 0
+						while p0 < this.rle_length {
+							assert p0 < 255 via "a < b: a < c; c <= b"(c: this.rle_length)
+							this.scratch[p0 + 0] = indexes[0]
+							this.scratch[p0 + 1] = indexes[1]
+							p0 += 2
+						} endwhile
+					}
 					this.swizzler.swizzle_interleaved_from_slice!(
 						dst: dst,
 						dst_palette: dst_palette,
@@ -567,18 +614,44 @@
 						continue.inner
 					}
 					this.rle_length = code as base.u32
-					this.rle_padded = (code & 1) <> 0
+					this.rle_padded = (this.bits_per_pixel == 8) and ((code & 1) <> 0)
 					rle_state = RLE_STATE_LITERAL
 					continue.inner
 
 				} else if rle_state == RLE_STATE_LITERAL {
-					n = this.swizzler.limited_swizzle_u32_interleaved_from_reader!(
-						up_to_num_pixels: this.rle_length,
-						dst: dst,
-						dst_palette: dst_palette,
-						src: args.src)
-					this.dst_x ~sat+= (n & 0xFFFF_FFFF) as base.u32
-					this.rle_length ~sat-= (n & 0xFFFF_FFFF) as base.u32
+					if this.bits_per_pixel == 8 {
+						n = this.swizzler.limited_swizzle_u32_interleaved_from_reader!(
+							up_to_num_pixels: this.rle_length,
+							dst: dst,
+							dst_palette: dst_palette,
+							src: args.src)
+						this.dst_x ~sat+= (n & 0xFFFF_FFFF) as base.u32
+						this.rle_length ~sat-= (n & 0xFFFF_FFFF) as base.u32
+					} else {
+						// Calculate the remaining number of 16-bit chunks. At
+						// 4 bits per pixel there are 4 pixels per chunk.
+						// Division rounds up.
+						chunk_count = (this.rle_length + 3) / 4
+						p0 = 0
+						while (chunk_count > 0) and (args.src.length() >= 2) {
+							chunk_bits = args.src.peek_u16be_as_u32()
+							args.src.skip_u32_fast!(actual: 2, worst_case: 2)
+							this.scratch[p0 + 0x00] = (0x0F & (chunk_bits >> 0x0C)) as base.u8
+							this.scratch[p0 + 0x01] = (0x0F & (chunk_bits >> 0x08)) as base.u8
+							this.scratch[p0 + 0x02] = (0x0F & (chunk_bits >> 0x04)) as base.u8
+							this.scratch[p0 + 0x03] = (0x0F & (chunk_bits >> 0x00)) as base.u8
+							p0 = (p0 & 255) + 0x04
+							chunk_count -= 1
+						} endwhile
+						p0 = p0.min(a: this.rle_length)
+						this.swizzler.swizzle_interleaved_from_slice!(
+							dst: dst,
+							dst_palette: dst_palette,
+							src: this.scratch[.. p0])
+						this.dst_x ~sat+= p0
+						this.rle_length ~sat-= p0
+					}
+
 					if this.rle_length > 0 {
 						break.goto_suspend
 					}
@@ -794,6 +867,163 @@
 	return ok
 }
 
+pri func decoder.swizzle_low_bit_depth!(dst: ptr base.pixel_buffer, src: base.io_reader) base.status {  // Expands packed 1, 2 or 4 bits-per-pixel palette indexes from args.src to one byte each in this.scratch, then swizzles them into args.dst.
+	var dst_pixfmt          : base.pixel_format
+	var dst_bits_per_pixel  : base.u32[..= 256]
+	var dst_bytes_per_pixel : base.u64[..= 32]
+	var dst_bytes_per_row   : base.u64
+	var dst_palette         : slice base.u8
+	var tab                 : table base.u8
+	var dst                 : slice base.u8
+	var i                   : base.u64
+	var n                   : base.u64
+
+	var p0 : base.u32[..= 543]  // Count of unpacked indexes in this.scratch; 543 is 511 + 32, the largest result of the masked increments below.
+
+	var chunk_bits  : base.u32  // The 32-bit big-endian chunk being unpacked.
+	var chunk_count : base.u32  // Chunks still to read in the current pass.
+
+	// TODO: the dst_pixfmt variable shouldn't be necessary. We should be able
+	// to chain the two calls: "args.dst.pixel_format().bits_per_pixel()".
+	dst_pixfmt = args.dst.pixel_format()
+	dst_bits_per_pixel = dst_pixfmt.bits_per_pixel()
+	if (dst_bits_per_pixel & 7) <> 0 {  // Only destinations with a whole number of bytes per pixel are handled.
+		return base."#unsupported option"
+	}
+	dst_bytes_per_pixel = (dst_bits_per_pixel / 8) as base.u64
+	dst_bytes_per_row = (this.width as base.u64) * dst_bytes_per_pixel  // Byte length of one image-width destination row.
+	dst_palette = args.dst.palette_or_else(fallback: this.scratch[1024 ..])  // The fallback palette slice is the part of this.scratch from offset 1024 onwards.
+	tab = args.dst.plane(p: 0)
+
+	while.loop true {  // Each pass unpacks and swizzles up to 512 pixels of the current row.
+		if this.dst_x == this.width {  // Row finished: reset x and step y by dst_y_inc.
+			this.dst_x = 0
+			this.dst_y ~mod+= this.dst_y_inc
+			if this.dst_y == this.dst_y_end {  // Every row has been written.
+				break.loop
+			}
+		}
+
+		dst = tab.row(y: this.dst_y)
+		if dst_bytes_per_row < dst.length() {  // Clamp the row slice to the image width.
+			dst = dst[.. dst_bytes_per_row]
+		}
+		i = (this.dst_x as base.u64) * dst_bytes_per_pixel  // Byte offset of the current x position within the row.
+		if i >= dst.length() {
+			// TODO: advance args.src if the dst pixel_buffer bounds is
+			// smaller than this BMP's image bounds?
+			continue.loop
+		}
+		dst = dst[i ..]
+		p0 = 0
+
+		if this.bits_per_pixel == 1 {
+			// Calculate the remaining number of 32-bit chunks. At 1 bit per
+			// pixel there are 32 pixels per chunk. Division rounds up.
+			chunk_count = ((this.width ~sat- this.dst_x) + 31) / 32
+			chunk_count = chunk_count.min(a: 16)  // Keep p0 <= 512.
+			while (chunk_count > 0) and (args.src.length() >= 4) {  // Only consume whole 4-byte chunks that are already available.
+				chunk_bits = args.src.peek_u32be()
+				args.src.skip_u32_fast!(actual: 4, worst_case: 4)
+				this.scratch[p0 + 0x00] = (0x01 & (chunk_bits >> 0x1F)) as base.u8  // The most significant bit becomes the first index.
+				this.scratch[p0 + 0x01] = (0x01 & (chunk_bits >> 0x1E)) as base.u8
+				this.scratch[p0 + 0x02] = (0x01 & (chunk_bits >> 0x1D)) as base.u8
+				this.scratch[p0 + 0x03] = (0x01 & (chunk_bits >> 0x1C)) as base.u8
+				this.scratch[p0 + 0x04] = (0x01 & (chunk_bits >> 0x1B)) as base.u8
+				this.scratch[p0 + 0x05] = (0x01 & (chunk_bits >> 0x1A)) as base.u8
+				this.scratch[p0 + 0x06] = (0x01 & (chunk_bits >> 0x19)) as base.u8
+				this.scratch[p0 + 0x07] = (0x01 & (chunk_bits >> 0x18)) as base.u8
+				this.scratch[p0 + 0x08] = (0x01 & (chunk_bits >> 0x17)) as base.u8
+				this.scratch[p0 + 0x09] = (0x01 & (chunk_bits >> 0x16)) as base.u8
+				this.scratch[p0 + 0x0A] = (0x01 & (chunk_bits >> 0x15)) as base.u8
+				this.scratch[p0 + 0x0B] = (0x01 & (chunk_bits >> 0x14)) as base.u8
+				this.scratch[p0 + 0x0C] = (0x01 & (chunk_bits >> 0x13)) as base.u8
+				this.scratch[p0 + 0x0D] = (0x01 & (chunk_bits >> 0x12)) as base.u8
+				this.scratch[p0 + 0x0E] = (0x01 & (chunk_bits >> 0x11)) as base.u8
+				this.scratch[p0 + 0x0F] = (0x01 & (chunk_bits >> 0x10)) as base.u8
+				this.scratch[p0 + 0x10] = (0x01 & (chunk_bits >> 0x0F)) as base.u8
+				this.scratch[p0 + 0x11] = (0x01 & (chunk_bits >> 0x0E)) as base.u8
+				this.scratch[p0 + 0x12] = (0x01 & (chunk_bits >> 0x0D)) as base.u8
+				this.scratch[p0 + 0x13] = (0x01 & (chunk_bits >> 0x0C)) as base.u8
+				this.scratch[p0 + 0x14] = (0x01 & (chunk_bits >> 0x0B)) as base.u8
+				this.scratch[p0 + 0x15] = (0x01 & (chunk_bits >> 0x0A)) as base.u8
+				this.scratch[p0 + 0x16] = (0x01 & (chunk_bits >> 0x09)) as base.u8
+				this.scratch[p0 + 0x17] = (0x01 & (chunk_bits >> 0x08)) as base.u8
+				this.scratch[p0 + 0x18] = (0x01 & (chunk_bits >> 0x07)) as base.u8
+				this.scratch[p0 + 0x19] = (0x01 & (chunk_bits >> 0x06)) as base.u8
+				this.scratch[p0 + 0x1A] = (0x01 & (chunk_bits >> 0x05)) as base.u8
+				this.scratch[p0 + 0x1B] = (0x01 & (chunk_bits >> 0x04)) as base.u8
+				this.scratch[p0 + 0x1C] = (0x01 & (chunk_bits >> 0x03)) as base.u8
+				this.scratch[p0 + 0x1D] = (0x01 & (chunk_bits >> 0x02)) as base.u8
+				this.scratch[p0 + 0x1E] = (0x01 & (chunk_bits >> 0x01)) as base.u8
+				this.scratch[p0 + 0x1F] = (0x01 & (chunk_bits >> 0x00)) as base.u8
+				p0 = (p0 & 511) + 0x20  // The mask leaves p0 unchanged here (it is a multiple of 32 below 512); it only bounds the sum at 543.
+				chunk_count -= 1
+			} endwhile
+
+		} else if this.bits_per_pixel == 2 {
+			// Calculate the remaining number of 32-bit chunks. At 2 bits per
+			// pixel there are 16 pixels per chunk. Division rounds up.
+			chunk_count = ((this.width ~sat- this.dst_x) + 15) / 16
+			chunk_count = chunk_count.min(a: 32)  // Keep p0 <= 512.
+			while (chunk_count > 0) and (args.src.length() >= 4) {  // Only consume whole 4-byte chunks that are already available.
+				chunk_bits = args.src.peek_u32be()
+				args.src.skip_u32_fast!(actual: 4, worst_case: 4)
+				this.scratch[p0 + 0x00] = (0x03 & (chunk_bits >> 0x1E)) as base.u8  // The top two bits become the first index.
+				this.scratch[p0 + 0x01] = (0x03 & (chunk_bits >> 0x1C)) as base.u8
+				this.scratch[p0 + 0x02] = (0x03 & (chunk_bits >> 0x1A)) as base.u8
+				this.scratch[p0 + 0x03] = (0x03 & (chunk_bits >> 0x18)) as base.u8
+				this.scratch[p0 + 0x04] = (0x03 & (chunk_bits >> 0x16)) as base.u8
+				this.scratch[p0 + 0x05] = (0x03 & (chunk_bits >> 0x14)) as base.u8
+				this.scratch[p0 + 0x06] = (0x03 & (chunk_bits >> 0x12)) as base.u8
+				this.scratch[p0 + 0x07] = (0x03 & (chunk_bits >> 0x10)) as base.u8
+				this.scratch[p0 + 0x08] = (0x03 & (chunk_bits >> 0x0E)) as base.u8
+				this.scratch[p0 + 0x09] = (0x03 & (chunk_bits >> 0x0C)) as base.u8
+				this.scratch[p0 + 0x0A] = (0x03 & (chunk_bits >> 0x0A)) as base.u8
+				this.scratch[p0 + 0x0B] = (0x03 & (chunk_bits >> 0x08)) as base.u8
+				this.scratch[p0 + 0x0C] = (0x03 & (chunk_bits >> 0x06)) as base.u8
+				this.scratch[p0 + 0x0D] = (0x03 & (chunk_bits >> 0x04)) as base.u8
+				this.scratch[p0 + 0x0E] = (0x03 & (chunk_bits >> 0x02)) as base.u8
+				this.scratch[p0 + 0x0F] = (0x03 & (chunk_bits >> 0x00)) as base.u8
+				p0 = (p0 & 511) + 0x10  // The mask leaves p0 unchanged here (it is a multiple of 16 below 512); it only bounds the sum.
+				chunk_count -= 1
+			} endwhile
+
+		} else if this.bits_per_pixel == 4 {
+			// Calculate the remaining number of 32-bit chunks. At 4 bits per
+			// pixel there are 8 pixels per chunk. Division rounds up.
+			chunk_count = ((this.width ~sat- this.dst_x) + 7) / 8
+			chunk_count = chunk_count.min(a: 64)  // Keep p0 <= 512.
+			while (chunk_count > 0) and (args.src.length() >= 4) {  // Only consume whole 4-byte chunks that are already available.
+				chunk_bits = args.src.peek_u32be()
+				args.src.skip_u32_fast!(actual: 4, worst_case: 4)
+				this.scratch[p0 + 0x00] = (0x0F & (chunk_bits >> 0x1C)) as base.u8  // The top nibble becomes the first index.
+				this.scratch[p0 + 0x01] = (0x0F & (chunk_bits >> 0x18)) as base.u8
+				this.scratch[p0 + 0x02] = (0x0F & (chunk_bits >> 0x14)) as base.u8
+				this.scratch[p0 + 0x03] = (0x0F & (chunk_bits >> 0x10)) as base.u8
+				this.scratch[p0 + 0x04] = (0x0F & (chunk_bits >> 0x0C)) as base.u8
+				this.scratch[p0 + 0x05] = (0x0F & (chunk_bits >> 0x08)) as base.u8
+				this.scratch[p0 + 0x06] = (0x0F & (chunk_bits >> 0x04)) as base.u8
+				this.scratch[p0 + 0x07] = (0x0F & (chunk_bits >> 0x00)) as base.u8
+				p0 = (p0 & 511) + 0x08  // The mask leaves p0 unchanged here (it is a multiple of 8 below 512); it only bounds the sum.
+				chunk_count -= 1
+			} endwhile
+		}
+
+		p0 = p0.min(a: this.width ~sat- this.dst_x)  // Drop unpacked values past the end of the row; the last chunk can hold more values than pixels remain.
+		n = this.swizzler.swizzle_interleaved_from_slice!(
+			dst: dst,
+			dst_palette: dst_palette,
+			src: this.scratch[.. p0])
+		if n == 0 {  // Nothing was swizzled (e.g. no complete chunk was available): return the short-read note.
+			return "@internal note: short read"
+		}
+		this.dst_x ~sat+= (n & 0xFFFF_FFFF) as base.u32  // Advance x by the swizzler's return value.
+	} endwhile.loop
+
+	return ok
+}
+
 pri func decoder.skip_frame?(src: base.io_reader) {
 	args.src.skip_u32?(n: this.padding)
 	args.src.skip?(n: this.bytes_per_row * (this.height as base.u64))