Have std/bzip2 read 4 (not 1) bytes per inner loop

name                             old speed      new speed      delta

wuffs_bzip2_decode_10k/clang11   42.9MB/s ± 0%  46.5MB/s ± 0%   +8.38%  (p=0.008 n=5+5)
wuffs_bzip2_decode_100k/clang11  36.9MB/s ± 0%  39.4MB/s ± 1%   +6.75%  (p=0.008 n=5+5)

wuffs_bzip2_decode_10k/gcc10     36.5MB/s ± 0%  43.8MB/s ± 0%  +19.73%  (p=0.008 n=5+5)
wuffs_bzip2_decode_100k/gcc10    30.8MB/s ± 0%  35.9MB/s ± 0%  +16.37%  (p=0.008 n=5+5)
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index c31dcc1..3e6581d 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -6934,8 +6934,13 @@
 
     uint32_t f_bits;
     uint32_t f_n_bits;
-    uint32_t f_block_size;
     uint32_t f_max_incl_block_size;
+    uint32_t f_block_size;
+    bool f_decode_block_finished;
+    uint8_t f_decode_block_which;
+    uint32_t f_decode_block_ticks;
+    uint32_t f_decode_block_section;
+    uint32_t f_decode_block_run_shift;
     uint32_t f_final_checksum_have;
     uint32_t f_block_checksum_want;
     uint32_t f_original_pointer;
@@ -6961,12 +6966,7 @@
     uint32_t f_bwt[1048576];
 
     struct {
-      uint8_t v_which;
-      uint32_t v_ticks;
-      uint32_t v_section;
-      uint32_t v_bs;
       uint32_t v_node_index;
-      uint32_t v_run_shift;
     } s_decode_block_slow[1];
     struct {
       uint32_t v_i;
@@ -24864,6 +24864,7 @@
 const char wuffs_bzip2__error__bad_number_of_sections[] = "#bzip2: bad number of sections";
 const char wuffs_bzip2__error__unsupported_huffman_code[] = "#bzip2: unsupported Huffman code";
 const char wuffs_bzip2__error__unsupported_block_randomization[] = "#bzip2: unsupported block randomization";
+const char wuffs_bzip2__error__internal_error_inconsistent_huffman_decoder_state[] = "#bzip2: internal error: inconsistent Huffman decoder state";
 
 // ---------------- Private Consts
 
@@ -24913,6 +24914,11 @@
 // ---------------- Private Function Prototypes
 
 static wuffs_base__status
+wuffs_bzip2__decoder__decode_block_fast(
+    wuffs_bzip2__decoder* self,
+    wuffs_base__io_buffer* a_src);
+
+static wuffs_base__status
 wuffs_bzip2__decoder__decode_block_slow(
     wuffs_bzip2__decoder* self,
     wuffs_base__io_buffer* a_src);
@@ -25026,6 +25032,128 @@
 
 // ---------------- Function Implementations
 
+// -------- func bzip2.decoder.decode_block_fast
+
+static wuffs_base__status
+wuffs_bzip2__decoder__decode_block_fast(
+    wuffs_bzip2__decoder* self,
+    wuffs_base__io_buffer* a_src) {
+  wuffs_base__status status = wuffs_base__make_status(NULL);
+
+  uint32_t v_bits = 0;
+  uint32_t v_n_bits = 0;
+  uint32_t v_block_size = 0;
+  uint8_t v_which = 0;
+  uint32_t v_ticks = 0;
+  uint32_t v_section = 0;
+  uint32_t v_run_shift = 0;
+  uint16_t v_child = 0;
+  uint32_t v_child_ff = 0;
+  uint32_t v_i = 0;
+  uint32_t v_j = 0;
+  uint32_t v_output = 0;
+  uint32_t v_run = 0;
+  uint32_t v_mtft0 = 0;
+
+  const uint8_t* iop_a_src = NULL;
+  const uint8_t* io0_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+  const uint8_t* io1_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+  const uint8_t* io2_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+  if (a_src) {
+    io0_a_src = a_src->data.ptr;
+    io1_a_src = io0_a_src + a_src->meta.ri;
+    iop_a_src = io1_a_src;
+    io2_a_src = io0_a_src + a_src->meta.wi;
+  }
+
+  v_bits = self->private_impl.f_bits;
+  v_n_bits = self->private_impl.f_n_bits;
+  v_block_size = self->private_impl.f_block_size;
+  v_which = self->private_impl.f_decode_block_which;
+  v_ticks = self->private_impl.f_decode_block_ticks;
+  v_section = self->private_impl.f_decode_block_section;
+  v_run_shift = self->private_impl.f_decode_block_run_shift;
+  label__outer__continue:;
+  while (((uint64_t)(io2_a_src - iop_a_src)) >= 4) {
+    if (v_ticks > 0) {
+      v_ticks -= 1;
+    } else {
+      v_ticks = 49;
+      v_section += 1;
+      v_which = WUFFS_BZIP2__CLAMP_TO_5[(self->private_data.f_huffman_selectors[(v_section & 32767)] & 7)];
+    }
+    v_bits |= (wuffs_base__peek_u32be__no_bounds_check(iop_a_src) >> v_n_bits);
+    iop_a_src += ((31 - v_n_bits) >> 3);
+    v_n_bits |= 24;
+    v_child = 0;
+    while (v_child < 1024) {
+      v_child = self->private_data.f_huffman_trees[v_which][v_child][(v_bits >> 31)];
+      v_bits <<= 1;
+      if (v_n_bits <= 0) {
+        status = wuffs_base__make_status(wuffs_bzip2__error__internal_error_inconsistent_huffman_decoder_state);
+        goto exit;
+      }
+      v_n_bits -= 1;
+    }
+    if (v_child < 1280) {
+      v_child_ff = ((uint32_t)((v_child & 255)));
+      v_output = ((uint32_t)(self->private_data.f_mtft[v_child_ff]));
+      wuffs_base__slice_u8__copy_from_slice(wuffs_base__make_slice_u8_ij(self->private_data.f_mtft, 1, (1 + v_child_ff)), wuffs_base__make_slice_u8(self->private_data.f_mtft, v_child_ff));
+      self->private_data.f_mtft[0] = ((uint8_t)(v_output));
+      self->private_data.f_letter_counts[v_output] += 1;
+      self->private_data.f_bwt[v_block_size] = v_output;
+      if (v_block_size >= self->private_impl.f_max_incl_block_size) {
+        status = wuffs_base__make_status(wuffs_bzip2__error__bad_block_length);
+        goto exit;
+      }
+      v_block_size += 1;
+      v_run_shift = 0;
+      goto label__outer__continue;
+    } else if (v_child > 1281) {
+      self->private_impl.f_decode_block_finished = true;
+      goto label__outer__break;
+    }
+    if (v_run_shift >= 23) {
+      status = wuffs_base__make_status(wuffs_bzip2__error__bad_block_length);
+      goto exit;
+    }
+    v_run = (((uint32_t)((v_child - 1279))) << v_run_shift);
+    v_run_shift += 1;
+    v_i = v_block_size;
+    v_j = (v_run + v_block_size);
+    if (v_j > self->private_impl.f_max_incl_block_size) {
+      status = wuffs_base__make_status(wuffs_bzip2__error__bad_block_length);
+      goto exit;
+    }
+    v_block_size = v_j;
+    v_mtft0 = ((uint32_t)(self->private_data.f_mtft[0]));
+    self->private_data.f_letter_counts[v_mtft0] += v_run;
+    while (v_i < v_j) {
+      self->private_data.f_bwt[v_i] = v_mtft0;
+      v_i += 1;
+    }
+  }
+  label__outer__break:;
+  self->private_impl.f_bits = v_bits;
+  self->private_impl.f_n_bits = v_n_bits;
+  self->private_impl.f_block_size = v_block_size;
+  self->private_impl.f_decode_block_which = v_which;
+  self->private_impl.f_decode_block_ticks = v_ticks;
+  self->private_impl.f_decode_block_section = v_section;
+  self->private_impl.f_decode_block_run_shift = v_run_shift;
+  status = wuffs_base__make_status(NULL);
+  goto ok;
+
+  ok:
+  goto exit;
+  exit:
+  if (a_src) {
+    a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr));
+  }
+
+  return status;
+}
+
 // -------- func bzip2.decoder.decode_block_slow
 
 static wuffs_base__status
@@ -25034,18 +25162,13 @@
     wuffs_base__io_buffer* a_src) {
   wuffs_base__status status = wuffs_base__make_status(NULL);
 
-  uint8_t v_which = 0;
   uint8_t v_c = 0;
+  uint32_t v_node_index = 0;
   uint16_t v_child = 0;
   uint32_t v_child_ff = 0;
   uint32_t v_i = 0;
   uint32_t v_j = 0;
-  uint32_t v_ticks = 0;
-  uint32_t v_section = 0;
-  uint32_t v_bs = 0;
   uint32_t v_output = 0;
-  uint32_t v_node_index = 0;
-  uint32_t v_run_shift = 0;
   uint32_t v_run = 0;
   uint32_t v_mtft0 = 0;
 
@@ -25062,40 +25185,18 @@
 
   uint32_t coro_susp_point = self->private_impl.p_decode_block_slow[0];
   if (coro_susp_point) {
-    v_which = self->private_data.s_decode_block_slow[0].v_which;
-    v_ticks = self->private_data.s_decode_block_slow[0].v_ticks;
-    v_section = self->private_data.s_decode_block_slow[0].v_section;
-    v_bs = self->private_data.s_decode_block_slow[0].v_bs;
     v_node_index = self->private_data.s_decode_block_slow[0].v_node_index;
-    v_run_shift = self->private_data.s_decode_block_slow[0].v_run_shift;
   }
   switch (coro_susp_point) {
     WUFFS_BASE__COROUTINE_SUSPENSION_POINT_0;
 
-    v_which = WUFFS_BZIP2__CLAMP_TO_5[(self->private_data.f_huffman_selectors[0] & 7)];
-    v_section = 1;
-    v_i = 0;
-    v_j = 0;
-    while (v_i < 256) {
-      if (self->private_data.f_presence[v_i] != 0) {
-        self->private_data.f_mtft[(v_j & 255)] = ((uint8_t)(v_i));
-        v_j += 1;
-      }
-      v_i += 1;
-    }
-    v_i = 0;
-    while (v_i < 256) {
-      self->private_data.f_letter_counts[v_i] = 0;
-      v_i += 1;
-    }
-    v_ticks = 50;
-    while (true) {
-      if (v_ticks > 0) {
-        v_ticks -= 1;
+    while ( ! (self->private_impl.p_decode_block_slow[0] != 0)) {
+      if (self->private_impl.f_decode_block_ticks > 0) {
+        self->private_impl.f_decode_block_ticks -= 1;
       } else {
-        v_ticks = 49;
-        v_which = WUFFS_BZIP2__CLAMP_TO_5[(self->private_data.f_huffman_selectors[(v_section & 32767)] & 7)];
-        v_section += 1;
+        self->private_impl.f_decode_block_ticks = 49;
+        self->private_impl.f_decode_block_section += 1;
+        self->private_impl.f_decode_block_which = WUFFS_BZIP2__CLAMP_TO_5[(self->private_data.f_huffman_selectors[(self->private_impl.f_decode_block_section & 32767)] & 7)];
       }
       v_node_index = 0;
       label__0__continue:;
@@ -25113,7 +25214,7 @@
           self->private_impl.f_bits = (((uint32_t)(v_c)) << 24);
           self->private_impl.f_n_bits = 8;
         }
-        v_child = self->private_data.f_huffman_trees[v_which][v_node_index][(self->private_impl.f_bits >> 31)];
+        v_child = self->private_data.f_huffman_trees[self->private_impl.f_decode_block_which][v_node_index][(self->private_impl.f_bits >> 31)];
         self->private_impl.f_bits <<= 1;
         self->private_impl.f_n_bits -= 1;
         if (v_child < 1024) {
@@ -25125,30 +25226,31 @@
           wuffs_base__slice_u8__copy_from_slice(wuffs_base__make_slice_u8_ij(self->private_data.f_mtft, 1, (1 + v_child_ff)), wuffs_base__make_slice_u8(self->private_data.f_mtft, v_child_ff));
           self->private_data.f_mtft[0] = ((uint8_t)(v_output));
           self->private_data.f_letter_counts[v_output] += 1;
-          self->private_data.f_bwt[v_bs] = v_output;
-          if (v_bs >= self->private_impl.f_max_incl_block_size) {
+          self->private_data.f_bwt[self->private_impl.f_block_size] = v_output;
+          if (self->private_impl.f_block_size >= self->private_impl.f_max_incl_block_size) {
             status = wuffs_base__make_status(wuffs_bzip2__error__bad_block_length);
             goto exit;
           }
-          v_bs += 1;
-          v_run_shift = 0;
+          self->private_impl.f_block_size += 1;
+          self->private_impl.f_decode_block_run_shift = 0;
           goto label__0__break;
         } else if (v_child > 1281) {
+          self->private_impl.f_decode_block_finished = true;
           goto label__outer__break;
         }
-        if (v_run_shift >= 23) {
+        if (self->private_impl.f_decode_block_run_shift >= 23) {
           status = wuffs_base__make_status(wuffs_bzip2__error__bad_block_length);
           goto exit;
         }
-        v_run = (((uint32_t)((v_child - 1279))) << v_run_shift);
-        v_run_shift += 1;
-        v_i = v_bs;
-        v_j = (v_run + v_bs);
+        v_run = (((uint32_t)((v_child - 1279))) << self->private_impl.f_decode_block_run_shift);
+        self->private_impl.f_decode_block_run_shift += 1;
+        v_i = self->private_impl.f_block_size;
+        v_j = (v_run + self->private_impl.f_block_size);
         if (v_j > self->private_impl.f_max_incl_block_size) {
           status = wuffs_base__make_status(wuffs_bzip2__error__bad_block_length);
           goto exit;
         }
-        v_bs = v_j;
+        self->private_impl.f_block_size = v_j;
         v_mtft0 = ((uint32_t)(self->private_data.f_mtft[0]));
         self->private_data.f_letter_counts[v_mtft0] += v_run;
         while (v_i < v_j) {
@@ -25160,11 +25262,6 @@
       label__0__break:;
     }
     label__outer__break:;
-    if (v_bs > self->private_impl.f_max_incl_block_size) {
-      status = wuffs_base__make_status(wuffs_bzip2__error__bad_block_length);
-      goto exit;
-    }
-    self->private_impl.f_block_size = v_bs;
 
     goto ok;
     ok:
@@ -25175,12 +25272,7 @@
   goto suspend;
   suspend:
   self->private_impl.p_decode_block_slow[0] = wuffs_base__status__is_suspension(&status) ? coro_susp_point : 0;
-  self->private_data.s_decode_block_slow[0].v_which = v_which;
-  self->private_data.s_decode_block_slow[0].v_ticks = v_ticks;
-  self->private_data.s_decode_block_slow[0].v_section = v_section;
-  self->private_data.s_decode_block_slow[0].v_bs = v_bs;
   self->private_data.s_decode_block_slow[0].v_node_index = v_node_index;
-  self->private_data.s_decode_block_slow[0].v_run_shift = v_run_shift;
 
   goto exit;
   exit:
@@ -25249,6 +25341,7 @@
   uint8_t v_c = 0;
   uint32_t v_i = 0;
   uint64_t v_tag = 0;
+  wuffs_base__status v_status = wuffs_base__make_status(NULL);
   uint32_t v_final_checksum_want = 0;
 
   const uint8_t* iop_a_src = NULL;
@@ -25364,17 +25457,39 @@
       if (status.repr) {
         goto suspend;
       }
-      if (a_src) {
-        a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr));
+      self->private_impl.f_block_size = 0;
+      self->private_impl.f_decode_block_finished = false;
+      self->private_impl.f_decode_block_which = WUFFS_BZIP2__CLAMP_TO_5[(self->private_data.f_huffman_selectors[0] & 7)];
+      self->private_impl.f_decode_block_ticks = 50;
+      self->private_impl.f_decode_block_section = 0;
+      self->private_impl.f_decode_block_run_shift = 0;
+      while ( ! self->private_impl.f_decode_block_finished) {
+        if (a_src) {
+          a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr));
+        }
+        v_status = wuffs_bzip2__decoder__decode_block_fast(self, a_src);
+        if (a_src) {
+          iop_a_src = a_src->data.ptr + a_src->meta.ri;
+        }
+        if (wuffs_base__status__is_error(&v_status)) {
+          status = v_status;
+          goto exit;
+        } else if (self->private_impl.f_decode_block_finished) {
+          goto label__1__break;
+        }
+        if (a_src) {
+          a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr));
+        }
+        WUFFS_BASE__COROUTINE_SUSPENSION_POINT(7);
+        status = wuffs_bzip2__decoder__decode_block_slow(self, a_src);
+        if (a_src) {
+          iop_a_src = a_src->data.ptr + a_src->meta.ri;
+        }
+        if (status.repr) {
+          goto suspend;
+        }
       }
-      WUFFS_BASE__COROUTINE_SUSPENSION_POINT(7);
-      status = wuffs_bzip2__decoder__decode_block_slow(self, a_src);
-      if (a_src) {
-        iop_a_src = a_src->data.ptr + a_src->meta.ri;
-      }
-      if (status.repr) {
-        goto suspend;
-      }
+      label__1__break:;
       wuffs_bzip2__decoder__invert_bwt(self);
       WUFFS_BASE__COROUTINE_SUSPENSION_POINT(8);
       status = wuffs_bzip2__decoder__flush_block(self, a_dst);
@@ -25446,6 +25561,7 @@
 
   uint8_t v_c = 0;
   uint32_t v_i = 0;
+  uint32_t v_j = 0;
   uint32_t v_selector = 0;
   uint32_t v_sel_ff = 0;
   uint8_t v_movee = 0;
@@ -25719,6 +25835,20 @@
       }
       v_i += 1;
     }
+    v_i = 0;
+    v_j = 0;
+    while (v_i < 256) {
+      if (self->private_data.f_presence[v_i] != 0) {
+        self->private_data.f_mtft[(v_j & 255)] = ((uint8_t)(v_i));
+        v_j += 1;
+      }
+      v_i += 1;
+    }
+    v_i = 0;
+    while (v_i < 256) {
+      self->private_data.f_letter_counts[v_i] = 0;
+      v_i += 1;
+    }
 
     goto ok;
     ok:
diff --git a/std/bzip2/decode_block_fast.wuffs b/std/bzip2/decode_block_fast.wuffs
new file mode 100644
index 0000000..db9359b
--- /dev/null
+++ b/std/bzip2/decode_block_fast.wuffs
@@ -0,0 +1,120 @@
+// Copyright 2022 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+pri func decoder.decode_block_fast!(src: base.io_reader) base.status {
+	var bits       : base.u32
+	var n_bits     : base.u32[..= 31]
+	var block_size : base.u32[..= 900000]
+	var which      : base.u8[..= 5]
+	var ticks      : base.u32[..= 50]
+	var section    : base.u32
+	var run_shift  : base.u32[..= 23]
+
+	var child    : base.u16
+	var child_ff : base.u32[..= 255]
+	var i        : base.u32
+	var j        : base.u32
+	var output   : base.u32[..= 255]
+	var run      : base.u32[..= 16_777216]  // 16_777216 = (2 << 23)
+	var mtft0    : base.u32[..= 255]
+
+	bits = this.bits
+	n_bits = this.n_bits
+	block_size = this.block_size
+	which = this.decode_block_which
+	ticks = this.decode_block_ticks
+	section = this.decode_block_section
+	run_shift = this.decode_block_run_shift
+
+	// Apply the Huffman trees.
+	while.outer args.src.length() >= 4 {
+		if ticks > 0 {
+			ticks -= 1
+		} else {
+			ticks = 49
+			section ~mod+= 1
+			which = CLAMP_TO_5[this.huffman_selectors[section & 32767] & 7]
+		}
+
+		bits |= args.src.peek_u32be() >> n_bits
+		args.src.skip_u32_fast!(actual: (31 - n_bits) >> 3, worst_case: 4)
+		n_bits |= 24
+
+		child = 0
+		while child < 1024 {
+			child = this.huffman_trees[which][child][bits >> 31]
+			bits ~mod<<= 1
+			if n_bits <= 0 {
+				return "#internal error: inconsistent Huffman decoder state"
+			}
+			n_bits -= 1
+		} endwhile
+
+		if child < 1280 {  // mNN symbol.
+			// Move to front.
+			child_ff = (child & 0xFF) as base.u32
+			output = this.mtft[child_ff] as base.u32
+			this.mtft[1 .. 1 + child_ff].copy_from_slice!(
+				s: this.mtft[.. child_ff])
+			this.mtft[0] = output as base.u8
+
+			this.letter_counts[output] ~mod+= 1
+			this.bwt[block_size] = output
+			if block_size >= this.max_incl_block_size {
+				return "#bad block length"
+			}
+			assert block_size < 900000 via "a < b: a < c; c <= b"(c: this.max_incl_block_size)
+			block_size += 1
+			run_shift = 0
+			continue.outer
+		} else if child > 1281 {  // EOB symbol.
+			this.decode_block_finished = true
+			break.outer
+		}
+
+		// RUNA or RUNB symbol.
+		if run_shift >= 23 {
+			return "#bad block length"
+		}
+		run = ((child - 1279) as base.u32) << run_shift
+		run_shift += 1
+		i = block_size
+		j = run + block_size
+		if j > this.max_incl_block_size {
+			return "#bad block length"
+		}
+		assert j <= 900000 via "a <= b: a <= c; c <= b"(c: this.max_incl_block_size)
+		block_size = j
+
+		mtft0 = this.mtft[0] as base.u32
+		this.letter_counts[mtft0] ~mod+= run
+		while i < j,
+			pre j <= 900000,
+		{
+			assert i < 900000 via "a < b: a < c; c <= b"(c: j)
+			this.bwt[i] = mtft0
+			i += 1
+		} endwhile
+	} endwhile.outer
+
+	this.bits = bits
+	this.n_bits = n_bits
+	this.block_size = block_size
+	this.decode_block_which = which
+	this.decode_block_ticks = ticks
+	this.decode_block_section = section
+	this.decode_block_run_shift = run_shift
+
+	return ok
+}
diff --git a/std/bzip2/decode_block_slow.wuffs b/std/bzip2/decode_block_slow.wuffs
index fe65baa..c1b9c48 100644
--- a/std/bzip2/decode_block_slow.wuffs
+++ b/std/bzip2/decode_block_slow.wuffs
@@ -13,50 +13,25 @@
 // limitations under the License.
 
 pri func decoder.decode_block_slow?(src: base.io_reader) {
-	var which      : base.u8[..= 5]
 	var c          : base.u8
-	var child      : base.u16
-	var child_ff   : base.u32[..= 255]
-	var i          : base.u32
-	var j          : base.u32
-	var ticks      : base.u32[..= 50]
-	var section    : base.u32
-	var bs         : base.u32[..= 1_048575]  // 1_048575 = ((1 << 20) - 1)
-	var output     : base.u32[..= 255]
 	var node_index : base.u32[..= 1023]
-	var run_shift  : base.u32[..= 23]
-	var run        : base.u32[..= 16_777216]  // 16_777216 = (2 << 23)
-	var mtft0      : base.u32[..= 255]
 
-	which = CLAMP_TO_5[this.huffman_selectors[0] & 7]
-	section = 1
-
-	// Initialize the MTFT state.
-	i = 0
-	j = 0
-	while i < 256 {
-		if this.presence[i] <> 0 {
-			this.mtft[j & 0xFF] = i as base.u8
-			j ~mod+= 1
-		}
-		i += 1
-	} endwhile
-
-	i = 0
-	while i < 256 {
-		this.letter_counts[i] = 0
-		i += 1
-	} endwhile
+	var child    : base.u16
+	var child_ff : base.u32[..= 255]
+	var i        : base.u32
+	var j        : base.u32
+	var output   : base.u32[..= 255]
+	var run      : base.u32[..= 16_777216]  // 16_777216 = (2 << 23)
+	var mtft0    : base.u32[..= 255]
 
 	// Apply the Huffman trees.
-	ticks = 50
-	while.outer true {
-		if ticks > 0 {
-			ticks -= 1
+	while.outer not coroutine_resumed {
+		if this.decode_block_ticks > 0 {
+			this.decode_block_ticks -= 1
 		} else {
-			ticks = 49
-			which = CLAMP_TO_5[this.huffman_selectors[section & 32767] & 7]
-			section ~mod+= 1
+			this.decode_block_ticks = 49
+			this.decode_block_section ~mod+= 1
+			this.decode_block_which = CLAMP_TO_5[this.huffman_selectors[this.decode_block_section & 32767] & 7]
 		}
 
 		node_index = 0
@@ -67,7 +42,7 @@
 				this.n_bits = 8
 				assert this.n_bits > 0
 			}
-			child = this.huffman_trees[which][node_index][this.bits >> 31]
+			child = this.huffman_trees[this.decode_block_which][node_index][this.bits >> 31]
 			this.bits ~mod<<= 1
 			this.n_bits -= 1
 
@@ -84,32 +59,33 @@
 				this.mtft[0] = output as base.u8
 
 				this.letter_counts[output] ~mod+= 1
-				this.bwt[bs] = output
-				if bs >= this.max_incl_block_size {
+				this.bwt[this.block_size] = output
+				if this.block_size >= this.max_incl_block_size {
 					return "#bad block length"
 				}
-				assert bs < 900000 via "a < b: a < c; c <= b"(c: this.max_incl_block_size)
-				bs += 1
-				run_shift = 0
+				assert this.block_size < 900000 via "a < b: a < c; c <= b"(c: this.max_incl_block_size)
+				this.block_size += 1
+				this.decode_block_run_shift = 0
 				break
 
 			} else if child > 1281 {  // EOB symbol.
+				this.decode_block_finished = true
 				break.outer
 			}
 
 			// RUNA or RUNB symbol.
-			if run_shift >= 23 {
+			if this.decode_block_run_shift >= 23 {
 				return "#bad block length"
 			}
-			run = ((child - 1279) as base.u32) << run_shift
-			run_shift += 1
-			i = bs
-			j = run + bs
+			run = ((child - 1279) as base.u32) << this.decode_block_run_shift
+			this.decode_block_run_shift += 1
+			i = this.block_size
+			j = run + this.block_size
 			if j > this.max_incl_block_size {
 				return "#bad block length"
 			}
 			assert j <= 900000 via "a <= b: a <= c; c <= b"(c: this.max_incl_block_size)
-			bs = j
+			this.block_size = j
 
 			mtft0 = this.mtft[0] as base.u32
 			this.letter_counts[mtft0] ~mod+= run
@@ -123,10 +99,4 @@
 			break
 		} endwhile
 	} endwhile.outer
-
-	if bs > this.max_incl_block_size {
-		return "#bad block length"
-	}
-	assert bs <= 900000 via "a <= b: a <= c; c <= b"(c: this.max_incl_block_size)
-	this.block_size = bs
 }
diff --git a/std/bzip2/decode_bzip2.wuffs b/std/bzip2/decode_bzip2.wuffs
index 6f86876..1fb899b 100644
--- a/std/bzip2/decode_bzip2.wuffs
+++ b/std/bzip2/decode_bzip2.wuffs
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// TODO: optimize.
-
 pub status "#bad Huffman code (over-subscribed)"
 pub status "#bad Huffman code (under-subscribed)"
 pub status "#bad block header"
@@ -25,6 +23,8 @@
 pub status "#unsupported Huffman code"
 pub status "#unsupported block randomization"
 
+pri status "#internal error: inconsistent Huffman decoder state"
+
 pub const DECODER_WORKBUF_LEN_MAX_INCL_WORST_CASE : base.u64 = 0
 
 pri const CLAMP_TO_5 : array[8] base.u8[..= 5] = [0, 1, 2, 3, 4, 5, 5, 5]
@@ -34,9 +34,15 @@
 	n_bits : base.u32[..= 31],
 
 	// Block size is measured prior to the RLE1 step.
-	block_size          : base.u32[..= 900000],
 	max_incl_block_size : base.u32[..= 900000],
 
+	block_size             : base.u32[..= 900000],
+	decode_block_finished  : base.bool,
+	decode_block_which     : base.u8[..= 5],
+	decode_block_ticks     : base.u32[..= 50],
+	decode_block_section   : base.u32,
+	decode_block_run_shift : base.u32[..= 23],
+
 	final_checksum_have  : base.u32,
 	block_checksum_want  : base.u32,
 	original_pointer     : base.u32,
@@ -92,6 +98,7 @@
 	var c                   : base.u8
 	var i                   : base.u32
 	var tag                 : base.u64
+	var status              : base.status
 	var final_checksum_want : base.u32
 
 	// Read the header.
@@ -138,7 +145,23 @@
 		}
 
 		this.prepare_block?(src: args.src)
-		this.decode_block_slow?(src: args.src)
+
+		this.block_size = 0
+		this.decode_block_finished = false
+		this.decode_block_which = CLAMP_TO_5[this.huffman_selectors[0] & 7]
+		this.decode_block_ticks = 50
+		this.decode_block_section = 0
+		this.decode_block_run_shift = 0
+		while not this.decode_block_finished {
+			status = this.decode_block_fast!(src: args.src)
+			if status.is_error() {
+				return status
+			} else if this.decode_block_finished {
+				break
+			}
+			this.decode_block_slow?(src: args.src)
+		} endwhile
+
 		this.invert_bwt!()
 		this.flush_block?(dst: args.dst)
 	} endwhile
@@ -167,6 +190,7 @@
 pri func decoder.prepare_block?(src: base.io_reader) {
 	var c        : base.u8
 	var i        : base.u32
+	var j        : base.u32
 	var selector : base.u32
 	var sel_ff   : base.u32
 	var movee    : base.u8
@@ -378,6 +402,24 @@
 		}
 		i += 1
 	} endwhile
+
+	// Initialize the MTFT state.
+	i = 0
+	j = 0
+	while i < 256 {
+		if this.presence[i] <> 0 {
+			this.mtft[j & 0xFF] = i as base.u8
+			j ~mod+= 1
+		}
+		i += 1
+	} endwhile
+
+	// Initialize the letter counts.
+	i = 0
+	while i < 256 {
+		this.letter_counts[i] = 0
+		i += 1
+	} endwhile
 }
 
 pri func decoder.read_code_lengths?(src: base.io_reader) {
@@ -592,7 +634,8 @@
 
 // invert_bwt performs "two passes over the data, and one pass over the
 // alphabet", per the BWT technical report, except that the first data pass
-// (accumulating this.letter_counts) is integrated into execute_block.
+// (accumulating this.letter_counts) is integrated into the decode_block_etc
+// methods.
 pri func decoder.invert_bwt!() {
 	var i       : base.u32
 	var letter  : base.u32[..= 255]