Introduce std/bzip2 decoder.flush_fast method

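The former flush_block coroutine is renamed to flush_slow, and a new
non-coroutine flush_fast method runs while the destination buffer has
more than 255 bytes available. The flush state (pointer, repeat count,
previous byte and running block checksum) moves from flush_block's local
variables to decoder fields, so that the two methods can hand off to
each other mid-block. The "#bad block length" and "#bad checksum" checks
move up into the calling loop.

This commit by itself is not a speed-up: the 10k benchmarks are
unchanged and the 100k benchmarks are roughly 1.5% to 2% slower.

Compared to the recent commit 623290ef "Cache std/bzip2 Huffman tree
lookup":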

name                             old speed      new speed      delta

wuffs_bzip2_decode_10k/clang11   60.5MB/s ± 0%  60.5MB/s ± 0%    ~     (p=1.000 n=5+5)
wuffs_bzip2_decode_100k/clang11  46.7MB/s ± 1%  45.7MB/s ± 1%  -2.05%  (p=0.008 n=5+5)

wuffs_bzip2_decode_10k/gcc10     58.7MB/s ± 0%  58.5MB/s ± 0%    ~     (p=0.222 n=5+5)
wuffs_bzip2_decode_100k/gcc10    46.9MB/s ± 0%  46.3MB/s ± 0%  -1.46%  (p=0.008 n=5+5)
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 381e4b5..b5df137 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -6941,7 +6941,11 @@
     uint32_t f_decode_huffman_ticks;
     uint32_t f_decode_huffman_section;
     uint32_t f_decode_huffman_run_shift;
+    uint32_t f_flush_pointer;
+    uint32_t f_flush_repeat_count;
+    uint8_t f_flush_prev;
     uint32_t f_final_checksum_have;
+    uint32_t f_block_checksum_have;
     uint32_t f_block_checksum_want;
     uint32_t f_original_pointer;
     uint32_t f_num_symbols;
@@ -6952,7 +6956,7 @@
     uint32_t p_transform_io[1];
     uint32_t p_prepare_block[1];
     uint32_t p_read_code_lengths[1];
-    uint32_t p_flush_block[1];
+    uint32_t p_flush_slow[1];
     uint32_t p_decode_huffman_slow[1];
   } private_impl;
 
@@ -6980,14 +6984,14 @@
       uint32_t v_code_length;
     } s_read_code_lengths[1];
     struct {
-      uint32_t v_i;
-      uint32_t v_n;
-      uint32_t v_repeat_count;
+      uint32_t v_flush_pointer;
+      uint32_t v_flush_repeat_count;
+      uint8_t v_flush_prev;
       uint32_t v_block_checksum_have;
-      uint8_t v_prev;
+      uint32_t v_block_size;
       uint8_t v_curr;
       uint64_t scratch;
-    } s_flush_block[1];
+    } s_flush_slow[1];
     struct {
       uint32_t v_node_index;
     } s_decode_huffman_slow[1];
@@ -24938,8 +24942,13 @@
 wuffs_bzip2__decoder__invert_bwt(
     wuffs_bzip2__decoder* self);
 
+static wuffs_base__empty_struct
+wuffs_bzip2__decoder__flush_fast(
+    wuffs_bzip2__decoder* self,
+    wuffs_base__io_buffer* a_dst);
+
 static wuffs_base__status
-wuffs_bzip2__decoder__flush_block(
+wuffs_bzip2__decoder__flush_slow(
     wuffs_bzip2__decoder* self,
     wuffs_base__io_buffer* a_dst);
 
@@ -25246,11 +25255,32 @@
       }
       label__1__break:;
       wuffs_bzip2__decoder__invert_bwt(self);
-      WUFFS_BASE__COROUTINE_SUSPENSION_POINT(8);
-      status = wuffs_bzip2__decoder__flush_block(self, a_dst);
-      if (status.repr) {
-        goto suspend;
+      self->private_impl.f_block_checksum_have = 4294967295;
+      if (self->private_impl.f_original_pointer >= self->private_impl.f_block_size) {
+        status = wuffs_base__make_status(wuffs_bzip2__error__bad_block_length);
+        goto exit;
       }
+      self->private_impl.f_flush_pointer = (self->private_data.f_bwt[self->private_impl.f_original_pointer] >> 12);
+      self->private_impl.f_flush_repeat_count = 0;
+      self->private_impl.f_flush_prev = 0;
+      while (self->private_impl.f_block_size > 0) {
+        wuffs_bzip2__decoder__flush_fast(self, a_dst);
+        if (self->private_impl.f_block_size <= 0) {
+          goto label__2__break;
+        }
+        WUFFS_BASE__COROUTINE_SUSPENSION_POINT(8);
+        status = wuffs_bzip2__decoder__flush_slow(self, a_dst);
+        if (status.repr) {
+          goto suspend;
+        }
+      }
+      label__2__break:;
+      self->private_impl.f_block_checksum_have ^= 4294967295;
+      if (self->private_impl.f_block_checksum_have != self->private_impl.f_block_checksum_want) {
+        status = wuffs_base__make_status(wuffs_bzip2__error__bad_checksum);
+        goto exit;
+      }
+      self->private_impl.f_final_checksum_have = (self->private_impl.f_block_checksum_have ^ ((self->private_impl.f_final_checksum_have >> 31) | ((uint32_t)(self->private_impl.f_final_checksum_have << 1))));
     }
     label__0__break:;
     v_final_checksum_want = 0;
@@ -25903,20 +25933,18 @@
   return wuffs_base__make_empty_struct();
 }
 
-// -------- func bzip2.decoder.flush_block
+// -------- func bzip2.decoder.flush_fast
 
-static wuffs_base__status
-wuffs_bzip2__decoder__flush_block(
+static wuffs_base__empty_struct
+wuffs_bzip2__decoder__flush_fast(
     wuffs_bzip2__decoder* self,
     wuffs_base__io_buffer* a_dst) {
-  wuffs_base__status status = wuffs_base__make_status(NULL);
-
-  uint32_t v_i = 0;
-  uint32_t v_n = 0;
-  uint32_t v_entry = 0;
-  uint32_t v_repeat_count = 0;
+  uint32_t v_flush_pointer = 0;
+  uint32_t v_flush_repeat_count = 0;
+  uint8_t v_flush_prev = 0;
   uint32_t v_block_checksum_have = 0;
-  uint8_t v_prev = 0;
+  uint32_t v_block_size = 0;
+  uint32_t v_entry = 0;
   uint8_t v_curr = 0;
 
   uint8_t* iop_a_dst = NULL;
@@ -25933,89 +25961,163 @@
     }
   }
 
-  uint32_t coro_susp_point = self->private_impl.p_flush_block[0];
+  v_flush_pointer = self->private_impl.f_flush_pointer;
+  v_flush_repeat_count = self->private_impl.f_flush_repeat_count;
+  v_flush_prev = self->private_impl.f_flush_prev;
+  v_block_checksum_have = self->private_impl.f_block_checksum_have;
+  v_block_size = self->private_impl.f_block_size;
+  while ((v_block_size > 0) && (((uint64_t)(io2_a_dst - iop_a_dst)) > 255)) {
+    v_entry = self->private_data.f_bwt[v_flush_pointer];
+    v_curr = ((uint8_t)((v_entry & 255)));
+    v_flush_pointer = (v_entry >> 12);
+    if (v_flush_repeat_count >= 4) {
+      v_flush_repeat_count = ((uint32_t)(v_curr));
+      while (v_flush_repeat_count > 0) {
+        v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_flush_prev)] ^ ((uint32_t)(v_block_checksum_have << 8)));
+        if (((uint64_t)(io2_a_dst - iop_a_dst)) > 0) {
+          (wuffs_base__poke_u8be__no_bounds_check(iop_a_dst, v_flush_prev), iop_a_dst += 1);
+        }
+        v_flush_repeat_count -= 1;
+      }
+      v_flush_repeat_count = 0;
+    } else if (v_curr != v_flush_prev) {
+      v_flush_repeat_count = 1;
+      v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_curr)] ^ ((uint32_t)(v_block_checksum_have << 8)));
+      (wuffs_base__poke_u8be__no_bounds_check(iop_a_dst, v_curr), iop_a_dst += 1);
+    } else {
+      v_flush_repeat_count += 1;
+      v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_curr)] ^ ((uint32_t)(v_block_checksum_have << 8)));
+      (wuffs_base__poke_u8be__no_bounds_check(iop_a_dst, v_curr), iop_a_dst += 1);
+    }
+    v_flush_prev = v_curr;
+    v_block_size -= 1;
+  }
+  self->private_impl.f_flush_pointer = v_flush_pointer;
+  self->private_impl.f_flush_repeat_count = v_flush_repeat_count;
+  self->private_impl.f_flush_prev = v_flush_prev;
+  self->private_impl.f_block_checksum_have = v_block_checksum_have;
+  if (v_block_size <= 900000) {
+    self->private_impl.f_block_size = v_block_size;
+  }
+  if (a_dst) {
+    a_dst->meta.wi = ((size_t)(iop_a_dst - a_dst->data.ptr));
+  }
+
+  return wuffs_base__make_empty_struct();
+}
+
+// -------- func bzip2.decoder.flush_slow
+
+static wuffs_base__status
+wuffs_bzip2__decoder__flush_slow(
+    wuffs_bzip2__decoder* self,
+    wuffs_base__io_buffer* a_dst) {
+  wuffs_base__status status = wuffs_base__make_status(NULL);
+
+  uint32_t v_flush_pointer = 0;
+  uint32_t v_flush_repeat_count = 0;
+  uint8_t v_flush_prev = 0;
+  uint32_t v_block_checksum_have = 0;
+  uint32_t v_block_size = 0;
+  uint32_t v_entry = 0;
+  uint8_t v_curr = 0;
+
+  uint8_t* iop_a_dst = NULL;
+  uint8_t* io0_a_dst WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+  uint8_t* io1_a_dst WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+  uint8_t* io2_a_dst WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+  if (a_dst) {
+    io0_a_dst = a_dst->data.ptr;
+    io1_a_dst = io0_a_dst + a_dst->meta.wi;
+    iop_a_dst = io1_a_dst;
+    io2_a_dst = io0_a_dst + a_dst->data.len;
+    if (a_dst->meta.closed) {
+      io2_a_dst = iop_a_dst;
+    }
+  }
+
+  uint32_t coro_susp_point = self->private_impl.p_flush_slow[0];
   if (coro_susp_point) {
-    v_i = self->private_data.s_flush_block[0].v_i;
-    v_n = self->private_data.s_flush_block[0].v_n;
-    v_repeat_count = self->private_data.s_flush_block[0].v_repeat_count;
-    v_block_checksum_have = self->private_data.s_flush_block[0].v_block_checksum_have;
-    v_prev = self->private_data.s_flush_block[0].v_prev;
-    v_curr = self->private_data.s_flush_block[0].v_curr;
+    v_flush_pointer = self->private_data.s_flush_slow[0].v_flush_pointer;
+    v_flush_repeat_count = self->private_data.s_flush_slow[0].v_flush_repeat_count;
+    v_flush_prev = self->private_data.s_flush_slow[0].v_flush_prev;
+    v_block_checksum_have = self->private_data.s_flush_slow[0].v_block_checksum_have;
+    v_block_size = self->private_data.s_flush_slow[0].v_block_size;
+    v_curr = self->private_data.s_flush_slow[0].v_curr;
   }
   switch (coro_susp_point) {
     WUFFS_BASE__COROUTINE_SUSPENSION_POINT_0;
 
-    if (self->private_impl.f_original_pointer >= self->private_impl.f_block_size) {
-      status = wuffs_base__make_status(wuffs_bzip2__error__bad_block_length);
-      goto exit;
-    }
-    v_i = (self->private_data.f_bwt[self->private_impl.f_original_pointer] >> 12);
-    v_block_checksum_have = 4294967295;
-    v_n = 0;
-    while (v_n < self->private_impl.f_block_size) {
-      v_entry = self->private_data.f_bwt[v_i];
+    v_flush_pointer = self->private_impl.f_flush_pointer;
+    v_flush_repeat_count = self->private_impl.f_flush_repeat_count;
+    v_flush_prev = self->private_impl.f_flush_prev;
+    v_block_checksum_have = self->private_impl.f_block_checksum_have;
+    v_block_size = self->private_impl.f_block_size;
+    while ((v_block_size > 0) &&  ! (self->private_impl.p_flush_slow[0] != 0)) {
+      v_entry = self->private_data.f_bwt[v_flush_pointer];
       v_curr = ((uint8_t)((v_entry & 255)));
-      v_i = (v_entry >> 12);
-      if (v_repeat_count >= 4) {
-        v_repeat_count = ((uint32_t)(v_curr));
-        while (v_repeat_count > 0) {
-          v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_prev)] ^ ((uint32_t)(v_block_checksum_have << 8)));
-          self->private_data.s_flush_block[0].scratch = v_prev;
+      v_flush_pointer = (v_entry >> 12);
+      if (v_flush_repeat_count >= 4) {
+        v_flush_repeat_count = ((uint32_t)(v_curr));
+        while (v_flush_repeat_count > 0) {
+          v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_flush_prev)] ^ ((uint32_t)(v_block_checksum_have << 8)));
+          self->private_data.s_flush_slow[0].scratch = v_flush_prev;
           WUFFS_BASE__COROUTINE_SUSPENSION_POINT(1);
           if (iop_a_dst == io2_a_dst) {
             status = wuffs_base__make_status(wuffs_base__suspension__short_write);
             goto suspend;
           }
-          *iop_a_dst++ = ((uint8_t)(self->private_data.s_flush_block[0].scratch));
-          v_repeat_count -= 1;
+          *iop_a_dst++ = ((uint8_t)(self->private_data.s_flush_slow[0].scratch));
+          v_flush_repeat_count -= 1;
         }
-        v_repeat_count = 0;
-      } else if (v_curr != v_prev) {
-        v_repeat_count = 1;
+        v_flush_repeat_count = 0;
+      } else if (v_curr != v_flush_prev) {
+        v_flush_repeat_count = 1;
         v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_curr)] ^ ((uint32_t)(v_block_checksum_have << 8)));
-        self->private_data.s_flush_block[0].scratch = v_curr;
+        self->private_data.s_flush_slow[0].scratch = v_curr;
         WUFFS_BASE__COROUTINE_SUSPENSION_POINT(2);
         if (iop_a_dst == io2_a_dst) {
           status = wuffs_base__make_status(wuffs_base__suspension__short_write);
           goto suspend;
         }
-        *iop_a_dst++ = ((uint8_t)(self->private_data.s_flush_block[0].scratch));
+        *iop_a_dst++ = ((uint8_t)(self->private_data.s_flush_slow[0].scratch));
       } else {
-        v_repeat_count += 1;
+        v_flush_repeat_count += 1;
         v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_curr)] ^ ((uint32_t)(v_block_checksum_have << 8)));
-        self->private_data.s_flush_block[0].scratch = v_curr;
+        self->private_data.s_flush_slow[0].scratch = v_curr;
         WUFFS_BASE__COROUTINE_SUSPENSION_POINT(3);
         if (iop_a_dst == io2_a_dst) {
           status = wuffs_base__make_status(wuffs_base__suspension__short_write);
           goto suspend;
         }
-        *iop_a_dst++ = ((uint8_t)(self->private_data.s_flush_block[0].scratch));
+        *iop_a_dst++ = ((uint8_t)(self->private_data.s_flush_slow[0].scratch));
       }
-      v_prev = v_curr;
-      v_n += 1;
+      v_flush_prev = v_curr;
+      v_block_size -= 1;
     }
-    v_block_checksum_have ^= 4294967295;
-    if (v_block_checksum_have != self->private_impl.f_block_checksum_want) {
-      status = wuffs_base__make_status(wuffs_bzip2__error__bad_checksum);
-      goto exit;
+    self->private_impl.f_flush_pointer = v_flush_pointer;
+    self->private_impl.f_flush_repeat_count = v_flush_repeat_count;
+    self->private_impl.f_flush_prev = v_flush_prev;
+    self->private_impl.f_block_checksum_have = v_block_checksum_have;
+    if (v_block_size <= 900000) {
+      self->private_impl.f_block_size = v_block_size;
     }
-    self->private_impl.f_final_checksum_have = (v_block_checksum_have ^ ((self->private_impl.f_final_checksum_have >> 31) | ((uint32_t)(self->private_impl.f_final_checksum_have << 1))));
 
     goto ok;
     ok:
-    self->private_impl.p_flush_block[0] = 0;
+    self->private_impl.p_flush_slow[0] = 0;
     goto exit;
   }
 
   goto suspend;
   suspend:
-  self->private_impl.p_flush_block[0] = wuffs_base__status__is_suspension(&status) ? coro_susp_point : 0;
-  self->private_data.s_flush_block[0].v_i = v_i;
-  self->private_data.s_flush_block[0].v_n = v_n;
-  self->private_data.s_flush_block[0].v_repeat_count = v_repeat_count;
-  self->private_data.s_flush_block[0].v_block_checksum_have = v_block_checksum_have;
-  self->private_data.s_flush_block[0].v_prev = v_prev;
-  self->private_data.s_flush_block[0].v_curr = v_curr;
+  self->private_impl.p_flush_slow[0] = wuffs_base__status__is_suspension(&status) ? coro_susp_point : 0;
+  self->private_data.s_flush_slow[0].v_flush_pointer = v_flush_pointer;
+  self->private_data.s_flush_slow[0].v_flush_repeat_count = v_flush_repeat_count;
+  self->private_data.s_flush_slow[0].v_flush_prev = v_flush_prev;
+  self->private_data.s_flush_slow[0].v_block_checksum_have = v_block_checksum_have;
+  self->private_data.s_flush_slow[0].v_block_size = v_block_size;
+  self->private_data.s_flush_slow[0].v_curr = v_curr;
 
   goto exit;
   exit:
diff --git a/std/bzip2/decode_bzip2.wuffs b/std/bzip2/decode_bzip2.wuffs
index 64350c6..889c4dd 100644
--- a/std/bzip2/decode_bzip2.wuffs
+++ b/std/bzip2/decode_bzip2.wuffs
@@ -43,7 +43,12 @@
 	decode_huffman_section   : base.u32,
 	decode_huffman_run_shift : base.u32[..= 23],
 
+	flush_pointer      : base.u32[..= 1_048575],
+	flush_repeat_count : base.u32[..= 255],
+	flush_prev         : base.u8,
+
 	final_checksum_have  : base.u32,
+	block_checksum_have  : base.u32,
 	block_checksum_want  : base.u32,
 	original_pointer     : base.u32,
 	num_symbols          : base.u32[..= 258],
@@ -174,7 +179,31 @@
 		} endwhile
 
 		this.invert_bwt!()
-		this.flush_block?(dst: args.dst)
+
+		this.block_checksum_have = 0xFFFF_FFFF
+		if this.original_pointer >= this.block_size {
+			return "#bad block length"
+		}
+		assert this.original_pointer < 900000 via "a < b: a < c; c <= b"(c: this.block_size)
+		this.flush_pointer = this.bwt[this.original_pointer] >> 12
+		this.flush_repeat_count = 0
+		this.flush_prev = 0
+
+		while this.block_size > 0 {
+			this.flush_fast!(dst: args.dst)
+			if this.block_size <= 0 {
+				break
+			}
+			this.flush_slow?(dst: args.dst)
+		} endwhile
+
+		this.block_checksum_have ^= 0xFFFF_FFFF
+		if this.block_checksum_have <> this.block_checksum_want {
+			return "#bad checksum"
+		}
+		this.final_checksum_have = this.block_checksum_have ^ (
+			(this.final_checksum_have >> 31) |
+			(this.final_checksum_have ~mod<< 1))
 	} endwhile
 
 	// Read the 32-bit final checksum.
diff --git a/std/bzip2/decode_flush_fast.wuffs b/std/bzip2/decode_flush_fast.wuffs
new file mode 100644
index 0000000..a12dfa0
--- /dev/null
+++ b/std/bzip2/decode_flush_fast.wuffs
@@ -0,0 +1,75 @@
+// Copyright 2022 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+pri func decoder.flush_fast!(dst: base.io_writer) {
+	var flush_pointer       : base.u32[..= 1_048575]
+	var flush_repeat_count  : base.u32[..= 255]
+	var flush_prev          : base.u8
+	var block_checksum_have : base.u32
+	var block_size          : base.u32
+
+	var entry : base.u32
+	var curr  : base.u8
+
+	flush_pointer = this.flush_pointer
+	flush_repeat_count = this.flush_repeat_count
+	flush_prev = this.flush_prev
+	block_checksum_have = this.block_checksum_have
+	block_size = this.block_size
+
+	while (block_size > 0) and (args.dst.length() > 255) {
+		entry = this.bwt[flush_pointer]
+		curr = (entry & 0xFF) as base.u8
+		flush_pointer = entry >> 12
+
+		if flush_repeat_count >= 4 {
+			flush_repeat_count = curr as base.u32
+			while flush_repeat_count > 0,
+				inv block_size > 0,
+			{
+				block_checksum_have =
+					REV_CRC32_TABLE[((block_checksum_have >> 24) as base.u8) ^ flush_prev] ^
+					(block_checksum_have ~mod<< 8)
+				if args.dst.length() > 0 {
+					args.dst.write_u8_fast!(a: flush_prev)
+				}
+				flush_repeat_count -= 1
+			} endwhile
+			flush_repeat_count = 0
+		} else if curr <> flush_prev {
+			flush_repeat_count = 1
+			block_checksum_have =
+				REV_CRC32_TABLE[((block_checksum_have >> 24) as base.u8) ^ curr] ^
+				(block_checksum_have ~mod<< 8)
+			args.dst.write_u8_fast!(a: curr)
+		} else {
+			flush_repeat_count += 1
+			block_checksum_have =
+				REV_CRC32_TABLE[((block_checksum_have >> 24) as base.u8) ^ curr] ^
+				(block_checksum_have ~mod<< 8)
+			args.dst.write_u8_fast!(a: curr)
+		}
+
+		flush_prev = curr
+		block_size -= 1
+	} endwhile
+
+	this.flush_pointer = flush_pointer
+	this.flush_repeat_count = flush_repeat_count
+	this.flush_prev = flush_prev
+	this.block_checksum_have = block_checksum_have
+	if block_size <= 900000 {
+		this.block_size = block_size
+	}
+}
diff --git a/std/bzip2/decode_flush_slow.wuffs b/std/bzip2/decode_flush_slow.wuffs
index 33b21d2..a0674c2 100644
--- a/std/bzip2/decode_flush_slow.wuffs
+++ b/std/bzip2/decode_flush_slow.wuffs
@@ -12,65 +12,62 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-pri func decoder.flush_block?(dst: base.io_writer) {
-	var i                   : base.u32[..= 1_048575]
-	var n                   : base.u32
-	var entry               : base.u32
-	var repeat_count        : base.u32[..= 255]
+pri func decoder.flush_slow?(dst: base.io_writer) {
+	var flush_pointer       : base.u32[..= 1_048575]
+	var flush_repeat_count  : base.u32[..= 255]
+	var flush_prev          : base.u8
 	var block_checksum_have : base.u32
-	var prev                : base.u8
-	var curr                : base.u8
+	var block_size          : base.u32
 
-	if this.original_pointer >= this.block_size {
-		return "#bad block length"
-	}
-	assert this.original_pointer < 900000 via "a < b: a < c; c <= b"(c: this.block_size)
-	i = this.bwt[this.original_pointer] >> 12
+	var entry : base.u32
+	var curr  : base.u8
 
-	block_checksum_have = 0xFFFF_FFFF
+	flush_pointer = this.flush_pointer
+	flush_repeat_count = this.flush_repeat_count
+	flush_prev = this.flush_prev
+	block_checksum_have = this.block_checksum_have
+	block_size = this.block_size
 
-	n = 0
-	while n < this.block_size {
-		assert n < 900000 via "a < b: a < c; c <= b"(c: this.block_size)
-		entry = this.bwt[i]
+	while (block_size > 0) and (not coroutine_resumed) {
+		entry = this.bwt[flush_pointer]
 		curr = (entry & 0xFF) as base.u8
-		i = entry >> 12
+		flush_pointer = entry >> 12
 
-		if repeat_count >= 4 {
-			repeat_count = curr as base.u32
-			while repeat_count > 0,
-				inv n < 900000,
+		if flush_repeat_count >= 4 {
+			flush_repeat_count = curr as base.u32
+			while flush_repeat_count > 0,
+				inv block_size > 0,
 			{
 				block_checksum_have =
-					REV_CRC32_TABLE[((block_checksum_have >> 24) as base.u8) ^ prev] ^
+					REV_CRC32_TABLE[((block_checksum_have >> 24) as base.u8) ^ flush_prev] ^
 					(block_checksum_have ~mod<< 8)
-				args.dst.write_u8?(a: prev)
-				repeat_count -= 1
+				args.dst.write_u8?(a: flush_prev)
+				flush_repeat_count -= 1
 			} endwhile
-			repeat_count = 0
-		} else if curr <> prev {
-			repeat_count = 1
+			flush_repeat_count = 0
+		} else if curr <> flush_prev {
+			flush_repeat_count = 1
 			block_checksum_have =
 				REV_CRC32_TABLE[((block_checksum_have >> 24) as base.u8) ^ curr] ^
 				(block_checksum_have ~mod<< 8)
 			args.dst.write_u8?(a: curr)
 		} else {
-			repeat_count += 1
+			flush_repeat_count += 1
 			block_checksum_have =
 				REV_CRC32_TABLE[((block_checksum_have >> 24) as base.u8) ^ curr] ^
 				(block_checksum_have ~mod<< 8)
 			args.dst.write_u8?(a: curr)
 		}
 
-		prev = curr
-		n += 1
+		flush_prev = curr
+		block_size -= 1
 	} endwhile
 
-	block_checksum_have ^= 0xFFFF_FFFF
-	if block_checksum_have <> this.block_checksum_want {
-		return "#bad checksum"
+	this.flush_pointer = flush_pointer
+	this.flush_repeat_count = flush_repeat_count
+	this.flush_prev = flush_prev
+	this.block_checksum_have = block_checksum_have
+	if block_size <= 900000 {
+		this.block_size = block_size
 	}
-	this.final_checksum_have = block_checksum_have ^ (
-		(this.final_checksum_have >> 31) |
-		(this.final_checksum_have ~mod<< 1))
 }