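Tweak std/bzip2 flush_* basic blocks

In the main loops of decode_flush_fast and decode_flush_slow, move the
per-iteration BWT entry load (entry, curr, flush_pointer) from before
the "if flush_repeat_count < 4" test into the top of both branches, and
move the "flush_prev = curr; block_size -= 1" updates from after the
if/else into the end of both branches. Each branch still runs the same
statements in the same order; only their placement relative to the
branch changes, so that each arm is self-contained.

The generated C in release/c/wuffs-unsupported-snapshot.c is updated to
match. The benchmark effect is mixed: gcc10 improves slightly, while
clang11 is unchanged at 10k and slightly slower at 100k.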

name                             old speed      new speed      delta

wuffs_bzip2_decode_10k/clang11   65.2MB/s ± 0%  65.2MB/s ± 0%    ~     (p=0.056 n=5+5)
wuffs_bzip2_decode_100k/clang11  52.4MB/s ± 0%  51.5MB/s ± 0%  -1.67%  (p=0.016 n=5+4)

wuffs_bzip2_decode_10k/gcc10     60.6MB/s ± 0%  61.7MB/s ± 0%  +1.81%  (p=0.008 n=5+5)
wuffs_bzip2_decode_100k/gcc10    49.2MB/s ± 0%  49.6MB/s ± 0%  +0.86%  (p=0.008 n=5+5)
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index afa8408..45e2967 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -25967,10 +25967,10 @@
   v_block_checksum_have = self->private_impl.f_block_checksum_have;
   v_block_size = self->private_impl.f_block_size;
   while ((v_block_size > 0) && (((uint64_t)(io2_a_dst - iop_a_dst)) > 255)) {
-    v_entry = self->private_data.f_bwt[v_flush_pointer];
-    v_curr = ((uint8_t)((v_entry & 255)));
-    v_flush_pointer = (v_entry >> 12);
     if (v_flush_repeat_count < 4) {
+      v_entry = self->private_data.f_bwt[v_flush_pointer];
+      v_curr = ((uint8_t)((v_entry & 255)));
+      v_flush_pointer = (v_entry >> 12);
       if (v_curr == v_flush_prev) {
         v_flush_repeat_count += 1;
       } else {
@@ -25978,7 +25978,12 @@
       }
       v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_curr)] ^ ((uint32_t)(v_block_checksum_have << 8)));
       (wuffs_base__poke_u8be__no_bounds_check(iop_a_dst, v_curr), iop_a_dst += 1);
+      v_flush_prev = v_curr;
+      v_block_size -= 1;
     } else {
+      v_entry = self->private_data.f_bwt[v_flush_pointer];
+      v_curr = ((uint8_t)((v_entry & 255)));
+      v_flush_pointer = (v_entry >> 12);
       v_flush_repeat_count = ((uint32_t)(v_curr));
       while (v_flush_repeat_count > 0) {
         v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_flush_prev)] ^ ((uint32_t)(v_block_checksum_have << 8)));
@@ -25988,9 +25993,9 @@
         v_flush_repeat_count -= 1;
       }
       v_flush_repeat_count = 0;
+      v_flush_prev = v_curr;
+      v_block_size -= 1;
     }
-    v_flush_prev = v_curr;
-    v_block_size -= 1;
   }
   self->private_impl.f_flush_pointer = v_flush_pointer;
   self->private_impl.f_flush_repeat_count = v_flush_repeat_count;
@@ -26054,10 +26059,10 @@
     v_block_checksum_have = self->private_impl.f_block_checksum_have;
     v_block_size = self->private_impl.f_block_size;
     while ((v_block_size > 0) &&  ! (self->private_impl.p_flush_slow[0] != 0)) {
-      v_entry = self->private_data.f_bwt[v_flush_pointer];
-      v_curr = ((uint8_t)((v_entry & 255)));
-      v_flush_pointer = (v_entry >> 12);
       if (v_flush_repeat_count < 4) {
+        v_entry = self->private_data.f_bwt[v_flush_pointer];
+        v_curr = ((uint8_t)((v_entry & 255)));
+        v_flush_pointer = (v_entry >> 12);
         if (v_curr == v_flush_prev) {
           v_flush_repeat_count += 1;
         } else {
@@ -26071,7 +26076,12 @@
           goto suspend;
         }
         *iop_a_dst++ = ((uint8_t)(self->private_data.s_flush_slow[0].scratch));
+        v_flush_prev = v_curr;
+        v_block_size -= 1;
       } else {
+        v_entry = self->private_data.f_bwt[v_flush_pointer];
+        v_curr = ((uint8_t)((v_entry & 255)));
+        v_flush_pointer = (v_entry >> 12);
         v_flush_repeat_count = ((uint32_t)(v_curr));
         while (v_flush_repeat_count > 0) {
           v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_flush_prev)] ^ ((uint32_t)(v_block_checksum_have << 8)));
@@ -26085,9 +26095,9 @@
           v_flush_repeat_count -= 1;
         }
         v_flush_repeat_count = 0;
+        v_flush_prev = v_curr;
+        v_block_size -= 1;
       }
-      v_flush_prev = v_curr;
-      v_block_size -= 1;
     }
     self->private_impl.f_flush_pointer = v_flush_pointer;
     self->private_impl.f_flush_repeat_count = v_flush_repeat_count;
diff --git a/std/bzip2/decode_flush_fast.wuffs b/std/bzip2/decode_flush_fast.wuffs
index 5756a2c..c8fe0ef 100644
--- a/std/bzip2/decode_flush_fast.wuffs
+++ b/std/bzip2/decode_flush_fast.wuffs
@@ -29,11 +29,10 @@
 	block_size = this.block_size
 
 	while (block_size > 0) and (args.dst.length() > 255) {
-		entry = this.bwt[flush_pointer]
-		curr = (entry & 0xFF) as base.u8
-		flush_pointer = entry >> 12
-
 		if flush_repeat_count < 4 {
+			entry = this.bwt[flush_pointer]
+			curr = (entry & 0xFF) as base.u8
+			flush_pointer = entry >> 12
 			if curr == flush_prev {
 				flush_repeat_count += 1
 			} else {
@@ -43,8 +42,13 @@
 				REV_CRC32_TABLE[((block_checksum_have >> 24) as base.u8) ^ curr] ^
 				(block_checksum_have ~mod<< 8)
 			args.dst.write_u8_fast!(a: curr)
+			flush_prev = curr
+			block_size -= 1
 
 		} else {
+			entry = this.bwt[flush_pointer]
+			curr = (entry & 0xFF) as base.u8
+			flush_pointer = entry >> 12
 			flush_repeat_count = curr as base.u32
 			while flush_repeat_count > 0,
 				inv block_size > 0,
@@ -58,10 +62,9 @@
 				flush_repeat_count -= 1
 			} endwhile
 			flush_repeat_count = 0
+			flush_prev = curr
+			block_size -= 1
 		}
-
-		flush_prev = curr
-		block_size -= 1
 	} endwhile
 
 	this.flush_pointer = flush_pointer
diff --git a/std/bzip2/decode_flush_slow.wuffs b/std/bzip2/decode_flush_slow.wuffs
index d12dd2e..8e0a5f4 100644
--- a/std/bzip2/decode_flush_slow.wuffs
+++ b/std/bzip2/decode_flush_slow.wuffs
@@ -29,11 +29,10 @@
 	block_size = this.block_size
 
 	while (block_size > 0) and (not coroutine_resumed) {
-		entry = this.bwt[flush_pointer]
-		curr = (entry & 0xFF) as base.u8
-		flush_pointer = entry >> 12
-
 		if flush_repeat_count < 4 {
+			entry = this.bwt[flush_pointer]
+			curr = (entry & 0xFF) as base.u8
+			flush_pointer = entry >> 12
 			if curr == flush_prev {
 				flush_repeat_count += 1
 			} else {
@@ -43,8 +42,13 @@
 				REV_CRC32_TABLE[((block_checksum_have >> 24) as base.u8) ^ curr] ^
 				(block_checksum_have ~mod<< 8)
 			args.dst.write_u8?(a: curr)
+			flush_prev = curr
+			block_size -= 1
 
 		} else {
+			entry = this.bwt[flush_pointer]
+			curr = (entry & 0xFF) as base.u8
+			flush_pointer = entry >> 12
 			flush_repeat_count = curr as base.u32
 			while flush_repeat_count > 0,
 				inv block_size > 0,
@@ -56,10 +60,9 @@
 				flush_repeat_count -= 1
 			} endwhile
 			flush_repeat_count = 0
+			flush_prev = curr
+			block_size -= 1
 		}
-
-		flush_prev = curr
-		block_size -= 1
 	} endwhile
 
 	this.flush_pointer = flush_pointer