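Optimize std/bzip2 flush_repeat_count checks

Test (flush_repeat_count < 4) first and merge the two branches that write
curr (curr == flush_prev and curr <> flush_prev) into one, so the checksum
update and the write_u8 call are no longer duplicated. In the generated C,
flush_slow now has two coroutine suspension points instead of three.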

name                             old speed      new speed      delta

wuffs_bzip2_decode_10k/clang11   60.5MB/s ± 0%  65.2MB/s ± 0%   +7.85%  (p=0.008 n=5+5)
wuffs_bzip2_decode_100k/clang11  45.7MB/s ± 1%  52.4MB/s ± 0%  +14.68%  (p=0.008 n=5+5)

wuffs_bzip2_decode_10k/gcc10     58.5MB/s ± 0%  60.6MB/s ± 0%   +3.67%  (p=0.008 n=5+5)
wuffs_bzip2_decode_100k/gcc10    46.3MB/s ± 0%  49.2MB/s ± 0%   +6.31%  (p=0.008 n=5+5)
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index b5df137..afa8408 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -25970,7 +25970,15 @@
     v_entry = self->private_data.f_bwt[v_flush_pointer];
     v_curr = ((uint8_t)((v_entry & 255)));
     v_flush_pointer = (v_entry >> 12);
-    if (v_flush_repeat_count >= 4) {
+    if (v_flush_repeat_count < 4) {
+      if (v_curr == v_flush_prev) {
+        v_flush_repeat_count += 1;
+      } else {
+        v_flush_repeat_count = 1;
+      }
+      v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_curr)] ^ ((uint32_t)(v_block_checksum_have << 8)));
+      (wuffs_base__poke_u8be__no_bounds_check(iop_a_dst, v_curr), iop_a_dst += 1);
+    } else {
       v_flush_repeat_count = ((uint32_t)(v_curr));
       while (v_flush_repeat_count > 0) {
         v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_flush_prev)] ^ ((uint32_t)(v_block_checksum_have << 8)));
@@ -25980,14 +25988,6 @@
         v_flush_repeat_count -= 1;
       }
       v_flush_repeat_count = 0;
-    } else if (v_curr != v_flush_prev) {
-      v_flush_repeat_count = 1;
-      v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_curr)] ^ ((uint32_t)(v_block_checksum_have << 8)));
-      (wuffs_base__poke_u8be__no_bounds_check(iop_a_dst, v_curr), iop_a_dst += 1);
-    } else {
-      v_flush_repeat_count += 1;
-      v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_curr)] ^ ((uint32_t)(v_block_checksum_have << 8)));
-      (wuffs_base__poke_u8be__no_bounds_check(iop_a_dst, v_curr), iop_a_dst += 1);
     }
     v_flush_prev = v_curr;
     v_block_size -= 1;
@@ -26057,12 +26057,26 @@
       v_entry = self->private_data.f_bwt[v_flush_pointer];
       v_curr = ((uint8_t)((v_entry & 255)));
       v_flush_pointer = (v_entry >> 12);
-      if (v_flush_repeat_count >= 4) {
+      if (v_flush_repeat_count < 4) {
+        if (v_curr == v_flush_prev) {
+          v_flush_repeat_count += 1;
+        } else {
+          v_flush_repeat_count = 1;
+        }
+        v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_curr)] ^ ((uint32_t)(v_block_checksum_have << 8)));
+        self->private_data.s_flush_slow[0].scratch = v_curr;
+        WUFFS_BASE__COROUTINE_SUSPENSION_POINT(1);
+        if (iop_a_dst == io2_a_dst) {
+          status = wuffs_base__make_status(wuffs_base__suspension__short_write);
+          goto suspend;
+        }
+        *iop_a_dst++ = ((uint8_t)(self->private_data.s_flush_slow[0].scratch));
+      } else {
         v_flush_repeat_count = ((uint32_t)(v_curr));
         while (v_flush_repeat_count > 0) {
           v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_flush_prev)] ^ ((uint32_t)(v_block_checksum_have << 8)));
           self->private_data.s_flush_slow[0].scratch = v_flush_prev;
-          WUFFS_BASE__COROUTINE_SUSPENSION_POINT(1);
+          WUFFS_BASE__COROUTINE_SUSPENSION_POINT(2);
           if (iop_a_dst == io2_a_dst) {
             status = wuffs_base__make_status(wuffs_base__suspension__short_write);
             goto suspend;
@@ -26071,26 +26085,6 @@
           v_flush_repeat_count -= 1;
         }
         v_flush_repeat_count = 0;
-      } else if (v_curr != v_flush_prev) {
-        v_flush_repeat_count = 1;
-        v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_curr)] ^ ((uint32_t)(v_block_checksum_have << 8)));
-        self->private_data.s_flush_slow[0].scratch = v_curr;
-        WUFFS_BASE__COROUTINE_SUSPENSION_POINT(2);
-        if (iop_a_dst == io2_a_dst) {
-          status = wuffs_base__make_status(wuffs_base__suspension__short_write);
-          goto suspend;
-        }
-        *iop_a_dst++ = ((uint8_t)(self->private_data.s_flush_slow[0].scratch));
-      } else {
-        v_flush_repeat_count += 1;
-        v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_curr)] ^ ((uint32_t)(v_block_checksum_have << 8)));
-        self->private_data.s_flush_slow[0].scratch = v_curr;
-        WUFFS_BASE__COROUTINE_SUSPENSION_POINT(3);
-        if (iop_a_dst == io2_a_dst) {
-          status = wuffs_base__make_status(wuffs_base__suspension__short_write);
-          goto suspend;
-        }
-        *iop_a_dst++ = ((uint8_t)(self->private_data.s_flush_slow[0].scratch));
       }
       v_flush_prev = v_curr;
       v_block_size -= 1;
diff --git a/std/bzip2/decode_flush_fast.wuffs b/std/bzip2/decode_flush_fast.wuffs
index a12dfa0..5756a2c 100644
--- a/std/bzip2/decode_flush_fast.wuffs
+++ b/std/bzip2/decode_flush_fast.wuffs
@@ -33,7 +33,18 @@
 		curr = (entry & 0xFF) as base.u8
 		flush_pointer = entry >> 12
 
-		if flush_repeat_count >= 4 {
+		if flush_repeat_count < 4 {
+			if curr == flush_prev {
+				flush_repeat_count += 1
+			} else {
+				flush_repeat_count = 1
+			}
+			block_checksum_have =
+				REV_CRC32_TABLE[((block_checksum_have >> 24) as base.u8) ^ curr] ^
+				(block_checksum_have ~mod<< 8)
+			args.dst.write_u8_fast!(a: curr)
+
+		} else {
 			flush_repeat_count = curr as base.u32
 			while flush_repeat_count > 0,
 				inv block_size > 0,
@@ -47,18 +58,6 @@
 				flush_repeat_count -= 1
 			} endwhile
 			flush_repeat_count = 0
-		} else if curr <> flush_prev {
-			flush_repeat_count = 1
-			block_checksum_have =
-				REV_CRC32_TABLE[((block_checksum_have >> 24) as base.u8) ^ curr] ^
-				(block_checksum_have ~mod<< 8)
-			args.dst.write_u8_fast!(a: curr)
-		} else {
-			flush_repeat_count += 1
-			block_checksum_have =
-				REV_CRC32_TABLE[((block_checksum_have >> 24) as base.u8) ^ curr] ^
-				(block_checksum_have ~mod<< 8)
-			args.dst.write_u8_fast!(a: curr)
 		}
 
 		flush_prev = curr
diff --git a/std/bzip2/decode_flush_slow.wuffs b/std/bzip2/decode_flush_slow.wuffs
index a0674c2..d12dd2e 100644
--- a/std/bzip2/decode_flush_slow.wuffs
+++ b/std/bzip2/decode_flush_slow.wuffs
@@ -33,7 +33,18 @@
 		curr = (entry & 0xFF) as base.u8
 		flush_pointer = entry >> 12
 
-		if flush_repeat_count >= 4 {
+		if flush_repeat_count < 4 {
+			if curr == flush_prev {
+				flush_repeat_count += 1
+			} else {
+				flush_repeat_count = 1
+			}
+			block_checksum_have =
+				REV_CRC32_TABLE[((block_checksum_have >> 24) as base.u8) ^ curr] ^
+				(block_checksum_have ~mod<< 8)
+			args.dst.write_u8?(a: curr)
+
+		} else {
 			flush_repeat_count = curr as base.u32
 			while flush_repeat_count > 0,
 				inv block_size > 0,
@@ -45,18 +56,6 @@
 				flush_repeat_count -= 1
 			} endwhile
 			flush_repeat_count = 0
-		} else if curr <> flush_prev {
-			flush_repeat_count = 1
-			block_checksum_have =
-				REV_CRC32_TABLE[((block_checksum_have >> 24) as base.u8) ^ curr] ^
-				(block_checksum_have ~mod<< 8)
-			args.dst.write_u8?(a: curr)
-		} else {
-			flush_repeat_count += 1
-			block_checksum_have =
-				REV_CRC32_TABLE[((block_checksum_have >> 24) as base.u8) ^ curr] ^
-				(block_checksum_have ~mod<< 8)
-			args.dst.write_u8?(a: curr)
 		}
 
 		flush_prev = curr