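Optimize std/bzip2 flush_repeat_count checks

Test the common case (flush_repeat_count < 4) first, and merge the two
branches that emit curr (curr == flush_prev and curr <> flush_prev) into
one, since they differed only in how they updated flush_repeat_count.
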
name                             old speed      new speed      delta
wuffs_bzip2_decode_10k/clang11   60.5MB/s ± 0%  65.2MB/s ± 0%   +7.85%  (p=0.008 n=5+5)
wuffs_bzip2_decode_100k/clang11  45.7MB/s ± 1%  52.4MB/s ± 0%  +14.68%  (p=0.008 n=5+5)
wuffs_bzip2_decode_10k/gcc10     58.5MB/s ± 0%  60.6MB/s ± 0%   +3.67%  (p=0.008 n=5+5)
wuffs_bzip2_decode_100k/gcc10    46.3MB/s ± 0%  49.2MB/s ± 0%   +6.31%  (p=0.008 n=5+5)
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index b5df137..afa8408 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -25970,7 +25970,15 @@
v_entry = self->private_data.f_bwt[v_flush_pointer];
v_curr = ((uint8_t)((v_entry & 255)));
v_flush_pointer = (v_entry >> 12);
- if (v_flush_repeat_count >= 4) {
+ if (v_flush_repeat_count < 4) {
+ if (v_curr == v_flush_prev) {
+ v_flush_repeat_count += 1;
+ } else {
+ v_flush_repeat_count = 1;
+ }
+ v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_curr)] ^ ((uint32_t)(v_block_checksum_have << 8)));
+ (wuffs_base__poke_u8be__no_bounds_check(iop_a_dst, v_curr), iop_a_dst += 1);
+ } else {
v_flush_repeat_count = ((uint32_t)(v_curr));
while (v_flush_repeat_count > 0) {
v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_flush_prev)] ^ ((uint32_t)(v_block_checksum_have << 8)));
@@ -25980,14 +25988,6 @@
v_flush_repeat_count -= 1;
}
v_flush_repeat_count = 0;
- } else if (v_curr != v_flush_prev) {
- v_flush_repeat_count = 1;
- v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_curr)] ^ ((uint32_t)(v_block_checksum_have << 8)));
- (wuffs_base__poke_u8be__no_bounds_check(iop_a_dst, v_curr), iop_a_dst += 1);
- } else {
- v_flush_repeat_count += 1;
- v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_curr)] ^ ((uint32_t)(v_block_checksum_have << 8)));
- (wuffs_base__poke_u8be__no_bounds_check(iop_a_dst, v_curr), iop_a_dst += 1);
}
v_flush_prev = v_curr;
v_block_size -= 1;
@@ -26057,12 +26057,26 @@
v_entry = self->private_data.f_bwt[v_flush_pointer];
v_curr = ((uint8_t)((v_entry & 255)));
v_flush_pointer = (v_entry >> 12);
- if (v_flush_repeat_count >= 4) {
+ if (v_flush_repeat_count < 4) {
+ if (v_curr == v_flush_prev) {
+ v_flush_repeat_count += 1;
+ } else {
+ v_flush_repeat_count = 1;
+ }
+ v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_curr)] ^ ((uint32_t)(v_block_checksum_have << 8)));
+ self->private_data.s_flush_slow[0].scratch = v_curr;
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT(1);
+ if (iop_a_dst == io2_a_dst) {
+ status = wuffs_base__make_status(wuffs_base__suspension__short_write);
+ goto suspend;
+ }
+ *iop_a_dst++ = ((uint8_t)(self->private_data.s_flush_slow[0].scratch));
+ } else {
v_flush_repeat_count = ((uint32_t)(v_curr));
while (v_flush_repeat_count > 0) {
v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_flush_prev)] ^ ((uint32_t)(v_block_checksum_have << 8)));
self->private_data.s_flush_slow[0].scratch = v_flush_prev;
- WUFFS_BASE__COROUTINE_SUSPENSION_POINT(1);
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT(2);
if (iop_a_dst == io2_a_dst) {
status = wuffs_base__make_status(wuffs_base__suspension__short_write);
goto suspend;
@@ -26071,26 +26085,6 @@
v_flush_repeat_count -= 1;
}
v_flush_repeat_count = 0;
- } else if (v_curr != v_flush_prev) {
- v_flush_repeat_count = 1;
- v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_curr)] ^ ((uint32_t)(v_block_checksum_have << 8)));
- self->private_data.s_flush_slow[0].scratch = v_curr;
- WUFFS_BASE__COROUTINE_SUSPENSION_POINT(2);
- if (iop_a_dst == io2_a_dst) {
- status = wuffs_base__make_status(wuffs_base__suspension__short_write);
- goto suspend;
- }
- *iop_a_dst++ = ((uint8_t)(self->private_data.s_flush_slow[0].scratch));
- } else {
- v_flush_repeat_count += 1;
- v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_curr)] ^ ((uint32_t)(v_block_checksum_have << 8)));
- self->private_data.s_flush_slow[0].scratch = v_curr;
- WUFFS_BASE__COROUTINE_SUSPENSION_POINT(3);
- if (iop_a_dst == io2_a_dst) {
- status = wuffs_base__make_status(wuffs_base__suspension__short_write);
- goto suspend;
- }
- *iop_a_dst++ = ((uint8_t)(self->private_data.s_flush_slow[0].scratch));
}
v_flush_prev = v_curr;
v_block_size -= 1;
diff --git a/std/bzip2/decode_flush_fast.wuffs b/std/bzip2/decode_flush_fast.wuffs
index a12dfa0..5756a2c 100644
--- a/std/bzip2/decode_flush_fast.wuffs
+++ b/std/bzip2/decode_flush_fast.wuffs
@@ -33,7 +33,18 @@
curr = (entry & 0xFF) as base.u8
flush_pointer = entry >> 12
- if flush_repeat_count >= 4 {
+ if flush_repeat_count < 4 {
+ if curr == flush_prev {
+ flush_repeat_count += 1
+ } else {
+ flush_repeat_count = 1
+ }
+ block_checksum_have =
+ REV_CRC32_TABLE[((block_checksum_have >> 24) as base.u8) ^ curr] ^
+ (block_checksum_have ~mod<< 8)
+ args.dst.write_u8_fast!(a: curr)
+
+ } else {
flush_repeat_count = curr as base.u32
while flush_repeat_count > 0,
inv block_size > 0,
@@ -47,18 +58,6 @@
flush_repeat_count -= 1
} endwhile
flush_repeat_count = 0
- } else if curr <> flush_prev {
- flush_repeat_count = 1
- block_checksum_have =
- REV_CRC32_TABLE[((block_checksum_have >> 24) as base.u8) ^ curr] ^
- (block_checksum_have ~mod<< 8)
- args.dst.write_u8_fast!(a: curr)
- } else {
- flush_repeat_count += 1
- block_checksum_have =
- REV_CRC32_TABLE[((block_checksum_have >> 24) as base.u8) ^ curr] ^
- (block_checksum_have ~mod<< 8)
- args.dst.write_u8_fast!(a: curr)
}
flush_prev = curr
diff --git a/std/bzip2/decode_flush_slow.wuffs b/std/bzip2/decode_flush_slow.wuffs
index a0674c2..d12dd2e 100644
--- a/std/bzip2/decode_flush_slow.wuffs
+++ b/std/bzip2/decode_flush_slow.wuffs
@@ -33,7 +33,18 @@
curr = (entry & 0xFF) as base.u8
flush_pointer = entry >> 12
- if flush_repeat_count >= 4 {
+ if flush_repeat_count < 4 {
+ if curr == flush_prev {
+ flush_repeat_count += 1
+ } else {
+ flush_repeat_count = 1
+ }
+ block_checksum_have =
+ REV_CRC32_TABLE[((block_checksum_have >> 24) as base.u8) ^ curr] ^
+ (block_checksum_have ~mod<< 8)
+ args.dst.write_u8?(a: curr)
+
+ } else {
flush_repeat_count = curr as base.u32
while flush_repeat_count > 0,
inv block_size > 0,
@@ -45,18 +56,6 @@
flush_repeat_count -= 1
} endwhile
flush_repeat_count = 0
- } else if curr <> flush_prev {
- flush_repeat_count = 1
- block_checksum_have =
- REV_CRC32_TABLE[((block_checksum_have >> 24) as base.u8) ^ curr] ^
- (block_checksum_have ~mod<< 8)
- args.dst.write_u8?(a: curr)
- } else {
- flush_repeat_count += 1
- block_checksum_have =
- REV_CRC32_TABLE[((block_checksum_have >> 24) as base.u8) ^ curr] ^
- (block_checksum_have ~mod<< 8)
- args.dst.write_u8?(a: curr)
}
flush_prev = curr