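Tweak std/bzip2 flush_* basic blocks

In both decode_flush_fast and decode_flush_slow (and the generated C
snapshot), move the BWT entry load (entry, curr, flush_pointer) from the top
of the loop body into each branch of the "flush_repeat_count < 4" if-else,
and likewise move the trailing "flush_prev = curr" and "block_size -= 1"
updates from after the if-else into each branch. No functional change is
intended; the restructuring only changes the shape of the basic blocks seen
by the C compiler. Benchmark results (mixed across compilers) follow.
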
name old speed new speed delta
wuffs_bzip2_decode_10k/clang11 65.2MB/s ± 0% 65.2MB/s ± 0% ~ (p=0.056 n=5+5)
wuffs_bzip2_decode_100k/clang11 52.4MB/s ± 0% 51.5MB/s ± 0% -1.67% (p=0.016 n=5+4)
wuffs_bzip2_decode_10k/gcc10 60.6MB/s ± 0% 61.7MB/s ± 0% +1.81% (p=0.008 n=5+5)
wuffs_bzip2_decode_100k/gcc10 49.2MB/s ± 0% 49.6MB/s ± 0% +0.86% (p=0.008 n=5+5)
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index afa8408..45e2967 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -25967,10 +25967,10 @@
v_block_checksum_have = self->private_impl.f_block_checksum_have;
v_block_size = self->private_impl.f_block_size;
while ((v_block_size > 0) && (((uint64_t)(io2_a_dst - iop_a_dst)) > 255)) {
- v_entry = self->private_data.f_bwt[v_flush_pointer];
- v_curr = ((uint8_t)((v_entry & 255)));
- v_flush_pointer = (v_entry >> 12);
if (v_flush_repeat_count < 4) {
+ v_entry = self->private_data.f_bwt[v_flush_pointer];
+ v_curr = ((uint8_t)((v_entry & 255)));
+ v_flush_pointer = (v_entry >> 12);
if (v_curr == v_flush_prev) {
v_flush_repeat_count += 1;
} else {
@@ -25978,7 +25978,12 @@
}
v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_curr)] ^ ((uint32_t)(v_block_checksum_have << 8)));
(wuffs_base__poke_u8be__no_bounds_check(iop_a_dst, v_curr), iop_a_dst += 1);
+ v_flush_prev = v_curr;
+ v_block_size -= 1;
} else {
+ v_entry = self->private_data.f_bwt[v_flush_pointer];
+ v_curr = ((uint8_t)((v_entry & 255)));
+ v_flush_pointer = (v_entry >> 12);
v_flush_repeat_count = ((uint32_t)(v_curr));
while (v_flush_repeat_count > 0) {
v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_flush_prev)] ^ ((uint32_t)(v_block_checksum_have << 8)));
@@ -25988,9 +25993,9 @@
v_flush_repeat_count -= 1;
}
v_flush_repeat_count = 0;
+ v_flush_prev = v_curr;
+ v_block_size -= 1;
}
- v_flush_prev = v_curr;
- v_block_size -= 1;
}
self->private_impl.f_flush_pointer = v_flush_pointer;
self->private_impl.f_flush_repeat_count = v_flush_repeat_count;
@@ -26054,10 +26059,10 @@
v_block_checksum_have = self->private_impl.f_block_checksum_have;
v_block_size = self->private_impl.f_block_size;
while ((v_block_size > 0) && ! (self->private_impl.p_flush_slow[0] != 0)) {
- v_entry = self->private_data.f_bwt[v_flush_pointer];
- v_curr = ((uint8_t)((v_entry & 255)));
- v_flush_pointer = (v_entry >> 12);
if (v_flush_repeat_count < 4) {
+ v_entry = self->private_data.f_bwt[v_flush_pointer];
+ v_curr = ((uint8_t)((v_entry & 255)));
+ v_flush_pointer = (v_entry >> 12);
if (v_curr == v_flush_prev) {
v_flush_repeat_count += 1;
} else {
@@ -26071,7 +26076,12 @@
goto suspend;
}
*iop_a_dst++ = ((uint8_t)(self->private_data.s_flush_slow[0].scratch));
+ v_flush_prev = v_curr;
+ v_block_size -= 1;
} else {
+ v_entry = self->private_data.f_bwt[v_flush_pointer];
+ v_curr = ((uint8_t)((v_entry & 255)));
+ v_flush_pointer = (v_entry >> 12);
v_flush_repeat_count = ((uint32_t)(v_curr));
while (v_flush_repeat_count > 0) {
v_block_checksum_have = (WUFFS_BZIP2__REV_CRC32_TABLE[(((uint8_t)((v_block_checksum_have >> 24))) ^ v_flush_prev)] ^ ((uint32_t)(v_block_checksum_have << 8)));
@@ -26085,9 +26095,9 @@
v_flush_repeat_count -= 1;
}
v_flush_repeat_count = 0;
+ v_flush_prev = v_curr;
+ v_block_size -= 1;
}
- v_flush_prev = v_curr;
- v_block_size -= 1;
}
self->private_impl.f_flush_pointer = v_flush_pointer;
self->private_impl.f_flush_repeat_count = v_flush_repeat_count;
diff --git a/std/bzip2/decode_flush_fast.wuffs b/std/bzip2/decode_flush_fast.wuffs
index 5756a2c..c8fe0ef 100644
--- a/std/bzip2/decode_flush_fast.wuffs
+++ b/std/bzip2/decode_flush_fast.wuffs
@@ -29,11 +29,10 @@
block_size = this.block_size
while (block_size > 0) and (args.dst.length() > 255) {
- entry = this.bwt[flush_pointer]
- curr = (entry & 0xFF) as base.u8
- flush_pointer = entry >> 12
-
if flush_repeat_count < 4 {
+ entry = this.bwt[flush_pointer]
+ curr = (entry & 0xFF) as base.u8
+ flush_pointer = entry >> 12
if curr == flush_prev {
flush_repeat_count += 1
} else {
@@ -43,8 +42,13 @@
REV_CRC32_TABLE[((block_checksum_have >> 24) as base.u8) ^ curr] ^
(block_checksum_have ~mod<< 8)
args.dst.write_u8_fast!(a: curr)
+ flush_prev = curr
+ block_size -= 1
} else {
+ entry = this.bwt[flush_pointer]
+ curr = (entry & 0xFF) as base.u8
+ flush_pointer = entry >> 12
flush_repeat_count = curr as base.u32
while flush_repeat_count > 0,
inv block_size > 0,
@@ -58,10 +62,9 @@
flush_repeat_count -= 1
} endwhile
flush_repeat_count = 0
+ flush_prev = curr
+ block_size -= 1
}
-
- flush_prev = curr
- block_size -= 1
} endwhile
this.flush_pointer = flush_pointer
diff --git a/std/bzip2/decode_flush_slow.wuffs b/std/bzip2/decode_flush_slow.wuffs
index d12dd2e..8e0a5f4 100644
--- a/std/bzip2/decode_flush_slow.wuffs
+++ b/std/bzip2/decode_flush_slow.wuffs
@@ -29,11 +29,10 @@
block_size = this.block_size
while (block_size > 0) and (not coroutine_resumed) {
- entry = this.bwt[flush_pointer]
- curr = (entry & 0xFF) as base.u8
- flush_pointer = entry >> 12
-
if flush_repeat_count < 4 {
+ entry = this.bwt[flush_pointer]
+ curr = (entry & 0xFF) as base.u8
+ flush_pointer = entry >> 12
if curr == flush_prev {
flush_repeat_count += 1
} else {
@@ -43,8 +42,13 @@
REV_CRC32_TABLE[((block_checksum_have >> 24) as base.u8) ^ curr] ^
(block_checksum_have ~mod<< 8)
args.dst.write_u8?(a: curr)
+ flush_prev = curr
+ block_size -= 1
} else {
+ entry = this.bwt[flush_pointer]
+ curr = (entry & 0xFF) as base.u8
+ flush_pointer = entry >> 12
flush_repeat_count = curr as base.u32
while flush_repeat_count > 0,
inv block_size > 0,
@@ -56,10 +60,9 @@
flush_repeat_count -= 1
} endwhile
flush_repeat_count = 0
+ flush_prev = curr
+ block_size -= 1
}
-
- flush_prev = curr
- block_size -= 1
} endwhile
this.flush_pointer = flush_pointer