Cache std/bzip2 Huffman tree lookup
name old speed new speed delta
wuffs_bzip2_decode_10k/clang11 46.5MB/s ± 0% 60.5MB/s ± 0% +30.16% (p=0.008 n=5+5)
wuffs_bzip2_decode_100k/clang11 39.4MB/s ± 1% 46.7MB/s ± 1% +18.43% (p=0.008 n=5+5)
wuffs_bzip2_decode_10k/gcc10 43.8MB/s ± 0% 58.7MB/s ± 0% +34.07% (p=0.008 n=5+5)
wuffs_bzip2_decode_100k/gcc10 35.9MB/s ± 0% 46.9MB/s ± 0% +30.92% (p=0.008 n=5+5)
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 3e6581d..84efaa1 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -6963,6 +6963,7 @@
uint8_t f_mtft[256];
uint8_t f_huffman_selectors[32768];
uint16_t f_huffman_trees[6][1024][2];
+ uint16_t f_huffman_tables[6][256];
uint32_t f_bwt[1048576];
struct {
@@ -24939,6 +24940,11 @@
uint32_t a_which);
static wuffs_base__empty_struct
+wuffs_bzip2__decoder__build_huffman_table(
+ wuffs_bzip2__decoder* self,
+ uint32_t a_which);
+
+static wuffs_base__empty_struct
wuffs_bzip2__decoder__invert_bwt(
wuffs_bzip2__decoder* self);
@@ -25047,6 +25053,7 @@
uint32_t v_ticks = 0;
uint32_t v_section = 0;
uint32_t v_run_shift = 0;
+ uint16_t v_table_entry = 0;
uint16_t v_child = 0;
uint32_t v_child_ff = 0;
uint32_t v_i = 0;
@@ -25085,7 +25092,10 @@
v_bits |= (wuffs_base__peek_u32be__no_bounds_check(iop_a_src) >> v_n_bits);
iop_a_src += ((31 - v_n_bits) >> 3);
v_n_bits |= 24;
- v_child = 0;
+ v_table_entry = self->private_data.f_huffman_tables[v_which][(v_bits >> 24)];
+ v_bits <<= (v_table_entry >> 12);
+ v_n_bits -= ((uint32_t)((v_table_entry >> 12)));
+ v_child = (v_table_entry & 2047);
while (v_child < 1024) {
v_child = self->private_data.f_huffman_trees[v_which][v_child][(v_bits >> 31)];
v_bits <<= 1;
@@ -25833,6 +25843,7 @@
status = v_status;
goto exit;
}
+ wuffs_bzip2__decoder__build_huffman_table(self, v_i);
v_i += 1;
}
v_i = 0;
@@ -26086,6 +26097,39 @@
return wuffs_base__make_status(NULL);
}
+// -------- func bzip2.decoder.build_huffman_table
+
+static wuffs_base__empty_struct
+wuffs_bzip2__decoder__build_huffman_table(
+ wuffs_bzip2__decoder* self,
+ uint32_t a_which) {
+ uint32_t v_i = 0;
+ uint32_t v_bits = 0;
+ uint16_t v_n_bits = 0;
+ uint16_t v_child = 0;
+
+ while (v_i < 256) {
+ v_bits = (v_i << 24);
+ v_n_bits = 0;
+ v_child = 0;
+ while ((v_child < 1024) && (v_n_bits < 8)) {
+ v_child = self->private_data.f_huffman_trees[a_which][v_child][(v_bits >> 31)];
+ v_bits <<= 1;
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+ v_n_bits += 1;
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
+ }
+ self->private_data.f_huffman_tables[a_which][v_i] = (v_child | (v_n_bits << 12));
+ v_i += 1;
+ }
+ return wuffs_base__make_empty_struct();
+}
+
// -------- func bzip2.decoder.invert_bwt
static wuffs_base__empty_struct
diff --git a/std/bzip2/decode_block_fast.wuffs b/std/bzip2/decode_block_fast.wuffs
index db9359b..ecd2eee 100644
--- a/std/bzip2/decode_block_fast.wuffs
+++ b/std/bzip2/decode_block_fast.wuffs
@@ -21,13 +21,14 @@
var section : base.u32
var run_shift : base.u32[..= 23]
- var child : base.u16
- var child_ff : base.u32[..= 255]
- var i : base.u32
- var j : base.u32
- var output : base.u32[..= 255]
- var run : base.u32[..= 16_777216] // 16_777216 = (2 << 23)
- var mtft0 : base.u32[..= 255]
+ var table_entry : base.u16
+ var child : base.u16
+ var child_ff : base.u32[..= 255]
+ var i : base.u32
+ var j : base.u32
+ var output : base.u32[..= 255]
+ var run : base.u32[..= 16_777216] // 16_777216 = (2 << 23)
+ var mtft0 : base.u32[..= 255]
bits = this.bits
n_bits = this.n_bits
@@ -51,7 +52,10 @@
args.src.skip_u32_fast!(actual: (31 - n_bits) >> 3, worst_case: 4)
n_bits |= 24
- child = 0
+ table_entry = this.huffman_tables[which][bits >> 24]
+ bits ~mod<<= table_entry >> 12
+ n_bits -= (table_entry >> 12) as base.u32
+ child = table_entry & 0x7FF
while child < 1024 {
child = this.huffman_trees[which][child][bits >> 31]
bits ~mod<<= 1
diff --git a/std/bzip2/decode_bzip2.wuffs b/std/bzip2/decode_bzip2.wuffs
index 1fb899b..c06e073 100644
--- a/std/bzip2/decode_bzip2.wuffs
+++ b/std/bzip2/decode_bzip2.wuffs
@@ -75,6 +75,16 @@
// more rigorous about deriving this upper bound.
huffman_trees : array[6] array[1024] array[2] base.u16,
+ // huffman_tables[which] caches the result of walking huffman_trees[which]
+ // for each possible 8-bit input. The base.u16 value decomposes as:
+ // - The low 11 bits are the final node, where finality comes either from
+ // hitting a leaf node or from spending all 8 bits of the input. As per
+ // the build_huffman_tree method, a value of 0x3FF = 1023 or less means
+ // a branch node and 0x400 = 1024 or more means a leaf node.
+ // - The middle bit is unused.
+ // - The high 4 bits are the number of input bits spent.
+ huffman_tables : array[6] array[256] base.u16,
+
// bwt is the Burrows Wheeler Transform state. Per the README.md file, each
// entry is a row with the low 8 bits holding the L column and the high 20
// bits holding the U column. The middle 4 bits are unused.
@@ -400,6 +410,7 @@
if status.is_error() {
return status
}
+ this.build_huffman_table!(which: i)
i += 1
} endwhile
@@ -632,6 +643,28 @@
return ok
}
+pri func decoder.build_huffman_table!(which: base.u32[..= 5]) {
+ var i : base.u32
+ var bits : base.u32
+ var n_bits : base.u16[..= 8]
+ var child : base.u16
+
+ while i < 256 {
+ bits = i << 24
+ n_bits = 0
+ child = 0
+ while (child < 1024) and (n_bits < 8),
+ inv i < 256,
+ {
+ child = this.huffman_trees[args.which][child][bits >> 31]
+ bits ~mod<<= 1
+ n_bits += 1
+ } endwhile
+ this.huffman_tables[args.which][i] = child | (n_bits << 12)
+ i += 1
+ } endwhile
+}
+
// invert_bwt performs "two passes over the data, and one pass over the
// alphabet", per the BWT technical report, except that the first data pass
// (accumulating this.letter_counts) is integrated into the decode_block_etc