Cache std/bzip2 Huffman tree lookup

name                             old speed      new speed      delta

wuffs_bzip2_decode_10k/clang11   46.5MB/s ± 0%  60.5MB/s ± 0%  +30.16%  (p=0.008 n=5+5)
wuffs_bzip2_decode_100k/clang11  39.4MB/s ± 1%  46.7MB/s ± 1%  +18.43%  (p=0.008 n=5+5)

wuffs_bzip2_decode_10k/gcc10     43.8MB/s ± 0%  58.7MB/s ± 0%  +34.07%  (p=0.008 n=5+5)
wuffs_bzip2_decode_100k/gcc10    35.9MB/s ± 0%  46.9MB/s ± 0%  +30.92%  (p=0.008 n=5+5)
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 3e6581d..84efaa1 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -6963,6 +6963,7 @@
     uint8_t f_mtft[256];
     uint8_t f_huffman_selectors[32768];
     uint16_t f_huffman_trees[6][1024][2];
+    uint16_t f_huffman_tables[6][256];
     uint32_t f_bwt[1048576];
 
     struct {
@@ -24939,6 +24940,11 @@
     uint32_t a_which);
 
 static wuffs_base__empty_struct
+wuffs_bzip2__decoder__build_huffman_table(
+    wuffs_bzip2__decoder* self,
+    uint32_t a_which);
+
+static wuffs_base__empty_struct
 wuffs_bzip2__decoder__invert_bwt(
     wuffs_bzip2__decoder* self);
 
@@ -25047,6 +25053,7 @@
   uint32_t v_ticks = 0;
   uint32_t v_section = 0;
   uint32_t v_run_shift = 0;
+  uint16_t v_table_entry = 0;
   uint16_t v_child = 0;
   uint32_t v_child_ff = 0;
   uint32_t v_i = 0;
@@ -25085,7 +25092,10 @@
     v_bits |= (wuffs_base__peek_u32be__no_bounds_check(iop_a_src) >> v_n_bits);
     iop_a_src += ((31 - v_n_bits) >> 3);
     v_n_bits |= 24;
-    v_child = 0;
+    v_table_entry = self->private_data.f_huffman_tables[v_which][(v_bits >> 24)];
+    v_bits <<= (v_table_entry >> 12);
+    v_n_bits -= ((uint32_t)((v_table_entry >> 12)));
+    v_child = (v_table_entry & 2047);
     while (v_child < 1024) {
       v_child = self->private_data.f_huffman_trees[v_which][v_child][(v_bits >> 31)];
       v_bits <<= 1;
@@ -25833,6 +25843,7 @@
         status = v_status;
         goto exit;
       }
+      wuffs_bzip2__decoder__build_huffman_table(self, v_i);
       v_i += 1;
     }
     v_i = 0;
@@ -26086,6 +26097,39 @@
   return wuffs_base__make_status(NULL);
 }
 
+// -------- func bzip2.decoder.build_huffman_table
+
+static wuffs_base__empty_struct
+wuffs_bzip2__decoder__build_huffman_table(
+    wuffs_bzip2__decoder* self,
+    uint32_t a_which) {
+  uint32_t v_i = 0;
+  uint32_t v_bits = 0;
+  uint16_t v_n_bits = 0;
+  uint16_t v_child = 0;
+
+  while (v_i < 256) {
+    v_bits = (v_i << 24);
+    v_n_bits = 0;
+    v_child = 0;
+    while ((v_child < 1024) && (v_n_bits < 8)) {
+      v_child = self->private_data.f_huffman_trees[a_which][v_child][(v_bits >> 31)];
+      v_bits <<= 1;
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+      v_n_bits += 1;
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
+    }
+    self->private_data.f_huffman_tables[a_which][v_i] = (v_child | (v_n_bits << 12));
+    v_i += 1;
+  }
+  return wuffs_base__make_empty_struct();
+}
+
 // -------- func bzip2.decoder.invert_bwt
 
 static wuffs_base__empty_struct
diff --git a/std/bzip2/decode_block_fast.wuffs b/std/bzip2/decode_block_fast.wuffs
index db9359b..ecd2eee 100644
--- a/std/bzip2/decode_block_fast.wuffs
+++ b/std/bzip2/decode_block_fast.wuffs
@@ -21,13 +21,14 @@
 	var section    : base.u32
 	var run_shift  : base.u32[..= 23]
 
-	var child    : base.u16
-	var child_ff : base.u32[..= 255]
-	var i        : base.u32
-	var j        : base.u32
-	var output   : base.u32[..= 255]
-	var run      : base.u32[..= 16_777216]  // 16_777216 = (2 << 23)
-	var mtft0    : base.u32[..= 255]
+	var table_entry : base.u16
+	var child       : base.u16
+	var child_ff    : base.u32[..= 255]
+	var i           : base.u32
+	var j           : base.u32
+	var output      : base.u32[..= 255]
+	var run         : base.u32[..= 16_777216]  // 16_777216 = (2 << 23)
+	var mtft0       : base.u32[..= 255]
 
 	bits = this.bits
 	n_bits = this.n_bits
@@ -51,7 +52,10 @@
 		args.src.skip_u32_fast!(actual: (31 - n_bits) >> 3, worst_case: 4)
 		n_bits |= 24
 
-		child = 0
+		table_entry = this.huffman_tables[which][bits >> 24]
+		bits ~mod<<= table_entry >> 12
+		n_bits -= (table_entry >> 12) as base.u32
+		child = table_entry & 0x7FF
 		while child < 1024 {
 			child = this.huffman_trees[which][child][bits >> 31]
 			bits ~mod<<= 1
diff --git a/std/bzip2/decode_bzip2.wuffs b/std/bzip2/decode_bzip2.wuffs
index 1fb899b..c06e073 100644
--- a/std/bzip2/decode_bzip2.wuffs
+++ b/std/bzip2/decode_bzip2.wuffs
@@ -75,6 +75,16 @@
 	// more rigorous about deriving this upper bound.
 	huffman_trees : array[6] array[1024] array[2] base.u16,
 
+	// huffman_tables[which] caches the result of walking huffman_trees[which]
+	// for each possible 8-bit input. The base.u16 value decomposes as:
+	//  - The low 11 bits are the final node, where finality comes either from
+	//    hitting a leaf node or from spending all 8 bits of the input. As per
+	//    the build_huffman_tree method, a value of 0x3FF = 1023 or less means
+	//    a branch node and 0x400 = 1024 or more means a leaf node.
+	//  - The middle bit is unused.
+	//  - The high 4 bits are the number of input bits spent.
+	huffman_tables : array[6] array[256] base.u16,
+
 	// bwt is the Burrows Wheeler Transform state. Per the README.md file, each
 	// entry is a row with the low 8 bits holding the L column and the high 20
 	// bits holding the U column. The middle 4 bits are unused.
@@ -400,6 +410,7 @@
 		if status.is_error() {
 			return status
 		}
+		this.build_huffman_table!(which: i)
 		i += 1
 	} endwhile
 
@@ -632,6 +643,28 @@
 	return ok
 }
 
+pri func decoder.build_huffman_table!(which: base.u32[..= 5]) {
+	var i      : base.u32
+	var bits   : base.u32
+	var n_bits : base.u16[..= 8]
+	var child  : base.u16
+
+	while i < 256 {
+		bits = i << 24
+		n_bits = 0
+		child = 0
+		while (child < 1024) and (n_bits < 8),
+			inv i < 256,
+		{
+			child = this.huffman_trees[args.which][child][bits >> 31]
+			bits ~mod<<= 1
+			n_bits += 1
+		} endwhile
+		this.huffman_tables[args.which][i] = child | (n_bits << 12)
+		i += 1
+	} endwhile
+}
+
 // invert_bwt performs "two passes over the data, and one pass over the
 // alphabet", per the BWT technical report, except that the first data pass
 // (accumulating this.letter_counts) is integrated into the decode_block_etc