Tighten std/bzip2 huffman_trees array size
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 689f314..5d3504a 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -6828,7 +6828,6 @@
extern const char wuffs_bzip2__error__bad_checksum[];
extern const char wuffs_bzip2__error__bad_header[];
extern const char wuffs_bzip2__error__bad_number_of_sections[];
-extern const char wuffs_bzip2__error__unsupported_huffman_code[];
extern const char wuffs_bzip2__error__unsupported_block_randomization[];
// ---------------- Public Consts
@@ -6966,7 +6965,7 @@
uint8_t f_presence[256];
uint8_t f_mtft[256];
uint8_t f_huffman_selectors[32768];
- uint16_t f_huffman_trees[6][1024][2];
+ uint16_t f_huffman_trees[6][257][2];
uint16_t f_huffman_tables[6][256];
uint32_t f_bwt[1048576];
@@ -24866,7 +24865,6 @@
const char wuffs_bzip2__error__bad_checksum[] = "#bzip2: bad checksum";
const char wuffs_bzip2__error__bad_header[] = "#bzip2: bad header";
const char wuffs_bzip2__error__bad_number_of_sections[] = "#bzip2: bad number of sections";
-const char wuffs_bzip2__error__unsupported_huffman_code[] = "#bzip2: unsupported Huffman code";
const char wuffs_bzip2__error__unsupported_block_randomization[] = "#bzip2: unsupported block randomization";
const char wuffs_bzip2__error__internal_error_inconsistent_huffman_decoder_state[] = "#bzip2: internal error: inconsistent Huffman decoder state";
@@ -25839,8 +25837,8 @@
} else {
self->private_data.f_huffman_trees[a_which][v_node_index][1] = ((uint16_t)(v_num_nodes));
}
- if (v_num_nodes >= 1023) {
- return wuffs_base__make_status(wuffs_bzip2__error__unsupported_huffman_code);
+ if (v_num_nodes >= 257) {
+ return wuffs_base__make_status(wuffs_bzip2__error__bad_huffman_code_under_subscribed);
}
v_stack_values[v_stack_height] = v_num_nodes;
self->private_data.f_huffman_trees[a_which][v_num_nodes][0] = 0;
@@ -25896,7 +25894,7 @@
v_bits = (v_i << 24);
v_n_bits = 0;
v_child = 0;
- while ((v_child < 1024) && (v_n_bits < 8)) {
+ while ((v_child < 257) && (v_n_bits < 8)) {
v_child = self->private_data.f_huffman_trees[a_which][v_child][(v_bits >> 31)];
v_bits <<= 1;
#if defined(__GNUC__)
@@ -26203,7 +26201,7 @@
v_bits <<= (v_table_entry >> 12);
v_n_bits -= ((uint32_t)((v_table_entry >> 12)));
v_child = (v_table_entry & 2047);
- while (v_child < 1024) {
+ while (v_child < 257) {
v_child = self->private_data.f_huffman_trees[v_which][v_child][(v_bits >> 31)];
v_bits <<= 1;
if (v_n_bits <= 0) {
@@ -26338,7 +26336,7 @@
v_child = self->private_data.f_huffman_trees[self->private_impl.f_decode_huffman_which][v_node_index][(self->private_impl.f_bits >> 31)];
self->private_impl.f_bits <<= 1;
self->private_impl.f_n_bits -= 1;
- if (v_child < 1024) {
+ if (v_child < 257) {
v_node_index = ((uint32_t)(v_child));
goto label__0__continue;
} else if (v_child < 1280) {
diff --git a/std/bzip2/decode_bzip2.wuffs b/std/bzip2/decode_bzip2.wuffs
index e13021b..1e9f6c0 100644
--- a/std/bzip2/decode_bzip2.wuffs
+++ b/std/bzip2/decode_bzip2.wuffs
@@ -19,7 +19,6 @@
pub status "#bad checksum"
pub status "#bad header"
pub status "#bad number of sections"
-pub status "#unsupported Huffman code"
pub status "#unsupported block randomization"
pri status "#internal error: inconsistent Huffman decoder state"
@@ -73,12 +72,22 @@
// 50-symbol section.
huffman_selectors : array[32768] base.u8,
- // Each array[2] base.u16 is a Huffman tree node. The build_huffman_tree
- // method comment details what the base.u16 values mean.
+ // Each array[2] base.u16 is a branch node in the Huffman tree. The
+ // build_huffman_tree method comment details what the base.u16 values mean.
//
- // 1024 is just a guess for the worst case number of nodes needed. TODO: be
- // more rigorous about deriving this upper bound.
- huffman_trees : array[6] array[1024] array[2] base.u16,
+ // Each Huffman tree has num_symbols leaf nodes and hence (num_symbols - 1)
+ // branch nodes, also called internal nodes, as per
+ // https://stackoverflow.com/questions/69399994/maximum-number-of-nodes-in-a-huffman-tree/69401083#69401083
+ //
+ // This can be proven inductively. The base case Huffman code (two symbols)
+ // has one branch node (the root node). The inductive case, adding one
+ // symbol to a (complete but possibly non-canonical) Huffman code replaces
+ // one leaf node with one branch node (and two leaf nodes). The net gain in
+ // branch nodes is +1.
+ //
+ // Since num_symbols' inclusive maximum is 258 (for RUNA, RUNB, 255 × mXYZ,
+ // EOB), there are at most 257 branch nodes.
+ huffman_trees : array[6] array[257] array[2] base.u16,
// huffman_tables[which] caches the result of walking huffman_trees[which]
// for each possible 8-bit input. The base.u16 value decomposes as:
@@ -550,9 +559,11 @@
// - "110" 0x402 (m02)
// - "111" 0x7FF (EOB)
//
-// The Huffman tree has 5 nodes. Each node has two children, represented by a
-// base.u16 value. 1023 or less means that the child is a branch node. 1024 =
-// 0x400 or above means a leaf node (with the symbol values as above).
+// The Huffman tree has 5 branch nodes (where 5+1 is both the number of leaf
+// nodes and the number of symbols). Each branch node has two children,
+// represented by a base.u16 value. 256 = 0x100 or less means that the child is
+// a branch node. 1024 = 0x400 or above means a leaf node (with the symbol
+// values as above).
// - node0 = [0x001 (node1), 0x002 (node2)]
// - node1 = [0x401 (m01) , 0x403 (m03) ]
// - node2 = [0x003 (node3), 0x004 (node4)]
@@ -580,10 +591,10 @@
pri func decoder.build_huffman_tree!(which: base.u32[..= 5]) base.status {
var code_length : base.u32
var symbol_index : base.u32
- var num_nodes : base.u32[..= 1023]
+ var num_nodes : base.u32[..= 257]
var stack_height : base.u32[..= 21]
- var stack_values : array[21] base.u32[..= 1023]
- var node_index : base.u32[..= 1023]
+ var stack_values : array[21] base.u32[..= 256]
+ var node_index : base.u32[..= 256]
var leaf_value : base.u16
// Push a root node.
@@ -627,8 +638,8 @@
} else {
this.huffman_trees[args.which][node_index][1] = num_nodes as base.u16
}
- if num_nodes >= 1023 {
- return "#unsupported Huffman code"
+ if num_nodes >= 257 {
+ return "#bad Huffman code (under-subscribed)"
}
stack_values[stack_height] = num_nodes
this.huffman_trees[args.which][num_nodes][0] = 0
@@ -684,7 +695,7 @@
bits = i << 24
n_bits = 0
child = 0
- while (child < 1024) and (n_bits < 8),
+ while (child < 257) and (n_bits < 8),
inv i < 256,
{
child = this.huffman_trees[args.which][child][bits >> 31]
diff --git a/std/bzip2/decode_huffman_fast.wuffs b/std/bzip2/decode_huffman_fast.wuffs
index c4e5df0..05e5b8c 100644
--- a/std/bzip2/decode_huffman_fast.wuffs
+++ b/std/bzip2/decode_huffman_fast.wuffs
@@ -59,7 +59,7 @@
bits ~mod<<= table_entry >> 12
n_bits -= (table_entry >> 12) as base.u32
child = table_entry & 0x7FF
- while child < 1024 {
+ while child < 257 {
child = this.huffman_trees[which][child][bits >> 31]
bits ~mod<<= 1
if n_bits <= 0 {
diff --git a/std/bzip2/decode_huffman_slow.wuffs b/std/bzip2/decode_huffman_slow.wuffs
index e22cb03..5c41a83 100644
--- a/std/bzip2/decode_huffman_slow.wuffs
+++ b/std/bzip2/decode_huffman_slow.wuffs
@@ -14,7 +14,7 @@
pri func decoder.decode_huffman_slow?(src: base.io_reader) {
var c : base.u8
- var node_index : base.u32[..= 1023]
+ var node_index : base.u32[..= 256]
var child : base.u16
var child_ff : base.u32[..= 255]
@@ -49,7 +49,7 @@
this.bits ~mod<<= 1
this.n_bits -= 1
- if child < 1024 { // Branch node.
+ if child < 257 { // Branch node.
node_index = child as base.u32
continue
diff --git a/test/c/std/bzip2.c b/test/c/std/bzip2.c
index 5e5ccaa..8fe938a 100644
--- a/test/c/std/bzip2.c
+++ b/test/c/std/bzip2.c
@@ -79,6 +79,11 @@
.src_filename = "test/data/artificial-bzip2/bad-number-of-sections.bz2",
};
+golden_test g_bzip2_huffman_258_gt = {
+ .want_filename = "test/data/abraca.txt",
+ .src_filename = "test/data/artificial-bzip2/huffman-258.bz2",
+};
+
golden_test g_bzip2_midsummer_gt = {
.want_filename = "test/data/midsummer.txt",
.src_filename = "test/data/midsummer.txt.bz2",
@@ -156,6 +161,13 @@
}
const char* //
+test_wuffs_bzip2_decode_huffman_258() {
+ CHECK_FOCUS(__func__);
+ return do_test_io_buffers(wuffs_bzip2_decode, &g_bzip2_huffman_258_gt,
+ UINT64_MAX, UINT64_MAX);
+}
+
+const char* //
test_wuffs_bzip2_decode_midsummer() {
CHECK_FOCUS(__func__);
return do_test_io_buffers(wuffs_bzip2_decode, &g_bzip2_midsummer_gt,
@@ -246,6 +258,7 @@
test_wuffs_bzip2_decode_256_bytes,
test_wuffs_bzip2_decode_bad_number_of_sections,
+ test_wuffs_bzip2_decode_huffman_258,
test_wuffs_bzip2_decode_interface,
test_wuffs_bzip2_decode_midsummer,
test_wuffs_bzip2_decode_pi,
diff --git a/test/data/artificial-bzip2/abraca.txt.bz2.make-artificial.txt b/test/data/artificial-bzip2/abraca.txt.bz2.make-artificial.txt
index 4097152..8788369 100644
--- a/test/data/artificial-bzip2/abraca.txt.bz2.make-artificial.txt
+++ b/test/data/artificial-bzip2/abraca.txt.bz2.make-artificial.txt
@@ -30,8 +30,9 @@
bits 1 0x0
# Bitslices (n) ..= (o)
-bits 19 0x0CD34
-bits 19 0x0CD34
+repeat 2 [
+ bits 19 0x0CD34
+]
# Bitslice (p)
bits 3 0x6
diff --git a/test/data/artificial-bzip2/bad-number-of-sections.bz2.make-artificial.txt b/test/data/artificial-bzip2/bad-number-of-sections.bz2.make-artificial.txt
index 7722919..4c9cb8e 100644
--- a/test/data/artificial-bzip2/bad-number-of-sections.bz2.make-artificial.txt
+++ b/test/data/artificial-bzip2/bad-number-of-sections.bz2.make-artificial.txt
@@ -26,8 +26,9 @@
bits 1 0x0
# Bitslices (n) ..= (o)
-bits 19 0x0CD34
-bits 19 0x0CD34
+repeat 2 [
+ bits 19 0x0CD34
+]
# Bitslice (p)
repeat 50 [
diff --git a/test/data/artificial-bzip2/huffman-258.bz2 b/test/data/artificial-bzip2/huffman-258.bz2
new file mode 100644
index 0000000..29d1823
--- /dev/null
+++ b/test/data/artificial-bzip2/huffman-258.bz2
Binary files differ
diff --git a/test/data/artificial-bzip2/huffman-258.bz2.make-artificial.txt b/test/data/artificial-bzip2/huffman-258.bz2.make-artificial.txt
new file mode 100644
index 0000000..dbd8dd3
--- /dev/null
+++ b/test/data/artificial-bzip2/huffman-258.bz2.make-artificial.txt
@@ -0,0 +1,83 @@
+# Feed this file to script/make-artificial.go
+
+# This modifies abraca.txt.bz2.make-artificial.txt to produce the same
+# decompressed output, "abraca", but in a more elaborate way. The Huffman codes
+# here have the full 258 symbols instead of just 6.
+
+make bzip2
+
+# Bitslices (a) ..= (g)
+bits 16 0x425A
+bits 8 0x68
+bits 8 0x39
+bits 48 0x314159265359
+bits 32 0x76A70995
+bits 1 0x0
+bits 24 0x000001
+
+# Bitslices (h) ..= (j)
+bits 16 0xFFFF
+repeat 16 [
+ bits 16 0xFFFF
+]
+
+# Bitslices (k) ..= (m)
+bits 3 0x2
+bits 15 0x0001
+bits 1 0x0
+
+# Bitslices (n) ..= (o)
+#
+# Both Huffman codes have 258 symbols: 254 × 8-bit symbols and then 4 × 9-bit
+# symbols.
+#
+# For small XYZ (where XYZ < 253), the mXYZ symbol's bitstring is (XYZ+1) in
+# binary. For example, the m099 symbol's bitstring is "01100100", because the
+# binary representation of 0d099+1 = 0d100 = 0x64 = 0b01100100.
+#
+# RUNA is the 8-bit bitstring "00000000".
+# RUNB is the 8-bit bitstring "00000001".
+# m001 is the 8-bit bitstring "00000010".
+# m002 is the 8-bit bitstring "00000011".
+# m003 is the 8-bit bitstring "00000100".
+# etc
+# m250 is the 8-bit bitstring "11111011".
+# m251 is the 8-bit bitstring "11111100".
+# m252 is the 8-bit bitstring "11111101".
+# m253 is the 9-bit bitstring "111111100".
+# m254 is the 9-bit bitstring "111111101".
+# m255 is the 9-bit bitstring "111111110".
+# EOB is the 9-bit bitstring "111111111".
+repeat 2 [
+ bits 5 0x08
+ repeat 254 [
+ bits 1 0x0
+ ]
+ bits 3 0x4
+ bits 1 0x0
+ bits 1 0x0
+ bits 1 0x0
+]
+
+# Bitslice (p)
+#
+# There are 7 symbols.
+# Order = [0x00, 0x01, 0x02, 0x03, ... 0xFF] m099 picks 'c'. 0d099+1 = 0x64.
+# Order = ['c', 0x00, 0x01, 0x02, ... 0xFF] m098 picks 'a'. 0d098+1 = 0x63.
+# Order = ['a', 'c', 0x00, 0x01, ... 0xFF] m114 picks 'r'. 0d114+1 = 0x73.
+# Order = ['r', 'a', 'c', 0x00, ... 0xFF] m001 picks 'a'. 0d001+1 = 0x02.
+# Order = ['a', 'r', 'c', 0x00, ... 0xFF] RUNA picks 'a'.
+# Order = ['a', 'r', 'c', 0x00, ... 0xFF] m100 picks 'b'. 0d100+1 = 0x65.
+# Order = ['b', 'a', 'r', 'c', ... 0xFF] EOB.
+bits 8 0x64
+bits 8 0x63
+bits 8 0x73
+bits 8 0x02
+bits 8 0x00
+bits 8 0x65
+bits 9 0x1FF
+
+# Bitslices (q) ..= (s)
+bits 48 0x177245385090
+bits 32 0x76A70995
+bits 1 0x0