Have std/bzip2 read 4 (not 1) bytes per inner loop
name old speed new speed delta
wuffs_bzip2_decode_10k/clang11 42.9MB/s ± 0% 46.5MB/s ± 0% +8.38% (p=0.008 n=5+5)
wuffs_bzip2_decode_100k/clang11 36.9MB/s ± 0% 39.4MB/s ± 1% +6.75% (p=0.008 n=5+5)
wuffs_bzip2_decode_10k/gcc10 36.5MB/s ± 0% 43.8MB/s ± 0% +19.73% (p=0.008 n=5+5)
wuffs_bzip2_decode_100k/gcc10 30.8MB/s ± 0% 35.9MB/s ± 0% +16.37% (p=0.008 n=5+5)
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index c31dcc1..3e6581d 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -6934,8 +6934,13 @@
uint32_t f_bits;
uint32_t f_n_bits;
- uint32_t f_block_size;
uint32_t f_max_incl_block_size;
+ uint32_t f_block_size;
+ bool f_decode_block_finished;
+ uint8_t f_decode_block_which;
+ uint32_t f_decode_block_ticks;
+ uint32_t f_decode_block_section;
+ uint32_t f_decode_block_run_shift;
uint32_t f_final_checksum_have;
uint32_t f_block_checksum_want;
uint32_t f_original_pointer;
@@ -6961,12 +6966,7 @@
uint32_t f_bwt[1048576];
struct {
- uint8_t v_which;
- uint32_t v_ticks;
- uint32_t v_section;
- uint32_t v_bs;
uint32_t v_node_index;
- uint32_t v_run_shift;
} s_decode_block_slow[1];
struct {
uint32_t v_i;
@@ -24864,6 +24864,7 @@
const char wuffs_bzip2__error__bad_number_of_sections[] = "#bzip2: bad number of sections";
const char wuffs_bzip2__error__unsupported_huffman_code[] = "#bzip2: unsupported Huffman code";
const char wuffs_bzip2__error__unsupported_block_randomization[] = "#bzip2: unsupported block randomization";
+const char wuffs_bzip2__error__internal_error_inconsistent_huffman_decoder_state[] = "#bzip2: internal error: inconsistent Huffman decoder state";
// ---------------- Private Consts
@@ -24913,6 +24914,11 @@
// ---------------- Private Function Prototypes
static wuffs_base__status
+wuffs_bzip2__decoder__decode_block_fast(
+ wuffs_bzip2__decoder* self,
+ wuffs_base__io_buffer* a_src);
+
+static wuffs_base__status
wuffs_bzip2__decoder__decode_block_slow(
wuffs_bzip2__decoder* self,
wuffs_base__io_buffer* a_src);
@@ -25026,6 +25032,128 @@
// ---------------- Function Implementations
+// -------- func bzip2.decoder.decode_block_fast
+
+static wuffs_base__status
+wuffs_bzip2__decoder__decode_block_fast(
+ wuffs_bzip2__decoder* self,
+ wuffs_base__io_buffer* a_src) {
+ wuffs_base__status status = wuffs_base__make_status(NULL);
+
+ uint32_t v_bits = 0;
+ uint32_t v_n_bits = 0;
+ uint32_t v_block_size = 0;
+ uint8_t v_which = 0;
+ uint32_t v_ticks = 0;
+ uint32_t v_section = 0;
+ uint32_t v_run_shift = 0;
+ uint16_t v_child = 0;
+ uint32_t v_child_ff = 0;
+ uint32_t v_i = 0;
+ uint32_t v_j = 0;
+ uint32_t v_output = 0;
+ uint32_t v_run = 0;
+ uint32_t v_mtft0 = 0;
+
+ const uint8_t* iop_a_src = NULL;
+ const uint8_t* io0_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+ const uint8_t* io1_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+ const uint8_t* io2_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+ if (a_src) {
+ io0_a_src = a_src->data.ptr;
+ io1_a_src = io0_a_src + a_src->meta.ri;
+ iop_a_src = io1_a_src;
+ io2_a_src = io0_a_src + a_src->meta.wi;
+ }
+
+ v_bits = self->private_impl.f_bits;
+ v_n_bits = self->private_impl.f_n_bits;
+ v_block_size = self->private_impl.f_block_size;
+ v_which = self->private_impl.f_decode_block_which;
+ v_ticks = self->private_impl.f_decode_block_ticks;
+ v_section = self->private_impl.f_decode_block_section;
+ v_run_shift = self->private_impl.f_decode_block_run_shift;
+ label__outer__continue:;
+ while (((uint64_t)(io2_a_src - iop_a_src)) >= 4) {
+ if (v_ticks > 0) {
+ v_ticks -= 1;
+ } else {
+ v_ticks = 49;
+ v_section += 1;
+ v_which = WUFFS_BZIP2__CLAMP_TO_5[(self->private_data.f_huffman_selectors[(v_section & 32767)] & 7)];
+ }
+ v_bits |= (wuffs_base__peek_u32be__no_bounds_check(iop_a_src) >> v_n_bits);
+ iop_a_src += ((31 - v_n_bits) >> 3);
+ v_n_bits |= 24;
+ v_child = 0;
+ while (v_child < 1024) {
+ v_child = self->private_data.f_huffman_trees[v_which][v_child][(v_bits >> 31)];
+ v_bits <<= 1;
+ if (v_n_bits <= 0) {
+ status = wuffs_base__make_status(wuffs_bzip2__error__internal_error_inconsistent_huffman_decoder_state);
+ goto exit;
+ }
+ v_n_bits -= 1;
+ }
+ if (v_child < 1280) {
+ v_child_ff = ((uint32_t)((v_child & 255)));
+ v_output = ((uint32_t)(self->private_data.f_mtft[v_child_ff]));
+ wuffs_base__slice_u8__copy_from_slice(wuffs_base__make_slice_u8_ij(self->private_data.f_mtft, 1, (1 + v_child_ff)), wuffs_base__make_slice_u8(self->private_data.f_mtft, v_child_ff));
+ self->private_data.f_mtft[0] = ((uint8_t)(v_output));
+ self->private_data.f_letter_counts[v_output] += 1;
+ self->private_data.f_bwt[v_block_size] = v_output;
+ if (v_block_size >= self->private_impl.f_max_incl_block_size) {
+ status = wuffs_base__make_status(wuffs_bzip2__error__bad_block_length);
+ goto exit;
+ }
+ v_block_size += 1;
+ v_run_shift = 0;
+ goto label__outer__continue;
+ } else if (v_child > 1281) {
+ self->private_impl.f_decode_block_finished = true;
+ goto label__outer__break;
+ }
+ if (v_run_shift >= 23) {
+ status = wuffs_base__make_status(wuffs_bzip2__error__bad_block_length);
+ goto exit;
+ }
+ v_run = (((uint32_t)((v_child - 1279))) << v_run_shift);
+ v_run_shift += 1;
+ v_i = v_block_size;
+ v_j = (v_run + v_block_size);
+ if (v_j > self->private_impl.f_max_incl_block_size) {
+ status = wuffs_base__make_status(wuffs_bzip2__error__bad_block_length);
+ goto exit;
+ }
+ v_block_size = v_j;
+ v_mtft0 = ((uint32_t)(self->private_data.f_mtft[0]));
+ self->private_data.f_letter_counts[v_mtft0] += v_run;
+ while (v_i < v_j) {
+ self->private_data.f_bwt[v_i] = v_mtft0;
+ v_i += 1;
+ }
+ }
+ label__outer__break:;
+ self->private_impl.f_bits = v_bits;
+ self->private_impl.f_n_bits = v_n_bits;
+ self->private_impl.f_block_size = v_block_size;
+ self->private_impl.f_decode_block_which = v_which;
+ self->private_impl.f_decode_block_ticks = v_ticks;
+ self->private_impl.f_decode_block_section = v_section;
+ self->private_impl.f_decode_block_run_shift = v_run_shift;
+ status = wuffs_base__make_status(NULL);
+ goto ok;
+
+ ok:
+ goto exit;
+ exit:
+ if (a_src) {
+ a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr));
+ }
+
+ return status;
+}
+
// -------- func bzip2.decoder.decode_block_slow
static wuffs_base__status
@@ -25034,18 +25162,13 @@
wuffs_base__io_buffer* a_src) {
wuffs_base__status status = wuffs_base__make_status(NULL);
- uint8_t v_which = 0;
uint8_t v_c = 0;
+ uint32_t v_node_index = 0;
uint16_t v_child = 0;
uint32_t v_child_ff = 0;
uint32_t v_i = 0;
uint32_t v_j = 0;
- uint32_t v_ticks = 0;
- uint32_t v_section = 0;
- uint32_t v_bs = 0;
uint32_t v_output = 0;
- uint32_t v_node_index = 0;
- uint32_t v_run_shift = 0;
uint32_t v_run = 0;
uint32_t v_mtft0 = 0;
@@ -25062,40 +25185,18 @@
uint32_t coro_susp_point = self->private_impl.p_decode_block_slow[0];
if (coro_susp_point) {
- v_which = self->private_data.s_decode_block_slow[0].v_which;
- v_ticks = self->private_data.s_decode_block_slow[0].v_ticks;
- v_section = self->private_data.s_decode_block_slow[0].v_section;
- v_bs = self->private_data.s_decode_block_slow[0].v_bs;
v_node_index = self->private_data.s_decode_block_slow[0].v_node_index;
- v_run_shift = self->private_data.s_decode_block_slow[0].v_run_shift;
}
switch (coro_susp_point) {
WUFFS_BASE__COROUTINE_SUSPENSION_POINT_0;
- v_which = WUFFS_BZIP2__CLAMP_TO_5[(self->private_data.f_huffman_selectors[0] & 7)];
- v_section = 1;
- v_i = 0;
- v_j = 0;
- while (v_i < 256) {
- if (self->private_data.f_presence[v_i] != 0) {
- self->private_data.f_mtft[(v_j & 255)] = ((uint8_t)(v_i));
- v_j += 1;
- }
- v_i += 1;
- }
- v_i = 0;
- while (v_i < 256) {
- self->private_data.f_letter_counts[v_i] = 0;
- v_i += 1;
- }
- v_ticks = 50;
- while (true) {
- if (v_ticks > 0) {
- v_ticks -= 1;
+ while ( ! (self->private_impl.p_decode_block_slow[0] != 0)) {
+ if (self->private_impl.f_decode_block_ticks > 0) {
+ self->private_impl.f_decode_block_ticks -= 1;
} else {
- v_ticks = 49;
- v_which = WUFFS_BZIP2__CLAMP_TO_5[(self->private_data.f_huffman_selectors[(v_section & 32767)] & 7)];
- v_section += 1;
+ self->private_impl.f_decode_block_ticks = 49;
+ self->private_impl.f_decode_block_section += 1;
+ self->private_impl.f_decode_block_which = WUFFS_BZIP2__CLAMP_TO_5[(self->private_data.f_huffman_selectors[(self->private_impl.f_decode_block_section & 32767)] & 7)];
}
v_node_index = 0;
label__0__continue:;
@@ -25113,7 +25214,7 @@
self->private_impl.f_bits = (((uint32_t)(v_c)) << 24);
self->private_impl.f_n_bits = 8;
}
- v_child = self->private_data.f_huffman_trees[v_which][v_node_index][(self->private_impl.f_bits >> 31)];
+ v_child = self->private_data.f_huffman_trees[self->private_impl.f_decode_block_which][v_node_index][(self->private_impl.f_bits >> 31)];
self->private_impl.f_bits <<= 1;
self->private_impl.f_n_bits -= 1;
if (v_child < 1024) {
@@ -25125,30 +25226,31 @@
wuffs_base__slice_u8__copy_from_slice(wuffs_base__make_slice_u8_ij(self->private_data.f_mtft, 1, (1 + v_child_ff)), wuffs_base__make_slice_u8(self->private_data.f_mtft, v_child_ff));
self->private_data.f_mtft[0] = ((uint8_t)(v_output));
self->private_data.f_letter_counts[v_output] += 1;
- self->private_data.f_bwt[v_bs] = v_output;
- if (v_bs >= self->private_impl.f_max_incl_block_size) {
+ self->private_data.f_bwt[self->private_impl.f_block_size] = v_output;
+ if (self->private_impl.f_block_size >= self->private_impl.f_max_incl_block_size) {
status = wuffs_base__make_status(wuffs_bzip2__error__bad_block_length);
goto exit;
}
- v_bs += 1;
- v_run_shift = 0;
+ self->private_impl.f_block_size += 1;
+ self->private_impl.f_decode_block_run_shift = 0;
goto label__0__break;
} else if (v_child > 1281) {
+ self->private_impl.f_decode_block_finished = true;
goto label__outer__break;
}
- if (v_run_shift >= 23) {
+ if (self->private_impl.f_decode_block_run_shift >= 23) {
status = wuffs_base__make_status(wuffs_bzip2__error__bad_block_length);
goto exit;
}
- v_run = (((uint32_t)((v_child - 1279))) << v_run_shift);
- v_run_shift += 1;
- v_i = v_bs;
- v_j = (v_run + v_bs);
+ v_run = (((uint32_t)((v_child - 1279))) << self->private_impl.f_decode_block_run_shift);
+ self->private_impl.f_decode_block_run_shift += 1;
+ v_i = self->private_impl.f_block_size;
+ v_j = (v_run + self->private_impl.f_block_size);
if (v_j > self->private_impl.f_max_incl_block_size) {
status = wuffs_base__make_status(wuffs_bzip2__error__bad_block_length);
goto exit;
}
- v_bs = v_j;
+ self->private_impl.f_block_size = v_j;
v_mtft0 = ((uint32_t)(self->private_data.f_mtft[0]));
self->private_data.f_letter_counts[v_mtft0] += v_run;
while (v_i < v_j) {
@@ -25160,11 +25262,6 @@
label__0__break:;
}
label__outer__break:;
- if (v_bs > self->private_impl.f_max_incl_block_size) {
- status = wuffs_base__make_status(wuffs_bzip2__error__bad_block_length);
- goto exit;
- }
- self->private_impl.f_block_size = v_bs;
goto ok;
ok:
@@ -25175,12 +25272,7 @@
goto suspend;
suspend:
self->private_impl.p_decode_block_slow[0] = wuffs_base__status__is_suspension(&status) ? coro_susp_point : 0;
- self->private_data.s_decode_block_slow[0].v_which = v_which;
- self->private_data.s_decode_block_slow[0].v_ticks = v_ticks;
- self->private_data.s_decode_block_slow[0].v_section = v_section;
- self->private_data.s_decode_block_slow[0].v_bs = v_bs;
self->private_data.s_decode_block_slow[0].v_node_index = v_node_index;
- self->private_data.s_decode_block_slow[0].v_run_shift = v_run_shift;
goto exit;
exit:
@@ -25249,6 +25341,7 @@
uint8_t v_c = 0;
uint32_t v_i = 0;
uint64_t v_tag = 0;
+ wuffs_base__status v_status = wuffs_base__make_status(NULL);
uint32_t v_final_checksum_want = 0;
const uint8_t* iop_a_src = NULL;
@@ -25364,17 +25457,39 @@
if (status.repr) {
goto suspend;
}
- if (a_src) {
- a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr));
+ self->private_impl.f_block_size = 0;
+ self->private_impl.f_decode_block_finished = false;
+ self->private_impl.f_decode_block_which = WUFFS_BZIP2__CLAMP_TO_5[(self->private_data.f_huffman_selectors[0] & 7)];
+ self->private_impl.f_decode_block_ticks = 50;
+ self->private_impl.f_decode_block_section = 0;
+ self->private_impl.f_decode_block_run_shift = 0;
+ while ( ! self->private_impl.f_decode_block_finished) {
+ if (a_src) {
+ a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr));
+ }
+ v_status = wuffs_bzip2__decoder__decode_block_fast(self, a_src);
+ if (a_src) {
+ iop_a_src = a_src->data.ptr + a_src->meta.ri;
+ }
+ if (wuffs_base__status__is_error(&v_status)) {
+ status = v_status;
+ goto exit;
+ } else if (self->private_impl.f_decode_block_finished) {
+ goto label__1__break;
+ }
+ if (a_src) {
+ a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr));
+ }
+ WUFFS_BASE__COROUTINE_SUSPENSION_POINT(7);
+ status = wuffs_bzip2__decoder__decode_block_slow(self, a_src);
+ if (a_src) {
+ iop_a_src = a_src->data.ptr + a_src->meta.ri;
+ }
+ if (status.repr) {
+ goto suspend;
+ }
}
- WUFFS_BASE__COROUTINE_SUSPENSION_POINT(7);
- status = wuffs_bzip2__decoder__decode_block_slow(self, a_src);
- if (a_src) {
- iop_a_src = a_src->data.ptr + a_src->meta.ri;
- }
- if (status.repr) {
- goto suspend;
- }
+ label__1__break:;
wuffs_bzip2__decoder__invert_bwt(self);
WUFFS_BASE__COROUTINE_SUSPENSION_POINT(8);
status = wuffs_bzip2__decoder__flush_block(self, a_dst);
@@ -25446,6 +25561,7 @@
uint8_t v_c = 0;
uint32_t v_i = 0;
+ uint32_t v_j = 0;
uint32_t v_selector = 0;
uint32_t v_sel_ff = 0;
uint8_t v_movee = 0;
@@ -25719,6 +25835,20 @@
}
v_i += 1;
}
+ v_i = 0;
+ v_j = 0;
+ while (v_i < 256) {
+ if (self->private_data.f_presence[v_i] != 0) {
+ self->private_data.f_mtft[(v_j & 255)] = ((uint8_t)(v_i));
+ v_j += 1;
+ }
+ v_i += 1;
+ }
+ v_i = 0;
+ while (v_i < 256) {
+ self->private_data.f_letter_counts[v_i] = 0;
+ v_i += 1;
+ }
goto ok;
ok:
diff --git a/std/bzip2/decode_block_fast.wuffs b/std/bzip2/decode_block_fast.wuffs
new file mode 100644
index 0000000..db9359b
--- /dev/null
+++ b/std/bzip2/decode_block_fast.wuffs
@@ -0,0 +1,120 @@
+// Copyright 2022 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+pri func decoder.decode_block_fast!(src: base.io_reader) base.status {
+ var bits : base.u32
+ var n_bits : base.u32[..= 31]
+ var block_size : base.u32[..= 900000]
+ var which : base.u8[..= 5]
+ var ticks : base.u32[..= 50]
+ var section : base.u32
+ var run_shift : base.u32[..= 23]
+
+ var child : base.u16
+ var child_ff : base.u32[..= 255]
+ var i : base.u32
+ var j : base.u32
+ var output : base.u32[..= 255]
+ var run : base.u32[..= 16_777216] // 16_777216 = (2 << 23)
+ var mtft0 : base.u32[..= 255]
+
+ bits = this.bits
+ n_bits = this.n_bits
+ block_size = this.block_size
+ which = this.decode_block_which
+ ticks = this.decode_block_ticks
+ section = this.decode_block_section
+ run_shift = this.decode_block_run_shift
+
+ // Apply the Huffman trees.
+ while.outer args.src.length() >= 4 {
+ if ticks > 0 {
+ ticks -= 1
+ } else {
+ ticks = 49
+ section ~mod+= 1
+ which = CLAMP_TO_5[this.huffman_selectors[section & 32767] & 7]
+ }
+
+ bits |= args.src.peek_u32be() >> n_bits
+ args.src.skip_u32_fast!(actual: (31 - n_bits) >> 3, worst_case: 4)
+ n_bits |= 24
+
+ child = 0
+ while child < 1024 {
+ child = this.huffman_trees[which][child][bits >> 31]
+ bits ~mod<<= 1
+ if n_bits <= 0 {
+ return "#internal error: inconsistent Huffman decoder state"
+ }
+ n_bits -= 1
+ } endwhile
+
+ if child < 1280 { // mNN symbol.
+ // Move to front.
+ child_ff = (child & 0xFF) as base.u32
+ output = this.mtft[child_ff] as base.u32
+ this.mtft[1 .. 1 + child_ff].copy_from_slice!(
+ s: this.mtft[.. child_ff])
+ this.mtft[0] = output as base.u8
+
+ this.letter_counts[output] ~mod+= 1
+ this.bwt[block_size] = output
+ if block_size >= this.max_incl_block_size {
+ return "#bad block length"
+ }
+ assert block_size < 900000 via "a < b: a < c; c <= b"(c: this.max_incl_block_size)
+ block_size += 1
+ run_shift = 0
+ continue.outer
+ } else if child > 1281 { // EOB symbol.
+ this.decode_block_finished = true
+ break.outer
+ }
+
+ // RUNA or RUNB symbol.
+ if run_shift >= 23 {
+ return "#bad block length"
+ }
+ run = ((child - 1279) as base.u32) << run_shift
+ run_shift += 1
+ i = block_size
+ j = run + block_size
+ if j > this.max_incl_block_size {
+ return "#bad block length"
+ }
+ assert j <= 900000 via "a <= b: a <= c; c <= b"(c: this.max_incl_block_size)
+ block_size = j
+
+ mtft0 = this.mtft[0] as base.u32
+ this.letter_counts[mtft0] ~mod+= run
+ while i < j,
+ pre j <= 900000,
+ {
+ assert i < 900000 via "a < b: a < c; c <= b"(c: j)
+ this.bwt[i] = mtft0
+ i += 1
+ } endwhile
+ } endwhile.outer
+
+ this.bits = bits
+ this.n_bits = n_bits
+ this.block_size = block_size
+ this.decode_block_which = which
+ this.decode_block_ticks = ticks
+ this.decode_block_section = section
+ this.decode_block_run_shift = run_shift
+
+ return ok
+}
diff --git a/std/bzip2/decode_block_slow.wuffs b/std/bzip2/decode_block_slow.wuffs
index fe65baa..c1b9c48 100644
--- a/std/bzip2/decode_block_slow.wuffs
+++ b/std/bzip2/decode_block_slow.wuffs
@@ -13,50 +13,25 @@
// limitations under the License.
pri func decoder.decode_block_slow?(src: base.io_reader) {
- var which : base.u8[..= 5]
var c : base.u8
- var child : base.u16
- var child_ff : base.u32[..= 255]
- var i : base.u32
- var j : base.u32
- var ticks : base.u32[..= 50]
- var section : base.u32
- var bs : base.u32[..= 1_048575] // 1_048575 = ((1 << 20) - 1)
- var output : base.u32[..= 255]
var node_index : base.u32[..= 1023]
- var run_shift : base.u32[..= 23]
- var run : base.u32[..= 16_777216] // 16_777216 = (2 << 23)
- var mtft0 : base.u32[..= 255]
- which = CLAMP_TO_5[this.huffman_selectors[0] & 7]
- section = 1
-
- // Initialize the MTFT state.
- i = 0
- j = 0
- while i < 256 {
- if this.presence[i] <> 0 {
- this.mtft[j & 0xFF] = i as base.u8
- j ~mod+= 1
- }
- i += 1
- } endwhile
-
- i = 0
- while i < 256 {
- this.letter_counts[i] = 0
- i += 1
- } endwhile
+ var child : base.u16
+ var child_ff : base.u32[..= 255]
+ var i : base.u32
+ var j : base.u32
+ var output : base.u32[..= 255]
+ var run : base.u32[..= 16_777216] // 16_777216 = (2 << 23)
+ var mtft0 : base.u32[..= 255]
// Apply the Huffman trees.
- ticks = 50
- while.outer true {
- if ticks > 0 {
- ticks -= 1
+ while.outer not coroutine_resumed {
+ if this.decode_block_ticks > 0 {
+ this.decode_block_ticks -= 1
} else {
- ticks = 49
- which = CLAMP_TO_5[this.huffman_selectors[section & 32767] & 7]
- section ~mod+= 1
+ this.decode_block_ticks = 49
+ this.decode_block_section ~mod+= 1
+ this.decode_block_which = CLAMP_TO_5[this.huffman_selectors[this.decode_block_section & 32767] & 7]
}
node_index = 0
@@ -67,7 +42,7 @@
this.n_bits = 8
assert this.n_bits > 0
}
- child = this.huffman_trees[which][node_index][this.bits >> 31]
+ child = this.huffman_trees[this.decode_block_which][node_index][this.bits >> 31]
this.bits ~mod<<= 1
this.n_bits -= 1
@@ -84,32 +59,33 @@
this.mtft[0] = output as base.u8
this.letter_counts[output] ~mod+= 1
- this.bwt[bs] = output
- if bs >= this.max_incl_block_size {
+ this.bwt[this.block_size] = output
+ if this.block_size >= this.max_incl_block_size {
return "#bad block length"
}
- assert bs < 900000 via "a < b: a < c; c <= b"(c: this.max_incl_block_size)
- bs += 1
- run_shift = 0
+ assert this.block_size < 900000 via "a < b: a < c; c <= b"(c: this.max_incl_block_size)
+ this.block_size += 1
+ this.decode_block_run_shift = 0
break
} else if child > 1281 { // EOB symbol.
+ this.decode_block_finished = true
break.outer
}
// RUNA or RUNB symbol.
- if run_shift >= 23 {
+ if this.decode_block_run_shift >= 23 {
return "#bad block length"
}
- run = ((child - 1279) as base.u32) << run_shift
- run_shift += 1
- i = bs
- j = run + bs
+ run = ((child - 1279) as base.u32) << this.decode_block_run_shift
+ this.decode_block_run_shift += 1
+ i = this.block_size
+ j = run + this.block_size
if j > this.max_incl_block_size {
return "#bad block length"
}
assert j <= 900000 via "a <= b: a <= c; c <= b"(c: this.max_incl_block_size)
- bs = j
+ this.block_size = j
mtft0 = this.mtft[0] as base.u32
this.letter_counts[mtft0] ~mod+= run
@@ -123,10 +99,4 @@
break
} endwhile
} endwhile.outer
-
- if bs > this.max_incl_block_size {
- return "#bad block length"
- }
- assert bs <= 900000 via "a <= b: a <= c; c <= b"(c: this.max_incl_block_size)
- this.block_size = bs
}
diff --git a/std/bzip2/decode_bzip2.wuffs b/std/bzip2/decode_bzip2.wuffs
index 6f86876..1fb899b 100644
--- a/std/bzip2/decode_bzip2.wuffs
+++ b/std/bzip2/decode_bzip2.wuffs
@@ -12,8 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// TODO: optimize.
-
pub status "#bad Huffman code (over-subscribed)"
pub status "#bad Huffman code (under-subscribed)"
pub status "#bad block header"
@@ -25,6 +23,8 @@
pub status "#unsupported Huffman code"
pub status "#unsupported block randomization"
+pri status "#internal error: inconsistent Huffman decoder state"
+
pub const DECODER_WORKBUF_LEN_MAX_INCL_WORST_CASE : base.u64 = 0
pri const CLAMP_TO_5 : array[8] base.u8[..= 5] = [0, 1, 2, 3, 4, 5, 5, 5]
@@ -34,9 +34,15 @@
n_bits : base.u32[..= 31],
// Block size is measured prior to the RLE1 step.
- block_size : base.u32[..= 900000],
max_incl_block_size : base.u32[..= 900000],
+ block_size : base.u32[..= 900000],
+ decode_block_finished : base.bool,
+ decode_block_which : base.u8[..= 5],
+ decode_block_ticks : base.u32[..= 50],
+ decode_block_section : base.u32,
+ decode_block_run_shift : base.u32[..= 23],
+
final_checksum_have : base.u32,
block_checksum_want : base.u32,
original_pointer : base.u32,
@@ -92,6 +98,7 @@
var c : base.u8
var i : base.u32
var tag : base.u64
+ var status : base.status
var final_checksum_want : base.u32
// Read the header.
@@ -138,7 +145,23 @@
}
this.prepare_block?(src: args.src)
- this.decode_block_slow?(src: args.src)
+
+ this.block_size = 0
+ this.decode_block_finished = false
+ this.decode_block_which = CLAMP_TO_5[this.huffman_selectors[0] & 7]
+ this.decode_block_ticks = 50
+ this.decode_block_section = 0
+ this.decode_block_run_shift = 0
+ while not this.decode_block_finished {
+ status = this.decode_block_fast!(src: args.src)
+ if status.is_error() {
+ return status
+ } else if this.decode_block_finished {
+ break
+ }
+ this.decode_block_slow?(src: args.src)
+ } endwhile
+
this.invert_bwt!()
this.flush_block?(dst: args.dst)
} endwhile
@@ -167,6 +190,7 @@
pri func decoder.prepare_block?(src: base.io_reader) {
var c : base.u8
var i : base.u32
+ var j : base.u32
var selector : base.u32
var sel_ff : base.u32
var movee : base.u8
@@ -378,6 +402,24 @@
}
i += 1
} endwhile
+
+ // Initialize the MTFT state.
+ i = 0
+ j = 0
+ while i < 256 {
+ if this.presence[i] <> 0 {
+ this.mtft[j & 0xFF] = i as base.u8
+ j ~mod+= 1
+ }
+ i += 1
+ } endwhile
+
+ // Initialize the letter counts.
+ i = 0
+ while i < 256 {
+ this.letter_counts[i] = 0
+ i += 1
+ } endwhile
}
pri func decoder.read_code_lengths?(src: base.io_reader) {
@@ -592,7 +634,8 @@
// invert_bwt performs "two passes over the data, and one pass over the
// alphabet", per the BWT technical report, except that the first data pass
-// (accumulating this.letter_counts) is integrated into execute_block.
+// (accumulating this.letter_counts) is integrated into the decode_block_etc
+// methods.
pri func decoder.invert_bwt!() {
var i : base.u32
var letter : base.u32[..= 255]