std/lzma: simplify decode-a-literal
name old speed new speed delta
wuffs_lzma_decode_100k/clang14 54.8MB/s ± 1% 56.2MB/s ± 0% +2.61% (p=0.008 n=5+5)
wuffs_lzma_decode_100k/gcc12 56.8MB/s ± 0% 56.4MB/s ± 1% -0.75% (p=0.008 n=5+5)
$ time example-mzcat < linux-6.8.2.tar.xz > /dev/null
Before: user 0m8.409s
After: user 0m8.396s
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index b864346..5ae002d 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -52123,67 +52123,40 @@
v_range <<= 8u;
}
v_index_lit = (15u & ((((uint32_t)((v_pos & v_lp_mask))) << v_lc) | (((uint32_t)(v_prev_byte)) >> (8u - v_lc))));
+ v_lanl_offset = 0u;
if (v_state >= 7u) {
v_lanl_offset = 256u;
- v_tree_node = 1u;
- while (v_tree_node < 256u) {
- v_match_byte <<= 1u;
- v_lanl_old_offset = v_lanl_offset;
- v_lanl_offset &= v_match_byte;
- v_lanl_index = (v_lanl_offset + v_lanl_old_offset + v_tree_node);
- v_prob = ((uint32_t)(self->private_data.f_probs_lit[v_index_lit][v_lanl_index]));
- v_threshold = ((uint32_t)((v_range >> 11u) * v_prob));
- if (v_bits < v_threshold) {
- v_lanl_offset = ((v_lanl_offset ^ v_lanl_old_offset) & 256u);
- v_range = v_threshold;
- v_prob += (((uint32_t)(2048u - v_prob)) >> 5u);
- self->private_data.f_probs_lit[v_index_lit][v_lanl_index] = ((uint16_t)(v_prob));
- v_tree_node = (v_tree_node << 1u);
- } else {
- v_bits -= v_threshold;
- v_range -= v_threshold;
- v_prob -= (v_prob >> 5u);
- self->private_data.f_probs_lit[v_index_lit][v_lanl_index] = ((uint16_t)(v_prob));
- v_tree_node = ((v_tree_node << 1u) | 1u);
- }
- if ((v_range >> 24u) == 0u) {
- if (((uint64_t)(io2_a_src - iop_a_src)) <= 0u) {
- status = wuffs_base__make_status(wuffs_lzma__error__internal_error_inconsistent_i_o);
- goto exit;
- }
- v_c8 = wuffs_base__peek_u8be__no_bounds_check(iop_a_src);
- iop_a_src += 1u;
- v_bits = (((uint32_t)(v_bits << 8u)) | ((uint32_t)(v_c8)));
- v_range <<= 8u;
- }
+ }
+ v_tree_node = 1u;
+ while (v_tree_node < 256u) {
+ v_match_byte <<= 1u;
+ v_lanl_old_offset = v_lanl_offset;
+ v_lanl_offset &= v_match_byte;
+ v_lanl_index = (v_lanl_offset + v_lanl_old_offset + v_tree_node);
+ v_prob = ((uint32_t)(self->private_data.f_probs_lit[v_index_lit][v_lanl_index]));
+ v_threshold = ((uint32_t)((v_range >> 11u) * v_prob));
+ if (v_bits < v_threshold) {
+ v_lanl_offset = ((v_lanl_offset ^ v_lanl_old_offset) & 256u);
+ v_range = v_threshold;
+ v_prob += (((uint32_t)(2048u - v_prob)) >> 5u);
+ self->private_data.f_probs_lit[v_index_lit][v_lanl_index] = ((uint16_t)(v_prob));
+ v_tree_node = (v_tree_node << 1u);
+ } else {
+ v_bits -= v_threshold;
+ v_range -= v_threshold;
+ v_prob -= (v_prob >> 5u);
+ self->private_data.f_probs_lit[v_index_lit][v_lanl_index] = ((uint16_t)(v_prob));
+ v_tree_node = ((v_tree_node << 1u) | 1u);
}
- } else {
- v_tree_node = 1u;
- while (v_tree_node < 256u) {
- v_prob = ((uint32_t)(self->private_data.f_probs_lit[v_index_lit][v_tree_node]));
- v_threshold = ((uint32_t)((v_range >> 11u) * v_prob));
- if (v_bits < v_threshold) {
- v_range = v_threshold;
- v_prob += (((uint32_t)(2048u - v_prob)) >> 5u);
- self->private_data.f_probs_lit[v_index_lit][v_tree_node] = ((uint16_t)(v_prob));
- v_tree_node = (v_tree_node << 1u);
- } else {
- v_bits -= v_threshold;
- v_range -= v_threshold;
- v_prob -= (v_prob >> 5u);
- self->private_data.f_probs_lit[v_index_lit][v_tree_node] = ((uint16_t)(v_prob));
- v_tree_node = ((v_tree_node << 1u) | 1u);
+ if ((v_range >> 24u) == 0u) {
+ if (((uint64_t)(io2_a_src - iop_a_src)) <= 0u) {
+ status = wuffs_base__make_status(wuffs_lzma__error__internal_error_inconsistent_i_o);
+ goto exit;
}
- if ((v_range >> 24u) == 0u) {
- if (((uint64_t)(io2_a_src - iop_a_src)) <= 0u) {
- status = wuffs_base__make_status(wuffs_lzma__error__internal_error_inconsistent_i_o);
- goto exit;
- }
- v_c8 = wuffs_base__peek_u8be__no_bounds_check(iop_a_src);
- iop_a_src += 1u;
- v_bits = (((uint32_t)(v_bits << 8u)) | ((uint32_t)(v_c8)));
- v_range <<= 8u;
- }
+ v_c8 = wuffs_base__peek_u8be__no_bounds_check(iop_a_src);
+ iop_a_src += 1u;
+ v_bits = (((uint32_t)(v_bits << 8u)) | ((uint32_t)(v_c8)));
+ v_range <<= 8u;
}
}
v_prev_byte = ((uint8_t)(v_tree_node));
diff --git a/std/lzma/decode_bitstream_fast.wuffs b/std/lzma/decode_bitstream_fast.wuffs
index 05053da..5f2d213 100644
--- a/std/lzma/decode_bitstream_fast.wuffs
+++ b/std/lzma/decode_bitstream_fast.wuffs
@@ -130,102 +130,46 @@
(((pos & lp_mask) as base.u32) << lc) |
((prev_byte as base.u32) >> (8 - lc)))
+ lanl_offset = 0
if state >= 7 {
- // We are decoding an 8-bit LITERAL, and (state >= 7) means
- // that we most recently decoded a NON-LITERAL. The
- // this.probs_lit[index_lit] array has 0x300 elements: three
- // banks of 0x100.
- //
- // For (state < 7), we only use the first bank (and we run the
- // while.low_state loop instead). This is equivalent to running
- // the while.high_state loop where lanl_offset starts at and
- // stays at zero for the entire loop. match_byte is ignored.
- //
- // For (state >= 7), we start by using the two later banks, but
- // as we are decoding those 8 bits (from MSB to LSB order), we
- // drop down to the first bank (and stay there) if and when the
- // decoded bit doesn't equal the corresponding bit (in the same
- // MSB to LSB order) in the match_byte variable.
- //
- // lanl_offset stays at 0x100 until that "if and when" point
- // (which typically happens; if all 8 bits were equal then the
- // previous NON-LITERAL copy could often just have been
- // longer), when lanl_offset then drops down to zero. We then
- // effectively run the remaining (8 - I) iterations in the
- // while.low_state loop, where I is the iteration count.
- //
- // Whether we use the 0x100 or 0x200 bank, out of the two later
- // banks, depends on the match_byte's I'th bit (MSB to LSB).
- // match_byte is conceptually a base.u8 (it is <= 0xFF outside
- // of this code fragment) but the intermediate computation is
- // simpler as a base.u32 (ignoring the high 23 bits).
lanl_offset = 0x100
-
- tree_node = 1
- while.high_state tree_node < 0x100,
- inv args.dst.length() >= 282,
- {
- match_byte ~mod<<= 1
- lanl_old_offset = lanl_offset
- lanl_offset &= match_byte
- lanl_index = lanl_offset + lanl_old_offset + tree_node
-
- prob = this.probs_lit[index_lit][lanl_index] as base.u32
- threshold = (range >> 11) ~mod* prob
- if bits < threshold {
- lanl_offset = (lanl_offset ^ lanl_old_offset) & 0x100
- range = threshold
- prob ~mod+= (2048 ~mod- prob) >> 5
- this.probs_lit[index_lit][lanl_index] = (prob & 0xFFFF) as base.u16
- tree_node = (tree_node << 1)
- } else {
- bits ~mod-= threshold
- range ~mod-= threshold
- prob ~mod-= prob >> 5
- this.probs_lit[index_lit][lanl_index] = (prob & 0xFFFF) as base.u16
- tree_node = (tree_node << 1) | 1
- }
- if (range >> 24) == 0 {
- if args.src.length() <= 0 {
- return "#internal error: inconsistent I/O"
- }
- c8 = args.src.peek_u8()
- args.src.skip_u32_fast!(actual: 1, worst_case: 1)
- bits = (bits ~mod<< 8) | (c8 as base.u32)
- range ~mod<<= 8
- }
- } endwhile.high_state
-
- } else {
- tree_node = 1
- while.low_state tree_node < 0x100,
- inv args.dst.length() >= 282,
- {
- prob = this.probs_lit[index_lit][tree_node] as base.u32
- threshold = (range >> 11) ~mod* prob
- if bits < threshold {
- range = threshold
- prob ~mod+= (2048 ~mod- prob) >> 5
- this.probs_lit[index_lit][tree_node] = (prob & 0xFFFF) as base.u16
- tree_node = (tree_node << 1)
- } else {
- bits ~mod-= threshold
- range ~mod-= threshold
- prob ~mod-= prob >> 5
- this.probs_lit[index_lit][tree_node] = (prob & 0xFFFF) as base.u16
- tree_node = (tree_node << 1) | 1
- }
- if (range >> 24) == 0 {
- if args.src.length() <= 0 {
- return "#internal error: inconsistent I/O"
- }
- c8 = args.src.peek_u8()
- args.src.skip_u32_fast!(actual: 1, worst_case: 1)
- bits = (bits ~mod<< 8) | (c8 as base.u32)
- range ~mod<<= 8
- }
- } endwhile.low_state
}
+
+ tree_node = 1
+ while tree_node < 0x100,
+ inv args.dst.length() >= 282,
+ {
+ match_byte ~mod<<= 1
+ lanl_old_offset = lanl_offset
+ lanl_offset &= match_byte
+ lanl_index = lanl_offset + lanl_old_offset + tree_node
+
+ prob = this.probs_lit[index_lit][lanl_index] as base.u32
+ threshold = (range >> 11) ~mod* prob
+ if bits < threshold {
+ lanl_offset = (lanl_offset ^ lanl_old_offset) & 0x100
+ range = threshold
+ prob ~mod+= (2048 ~mod- prob) >> 5
+ this.probs_lit[index_lit][lanl_index] = (prob & 0xFFFF) as base.u16
+ tree_node = (tree_node << 1)
+ } else {
+ bits ~mod-= threshold
+ range ~mod-= threshold
+ prob ~mod-= prob >> 5
+ this.probs_lit[index_lit][lanl_index] = (prob & 0xFFFF) as base.u16
+ tree_node = (tree_node << 1) | 1
+ }
+ if (range >> 24) == 0 {
+ if args.src.length() <= 0 {
+ return "#internal error: inconsistent I/O"
+ }
+ c8 = args.src.peek_u8()
+ args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+ bits = (bits ~mod<< 8) | (c8 as base.u32)
+ range ~mod<<= 8
+ }
+ } endwhile
+
assert args.dst.length() >= 282
// AlgOve03 emitLiteral(literal)