Have std/deflate read 64 bits once per inner loop
On a mid-range x86_64 laptop:
wuffs_deflate_decode_1k_full_init/clang9 101MB/s ± 0% 92MB/s ± 2% -9.22% (p=0.008 n=5+5)
wuffs_deflate_decode_1k_part_init/clang9 117MB/s ± 0% 107MB/s ± 3% -8.00% (p=0.008 n=5+5)
wuffs_deflate_decode_10k_full_init/clang9 129MB/s ± 0% 140MB/s ± 1% +8.36% (p=0.008 n=5+5)
wuffs_deflate_decode_10k_part_init/clang9 131MB/s ± 0% 142MB/s ± 1% +8.59% (p=0.008 n=5+5)
wuffs_deflate_decode_100k_just_one_read/clang9 158MB/s ± 0% 184MB/s ± 1% +16.29% (p=0.008 n=5+5)
wuffs_deflate_decode_100k_many_big_reads/clang9 133MB/s ± 0% 155MB/s ± 1% +16.26% (p=0.008 n=5+5)
wuffs_deflate_decode_1k_full_init/gcc10 98.0MB/s ± 0% 97.7MB/s ± 0% ~ (p=0.222 n=5+5)
wuffs_deflate_decode_1k_part_init/gcc10 112MB/s ± 0% 111MB/s ± 0% -1.04% (p=0.008 n=5+5)
wuffs_deflate_decode_10k_full_init/gcc10 138MB/s ± 0% 179MB/s ± 1% +29.91% (p=0.008 n=5+5)
wuffs_deflate_decode_10k_part_init/gcc10 140MB/s ± 0% 183MB/s ± 1% +30.52% (p=0.008 n=5+5)
wuffs_deflate_decode_100k_just_one_read/gcc10 188MB/s ± 0% 211MB/s ± 1% +12.32% (p=0.008 n=5+5)
wuffs_deflate_decode_100k_many_big_reads/gcc10 156MB/s ± 0% 172MB/s ± 1% +10.00% (p=0.008 n=5+5)
mimic_deflate_decode_1k 128MB/s ± 0% 127MB/s ± 1% -0.99% (p=0.016 n=5+5)
mimic_deflate_decode_10k 143MB/s ± 0% 142MB/s ± 1% ~ (p=0.056 n=5+5)
mimic_deflate_decode_100k_just_one_read 176MB/s ± 0% 174MB/s ± 1% ~ (p=0.151 n=5+5)
mimic_deflate_decode_100k_many_big_reads 135MB/s ± 0% 134MB/s ± 1% ~ (p=0.151 n=5+5)
wuffs_gzip_decode_10k/clang9 134MB/s ± 1% 127MB/s ± 0% -5.23% (p=0.008 n=5+5)
wuffs_gzip_decode_100k/clang9 150MB/s ± 0% 162MB/s ± 0% +8.23% (p=0.008 n=5+5)
wuffs_gzip_decode_10k/gcc10 131MB/s ± 0% 152MB/s ± 0% +16.36% (p=0.008 n=5+5)
wuffs_gzip_decode_100k/gcc10 159MB/s ± 2% 188MB/s ± 0% +17.78% (p=0.008 n=5+5)
mimic_gzip_decode_10k 121MB/s ± 0% 121MB/s ± 0% ~ (p=0.841 n=5+5)
mimic_gzip_decode_100k 144MB/s ± 0% 143MB/s ± 0% -0.11% (p=0.008 n=5+5)
wuffs_zlib_decode_10k/clang9 139MB/s ± 0% 135MB/s ± 0% -2.36% (p=0.008 n=5+5)
wuffs_zlib_decode_100k/clang9 155MB/s ± 0% 176MB/s ± 0% +13.83% (p=0.008 n=5+5)
wuffs_zlib_decode_10k/gcc10 141MB/s ± 0% 156MB/s ± 1% +10.94% (p=0.008 n=5+5)
wuffs_zlib_decode_100k/gcc10 172MB/s ± 0% 193MB/s ± 1% +12.27% (p=0.016 n=4+5)
mimic_zlib_decode_10k 133MB/s ± 0% 132MB/s ± 0% -1.29% (p=0.008 n=5+5)
mimic_zlib_decode_100k 160MB/s ± 0% 158MB/s ± 0% -1.02% (p=0.008 n=5+5)
wuffs_png_decode_image_19k_8bpp/clang9 94.7MB/s ± 0% 103.4MB/s ± 0% +9.17% (p=0.016 n=4+5)
wuffs_png_decode_image_40k_24bpp/clang9 135MB/s ± 0% 146MB/s ± 0% +8.03% (p=0.008 n=5+5)
wuffs_png_decode_image_77k_8bpp/clang9 351MB/s ± 0% 380MB/s ± 0% +8.48% (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/clang9 320MB/s ± 1% 343MB/s ± 0% +7.20% (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/clang9 308MB/s ± 0% 328MB/s ± 0% +6.59% (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/clang9 138MB/s ± 0% 149MB/s ± 0% +8.02% (p=0.016 n=5+4)
wuffs_png_decode_image_19k_8bpp/gcc10 101MB/s ± 1% 116MB/s ± 0% +14.61% (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/gcc10 136MB/s ± 0% 154MB/s ± 0% +13.10% (p=0.008 n=5+5)
wuffs_png_decode_image_77k_8bpp/gcc10 371MB/s ± 0% 426MB/s ± 0% +14.80% (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/gcc10 333MB/s ± 0% 357MB/s ± 0% +7.31% (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/gcc10 318MB/s ± 0% 341MB/s ± 0% +7.27% (p=0.016 n=5+4)
wuffs_png_decode_image_4002k_24bpp/gcc10 138MB/s ± 0% 157MB/s ± 0% +13.59% (p=0.008 n=5+5)
mimic_png_decode_image_19k_8bpp 58.2MB/s ± 0% 58.1MB/s ± 0% ~ (p=0.841 n=5+5)
mimic_png_decode_image_40k_24bpp 73.3MB/s ± 0% 73.2MB/s ± 0% -0.16% (p=0.032 n=5+5)
mimic_png_decode_image_77k_8bpp 177MB/s ± 0% 177MB/s ± 0% ~ (p=0.690 n=5+5)
mimic_png_decode_image_552k_32bpp_ignore_checksum skipped
mimic_png_decode_image_552k_32bpp_verify_checksum 146MB/s ± 0% 146MB/s ± 0% ~ (p=0.690 n=5+5)
mimic_png_decode_image_4002k_24bpp 104MB/s ± 0% 104MB/s ± 0% ~ (p=0.841 n=5+5)
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index ca9ad79..b449c47 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -22624,12 +22624,12 @@
wuffs_base__io_buffer* a_src) {
wuffs_base__status status = wuffs_base__make_status(NULL);
- uint32_t v_bits = 0;
+ uint64_t v_bits = 0;
uint32_t v_n_bits = 0;
uint32_t v_table_entry = 0;
uint32_t v_table_entry_n_bits = 0;
- uint32_t v_lmask = 0;
- uint32_t v_dmask = 0;
+ uint64_t v_lmask = 0;
+ uint64_t v_dmask = 0;
uint32_t v_redir_top = 0;
uint32_t v_redir_mask = 0;
uint32_t v_length = 0;
@@ -22665,21 +22665,15 @@
status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_n_bits);
goto exit;
}
- v_bits = self->private_impl.f_bits;
+ v_bits = ((uint64_t)(self->private_impl.f_bits));
v_n_bits = self->private_impl.f_n_bits;
- v_lmask = ((((uint32_t)(1)) << self->private_impl.f_n_huffs_bits[0]) - 1);
- v_dmask = ((((uint32_t)(1)) << self->private_impl.f_n_huffs_bits[1]) - 1);
+ v_lmask = ((((uint64_t)(1)) << self->private_impl.f_n_huffs_bits[0]) - 1);
+ v_dmask = ((((uint64_t)(1)) << self->private_impl.f_n_huffs_bits[1]) - 1);
label__loop__continue:;
- while ((((uint64_t)(io2_a_dst - iop_a_dst)) >= 258) && (((uint64_t)(io2_a_src - iop_a_src)) >= 12)) {
- if (v_n_bits < 15) {
- v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
- iop_a_src += 1;
- v_n_bits += 8;
- v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
- iop_a_src += 1;
- v_n_bits += 8;
- } else {
- }
+ while ((((uint64_t)(io2_a_dst - iop_a_dst)) >= 258) && (((uint64_t)(io2_a_src - iop_a_src)) >= 8)) {
+ v_bits |= (wuffs_base__peek_u64le__no_bounds_check(iop_a_src) << (v_n_bits & 63));
+ iop_a_src += ((63 - (v_n_bits & 63)) >> 3);
+ v_n_bits |= 56;
v_table_entry = self->private_data.f_huffs[0][(v_bits & v_lmask)];
v_table_entry_n_bits = (v_table_entry & 15);
v_bits >>= v_table_entry_n_bits;
@@ -22692,18 +22686,9 @@
self->private_impl.f_end_of_block = true;
goto label__loop__break;
} else if ((v_table_entry >> 28) != 0) {
- if (v_n_bits < 15) {
- v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
- iop_a_src += 1;
- v_n_bits += 8;
- v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
- iop_a_src += 1;
- v_n_bits += 8;
- } else {
- }
v_redir_top = ((v_table_entry >> 8) & 65535);
v_redir_mask = ((((uint32_t)(1)) << ((v_table_entry >> 4) & 15)) - 1);
- v_table_entry = self->private_data.f_huffs[0][((v_redir_top + (v_bits & v_redir_mask)) & 1023)];
+ v_table_entry = self->private_data.f_huffs[0][((v_redir_top + (((uint32_t)((v_bits & 4294967295))) & v_redir_mask)) & 1023)];
v_table_entry_n_bits = (v_table_entry & 15);
v_bits >>= v_table_entry_n_bits;
v_n_bits -= v_table_entry_n_bits;
@@ -22734,50 +22719,21 @@
v_length = (((v_table_entry >> 8) & 255) + 3);
v_table_entry_n_bits = ((v_table_entry >> 4) & 15);
if (v_table_entry_n_bits > 0) {
- if (v_n_bits < 15) {
- v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
- iop_a_src += 1;
- v_n_bits += 8;
- v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
- iop_a_src += 1;
- v_n_bits += 8;
- } else {
- }
- v_length = (((v_length + 253 + ((v_bits) & WUFFS_BASE__LOW_BITS_MASK__U32(v_table_entry_n_bits))) & 255) + 3);
+ v_length = (((v_length + 253 + ((uint32_t)(((v_bits) & WUFFS_BASE__LOW_BITS_MASK__U64(v_table_entry_n_bits))))) & 255) + 3);
v_bits >>= v_table_entry_n_bits;
v_n_bits -= v_table_entry_n_bits;
- } else {
- }
- if (v_n_bits < 15) {
- v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
- iop_a_src += 1;
- v_n_bits += 8;
- v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
- iop_a_src += 1;
- v_n_bits += 8;
- } else {
}
v_table_entry = self->private_data.f_huffs[1][(v_bits & v_dmask)];
v_table_entry_n_bits = (v_table_entry & 15);
v_bits >>= v_table_entry_n_bits;
v_n_bits -= v_table_entry_n_bits;
if ((v_table_entry >> 28) == 1) {
- if (v_n_bits < 15) {
- v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
- iop_a_src += 1;
- v_n_bits += 8;
- v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
- iop_a_src += 1;
- v_n_bits += 8;
- } else {
- }
v_redir_top = ((v_table_entry >> 8) & 65535);
v_redir_mask = ((((uint32_t)(1)) << ((v_table_entry >> 4) & 15)) - 1);
- v_table_entry = self->private_data.f_huffs[1][((v_redir_top + (v_bits & v_redir_mask)) & 1023)];
+ v_table_entry = self->private_data.f_huffs[1][((v_redir_top + (((uint32_t)((v_bits & 4294967295))) & v_redir_mask)) & 1023)];
v_table_entry_n_bits = (v_table_entry & 15);
v_bits >>= v_table_entry_n_bits;
v_n_bits -= v_table_entry_n_bits;
- } else {
}
if ((v_table_entry >> 24) != 64) {
if ((v_table_entry >> 24) == 8) {
@@ -22789,15 +22745,7 @@
}
v_dist_minus_1 = ((v_table_entry >> 8) & 32767);
v_table_entry_n_bits = ((v_table_entry >> 4) & 15);
- if (v_n_bits < v_table_entry_n_bits) {
- v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
- iop_a_src += 1;
- v_n_bits += 8;
- v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
- iop_a_src += 1;
- v_n_bits += 8;
- }
- v_dist_minus_1 = ((v_dist_minus_1 + ((v_bits) & WUFFS_BASE__LOW_BITS_MASK__U32(v_table_entry_n_bits))) & 32767);
+ v_dist_minus_1 = ((v_dist_minus_1 + ((uint32_t)(((v_bits) & WUFFS_BASE__LOW_BITS_MASK__U64(v_table_entry_n_bits))))) & 32767);
v_bits >>= v_table_entry_n_bits;
v_n_bits -= v_table_entry_n_bits;
while (true) {
@@ -22833,6 +22781,10 @@
label__0__break:;
}
label__loop__break:;
+ if (v_n_bits > 63) {
+ status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_n_bits);
+ goto exit;
+ }
while (v_n_bits >= 8) {
v_n_bits -= 8;
if (iop_a_src > io1_a_src) {
@@ -22842,7 +22794,7 @@
goto exit;
}
}
- self->private_impl.f_bits = (v_bits & ((((uint32_t)(1)) << v_n_bits) - 1));
+ self->private_impl.f_bits = ((uint32_t)((v_bits & ((((uint64_t)(1)) << v_n_bits) - 1))));
self->private_impl.f_n_bits = v_n_bits;
if ((self->private_impl.f_n_bits >= 8) || ((self->private_impl.f_bits >> self->private_impl.f_n_bits) != 0)) {
status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_n_bits);
diff --git a/std/deflate/decode_huffman_fast.wuffs b/std/deflate/decode_huffman_fast.wuffs
index 22260cb..a3d439b 100644
--- a/std/deflate/decode_huffman_fast.wuffs
+++ b/std/deflate/decode_huffman_fast.wuffs
@@ -20,12 +20,12 @@
// decode_huffman_*.wuffs files as small as possible, while retaining both
// correctness and performance.
- var bits : base.u32
+ var bits : base.u64
var n_bits : base.u32
var table_entry : base.u32
var table_entry_n_bits : base.u32[..= 15]
- var lmask : base.u32[..= 511]
- var dmask : base.u32[..= 511]
+ var lmask : base.u64[..= 511]
+ var dmask : base.u64[..= 511]
var redir_top : base.u32[..= 0xFFFF]
var redir_mask : base.u32[..= 0x7FFF]
var length : base.u32[..= 258]
@@ -37,17 +37,16 @@
return "#internal error: inconsistent n_bits"
}
- bits = this.bits
+ bits = this.bits as base.u64
n_bits = this.n_bits
- lmask = ((1 as base.u32) << this.n_huffs_bits[0]) - 1
- dmask = ((1 as base.u32) << this.n_huffs_bits[1]) - 1
+ lmask = ((1 as base.u64) << this.n_huffs_bits[0]) - 1
+ dmask = ((1 as base.u64) << this.n_huffs_bits[1]) - 1
// Check up front, on each iteration, that we have enough buffer space to
- // both read (12 bytes) and write (258 bytes) as much as we need to. Doing
+ // both read (8 bytes) and write (258 bytes) as much as we need to. Doing
// this check once (per iteration), up front, removes the need to check
- // (and possibly suspend the coroutine) multiple times inside the loop
- // body, so it's faster overall.
+ // multiple times inside the loop body, so it's faster overall.
//
// For writing, a literal code obviously corresponds to writing 1 byte, and
// 258 is the maximum length in a length-distance pair, as specified in the
@@ -56,45 +55,28 @@
// For reading, strictly speaking, we only need 6 bytes (48 bits) of
// available input, because the H-L Literal/Length code is up to 15 bits
// plus up to 5 extra bits, the H-D Distance code is up to 15 bits plus up
- // to 13 extra bits and 15 + 5 + 15 + 13 == 48. However,
- // args.src.length() assertions are made in terms of bytes, not bits.
- //
- // In theory, we could write assertions that mixed bytes and bits, such as:
- //
- // assert ((args.src.length() >= 6) and (n_bits >= 5)) or
- // ((args.src.length() >= 4) and (n_bits >= 21))
- //
- // but that looks clumsy and makes the proofs more complicated. Instead, we
- // are conservative and work in terms of bytes only. In bytes, each code
- // requires either one (direct) or two (redirect) lookups, up to 2 bytes
- // for each. There are also up to 13 extra bits, or 2 bytes. Thus, each
- // code requires up to 2 + 2 + 2 == 6 bytes, and there are two codes (one
- // from H-L and one from H-D). Therefore, we check for at least 12 bytes.
- while.loop(args.dst.length() >= 258) and (args.src.length() >= 12) {
- // Ensure that we have at least 15 bits of input.
- if n_bits < 15 {
- assert n_bits >= 0 // TODO: this shouldn't be necessary.
- bits |= args.src.peek_u8_as_u32() << n_bits
- args.src.skip_u32_fast!(actual: 1, worst_case: 1)
- n_bits += 8
- bits |= args.src.peek_u8_as_u32() << n_bits
- args.src.skip_u32_fast!(actual: 1, worst_case: 1)
- n_bits += 8
- assert n_bits >= 15
- } else {
- assert args.src.length() >= 10
- }
- // These assertions are redundant, but are listed explicitly for
- // clarity. They must hold regardless of which branch was taken for the
- // if statement above.
- assert args.src.length() >= 10
- assert n_bits >= 15
+ // to 13 extra bits and 15 + 5 + 15 + 13 == 48. However, it's faster to
+ // read 64 bits than 48 bits or (48 - n_bits) bits.
+ while.loop(args.dst.length() >= 258) and (args.src.length() >= 8) {
+ // Ensure that we have at least 56 bits of input.
+ //
+ // This is "Variant 4" of
+ // https://fgiesen.wordpress.com/2018/02/20/reading-bits-in-far-too-many-ways-part-2/
+ //
+ // 56, the number of bits in 7 bytes, is a property of that "Variant 4"
+ // bit-reading technique, and not related to the DEFLATE format per se.
+ //
+ // The "& 63" is a no-op, and not part of the original "Variant 4"
+ // technique, but satisfies Wuffs' overflow/underflow checks.
+ bits |= args.src.peek_u64le() ~mod<< (n_bits & 63)
+ args.src.skip_u32_fast!(actual: (63 - (n_bits & 63)) >> 3, worst_case: 8)
+ n_bits |= 56
// Decode an lcode symbol from H-L.
table_entry = this.huffs[0][bits & lmask]
table_entry_n_bits = table_entry & 0x0F
bits >>= table_entry_n_bits
- n_bits -= table_entry_n_bits
+ n_bits ~mod-= table_entry_n_bits
if (table_entry >> 31) <> 0 {
// Literal.
@@ -102,37 +84,19 @@
continue.loop
} else if (table_entry >> 30) <> 0 {
// No-op; code continues past the if-else chain.
- assert args.src.length() >= 8
} else if (table_entry >> 29) <> 0 {
// End of block.
this.end_of_block = true
break.loop
} else if (table_entry >> 28) <> 0 {
// Redirect.
-
- // Ensure that we have at least 15 bits of input.
- if n_bits < 15 {
- assert n_bits >= 0 // TODO: this shouldn't be necessary.
- bits |= args.src.peek_u8_as_u32() << n_bits
- args.src.skip_u32_fast!(actual: 1, worst_case: 1)
- n_bits += 8
- bits |= args.src.peek_u8_as_u32() << n_bits
- args.src.skip_u32_fast!(actual: 1, worst_case: 1)
- n_bits += 8
- assert n_bits >= 15
- } else {
- assert args.src.length() >= 8
- }
- // Once again, redundant but explicit assertions.
- assert args.src.length() >= 8
- assert n_bits >= 15
-
redir_top = (table_entry >> 8) & 0xFFFF
redir_mask = ((1 as base.u32) << ((table_entry >> 4) & 0x0F)) - 1
- table_entry = this.huffs[0][(redir_top + (bits & redir_mask)) & HUFFS_TABLE_MASK]
+ table_entry = this.huffs[0][
+ (redir_top + (((bits & 0xFFFF_FFFF) as base.u32) & redir_mask)) & HUFFS_TABLE_MASK]
table_entry_n_bits = table_entry & 0x0F
bits >>= table_entry_n_bits
- n_bits -= table_entry_n_bits
+ n_bits ~mod-= table_entry_n_bits
if (table_entry >> 31) <> 0 {
// Literal.
@@ -152,9 +116,6 @@
return "#internal error: inconsistent Huffman decoder state"
}
- // Once again, redundant but explicit assertions.
- assert args.src.length() >= 8
-
} else if (table_entry >> 27) <> 0 {
return "#bad Huffman code"
} else {
@@ -168,83 +129,28 @@
length = ((table_entry >> 8) & 0xFF) + 3
table_entry_n_bits = (table_entry >> 4) & 0x0F
if table_entry_n_bits > 0 {
- // Ensure that we have at least 15 bits of input.
- if n_bits < 15 {
- assert n_bits >= 0 // TODO: this shouldn't be necessary.
- bits |= args.src.peek_u8_as_u32() << n_bits
- args.src.skip_u32_fast!(actual: 1, worst_case: 1)
- n_bits += 8
- bits |= args.src.peek_u8_as_u32() << n_bits
- args.src.skip_u32_fast!(actual: 1, worst_case: 1)
- n_bits += 8
- assert n_bits >= 15
- } else {
- assert args.src.length() >= 6
- }
- // Once again, redundant but explicit assertions.
- assert args.src.length() >= 6
- assert n_bits >= 15
-
// The "+ 253" is the same as "- 3", after the "& 0xFF", but the
// plus form won't require an underflow check.
- length = ((length + 253 + bits.low_bits(n: table_entry_n_bits)) & 0xFF) + 3
+ length = ((length + 253 + (bits.low_bits(n: table_entry_n_bits) as base.u32)) & 0xFF) + 3
bits >>= table_entry_n_bits
- n_bits -= table_entry_n_bits
-
- } else {
- assert args.src.length() >= 6
+ n_bits ~mod-= table_entry_n_bits
}
- // Ensure that we have at least 15 bits of input.
- if n_bits < 15 {
- assert n_bits >= 0 // TODO: this shouldn't be necessary.
- bits |= args.src.peek_u8_as_u32() << n_bits
- args.src.skip_u32_fast!(actual: 1, worst_case: 1)
- n_bits += 8
- bits |= args.src.peek_u8_as_u32() << n_bits
- args.src.skip_u32_fast!(actual: 1, worst_case: 1)
- n_bits += 8
- assert n_bits >= 15
- } else {
- assert args.src.length() >= 4
- }
- // Once again, redundant but explicit assertions.
- assert args.src.length() >= 4
- assert n_bits >= 15
-
// Decode a dcode symbol from H-D.
table_entry = this.huffs[1][bits & dmask]
table_entry_n_bits = table_entry & 15
bits >>= table_entry_n_bits
- n_bits -= table_entry_n_bits
+ n_bits ~mod-= table_entry_n_bits
// Check for a redirect.
if (table_entry >> 28) == 1 {
- // Ensure that we have at least 15 bits of input.
- if n_bits < 15 {
- assert n_bits >= 0 // TODO: this shouldn't be necessary.
- bits |= args.src.peek_u8_as_u32() << n_bits
- args.src.skip_u32_fast!(actual: 1, worst_case: 1)
- n_bits += 8
- bits |= args.src.peek_u8_as_u32() << n_bits
- args.src.skip_u32_fast!(actual: 1, worst_case: 1)
- n_bits += 8
- assert n_bits >= 15
- } else {
- assert args.src.length() >= 2
- }
- // Once again, redundant but explicit assertions.
- assert args.src.length() >= 2
- assert n_bits >= 15
-
redir_top = (table_entry >> 8) & 0xFFFF
redir_mask = ((1 as base.u32) << ((table_entry >> 4) & 0x0F)) - 1
- table_entry = this.huffs[1][(redir_top + (bits & redir_mask)) & HUFFS_TABLE_MASK]
+ table_entry = this.huffs[1][
+ (redir_top + (((bits & 0xFFFF_FFFF) as base.u32) & redir_mask)) & HUFFS_TABLE_MASK]
table_entry_n_bits = table_entry & 0x0F
bits >>= table_entry_n_bits
- n_bits -= table_entry_n_bits
- } else {
- assert args.src.length() >= 2
+ n_bits ~mod-= table_entry_n_bits
}
// For H-D, all symbols should be base_number + extra_bits.
@@ -263,25 +169,10 @@
// undoing that bias makes proving (dist_minus_1 + 1) > 0 trivial.
dist_minus_1 = (table_entry >> 8) & 0x7FFF
table_entry_n_bits = (table_entry >> 4) & 0x0F
- // Ensure that we have at least table_entry_n_bits bits of input.
- if n_bits < table_entry_n_bits {
- assert n_bits < 15 via "a < b: a < c; c <= b"(c: table_entry_n_bits)
- assert n_bits >= 0 // TODO: this shouldn't be necessary.
- bits |= args.src.peek_u8_as_u32() << n_bits
- args.src.skip_u32_fast!(actual: 1, worst_case: 1)
- n_bits += 8
- bits |= args.src.peek_u8_as_u32() << n_bits
- args.src.skip_u32_fast!(actual: 1, worst_case: 1)
- n_bits += 8
- assert table_entry_n_bits <= n_bits via "a <= b: a <= c; c <= b"(c: 16)
- assert n_bits >= table_entry_n_bits via "a >= b: b <= a"()
- }
- // Once again, redundant but explicit assertions.
- assert n_bits >= table_entry_n_bits
- dist_minus_1 = (dist_minus_1 + bits.low_bits(n: table_entry_n_bits)) & 0x7FFF
+ dist_minus_1 = (dist_minus_1 + (bits.low_bits(n: table_entry_n_bits) as base.u32)) & 0x7FFF
bits >>= table_entry_n_bits
- n_bits -= table_entry_n_bits
+ n_bits ~mod-= table_entry_n_bits
// The "while true { etc; break }" is a redundant version of "etc", but
// its presence minimizes the diff between decode_huffman_fast and
@@ -355,6 +246,9 @@
// mean that the (possibly different) args.src is no longer rewindable,
// even if conceptually, this function was responsible for reading the
// bytes we want to rewind.
+ if n_bits > 63 {
+ return "#internal error: inconsistent n_bits"
+ }
while n_bits >= 8,
post n_bits < 8,
{
@@ -366,7 +260,7 @@
}
} endwhile
- this.bits = bits & (((1 as base.u32) << n_bits) - 1)
+ this.bits = (bits & (((1 as base.u64) << n_bits) - 1)) as base.u32
this.n_bits = n_bits
if (this.n_bits >= 8) or ((this.bits >> this.n_bits) <> 0) {