Add std/deflate decode_huffman_fast32
This undoes commit 1a63b53e "Have std/deflate read 64 bits once per
inner loop" for 32-bit CPUs: the generated fast32 code path keeps the
bit-buffer in a uint32_t and refills it one or two bytes at a time,
instead of reading a uint64_t once per inner-loop iteration.
On a Raspberry Pi 4 (32-bit armv7l):
wuffs_deflate_decode_1k_full_init/clang9 44.9MB/s ± 1% 50.1MB/s ± 1% +11.54% (p=0.008 n=5+5)
wuffs_deflate_decode_1k_part_init/clang9 52.8MB/s ± 0% 61.1MB/s ± 0% +15.61% (p=0.008 n=5+5)
wuffs_deflate_decode_10k_full_init/clang9 81.4MB/s ± 0% 84.5MB/s ± 0% +3.89% (p=0.008 n=5+5)
wuffs_deflate_decode_10k_part_init/clang9 87.9MB/s ± 0% 91.1MB/s ± 0% +3.67% (p=0.008 n=5+5)
wuffs_deflate_decode_100k_just_one_read/clang9 106MB/s ± 0% 113MB/s ± 0% +5.70% (p=0.008 n=5+5)
wuffs_deflate_decode_100k_many_big_reads/clang9 59.6MB/s ± 0% 59.1MB/s ± 0% -0.81% (p=0.008 n=5+5)
wuffs_deflate_decode_1k_full_init/gcc8 44.7MB/s ± 1% 50.9MB/s ± 1% +13.91% (p=0.008 n=5+5)
wuffs_deflate_decode_1k_part_init/gcc8 52.6MB/s ± 0% 61.5MB/s ± 0% +17.06% (p=0.008 n=5+5)
wuffs_deflate_decode_10k_full_init/gcc8 76.5MB/s ± 0% 85.6MB/s ± 1% +11.89% (p=0.008 n=5+5)
wuffs_deflate_decode_10k_part_init/gcc8 82.3MB/s ± 0% 92.2MB/s ± 0% +12.07% (p=0.008 n=5+5)
wuffs_deflate_decode_100k_just_one_read/gcc8 97.8MB/s ± 0% 112.1MB/s ± 0% +14.62% (p=0.008 n=5+5)
wuffs_deflate_decode_100k_many_big_reads/gcc8 58.0MB/s ± 0% 61.6MB/s ± 0% +6.09% (p=0.008 n=5+5)
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 1ef07d9..1e1941a 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -23084,12 +23084,12 @@
wuffs_base__io_buffer* a_src) {
wuffs_base__status status = wuffs_base__make_status(NULL);
- uint64_t v_bits = 0;
+ uint32_t v_bits = 0;
uint32_t v_n_bits = 0;
uint32_t v_table_entry = 0;
uint32_t v_table_entry_n_bits = 0;
- uint64_t v_lmask = 0;
- uint64_t v_dmask = 0;
+ uint32_t v_lmask = 0;
+ uint32_t v_dmask = 0;
uint32_t v_redir_top = 0;
uint32_t v_redir_mask = 0;
uint32_t v_length = 0;
@@ -23125,15 +23125,21 @@
status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_n_bits);
goto exit;
}
- v_bits = ((uint64_t)(self->private_impl.f_bits));
+ v_bits = self->private_impl.f_bits;
v_n_bits = self->private_impl.f_n_bits;
- v_lmask = ((((uint64_t)(1)) << self->private_impl.f_n_huffs_bits[0]) - 1);
- v_dmask = ((((uint64_t)(1)) << self->private_impl.f_n_huffs_bits[1]) - 1);
+ v_lmask = ((((uint32_t)(1)) << self->private_impl.f_n_huffs_bits[0]) - 1);
+ v_dmask = ((((uint32_t)(1)) << self->private_impl.f_n_huffs_bits[1]) - 1);
label__loop__continue:;
- while ((((uint64_t)(io2_a_dst - iop_a_dst)) >= 258) && (((uint64_t)(io2_a_src - iop_a_src)) >= 8)) {
- v_bits |= ((uint64_t)(wuffs_base__peek_u64le__no_bounds_check(iop_a_src) << (v_n_bits & 63)));
- iop_a_src += ((63 - (v_n_bits & 63)) >> 3);
- v_n_bits |= 56;
+ while ((((uint64_t)(io2_a_dst - iop_a_dst)) >= 258) && (((uint64_t)(io2_a_src - iop_a_src)) >= 12)) {
+ if (v_n_bits < 15) {
+ v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
+ iop_a_src += 1;
+ v_n_bits += 8;
+ v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
+ iop_a_src += 1;
+ v_n_bits += 8;
+ } else {
+ }
v_table_entry = self->private_data.f_huffs[0][(v_bits & v_lmask)];
v_table_entry_n_bits = (v_table_entry & 15);
v_bits >>= v_table_entry_n_bits;
@@ -23146,9 +23152,18 @@
self->private_impl.f_end_of_block = true;
goto label__loop__break;
} else if ((v_table_entry >> 28) != 0) {
+ if (v_n_bits < 15) {
+ v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
+ iop_a_src += 1;
+ v_n_bits += 8;
+ v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
+ iop_a_src += 1;
+ v_n_bits += 8;
+ } else {
+ }
v_redir_top = ((v_table_entry >> 8) & 65535);
v_redir_mask = ((((uint32_t)(1)) << ((v_table_entry >> 4) & 15)) - 1);
- v_table_entry = self->private_data.f_huffs[0][((v_redir_top + (((uint32_t)((v_bits & 4294967295))) & v_redir_mask)) & 1023)];
+ v_table_entry = self->private_data.f_huffs[0][((v_redir_top + (v_bits & v_redir_mask)) & 1023)];
v_table_entry_n_bits = (v_table_entry & 15);
v_bits >>= v_table_entry_n_bits;
v_n_bits -= v_table_entry_n_bits;
@@ -23179,21 +23194,50 @@
v_length = (((v_table_entry >> 8) & 255) + 3);
v_table_entry_n_bits = ((v_table_entry >> 4) & 15);
if (v_table_entry_n_bits > 0) {
- v_length = (((v_length + 253 + ((uint32_t)(((v_bits) & WUFFS_BASE__LOW_BITS_MASK__U64(v_table_entry_n_bits))))) & 255) + 3);
+ if (v_n_bits < 15) {
+ v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
+ iop_a_src += 1;
+ v_n_bits += 8;
+ v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
+ iop_a_src += 1;
+ v_n_bits += 8;
+ } else {
+ }
+ v_length = (((v_length + 253 + ((v_bits) & WUFFS_BASE__LOW_BITS_MASK__U32(v_table_entry_n_bits))) & 255) + 3);
v_bits >>= v_table_entry_n_bits;
v_n_bits -= v_table_entry_n_bits;
+ } else {
+ }
+ if (v_n_bits < 15) {
+ v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
+ iop_a_src += 1;
+ v_n_bits += 8;
+ v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
+ iop_a_src += 1;
+ v_n_bits += 8;
+ } else {
}
v_table_entry = self->private_data.f_huffs[1][(v_bits & v_dmask)];
v_table_entry_n_bits = (v_table_entry & 15);
v_bits >>= v_table_entry_n_bits;
v_n_bits -= v_table_entry_n_bits;
if ((v_table_entry >> 28) == 1) {
+ if (v_n_bits < 15) {
+ v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
+ iop_a_src += 1;
+ v_n_bits += 8;
+ v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
+ iop_a_src += 1;
+ v_n_bits += 8;
+ } else {
+ }
v_redir_top = ((v_table_entry >> 8) & 65535);
v_redir_mask = ((((uint32_t)(1)) << ((v_table_entry >> 4) & 15)) - 1);
- v_table_entry = self->private_data.f_huffs[1][((v_redir_top + (((uint32_t)((v_bits & 4294967295))) & v_redir_mask)) & 1023)];
+ v_table_entry = self->private_data.f_huffs[1][((v_redir_top + (v_bits & v_redir_mask)) & 1023)];
v_table_entry_n_bits = (v_table_entry & 15);
v_bits >>= v_table_entry_n_bits;
v_n_bits -= v_table_entry_n_bits;
+ } else {
}
if ((v_table_entry >> 24) != 64) {
if ((v_table_entry >> 24) == 8) {
@@ -23205,7 +23249,15 @@
}
v_dist_minus_1 = ((v_table_entry >> 8) & 32767);
v_table_entry_n_bits = ((v_table_entry >> 4) & 15);
- v_dist_minus_1 = ((v_dist_minus_1 + ((uint32_t)(((v_bits) & WUFFS_BASE__LOW_BITS_MASK__U64(v_table_entry_n_bits))))) & 32767);
+ if (v_n_bits < v_table_entry_n_bits) {
+ v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
+ iop_a_src += 1;
+ v_n_bits += 8;
+ v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
+ iop_a_src += 1;
+ v_n_bits += 8;
+ }
+ v_dist_minus_1 = ((v_dist_minus_1 + ((v_bits) & WUFFS_BASE__LOW_BITS_MASK__U32(v_table_entry_n_bits))) & 32767);
v_bits >>= v_table_entry_n_bits;
v_n_bits -= v_table_entry_n_bits;
while (true) {
@@ -23242,10 +23294,6 @@
label__0__break:;
}
label__loop__break:;
- if (v_n_bits > 63) {
- status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_n_bits);
- goto exit;
- }
while (v_n_bits >= 8) {
v_n_bits -= 8;
if (iop_a_src > io1_a_src) {
@@ -23255,7 +23303,7 @@
goto exit;
}
}
- self->private_impl.f_bits = ((uint32_t)((v_bits & ((((uint64_t)(1)) << v_n_bits) - 1))));
+ self->private_impl.f_bits = (v_bits & ((((uint32_t)(1)) << v_n_bits) - 1));
self->private_impl.f_n_bits = v_n_bits;
if ((self->private_impl.f_n_bits >= 8) || ((self->private_impl.f_bits >> self->private_impl.f_n_bits) != 0)) {
status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_n_bits);
diff --git a/std/deflate/decode_huffman_fast32.wuffs b/std/deflate/decode_huffman_fast32.wuffs
index cbeb691..d75ee37 100644
--- a/std/deflate/decode_huffman_fast32.wuffs
+++ b/std/deflate/decode_huffman_fast32.wuffs
@@ -20,12 +20,12 @@
// decode_huffman_*.wuffs files as small as possible, while retaining both
// correctness and performance.
- var bits : base.u64
+ var bits : base.u32
var n_bits : base.u32
var table_entry : base.u32
var table_entry_n_bits : base.u32[..= 15]
- var lmask : base.u64[..= 511]
- var dmask : base.u64[..= 511]
+ var lmask : base.u32[..= 511]
+ var dmask : base.u32[..= 511]
var redir_top : base.u32[..= 0xFFFF]
var redir_mask : base.u32[..= 0x7FFF]
var length : base.u32[..= 258]
@@ -37,14 +37,14 @@
return "#internal error: inconsistent n_bits"
}
- bits = this.bits as base.u64
+ bits = this.bits
n_bits = this.n_bits
- lmask = ((1 as base.u64) << this.n_huffs_bits[0]) - 1
- dmask = ((1 as base.u64) << this.n_huffs_bits[1]) - 1
+ lmask = ((1 as base.u32) << this.n_huffs_bits[0]) - 1
+ dmask = ((1 as base.u32) << this.n_huffs_bits[1]) - 1
// Check up front, on each iteration, that we have enough buffer space to
- // both read (8 bytes) and write (258 bytes) as much as we need to. Doing
+ // both read (12 bytes) and write (258 bytes) as much as we need to. Doing
// this check once (per iteration), up front, removes the need to check
// multiple times inside the loop body, so it's faster overall.
//
@@ -55,28 +55,45 @@
// For reading, strictly speaking, we only need 6 bytes (48 bits) of
// available input, because the H-L Literal/Length code is up to 15 bits
// plus up to 5 extra bits, the H-D Distance code is up to 15 bits plus up
- // to 13 extra bits and 15 + 5 + 15 + 13 == 48. However, it's faster to
- // read 64 bits than 48 bits or (48 - n_bits) bits.
- while.loop(args.dst.length() >= 258) and (args.src.length() >= 8) {
- // Ensure that we have at least 56 bits of input.
- //
- // This is "Variant 4" of
- // https://fgiesen.wordpress.com/2018/02/20/reading-bits-in-far-too-many-ways-part-2/
- //
- // 56, the number of bits in 7 bytes, is a property of that "Variant 4"
- // bit-reading technique, and not related to the DEFLATE format per se.
- //
- // The "& 63" is a no-op, and not part of the original "Variant 4"
- // technique, but satisfies Wuffs' overflow/underflow checks.
- bits |= args.src.peek_u64le() ~mod<< (n_bits & 63)
- args.src.skip_u32_fast!(actual: (63 - (n_bits & 63)) >> 3, worst_case: 8)
- n_bits |= 56
+ // to 13 extra bits and 15 + 5 + 15 + 13 == 48. However,
+ // args.src.length() assertions are made in terms of bytes, not bits.
+ //
+ // In theory, we could write assertions that mixed bytes and bits, such as:
+ //
+ // assert ((args.src.length() >= 6) and (n_bits >= 5)) or
+ // ((args.src.length() >= 4) and (n_bits >= 21))
+ //
+ // but that looks clumsy and makes the proofs more complicated. Instead, we
+ // are conservative and work in terms of bytes only. In bytes, each code
+ // requires either one (direct) or two (redirect) lookups, up to 2 bytes
+ // for each. There are also up to 13 extra bits, or 2 bytes. Thus, each
+ // code requires up to 2 + 2 + 2 == 6 bytes, and there are two codes (one
+ // from H-L and one from H-D). Therefore, we check for at least 12 bytes.
+ while.loop(args.dst.length() >= 258) and (args.src.length() >= 12) {
+ // Ensure that we have at least 15 bits of input.
+ if n_bits < 15 {
+ assert n_bits >= 0 // TODO: this shouldn't be necessary.
+ bits |= args.src.peek_u8_as_u32() << n_bits
+ args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+ n_bits += 8
+ bits |= args.src.peek_u8_as_u32() << n_bits
+ args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+ n_bits += 8
+ assert n_bits >= 15
+ } else {
+ assert args.src.length() >= 10
+ }
+ // These assertions are redundant, but are listed explicitly for
+ // clarity. They must hold regardless of which branch was taken for the
+ // if statement above.
+ assert args.src.length() >= 10
+ assert n_bits >= 15
// Decode an lcode symbol from H-L.
table_entry = this.huffs[0][bits & lmask]
table_entry_n_bits = table_entry & 0x0F
bits >>= table_entry_n_bits
- n_bits ~mod-= table_entry_n_bits
+ n_bits -= table_entry_n_bits
if (table_entry >> 31) <> 0 {
// Literal.
@@ -84,19 +101,37 @@
continue.loop
} else if (table_entry >> 30) <> 0 {
// No-op; code continues past the if-else chain.
+ assert args.src.length() >= 8
} else if (table_entry >> 29) <> 0 {
// End of block.
this.end_of_block = true
break.loop
} else if (table_entry >> 28) <> 0 {
// Redirect.
+
+ // Ensure that we have at least 15 bits of input.
+ if n_bits < 15 {
+ assert n_bits >= 0 // TODO: this shouldn't be necessary.
+ bits |= args.src.peek_u8_as_u32() << n_bits
+ args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+ n_bits += 8
+ bits |= args.src.peek_u8_as_u32() << n_bits
+ args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+ n_bits += 8
+ assert n_bits >= 15
+ } else {
+ assert args.src.length() >= 8
+ }
+ // Once again, redundant but explicit assertions.
+ assert args.src.length() >= 8
+ assert n_bits >= 15
+
redir_top = (table_entry >> 8) & 0xFFFF
redir_mask = ((1 as base.u32) << ((table_entry >> 4) & 0x0F)) - 1
- table_entry = this.huffs[0][
- (redir_top + (((bits & 0xFFFF_FFFF) as base.u32) & redir_mask)) & HUFFS_TABLE_MASK]
+ table_entry = this.huffs[0][(redir_top + (bits & redir_mask)) & HUFFS_TABLE_MASK]
table_entry_n_bits = table_entry & 0x0F
bits >>= table_entry_n_bits
- n_bits ~mod-= table_entry_n_bits
+ n_bits -= table_entry_n_bits
if (table_entry >> 31) <> 0 {
// Literal.
@@ -116,6 +151,9 @@
return "#internal error: inconsistent Huffman decoder state"
}
+ // Once again, redundant but explicit assertions.
+ assert args.src.length() >= 8
+
} else if (table_entry >> 27) <> 0 {
return "#bad Huffman code"
} else {
@@ -129,28 +167,83 @@
length = ((table_entry >> 8) & 0xFF) + 3
table_entry_n_bits = (table_entry >> 4) & 0x0F
if table_entry_n_bits > 0 {
+ // Ensure that we have at least 15 bits of input.
+ if n_bits < 15 {
+ assert n_bits >= 0 // TODO: this shouldn't be necessary.
+ bits |= args.src.peek_u8_as_u32() << n_bits
+ args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+ n_bits += 8
+ bits |= args.src.peek_u8_as_u32() << n_bits
+ args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+ n_bits += 8
+ assert n_bits >= 15
+ } else {
+ assert args.src.length() >= 6
+ }
+ // Once again, redundant but explicit assertions.
+ assert args.src.length() >= 6
+ assert n_bits >= 15
+
// The "+ 253" is the same as "- 3", after the "& 0xFF", but the
// plus form won't require an underflow check.
- length = ((length + 253 + (bits.low_bits(n: table_entry_n_bits) as base.u32)) & 0xFF) + 3
+ length = ((length + 253 + bits.low_bits(n: table_entry_n_bits)) & 0xFF) + 3
bits >>= table_entry_n_bits
- n_bits ~mod-= table_entry_n_bits
+ n_bits -= table_entry_n_bits
+
+ } else {
+ assert args.src.length() >= 6
}
+ // Ensure that we have at least 15 bits of input.
+ if n_bits < 15 {
+ assert n_bits >= 0 // TODO: this shouldn't be necessary.
+ bits |= args.src.peek_u8_as_u32() << n_bits
+ args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+ n_bits += 8
+ bits |= args.src.peek_u8_as_u32() << n_bits
+ args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+ n_bits += 8
+ assert n_bits >= 15
+ } else {
+ assert args.src.length() >= 4
+ }
+ // Once again, redundant but explicit assertions.
+ assert args.src.length() >= 4
+ assert n_bits >= 15
+
// Decode a dcode symbol from H-D.
table_entry = this.huffs[1][bits & dmask]
table_entry_n_bits = table_entry & 15
bits >>= table_entry_n_bits
- n_bits ~mod-= table_entry_n_bits
+ n_bits -= table_entry_n_bits
// Check for a redirect.
if (table_entry >> 28) == 1 {
+ // Ensure that we have at least 15 bits of input.
+ if n_bits < 15 {
+ assert n_bits >= 0 // TODO: this shouldn't be necessary.
+ bits |= args.src.peek_u8_as_u32() << n_bits
+ args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+ n_bits += 8
+ bits |= args.src.peek_u8_as_u32() << n_bits
+ args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+ n_bits += 8
+ assert n_bits >= 15
+ } else {
+ assert args.src.length() >= 2
+ }
+ // Once again, redundant but explicit assertions.
+ assert args.src.length() >= 2
+ assert n_bits >= 15
+
redir_top = (table_entry >> 8) & 0xFFFF
redir_mask = ((1 as base.u32) << ((table_entry >> 4) & 0x0F)) - 1
- table_entry = this.huffs[1][
- (redir_top + (((bits & 0xFFFF_FFFF) as base.u32) & redir_mask)) & HUFFS_TABLE_MASK]
+ table_entry = this.huffs[1][(redir_top + (bits & redir_mask)) & HUFFS_TABLE_MASK]
table_entry_n_bits = table_entry & 0x0F
bits >>= table_entry_n_bits
- n_bits ~mod-= table_entry_n_bits
+ n_bits -= table_entry_n_bits
+ } else {
+ assert args.src.length() >= 2
}
// For H-D, all symbols should be base_number + extra_bits.
@@ -169,10 +262,25 @@
// undoing that bias makes proving (dist_minus_1 + 1) > 0 trivial.
dist_minus_1 = (table_entry >> 8) & 0x7FFF
table_entry_n_bits = (table_entry >> 4) & 0x0F
+ // Ensure that we have at least table_entry_n_bits bits of input.
+ if n_bits < table_entry_n_bits {
+ assert n_bits < 15 via "a < b: a < c; c <= b"(c: table_entry_n_bits)
+ assert n_bits >= 0 // TODO: this shouldn't be necessary.
+ bits |= args.src.peek_u8_as_u32() << n_bits
+ args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+ n_bits += 8
+ bits |= args.src.peek_u8_as_u32() << n_bits
+ args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+ n_bits += 8
+ assert table_entry_n_bits <= n_bits via "a <= b: a <= c; c <= b"(c: 16)
+ assert n_bits >= table_entry_n_bits via "a >= b: b <= a"()
+ }
+ // Once again, redundant but explicit assertions.
+ assert n_bits >= table_entry_n_bits
- dist_minus_1 = (dist_minus_1 + (bits.low_bits(n: table_entry_n_bits) as base.u32)) & 0x7FFF
+ dist_minus_1 = (dist_minus_1 + bits.low_bits(n: table_entry_n_bits)) & 0x7FFF
bits >>= table_entry_n_bits
- n_bits ~mod-= table_entry_n_bits
+ n_bits -= table_entry_n_bits
// The "while true { etc; break }" is a redundant version of "etc", but
// its presence minimizes the diff between decode_huffman_fastxx and
@@ -248,9 +356,6 @@
// mean that the (possibly different) args.src is no longer rewindable,
// even if conceptually, this function was responsible for reading the
// bytes we want to rewind.
- if n_bits > 63 {
- return "#internal error: inconsistent n_bits"
- }
while n_bits >= 8,
post n_bits < 8,
{
@@ -262,7 +367,7 @@
}
} endwhile
- this.bits = (bits & (((1 as base.u64) << n_bits) - 1)) as base.u32
+ this.bits = bits & (((1 as base.u32) << n_bits) - 1)
this.n_bits = n_bits
if (this.n_bits >= 8) or ((this.bits >> this.n_bits) <> 0) {