| // Copyright 2017 The Wuffs Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // https://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // TODO: describe how the xxx_fastxx version differs from the xxx_slow one, the |
| // assumptions that xxx_fastxx makes, and how that makes it fast. |
| pri func decoder.decode_huffman_fast64!(dst: base.io_writer, src: base.io_reader) base.status { |
| // When editing this function, consider making the equivalent change to the |
| // decode_huffman_slow function. Keep the diff between the two |
| // decode_huffman_*.wuffs files as small as possible, while retaining both |
| // correctness and performance. |
| |
| var bits : base.u64 |
| var n_bits : base.u32 |
| var table_entry : base.u32 |
| var table_entry_n_bits : base.u32[..= 15] |
| var lmask : base.u64[..= 511] |
| var dmask : base.u64[..= 511] |
| var redir_top : base.u32[..= 0xFFFF] |
| var redir_mask : base.u32[..= 0x7FFF] |
| var length : base.u32[..= 258] |
| var dist_minus_1 : base.u32[..= 0x7FFF] |
| var hlen : base.u32[..= 0x7FFF] |
| var hdist : base.u32 |
| |
| if (this.n_bits >= 8) or ((this.bits >> (this.n_bits & 7)) <> 0) { |
| return "#internal error: inconsistent n_bits" |
| } |
| |
| bits = this.bits as base.u64 |
| n_bits = this.n_bits |
| |
| lmask = ((1 as base.u64) << this.n_huffs_bits[0]) - 1 |
| dmask = ((1 as base.u64) << this.n_huffs_bits[1]) - 1 |
| |
| // Check up front, on each iteration, that we have enough buffer space to |
| // both read (8 bytes) and write (266 bytes) as much as we need to. Doing |
| // this check once (per iteration), up front, removes the need to check |
| // multiple times inside the loop body, so it's faster overall. |
| // |
| // For writing, a literal code obviously corresponds to writing 1 byte, and |
| // 258 is the maximum length in a length-distance pair, as specified in the |
| // RFC section 3.2.5. Compressed blocks (length and distance codes). |
| // |
| // We adjust 258 up to (258 + 8) = 266 since it can be faster to overshoot |
| // a little and make multiple-of-8-byte copies even when the length in the |
| // length-distance pair isn't an exact multiple-of-8. Strictly speaking, |
| // 264 (an exact multiple-of-8) is the tight bound but (258 + 8) is easier |
| // for the Wuffs proof system, as length's type is refined to [..= 258], |
| // |
| // For reading, strictly speaking, we only need 6 bytes (48 bits) of |
| // available input, because the H-L Literal/Length code is up to 15 bits |
| // plus up to 5 extra bits, the H-D Distance code is up to 15 bits plus up |
| // to 13 extra bits and 15 + 5 + 15 + 13 == 48. However, it's faster to |
| // read 64 bits than 48 bits or (48 - n_bits) bits. |
| while.loop(args.dst.length() >= 266) and (args.src.length() >= 8) { |
| // Ensure that we have at least 56 bits of input. |
| // |
| // This is "Variant 4" of |
| // https://fgiesen.wordpress.com/2018/02/20/reading-bits-in-far-too-many-ways-part-2/ |
| // |
| // 56, the number of bits in 7 bytes, is a property of that "Variant 4" |
| // bit-reading technique, and not related to the DEFLATE format per se. |
| // |
| // The "& 63" is a no-op, and not part of the original "Variant 4" |
| // technique, but satisfies Wuffs' overflow/underflow checks. |
| bits |= args.src.peek_u64le() ~mod<< (n_bits & 63) |
| args.src.skip_u32_fast!(actual: (63 - (n_bits & 63)) >> 3, worst_case: 8) |
| n_bits |= 56 |
| |
| // Decode an lcode symbol from H-L. |
| table_entry = this.huffs[0][bits & lmask] |
| table_entry_n_bits = table_entry & 0x0F |
| bits >>= table_entry_n_bits |
| n_bits ~mod-= table_entry_n_bits |
| |
| if (table_entry >> 31) <> 0 { |
| // Literal. |
| args.dst.write_u8_fast!(a: ((table_entry >> 8) & 0xFF) as base.u8) |
| continue.loop |
| } else if (table_entry >> 30) <> 0 { |
| // No-op; code continues past the if-else chain. |
| } else if (table_entry >> 29) <> 0 { |
| // End of block. |
| this.end_of_block = true |
| break.loop |
| } else if (table_entry >> 28) <> 0 { |
| // Redirect. |
| redir_top = (table_entry >> 8) & 0xFFFF |
| redir_mask = ((1 as base.u32) << ((table_entry >> 4) & 0x0F)) - 1 |
| table_entry = this.huffs[0][ |
| (redir_top + (((bits & 0xFFFF_FFFF) as base.u32) & redir_mask)) & HUFFS_TABLE_MASK] |
| table_entry_n_bits = table_entry & 0x0F |
| bits >>= table_entry_n_bits |
| n_bits ~mod-= table_entry_n_bits |
| |
| if (table_entry >> 31) <> 0 { |
| // Literal. |
| args.dst.write_u8_fast!(a: ((table_entry >> 8) & 0xFF) as base.u8) |
| continue.loop |
| } else if (table_entry >> 30) <> 0 { |
| // No-op; code continues past the if-else chain. |
| } else if (table_entry >> 29) <> 0 { |
| // End of block. |
| this.end_of_block = true |
| break.loop |
| } else if (table_entry >> 28) <> 0 { |
| return "#internal error: inconsistent Huffman decoder state" |
| } else if (table_entry >> 27) <> 0 { |
| return "#bad Huffman code" |
| } else { |
| return "#internal error: inconsistent Huffman decoder state" |
| } |
| |
| } else if (table_entry >> 27) <> 0 { |
| return "#bad Huffman code" |
| } else { |
| return "#internal error: inconsistent Huffman decoder state" |
| } |
| |
| // length = base_number_minus_3 + 3 + extra_bits. |
| // |
| // The -3 is from the bias in script/print-deflate-magic-numbers.go. |
| // That bias makes the "& 0xFF" 1 and 15-ish lines below correct. |
| length = ((table_entry >> 8) & 0xFF) + 3 |
| table_entry_n_bits = (table_entry >> 4) & 0x0F |
| if table_entry_n_bits > 0 { |
| // The "+ 253" is the same as "- 3", after the "& 0xFF", but the |
| // plus form won't require an underflow check. |
| length = ((length + 253 + (bits.low_bits(n: table_entry_n_bits) as base.u32)) & 0xFF) + 3 |
| bits >>= table_entry_n_bits |
| n_bits ~mod-= table_entry_n_bits |
| } |
| |
| // Decode a dcode symbol from H-D. |
| table_entry = this.huffs[1][bits & dmask] |
| table_entry_n_bits = table_entry & 15 |
| bits >>= table_entry_n_bits |
| n_bits ~mod-= table_entry_n_bits |
| |
| // Check for a redirect. |
| if (table_entry >> 28) == 1 { |
| redir_top = (table_entry >> 8) & 0xFFFF |
| redir_mask = ((1 as base.u32) << ((table_entry >> 4) & 0x0F)) - 1 |
| table_entry = this.huffs[1][ |
| (redir_top + (((bits & 0xFFFF_FFFF) as base.u32) & redir_mask)) & HUFFS_TABLE_MASK] |
| table_entry_n_bits = table_entry & 0x0F |
| bits >>= table_entry_n_bits |
| n_bits ~mod-= table_entry_n_bits |
| } |
| |
| // For H-D, all symbols should be base_number + extra_bits. |
| if (table_entry >> 24) <> 0x40 { |
| if (table_entry >> 24) == 0x08 { |
| return "#bad Huffman code" |
| } |
| return "#internal error: inconsistent Huffman decoder state" |
| } |
| |
| // dist_minus_1 = base_number_minus_1 + extra_bits. |
| // distance = dist_minus_1 + 1. |
| // |
| // The -1 is from the bias in script/print-deflate-magic-numbers.go. |
| // That bias makes the "& 0x7FFF" 2 and 15-ish lines below correct and |
| // undoing that bias makes proving (dist_minus_1 + 1) > 0 trivial. |
| dist_minus_1 = (table_entry >> 8) & 0x7FFF |
| table_entry_n_bits = (table_entry >> 4) & 0x0F |
| |
| dist_minus_1 = (dist_minus_1 + (bits.low_bits(n: table_entry_n_bits) as base.u32)) & 0x7FFF |
| bits >>= table_entry_n_bits |
| n_bits ~mod-= table_entry_n_bits |
| |
| // The "while true { etc; break }" is a redundant version of "etc", but |
| // its presence minimizes the diff between decode_huffman_fastxx and |
| // decode_huffman_slow. |
| while true, |
| pre args.dst.length() >= 266, |
| { |
| // We can therefore prove: |
| assert (length as base.u64) <= args.dst.length() via "a <= b: a <= c; c <= b"(c: 266) |
| assert ((length + 8) as base.u64) <= args.dst.length() via "a <= b: a <= c; c <= b"(c: 266) |
| |
| // Copy from this.history. |
| if ((dist_minus_1 + 1) as base.u64) > args.dst.history_length() { |
| // Set (hlen, hdist) to be the length-distance pair to copy |
| // from this.history, and (length, distance) to be the |
| // remaining length-distance pair to copy from args.dst. |
| hlen = 0 |
| hdist = (((dist_minus_1 + 1) as base.u64) - args.dst.history_length()) as base.u32 |
| if length > hdist { |
| assert hdist < length via "a < b: b > a"() |
| assert hdist < 0x8000 via "a < b: a < c; c <= b"(c: length) |
| length -= hdist |
| hlen = hdist |
| } else { |
| hlen = length |
| length = 0 |
| } |
| if this.history_index < hdist { |
| return "#bad distance" |
| } |
| // Re-purpose the hdist variable as the this.history index to |
| // start copying from. |
| hdist = this.history_index - hdist |
| |
| // Copy from hdist to the end of this.history. |
| // |
| // This copying is simpler than the decode_huffman_slow version |
| // because it cannot yield. We have already checked that |
| // args.dst.length() is large enough. |
| args.dst.limited_copy_u32_from_slice!( |
| up_to: hlen, s: this.history[hdist & 0x7FFF ..]) |
| |
| if length == 0 { |
| // No need to copy from args.dst. |
| continue.loop |
| } |
| |
| if (((dist_minus_1 + 1) as base.u64) > args.dst.history_length()) or |
| ((length as base.u64) > args.dst.length()) or |
| (((length + 8) as base.u64) > args.dst.length()) { |
| return "#internal error: inconsistent distance" |
| } |
| } |
| // Once again, redundant but explicit assertions. |
| assert (dist_minus_1 + 1) >= 1 |
| assert ((dist_minus_1 + 1) as base.u64) <= args.dst.history_length() |
| assert (length as base.u64) <= args.dst.length() |
| assert ((length + 8) as base.u64) <= args.dst.length() |
| |
| // Copy from args.dst. |
| // |
| // For short distances, less than 8 bytes, copying atomic 8-byte |
| // chunks can result in incorrect output, so we fall back to a |
| // slower 1-byte-at-a-time copy. Deflate's copy-from-history can |
| // pick up freshly written bytes. A length = 5, distance = 2 copy |
| // starting with "abc" should give "abcbcbcb" but the 8-byte chunk |
| // technique would give "abcbc???", the exact output depending on |
| // what was previously in the writer buffer. |
| if (dist_minus_1 + 1) >= 8 { |
| args.dst.limited_copy_u32_from_history_8_byte_chunks_fast!( |
| up_to: length, distance: (dist_minus_1 + 1)) |
| } else { |
| args.dst.limited_copy_u32_from_history_fast!( |
| up_to: length, distance: (dist_minus_1 + 1)) |
| } |
| break |
| } endwhile |
| } endwhile.loop |
| |
| // Ensure n_bits < 8 by rewindng args.src, if we loaded too many of its |
| // bytes into the bits variable. |
| // |
| // Note that we can unconditionally call undo_read (without resulting in an |
| // "invalid I/O operation" error code) only because this whole function can |
| // never suspend, as all of its I/O operations were checked beforehand for |
| // sufficient buffer space. Otherwise, resuming from the suspension could |
| // mean that the (possibly different) args.src is no longer rewindable, |
| // even if conceptually, this function was responsible for reading the |
| // bytes we want to rewind. |
| if n_bits > 63 { |
| return "#internal error: inconsistent n_bits" |
| } |
| while n_bits >= 8, |
| post n_bits < 8, |
| { |
| n_bits -= 8 |
| if args.src.can_undo_byte() { |
| args.src.undo_byte!() |
| } else { |
| return "#internal error: inconsistent I/O" |
| } |
| } endwhile |
| |
| this.bits = (bits & (((1 as base.u64) << n_bits) - 1)) as base.u32 |
| this.n_bits = n_bits |
| |
| if (this.n_bits >= 8) or ((this.bits >> this.n_bits) <> 0) { |
| return "#internal error: inconsistent n_bits" |
| } |
| } |