blob: 99ecc19f09b3ddb4da66d8a02309fde07a55c45d [file] [log] [blame]
// Copyright 2017 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//
// SPDX-License-Identifier: Apache-2.0 OR MIT
// ----------------
// This file contains a hand-written C implementation of
// release/c/wuffs-unsupported-snapshot.c's generated
// wuffs_deflate__decoder__decode_huffman_fast function.
//
// It is not intended to be used in production settings, on untrusted data. Its
// purpose is to give a rough upper bound on how fast Wuffs' generated C code
// can be, with a sufficiently smart Wuffs compiler.
//
// To repeat, substituting in this C implementation is NOT SAFE, and may result
// in buffer overflows. This code exists only to aid optimization of the (safe)
// std/deflate/*.wuffs code and the Wuffs compiler's code generation.
// ----------------
// Having said that, to generate the benchmark numbers with this hand-written C
// implementation, edit release/c/wuffs-unsupported-snapshot.c and find the
// lines that say
//
// // ‼ WUFFS C HEADER ENDS HERE.
// #ifdef WUFFS_IMPLEMENTATION
//
// After that, add this line:
//
// #include "../../script/wuffs-deflate-decoder-decode-huffman.c"
//
// Then find the calls to wuffs_deflate__decoder__decode_huffman_fastxx. They
// should be inside the wuffs_deflate__decoder__decode_blocks function body,
// and the lines of code should look something like
// v_status = wuffs_deflate__decoder__decode_huffman_fast32(etc);
// and
// v_status = wuffs_deflate__decoder__decode_huffman_fast64(etc);
//
// Change the "wuffs_etc_fastxx" to "c_wuffs_etc_fast", i.e. add a "c_" prefix
// and drop the "32" or "64" suffix. The net result should look something like:
//
// clang-format off
/*
$ git diff release/c/wuffs-unsupported-snapshot.c
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index d255a788..2a112728 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -9758,6 +9758,8 @@ DecodeJson(DecodeJsonCallbacks& callbacks,
// ‼ WUFFS C HEADER ENDS HERE.
#ifdef WUFFS_IMPLEMENTATION
+#include "../../script/wuffs-deflate-decoder-decode-huffman.c"
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -25326,7 +25328,7 @@ wuffs_deflate__decoder__decode_blocks(
if (a_src) {
a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr));
}
- v_status = wuffs_deflate__decoder__decode_huffman_fast32(self, a_dst, a_src);
+ v_status = c_wuffs_deflate__decoder__decode_huffman_fast(self, a_dst, a_src);
if (a_src) {
iop_a_src = a_src->data.ptr + a_src->meta.ri;
}
@@ -25334,7 +25336,7 @@ wuffs_deflate__decoder__decode_blocks(
if (a_src) {
a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr));
}
- v_status = wuffs_deflate__decoder__decode_huffman_fast64(self, a_dst, a_src);
+ v_status = c_wuffs_deflate__decoder__decode_huffman_fast(self, a_dst, a_src);
if (a_src) {
iop_a_src = a_src->data.ptr + a_src->meta.ri;
}
*/
// clang-format on
//
// That concludes the edits to release/c/wuffs-unsupported-snapshot.c. Run the
// tests and benchmarks with the "-skipgen" flag, otherwise the "wuffs" tool
// will re-generate the C code and override your edits:
//
// wuffs test -skipgen std/deflate
// wuffs bench -skipgen std/deflate
//
// You may also want to focus on one specific test, e.g.:
//
// wuffs test -skipgen -focus=wuffs_deflate_decode_midsummer std/deflate
// wuffs bench -skipgen -focus=wuffs_deflate_decode_100k_just std/deflate
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h> // For manual printf debugging.
#include <string.h>
extern const char wuffs_deflate__error__internal_error_inconsistent_distance[];
extern const char
wuffs_deflate__error__internal_error_inconsistent_huffman_decoder_state[];
// ----------------
// Define this macro to make this file compilable standalone, instead of
// hooking it into release/c/wuffs-unsupported-snapshot.c per the instructions
// above. Standalone means that you can run a "does it compile" check by
//
// gcc -c script/wuffs-deflate-decoder-decode-huffman.c -o /dev/null
//
// and also drop this file into disassembly tools like https://godbolt.org/
//#define MAKE_THIS_FILE_COMPILABLE_STANDALONE
#ifdef MAKE_THIS_FILE_COMPILABLE_STANDALONE
typedef struct {
const char* repr;
} wuffs_base__status;
static inline wuffs_base__status //
wuffs_base__make_status(const char* repr) {
wuffs_base__status z;
z.repr = repr;
return z;
}
typedef struct {
uint8_t* ptr;
size_t len;
} wuffs_base__slice_u8;
typedef struct {
size_t wi;
size_t ri;
uint64_t pos;
bool closed;
} wuffs_base__io_buffer_meta;
typedef struct {
wuffs_base__slice_u8 data;
wuffs_base__io_buffer_meta meta;
} wuffs_base__io_buffer;
typedef struct wuffs_deflate__decoder__struct {
struct {
uint32_t magic;
uint32_t f_bits;
uint32_t f_n_bits;
uint32_t f_history_index;
uint32_t f_n_huffs_bits[2];
bool f_end_of_block;
} private_impl;
struct {
// 1024 is huffs_table_size in std/deflate/decode_deflate.wuffs.
uint32_t f_huffs[2][1024];
uint8_t f_history[32768];
uint8_t f_code_lengths[320];
} private_data;
} wuffs_deflate__decoder;
const char wuffs_base__error__bad_argument[] = "?base: bad argument";
const char wuffs_deflate__error__bad_distance[] = "?deflate: bad distance";
const char wuffs_deflate__error__internal_error_inconsistent_distance[] =
"?deflate: internal error: inconsistent distance";
const char
wuffs_deflate__error__internal_error_inconsistent_huffman_decoder_state[] =
"?deflate: internal error: inconsistent Huffman decoder state";
static wuffs_base__status //
wuffs_deflate__decoder__decode_huffman_fast(wuffs_deflate__decoder* self,
wuffs_base__io_buffer* a_dst,
wuffs_base__io_buffer* a_src) {
return wuffs_base__make_status(NULL);
}
#endif // MAKE_THIS_FILE_COMPILABLE_STANDALONE
// ----------------
// Define this macro to further speed up this C implementation, using two
// techniques that are not used by the zlib-the-library C implementation (as of
// version 1.2.11, current as of November 2017). Doing so gives data for "how
// fast can Wuffs' zlib-the-format implementation be" instead of "is Wuffs'
// generated C code as fast as zlib-the-library's hand-written C code".
//
// Whether the same techniques could apply to zlib-the-library is discussed at
// https://github.com/madler/zlib/pull/292
#ifdef __x86_64__
//#define WUFFS_DEFLATE__HAVE_64_BIT_UNALIGNED_LITTLE_ENDIAN_LOADS
#endif
// wuffs_base__width_to_mask_table is the width_to_mask_table from
// https://fgiesen.wordpress.com/2018/02/19/reading-bits-in-far-too-many-ways-part-1/
//
// Look for "It may feel ridiculous" on that page for the rationale.
static const uint32_t wuffs_base__width_to_mask_table[33] = {
0x00000000u,
0x00000001u, 0x00000003u, 0x00000007u, 0x0000000Fu,
0x0000001Fu, 0x0000003Fu, 0x0000007Fu, 0x000000FFu,
0x000001FFu, 0x000003FFu, 0x000007FFu, 0x00000FFFu,
0x00001FFFu, 0x00003FFFu, 0x00007FFFu, 0x0000FFFFu,
0x0001FFFFu, 0x0003FFFFu, 0x0007FFFFu, 0x000FFFFFu,
0x001FFFFFu, 0x003FFFFFu, 0x007FFFFFu, 0x00FFFFFFu,
0x01FFFFFFu, 0x03FFFFFFu, 0x07FFFFFFu, 0x0FFFFFFFu,
0x1FFFFFFFu, 0x3FFFFFFFu, 0x7FFFFFFFu, 0xFFFFFFFFu,
};
// Uncomment exactly one of these. Choosing between them lets us measure the
// effect of the table-based mask.
//#define MASK(n) ((((uint32_t)1) << (n)) - 1)
#define MASK(n) wuffs_base__width_to_mask_table[n]
// ----------------
// These are the generated functions that we are explicitly overriding. Note
// that the function name is "wuffs_etc", not "c_wuffs_etc".
static wuffs_base__status //
wuffs_deflate__decoder__decode_huffman_fast32(wuffs_deflate__decoder* self,
wuffs_base__io_buffer* a_dst,
wuffs_base__io_buffer* a_src);
static wuffs_base__status //
wuffs_deflate__decoder__decode_huffman_fast64(wuffs_deflate__decoder* self,
wuffs_base__io_buffer* a_dst,
wuffs_base__io_buffer* a_src);
// This is the overriding implementation.
wuffs_base__status //
c_wuffs_deflate__decoder__decode_huffman_fast(wuffs_deflate__decoder* self,
wuffs_base__io_buffer* a_dst,
wuffs_base__io_buffer* a_src) {
// Avoid the -Werror=unused-function warning for the now-unused
// overridden wuffs_deflate__decoder__decode_huffman_fastxx functions.
(void)(wuffs_deflate__decoder__decode_huffman_fast32);
(void)(wuffs_deflate__decoder__decode_huffman_fast64);
if (!a_dst || !a_src) {
return wuffs_base__make_status(wuffs_base__error__bad_argument);
}
wuffs_base__status status = wuffs_base__make_status(NULL);
// Load contextual state. Prepare to check that pdst and psrc remain within
// a_dst's and a_src's bounds.
uint8_t* pdst = a_dst->data.ptr + a_dst->meta.wi;
uint8_t* qdst = a_dst->data.ptr + a_dst->data.len;
if ((qdst - pdst) < 258) {
return wuffs_base__make_status(NULL);
} else {
qdst -= 258;
}
uint8_t* psrc = a_src->data.ptr + a_src->meta.ri;
uint8_t* qsrc = a_src->data.ptr + a_src->meta.wi;
if ((qsrc - psrc) < 12) {
return wuffs_base__make_status(NULL);
} else {
qsrc -= 12;
}
#if defined(WUFFS_DEFLATE__HAVE_64_BIT_UNALIGNED_LITTLE_ENDIAN_LOADS)
uint64_t bits = self->private_impl.f_bits;
#else
uint32_t bits = self->private_impl.f_bits;
#endif
uint32_t n_bits = self->private_impl.f_n_bits;
// Initialize other local variables.
uint8_t* pdst_mark = a_dst->data.ptr;
uint32_t lmask = MASK(self->private_impl.f_n_huffs_bits[0]);
uint32_t dmask = MASK(self->private_impl.f_n_huffs_bits[1]);
outer_loop:
while ((pdst <= qdst) && (psrc <= qsrc)) {
#if defined(WUFFS_DEFLATE__HAVE_64_BIT_UNALIGNED_LITTLE_ENDIAN_LOADS)
// Ensure that we have at least 56 bits of input.
//
// This is "Variant 4" of
// https://fgiesen.wordpress.com/2018/02/20/reading-bits-in-far-too-many-ways-part-2/
//
// 56, the number of bits in 7 bytes, is a property of that "Variant 4"
// bit-reading technique, and not related to the DEFLATE format per se.
//
// Specifically for DEFLATE, we need only up to 48 bits per outer_loop
// iteration. The maximum input bits used by a length/distance pair is 15
// bits for the length code, 5 bits for the length extra, 15 bits for the
// distance code, and 13 bits for the distance extra. This totals 48 bits.
//
// The fact that the 48 we need is less than the 56 we get is a happy
// coincidence. It lets us eliminate any other loads in the loop body.
uint64_t src_u64;
memcpy(&src_u64, psrc, 8);
bits |= src_u64 << n_bits;
psrc += (63 - n_bits) >> 3;
n_bits |= 56;
#else
// Ensure that we have at least 15 bits of input.
if (n_bits < 15) {
bits |= ((uint32_t)(*psrc++)) << n_bits;
n_bits += 8;
bits |= ((uint32_t)(*psrc++)) << n_bits;
n_bits += 8;
}
#endif
// Decode an lcode symbol from H-L.
uint32_t table_entry = self->private_data.f_huffs[0][bits & lmask];
while (true) {
uint32_t n = table_entry & 0x0F;
bits >>= n;
n_bits -= n;
if ((table_entry >> 31) != 0) {
// Literal.
*pdst++ = (uint8_t)(table_entry >> 8);
goto outer_loop;
}
if ((table_entry >> 30) != 0) {
// Back-reference; length = base_number + extra_bits.
break;
}
if ((table_entry >> 29) != 0) {
// End of block.
self->private_impl.f_end_of_block = true;
goto end;
}
if ((table_entry >> 24) != 0x10) {
status = wuffs_base__make_status(
wuffs_deflate__error__internal_error_inconsistent_huffman_decoder_state);
goto end;
}
uint32_t top = (table_entry >> 8) & 0xFFFF;
uint32_t mask = MASK((table_entry >> 4) & 0x0F);
table_entry = self->private_data.f_huffs[0][top + (bits & mask)];
}
// length = base_number_minus_3 + 3 + extra_bits.
uint32_t length = ((table_entry >> 8) & 0xFF) + 3;
{
uint32_t n = (table_entry >> 4) & 0x0F;
if (n) {
#if !defined(WUFFS_DEFLATE__HAVE_64_BIT_UNALIGNED_LITTLE_ENDIAN_LOADS)
if (n_bits < n) {
bits |= ((uint32_t)(*psrc++)) << n_bits;
n_bits += 8;
}
#endif
length += bits & MASK(n);
bits >>= n;
n_bits -= n;
}
}
#if !defined(WUFFS_DEFLATE__HAVE_64_BIT_UNALIGNED_LITTLE_ENDIAN_LOADS)
// Ensure that we have at least 15 bits of input.
if (n_bits < 15) {
bits |= ((uint32_t)(*psrc++)) << n_bits;
n_bits += 8;
bits |= ((uint32_t)(*psrc++)) << n_bits;
n_bits += 8;
}
#endif
// Decode a dcode symbol from H-D.
table_entry = self->private_data.f_huffs[1][bits & dmask];
while (true) {
uint32_t n = table_entry & 0x0F;
bits >>= n;
n_bits -= n;
if ((table_entry >> 30) != 0) {
// Back-reference; distance = base_number + extra_bits.
break;
}
if ((table_entry >> 24) != 0x10) {
status = wuffs_base__make_status(
wuffs_deflate__error__internal_error_inconsistent_huffman_decoder_state);
goto end;
}
uint32_t top = (table_entry >> 8) & 0xFFFF;
uint32_t mask = MASK((table_entry >> 4) & 0x0F);
table_entry = self->private_data.f_huffs[1][top + (bits & mask)];
}
// dist_minus_1 = base_number_minus_1 + extra_bits.
// distance = dist_minus_1 + 1.
uint32_t dist_minus_1 = (table_entry >> 8) & 0xFFFF;
{
uint32_t n = (table_entry >> 4) & 0x0F;
#if !defined(WUFFS_DEFLATE__HAVE_64_BIT_UNALIGNED_LITTLE_ENDIAN_LOADS)
// Ensure that we have at least n bits of input.
if (n_bits < n) {
bits |= ((uint32_t)(*psrc++)) << n_bits;
n_bits += 8;
bits |= ((uint32_t)(*psrc++)) << n_bits;
n_bits += 8;
}
#endif
dist_minus_1 += bits & MASK(n);
bits >>= n;
n_bits -= n;
}
// Copy from the history buffer, if necessary.
if ((ptrdiff_t)(dist_minus_1 + 1) > (pdst - pdst_mark)) {
// Set (hlen, hdist) to be the length-distance pair to copy from
// this.history, and (length, distance) to be the remaining
// length-distance pair to copy from args.dst.
uint32_t hlen = 0;
uint32_t hdist = ((dist_minus_1 + 1) - (pdst - pdst_mark));
if (length > hdist) {
length -= hdist;
hlen = hdist;
} else {
hlen = length;
length = 0;
}
if (self->private_impl.f_history_index < hdist) {
status = wuffs_base__make_status(wuffs_deflate__error__bad_distance);
goto end;
}
// Copy up to the right edge of the f_history array.
uint32_t history_index =
(self->private_impl.f_history_index - hdist) & 0x7FFF;
uint32_t available = 0x8000 - history_index;
uint32_t n_copied = (hlen < available) ? hlen : available;
memmove(pdst, self->private_data.f_history + history_index, n_copied);
pdst += n_copied;
// Copy from the left edge of the f_history array.
if (hlen > n_copied) {
hlen -= n_copied;
memmove(pdst, self->private_data.f_history, hlen);
pdst += hlen;
}
if (length == 0) {
goto outer_loop;
}
if ((ptrdiff_t)(dist_minus_1 + 1) > (pdst - pdst_mark)) {
status = wuffs_base__make_status(
wuffs_deflate__error__internal_error_inconsistent_distance);
goto end;
}
}
uint8_t* pback = pdst - (dist_minus_1 + 1);
#if defined(WUFFS_DEFLATE__HAVE_64_BIT_UNALIGNED_LITTLE_ENDIAN_LOADS)
// Back-copy fast path, copying 8 instead of 1 bytes at a time.
//
// This always copies 8*N bytes (where N is the smallest integer such that
// 8*N >= length, i.e. we round length up to a multiple of 8), instead of
// only length bytes, but that's OK, as subsequent iterations will fix up
// the overrun.
if (dist_minus_1 + 1 >= 8) {
while (1) {
memcpy(pdst, pback, 8);
if (length <= 8) {
pdst += length;
break;
}
pdst += 8;
pback += 8;
length -= 8;
}
continue;
}
#endif
// Back-copy slow path.
for (; length >= 3; length -= 3) {
*pdst++ = *pback++;
*pdst++ = *pback++;
*pdst++ = *pback++;
}
for (; length; length--) {
*pdst++ = *pback++;
}
}
end:
// Return unused input bytes.
for (; n_bits >= 8; n_bits -= 8) {
psrc--;
}
bits &= MASK(n_bits);
// Save contextual state.
a_dst->meta.wi = pdst - a_dst->data.ptr;
a_src->meta.ri = psrc - a_src->data.ptr;
self->private_impl.f_bits = bits;
self->private_impl.f_n_bits = n_bits;
return status;
}