Add deflate.decoder.decode_huffman_bmi2
Binary size, before:
21864 gen/lib/c/clang-9-dynamic/wuffs-std-deflate.lo
21488 gen/lib/c/clang-9-static/wuffs-std-deflate.o
21136 gen/lib/c/gcc-dynamic/wuffs-std-deflate.lo
21536 gen/lib/c/gcc-static/wuffs-std-deflate.o
After:
24592 gen/lib/c/clang-9-dynamic/wuffs-std-deflate.lo
24368 gen/lib/c/clang-9-static/wuffs-std-deflate.o
23320 gen/lib/c/gcc-dynamic/wuffs-std-deflate.lo
23928 gen/lib/c/gcc-static/wuffs-std-deflate.o
I've upgraded my kernel and compilers recently, so the absolute Wuffs
benchmark numbers below aren't directly comparable to those reported in
previous commits, but the relative changes still seems valid.
On a mid-range x86_64 laptop:
name old speed new speed delta
wuffs_deflate_decode_1k_full_init/clang9 107MB/s ± 1% 104MB/s ± 1% -2.36% (p=0.000 n=8+9)
wuffs_deflate_decode_1k_part_init/clang9 123MB/s ± 1% 121MB/s ± 1% -1.78% (p=0.000 n=10+9)
wuffs_deflate_decode_10k_full_init/clang9 221MB/s ± 2% 223MB/s ± 1% +0.86% (p=0.035 n=10+9)
wuffs_deflate_decode_10k_part_init/clang9 228MB/s ± 0% 230MB/s ± 1% +0.59% (p=0.003 n=8+10)
wuffs_deflate_decode_100k_just_one_read/clang9 276MB/s ± 0% 279MB/s ± 1% +1.12% (p=0.001 n=8+9)
wuffs_deflate_decode_100k_many_big_reads/clang9 174MB/s ± 1% 178MB/s ± 5% +2.38% (p=0.028 n=9+10)
wuffs_deflate_decode_1k_full_init/gcc10 103MB/s ± 3% 103MB/s ± 1% ~ (p=0.105 n=10+10)
wuffs_deflate_decode_1k_part_init/gcc10 123MB/s ± 1% 122MB/s ± 1% -0.94% (p=0.001 n=9+10)
wuffs_deflate_decode_10k_full_init/gcc10 216MB/s ± 2% 235MB/s ± 1% +8.83% (p=0.000 n=10+10)
wuffs_deflate_decode_10k_part_init/gcc10 223MB/s ± 1% 242MB/s ± 1% +8.45% (p=0.000 n=9+9)
wuffs_deflate_decode_100k_just_one_read/gcc10 270MB/s ± 4% 300MB/s ± 4% +11.08% (p=0.000 n=9+10)
wuffs_deflate_decode_100k_many_big_reads/gcc10 179MB/s ± 1% 193MB/s ± 2% +7.97% (p=0.000 n=10+10)
wuffs_gzip_decode_10k/clang9 218MB/s ± 3% 220MB/s ± 1% ~ (p=0.549 n=10+9)
wuffs_gzip_decode_100k/clang9 270MB/s ± 0% 269MB/s ± 2% ~ (p=0.730 n=9+9)
wuffs_gzip_decode_10k/gcc10 208MB/s ± 1% 227MB/s ± 1% +8.87% (p=0.000 n=9+10)
wuffs_gzip_decode_100k/gcc10 267MB/s ± 1% 283MB/s ± 1% +6.05% (p=0.000 n=10+9)
wuffs_zlib_decode_10k/clang9 214MB/s ± 2% 226MB/s ± 1% +5.69% (p=0.000 n=10+9)
wuffs_zlib_decode_100k/clang9 259MB/s ± 6% 272MB/s ± 2% +5.05% (p=0.000 n=9+10)
wuffs_zlib_decode_10k/gcc10 211MB/s ± 1% 227MB/s ± 1% +7.53% (p=0.000 n=10+10)
wuffs_zlib_decode_100k/gcc10 262MB/s ± 1% 282MB/s ± 1% +7.69% (p=0.000 n=10+9)
wuffs_png_decode_image_19k_8bpp/clang9 144MB/s ± 0% 148MB/s ± 1% +2.68% (p=0.000 n=9+9)
wuffs_png_decode_image_40k_24bpp/clang9 166MB/s ± 1% 170MB/s ± 1% +2.38% (p=0.000 n=9+9)
wuffs_png_decode_image_77k_8bpp/clang9 515MB/s ± 1% 525MB/s ± 1% +1.91% (p=0.000 n=9+8)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/clang9 401MB/s ± 1% 400MB/s ± 2% ~ (p=0.913 n=9+9)
wuffs_png_decode_image_552k_32bpp_verify_checksum/clang9 387MB/s ± 1% 388MB/s ± 1% ~ (p=0.546 n=9+9)
wuffs_png_decode_image_4002k_24bpp/clang9 170MB/s ± 3% 175MB/s ± 1% +3.02% (p=0.000 n=9+10)
wuffs_png_decode_image_19k_8bpp/gcc10 143MB/s ± 2% 154MB/s ± 3% +8.32% (p=0.000 n=9+9)
wuffs_png_decode_image_40k_24bpp/gcc10 174MB/s ± 2% 182MB/s ± 1% +4.42% (p=0.000 n=10+9)
wuffs_png_decode_image_77k_8bpp/gcc10 513MB/s ± 2% 552MB/s ± 3% +7.66% (p=0.000 n=10+9)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/gcc10 408MB/s ± 1% 426MB/s ± 2% +4.43% (p=0.000 n=9+10)
wuffs_png_decode_image_552k_32bpp_verify_checksum/gcc10 390MB/s ± 4% 410MB/s ± 2% +5.05% (p=0.000 n=10+9)
wuffs_png_decode_image_4002k_24bpp/gcc10 177MB/s ± 1% 186MB/s ± 1% +5.13% (p=0.000 n=9+9)
diff --git a/internal/cgen/base/fundamental-public.h b/internal/cgen/base/fundamental-public.h
index 14c0911..d260979 100644
--- a/internal/cgen/base/fundamental-public.h
+++ b/internal/cgen/base/fundamental-public.h
@@ -149,6 +149,33 @@
}
static inline bool //
+wuffs_base__cpu_arch__have_x86_bmi2() {
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+ // GCC defines these macros but MSVC does not.
+ // - bit_BMI2 = (1 << 8)
+ const unsigned int bmi2_ebx7 = 0x00000100;
+
+ // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).
+#if defined(__GNUC__)
+ unsigned int eax7 = 0;
+ unsigned int ebx7 = 0;
+ unsigned int ecx7 = 0;
+ unsigned int edx7 = 0;
+ if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7)) {
+ return (ebx7 & bmi2_ebx7) == bmi2_ebx7;
+ }
+#elif defined(_MSC_VER) // defined(__GNUC__)
+ int x[4];
+ __cpuidex(x, 7, 0);
+ return (((unsigned int)(x[1])) & bmi2_ebx7) == bmi2_ebx7;
+#else
+#error "WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler"
+#endif // defined(__GNUC__); defined(_MSC_VER)
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+ return false;
+}
+
+static inline bool //
wuffs_base__cpu_arch__have_x86_sse42() {
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
// GCC defines these macros but MSVC does not.
diff --git a/internal/cgen/data/data.go b/internal/cgen/data/data.go
index 7d9423e..8c03c4c 100644
--- a/internal/cgen/data/data.go
+++ b/internal/cgen/data/data.go
@@ -60,8 +60,9 @@
"" +
"// --------\n\n// Define WUFFS_CONFIG__STATIC_FUNCTIONS to make all of Wuffs' functions have\n// static storage. The motivation is discussed in the \"ALLOW STATIC\n// IMPLEMENTATION\" section of\n// https://raw.githubusercontent.com/nothings/stb/master/docs/stb_howto.txt\n#if defined(WUFFS_CONFIG__STATIC_FUNCTIONS)\n#define WUFFS_BASE__MAYBE_STATIC static\n#else\n#define WUFFS_BASE__MAYBE_STATIC\n#endif // defined(WUFFS_CONFIG__STATIC_FUNCTIONS)\n\n" +
"" +
- "// ---------------- CPU Architecture\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_arm_crc32() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n return true;\n#else\n return false;\n#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_arm_neon() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n return true;\n#else\n return false;\n#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_x86_sse42() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n // GCC defines these macros but MSVC does not.\n // - bit_PCLMUL = (1 << 1)\n // - bit_POPCNT = (1 << 23)\n // - bit_SSE4_2 = (1 << 20)\n const unsigned int sse42_ecx1 = 0x00900002;\n\n // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n unsigned int eax1 = 0;\n unsigned int ebx1 = 0;\n unsigned int ecx1 = 0;\n unsigned int edx1 = 0;\n if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1)) {\n return (ecx1 & sse42_ecx1) == sse42_" +
- "ecx1;\n }\n#elif defined(_MSC_VER) // defined(__GNUC__)\n int x[4];\n __cpuid(x, 1);\n return (((unsigned int)(x[2])) & sse42_ecx1) == sse42_ecx1;\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif // defined(__GNUC__); defined(_MSC_VER)\n#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n return false;\n}\n\n" +
+ "// ---------------- CPU Architecture\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_arm_crc32() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n return true;\n#else\n return false;\n#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_arm_neon() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n return true;\n#else\n return false;\n#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_x86_bmi2() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n // GCC defines these macros but MSVC does not.\n // - bit_BMI2 = (1 << 8)\n const unsigned int bmi2_ebx7 = 0x00000100;\n\n // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n unsigned int eax7 = 0;\n unsigned int ebx7 = 0;\n unsigned int ecx7 = 0;\n unsigned int edx7 = 0;\n if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7)) {\n return (ebx7 & bmi2_ebx7) == bmi2_ebx7;\n }\n#elif defined(_MSC_VER) // defined(__GNUC__)\n i" +
+ "nt x[4];\n __cpuidex(x, 7, 0);\n return (((unsigned int)(x[1])) & bmi2_ebx7) == bmi2_ebx7;\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif // defined(__GNUC__); defined(_MSC_VER)\n#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n return false;\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_x86_sse42() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n // GCC defines these macros but MSVC does not.\n // - bit_PCLMUL = (1 << 1)\n // - bit_POPCNT = (1 << 23)\n // - bit_SSE4_2 = (1 << 20)\n const unsigned int sse42_ecx1 = 0x00900002;\n\n // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n unsigned int eax1 = 0;\n unsigned int ebx1 = 0;\n unsigned int ecx1 = 0;\n unsigned int edx1 = 0;\n if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1)) {\n return (ecx1 & sse42_ecx1) == sse42_ecx1;\n }\n#elif defined(_MSC_VER) // defined(__GNUC__)\n int x[4];\n __cpuid(x, 1);\n return (((unsigned int)(x[2])) & sse42_ecx1) == sse42_ecx1;\n#els" +
+ "e\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif // defined(__GNUC__); defined(_MSC_VER)\n#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n return false;\n}\n\n" +
"" +
"// ---------------- Fundamentals\n\n// Wuffs assumes that:\n// - converting a uint32_t to a size_t will never overflow.\n// - converting a size_t to a uint64_t will never overflow.\n#if defined(__WORDSIZE)\n#if (__WORDSIZE != 32) && (__WORDSIZE != 64)\n#error \"Wuffs requires a word size of either 32 or 64 bits\"\n#endif\n#endif\n\n// Clang also defines \"__GNUC__\".\n#if defined(__GNUC__)\n#define WUFFS_BASE__POTENTIALLY_UNUSED __attribute__((unused))\n#define WUFFS_BASE__WARN_UNUSED_RESULT __attribute__((warn_unused_result))\n#else\n#define WUFFS_BASE__POTENTIALLY_UNUSED\n#define WUFFS_BASE__WARN_UNUSED_RESULT\n#endif\n\n" +
"" +
diff --git a/internal/cgen/statement.go b/internal/cgen/statement.go
index b30ed39..8343623 100644
--- a/internal/cgen/statement.go
+++ b/internal/cgen/statement.go
@@ -279,6 +279,10 @@
caMacro, caName, caAttribute =
"X86_64", "x86_sse42",
"WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET(\"pclmul,popcnt,sse4.2\")"
+ case t.IDX86BMI2:
+ caMacro, caName, caAttribute =
+ "X86_64", "x86_bmi2",
+ "WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET(\"bmi2\")"
}
}
}
diff --git a/lang/ast/ast.go b/lang/ast/ast.go
index 509fa97..39665f8 100644
--- a/lang/ast/ast.go
+++ b/lang/ast/ast.go
@@ -491,7 +491,7 @@
return false
}
switch rhs.Ident() {
- case t.IDARMCRC32, t.IDARMNeon, t.IDX86SSE42, t.IDX86AVX2:
+ case t.IDARMCRC32, t.IDARMNeon, t.IDX86SSE42, t.IDX86AVX2, t.IDX86BMI2:
return true
}
return false
diff --git a/lang/token/list.go b/lang/token/list.go
index 797f983..c046b42 100644
--- a/lang/token/list.go
+++ b/lang/token/list.go
@@ -709,6 +709,7 @@
IDX86SSE42Utility = ID(0x391)
IDX86AVX2 = ID(0x392)
IDX86AVX2Utility = ID(0x393)
+ IDX86BMI2 = ID(0x394)
IDX86M128I = ID(0x3A0)
)
@@ -1120,6 +1121,7 @@
IDX86SSE42Utility: "x86_sse42_utility",
IDX86AVX2: "x86_avx2",
IDX86AVX2Utility: "x86_avx2_utility",
+ IDX86BMI2: "x86_bmi2",
IDX86M128I: "x86_m128i",
}
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 4c3b78e..543fda8 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -188,6 +188,33 @@
}
static inline bool //
+wuffs_base__cpu_arch__have_x86_bmi2() {
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+ // GCC defines these macros but MSVC does not.
+ // - bit_BMI2 = (1 << 8)
+ const unsigned int bmi2_ebx7 = 0x00000100;
+
+ // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).
+#if defined(__GNUC__)
+ unsigned int eax7 = 0;
+ unsigned int ebx7 = 0;
+ unsigned int ecx7 = 0;
+ unsigned int edx7 = 0;
+ if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7)) {
+ return (ebx7 & bmi2_ebx7) == bmi2_ebx7;
+ }
+#elif defined(_MSC_VER) // defined(__GNUC__)
+ int x[4];
+ __cpuidex(x, 7, 0);
+ return (((unsigned int)(x[1])) & bmi2_ebx7) == bmi2_ebx7;
+#else
+#error "WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler"
+#endif // defined(__GNUC__); defined(_MSC_VER)
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+ return false;
+}
+
+static inline bool //
wuffs_base__cpu_arch__have_x86_sse42() {
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
// GCC defines these macros but MSVC does not.
@@ -25165,6 +25192,14 @@
uint32_t a_n_codes1,
uint32_t a_base_symbol);
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+static wuffs_base__status
+wuffs_deflate__decoder__decode_huffman_bmi2(
+ wuffs_deflate__decoder* self,
+ wuffs_base__io_buffer* a_dst,
+ wuffs_base__io_buffer* a_src);
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+
static wuffs_base__status
wuffs_deflate__decoder__decode_huffman_fast32(
wuffs_deflate__decoder* self,
@@ -25392,6 +25427,11 @@
switch (coro_susp_point) {
WUFFS_BASE__COROUTINE_SUSPENSION_POINT_0;
+ self->private_impl.choosy_decode_huffman_fast64 = (
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+ wuffs_base__cpu_arch__have_x86_bmi2() ? &wuffs_deflate__decoder__decode_huffman_bmi2 :
+#endif
+ self->private_impl.choosy_decode_huffman_fast64);
while (true) {
v_mark = ((uint64_t)(iop_a_dst - io0_a_dst));
{
@@ -26280,6 +26320,213 @@
return wuffs_base__make_status(NULL);
}
+// ‼ WUFFS MULTI-FILE SECTION +x86_bmi2
+// -------- func deflate.decoder.decode_huffman_bmi2
+
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("bmi2")
+static wuffs_base__status
+wuffs_deflate__decoder__decode_huffman_bmi2(
+ wuffs_deflate__decoder* self,
+ wuffs_base__io_buffer* a_dst,
+ wuffs_base__io_buffer* a_src) {
+ wuffs_base__status status = wuffs_base__make_status(NULL);
+
+ uint64_t v_bits = 0;
+ uint32_t v_n_bits = 0;
+ uint32_t v_table_entry = 0;
+ uint32_t v_table_entry_n_bits = 0;
+ uint64_t v_lmask = 0;
+ uint64_t v_dmask = 0;
+ uint32_t v_redir_top = 0;
+ uint32_t v_redir_mask = 0;
+ uint32_t v_length = 0;
+ uint32_t v_dist_minus_1 = 0;
+ uint32_t v_hlen = 0;
+ uint32_t v_hdist = 0;
+
+ uint8_t* iop_a_dst = NULL;
+ uint8_t* io0_a_dst WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+ uint8_t* io1_a_dst WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+ uint8_t* io2_a_dst WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+ if (a_dst) {
+ io0_a_dst = a_dst->data.ptr;
+ io1_a_dst = io0_a_dst + a_dst->meta.wi;
+ iop_a_dst = io1_a_dst;
+ io2_a_dst = io0_a_dst + a_dst->data.len;
+ if (a_dst->meta.closed) {
+ io2_a_dst = iop_a_dst;
+ }
+ }
+ const uint8_t* iop_a_src = NULL;
+ const uint8_t* io0_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+ const uint8_t* io1_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+ const uint8_t* io2_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+ if (a_src) {
+ io0_a_src = a_src->data.ptr;
+ io1_a_src = io0_a_src + a_src->meta.ri;
+ iop_a_src = io1_a_src;
+ io2_a_src = io0_a_src + a_src->meta.wi;
+ }
+
+ if ((self->private_impl.f_n_bits >= 8) || ((self->private_impl.f_bits >> (self->private_impl.f_n_bits & 7)) != 0)) {
+ status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_n_bits);
+ goto exit;
+ }
+ v_bits = ((uint64_t)(self->private_impl.f_bits));
+ v_n_bits = self->private_impl.f_n_bits;
+ v_lmask = ((((uint64_t)(1)) << self->private_impl.f_n_huffs_bits[0]) - 1);
+ v_dmask = ((((uint64_t)(1)) << self->private_impl.f_n_huffs_bits[1]) - 1);
+ label__loop__continue:;
+ while ((((uint64_t)(io2_a_dst - iop_a_dst)) >= 266) && (((uint64_t)(io2_a_src - iop_a_src)) >= 8)) {
+ v_bits |= ((uint64_t)(wuffs_base__peek_u64le__no_bounds_check(iop_a_src) << (v_n_bits & 63)));
+ iop_a_src += ((63 - (v_n_bits & 63)) >> 3);
+ v_n_bits |= 56;
+ v_table_entry = self->private_data.f_huffs[0][(v_bits & v_lmask)];
+ v_table_entry_n_bits = (v_table_entry & 15);
+ v_bits >>= v_table_entry_n_bits;
+ v_n_bits -= v_table_entry_n_bits;
+ if ((v_table_entry >> 31) != 0) {
+ (wuffs_base__poke_u8be__no_bounds_check(iop_a_dst, ((uint8_t)(((v_table_entry >> 8) & 255)))), iop_a_dst += 1);
+ goto label__loop__continue;
+ } else if ((v_table_entry >> 30) != 0) {
+ } else if ((v_table_entry >> 29) != 0) {
+ self->private_impl.f_end_of_block = true;
+ goto label__loop__break;
+ } else if ((v_table_entry >> 28) != 0) {
+ v_redir_top = ((v_table_entry >> 8) & 65535);
+ v_redir_mask = ((((uint32_t)(1)) << ((v_table_entry >> 4) & 15)) - 1);
+ v_table_entry = self->private_data.f_huffs[0][((v_redir_top + (((uint32_t)((v_bits & 4294967295))) & v_redir_mask)) & 1023)];
+ v_table_entry_n_bits = (v_table_entry & 15);
+ v_bits >>= v_table_entry_n_bits;
+ v_n_bits -= v_table_entry_n_bits;
+ if ((v_table_entry >> 31) != 0) {
+ (wuffs_base__poke_u8be__no_bounds_check(iop_a_dst, ((uint8_t)(((v_table_entry >> 8) & 255)))), iop_a_dst += 1);
+ goto label__loop__continue;
+ } else if ((v_table_entry >> 30) != 0) {
+ } else if ((v_table_entry >> 29) != 0) {
+ self->private_impl.f_end_of_block = true;
+ goto label__loop__break;
+ } else if ((v_table_entry >> 28) != 0) {
+ status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_huffman_decoder_state);
+ goto exit;
+ } else if ((v_table_entry >> 27) != 0) {
+ status = wuffs_base__make_status(wuffs_deflate__error__bad_huffman_code);
+ goto exit;
+ } else {
+ status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_huffman_decoder_state);
+ goto exit;
+ }
+ } else if ((v_table_entry >> 27) != 0) {
+ status = wuffs_base__make_status(wuffs_deflate__error__bad_huffman_code);
+ goto exit;
+ } else {
+ status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_huffman_decoder_state);
+ goto exit;
+ }
+ v_length = (((v_table_entry >> 8) & 255) + 3);
+ v_table_entry_n_bits = ((v_table_entry >> 4) & 15);
+ if (v_table_entry_n_bits > 0) {
+ v_length = (((v_length + 253 + ((uint32_t)(((v_bits) & WUFFS_BASE__LOW_BITS_MASK__U64(v_table_entry_n_bits))))) & 255) + 3);
+ v_bits >>= v_table_entry_n_bits;
+ v_n_bits -= v_table_entry_n_bits;
+ }
+ v_table_entry = self->private_data.f_huffs[1][(v_bits & v_dmask)];
+ v_table_entry_n_bits = (v_table_entry & 15);
+ v_bits >>= v_table_entry_n_bits;
+ v_n_bits -= v_table_entry_n_bits;
+ if ((v_table_entry >> 28) == 1) {
+ v_redir_top = ((v_table_entry >> 8) & 65535);
+ v_redir_mask = ((((uint32_t)(1)) << ((v_table_entry >> 4) & 15)) - 1);
+ v_table_entry = self->private_data.f_huffs[1][((v_redir_top + (((uint32_t)((v_bits & 4294967295))) & v_redir_mask)) & 1023)];
+ v_table_entry_n_bits = (v_table_entry & 15);
+ v_bits >>= v_table_entry_n_bits;
+ v_n_bits -= v_table_entry_n_bits;
+ }
+ if ((v_table_entry >> 24) != 64) {
+ if ((v_table_entry >> 24) == 8) {
+ status = wuffs_base__make_status(wuffs_deflate__error__bad_huffman_code);
+ goto exit;
+ }
+ status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_huffman_decoder_state);
+ goto exit;
+ }
+ v_dist_minus_1 = ((v_table_entry >> 8) & 32767);
+ v_table_entry_n_bits = ((v_table_entry >> 4) & 15);
+ v_dist_minus_1 = ((v_dist_minus_1 + ((uint32_t)(((v_bits) & WUFFS_BASE__LOW_BITS_MASK__U64(v_table_entry_n_bits))))) & 32767);
+ v_bits >>= v_table_entry_n_bits;
+ v_n_bits -= v_table_entry_n_bits;
+ while (true) {
+ if (((uint64_t)((v_dist_minus_1 + 1))) > ((uint64_t)(iop_a_dst - io0_a_dst))) {
+ v_hlen = 0;
+ v_hdist = ((uint32_t)((((uint64_t)((v_dist_minus_1 + 1))) - ((uint64_t)(iop_a_dst - io0_a_dst)))));
+ if (v_length > v_hdist) {
+ v_length -= v_hdist;
+ v_hlen = v_hdist;
+ } else {
+ v_hlen = v_length;
+ v_length = 0;
+ }
+ if (self->private_impl.f_history_index < v_hdist) {
+ status = wuffs_base__make_status(wuffs_deflate__error__bad_distance);
+ goto exit;
+ }
+ v_hdist = (self->private_impl.f_history_index - v_hdist);
+ wuffs_base__io_writer__limited_copy_u32_from_slice(
+ &iop_a_dst, io2_a_dst,v_hlen, wuffs_base__slice_u8__subslice_i(wuffs_base__make_slice_u8(self->private_data.f_history, 33025), (v_hdist & 32767)));
+ if (v_length == 0) {
+ goto label__loop__continue;
+ }
+ if ((((uint64_t)((v_dist_minus_1 + 1))) > ((uint64_t)(iop_a_dst - io0_a_dst))) || (((uint64_t)(v_length)) > ((uint64_t)(io2_a_dst - iop_a_dst))) || (((uint64_t)((v_length + 8))) > ((uint64_t)(io2_a_dst - iop_a_dst)))) {
+ status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_distance);
+ goto exit;
+ }
+ }
+ if ((v_dist_minus_1 + 1) >= 8) {
+ wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast(
+ &iop_a_dst, io0_a_dst, io2_a_dst, v_length, (v_dist_minus_1 + 1));
+ } else {
+ wuffs_base__io_writer__limited_copy_u32_from_history_fast(
+ &iop_a_dst, io0_a_dst, io2_a_dst, v_length, (v_dist_minus_1 + 1));
+ }
+ goto label__0__break;
+ }
+ label__0__break:;
+ }
+ label__loop__break:;
+ if (v_n_bits > 63) {
+ status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_n_bits);
+ goto exit;
+ }
+ while (v_n_bits >= 8) {
+ v_n_bits -= 8;
+ if (iop_a_src > io1_a_src) {
+ iop_a_src--;
+ } else {
+ status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_i_o);
+ goto exit;
+ }
+ }
+ self->private_impl.f_bits = ((uint32_t)((v_bits & ((((uint64_t)(1)) << v_n_bits) - 1))));
+ self->private_impl.f_n_bits = v_n_bits;
+ if ((self->private_impl.f_n_bits >= 8) || ((self->private_impl.f_bits >> self->private_impl.f_n_bits) != 0)) {
+ status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_n_bits);
+ goto exit;
+ }
+ goto exit;
+ exit:
+ if (a_dst) {
+ a_dst->meta.wi = ((size_t)(iop_a_dst - a_dst->data.ptr));
+ }
+ if (a_src) {
+ a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr));
+ }
+
+ return status;
+}
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+// ‼ WUFFS MULTI-FILE SECTION -x86_bmi2
+
// -------- func deflate.decoder.decode_huffman_fast32
static wuffs_base__status
diff --git a/std/deflate/decode_deflate.wuffs b/std/deflate/decode_deflate.wuffs
index 9a98acc..428890c 100644
--- a/std/deflate/decode_deflate.wuffs
+++ b/std/deflate/decode_deflate.wuffs
@@ -203,6 +203,8 @@
var mark : base.u64
var status : base.status
+ choose decode_huffman_fast64 = [decode_huffman_bmi2]
+
while true {
mark = args.dst.mark()
status =? this.decode_blocks?(dst: args.dst, src: args.src)
diff --git a/std/deflate/decode_huffman_bmi2.wuffs b/std/deflate/decode_huffman_bmi2.wuffs
new file mode 100644
index 0000000..cad7dc1
--- /dev/null
+++ b/std/deflate/decode_huffman_bmi2.wuffs
@@ -0,0 +1,294 @@
+// Copyright 2021 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// decode_huffman_bmi2 is exactly the same as decode_huffman_fast64 except for
+// the "choose cpu_arch >= x86_bmi2". Unsurprisingly, having Bit Manipulation
+// Instructions available to the compiler can help this function's performance.
+pri func decoder.decode_huffman_bmi2!(dst: base.io_writer, src: base.io_reader) base.status,
+ choose cpu_arch >= x86_bmi2,
+{
+ // When editing this function, consider making the equivalent change to the
+ // decode_huffman_slow function. Keep the diff between the two
+ // decode_huffman_*.wuffs files as small as possible, while retaining both
+ // correctness and performance.
+
+ var bits : base.u64
+ var n_bits : base.u32
+ var table_entry : base.u32
+ var table_entry_n_bits : base.u32[..= 15]
+ var lmask : base.u64[..= 511]
+ var dmask : base.u64[..= 511]
+ var redir_top : base.u32[..= 0xFFFF]
+ var redir_mask : base.u32[..= 0x7FFF]
+ var length : base.u32[..= 258]
+ var dist_minus_1 : base.u32[..= 0x7FFF]
+ var hlen : base.u32[..= 0x7FFF]
+ var hdist : base.u32
+
+ if (this.n_bits >= 8) or ((this.bits >> (this.n_bits & 7)) <> 0) {
+ return "#internal error: inconsistent n_bits"
+ }
+
+ bits = this.bits as base.u64
+ n_bits = this.n_bits
+
+ lmask = ((1 as base.u64) << this.n_huffs_bits[0]) - 1
+ dmask = ((1 as base.u64) << this.n_huffs_bits[1]) - 1
+
+ // Check up front, on each iteration, that we have enough buffer space to
+ // both read (8 bytes) and write (266 bytes) as much as we need to. Doing
+ // this check once (per iteration), up front, removes the need to check
+ // multiple times inside the loop body, so it's faster overall.
+ //
+ // For writing, a literal code obviously corresponds to writing 1 byte, and
+ // 258 is the maximum length in a length-distance pair, as specified in the
+ // RFC section 3.2.5. Compressed blocks (length and distance codes).
+ //
+ // We adjust 258 up to (258 + 8) = 266 since it can be faster to overshoot
+ // a little and make multiple-of-8-byte copies even when the length in the
+ // length-distance pair isn't an exact multiple-of-8. Strictly speaking,
+ // 264 (an exact multiple-of-8) is the tight bound but (258 + 8) is easier
+ // for the Wuffs proof system, as length's type is refined to [..= 258],
+ //
+ // For reading, strictly speaking, we only need 6 bytes (48 bits) of
+ // available input, because the H-L Literal/Length code is up to 15 bits
+ // plus up to 5 extra bits, the H-D Distance code is up to 15 bits plus up
+ // to 13 extra bits and 15 + 5 + 15 + 13 == 48. However, it's faster to
+ // read 64 bits than 48 bits or (48 - n_bits) bits.
+ while.loop(args.dst.length() >= 266) and (args.src.length() >= 8) {
+ // Ensure that we have at least 56 bits of input.
+ //
+ // This is "Variant 4" of
+ // https://fgiesen.wordpress.com/2018/02/20/reading-bits-in-far-too-many-ways-part-2/
+ //
+ // 56, the number of bits in 7 bytes, is a property of that "Variant 4"
+ // bit-reading technique, and not related to the DEFLATE format per se.
+ //
+ // The "& 63" is a no-op, and not part of the original "Variant 4"
+ // technique, but satisfies Wuffs' overflow/underflow checks.
+ bits |= args.src.peek_u64le() ~mod<< (n_bits & 63)
+ args.src.skip_u32_fast!(actual: (63 - (n_bits & 63)) >> 3, worst_case: 8)
+ n_bits |= 56
+
+ // Decode an lcode symbol from H-L.
+ table_entry = this.huffs[0][bits & lmask]
+ table_entry_n_bits = table_entry & 0x0F
+ bits >>= table_entry_n_bits
+ n_bits ~mod-= table_entry_n_bits
+
+ if (table_entry >> 31) <> 0 {
+ // Literal.
+ args.dst.write_u8_fast!(a: ((table_entry >> 8) & 0xFF) as base.u8)
+ continue.loop
+ } else if (table_entry >> 30) <> 0 {
+ // No-op; code continues past the if-else chain.
+ } else if (table_entry >> 29) <> 0 {
+ // End of block.
+ this.end_of_block = true
+ break.loop
+ } else if (table_entry >> 28) <> 0 {
+ // Redirect.
+ redir_top = (table_entry >> 8) & 0xFFFF
+ redir_mask = ((1 as base.u32) << ((table_entry >> 4) & 0x0F)) - 1
+ table_entry = this.huffs[0][
+ (redir_top + (((bits & 0xFFFF_FFFF) as base.u32) & redir_mask)) & HUFFS_TABLE_MASK]
+ table_entry_n_bits = table_entry & 0x0F
+ bits >>= table_entry_n_bits
+ n_bits ~mod-= table_entry_n_bits
+
+ if (table_entry >> 31) <> 0 {
+ // Literal.
+ args.dst.write_u8_fast!(a: ((table_entry >> 8) & 0xFF) as base.u8)
+ continue.loop
+ } else if (table_entry >> 30) <> 0 {
+ // No-op; code continues past the if-else chain.
+ } else if (table_entry >> 29) <> 0 {
+ // End of block.
+ this.end_of_block = true
+ break.loop
+ } else if (table_entry >> 28) <> 0 {
+ return "#internal error: inconsistent Huffman decoder state"
+ } else if (table_entry >> 27) <> 0 {
+ return "#bad Huffman code"
+ } else {
+ return "#internal error: inconsistent Huffman decoder state"
+ }
+
+ } else if (table_entry >> 27) <> 0 {
+ return "#bad Huffman code"
+ } else {
+ return "#internal error: inconsistent Huffman decoder state"
+ }
+
+ // length = base_number_minus_3 + 3 + extra_bits.
+ //
+ // The -3 is from the bias in script/print-deflate-magic-numbers.go.
+ // That bias makes the "& 0xFF" 1 and 15-ish lines below correct.
+ length = ((table_entry >> 8) & 0xFF) + 3
+ table_entry_n_bits = (table_entry >> 4) & 0x0F
+ if table_entry_n_bits > 0 {
+ // The "+ 253" is the same as "- 3", after the "& 0xFF", but the
+ // plus form won't require an underflow check.
+ length = ((length + 253 + (bits.low_bits(n: table_entry_n_bits) as base.u32)) & 0xFF) + 3
+ bits >>= table_entry_n_bits
+ n_bits ~mod-= table_entry_n_bits
+ }
+
+ // Decode a dcode symbol from H-D.
+ table_entry = this.huffs[1][bits & dmask]
+ table_entry_n_bits = table_entry & 15
+ bits >>= table_entry_n_bits
+ n_bits ~mod-= table_entry_n_bits
+
+ // Check for a redirect.
+ if (table_entry >> 28) == 1 {
+ redir_top = (table_entry >> 8) & 0xFFFF
+ redir_mask = ((1 as base.u32) << ((table_entry >> 4) & 0x0F)) - 1
+ table_entry = this.huffs[1][
+ (redir_top + (((bits & 0xFFFF_FFFF) as base.u32) & redir_mask)) & HUFFS_TABLE_MASK]
+ table_entry_n_bits = table_entry & 0x0F
+ bits >>= table_entry_n_bits
+ n_bits ~mod-= table_entry_n_bits
+ }
+
+ // For H-D, all symbols should be base_number + extra_bits.
+ if (table_entry >> 24) <> 0x40 {
+ if (table_entry >> 24) == 0x08 {
+ return "#bad Huffman code"
+ }
+ return "#internal error: inconsistent Huffman decoder state"
+ }
+
+ // dist_minus_1 = base_number_minus_1 + extra_bits.
+ // distance = dist_minus_1 + 1.
+ //
+ // The -1 is from the bias in script/print-deflate-magic-numbers.go.
+ // That bias makes the "& 0x7FFF" 2 and 15-ish lines below correct and
+ // undoing that bias makes proving (dist_minus_1 + 1) > 0 trivial.
+ dist_minus_1 = (table_entry >> 8) & 0x7FFF
+ table_entry_n_bits = (table_entry >> 4) & 0x0F
+
+ dist_minus_1 = (dist_minus_1 + (bits.low_bits(n: table_entry_n_bits) as base.u32)) & 0x7FFF
+ bits >>= table_entry_n_bits
+ n_bits ~mod-= table_entry_n_bits
+
+ // The "while true { etc; break }" is a redundant version of "etc", but
+ // its presence minimizes the diff between decode_huffman_fastxx and
+ // decode_huffman_slow.
+ while true,
+ pre args.dst.length() >= 266,
+ {
+ // We can therefore prove:
+ assert (length as base.u64) <= args.dst.length() via "a <= b: a <= c; c <= b"(c: 266)
+ assert ((length + 8) as base.u64) <= args.dst.length() via "a <= b: a <= c; c <= b"(c: 266)
+
+ // Copy from this.history.
+ if ((dist_minus_1 + 1) as base.u64) > args.dst.history_length() {
+ // Set (hlen, hdist) to be the length-distance pair to copy
+ // from this.history, and (length, distance) to be the
+ // remaining length-distance pair to copy from args.dst.
+ hlen = 0
+ hdist = (((dist_minus_1 + 1) as base.u64) - args.dst.history_length()) as base.u32
+ if length > hdist {
+ assert hdist < length via "a < b: b > a"()
+ assert hdist < 0x8000 via "a < b: a < c; c <= b"(c: length)
+ length -= hdist
+ hlen = hdist
+ } else {
+ hlen = length
+ length = 0
+ }
+ if this.history_index < hdist {
+ return "#bad distance"
+ }
+ // Re-purpose the hdist variable as the this.history index to
+ // start copying from.
+ hdist = this.history_index - hdist
+
+ // Copy from hdist to the end of this.history.
+ //
+ // This copying is simpler than the decode_huffman_slow version
+ // because it cannot yield. We have already checked that
+ // args.dst.length() is large enough.
+ args.dst.limited_copy_u32_from_slice!(
+ up_to: hlen, s: this.history[hdist & 0x7FFF ..])
+
+ if length == 0 {
+ // No need to copy from args.dst.
+ continue.loop
+ }
+
+ if (((dist_minus_1 + 1) as base.u64) > args.dst.history_length()) or
+ ((length as base.u64) > args.dst.length()) or
+ (((length + 8) as base.u64) > args.dst.length()) {
+ return "#internal error: inconsistent distance"
+ }
+ }
+ // Once again, redundant but explicit assertions.
+ assert (dist_minus_1 + 1) >= 1
+ assert ((dist_minus_1 + 1) as base.u64) <= args.dst.history_length()
+ assert (length as base.u64) <= args.dst.length()
+ assert ((length + 8) as base.u64) <= args.dst.length()
+
+ // Copy from args.dst.
+ //
+ // For short distances, less than 8 bytes, copying atomic 8-byte
+ // chunks can result in incorrect output, so we fall back to a
+ // slower 1-byte-at-a-time copy. Deflate's copy-from-history can
+ // pick up freshly written bytes. A length = 5, distance = 2 copy
+ // starting with "abc" should give "abcbcbcb" but the 8-byte chunk
+ // technique would give "abcbc???", the exact output depending on
+ // what was previously in the writer buffer.
+ if (dist_minus_1 + 1) >= 8 {
+ args.dst.limited_copy_u32_from_history_8_byte_chunks_fast!(
+ up_to: length, distance: (dist_minus_1 + 1))
+ } else {
+ args.dst.limited_copy_u32_from_history_fast!(
+ up_to: length, distance: (dist_minus_1 + 1))
+ }
+ break
+ } endwhile
+ } endwhile.loop
+
+ // Ensure n_bits < 8 by rewindng args.src, if we loaded too many of its
+ // bytes into the bits variable.
+ //
+ // Note that we can unconditionally call undo_read (without resulting in an
+ // "invalid I/O operation" error code) only because this whole function can
+ // never suspend, as all of its I/O operations were checked beforehand for
+ // sufficient buffer space. Otherwise, resuming from the suspension could
+ // mean that the (possibly different) args.src is no longer rewindable,
+ // even if conceptually, this function was responsible for reading the
+ // bytes we want to rewind.
+ if n_bits > 63 {
+ return "#internal error: inconsistent n_bits"
+ }
+ while n_bits >= 8,
+ post n_bits < 8,
+ {
+ n_bits -= 8
+ if args.src.can_undo_byte() {
+ args.src.undo_byte!()
+ } else {
+ return "#internal error: inconsistent I/O"
+ }
+ } endwhile
+
+ this.bits = (bits & (((1 as base.u64) << n_bits) - 1)) as base.u32
+ this.n_bits = n_bits
+
+ if (this.n_bits >= 8) or ((this.bits >> this.n_bits) <> 0) {
+ return "#internal error: inconsistent n_bits"
+ }
+}