Add deflate.decoder.decode_huffman_bmi2

Binary size, before:
21864 gen/lib/c/clang-9-dynamic/wuffs-std-deflate.lo
21488 gen/lib/c/clang-9-static/wuffs-std-deflate.o
21136 gen/lib/c/gcc-dynamic/wuffs-std-deflate.lo
21536 gen/lib/c/gcc-static/wuffs-std-deflate.o

After:
24592 gen/lib/c/clang-9-dynamic/wuffs-std-deflate.lo
24368 gen/lib/c/clang-9-static/wuffs-std-deflate.o
23320 gen/lib/c/gcc-dynamic/wuffs-std-deflate.lo
23928 gen/lib/c/gcc-static/wuffs-std-deflate.o

I've upgraded my kernel and compilers recently, so the absolute Wuffs
benchmark numbers below aren't directly comparable to those reported in
previous commits, but the relative changes still seems valid.

On a mid-range x86_64 laptop:

name                                                      old speed      new speed      delta

wuffs_deflate_decode_1k_full_init/clang9                   107MB/s ± 1%   104MB/s ± 1%   -2.36%  (p=0.000 n=8+9)
wuffs_deflate_decode_1k_part_init/clang9                   123MB/s ± 1%   121MB/s ± 1%   -1.78%  (p=0.000 n=10+9)
wuffs_deflate_decode_10k_full_init/clang9                  221MB/s ± 2%   223MB/s ± 1%   +0.86%  (p=0.035 n=10+9)
wuffs_deflate_decode_10k_part_init/clang9                  228MB/s ± 0%   230MB/s ± 1%   +0.59%  (p=0.003 n=8+10)
wuffs_deflate_decode_100k_just_one_read/clang9             276MB/s ± 0%   279MB/s ± 1%   +1.12%  (p=0.001 n=8+9)
wuffs_deflate_decode_100k_many_big_reads/clang9            174MB/s ± 1%   178MB/s ± 5%   +2.38%  (p=0.028 n=9+10)

wuffs_deflate_decode_1k_full_init/gcc10                    103MB/s ± 3%   103MB/s ± 1%     ~     (p=0.105 n=10+10)
wuffs_deflate_decode_1k_part_init/gcc10                    123MB/s ± 1%   122MB/s ± 1%   -0.94%  (p=0.001 n=9+10)
wuffs_deflate_decode_10k_full_init/gcc10                   216MB/s ± 2%   235MB/s ± 1%   +8.83%  (p=0.000 n=10+10)
wuffs_deflate_decode_10k_part_init/gcc10                   223MB/s ± 1%   242MB/s ± 1%   +8.45%  (p=0.000 n=9+9)
wuffs_deflate_decode_100k_just_one_read/gcc10              270MB/s ± 4%   300MB/s ± 4%  +11.08%  (p=0.000 n=9+10)
wuffs_deflate_decode_100k_many_big_reads/gcc10             179MB/s ± 1%   193MB/s ± 2%   +7.97%  (p=0.000 n=10+10)

wuffs_gzip_decode_10k/clang9                               218MB/s ± 3%   220MB/s ± 1%     ~     (p=0.549 n=10+9)
wuffs_gzip_decode_100k/clang9                              270MB/s ± 0%   269MB/s ± 2%     ~     (p=0.730 n=9+9)

wuffs_gzip_decode_10k/gcc10                                208MB/s ± 1%   227MB/s ± 1%   +8.87%  (p=0.000 n=9+10)
wuffs_gzip_decode_100k/gcc10                               267MB/s ± 1%   283MB/s ± 1%   +6.05%  (p=0.000 n=10+9)

wuffs_zlib_decode_10k/clang9                               214MB/s ± 2%   226MB/s ± 1%   +5.69%  (p=0.000 n=10+9)
wuffs_zlib_decode_100k/clang9                              259MB/s ± 6%   272MB/s ± 2%   +5.05%  (p=0.000 n=9+10)

wuffs_zlib_decode_10k/gcc10                                211MB/s ± 1%   227MB/s ± 1%   +7.53%  (p=0.000 n=10+10)
wuffs_zlib_decode_100k/gcc10                               262MB/s ± 1%   282MB/s ± 1%   +7.69%  (p=0.000 n=10+9)

wuffs_png_decode_image_19k_8bpp/clang9                     144MB/s ± 0%   148MB/s ± 1%   +2.68%  (p=0.000 n=9+9)
wuffs_png_decode_image_40k_24bpp/clang9                    166MB/s ± 1%   170MB/s ± 1%   +2.38%  (p=0.000 n=9+9)
wuffs_png_decode_image_77k_8bpp/clang9                     515MB/s ± 1%   525MB/s ± 1%   +1.91%  (p=0.000 n=9+8)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/clang9   401MB/s ± 1%   400MB/s ± 2%     ~     (p=0.913 n=9+9)
wuffs_png_decode_image_552k_32bpp_verify_checksum/clang9   387MB/s ± 1%   388MB/s ± 1%     ~     (p=0.546 n=9+9)
wuffs_png_decode_image_4002k_24bpp/clang9                  170MB/s ± 3%   175MB/s ± 1%   +3.02%  (p=0.000 n=9+10)

wuffs_png_decode_image_19k_8bpp/gcc10                      143MB/s ± 2%   154MB/s ± 3%   +8.32%  (p=0.000 n=9+9)
wuffs_png_decode_image_40k_24bpp/gcc10                     174MB/s ± 2%   182MB/s ± 1%   +4.42%  (p=0.000 n=10+9)
wuffs_png_decode_image_77k_8bpp/gcc10                      513MB/s ± 2%   552MB/s ± 3%   +7.66%  (p=0.000 n=10+9)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/gcc10    408MB/s ± 1%   426MB/s ± 2%   +4.43%  (p=0.000 n=9+10)
wuffs_png_decode_image_552k_32bpp_verify_checksum/gcc10    390MB/s ± 4%   410MB/s ± 2%   +5.05%  (p=0.000 n=10+9)
wuffs_png_decode_image_4002k_24bpp/gcc10                   177MB/s ± 1%   186MB/s ± 1%   +5.13%  (p=0.000 n=9+9)
diff --git a/internal/cgen/base/fundamental-public.h b/internal/cgen/base/fundamental-public.h
index 14c0911..d260979 100644
--- a/internal/cgen/base/fundamental-public.h
+++ b/internal/cgen/base/fundamental-public.h
@@ -149,6 +149,33 @@
 }
 
 static inline bool  //
+wuffs_base__cpu_arch__have_x86_bmi2() {
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+  // GCC defines these macros but MSVC does not.
+  //  - bit_BMI2 = (1 <<  8)
+  const unsigned int bmi2_ebx7 = 0x00000100;
+
+  // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).
+#if defined(__GNUC__)
+  unsigned int eax7 = 0;
+  unsigned int ebx7 = 0;
+  unsigned int ecx7 = 0;
+  unsigned int edx7 = 0;
+  if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7)) {
+    return (ebx7 & bmi2_ebx7) == bmi2_ebx7;
+  }
+#elif defined(_MSC_VER)  // defined(__GNUC__)
+  int x[4];
+  __cpuidex(x, 7, 0);
+  return (((unsigned int)(x[1])) & bmi2_ebx7) == bmi2_ebx7;
+#else
+#error "WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler"
+#endif  // defined(__GNUC__); defined(_MSC_VER)
+#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+  return false;
+}
+
+static inline bool  //
 wuffs_base__cpu_arch__have_x86_sse42() {
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
   // GCC defines these macros but MSVC does not.
diff --git a/internal/cgen/data/data.go b/internal/cgen/data/data.go
index 7d9423e..8c03c4c 100644
--- a/internal/cgen/data/data.go
+++ b/internal/cgen/data/data.go
@@ -60,8 +60,9 @@
 	"" +
 	"// --------\n\n// Define WUFFS_CONFIG__STATIC_FUNCTIONS to make all of Wuffs' functions have\n// static storage. The motivation is discussed in the \"ALLOW STATIC\n// IMPLEMENTATION\" section of\n// https://raw.githubusercontent.com/nothings/stb/master/docs/stb_howto.txt\n#if defined(WUFFS_CONFIG__STATIC_FUNCTIONS)\n#define WUFFS_BASE__MAYBE_STATIC static\n#else\n#define WUFFS_BASE__MAYBE_STATIC\n#endif  // defined(WUFFS_CONFIG__STATIC_FUNCTIONS)\n\n" +
 	"" +
-	"// ---------------- CPU Architecture\n\nstatic inline bool  //\nwuffs_base__cpu_arch__have_arm_crc32() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n  return true;\n#else\n  return false;\n#endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n}\n\nstatic inline bool  //\nwuffs_base__cpu_arch__have_arm_neon() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n  return true;\n#else\n  return false;\n#endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n}\n\nstatic inline bool  //\nwuffs_base__cpu_arch__have_x86_sse42() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  // GCC defines these macros but MSVC does not.\n  //  - bit_PCLMUL = (1 <<  1)\n  //  - bit_POPCNT = (1 << 23)\n  //  - bit_SSE4_2 = (1 << 20)\n  const unsigned int sse42_ecx1 = 0x00900002;\n\n  // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n  unsigned int eax1 = 0;\n  unsigned int ebx1 = 0;\n  unsigned int ecx1 = 0;\n  unsigned int edx1 = 0;\n  if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1)) {\n    return (ecx1 & sse42_ecx1) == sse42_" +
-	"ecx1;\n  }\n#elif defined(_MSC_VER)  // defined(__GNUC__)\n  int x[4];\n  __cpuid(x, 1);\n  return (((unsigned int)(x[2])) & sse42_ecx1) == sse42_ecx1;\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif  // defined(__GNUC__); defined(_MSC_VER)\n#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  return false;\n}\n\n" +
+	"// ---------------- CPU Architecture\n\nstatic inline bool  //\nwuffs_base__cpu_arch__have_arm_crc32() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n  return true;\n#else\n  return false;\n#endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n}\n\nstatic inline bool  //\nwuffs_base__cpu_arch__have_arm_neon() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n  return true;\n#else\n  return false;\n#endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n}\n\nstatic inline bool  //\nwuffs_base__cpu_arch__have_x86_bmi2() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  // GCC defines these macros but MSVC does not.\n  //  - bit_BMI2 = (1 <<  8)\n  const unsigned int bmi2_ebx7 = 0x00000100;\n\n  // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n  unsigned int eax7 = 0;\n  unsigned int ebx7 = 0;\n  unsigned int ecx7 = 0;\n  unsigned int edx7 = 0;\n  if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7)) {\n    return (ebx7 & bmi2_ebx7) == bmi2_ebx7;\n  }\n#elif defined(_MSC_VER)  // defined(__GNUC__)\n  i" +
+	"nt x[4];\n  __cpuidex(x, 7, 0);\n  return (((unsigned int)(x[1])) & bmi2_ebx7) == bmi2_ebx7;\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif  // defined(__GNUC__); defined(_MSC_VER)\n#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  return false;\n}\n\nstatic inline bool  //\nwuffs_base__cpu_arch__have_x86_sse42() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  // GCC defines these macros but MSVC does not.\n  //  - bit_PCLMUL = (1 <<  1)\n  //  - bit_POPCNT = (1 << 23)\n  //  - bit_SSE4_2 = (1 << 20)\n  const unsigned int sse42_ecx1 = 0x00900002;\n\n  // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n  unsigned int eax1 = 0;\n  unsigned int ebx1 = 0;\n  unsigned int ecx1 = 0;\n  unsigned int edx1 = 0;\n  if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1)) {\n    return (ecx1 & sse42_ecx1) == sse42_ecx1;\n  }\n#elif defined(_MSC_VER)  // defined(__GNUC__)\n  int x[4];\n  __cpuid(x, 1);\n  return (((unsigned int)(x[2])) & sse42_ecx1) == sse42_ecx1;\n#els" +
+	"e\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif  // defined(__GNUC__); defined(_MSC_VER)\n#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n  return false;\n}\n\n" +
 	"" +
 	"// ---------------- Fundamentals\n\n// Wuffs assumes that:\n//  - converting a uint32_t to a size_t will never overflow.\n//  - converting a size_t to a uint64_t will never overflow.\n#if defined(__WORDSIZE)\n#if (__WORDSIZE != 32) && (__WORDSIZE != 64)\n#error \"Wuffs requires a word size of either 32 or 64 bits\"\n#endif\n#endif\n\n// Clang also defines \"__GNUC__\".\n#if defined(__GNUC__)\n#define WUFFS_BASE__POTENTIALLY_UNUSED __attribute__((unused))\n#define WUFFS_BASE__WARN_UNUSED_RESULT __attribute__((warn_unused_result))\n#else\n#define WUFFS_BASE__POTENTIALLY_UNUSED\n#define WUFFS_BASE__WARN_UNUSED_RESULT\n#endif\n\n" +
 	"" +
diff --git a/internal/cgen/statement.go b/internal/cgen/statement.go
index b30ed39..8343623 100644
--- a/internal/cgen/statement.go
+++ b/internal/cgen/statement.go
@@ -279,6 +279,10 @@
 				caMacro, caName, caAttribute =
 					"X86_64", "x86_sse42",
 					"WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET(\"pclmul,popcnt,sse4.2\")"
+			case t.IDX86BMI2:
+				caMacro, caName, caAttribute =
+					"X86_64", "x86_bmi2",
+					"WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET(\"bmi2\")"
 			}
 		}
 	}
diff --git a/lang/ast/ast.go b/lang/ast/ast.go
index 509fa97..39665f8 100644
--- a/lang/ast/ast.go
+++ b/lang/ast/ast.go
@@ -491,7 +491,7 @@
 		return false
 	}
 	switch rhs.Ident() {
-	case t.IDARMCRC32, t.IDARMNeon, t.IDX86SSE42, t.IDX86AVX2:
+	case t.IDARMCRC32, t.IDARMNeon, t.IDX86SSE42, t.IDX86AVX2, t.IDX86BMI2:
 		return true
 	}
 	return false
diff --git a/lang/token/list.go b/lang/token/list.go
index 797f983..c046b42 100644
--- a/lang/token/list.go
+++ b/lang/token/list.go
@@ -709,6 +709,7 @@
 	IDX86SSE42Utility = ID(0x391)
 	IDX86AVX2         = ID(0x392)
 	IDX86AVX2Utility  = ID(0x393)
+	IDX86BMI2         = ID(0x394)
 
 	IDX86M128I = ID(0x3A0)
 )
@@ -1120,6 +1121,7 @@
 	IDX86SSE42Utility: "x86_sse42_utility",
 	IDX86AVX2:         "x86_avx2",
 	IDX86AVX2Utility:  "x86_avx2_utility",
+	IDX86BMI2:         "x86_bmi2",
 
 	IDX86M128I: "x86_m128i",
 }
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 4c3b78e..543fda8 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -188,6 +188,33 @@
 }
 
 static inline bool  //
+wuffs_base__cpu_arch__have_x86_bmi2() {
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+  // GCC defines these macros but MSVC does not.
+  //  - bit_BMI2 = (1 <<  8)
+  const unsigned int bmi2_ebx7 = 0x00000100;
+
+  // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).
+#if defined(__GNUC__)
+  unsigned int eax7 = 0;
+  unsigned int ebx7 = 0;
+  unsigned int ecx7 = 0;
+  unsigned int edx7 = 0;
+  if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7)) {
+    return (ebx7 & bmi2_ebx7) == bmi2_ebx7;
+  }
+#elif defined(_MSC_VER)  // defined(__GNUC__)
+  int x[4];
+  __cpuidex(x, 7, 0);
+  return (((unsigned int)(x[1])) & bmi2_ebx7) == bmi2_ebx7;
+#else
+#error "WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler"
+#endif  // defined(__GNUC__); defined(_MSC_VER)
+#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+  return false;
+}
+
+static inline bool  //
 wuffs_base__cpu_arch__have_x86_sse42() {
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
   // GCC defines these macros but MSVC does not.
@@ -25165,6 +25192,14 @@
     uint32_t a_n_codes1,
     uint32_t a_base_symbol);
 
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+static wuffs_base__status
+wuffs_deflate__decoder__decode_huffman_bmi2(
+    wuffs_deflate__decoder* self,
+    wuffs_base__io_buffer* a_dst,
+    wuffs_base__io_buffer* a_src);
+#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+
 static wuffs_base__status
 wuffs_deflate__decoder__decode_huffman_fast32(
     wuffs_deflate__decoder* self,
@@ -25392,6 +25427,11 @@
   switch (coro_susp_point) {
     WUFFS_BASE__COROUTINE_SUSPENSION_POINT_0;
 
+    self->private_impl.choosy_decode_huffman_fast64 = (
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+        wuffs_base__cpu_arch__have_x86_bmi2() ? &wuffs_deflate__decoder__decode_huffman_bmi2 :
+#endif
+        self->private_impl.choosy_decode_huffman_fast64);
     while (true) {
       v_mark = ((uint64_t)(iop_a_dst - io0_a_dst));
       {
@@ -26280,6 +26320,213 @@
   return wuffs_base__make_status(NULL);
 }
 
+// ‼ WUFFS MULTI-FILE SECTION +x86_bmi2
+// -------- func deflate.decoder.decode_huffman_bmi2
+
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("bmi2")
+static wuffs_base__status
+wuffs_deflate__decoder__decode_huffman_bmi2(
+    wuffs_deflate__decoder* self,
+    wuffs_base__io_buffer* a_dst,
+    wuffs_base__io_buffer* a_src) {
+  wuffs_base__status status = wuffs_base__make_status(NULL);
+
+  uint64_t v_bits = 0;
+  uint32_t v_n_bits = 0;
+  uint32_t v_table_entry = 0;
+  uint32_t v_table_entry_n_bits = 0;
+  uint64_t v_lmask = 0;
+  uint64_t v_dmask = 0;
+  uint32_t v_redir_top = 0;
+  uint32_t v_redir_mask = 0;
+  uint32_t v_length = 0;
+  uint32_t v_dist_minus_1 = 0;
+  uint32_t v_hlen = 0;
+  uint32_t v_hdist = 0;
+
+  uint8_t* iop_a_dst = NULL;
+  uint8_t* io0_a_dst WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+  uint8_t* io1_a_dst WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+  uint8_t* io2_a_dst WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+  if (a_dst) {
+    io0_a_dst = a_dst->data.ptr;
+    io1_a_dst = io0_a_dst + a_dst->meta.wi;
+    iop_a_dst = io1_a_dst;
+    io2_a_dst = io0_a_dst + a_dst->data.len;
+    if (a_dst->meta.closed) {
+      io2_a_dst = iop_a_dst;
+    }
+  }
+  const uint8_t* iop_a_src = NULL;
+  const uint8_t* io0_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+  const uint8_t* io1_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+  const uint8_t* io2_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL;
+  if (a_src) {
+    io0_a_src = a_src->data.ptr;
+    io1_a_src = io0_a_src + a_src->meta.ri;
+    iop_a_src = io1_a_src;
+    io2_a_src = io0_a_src + a_src->meta.wi;
+  }
+
+  if ((self->private_impl.f_n_bits >= 8) || ((self->private_impl.f_bits >> (self->private_impl.f_n_bits & 7)) != 0)) {
+    status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_n_bits);
+    goto exit;
+  }
+  v_bits = ((uint64_t)(self->private_impl.f_bits));
+  v_n_bits = self->private_impl.f_n_bits;
+  v_lmask = ((((uint64_t)(1)) << self->private_impl.f_n_huffs_bits[0]) - 1);
+  v_dmask = ((((uint64_t)(1)) << self->private_impl.f_n_huffs_bits[1]) - 1);
+  label__loop__continue:;
+  while ((((uint64_t)(io2_a_dst - iop_a_dst)) >= 266) && (((uint64_t)(io2_a_src - iop_a_src)) >= 8)) {
+    v_bits |= ((uint64_t)(wuffs_base__peek_u64le__no_bounds_check(iop_a_src) << (v_n_bits & 63)));
+    iop_a_src += ((63 - (v_n_bits & 63)) >> 3);
+    v_n_bits |= 56;
+    v_table_entry = self->private_data.f_huffs[0][(v_bits & v_lmask)];
+    v_table_entry_n_bits = (v_table_entry & 15);
+    v_bits >>= v_table_entry_n_bits;
+    v_n_bits -= v_table_entry_n_bits;
+    if ((v_table_entry >> 31) != 0) {
+      (wuffs_base__poke_u8be__no_bounds_check(iop_a_dst, ((uint8_t)(((v_table_entry >> 8) & 255)))), iop_a_dst += 1);
+      goto label__loop__continue;
+    } else if ((v_table_entry >> 30) != 0) {
+    } else if ((v_table_entry >> 29) != 0) {
+      self->private_impl.f_end_of_block = true;
+      goto label__loop__break;
+    } else if ((v_table_entry >> 28) != 0) {
+      v_redir_top = ((v_table_entry >> 8) & 65535);
+      v_redir_mask = ((((uint32_t)(1)) << ((v_table_entry >> 4) & 15)) - 1);
+      v_table_entry = self->private_data.f_huffs[0][((v_redir_top + (((uint32_t)((v_bits & 4294967295))) & v_redir_mask)) & 1023)];
+      v_table_entry_n_bits = (v_table_entry & 15);
+      v_bits >>= v_table_entry_n_bits;
+      v_n_bits -= v_table_entry_n_bits;
+      if ((v_table_entry >> 31) != 0) {
+        (wuffs_base__poke_u8be__no_bounds_check(iop_a_dst, ((uint8_t)(((v_table_entry >> 8) & 255)))), iop_a_dst += 1);
+        goto label__loop__continue;
+      } else if ((v_table_entry >> 30) != 0) {
+      } else if ((v_table_entry >> 29) != 0) {
+        self->private_impl.f_end_of_block = true;
+        goto label__loop__break;
+      } else if ((v_table_entry >> 28) != 0) {
+        status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_huffman_decoder_state);
+        goto exit;
+      } else if ((v_table_entry >> 27) != 0) {
+        status = wuffs_base__make_status(wuffs_deflate__error__bad_huffman_code);
+        goto exit;
+      } else {
+        status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_huffman_decoder_state);
+        goto exit;
+      }
+    } else if ((v_table_entry >> 27) != 0) {
+      status = wuffs_base__make_status(wuffs_deflate__error__bad_huffman_code);
+      goto exit;
+    } else {
+      status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_huffman_decoder_state);
+      goto exit;
+    }
+    v_length = (((v_table_entry >> 8) & 255) + 3);
+    v_table_entry_n_bits = ((v_table_entry >> 4) & 15);
+    if (v_table_entry_n_bits > 0) {
+      v_length = (((v_length + 253 + ((uint32_t)(((v_bits) & WUFFS_BASE__LOW_BITS_MASK__U64(v_table_entry_n_bits))))) & 255) + 3);
+      v_bits >>= v_table_entry_n_bits;
+      v_n_bits -= v_table_entry_n_bits;
+    }
+    v_table_entry = self->private_data.f_huffs[1][(v_bits & v_dmask)];
+    v_table_entry_n_bits = (v_table_entry & 15);
+    v_bits >>= v_table_entry_n_bits;
+    v_n_bits -= v_table_entry_n_bits;
+    if ((v_table_entry >> 28) == 1) {
+      v_redir_top = ((v_table_entry >> 8) & 65535);
+      v_redir_mask = ((((uint32_t)(1)) << ((v_table_entry >> 4) & 15)) - 1);
+      v_table_entry = self->private_data.f_huffs[1][((v_redir_top + (((uint32_t)((v_bits & 4294967295))) & v_redir_mask)) & 1023)];
+      v_table_entry_n_bits = (v_table_entry & 15);
+      v_bits >>= v_table_entry_n_bits;
+      v_n_bits -= v_table_entry_n_bits;
+    }
+    if ((v_table_entry >> 24) != 64) {
+      if ((v_table_entry >> 24) == 8) {
+        status = wuffs_base__make_status(wuffs_deflate__error__bad_huffman_code);
+        goto exit;
+      }
+      status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_huffman_decoder_state);
+      goto exit;
+    }
+    v_dist_minus_1 = ((v_table_entry >> 8) & 32767);
+    v_table_entry_n_bits = ((v_table_entry >> 4) & 15);
+    v_dist_minus_1 = ((v_dist_minus_1 + ((uint32_t)(((v_bits) & WUFFS_BASE__LOW_BITS_MASK__U64(v_table_entry_n_bits))))) & 32767);
+    v_bits >>= v_table_entry_n_bits;
+    v_n_bits -= v_table_entry_n_bits;
+    while (true) {
+      if (((uint64_t)((v_dist_minus_1 + 1))) > ((uint64_t)(iop_a_dst - io0_a_dst))) {
+        v_hlen = 0;
+        v_hdist = ((uint32_t)((((uint64_t)((v_dist_minus_1 + 1))) - ((uint64_t)(iop_a_dst - io0_a_dst)))));
+        if (v_length > v_hdist) {
+          v_length -= v_hdist;
+          v_hlen = v_hdist;
+        } else {
+          v_hlen = v_length;
+          v_length = 0;
+        }
+        if (self->private_impl.f_history_index < v_hdist) {
+          status = wuffs_base__make_status(wuffs_deflate__error__bad_distance);
+          goto exit;
+        }
+        v_hdist = (self->private_impl.f_history_index - v_hdist);
+        wuffs_base__io_writer__limited_copy_u32_from_slice(
+            &iop_a_dst, io2_a_dst,v_hlen, wuffs_base__slice_u8__subslice_i(wuffs_base__make_slice_u8(self->private_data.f_history, 33025), (v_hdist & 32767)));
+        if (v_length == 0) {
+          goto label__loop__continue;
+        }
+        if ((((uint64_t)((v_dist_minus_1 + 1))) > ((uint64_t)(iop_a_dst - io0_a_dst))) || (((uint64_t)(v_length)) > ((uint64_t)(io2_a_dst - iop_a_dst))) || (((uint64_t)((v_length + 8))) > ((uint64_t)(io2_a_dst - iop_a_dst)))) {
+          status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_distance);
+          goto exit;
+        }
+      }
+      if ((v_dist_minus_1 + 1) >= 8) {
+        wuffs_base__io_writer__limited_copy_u32_from_history_8_byte_chunks_fast(
+            &iop_a_dst, io0_a_dst, io2_a_dst, v_length, (v_dist_minus_1 + 1));
+      } else {
+        wuffs_base__io_writer__limited_copy_u32_from_history_fast(
+            &iop_a_dst, io0_a_dst, io2_a_dst, v_length, (v_dist_minus_1 + 1));
+      }
+      goto label__0__break;
+    }
+    label__0__break:;
+  }
+  label__loop__break:;
+  if (v_n_bits > 63) {
+    status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_n_bits);
+    goto exit;
+  }
+  while (v_n_bits >= 8) {
+    v_n_bits -= 8;
+    if (iop_a_src > io1_a_src) {
+      iop_a_src--;
+    } else {
+      status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_i_o);
+      goto exit;
+    }
+  }
+  self->private_impl.f_bits = ((uint32_t)((v_bits & ((((uint64_t)(1)) << v_n_bits) - 1))));
+  self->private_impl.f_n_bits = v_n_bits;
+  if ((self->private_impl.f_n_bits >= 8) || ((self->private_impl.f_bits >> self->private_impl.f_n_bits) != 0)) {
+    status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_n_bits);
+    goto exit;
+  }
+  goto exit;
+  exit:
+  if (a_dst) {
+    a_dst->meta.wi = ((size_t)(iop_a_dst - a_dst->data.ptr));
+  }
+  if (a_src) {
+    a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr));
+  }
+
+  return status;
+}
+#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+// ‼ WUFFS MULTI-FILE SECTION -x86_bmi2
+
 // -------- func deflate.decoder.decode_huffman_fast32
 
 static wuffs_base__status
diff --git a/std/deflate/decode_deflate.wuffs b/std/deflate/decode_deflate.wuffs
index 9a98acc..428890c 100644
--- a/std/deflate/decode_deflate.wuffs
+++ b/std/deflate/decode_deflate.wuffs
@@ -203,6 +203,8 @@
 	var mark   : base.u64
 	var status : base.status
 
+	choose decode_huffman_fast64 = [decode_huffman_bmi2]
+
 	while true {
 		mark = args.dst.mark()
 		status =? this.decode_blocks?(dst: args.dst, src: args.src)
diff --git a/std/deflate/decode_huffman_bmi2.wuffs b/std/deflate/decode_huffman_bmi2.wuffs
new file mode 100644
index 0000000..cad7dc1
--- /dev/null
+++ b/std/deflate/decode_huffman_bmi2.wuffs
@@ -0,0 +1,294 @@
+// Copyright 2021 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// decode_huffman_bmi2 is exactly the same as decode_huffman_fast64 except for
+// the "choose cpu_arch >= x86_bmi2". Unsurprisingly, having Bit Manipulation
+// Instructions available to the compiler can help this function's performance.
+pri func decoder.decode_huffman_bmi2!(dst: base.io_writer, src: base.io_reader) base.status,
+	choose cpu_arch >= x86_bmi2,
+{
+	// When editing this function, consider making the equivalent change to the
+	// decode_huffman_slow function. Keep the diff between the two
+	// decode_huffman_*.wuffs files as small as possible, while retaining both
+	// correctness and performance.
+
+	var bits               : base.u64
+	var n_bits             : base.u32
+	var table_entry        : base.u32
+	var table_entry_n_bits : base.u32[..= 15]
+	var lmask              : base.u64[..= 511]
+	var dmask              : base.u64[..= 511]
+	var redir_top          : base.u32[..= 0xFFFF]
+	var redir_mask         : base.u32[..= 0x7FFF]
+	var length             : base.u32[..= 258]
+	var dist_minus_1       : base.u32[..= 0x7FFF]
+	var hlen               : base.u32[..= 0x7FFF]
+	var hdist              : base.u32
+
+	if (this.n_bits >= 8) or ((this.bits >> (this.n_bits & 7)) <> 0) {
+		return "#internal error: inconsistent n_bits"
+	}
+
+	bits = this.bits as base.u64
+	n_bits = this.n_bits
+
+	lmask = ((1 as base.u64) << this.n_huffs_bits[0]) - 1
+	dmask = ((1 as base.u64) << this.n_huffs_bits[1]) - 1
+
+	// Check up front, on each iteration, that we have enough buffer space to
+	// both read (8 bytes) and write (266 bytes) as much as we need to. Doing
+	// this check once (per iteration), up front, removes the need to check
+	// multiple times inside the loop body, so it's faster overall.
+	//
+	// For writing, a literal code obviously corresponds to writing 1 byte, and
+	// 258 is the maximum length in a length-distance pair, as specified in the
+	// RFC section 3.2.5. Compressed blocks (length and distance codes).
+	//
+	// We adjust 258 up to (258 + 8) = 266 since it can be faster to overshoot
+	// a little and make multiple-of-8-byte copies even when the length in the
+	// length-distance pair isn't an exact multiple-of-8. Strictly speaking,
+	// 264 (an exact multiple-of-8) is the tight bound but (258 + 8) is easier
+	// for the Wuffs proof system, as length's type is refined to [..= 258],
+	//
+	// For reading, strictly speaking, we only need 6 bytes (48 bits) of
+	// available input, because the H-L Literal/Length code is up to 15 bits
+	// plus up to 5 extra bits, the H-D Distance code is up to 15 bits plus up
+	// to 13 extra bits and 15 + 5 + 15 + 13 == 48. However, it's faster to
+	// read 64 bits than 48 bits or (48 - n_bits) bits.
+	while.loop(args.dst.length() >= 266) and (args.src.length() >= 8) {
+		// Ensure that we have at least 56 bits of input.
+		//
+		// This is "Variant 4" of
+		// https://fgiesen.wordpress.com/2018/02/20/reading-bits-in-far-too-many-ways-part-2/
+		//
+		// 56, the number of bits in 7 bytes, is a property of that "Variant 4"
+		// bit-reading technique, and not related to the DEFLATE format per se.
+		//
+		// The "& 63" is a no-op, and not part of the original "Variant 4"
+		// technique, but satisfies Wuffs' overflow/underflow checks.
+		bits |= args.src.peek_u64le() ~mod<< (n_bits & 63)
+		args.src.skip_u32_fast!(actual: (63 - (n_bits & 63)) >> 3, worst_case: 8)
+		n_bits |= 56
+
+		// Decode an lcode symbol from H-L.
+		table_entry = this.huffs[0][bits & lmask]
+		table_entry_n_bits = table_entry & 0x0F
+		bits >>= table_entry_n_bits
+		n_bits ~mod-= table_entry_n_bits
+
+		if (table_entry >> 31) <> 0 {
+			// Literal.
+			args.dst.write_u8_fast!(a: ((table_entry >> 8) & 0xFF) as base.u8)
+			continue.loop
+		} else if (table_entry >> 30) <> 0 {
+			// No-op; code continues past the if-else chain.
+		} else if (table_entry >> 29) <> 0 {
+			// End of block.
+			this.end_of_block = true
+			break.loop
+		} else if (table_entry >> 28) <> 0 {
+			// Redirect.
+			redir_top = (table_entry >> 8) & 0xFFFF
+			redir_mask = ((1 as base.u32) << ((table_entry >> 4) & 0x0F)) - 1
+			table_entry = this.huffs[0][
+				(redir_top + (((bits & 0xFFFF_FFFF) as base.u32) & redir_mask)) & HUFFS_TABLE_MASK]
+			table_entry_n_bits = table_entry & 0x0F
+			bits >>= table_entry_n_bits
+			n_bits ~mod-= table_entry_n_bits
+
+			if (table_entry >> 31) <> 0 {
+				// Literal.
+				args.dst.write_u8_fast!(a: ((table_entry >> 8) & 0xFF) as base.u8)
+				continue.loop
+			} else if (table_entry >> 30) <> 0 {
+				// No-op; code continues past the if-else chain.
+			} else if (table_entry >> 29) <> 0 {
+				// End of block.
+				this.end_of_block = true
+				break.loop
+			} else if (table_entry >> 28) <> 0 {
+				return "#internal error: inconsistent Huffman decoder state"
+			} else if (table_entry >> 27) <> 0 {
+				return "#bad Huffman code"
+			} else {
+				return "#internal error: inconsistent Huffman decoder state"
+			}
+
+		} else if (table_entry >> 27) <> 0 {
+			return "#bad Huffman code"
+		} else {
+			return "#internal error: inconsistent Huffman decoder state"
+		}
+
+		// length = base_number_minus_3 + 3 + extra_bits.
+		//
+		// The -3 is from the bias in script/print-deflate-magic-numbers.go.
+		// That bias makes the "& 0xFF" 1 and 15-ish lines below correct.
+		length = ((table_entry >> 8) & 0xFF) + 3
+		table_entry_n_bits = (table_entry >> 4) & 0x0F
+		if table_entry_n_bits > 0 {
+			// The "+ 253" is the same as "- 3", after the "& 0xFF", but the
+			// plus form won't require an underflow check.
+			length = ((length + 253 + (bits.low_bits(n: table_entry_n_bits) as base.u32)) & 0xFF) + 3
+			bits >>= table_entry_n_bits
+			n_bits ~mod-= table_entry_n_bits
+		}
+
+		// Decode a dcode symbol from H-D.
+		table_entry = this.huffs[1][bits & dmask]
+		table_entry_n_bits = table_entry & 15
+		bits >>= table_entry_n_bits
+		n_bits ~mod-= table_entry_n_bits
+
+		// Check for a redirect.
+		if (table_entry >> 28) == 1 {
+			redir_top = (table_entry >> 8) & 0xFFFF
+			redir_mask = ((1 as base.u32) << ((table_entry >> 4) & 0x0F)) - 1
+			table_entry = this.huffs[1][
+				(redir_top + (((bits & 0xFFFF_FFFF) as base.u32) & redir_mask)) & HUFFS_TABLE_MASK]
+			table_entry_n_bits = table_entry & 0x0F
+			bits >>= table_entry_n_bits
+			n_bits ~mod-= table_entry_n_bits
+		}
+
+		// For H-D, all symbols should be base_number + extra_bits.
+		if (table_entry >> 24) <> 0x40 {
+			if (table_entry >> 24) == 0x08 {
+				return "#bad Huffman code"
+			}
+			return "#internal error: inconsistent Huffman decoder state"
+		}
+
+		// dist_minus_1 = base_number_minus_1 + extra_bits.
+		// distance     = dist_minus_1 + 1.
+		//
+		// The -1 is from the bias in script/print-deflate-magic-numbers.go.
+		// That bias makes the "& 0x7FFF" 2 and 15-ish lines below correct and
+		// undoing that bias makes proving (dist_minus_1 + 1) > 0 trivial.
+		dist_minus_1 = (table_entry >> 8) & 0x7FFF
+		table_entry_n_bits = (table_entry >> 4) & 0x0F
+
+		dist_minus_1 = (dist_minus_1 + (bits.low_bits(n: table_entry_n_bits) as base.u32)) & 0x7FFF
+		bits >>= table_entry_n_bits
+		n_bits ~mod-= table_entry_n_bits
+
+		// The "while true { etc; break }" is a redundant version of "etc", but
+		// its presence minimizes the diff between decode_huffman_fastxx and
+		// decode_huffman_slow.
+		while true,
+			pre args.dst.length() >= 266,
+		{
+			// We can therefore prove:
+			assert (length as base.u64) <= args.dst.length() via "a <= b: a <= c; c <= b"(c: 266)
+			assert ((length + 8) as base.u64) <= args.dst.length() via "a <= b: a <= c; c <= b"(c: 266)
+
+			// Copy from this.history.
+			if ((dist_minus_1 + 1) as base.u64) > args.dst.history_length() {
+				// Set (hlen, hdist) to be the length-distance pair to copy
+				// from this.history, and (length, distance) to be the
+				// remaining length-distance pair to copy from args.dst.
+				hlen = 0
+				hdist = (((dist_minus_1 + 1) as base.u64) - args.dst.history_length()) as base.u32
+				if length > hdist {
+					assert hdist < length via "a < b: b > a"()
+					assert hdist < 0x8000 via "a < b: a < c; c <= b"(c: length)
+					length -= hdist
+					hlen = hdist
+				} else {
+					hlen = length
+					length = 0
+				}
+				if this.history_index < hdist {
+					return "#bad distance"
+				}
+				// Re-purpose the hdist variable as the this.history index to
+				// start copying from.
+				hdist = this.history_index - hdist
+
+				// Copy from hdist to the end of this.history.
+				//
+				// This copying is simpler than the decode_huffman_slow version
+				// because it cannot yield. We have already checked that
+				// args.dst.length() is large enough.
+				args.dst.limited_copy_u32_from_slice!(
+					up_to: hlen, s: this.history[hdist & 0x7FFF ..])
+
+				if length == 0 {
+					// No need to copy from args.dst.
+					continue.loop
+				}
+
+				if (((dist_minus_1 + 1) as base.u64) > args.dst.history_length()) or
+					((length as base.u64) > args.dst.length()) or
+					(((length + 8) as base.u64) > args.dst.length()) {
+					return "#internal error: inconsistent distance"
+				}
+			}
+			// Once again, redundant but explicit assertions.
+			assert (dist_minus_1 + 1) >= 1
+			assert ((dist_minus_1 + 1) as base.u64) <= args.dst.history_length()
+			assert (length as base.u64) <= args.dst.length()
+			assert ((length + 8) as base.u64) <= args.dst.length()
+
+			// Copy from args.dst.
+			//
+			// For short distances, less than 8 bytes, copying atomic 8-byte
+			// chunks can result in incorrect output, so we fall back to a
+			// slower 1-byte-at-a-time copy. Deflate's copy-from-history can
+			// pick up freshly written bytes. A length = 5, distance = 2 copy
+			// starting with "abc" should give "abcbcbcb" but the 8-byte chunk
+			// technique would give "abcbc???", the exact output depending on
+			// what was previously in the writer buffer.
+			if (dist_minus_1 + 1) >= 8 {
+				args.dst.limited_copy_u32_from_history_8_byte_chunks_fast!(
+					up_to: length, distance: (dist_minus_1 + 1))
+			} else {
+				args.dst.limited_copy_u32_from_history_fast!(
+					up_to: length, distance: (dist_minus_1 + 1))
+			}
+			break
+		} endwhile
+	} endwhile.loop
+
+	// Ensure n_bits < 8 by rewindng args.src, if we loaded too many of its
+	// bytes into the bits variable.
+	//
+	// Note that we can unconditionally call undo_read (without resulting in an
+	// "invalid I/O operation" error code) only because this whole function can
+	// never suspend, as all of its I/O operations were checked beforehand for
+	// sufficient buffer space. Otherwise, resuming from the suspension could
+	// mean that the (possibly different) args.src is no longer rewindable,
+	// even if conceptually, this function was responsible for reading the
+	// bytes we want to rewind.
+	if n_bits > 63 {
+		return "#internal error: inconsistent n_bits"
+	}
+	while n_bits >= 8,
+		post n_bits < 8,
+	{
+		n_bits -= 8
+		if args.src.can_undo_byte() {
+			args.src.undo_byte!()
+		} else {
+			return "#internal error: inconsistent I/O"
+		}
+	} endwhile
+
+	this.bits = (bits & (((1 as base.u64) << n_bits) - 1)) as base.u32
+	this.n_bits = n_bits
+
+	if (this.n_bits >= 8) or ((this.bits >> this.n_bits) <> 0) {
+		return "#internal error: inconsistent n_bits"
+	}
+}