Have std/deflate read 64 bits once per inner loop

On a mid-range x86_64 laptop:

wuffs_deflate_decode_1k_full_init/clang9                   101MB/s ± 0%     92MB/s ± 2%   -9.22%  (p=0.008 n=5+5)
wuffs_deflate_decode_1k_part_init/clang9                   117MB/s ± 0%    107MB/s ± 3%   -8.00%  (p=0.008 n=5+5)
wuffs_deflate_decode_10k_full_init/clang9                  129MB/s ± 0%    140MB/s ± 1%   +8.36%  (p=0.008 n=5+5)
wuffs_deflate_decode_10k_part_init/clang9                  131MB/s ± 0%    142MB/s ± 1%   +8.59%  (p=0.008 n=5+5)
wuffs_deflate_decode_100k_just_one_read/clang9             158MB/s ± 0%    184MB/s ± 1%  +16.29%  (p=0.008 n=5+5)
wuffs_deflate_decode_100k_many_big_reads/clang9            133MB/s ± 0%    155MB/s ± 1%  +16.26%  (p=0.008 n=5+5)

wuffs_deflate_decode_1k_full_init/gcc10                   98.0MB/s ± 0%   97.7MB/s ± 0%     ~     (p=0.222 n=5+5)
wuffs_deflate_decode_1k_part_init/gcc10                    112MB/s ± 0%    111MB/s ± 0%   -1.04%  (p=0.008 n=5+5)
wuffs_deflate_decode_10k_full_init/gcc10                   138MB/s ± 0%    179MB/s ± 1%  +29.91%  (p=0.008 n=5+5)
wuffs_deflate_decode_10k_part_init/gcc10                   140MB/s ± 0%    183MB/s ± 1%  +30.52%  (p=0.008 n=5+5)
wuffs_deflate_decode_100k_just_one_read/gcc10              188MB/s ± 0%    211MB/s ± 1%  +12.32%  (p=0.008 n=5+5)
wuffs_deflate_decode_100k_many_big_reads/gcc10             156MB/s ± 0%    172MB/s ± 1%  +10.00%  (p=0.008 n=5+5)

mimic_deflate_decode_1k                                    128MB/s ± 0%    127MB/s ± 1%   -0.99%  (p=0.016 n=5+5)
mimic_deflate_decode_10k                                   143MB/s ± 0%    142MB/s ± 1%     ~     (p=0.056 n=5+5)
mimic_deflate_decode_100k_just_one_read                    176MB/s ± 0%    174MB/s ± 1%     ~     (p=0.151 n=5+5)
mimic_deflate_decode_100k_many_big_reads                   135MB/s ± 0%    134MB/s ± 1%     ~     (p=0.151 n=5+5)

wuffs_gzip_decode_10k/clang9                               134MB/s ± 1%    127MB/s ± 0%   -5.23%  (p=0.008 n=5+5)
wuffs_gzip_decode_100k/clang9                              150MB/s ± 0%    162MB/s ± 0%   +8.23%  (p=0.008 n=5+5)

wuffs_gzip_decode_10k/gcc10                                131MB/s ± 0%    152MB/s ± 0%  +16.36%  (p=0.008 n=5+5)
wuffs_gzip_decode_100k/gcc10                               159MB/s ± 2%    188MB/s ± 0%  +17.78%  (p=0.008 n=5+5)

mimic_gzip_decode_10k                                      121MB/s ± 0%    121MB/s ± 0%     ~     (p=0.841 n=5+5)
mimic_gzip_decode_100k                                     144MB/s ± 0%    143MB/s ± 0%   -0.11%  (p=0.008 n=5+5)

wuffs_zlib_decode_10k/clang9                               139MB/s ± 0%    135MB/s ± 0%   -2.36%  (p=0.008 n=5+5)
wuffs_zlib_decode_100k/clang9                              155MB/s ± 0%    176MB/s ± 0%  +13.83%  (p=0.008 n=5+5)

wuffs_zlib_decode_10k/gcc10                                141MB/s ± 0%    156MB/s ± 1%  +10.94%  (p=0.008 n=5+5)
wuffs_zlib_decode_100k/gcc10                               172MB/s ± 0%    193MB/s ± 1%  +12.27%  (p=0.016 n=4+5)

mimic_zlib_decode_10k                                      133MB/s ± 0%    132MB/s ± 0%   -1.29%  (p=0.008 n=5+5)
mimic_zlib_decode_100k                                     160MB/s ± 0%    158MB/s ± 0%   -1.02%  (p=0.008 n=5+5)

wuffs_png_decode_image_19k_8bpp/clang9                    94.7MB/s ± 0%  103.4MB/s ± 0%   +9.17%  (p=0.016 n=4+5)
wuffs_png_decode_image_40k_24bpp/clang9                    135MB/s ± 0%    146MB/s ± 0%   +8.03%  (p=0.008 n=5+5)
wuffs_png_decode_image_77k_8bpp/clang9                     351MB/s ± 0%    380MB/s ± 0%   +8.48%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/clang9   320MB/s ± 1%    343MB/s ± 0%   +7.20%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/clang9   308MB/s ± 0%    328MB/s ± 0%   +6.59%  (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/clang9                  138MB/s ± 0%    149MB/s ± 0%   +8.02%  (p=0.016 n=5+4)

wuffs_png_decode_image_19k_8bpp/gcc10                      101MB/s ± 1%    116MB/s ± 0%  +14.61%  (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/gcc10                     136MB/s ± 0%    154MB/s ± 0%  +13.10%  (p=0.008 n=5+5)
wuffs_png_decode_image_77k_8bpp/gcc10                      371MB/s ± 0%    426MB/s ± 0%  +14.80%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/gcc10    333MB/s ± 0%    357MB/s ± 0%   +7.31%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/gcc10    318MB/s ± 0%    341MB/s ± 0%   +7.27%  (p=0.016 n=5+4)
wuffs_png_decode_image_4002k_24bpp/gcc10                   138MB/s ± 0%    157MB/s ± 0%  +13.59%  (p=0.008 n=5+5)

mimic_png_decode_image_19k_8bpp                           58.2MB/s ± 0%   58.1MB/s ± 0%     ~     (p=0.841 n=5+5)
mimic_png_decode_image_40k_24bpp                          73.3MB/s ± 0%   73.2MB/s ± 0%   -0.16%  (p=0.032 n=5+5)
mimic_png_decode_image_77k_8bpp                            177MB/s ± 0%    177MB/s ± 0%     ~     (p=0.690 n=5+5)
mimic_png_decode_image_552k_32bpp_ignore_checksum               skipped
mimic_png_decode_image_552k_32bpp_verify_checksum          146MB/s ± 0%    146MB/s ± 0%     ~     (p=0.690 n=5+5)
mimic_png_decode_image_4002k_24bpp                         104MB/s ± 0%    104MB/s ± 0%     ~     (p=0.841 n=5+5)
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index ca9ad79..b449c47 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -22624,12 +22624,12 @@
     wuffs_base__io_buffer* a_src) {
   wuffs_base__status status = wuffs_base__make_status(NULL);
 
-  uint32_t v_bits = 0;
+  uint64_t v_bits = 0;
   uint32_t v_n_bits = 0;
   uint32_t v_table_entry = 0;
   uint32_t v_table_entry_n_bits = 0;
-  uint32_t v_lmask = 0;
-  uint32_t v_dmask = 0;
+  uint64_t v_lmask = 0;
+  uint64_t v_dmask = 0;
   uint32_t v_redir_top = 0;
   uint32_t v_redir_mask = 0;
   uint32_t v_length = 0;
@@ -22665,21 +22665,15 @@
     status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_n_bits);
     goto exit;
   }
-  v_bits = self->private_impl.f_bits;
+  v_bits = ((uint64_t)(self->private_impl.f_bits));
   v_n_bits = self->private_impl.f_n_bits;
-  v_lmask = ((((uint32_t)(1)) << self->private_impl.f_n_huffs_bits[0]) - 1);
-  v_dmask = ((((uint32_t)(1)) << self->private_impl.f_n_huffs_bits[1]) - 1);
+  v_lmask = ((((uint64_t)(1)) << self->private_impl.f_n_huffs_bits[0]) - 1);
+  v_dmask = ((((uint64_t)(1)) << self->private_impl.f_n_huffs_bits[1]) - 1);
   label__loop__continue:;
-  while ((((uint64_t)(io2_a_dst - iop_a_dst)) >= 258) && (((uint64_t)(io2_a_src - iop_a_src)) >= 12)) {
-    if (v_n_bits < 15) {
-      v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
-      iop_a_src += 1;
-      v_n_bits += 8;
-      v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
-      iop_a_src += 1;
-      v_n_bits += 8;
-    } else {
-    }
+  while ((((uint64_t)(io2_a_dst - iop_a_dst)) >= 258) && (((uint64_t)(io2_a_src - iop_a_src)) >= 8)) {
+    v_bits |= (wuffs_base__peek_u64le__no_bounds_check(iop_a_src) << (v_n_bits & 63));
+    iop_a_src += ((63 - (v_n_bits & 63)) >> 3);
+    v_n_bits |= 56;
     v_table_entry = self->private_data.f_huffs[0][(v_bits & v_lmask)];
     v_table_entry_n_bits = (v_table_entry & 15);
     v_bits >>= v_table_entry_n_bits;
@@ -22692,18 +22686,9 @@
       self->private_impl.f_end_of_block = true;
       goto label__loop__break;
     } else if ((v_table_entry >> 28) != 0) {
-      if (v_n_bits < 15) {
-        v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
-        iop_a_src += 1;
-        v_n_bits += 8;
-        v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
-        iop_a_src += 1;
-        v_n_bits += 8;
-      } else {
-      }
       v_redir_top = ((v_table_entry >> 8) & 65535);
       v_redir_mask = ((((uint32_t)(1)) << ((v_table_entry >> 4) & 15)) - 1);
-      v_table_entry = self->private_data.f_huffs[0][((v_redir_top + (v_bits & v_redir_mask)) & 1023)];
+      v_table_entry = self->private_data.f_huffs[0][((v_redir_top + (((uint32_t)((v_bits & 4294967295))) & v_redir_mask)) & 1023)];
       v_table_entry_n_bits = (v_table_entry & 15);
       v_bits >>= v_table_entry_n_bits;
       v_n_bits -= v_table_entry_n_bits;
@@ -22734,50 +22719,21 @@
     v_length = (((v_table_entry >> 8) & 255) + 3);
     v_table_entry_n_bits = ((v_table_entry >> 4) & 15);
     if (v_table_entry_n_bits > 0) {
-      if (v_n_bits < 15) {
-        v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
-        iop_a_src += 1;
-        v_n_bits += 8;
-        v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
-        iop_a_src += 1;
-        v_n_bits += 8;
-      } else {
-      }
-      v_length = (((v_length + 253 + ((v_bits) & WUFFS_BASE__LOW_BITS_MASK__U32(v_table_entry_n_bits))) & 255) + 3);
+      v_length = (((v_length + 253 + ((uint32_t)(((v_bits) & WUFFS_BASE__LOW_BITS_MASK__U64(v_table_entry_n_bits))))) & 255) + 3);
       v_bits >>= v_table_entry_n_bits;
       v_n_bits -= v_table_entry_n_bits;
-    } else {
-    }
-    if (v_n_bits < 15) {
-      v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
-      iop_a_src += 1;
-      v_n_bits += 8;
-      v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
-      iop_a_src += 1;
-      v_n_bits += 8;
-    } else {
     }
     v_table_entry = self->private_data.f_huffs[1][(v_bits & v_dmask)];
     v_table_entry_n_bits = (v_table_entry & 15);
     v_bits >>= v_table_entry_n_bits;
     v_n_bits -= v_table_entry_n_bits;
     if ((v_table_entry >> 28) == 1) {
-      if (v_n_bits < 15) {
-        v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
-        iop_a_src += 1;
-        v_n_bits += 8;
-        v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
-        iop_a_src += 1;
-        v_n_bits += 8;
-      } else {
-      }
       v_redir_top = ((v_table_entry >> 8) & 65535);
       v_redir_mask = ((((uint32_t)(1)) << ((v_table_entry >> 4) & 15)) - 1);
-      v_table_entry = self->private_data.f_huffs[1][((v_redir_top + (v_bits & v_redir_mask)) & 1023)];
+      v_table_entry = self->private_data.f_huffs[1][((v_redir_top + (((uint32_t)((v_bits & 4294967295))) & v_redir_mask)) & 1023)];
       v_table_entry_n_bits = (v_table_entry & 15);
       v_bits >>= v_table_entry_n_bits;
       v_n_bits -= v_table_entry_n_bits;
-    } else {
     }
     if ((v_table_entry >> 24) != 64) {
       if ((v_table_entry >> 24) == 8) {
@@ -22789,15 +22745,7 @@
     }
     v_dist_minus_1 = ((v_table_entry >> 8) & 32767);
     v_table_entry_n_bits = ((v_table_entry >> 4) & 15);
-    if (v_n_bits < v_table_entry_n_bits) {
-      v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
-      iop_a_src += 1;
-      v_n_bits += 8;
-      v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
-      iop_a_src += 1;
-      v_n_bits += 8;
-    }
-    v_dist_minus_1 = ((v_dist_minus_1 + ((v_bits) & WUFFS_BASE__LOW_BITS_MASK__U32(v_table_entry_n_bits))) & 32767);
+    v_dist_minus_1 = ((v_dist_minus_1 + ((uint32_t)(((v_bits) & WUFFS_BASE__LOW_BITS_MASK__U64(v_table_entry_n_bits))))) & 32767);
     v_bits >>= v_table_entry_n_bits;
     v_n_bits -= v_table_entry_n_bits;
     while (true) {
@@ -22833,6 +22781,10 @@
     label__0__break:;
   }
   label__loop__break:;
+  if (v_n_bits > 63) {
+    status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_n_bits);
+    goto exit;
+  }
   while (v_n_bits >= 8) {
     v_n_bits -= 8;
     if (iop_a_src > io1_a_src) {
@@ -22842,7 +22794,7 @@
       goto exit;
     }
   }
-  self->private_impl.f_bits = (v_bits & ((((uint32_t)(1)) << v_n_bits) - 1));
+  self->private_impl.f_bits = ((uint32_t)((v_bits & ((((uint64_t)(1)) << v_n_bits) - 1))));
   self->private_impl.f_n_bits = v_n_bits;
   if ((self->private_impl.f_n_bits >= 8) || ((self->private_impl.f_bits >> self->private_impl.f_n_bits) != 0)) {
     status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_n_bits);
diff --git a/std/deflate/decode_huffman_fast.wuffs b/std/deflate/decode_huffman_fast.wuffs
index 22260cb..a3d439b 100644
--- a/std/deflate/decode_huffman_fast.wuffs
+++ b/std/deflate/decode_huffman_fast.wuffs
@@ -20,12 +20,12 @@
 	// decode_huffman_*.wuffs files as small as possible, while retaining both
 	// correctness and performance.
 
-	var bits               : base.u32
+	var bits               : base.u64
 	var n_bits             : base.u32
 	var table_entry        : base.u32
 	var table_entry_n_bits : base.u32[..= 15]
-	var lmask              : base.u32[..= 511]
-	var dmask              : base.u32[..= 511]
+	var lmask              : base.u64[..= 511]
+	var dmask              : base.u64[..= 511]
 	var redir_top          : base.u32[..= 0xFFFF]
 	var redir_mask         : base.u32[..= 0x7FFF]
 	var length             : base.u32[..= 258]
@@ -37,17 +37,16 @@
 		return "#internal error: inconsistent n_bits"
 	}
 
-	bits = this.bits
+	bits = this.bits as base.u64
 	n_bits = this.n_bits
 
-	lmask = ((1 as base.u32) << this.n_huffs_bits[0]) - 1
-	dmask = ((1 as base.u32) << this.n_huffs_bits[1]) - 1
+	lmask = ((1 as base.u64) << this.n_huffs_bits[0]) - 1
+	dmask = ((1 as base.u64) << this.n_huffs_bits[1]) - 1
 
 	// Check up front, on each iteration, that we have enough buffer space to
-	// both read (12 bytes) and write (258 bytes) as much as we need to. Doing
+	// both read (8 bytes) and write (258 bytes) as much as we need to. Doing
 	// this check once (per iteration), up front, removes the need to check
-	// (and possibly suspend the coroutine) multiple times inside the loop
-	// body, so it's faster overall.
+	// multiple times inside the loop body, so it's faster overall.
 	//
 	// For writing, a literal code obviously corresponds to writing 1 byte, and
 	// 258 is the maximum length in a length-distance pair, as specified in the
@@ -56,45 +55,28 @@
 	// For reading, strictly speaking, we only need 6 bytes (48 bits) of
 	// available input, because the H-L Literal/Length code is up to 15 bits
 	// plus up to 5 extra bits, the H-D Distance code is up to 15 bits plus up
-	// to 13 extra bits and 15 + 5 + 15 + 13 == 48. However,
-	// args.src.length() assertions are made in terms of bytes, not bits.
-	//
-	// In theory, we could write assertions that mixed bytes and bits, such as:
-	//
-	// assert ((args.src.length() >= 6) and (n_bits >=  5)) or
-	//        ((args.src.length() >= 4) and (n_bits >= 21))
-	//
-	// but that looks clumsy and makes the proofs more complicated. Instead, we
-	// are conservative and work in terms of bytes only. In bytes, each code
-	// requires either one (direct) or two (redirect) lookups, up to 2 bytes
-	// for each. There are also up to 13 extra bits, or 2 bytes. Thus, each
-	// code requires up to 2 + 2 + 2 == 6 bytes, and there are two codes (one
-	// from H-L and one from H-D). Therefore, we check for at least 12 bytes.
-	while.loop(args.dst.length() >= 258) and (args.src.length() >= 12) {
-		// Ensure that we have at least 15 bits of input.
-		if n_bits < 15 {
-			assert n_bits >= 0  // TODO: this shouldn't be necessary.
-			bits |= args.src.peek_u8_as_u32() << n_bits
-			args.src.skip_u32_fast!(actual: 1, worst_case: 1)
-			n_bits += 8
-			bits |= args.src.peek_u8_as_u32() << n_bits
-			args.src.skip_u32_fast!(actual: 1, worst_case: 1)
-			n_bits += 8
-			assert n_bits >= 15
-		} else {
-			assert args.src.length() >= 10
-		}
-		// These assertions are redundant, but are listed explicitly for
-		// clarity. They must hold regardless of which branch was taken for the
-		// if statement above.
-		assert args.src.length() >= 10
-		assert n_bits >= 15
+	// to 13 extra bits and 15 + 5 + 15 + 13 == 48. However, it's faster to
+	// read 64 bits than 48 bits or (48 - n_bits) bits.
+	while.loop(args.dst.length() >= 258) and (args.src.length() >= 8) {
+		// Ensure that we have at least 56 bits of input.
+		//
+		// This is "Variant 4" of
+		// https://fgiesen.wordpress.com/2018/02/20/reading-bits-in-far-too-many-ways-part-2/
+		//
+		// 56, the number of bits in 7 bytes, is a property of that "Variant 4"
+		// bit-reading technique, and not related to the DEFLATE format per se.
+		//
+		// The "& 63" is a no-op, and not part of the original "Variant 4"
+		// technique, but satisfies Wuffs' overflow/underflow checks.
+		bits |= args.src.peek_u64le() ~mod<< (n_bits & 63)
+		args.src.skip_u32_fast!(actual: (63 - (n_bits & 63)) >> 3, worst_case: 8)
+		n_bits |= 56
 
 		// Decode an lcode symbol from H-L.
 		table_entry = this.huffs[0][bits & lmask]
 		table_entry_n_bits = table_entry & 0x0F
 		bits >>= table_entry_n_bits
-		n_bits -= table_entry_n_bits
+		n_bits ~mod-= table_entry_n_bits
 
 		if (table_entry >> 31) <> 0 {
 			// Literal.
@@ -102,37 +84,19 @@
 			continue.loop
 		} else if (table_entry >> 30) <> 0 {
 			// No-op; code continues past the if-else chain.
-			assert args.src.length() >= 8
 		} else if (table_entry >> 29) <> 0 {
 			// End of block.
 			this.end_of_block = true
 			break.loop
 		} else if (table_entry >> 28) <> 0 {
 			// Redirect.
-
-			// Ensure that we have at least 15 bits of input.
-			if n_bits < 15 {
-				assert n_bits >= 0  // TODO: this shouldn't be necessary.
-				bits |= args.src.peek_u8_as_u32() << n_bits
-				args.src.skip_u32_fast!(actual: 1, worst_case: 1)
-				n_bits += 8
-				bits |= args.src.peek_u8_as_u32() << n_bits
-				args.src.skip_u32_fast!(actual: 1, worst_case: 1)
-				n_bits += 8
-				assert n_bits >= 15
-			} else {
-				assert args.src.length() >= 8
-			}
-			// Once again, redundant but explicit assertions.
-			assert args.src.length() >= 8
-			assert n_bits >= 15
-
 			redir_top = (table_entry >> 8) & 0xFFFF
 			redir_mask = ((1 as base.u32) << ((table_entry >> 4) & 0x0F)) - 1
-			table_entry = this.huffs[0][(redir_top + (bits & redir_mask)) & HUFFS_TABLE_MASK]
+			table_entry = this.huffs[0][
+				(redir_top + (((bits & 0xFFFF_FFFF) as base.u32) & redir_mask)) & HUFFS_TABLE_MASK]
 			table_entry_n_bits = table_entry & 0x0F
 			bits >>= table_entry_n_bits
-			n_bits -= table_entry_n_bits
+			n_bits ~mod-= table_entry_n_bits
 
 			if (table_entry >> 31) <> 0 {
 				// Literal.
@@ -152,9 +116,6 @@
 				return "#internal error: inconsistent Huffman decoder state"
 			}
 
-			// Once again, redundant but explicit assertions.
-			assert args.src.length() >= 8
-
 		} else if (table_entry >> 27) <> 0 {
 			return "#bad Huffman code"
 		} else {
@@ -168,83 +129,28 @@
 		length = ((table_entry >> 8) & 0xFF) + 3
 		table_entry_n_bits = (table_entry >> 4) & 0x0F
 		if table_entry_n_bits > 0 {
-			// Ensure that we have at least 15 bits of input.
-			if n_bits < 15 {
-				assert n_bits >= 0  // TODO: this shouldn't be necessary.
-				bits |= args.src.peek_u8_as_u32() << n_bits
-				args.src.skip_u32_fast!(actual: 1, worst_case: 1)
-				n_bits += 8
-				bits |= args.src.peek_u8_as_u32() << n_bits
-				args.src.skip_u32_fast!(actual: 1, worst_case: 1)
-				n_bits += 8
-				assert n_bits >= 15
-			} else {
-				assert args.src.length() >= 6
-			}
-			// Once again, redundant but explicit assertions.
-			assert args.src.length() >= 6
-			assert n_bits >= 15
-
 			// The "+ 253" is the same as "- 3", after the "& 0xFF", but the
 			// plus form won't require an underflow check.
-			length = ((length + 253 + bits.low_bits(n: table_entry_n_bits)) & 0xFF) + 3
+			length = ((length + 253 + (bits.low_bits(n: table_entry_n_bits) as base.u32)) & 0xFF) + 3
 			bits >>= table_entry_n_bits
-			n_bits -= table_entry_n_bits
-
-		} else {
-			assert args.src.length() >= 6
+			n_bits ~mod-= table_entry_n_bits
 		}
 
-		// Ensure that we have at least 15 bits of input.
-		if n_bits < 15 {
-			assert n_bits >= 0  // TODO: this shouldn't be necessary.
-			bits |= args.src.peek_u8_as_u32() << n_bits
-			args.src.skip_u32_fast!(actual: 1, worst_case: 1)
-			n_bits += 8
-			bits |= args.src.peek_u8_as_u32() << n_bits
-			args.src.skip_u32_fast!(actual: 1, worst_case: 1)
-			n_bits += 8
-			assert n_bits >= 15
-		} else {
-			assert args.src.length() >= 4
-		}
-		// Once again, redundant but explicit assertions.
-		assert args.src.length() >= 4
-		assert n_bits >= 15
-
 		// Decode a dcode symbol from H-D.
 		table_entry = this.huffs[1][bits & dmask]
 		table_entry_n_bits = table_entry & 15
 		bits >>= table_entry_n_bits
-		n_bits -= table_entry_n_bits
+		n_bits ~mod-= table_entry_n_bits
 
 		// Check for a redirect.
 		if (table_entry >> 28) == 1 {
-			// Ensure that we have at least 15 bits of input.
-			if n_bits < 15 {
-				assert n_bits >= 0  // TODO: this shouldn't be necessary.
-				bits |= args.src.peek_u8_as_u32() << n_bits
-				args.src.skip_u32_fast!(actual: 1, worst_case: 1)
-				n_bits += 8
-				bits |= args.src.peek_u8_as_u32() << n_bits
-				args.src.skip_u32_fast!(actual: 1, worst_case: 1)
-				n_bits += 8
-				assert n_bits >= 15
-			} else {
-				assert args.src.length() >= 2
-			}
-			// Once again, redundant but explicit assertions.
-			assert args.src.length() >= 2
-			assert n_bits >= 15
-
 			redir_top = (table_entry >> 8) & 0xFFFF
 			redir_mask = ((1 as base.u32) << ((table_entry >> 4) & 0x0F)) - 1
-			table_entry = this.huffs[1][(redir_top + (bits & redir_mask)) & HUFFS_TABLE_MASK]
+			table_entry = this.huffs[1][
+				(redir_top + (((bits & 0xFFFF_FFFF) as base.u32) & redir_mask)) & HUFFS_TABLE_MASK]
 			table_entry_n_bits = table_entry & 0x0F
 			bits >>= table_entry_n_bits
-			n_bits -= table_entry_n_bits
-		} else {
-			assert args.src.length() >= 2
+			n_bits ~mod-= table_entry_n_bits
 		}
 
 		// For H-D, all symbols should be base_number + extra_bits.
@@ -263,25 +169,10 @@
 		// undoing that bias makes proving (dist_minus_1 + 1) > 0 trivial.
 		dist_minus_1 = (table_entry >> 8) & 0x7FFF
 		table_entry_n_bits = (table_entry >> 4) & 0x0F
-		// Ensure that we have at least table_entry_n_bits bits of input.
-		if n_bits < table_entry_n_bits {
-			assert n_bits < 15 via "a < b: a < c; c <= b"(c: table_entry_n_bits)
-			assert n_bits >= 0  // TODO: this shouldn't be necessary.
-			bits |= args.src.peek_u8_as_u32() << n_bits
-			args.src.skip_u32_fast!(actual: 1, worst_case: 1)
-			n_bits += 8
-			bits |= args.src.peek_u8_as_u32() << n_bits
-			args.src.skip_u32_fast!(actual: 1, worst_case: 1)
-			n_bits += 8
-			assert table_entry_n_bits <= n_bits via "a <= b: a <= c; c <= b"(c: 16)
-			assert n_bits >= table_entry_n_bits via "a >= b: b <= a"()
-		}
-		// Once again, redundant but explicit assertions.
-		assert n_bits >= table_entry_n_bits
 
-		dist_minus_1 = (dist_minus_1 + bits.low_bits(n: table_entry_n_bits)) & 0x7FFF
+		dist_minus_1 = (dist_minus_1 + (bits.low_bits(n: table_entry_n_bits) as base.u32)) & 0x7FFF
 		bits >>= table_entry_n_bits
-		n_bits -= table_entry_n_bits
+		n_bits ~mod-= table_entry_n_bits
 
 		// The "while true { etc; break }" is a redundant version of "etc", but
 		// its presence minimizes the diff between decode_huffman_fast and
@@ -355,6 +246,9 @@
 	// mean that the (possibly different) args.src is no longer rewindable,
 	// even if conceptually, this function was responsible for reading the
 	// bytes we want to rewind.
+	if n_bits > 63 {
+		return "#internal error: inconsistent n_bits"
+	}
 	while n_bits >= 8,
 		post n_bits < 8,
 	{
@@ -366,7 +260,7 @@
 		}
 	} endwhile
 
-	this.bits = bits & (((1 as base.u32) << n_bits) - 1)
+	this.bits = (bits & (((1 as base.u64) << n_bits) - 1)) as base.u32
 	this.n_bits = n_bits
 
 	if (this.n_bits >= 8) or ((this.bits >> this.n_bits) <> 0) {