Add std/deflate decode_huffman_fast32

This undoes commit 1a63b53e "Have std/deflate read 64 bits once per
inner loop" for 32-bit CPUs.

On a Raspberry Pi 4 (32-bit armv7l):

wuffs_deflate_decode_1k_full_init/clang9         44.9MB/s ± 1%   50.1MB/s ± 1%  +11.54%  (p=0.008 n=5+5)
wuffs_deflate_decode_1k_part_init/clang9         52.8MB/s ± 0%   61.1MB/s ± 0%  +15.61%  (p=0.008 n=5+5)
wuffs_deflate_decode_10k_full_init/clang9        81.4MB/s ± 0%   84.5MB/s ± 0%   +3.89%  (p=0.008 n=5+5)
wuffs_deflate_decode_10k_part_init/clang9        87.9MB/s ± 0%   91.1MB/s ± 0%   +3.67%  (p=0.008 n=5+5)
wuffs_deflate_decode_100k_just_one_read/clang9    106MB/s ± 0%    113MB/s ± 0%   +5.70%  (p=0.008 n=5+5)
wuffs_deflate_decode_100k_many_big_reads/clang9  59.6MB/s ± 0%   59.1MB/s ± 0%   -0.81%  (p=0.008 n=5+5)

wuffs_deflate_decode_1k_full_init/gcc8           44.7MB/s ± 1%   50.9MB/s ± 1%  +13.91%  (p=0.008 n=5+5)
wuffs_deflate_decode_1k_part_init/gcc8           52.6MB/s ± 0%   61.5MB/s ± 0%  +17.06%  (p=0.008 n=5+5)
wuffs_deflate_decode_10k_full_init/gcc8          76.5MB/s ± 0%   85.6MB/s ± 1%  +11.89%  (p=0.008 n=5+5)
wuffs_deflate_decode_10k_part_init/gcc8          82.3MB/s ± 0%   92.2MB/s ± 0%  +12.07%  (p=0.008 n=5+5)
wuffs_deflate_decode_100k_just_one_read/gcc8     97.8MB/s ± 0%  112.1MB/s ± 0%  +14.62%  (p=0.008 n=5+5)
wuffs_deflate_decode_100k_many_big_reads/gcc8    58.0MB/s ± 0%   61.6MB/s ± 0%   +6.09%  (p=0.008 n=5+5)
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 1ef07d9..1e1941a 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -23084,12 +23084,12 @@
     wuffs_base__io_buffer* a_src) {
   wuffs_base__status status = wuffs_base__make_status(NULL);
 
-  uint64_t v_bits = 0;
+  uint32_t v_bits = 0;
   uint32_t v_n_bits = 0;
   uint32_t v_table_entry = 0;
   uint32_t v_table_entry_n_bits = 0;
-  uint64_t v_lmask = 0;
-  uint64_t v_dmask = 0;
+  uint32_t v_lmask = 0;
+  uint32_t v_dmask = 0;
   uint32_t v_redir_top = 0;
   uint32_t v_redir_mask = 0;
   uint32_t v_length = 0;
@@ -23125,15 +23125,21 @@
     status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_n_bits);
     goto exit;
   }
-  v_bits = ((uint64_t)(self->private_impl.f_bits));
+  v_bits = self->private_impl.f_bits;
   v_n_bits = self->private_impl.f_n_bits;
-  v_lmask = ((((uint64_t)(1)) << self->private_impl.f_n_huffs_bits[0]) - 1);
-  v_dmask = ((((uint64_t)(1)) << self->private_impl.f_n_huffs_bits[1]) - 1);
+  v_lmask = ((((uint32_t)(1)) << self->private_impl.f_n_huffs_bits[0]) - 1);
+  v_dmask = ((((uint32_t)(1)) << self->private_impl.f_n_huffs_bits[1]) - 1);
   label__loop__continue:;
-  while ((((uint64_t)(io2_a_dst - iop_a_dst)) >= 258) && (((uint64_t)(io2_a_src - iop_a_src)) >= 8)) {
-    v_bits |= ((uint64_t)(wuffs_base__peek_u64le__no_bounds_check(iop_a_src) << (v_n_bits & 63)));
-    iop_a_src += ((63 - (v_n_bits & 63)) >> 3);
-    v_n_bits |= 56;
+  while ((((uint64_t)(io2_a_dst - iop_a_dst)) >= 258) && (((uint64_t)(io2_a_src - iop_a_src)) >= 12)) {
+    if (v_n_bits < 15) {
+      v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
+      iop_a_src += 1;
+      v_n_bits += 8;
+      v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
+      iop_a_src += 1;
+      v_n_bits += 8;
+    } else {
+    }
     v_table_entry = self->private_data.f_huffs[0][(v_bits & v_lmask)];
     v_table_entry_n_bits = (v_table_entry & 15);
     v_bits >>= v_table_entry_n_bits;
@@ -23146,9 +23152,18 @@
       self->private_impl.f_end_of_block = true;
       goto label__loop__break;
     } else if ((v_table_entry >> 28) != 0) {
+      if (v_n_bits < 15) {
+        v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
+        iop_a_src += 1;
+        v_n_bits += 8;
+        v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
+        iop_a_src += 1;
+        v_n_bits += 8;
+      } else {
+      }
       v_redir_top = ((v_table_entry >> 8) & 65535);
       v_redir_mask = ((((uint32_t)(1)) << ((v_table_entry >> 4) & 15)) - 1);
-      v_table_entry = self->private_data.f_huffs[0][((v_redir_top + (((uint32_t)((v_bits & 4294967295))) & v_redir_mask)) & 1023)];
+      v_table_entry = self->private_data.f_huffs[0][((v_redir_top + (v_bits & v_redir_mask)) & 1023)];
       v_table_entry_n_bits = (v_table_entry & 15);
       v_bits >>= v_table_entry_n_bits;
       v_n_bits -= v_table_entry_n_bits;
@@ -23179,21 +23194,50 @@
     v_length = (((v_table_entry >> 8) & 255) + 3);
     v_table_entry_n_bits = ((v_table_entry >> 4) & 15);
     if (v_table_entry_n_bits > 0) {
-      v_length = (((v_length + 253 + ((uint32_t)(((v_bits) & WUFFS_BASE__LOW_BITS_MASK__U64(v_table_entry_n_bits))))) & 255) + 3);
+      if (v_n_bits < 15) {
+        v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
+        iop_a_src += 1;
+        v_n_bits += 8;
+        v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
+        iop_a_src += 1;
+        v_n_bits += 8;
+      } else {
+      }
+      v_length = (((v_length + 253 + ((v_bits) & WUFFS_BASE__LOW_BITS_MASK__U32(v_table_entry_n_bits))) & 255) + 3);
       v_bits >>= v_table_entry_n_bits;
       v_n_bits -= v_table_entry_n_bits;
+    } else {
+    }
+    if (v_n_bits < 15) {
+      v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
+      iop_a_src += 1;
+      v_n_bits += 8;
+      v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
+      iop_a_src += 1;
+      v_n_bits += 8;
+    } else {
     }
     v_table_entry = self->private_data.f_huffs[1][(v_bits & v_dmask)];
     v_table_entry_n_bits = (v_table_entry & 15);
     v_bits >>= v_table_entry_n_bits;
     v_n_bits -= v_table_entry_n_bits;
     if ((v_table_entry >> 28) == 1) {
+      if (v_n_bits < 15) {
+        v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
+        iop_a_src += 1;
+        v_n_bits += 8;
+        v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
+        iop_a_src += 1;
+        v_n_bits += 8;
+      } else {
+      }
       v_redir_top = ((v_table_entry >> 8) & 65535);
       v_redir_mask = ((((uint32_t)(1)) << ((v_table_entry >> 4) & 15)) - 1);
-      v_table_entry = self->private_data.f_huffs[1][((v_redir_top + (((uint32_t)((v_bits & 4294967295))) & v_redir_mask)) & 1023)];
+      v_table_entry = self->private_data.f_huffs[1][((v_redir_top + (v_bits & v_redir_mask)) & 1023)];
       v_table_entry_n_bits = (v_table_entry & 15);
       v_bits >>= v_table_entry_n_bits;
       v_n_bits -= v_table_entry_n_bits;
+    } else {
     }
     if ((v_table_entry >> 24) != 64) {
       if ((v_table_entry >> 24) == 8) {
@@ -23205,7 +23249,15 @@
     }
     v_dist_minus_1 = ((v_table_entry >> 8) & 32767);
     v_table_entry_n_bits = ((v_table_entry >> 4) & 15);
-    v_dist_minus_1 = ((v_dist_minus_1 + ((uint32_t)(((v_bits) & WUFFS_BASE__LOW_BITS_MASK__U64(v_table_entry_n_bits))))) & 32767);
+    if (v_n_bits < v_table_entry_n_bits) {
+      v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
+      iop_a_src += 1;
+      v_n_bits += 8;
+      v_bits |= (((uint32_t)(wuffs_base__peek_u8be__no_bounds_check(iop_a_src))) << v_n_bits);
+      iop_a_src += 1;
+      v_n_bits += 8;
+    }
+    v_dist_minus_1 = ((v_dist_minus_1 + ((v_bits) & WUFFS_BASE__LOW_BITS_MASK__U32(v_table_entry_n_bits))) & 32767);
     v_bits >>= v_table_entry_n_bits;
     v_n_bits -= v_table_entry_n_bits;
     while (true) {
@@ -23242,10 +23294,6 @@
     label__0__break:;
   }
   label__loop__break:;
-  if (v_n_bits > 63) {
-    status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_n_bits);
-    goto exit;
-  }
   while (v_n_bits >= 8) {
     v_n_bits -= 8;
     if (iop_a_src > io1_a_src) {
@@ -23255,7 +23303,7 @@
       goto exit;
     }
   }
-  self->private_impl.f_bits = ((uint32_t)((v_bits & ((((uint64_t)(1)) << v_n_bits) - 1))));
+  self->private_impl.f_bits = (v_bits & ((((uint32_t)(1)) << v_n_bits) - 1));
   self->private_impl.f_n_bits = v_n_bits;
   if ((self->private_impl.f_n_bits >= 8) || ((self->private_impl.f_bits >> self->private_impl.f_n_bits) != 0)) {
     status = wuffs_base__make_status(wuffs_deflate__error__internal_error_inconsistent_n_bits);
diff --git a/std/deflate/decode_huffman_fast32.wuffs b/std/deflate/decode_huffman_fast32.wuffs
index cbeb691..d75ee37 100644
--- a/std/deflate/decode_huffman_fast32.wuffs
+++ b/std/deflate/decode_huffman_fast32.wuffs
@@ -20,12 +20,12 @@
 	// decode_huffman_*.wuffs files as small as possible, while retaining both
 	// correctness and performance.
 
-	var bits               : base.u64
+	var bits               : base.u32
 	var n_bits             : base.u32
 	var table_entry        : base.u32
 	var table_entry_n_bits : base.u32[..= 15]
-	var lmask              : base.u64[..= 511]
-	var dmask              : base.u64[..= 511]
+	var lmask              : base.u32[..= 511]
+	var dmask              : base.u32[..= 511]
 	var redir_top          : base.u32[..= 0xFFFF]
 	var redir_mask         : base.u32[..= 0x7FFF]
 	var length             : base.u32[..= 258]
@@ -37,14 +37,14 @@
 		return "#internal error: inconsistent n_bits"
 	}
 
-	bits = this.bits as base.u64
+	bits = this.bits
 	n_bits = this.n_bits
 
-	lmask = ((1 as base.u64) << this.n_huffs_bits[0]) - 1
-	dmask = ((1 as base.u64) << this.n_huffs_bits[1]) - 1
+	lmask = ((1 as base.u32) << this.n_huffs_bits[0]) - 1
+	dmask = ((1 as base.u32) << this.n_huffs_bits[1]) - 1
 
 	// Check up front, on each iteration, that we have enough buffer space to
-	// both read (8 bytes) and write (258 bytes) as much as we need to. Doing
+	// both read (12 bytes) and write (258 bytes) as much as we need to. Doing
 	// this check once (per iteration), up front, removes the need to check
 	// multiple times inside the loop body, so it's faster overall.
 	//
@@ -55,28 +55,45 @@
 	// For reading, strictly speaking, we only need 6 bytes (48 bits) of
 	// available input, because the H-L Literal/Length code is up to 15 bits
 	// plus up to 5 extra bits, the H-D Distance code is up to 15 bits plus up
-	// to 13 extra bits and 15 + 5 + 15 + 13 == 48. However, it's faster to
-	// read 64 bits than 48 bits or (48 - n_bits) bits.
-	while.loop(args.dst.length() >= 258) and (args.src.length() >= 8) {
-		// Ensure that we have at least 56 bits of input.
-		//
-		// This is "Variant 4" of
-		// https://fgiesen.wordpress.com/2018/02/20/reading-bits-in-far-too-many-ways-part-2/
-		//
-		// 56, the number of bits in 7 bytes, is a property of that "Variant 4"
-		// bit-reading technique, and not related to the DEFLATE format per se.
-		//
-		// The "& 63" is a no-op, and not part of the original "Variant 4"
-		// technique, but satisfies Wuffs' overflow/underflow checks.
-		bits |= args.src.peek_u64le() ~mod<< (n_bits & 63)
-		args.src.skip_u32_fast!(actual: (63 - (n_bits & 63)) >> 3, worst_case: 8)
-		n_bits |= 56
+	// to 13 extra bits and 15 + 5 + 15 + 13 == 48. However,
+	// args.src.length() assertions are made in terms of bytes, not bits.
+	//
+	// In theory, we could write assertions that mixed bytes and bits, such as:
+	//
+	// assert ((args.src.length() >= 6) and (n_bits >=  5)) or
+	//        ((args.src.length() >= 4) and (n_bits >= 21))
+	//
+	// but that looks clumsy and makes the proofs more complicated. Instead, we
+	// are conservative and work in terms of bytes only. In bytes, each code
+	// requires either one (direct) or two (redirect) lookups, up to 2 bytes
+	// for each. There are also up to 13 extra bits, or 2 bytes. Thus, each
+	// code requires up to 2 + 2 + 2 == 6 bytes, and there are two codes (one
+	// from H-L and one from H-D). Therefore, we check for at least 12 bytes.
+	while.loop(args.dst.length() >= 258) and (args.src.length() >= 12) {
+		// Ensure that we have at least 15 bits of input.
+		if n_bits < 15 {
+			assert n_bits >= 0  // TODO: this shouldn't be necessary.
+			bits |= args.src.peek_u8_as_u32() << n_bits
+			args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+			n_bits += 8
+			bits |= args.src.peek_u8_as_u32() << n_bits
+			args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+			n_bits += 8
+			assert n_bits >= 15
+		} else {
+			assert args.src.length() >= 10
+		}
+		// These assertions are redundant, but are listed explicitly for
+		// clarity. They must hold regardless of which branch was taken for the
+		// if statement above.
+		assert args.src.length() >= 10
+		assert n_bits >= 15
 
 		// Decode an lcode symbol from H-L.
 		table_entry = this.huffs[0][bits & lmask]
 		table_entry_n_bits = table_entry & 0x0F
 		bits >>= table_entry_n_bits
-		n_bits ~mod-= table_entry_n_bits
+		n_bits -= table_entry_n_bits
 
 		if (table_entry >> 31) <> 0 {
 			// Literal.
@@ -84,19 +101,37 @@
 			continue.loop
 		} else if (table_entry >> 30) <> 0 {
 			// No-op; code continues past the if-else chain.
+			assert args.src.length() >= 8
 		} else if (table_entry >> 29) <> 0 {
 			// End of block.
 			this.end_of_block = true
 			break.loop
 		} else if (table_entry >> 28) <> 0 {
 			// Redirect.
+
+			// Ensure that we have at least 15 bits of input.
+			if n_bits < 15 {
+				assert n_bits >= 0  // TODO: this shouldn't be necessary.
+				bits |= args.src.peek_u8_as_u32() << n_bits
+				args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+				n_bits += 8
+				bits |= args.src.peek_u8_as_u32() << n_bits
+				args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+				n_bits += 8
+				assert n_bits >= 15
+			} else {
+				assert args.src.length() >= 8
+			}
+			// Once again, redundant but explicit assertions.
+			assert args.src.length() >= 8
+			assert n_bits >= 15
+
 			redir_top = (table_entry >> 8) & 0xFFFF
 			redir_mask = ((1 as base.u32) << ((table_entry >> 4) & 0x0F)) - 1
-			table_entry = this.huffs[0][
-				(redir_top + (((bits & 0xFFFF_FFFF) as base.u32) & redir_mask)) & HUFFS_TABLE_MASK]
+			table_entry = this.huffs[0][(redir_top + (bits & redir_mask)) & HUFFS_TABLE_MASK]
 			table_entry_n_bits = table_entry & 0x0F
 			bits >>= table_entry_n_bits
-			n_bits ~mod-= table_entry_n_bits
+			n_bits -= table_entry_n_bits
 
 			if (table_entry >> 31) <> 0 {
 				// Literal.
@@ -116,6 +151,9 @@
 				return "#internal error: inconsistent Huffman decoder state"
 			}
 
+			// Once again, redundant but explicit assertions.
+			assert args.src.length() >= 8
+
 		} else if (table_entry >> 27) <> 0 {
 			return "#bad Huffman code"
 		} else {
@@ -129,28 +167,83 @@
 		length = ((table_entry >> 8) & 0xFF) + 3
 		table_entry_n_bits = (table_entry >> 4) & 0x0F
 		if table_entry_n_bits > 0 {
+			// Ensure that we have at least 15 bits of input.
+			if n_bits < 15 {
+				assert n_bits >= 0  // TODO: this shouldn't be necessary.
+				bits |= args.src.peek_u8_as_u32() << n_bits
+				args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+				n_bits += 8
+				bits |= args.src.peek_u8_as_u32() << n_bits
+				args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+				n_bits += 8
+				assert n_bits >= 15
+			} else {
+				assert args.src.length() >= 6
+			}
+			// Once again, redundant but explicit assertions.
+			assert args.src.length() >= 6
+			assert n_bits >= 15
+
 			// The "+ 253" is the same as "- 3", after the "& 0xFF", but the
 			// plus form won't require an underflow check.
-			length = ((length + 253 + (bits.low_bits(n: table_entry_n_bits) as base.u32)) & 0xFF) + 3
+			length = ((length + 253 + bits.low_bits(n: table_entry_n_bits)) & 0xFF) + 3
 			bits >>= table_entry_n_bits
-			n_bits ~mod-= table_entry_n_bits
+			n_bits -= table_entry_n_bits
+
+		} else {
+			assert args.src.length() >= 6
 		}
 
+		// Ensure that we have at least 15 bits of input.
+		if n_bits < 15 {
+			assert n_bits >= 0  // TODO: this shouldn't be necessary.
+			bits |= args.src.peek_u8_as_u32() << n_bits
+			args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+			n_bits += 8
+			bits |= args.src.peek_u8_as_u32() << n_bits
+			args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+			n_bits += 8
+			assert n_bits >= 15
+		} else {
+			assert args.src.length() >= 4
+		}
+		// Once again, redundant but explicit assertions.
+		assert args.src.length() >= 4
+		assert n_bits >= 15
+
 		// Decode a dcode symbol from H-D.
 		table_entry = this.huffs[1][bits & dmask]
 		table_entry_n_bits = table_entry & 15
 		bits >>= table_entry_n_bits
-		n_bits ~mod-= table_entry_n_bits
+		n_bits -= table_entry_n_bits
 
 		// Check for a redirect.
 		if (table_entry >> 28) == 1 {
+			// Ensure that we have at least 15 bits of input.
+			if n_bits < 15 {
+				assert n_bits >= 0  // TODO: this shouldn't be necessary.
+				bits |= args.src.peek_u8_as_u32() << n_bits
+				args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+				n_bits += 8
+				bits |= args.src.peek_u8_as_u32() << n_bits
+				args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+				n_bits += 8
+				assert n_bits >= 15
+			} else {
+				assert args.src.length() >= 2
+			}
+			// Once again, redundant but explicit assertions.
+			assert args.src.length() >= 2
+			assert n_bits >= 15
+
 			redir_top = (table_entry >> 8) & 0xFFFF
 			redir_mask = ((1 as base.u32) << ((table_entry >> 4) & 0x0F)) - 1
-			table_entry = this.huffs[1][
-				(redir_top + (((bits & 0xFFFF_FFFF) as base.u32) & redir_mask)) & HUFFS_TABLE_MASK]
+			table_entry = this.huffs[1][(redir_top + (bits & redir_mask)) & HUFFS_TABLE_MASK]
 			table_entry_n_bits = table_entry & 0x0F
 			bits >>= table_entry_n_bits
-			n_bits ~mod-= table_entry_n_bits
+			n_bits -= table_entry_n_bits
+		} else {
+			assert args.src.length() >= 2
 		}
 
 		// For H-D, all symbols should be base_number + extra_bits.
@@ -169,10 +262,25 @@
 		// undoing that bias makes proving (dist_minus_1 + 1) > 0 trivial.
 		dist_minus_1 = (table_entry >> 8) & 0x7FFF
 		table_entry_n_bits = (table_entry >> 4) & 0x0F
+		// Ensure that we have at least table_entry_n_bits bits of input.
+		if n_bits < table_entry_n_bits {
+			assert n_bits < 15 via "a < b: a < c; c <= b"(c: table_entry_n_bits)
+			assert n_bits >= 0  // TODO: this shouldn't be necessary.
+			bits |= args.src.peek_u8_as_u32() << n_bits
+			args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+			n_bits += 8
+			bits |= args.src.peek_u8_as_u32() << n_bits
+			args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+			n_bits += 8
+			assert table_entry_n_bits <= n_bits via "a <= b: a <= c; c <= b"(c: 16)
+			assert n_bits >= table_entry_n_bits via "a >= b: b <= a"()
+		}
+		// Once again, redundant but explicit assertions.
+		assert n_bits >= table_entry_n_bits
 
-		dist_minus_1 = (dist_minus_1 + (bits.low_bits(n: table_entry_n_bits) as base.u32)) & 0x7FFF
+		dist_minus_1 = (dist_minus_1 + bits.low_bits(n: table_entry_n_bits)) & 0x7FFF
 		bits >>= table_entry_n_bits
-		n_bits ~mod-= table_entry_n_bits
+		n_bits -= table_entry_n_bits
 
 		// The "while true { etc; break }" is a redundant version of "etc", but
 		// its presence minimizes the diff between decode_huffman_fastxx and
@@ -248,9 +356,6 @@
 	// mean that the (possibly different) args.src is no longer rewindable,
 	// even if conceptually, this function was responsible for reading the
 	// bytes we want to rewind.
-	if n_bits > 63 {
-		return "#internal error: inconsistent n_bits"
-	}
 	while n_bits >= 8,
 		post n_bits < 8,
 	{
@@ -262,7 +367,7 @@
 		}
 	} endwhile
 
-	this.bits = (bits & (((1 as base.u64) << n_bits) - 1)) as base.u32
+	this.bits = bits & (((1 as base.u32) << n_bits) - 1)
 	this.n_bits = n_bits
 
 	if (this.n_bits >= 8) or ((this.bits >> this.n_bits) <> 0) {