Add base.arm_crc32_u32 type

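On a Raspberry Pi 4 (32-bit armv7l) with -march=native and -mfpu=neon
("native" means "armv8-a+crc+simd"), the benchmark results below list, for
each benchmark name, the old speed, the new speed and the relative change: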

wuffs_crc32_ieee_10k/clang9                                825MB/s ± 0%  6802MB/s ± 0%  +724.00%  (p=0.008 n=5+5)
wuffs_crc32_ieee_100k/clang9                               837MB/s ± 0%  6199MB/s ± 0%  +640.25%  (p=0.008 n=5+5)

wuffs_crc32_ieee_10k/gcc8                                  960MB/s ± 0%  4495MB/s ± 0%  +368.35%  (p=0.016 n=4+5)
wuffs_crc32_ieee_100k/gcc8                                 961MB/s ± 0%  4447MB/s ± 0%  +363.01%  (p=0.016 n=5+4)

mimic_crc32_ieee_10k                                       638MB/s ± 0%   631MB/s ± 0%    -1.17%  (p=0.008 n=5+5)
mimic_crc32_ieee_100k                                      639MB/s ± 0%   639MB/s ± 0%    +0.02%  (p=0.008 n=5+5)

wuffs_gzip_decode_10k/clang9                              83.0MB/s ± 0%  90.7MB/s ± 0%    +9.26%  (p=0.008 n=5+5)
wuffs_gzip_decode_100k/clang9                              100MB/s ± 0%   112MB/s ± 0%   +11.91%  (p=0.008 n=5+5)

wuffs_gzip_decode_10k/gcc8                                86.1MB/s ± 0%  91.9MB/s ± 0%    +6.72%  (p=0.008 n=5+5)
wuffs_gzip_decode_100k/gcc8                                102MB/s ± 0%   112MB/s ± 0%    +9.34%  (p=0.008 n=5+5)

mimic_gzip_decode_10k                                     98.8MB/s ± 0%  98.2MB/s ± 0%    -0.60%  (p=0.008 n=5+5)
mimic_gzip_decode_100k                                     128MB/s ± 0%   128MB/s ± 0%    -0.08%  (p=0.008 n=5+5)

wuffs_png_decode_image_19k_8bpp/clang9                    67.6MB/s ± 0%  70.8MB/s ± 0%    +4.72%  (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/clang9                   92.8MB/s ± 0%  98.8MB/s ± 0%    +6.49%  (p=0.008 n=5+5)
wuffs_png_decode_image_77k_8bpp/clang9                     255MB/s ± 0%   269MB/s ± 0%    +5.44%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/clang9   123MB/s ± 0%   123MB/s ± 0%    +0.13%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/clang9   110MB/s ± 0%   112MB/s ± 0%    +1.46%  (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/clang9                 88.4MB/s ± 0%  92.8MB/s ± 0%    +4.90%  (p=0.008 n=5+5)

wuffs_png_decode_image_19k_8bpp/gcc8                      67.2MB/s ± 0%  70.8MB/s ± 0%    +5.37%  (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/gcc8                     94.0MB/s ± 0%  98.2MB/s ± 0%    +4.45%  (p=0.008 n=5+5)
wuffs_png_decode_image_77k_8bpp/gcc8                       236MB/s ± 0%   248MB/s ± 0%    +4.84%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/gcc8     114MB/s ± 0%   115MB/s ± 0%    +1.11%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/gcc8     102MB/s ± 0%   104MB/s ± 0%    +2.17%  (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/gcc8                   88.8MB/s ± 0%  94.3MB/s ± 0%    +6.21%  (p=0.008 n=5+5)

mimic_png_decode_image_19k_8bpp                           43.7MB/s ± 1%  43.4MB/s ± 1%      ~     (p=0.095 n=5+5)
mimic_png_decode_image_40k_24bpp                          54.6MB/s ± 0%  53.9MB/s ± 0%    -1.25%  (p=0.008 n=5+5)
mimic_png_decode_image_77k_8bpp                            125MB/s ± 0%   124MB/s ± 0%    -0.73%  (p=0.008 n=5+5)
mimic_png_decode_image_552k_32bpp_ignore_checksum               skipped
mimic_png_decode_image_552k_32bpp_verify_checksum          102MB/s ± 0%   101MB/s ± 0%    -0.95%  (p=0.008 n=5+5)
mimic_png_decode_image_4002k_24bpp                        81.3MB/s ± 0%  81.0MB/s ± 0%    -0.46%  (p=0.016 n=4+5)
diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go
index 6b602db..45dbefe 100644
--- a/internal/cgen/builtin.go
+++ b/internal/cgen/builtin.go
@@ -95,6 +95,8 @@
 
 	if qid[1].IsNumType() {
 		return g.writeBuiltinNumType(b, recv, method.Ident(), n.Args(), depth)
+	} else if qid[1].IsBuiltInCPUArch() {
+		return g.writeBuiltinCPUArch(b, recv, method.Ident(), n.Args(), sideEffectsOnly, depth)
 	} else {
 		switch qid[1] {
 		case t.IDIOReader:
@@ -144,8 +146,6 @@
 				b.writes("&empty_io_buffer")
 				return nil
 			}
-		case t.IDX86M128I:
-			return g.writeBuiltinCPUArch(b, recv, method.Ident(), n.Args(), sideEffectsOnly, depth)
 		}
 	}
 	return errNoSuchBuiltin
@@ -524,8 +524,11 @@
 	}
 
 	const create = "create"
-	methodStr := method.Str(g.tm)
-	if strings.HasPrefix(methodStr, create) {
+	if methodStr := method.Str(g.tm); methodStr == "value" {
+		return g.writeExpr(b, recv, false, depth)
+	} else if methodStr == create {
+		return g.writeExpr(b, args[0].AsArg().Value(), false, depth)
+	} else if strings.HasPrefix(methodStr, create) {
 		b.printf("%s(", methodStr[len(create):])
 		for i, o := range args {
 			if i > 0 {
@@ -552,6 +555,10 @@
 			b.writes(after)
 		}
 	} else {
+		armCRC32U32 := recv.MType().Eq(typeExprARMCRC32U32)
+		if armCRC32U32 {
+			b.writeb('_')
+		}
 		b.printf("%s(", methodStr)
 		if err := g.writeExpr(b, recv, false, depth); err != nil {
 			return err
@@ -560,7 +567,9 @@
 			b.writes(", ")
 			after := ""
 			v := o.AsArg().Value()
-			if !v.MType().IsCPUArchType() {
+			if armCRC32U32 {
+				// No-op.
+			} else if !v.MType().IsCPUArchType() {
 				b.writes("(int32_t)(")
 				after = ")"
 			}
diff --git a/internal/cgen/cgen.go b/internal/cgen/cgen.go
index ee3ee7a..492c517 100644
--- a/internal/cgen/cgen.go
+++ b/internal/cgen/cgen.go
@@ -43,6 +43,7 @@
 
 	maxInt64 = big.NewInt((1 << 63) - 1)
 
+	typeExprARMCRC32U32   = a.NewTypeExpr(0, t.IDBase, t.IDARMCRC32U32, nil, nil, nil)
 	typeExprPixelSwizzler = a.NewTypeExpr(0, t.IDBase, t.IDPixelSwizzler, nil, nil, nil)
 	typeExprUtility       = a.NewTypeExpr(0, t.IDBase, t.IDUtility, nil, nil, nil)
 )
diff --git a/internal/cgen/expr.go b/internal/cgen/expr.go
index 3747fa4..c280ced 100644
--- a/internal/cgen/expr.go
+++ b/internal/cgen/expr.go
@@ -577,7 +577,8 @@
 	t.IDTokenReader: "wuffs_base__token_buffer*",
 	t.IDTokenWriter: "wuffs_base__token_buffer*",
 
-	t.IDX86M128I: "__m128i",
+	t.IDARMCRC32U32: "uint32_t",
+	t.IDX86M128I:    "__m128i",
 }
 
 const noSuchCOperator = " no_such_C_operator "
diff --git a/internal/cgen/var.go b/internal/cgen/var.go
index f59e651..95ddfcf 100644
--- a/internal/cgen/var.go
+++ b/internal/cgen/var.go
@@ -351,6 +351,8 @@
 			b.writes(" = wuffs_base__make_status(NULL);\n")
 		} else if typ.IsIOType() {
 			b.printf(" = &%s%s;\n", uPrefix, name)
+		} else if typ.Eq(typeExprARMCRC32U32) {
+			b.writes(" = 0;\n")
 		} else {
 			b.writes(" = {0};\n")
 		}
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index 688201d..25d7cf7 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -245,6 +245,7 @@
 
 	// ----
 
+	"arm_crc32_u32",
 	"x86_m128i",
 }
 
@@ -531,6 +532,16 @@
 	"pixel_swizzler.swizzle_interleaved_transparent_black!(" +
 		"dst: slice u8, dst_palette: slice u8, num_pixels: u64) u64",
 
+	// ---- arm_crc32_u32
+
+	"arm_crc32_u32.create(a: u32) arm_crc32_u32",
+	"arm_crc32_u32.value() u32",
+
+	"arm_crc32_u32._crc32b(b: u8) arm_crc32_u32",
+	"arm_crc32_u32._crc32h(b: u16) arm_crc32_u32",
+	"arm_crc32_u32._crc32w(b: u32) arm_crc32_u32",
+	"arm_crc32_u32._crc32d(b: u64) arm_crc32_u32",
+
 	// ---- x86_m128i
 
 	"x86_m128i.load_u32!(a: u32)",
diff --git a/lang/check/resolve.go b/lang/check/resolve.go
index c1e2c31..611321b 100644
--- a/lang/check/resolve.go
+++ b/lang/check/resolve.go
@@ -75,7 +75,8 @@
 
 	typeExprDecodeFrameOptions = a.NewTypeExpr(0, t.IDBase, t.IDDecodeFrameOptions, nil, nil, nil)
 
-	typeExprX86M128I = a.NewTypeExpr(0, t.IDBase, t.IDX86M128I, nil, nil, nil)
+	typeExprARMCRC32U32 = a.NewTypeExpr(0, t.IDBase, t.IDARMCRC32U32, nil, nil, nil)
+	typeExprX86M128I    = a.NewTypeExpr(0, t.IDBase, t.IDX86M128I, nil, nil, nil)
 
 	typeExprSliceU8 = a.NewTypeExpr(t.IDSlice, 0, 0, nil, nil, typeExprU8)
 	typeExprTableU8 = a.NewTypeExpr(t.IDTable, 0, 0, nil, nil, typeExprU8)
@@ -125,7 +126,8 @@
 
 	t.IDDecodeFrameOptions: typeExprDecodeFrameOptions,
 
-	t.IDX86M128I: typeExprX86M128I,
+	t.IDARMCRC32U32: typeExprARMCRC32U32,
+	t.IDX86M128I:    typeExprX86M128I,
 }
 
 func (c *Checker) parseBuiltInFuncs(m map[t.QQID]*a.Func, ss []string) error {
diff --git a/lang/check/type.go b/lang/check/type.go
index 90a2875..f444972 100644
--- a/lang/check/type.go
+++ b/lang/check/type.go
@@ -25,7 +25,9 @@
 type cpuArchBits uint32
 
 const (
-	cpuArchBitsX86SSE42 = cpuArchBits(0x00000001)
+	cpuArchBitsARMCRC32 = cpuArchBits(0x00000001)
+	cpuArchBitsARMNeon  = cpuArchBits(0x00000002)
+	cpuArchBitsX86SSE42 = cpuArchBits(0x00000004)
 )
 
 func calcCPUArchBits(n *a.Func) (ret cpuArchBits) {
@@ -35,6 +37,10 @@
 			continue
 		}
 		switch o.Condition().RHS().AsExpr().Ident() {
+		case t.IDARMCRC32:
+			ret |= cpuArchBitsARMCRC32
+		case t.IDARMNeon:
+			ret |= cpuArchBitsARMNeon
 		case t.IDX86SSE42:
 			ret |= cpuArchBitsX86SSE42
 		}
@@ -46,6 +52,8 @@
 	if qid := typ.Innermost().QID(); qid[0] == t.IDBase {
 		need := cpuArchBits(0)
 		switch qid[1] {
+		case t.IDARMCRC32U32:
+			need = cpuArchBitsARMCRC32
 		case t.IDX86M128I:
 			need = cpuArchBitsX86SSE42
 		}
diff --git a/lang/token/list.go b/lang/token/list.go
index 6231aba..22664d0 100644
--- a/lang/token/list.go
+++ b/lang/token/list.go
@@ -668,6 +668,8 @@
 	IDARMCRC32 = ID(0x300)
 	IDARMNeon  = ID(0x301)
 
+	IDARMCRC32U32 = ID(0x308)
+
 	IDX86SSE42 = ID(0x320)
 	IDX86AVX2  = ID(0x321)
 
@@ -1087,6 +1089,8 @@
 	IDARMCRC32: "arm_crc32",
 	IDARMNeon:  "arm_neon",
 
+	IDARMCRC32U32: "arm_crc32_u32",
+
 	IDX86SSE42: "x86_sse42",
 	IDX86AVX2:  "x86_avx2",
 
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index e403bc3..09a2379 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -21945,87 +21945,58 @@
 
   v_s = (4294967295 ^ self->private_impl.f_state);
   while ((((uint64_t)(a_x.len)) > 0) && ((15 & ((uint32_t)(0xFFF & (uintptr_t)(a_x.ptr)))) != 0)) {
-    v_s = (WUFFS_CRC32__IEEE_TABLE[0][(((uint8_t)((v_s & 255))) ^ a_x.ptr[0])] ^ (v_s >> 8));
+    v_s = __crc32b(v_s, a_x.ptr[0]);
     a_x = wuffs_base__slice_u8__subslice_i(a_x, 1);
   }
   {
     wuffs_base__slice_u8 i_slice_p = a_x;
     v_p.ptr = i_slice_p.ptr;
-    v_p.len = 16;
-    uint8_t* i_end0_p = v_p.ptr + (((i_slice_p.len - (size_t)(v_p.ptr - i_slice_p.ptr)) / 32) * 32);
+    v_p.len = 8;
+    uint8_t* i_end0_p = v_p.ptr + (((i_slice_p.len - (size_t)(v_p.ptr - i_slice_p.ptr)) / 128) * 128);
     while (v_p.ptr < i_end0_p) {
-      v_s ^= ((((uint32_t)(v_p.ptr[0])) << 0) |
-          (((uint32_t)(v_p.ptr[1])) << 8) |
-          (((uint32_t)(v_p.ptr[2])) << 16) |
-          (((uint32_t)(v_p.ptr[3])) << 24));
-      v_s = (WUFFS_CRC32__IEEE_TABLE[0][v_p.ptr[15]] ^
-          WUFFS_CRC32__IEEE_TABLE[1][v_p.ptr[14]] ^
-          WUFFS_CRC32__IEEE_TABLE[2][v_p.ptr[13]] ^
-          WUFFS_CRC32__IEEE_TABLE[3][v_p.ptr[12]] ^
-          WUFFS_CRC32__IEEE_TABLE[4][v_p.ptr[11]] ^
-          WUFFS_CRC32__IEEE_TABLE[5][v_p.ptr[10]] ^
-          WUFFS_CRC32__IEEE_TABLE[6][v_p.ptr[9]] ^
-          WUFFS_CRC32__IEEE_TABLE[7][v_p.ptr[8]] ^
-          WUFFS_CRC32__IEEE_TABLE[8][v_p.ptr[7]] ^
-          WUFFS_CRC32__IEEE_TABLE[9][v_p.ptr[6]] ^
-          WUFFS_CRC32__IEEE_TABLE[10][v_p.ptr[5]] ^
-          WUFFS_CRC32__IEEE_TABLE[11][v_p.ptr[4]] ^
-          WUFFS_CRC32__IEEE_TABLE[12][(255 & (v_s >> 24))] ^
-          WUFFS_CRC32__IEEE_TABLE[13][(255 & (v_s >> 16))] ^
-          WUFFS_CRC32__IEEE_TABLE[14][(255 & (v_s >> 8))] ^
-          WUFFS_CRC32__IEEE_TABLE[15][(255 & (v_s >> 0))]);
-      v_p.ptr += 16;
-      v_s ^= ((((uint32_t)(v_p.ptr[0])) << 0) |
-          (((uint32_t)(v_p.ptr[1])) << 8) |
-          (((uint32_t)(v_p.ptr[2])) << 16) |
-          (((uint32_t)(v_p.ptr[3])) << 24));
-      v_s = (WUFFS_CRC32__IEEE_TABLE[0][v_p.ptr[15]] ^
-          WUFFS_CRC32__IEEE_TABLE[1][v_p.ptr[14]] ^
-          WUFFS_CRC32__IEEE_TABLE[2][v_p.ptr[13]] ^
-          WUFFS_CRC32__IEEE_TABLE[3][v_p.ptr[12]] ^
-          WUFFS_CRC32__IEEE_TABLE[4][v_p.ptr[11]] ^
-          WUFFS_CRC32__IEEE_TABLE[5][v_p.ptr[10]] ^
-          WUFFS_CRC32__IEEE_TABLE[6][v_p.ptr[9]] ^
-          WUFFS_CRC32__IEEE_TABLE[7][v_p.ptr[8]] ^
-          WUFFS_CRC32__IEEE_TABLE[8][v_p.ptr[7]] ^
-          WUFFS_CRC32__IEEE_TABLE[9][v_p.ptr[6]] ^
-          WUFFS_CRC32__IEEE_TABLE[10][v_p.ptr[5]] ^
-          WUFFS_CRC32__IEEE_TABLE[11][v_p.ptr[4]] ^
-          WUFFS_CRC32__IEEE_TABLE[12][(255 & (v_s >> 24))] ^
-          WUFFS_CRC32__IEEE_TABLE[13][(255 & (v_s >> 16))] ^
-          WUFFS_CRC32__IEEE_TABLE[14][(255 & (v_s >> 8))] ^
-          WUFFS_CRC32__IEEE_TABLE[15][(255 & (v_s >> 0))]);
-      v_p.ptr += 16;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
     }
-    v_p.len = 16;
-    uint8_t* i_end1_p = v_p.ptr + (((i_slice_p.len - (size_t)(v_p.ptr - i_slice_p.ptr)) / 16) * 16);
+    v_p.len = 8;
+    uint8_t* i_end1_p = v_p.ptr + (((i_slice_p.len - (size_t)(v_p.ptr - i_slice_p.ptr)) / 8) * 8);
     while (v_p.ptr < i_end1_p) {
-      v_s ^= ((((uint32_t)(v_p.ptr[0])) << 0) |
-          (((uint32_t)(v_p.ptr[1])) << 8) |
-          (((uint32_t)(v_p.ptr[2])) << 16) |
-          (((uint32_t)(v_p.ptr[3])) << 24));
-      v_s = (WUFFS_CRC32__IEEE_TABLE[0][v_p.ptr[15]] ^
-          WUFFS_CRC32__IEEE_TABLE[1][v_p.ptr[14]] ^
-          WUFFS_CRC32__IEEE_TABLE[2][v_p.ptr[13]] ^
-          WUFFS_CRC32__IEEE_TABLE[3][v_p.ptr[12]] ^
-          WUFFS_CRC32__IEEE_TABLE[4][v_p.ptr[11]] ^
-          WUFFS_CRC32__IEEE_TABLE[5][v_p.ptr[10]] ^
-          WUFFS_CRC32__IEEE_TABLE[6][v_p.ptr[9]] ^
-          WUFFS_CRC32__IEEE_TABLE[7][v_p.ptr[8]] ^
-          WUFFS_CRC32__IEEE_TABLE[8][v_p.ptr[7]] ^
-          WUFFS_CRC32__IEEE_TABLE[9][v_p.ptr[6]] ^
-          WUFFS_CRC32__IEEE_TABLE[10][v_p.ptr[5]] ^
-          WUFFS_CRC32__IEEE_TABLE[11][v_p.ptr[4]] ^
-          WUFFS_CRC32__IEEE_TABLE[12][(255 & (v_s >> 24))] ^
-          WUFFS_CRC32__IEEE_TABLE[13][(255 & (v_s >> 16))] ^
-          WUFFS_CRC32__IEEE_TABLE[14][(255 & (v_s >> 8))] ^
-          WUFFS_CRC32__IEEE_TABLE[15][(255 & (v_s >> 0))]);
-      v_p.ptr += 16;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
     }
     v_p.len = 1;
     uint8_t* i_end2_p = i_slice_p.ptr + i_slice_p.len;
     while (v_p.ptr < i_end2_p) {
-      v_s = (WUFFS_CRC32__IEEE_TABLE[0][(((uint8_t)((v_s & 255))) ^ v_p.ptr[0])] ^ (v_s >> 8));
+      v_s = __crc32b(v_s, v_p.ptr[0]);
       v_p.ptr += 1;
     }
     v_p.len = 0;
diff --git a/std/crc32/common_crc32.wuffs b/std/crc32/common_crc32.wuffs
index 02d6293..ee3470e 100644
--- a/std/crc32/common_crc32.wuffs
+++ b/std/crc32/common_crc32.wuffs
@@ -72,45 +72,24 @@
 pri func ieee_hasher.up_arm_crc32!(x: slice base.u8),
 	choose cpu_arch >= arm_crc32,
 {
-	var s : base.u32
+	var s : base.arm_crc32_u32
 	var p : slice base.u8
 
-	s = 0xFFFF_FFFF ^ this.state
+	s = s.create(a: 0xFFFF_FFFF ^ this.state)
 
 	// Align to a 16-byte boundary.
 	while (args.x.length() > 0) and ((15 & args.x.uintptr_low_12_bits()) <> 0) {
-		s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ args.x[0]] ^ (s >> 8)
+		s = s._crc32b(b: args.x[0])
 		args.x = args.x[1 ..]
 	} endwhile
 
-	// See "Multi-Byte Lookup Tables" in std/crc32/README.md for more detail on
-	// the slicing-by-M algorithm. We use an M of 16.
-	iterate (p = args.x)(length: 16, advance: 16, unroll: 2) {
-		s ^= ((p[0x00] as base.u32) << 0) |
-			((p[0x01] as base.u32) << 8) |
-			((p[0x02] as base.u32) << 16) |
-			((p[0x03] as base.u32) << 24)
-		s = IEEE_TABLE[0x00][p[0x0F]] ^
-			IEEE_TABLE[0x01][p[0x0E]] ^
-			IEEE_TABLE[0x02][p[0x0D]] ^
-			IEEE_TABLE[0x03][p[0x0C]] ^
-			IEEE_TABLE[0x04][p[0x0B]] ^
-			IEEE_TABLE[0x05][p[0x0A]] ^
-			IEEE_TABLE[0x06][p[0x09]] ^
-			IEEE_TABLE[0x07][p[0x08]] ^
-			IEEE_TABLE[0x08][p[0x07]] ^
-			IEEE_TABLE[0x09][p[0x06]] ^
-			IEEE_TABLE[0x0A][p[0x05]] ^
-			IEEE_TABLE[0x0B][p[0x04]] ^
-			IEEE_TABLE[0x0C][0xFF & (s >> 24)] ^
-			IEEE_TABLE[0x0D][0xFF & (s >> 16)] ^
-			IEEE_TABLE[0x0E][0xFF & (s >> 8)] ^
-			IEEE_TABLE[0x0F][0xFF & (s >> 0)]
+	iterate (p = args.x)(length: 8, advance: 8, unroll: 16) {
+		s = s._crc32d(b: p.peek_u64le())
 	} else (length: 1, advance: 1, unroll: 1) {
-		s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
+		s = s._crc32b(b: p[0])
 	}
 
-	this.state = 0xFFFF_FFFF ^ s
+	this.state = 0xFFFF_FFFF ^ s.value()
 }
 
 // The table below was created by script/print-crc32-magic-numbers.go.