Add base.arm_crc32_u32 type

On a Raspberry Pi 4 (32-bit armv7l) with -march=native and -mfpu=neon
("native" means "armv8-a+crc+simd"):

wuffs_crc32_ieee_10k/clang9   825MB/s ± 0%  6802MB/s ± 0%  +724.00%  (p=0.008 n=5+5)
wuffs_crc32_ieee_100k/clang9  837MB/s ± 0%  6199MB/s ± 0%  +640.25%  (p=0.008 n=5+5)
wuffs_crc32_ieee_10k/gcc8     960MB/s ± 0%  4495MB/s ± 0%  +368.35%  (p=0.016 n=4+5)
wuffs_crc32_ieee_100k/gcc8    961MB/s ± 0%  4447MB/s ± 0%  +363.01%  (p=0.016 n=5+4)
mimic_crc32_ieee_10k          638MB/s ± 0%   631MB/s ± 0%    -1.17%  (p=0.008 n=5+5)
mimic_crc32_ieee_100k         639MB/s ± 0%   639MB/s ± 0%    +0.02%  (p=0.008 n=5+5)

wuffs_gzip_decode_10k/clang9   83.0MB/s ± 0%  90.7MB/s ± 0%   +9.26%  (p=0.008 n=5+5)
wuffs_gzip_decode_100k/clang9   100MB/s ± 0%   112MB/s ± 0%  +11.91%  (p=0.008 n=5+5)
wuffs_gzip_decode_10k/gcc8     86.1MB/s ± 0%  91.9MB/s ± 0%   +6.72%  (p=0.008 n=5+5)
wuffs_gzip_decode_100k/gcc8     102MB/s ± 0%   112MB/s ± 0%   +9.34%  (p=0.008 n=5+5)
mimic_gzip_decode_10k          98.8MB/s ± 0%  98.2MB/s ± 0%   -0.60%  (p=0.008 n=5+5)
mimic_gzip_decode_100k          128MB/s ± 0%   128MB/s ± 0%   -0.08%  (p=0.008 n=5+5)

wuffs_png_decode_image_19k_8bpp/clang9                    67.6MB/s ± 0%  70.8MB/s ± 0%  +4.72%  (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/clang9                   92.8MB/s ± 0%  98.8MB/s ± 0%  +6.49%  (p=0.008 n=5+5)
wuffs_png_decode_image_77k_8bpp/clang9                     255MB/s ± 0%   269MB/s ± 0%  +5.44%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/clang9   123MB/s ± 0%   123MB/s ± 0%  +0.13%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/clang9   110MB/s ± 0%   112MB/s ± 0%  +1.46%  (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/clang9                 88.4MB/s ± 0%  92.8MB/s ± 0%  +4.90%  (p=0.008 n=5+5)
wuffs_png_decode_image_19k_8bpp/gcc8                      67.2MB/s ± 0%  70.8MB/s ± 0%  +5.37%  (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/gcc8                     94.0MB/s ± 0%  98.2MB/s ± 0%  +4.45%  (p=0.008 n=5+5)
wuffs_png_decode_image_77k_8bpp/gcc8                       236MB/s ± 0%   248MB/s ± 0%  +4.84%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/gcc8     114MB/s ± 0%   115MB/s ± 0%  +1.11%  (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/gcc8     102MB/s ± 0%   104MB/s ± 0%  +2.17%  (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/gcc8                   88.8MB/s ± 0%  94.3MB/s ± 0%  +6.21%  (p=0.008 n=5+5)
mimic_png_decode_image_19k_8bpp                           43.7MB/s ± 1%  43.4MB/s ± 1%  ~       (p=0.095 n=5+5)
mimic_png_decode_image_40k_24bpp                          54.6MB/s ± 0%  53.9MB/s ± 0%  -1.25%  (p=0.008 n=5+5)
mimic_png_decode_image_77k_8bpp                            125MB/s ± 0%   124MB/s ± 0%  -0.73%  (p=0.008 n=5+5)
mimic_png_decode_image_552k_32bpp_ignore_checksum         skipped
mimic_png_decode_image_552k_32bpp_verify_checksum          102MB/s ± 0%   101MB/s ± 0%  -0.95%  (p=0.008 n=5+5)
mimic_png_decode_image_4002k_24bpp                        81.3MB/s ± 0%  81.0MB/s ± 0%  -0.46%  (p=0.016 n=4+5)

commit: 84f3592dbe6d392c2b758791ae48ff3d0e82afff [log] [tgz]
author: Nigel Tao <nigeltao@golang.org> Sat Feb 13 22:34:54 2021 +1100
committer: Nigel Tao <nigeltao@golang.org> Sat Feb 13 22:54:47 2021 +1100
tree: 4325c64c3608107881cc4e01f4ba7e6d65d5b22d
parent: 81ba41e2511fc2a5bba87a4de24ca16592417179 [diff]
diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go
index 6b602db..45dbefe 100644
--- a/internal/cgen/builtin.go
+++ b/internal/cgen/builtin.go

@@ -95,6 +95,8 @@
 
 	if qid[1].IsNumType() {
 		return g.writeBuiltinNumType(b, recv, method.Ident(), n.Args(), depth)
+	} else if qid[1].IsBuiltInCPUArch() {
+		return g.writeBuiltinCPUArch(b, recv, method.Ident(), n.Args(), sideEffectsOnly, depth)
 	} else {
 		switch qid[1] {
 		case t.IDIOReader:
@@ -144,8 +146,6 @@
 				b.writes("&empty_io_buffer")
 				return nil
 			}
-		case t.IDX86M128I:
-			return g.writeBuiltinCPUArch(b, recv, method.Ident(), n.Args(), sideEffectsOnly, depth)
 		}
 	}
 	return errNoSuchBuiltin
@@ -524,8 +524,11 @@
 	}
 
 	const create = "create"
-	methodStr := method.Str(g.tm)
-	if strings.HasPrefix(methodStr, create) {
+	if methodStr := method.Str(g.tm); methodStr == "value" {
+		return g.writeExpr(b, recv, false, depth)
+	} else if methodStr == create {
+		return g.writeExpr(b, args[0].AsArg().Value(), false, depth)
+	} else if strings.HasPrefix(methodStr, create) {
 		b.printf("%s(", methodStr[len(create):])
 		for i, o := range args {
 			if i > 0 {
@@ -552,6 +555,10 @@
 			b.writes(after)
 		}
 	} else {
+		armCRC32U32 := recv.MType().Eq(typeExprARMCRC32U32)
+		if armCRC32U32 {
+			b.writeb('_')
+		}
 		b.printf("%s(", methodStr)
 		if err := g.writeExpr(b, recv, false, depth); err != nil {
 			return err
@@ -560,7 +567,9 @@
 			b.writes(", ")
 			after := ""
 			v := o.AsArg().Value()
-			if !v.MType().IsCPUArchType() {
+			if armCRC32U32 {
+				// No-op.
+			} else if !v.MType().IsCPUArchType() {
 				b.writes("(int32_t)(")
 				after = ")"
 			}

diff --git a/internal/cgen/cgen.go b/internal/cgen/cgen.go
index ee3ee7a..492c517 100644
--- a/internal/cgen/cgen.go
+++ b/internal/cgen/cgen.go

@@ -43,6 +43,7 @@
 
 	maxInt64 = big.NewInt((1 << 63) - 1)
 
+	typeExprARMCRC32U32   = a.NewTypeExpr(0, t.IDBase, t.IDARMCRC32U32, nil, nil, nil)
 	typeExprPixelSwizzler = a.NewTypeExpr(0, t.IDBase, t.IDPixelSwizzler, nil, nil, nil)
 	typeExprUtility       = a.NewTypeExpr(0, t.IDBase, t.IDUtility, nil, nil, nil)
 )

diff --git a/internal/cgen/expr.go b/internal/cgen/expr.go
index 3747fa4..c280ced 100644
--- a/internal/cgen/expr.go
+++ b/internal/cgen/expr.go

@@ -577,7 +577,8 @@
 	t.IDTokenReader: "wuffs_base__token_buffer*",
 	t.IDTokenWriter: "wuffs_base__token_buffer*",
 
-	t.IDX86M128I: "__m128i",
+	t.IDARMCRC32U32: "uint32_t",
+	t.IDX86M128I:    "__m128i",
 }
 
 const noSuchCOperator = " no_such_C_operator "

diff --git a/internal/cgen/var.go b/internal/cgen/var.go
index f59e651..95ddfcf 100644
--- a/internal/cgen/var.go
+++ b/internal/cgen/var.go

@@ -351,6 +351,8 @@
 			b.writes(" = wuffs_base__make_status(NULL);\n")
 		} else if typ.IsIOType() {
 			b.printf(" = &%s%s;\n", uPrefix, name)
+		} else if typ.Eq(typeExprARMCRC32U32) {
+			b.writes(" = 0;\n")
 		} else {
 			b.writes(" = {0};\n")
 		}

diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index 688201d..25d7cf7 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go

@@ -245,6 +245,7 @@
 
 	// ----
 
+	"arm_crc32_u32",
 	"x86_m128i",
 }
 
@@ -531,6 +532,16 @@
 	"pixel_swizzler.swizzle_interleaved_transparent_black!(" +
 		"dst: slice u8, dst_palette: slice u8, num_pixels: u64) u64",
 
+	// ---- arm_crc32_u32
+
+	"arm_crc32_u32.create(a: u32) arm_crc32_u32",
+	"arm_crc32_u32.value() u32",
+
+	"arm_crc32_u32._crc32b(b: u8) arm_crc32_u32",
+	"arm_crc32_u32._crc32h(b: u16) arm_crc32_u32",
+	"arm_crc32_u32._crc32w(b: u32) arm_crc32_u32",
+	"arm_crc32_u32._crc32d(b: u64) arm_crc32_u32",
+
 	// ---- x86_m128i
 
 	"x86_m128i.load_u32!(a: u32)",

diff --git a/lang/check/resolve.go b/lang/check/resolve.go
index c1e2c31..611321b 100644
--- a/lang/check/resolve.go
+++ b/lang/check/resolve.go

@@ -75,7 +75,8 @@
 
 	typeExprDecodeFrameOptions = a.NewTypeExpr(0, t.IDBase, t.IDDecodeFrameOptions, nil, nil, nil)
 
-	typeExprX86M128I = a.NewTypeExpr(0, t.IDBase, t.IDX86M128I, nil, nil, nil)
+	typeExprARMCRC32U32 = a.NewTypeExpr(0, t.IDBase, t.IDARMCRC32U32, nil, nil, nil)
+	typeExprX86M128I    = a.NewTypeExpr(0, t.IDBase, t.IDX86M128I, nil, nil, nil)
 
 	typeExprSliceU8 = a.NewTypeExpr(t.IDSlice, 0, 0, nil, nil, typeExprU8)
 	typeExprTableU8 = a.NewTypeExpr(t.IDTable, 0, 0, nil, nil, typeExprU8)
@@ -125,7 +126,8 @@
 
 	t.IDDecodeFrameOptions: typeExprDecodeFrameOptions,
 
-	t.IDX86M128I: typeExprX86M128I,
+	t.IDARMCRC32U32: typeExprARMCRC32U32,
+	t.IDX86M128I:    typeExprX86M128I,
 }
 
 func (c *Checker) parseBuiltInFuncs(m map[t.QQID]*a.Func, ss []string) error {

diff --git a/lang/check/type.go b/lang/check/type.go
index 90a2875..f444972 100644
--- a/lang/check/type.go
+++ b/lang/check/type.go

@@ -25,7 +25,9 @@
 type cpuArchBits uint32
 
 const (
-	cpuArchBitsX86SSE42 = cpuArchBits(0x00000001)
+	cpuArchBitsARMCRC32 = cpuArchBits(0x00000001)
+	cpuArchBitsARMNeon  = cpuArchBits(0x00000002)
+	cpuArchBitsX86SSE42 = cpuArchBits(0x00000004)
 )
 
 func calcCPUArchBits(n *a.Func) (ret cpuArchBits) {
@@ -35,6 +37,10 @@
 			continue
 		}
 		switch o.Condition().RHS().AsExpr().Ident() {
+		case t.IDARMCRC32:
+			ret |= cpuArchBitsARMCRC32
+		case t.IDARMNeon:
+			ret |= cpuArchBitsARMNeon
 		case t.IDX86SSE42:
 			ret |= cpuArchBitsX86SSE42
 		}
@@ -46,6 +52,8 @@
 	if qid := typ.Innermost().QID(); qid[0] == t.IDBase {
 		need := cpuArchBits(0)
 		switch qid[1] {
+		case t.IDARMCRC32U32:
+			need = cpuArchBitsARMCRC32
 		case t.IDX86M128I:
 			need = cpuArchBitsX86SSE42
 		}

diff --git a/lang/token/list.go b/lang/token/list.go
index 6231aba..22664d0 100644
--- a/lang/token/list.go
+++ b/lang/token/list.go

@@ -668,6 +668,8 @@
 	IDARMCRC32 = ID(0x300)
 	IDARMNeon  = ID(0x301)
 
+	IDARMCRC32U32 = ID(0x308)
+
 	IDX86SSE42 = ID(0x320)
 	IDX86AVX2  = ID(0x321)
 
@@ -1087,6 +1089,8 @@
 	IDARMCRC32: "arm_crc32",
 	IDARMNeon:  "arm_neon",
 
+	IDARMCRC32U32: "arm_crc32_u32",
+
 	IDX86SSE42: "x86_sse42",
 	IDX86AVX2:  "x86_avx2",
 

diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index e403bc3..09a2379 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c

@@ -21945,87 +21945,58 @@
 
   v_s = (4294967295 ^ self->private_impl.f_state);
   while ((((uint64_t)(a_x.len)) > 0) && ((15 & ((uint32_t)(0xFFF & (uintptr_t)(a_x.ptr)))) != 0)) {
-    v_s = (WUFFS_CRC32__IEEE_TABLE[0][(((uint8_t)((v_s & 255))) ^ a_x.ptr[0])] ^ (v_s >> 8));
+    v_s = __crc32b(v_s, a_x.ptr[0]);
     a_x = wuffs_base__slice_u8__subslice_i(a_x, 1);
   }
   {
     wuffs_base__slice_u8 i_slice_p = a_x;
     v_p.ptr = i_slice_p.ptr;
-    v_p.len = 16;
-    uint8_t* i_end0_p = v_p.ptr + (((i_slice_p.len - (size_t)(v_p.ptr - i_slice_p.ptr)) / 32) * 32);
+    v_p.len = 8;
+    uint8_t* i_end0_p = v_p.ptr + (((i_slice_p.len - (size_t)(v_p.ptr - i_slice_p.ptr)) / 128) * 128);
     while (v_p.ptr < i_end0_p) {
-      v_s ^= ((((uint32_t)(v_p.ptr[0])) << 0) |
-          (((uint32_t)(v_p.ptr[1])) << 8) |
-          (((uint32_t)(v_p.ptr[2])) << 16) |
-          (((uint32_t)(v_p.ptr[3])) << 24));
-      v_s = (WUFFS_CRC32__IEEE_TABLE[0][v_p.ptr[15]] ^
-          WUFFS_CRC32__IEEE_TABLE[1][v_p.ptr[14]] ^
-          WUFFS_CRC32__IEEE_TABLE[2][v_p.ptr[13]] ^
-          WUFFS_CRC32__IEEE_TABLE[3][v_p.ptr[12]] ^
-          WUFFS_CRC32__IEEE_TABLE[4][v_p.ptr[11]] ^
-          WUFFS_CRC32__IEEE_TABLE[5][v_p.ptr[10]] ^
-          WUFFS_CRC32__IEEE_TABLE[6][v_p.ptr[9]] ^
-          WUFFS_CRC32__IEEE_TABLE[7][v_p.ptr[8]] ^
-          WUFFS_CRC32__IEEE_TABLE[8][v_p.ptr[7]] ^
-          WUFFS_CRC32__IEEE_TABLE[9][v_p.ptr[6]] ^
-          WUFFS_CRC32__IEEE_TABLE[10][v_p.ptr[5]] ^
-          WUFFS_CRC32__IEEE_TABLE[11][v_p.ptr[4]] ^
-          WUFFS_CRC32__IEEE_TABLE[12][(255 & (v_s >> 24))] ^
-          WUFFS_CRC32__IEEE_TABLE[13][(255 & (v_s >> 16))] ^
-          WUFFS_CRC32__IEEE_TABLE[14][(255 & (v_s >> 8))] ^
-          WUFFS_CRC32__IEEE_TABLE[15][(255 & (v_s >> 0))]);
-      v_p.ptr += 16;
-      v_s ^= ((((uint32_t)(v_p.ptr[0])) << 0) |
-          (((uint32_t)(v_p.ptr[1])) << 8) |
-          (((uint32_t)(v_p.ptr[2])) << 16) |
-          (((uint32_t)(v_p.ptr[3])) << 24));
-      v_s = (WUFFS_CRC32__IEEE_TABLE[0][v_p.ptr[15]] ^
-          WUFFS_CRC32__IEEE_TABLE[1][v_p.ptr[14]] ^
-          WUFFS_CRC32__IEEE_TABLE[2][v_p.ptr[13]] ^
-          WUFFS_CRC32__IEEE_TABLE[3][v_p.ptr[12]] ^
-          WUFFS_CRC32__IEEE_TABLE[4][v_p.ptr[11]] ^
-          WUFFS_CRC32__IEEE_TABLE[5][v_p.ptr[10]] ^
-          WUFFS_CRC32__IEEE_TABLE[6][v_p.ptr[9]] ^
-          WUFFS_CRC32__IEEE_TABLE[7][v_p.ptr[8]] ^
-          WUFFS_CRC32__IEEE_TABLE[8][v_p.ptr[7]] ^
-          WUFFS_CRC32__IEEE_TABLE[9][v_p.ptr[6]] ^
-          WUFFS_CRC32__IEEE_TABLE[10][v_p.ptr[5]] ^
-          WUFFS_CRC32__IEEE_TABLE[11][v_p.ptr[4]] ^
-          WUFFS_CRC32__IEEE_TABLE[12][(255 & (v_s >> 24))] ^
-          WUFFS_CRC32__IEEE_TABLE[13][(255 & (v_s >> 16))] ^
-          WUFFS_CRC32__IEEE_TABLE[14][(255 & (v_s >> 8))] ^
-          WUFFS_CRC32__IEEE_TABLE[15][(255 & (v_s >> 0))]);
-      v_p.ptr += 16;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
     }
-    v_p.len = 16;
-    uint8_t* i_end1_p = v_p.ptr + (((i_slice_p.len - (size_t)(v_p.ptr - i_slice_p.ptr)) / 16) * 16);
+    v_p.len = 8;
+    uint8_t* i_end1_p = v_p.ptr + (((i_slice_p.len - (size_t)(v_p.ptr - i_slice_p.ptr)) / 8) * 8);
     while (v_p.ptr < i_end1_p) {
-      v_s ^= ((((uint32_t)(v_p.ptr[0])) << 0) |
-          (((uint32_t)(v_p.ptr[1])) << 8) |
-          (((uint32_t)(v_p.ptr[2])) << 16) |
-          (((uint32_t)(v_p.ptr[3])) << 24));
-      v_s = (WUFFS_CRC32__IEEE_TABLE[0][v_p.ptr[15]] ^
-          WUFFS_CRC32__IEEE_TABLE[1][v_p.ptr[14]] ^
-          WUFFS_CRC32__IEEE_TABLE[2][v_p.ptr[13]] ^
-          WUFFS_CRC32__IEEE_TABLE[3][v_p.ptr[12]] ^
-          WUFFS_CRC32__IEEE_TABLE[4][v_p.ptr[11]] ^
-          WUFFS_CRC32__IEEE_TABLE[5][v_p.ptr[10]] ^
-          WUFFS_CRC32__IEEE_TABLE[6][v_p.ptr[9]] ^
-          WUFFS_CRC32__IEEE_TABLE[7][v_p.ptr[8]] ^
-          WUFFS_CRC32__IEEE_TABLE[8][v_p.ptr[7]] ^
-          WUFFS_CRC32__IEEE_TABLE[9][v_p.ptr[6]] ^
-          WUFFS_CRC32__IEEE_TABLE[10][v_p.ptr[5]] ^
-          WUFFS_CRC32__IEEE_TABLE[11][v_p.ptr[4]] ^
-          WUFFS_CRC32__IEEE_TABLE[12][(255 & (v_s >> 24))] ^
-          WUFFS_CRC32__IEEE_TABLE[13][(255 & (v_s >> 16))] ^
-          WUFFS_CRC32__IEEE_TABLE[14][(255 & (v_s >> 8))] ^
-          WUFFS_CRC32__IEEE_TABLE[15][(255 & (v_s >> 0))]);
-      v_p.ptr += 16;
+      v_s = __crc32d(v_s, wuffs_base__peek_u64le__no_bounds_check(v_p.ptr));
+      v_p.ptr += 8;
     }
     v_p.len = 1;
     uint8_t* i_end2_p = i_slice_p.ptr + i_slice_p.len;
     while (v_p.ptr < i_end2_p) {
-      v_s = (WUFFS_CRC32__IEEE_TABLE[0][(((uint8_t)((v_s & 255))) ^ v_p.ptr[0])] ^ (v_s >> 8));
+      v_s = __crc32b(v_s, v_p.ptr[0]);
       v_p.ptr += 1;
     }
     v_p.len = 0;

diff --git a/std/crc32/common_crc32.wuffs b/std/crc32/common_crc32.wuffs
index 02d6293..ee3470e 100644
--- a/std/crc32/common_crc32.wuffs
+++ b/std/crc32/common_crc32.wuffs

@@ -72,45 +72,24 @@
 pri func ieee_hasher.up_arm_crc32!(x: slice base.u8),
 	choose cpu_arch >= arm_crc32,
 {
-	var s : base.u32
+	var s : base.arm_crc32_u32
 	var p : slice base.u8
 
-	s = 0xFFFF_FFFF ^ this.state
+	s = s.create(a: 0xFFFF_FFFF ^ this.state)
 
 	// Align to a 16-byte boundary.
 	while (args.x.length() > 0) and ((15 & args.x.uintptr_low_12_bits()) <> 0) {
-		s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ args.x[0]] ^ (s >> 8)
+		s = s._crc32b(b: args.x[0])
 		args.x = args.x[1 ..]
 	} endwhile
 
-	// See "Multi-Byte Lookup Tables" in std/crc32/README.md for more detail on
-	// the slicing-by-M algorithm. We use an M of 16.
-	iterate (p = args.x)(length: 16, advance: 16, unroll: 2) {
-		s ^= ((p[0x00] as base.u32) << 0) |
-			((p[0x01] as base.u32) << 8) |
-			((p[0x02] as base.u32) << 16) |
-			((p[0x03] as base.u32) << 24)
-		s = IEEE_TABLE[0x00][p[0x0F]] ^
-			IEEE_TABLE[0x01][p[0x0E]] ^
-			IEEE_TABLE[0x02][p[0x0D]] ^
-			IEEE_TABLE[0x03][p[0x0C]] ^
-			IEEE_TABLE[0x04][p[0x0B]] ^
-			IEEE_TABLE[0x05][p[0x0A]] ^
-			IEEE_TABLE[0x06][p[0x09]] ^
-			IEEE_TABLE[0x07][p[0x08]] ^
-			IEEE_TABLE[0x08][p[0x07]] ^
-			IEEE_TABLE[0x09][p[0x06]] ^
-			IEEE_TABLE[0x0A][p[0x05]] ^
-			IEEE_TABLE[0x0B][p[0x04]] ^
-			IEEE_TABLE[0x0C][0xFF & (s >> 24)] ^
-			IEEE_TABLE[0x0D][0xFF & (s >> 16)] ^
-			IEEE_TABLE[0x0E][0xFF & (s >> 8)] ^
-			IEEE_TABLE[0x0F][0xFF & (s >> 0)]
+	iterate (p = args.x)(length: 8, advance: 8, unroll: 16) {
+		s = s._crc32d(b: p.peek_u64le())
 	} else (length: 1, advance: 1, unroll: 1) {
-		s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
+		s = s._crc32b(b: p[0])
 	}
 
-	this.state = 0xFFFF_FFFF ^ s
+	this.state = 0xFFFF_FFFF ^ s.value()
 }
 
 // The table below was created by script/print-crc32-magic-numbers.go.
commit	84f3592dbe6d392c2b758791ae48ff3d0e82afff	[log] [tgz]
author	Nigel Tao <nigeltao@golang.org>	Sat Feb 13 22:34:54 2021 +1100
committer	Nigel Tao <nigeltao@golang.org>	Sat Feb 13 22:54:47 2021 +1100
tree	4325c64c3608107881cc4e01f4ba7e6d65d5b22d
parent	81ba41e2511fc2a5bba87a4de24ca16592417179 [diff]