Add crc32.ieee_hasher.up_x86_avx2
name old speed new speed delta
wuffs_crc32_ieee_10k/clang9 8.42GB/s ± 1% 8.12GB/s ± 1% -3.55% (p=0.000 n=9+8)
wuffs_crc32_ieee_100k/clang9 11.0GB/s ± 1% 10.9GB/s ± 1% ~ (p=0.931 n=9+9)
wuffs_crc32_ieee_10k/gcc10 9.20GB/s ± 0% 9.71GB/s ± 0% +5.54% (p=0.000 n=8+10)
wuffs_crc32_ieee_100k/gcc10 11.9GB/s ± 2% 13.1GB/s ± 1% +10.22% (p=0.000 n=10+8)
wuffs_gzip_decode_10k/clang9 219MB/s ± 1% 218MB/s ± 2% ~ (p=0.497 n=9+10)
wuffs_gzip_decode_100k/clang9 283MB/s ± 2% 281MB/s ± 1% ~ (p=0.063 n=10+10)
wuffs_gzip_decode_10k/gcc10 224MB/s ± 1% 223MB/s ± 1% ~ (p=0.489 n=9+9)
wuffs_gzip_decode_100k/gcc10 285MB/s ± 4% 290MB/s ± 0% +1.68% (p=0.004 n=10+8)
wuffs_png_decode_image_19k_8bpp/clang9 138MB/s ± 2% 138MB/s ± 1% ~ (p=0.156 n=10+9)
wuffs_png_decode_image_40k_24bpp/clang9 171MB/s ± 2% 172MB/s ± 1% ~ (p=0.971 n=10+10)
wuffs_png_decode_image_77k_8bpp/clang9 495MB/s ± 4% 504MB/s ± 1% +1.79% (p=0.000 n=9+9)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/clang9 457MB/s ± 4% 463MB/s ± 0% +1.35% (p=0.002 n=9+9)
wuffs_png_decode_image_552k_32bpp_verify_checksum/clang9 435MB/s ± 5% 445MB/s ± 1% +2.31% (p=0.000 n=10+9)
wuffs_png_decode_image_4002k_24bpp/clang9 172MB/s ± 2% 174MB/s ± 1% ~ (p=0.053 n=10+9)
wuffs_png_decode_image_19k_8bpp/gcc10 157MB/s ± 1% 157MB/s ± 1% ~ (p=0.549 n=9+10)
wuffs_png_decode_image_40k_24bpp/gcc10 187MB/s ± 1% 186MB/s ± 1% ~ (p=0.182 n=10+9)
wuffs_png_decode_image_77k_8bpp/gcc10 554MB/s ± 1% 559MB/s ± 0% +0.84% (p=0.000 n=9+9)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/gcc10 502MB/s ± 1% 500MB/s ± 0% -0.38% (p=0.011 n=10+10)
wuffs_png_decode_image_552k_32bpp_verify_checksum/gcc10 481MB/s ± 1% 478MB/s ± 0% -0.46% (p=0.002 n=8+9)
wuffs_png_decode_image_4002k_24bpp/gcc10 189MB/s ± 2% 190MB/s ± 0% +0.62% (p=0.008 n=10+9)
diff --git a/internal/cgen/base/fundamental-public.h b/internal/cgen/base/fundamental-public.h
index d260979..7b714ed 100644
--- a/internal/cgen/base/fundamental-public.h
+++ b/internal/cgen/base/fundamental-public.h
@@ -149,6 +149,33 @@
}
static inline bool //
+wuffs_base__cpu_arch__have_x86_avx2() {
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+ // GCC defines these macros but MSVC does not.
+ // - bit_AVX2 = (1 << 5)
+ const unsigned int avx2_ebx7 = 0x00000020;
+
+ // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).
+#if defined(__GNUC__)
+ unsigned int eax7 = 0;
+ unsigned int ebx7 = 0;
+ unsigned int ecx7 = 0;
+ unsigned int edx7 = 0;
+ if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7)) {
+ return (ebx7 & avx2_ebx7) == avx2_ebx7;
+ }
+#elif defined(_MSC_VER) // defined(__GNUC__)
+ int x[4];
+ __cpuidex(x, 7, 0);
+ return (((unsigned int)(x[1])) & avx2_ebx7) == avx2_ebx7;
+#else
+#error "WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler"
+#endif // defined(__GNUC__); defined(_MSC_VER)
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+ return false;
+}
+
+static inline bool //
wuffs_base__cpu_arch__have_x86_bmi2() {
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
// GCC defines these macros but MSVC does not.
diff --git a/internal/cgen/data/data.go b/internal/cgen/data/data.go
index ada5cde..1806430 100644
--- a/internal/cgen/data/data.go
+++ b/internal/cgen/data/data.go
@@ -60,9 +60,10 @@
"" +
"// --------\n\n// Define WUFFS_CONFIG__STATIC_FUNCTIONS to make all of Wuffs' functions have\n// static storage. The motivation is discussed in the \"ALLOW STATIC\n// IMPLEMENTATION\" section of\n// https://raw.githubusercontent.com/nothings/stb/master/docs/stb_howto.txt\n#if defined(WUFFS_CONFIG__STATIC_FUNCTIONS)\n#define WUFFS_BASE__MAYBE_STATIC static\n#else\n#define WUFFS_BASE__MAYBE_STATIC\n#endif // defined(WUFFS_CONFIG__STATIC_FUNCTIONS)\n\n" +
"" +
- "// ---------------- CPU Architecture\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_arm_crc32() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n return true;\n#else\n return false;\n#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_arm_neon() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n return true;\n#else\n return false;\n#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_x86_bmi2() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n // GCC defines these macros but MSVC does not.\n // - bit_BMI2 = (1 << 8)\n const unsigned int bmi2_ebx7 = 0x00000100;\n\n // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n unsigned int eax7 = 0;\n unsigned int ebx7 = 0;\n unsigned int ecx7 = 0;\n unsigned int edx7 = 0;\n if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7)) {\n return (ebx7 & bmi2_ebx7) == bmi2_ebx7;\n }\n#elif defined(_MSC_VER) // defined(__GNUC__)\n i" +
- "nt x[4];\n __cpuidex(x, 7, 0);\n return (((unsigned int)(x[1])) & bmi2_ebx7) == bmi2_ebx7;\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif // defined(__GNUC__); defined(_MSC_VER)\n#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n return false;\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_x86_sse42() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n // GCC defines these macros but MSVC does not.\n // - bit_PCLMUL = (1 << 1)\n // - bit_POPCNT = (1 << 23)\n // - bit_SSE4_2 = (1 << 20)\n const unsigned int sse42_ecx1 = 0x00900002;\n\n // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n unsigned int eax1 = 0;\n unsigned int ebx1 = 0;\n unsigned int ecx1 = 0;\n unsigned int edx1 = 0;\n if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1)) {\n return (ecx1 & sse42_ecx1) == sse42_ecx1;\n }\n#elif defined(_MSC_VER) // defined(__GNUC__)\n int x[4];\n __cpuid(x, 1);\n return (((unsigned int)(x[2])) & sse42_ecx1) == sse42_ecx1;\n#els" +
- "e\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif // defined(__GNUC__); defined(_MSC_VER)\n#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n return false;\n}\n\n" +
+ "// ---------------- CPU Architecture\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_arm_crc32() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n return true;\n#else\n return false;\n#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_arm_neon() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n return true;\n#else\n return false;\n#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_x86_avx2() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n // GCC defines these macros but MSVC does not.\n // - bit_AVX2 = (1 << 5)\n const unsigned int avx2_ebx7 = 0x00000020;\n\n // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n unsigned int eax7 = 0;\n unsigned int ebx7 = 0;\n unsigned int ecx7 = 0;\n unsigned int edx7 = 0;\n if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7)) {\n return (ebx7 & avx2_ebx7) == avx2_ebx7;\n }\n#elif defined(_MSC_VER) // defined(__GNUC__)\n i" +
+ "nt x[4];\n __cpuidex(x, 7, 0);\n return (((unsigned int)(x[1])) & avx2_ebx7) == avx2_ebx7;\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif // defined(__GNUC__); defined(_MSC_VER)\n#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n return false;\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_x86_bmi2() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n // GCC defines these macros but MSVC does not.\n // - bit_BMI2 = (1 << 8)\n const unsigned int bmi2_ebx7 = 0x00000100;\n\n // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n unsigned int eax7 = 0;\n unsigned int ebx7 = 0;\n unsigned int ecx7 = 0;\n unsigned int edx7 = 0;\n if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7)) {\n return (ebx7 & bmi2_ebx7) == bmi2_ebx7;\n }\n#elif defined(_MSC_VER) // defined(__GNUC__)\n int x[4];\n __cpuidex(x, 7, 0);\n return (((unsigned int)(x[1])) & bmi2_ebx7) == bmi2_ebx7;\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an uns" +
+ "upported compiler\"\n#endif // defined(__GNUC__); defined(_MSC_VER)\n#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n return false;\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_x86_sse42() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n // GCC defines these macros but MSVC does not.\n // - bit_PCLMUL = (1 << 1)\n // - bit_POPCNT = (1 << 23)\n // - bit_SSE4_2 = (1 << 20)\n const unsigned int sse42_ecx1 = 0x00900002;\n\n // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n unsigned int eax1 = 0;\n unsigned int ebx1 = 0;\n unsigned int ecx1 = 0;\n unsigned int edx1 = 0;\n if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1)) {\n return (ecx1 & sse42_ecx1) == sse42_ecx1;\n }\n#elif defined(_MSC_VER) // defined(__GNUC__)\n int x[4];\n __cpuid(x, 1);\n return (((unsigned int)(x[2])) & sse42_ecx1) == sse42_ecx1;\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif // defined(__GNUC__); defined(_MSC_VER)\n#endif // defined(WUFFS_BAS" +
+ "E__CPU_ARCH__X86_64)\n return false;\n}\n\n" +
"" +
"// ---------------- Fundamentals\n\n// Wuffs assumes that:\n// - converting a uint32_t to a size_t will never overflow.\n// - converting a size_t to a uint64_t will never overflow.\n#if defined(__WORDSIZE)\n#if (__WORDSIZE != 32) && (__WORDSIZE != 64)\n#error \"Wuffs requires a word size of either 32 or 64 bits\"\n#endif\n#endif\n\n// Clang also defines \"__GNUC__\".\n#if defined(__GNUC__)\n#define WUFFS_BASE__POTENTIALLY_UNUSED __attribute__((unused))\n#define WUFFS_BASE__WARN_UNUSED_RESULT __attribute__((warn_unused_result))\n#else\n#define WUFFS_BASE__POTENTIALLY_UNUSED\n#define WUFFS_BASE__WARN_UNUSED_RESULT\n#endif\n\n" +
"" +
diff --git a/internal/cgen/statement.go b/internal/cgen/statement.go
index 8343623..cf5c0f8 100644
--- a/internal/cgen/statement.go
+++ b/internal/cgen/statement.go
@@ -279,6 +279,10 @@
caMacro, caName, caAttribute =
"X86_64", "x86_sse42",
"WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET(\"pclmul,popcnt,sse4.2\")"
+ case t.IDX86AVX2:
+ caMacro, caName, caAttribute =
+ "X86_64", "x86_avx2",
+ "WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET(\"pclmul,popcnt,sse4.2,avx2\")"
case t.IDX86BMI2:
caMacro, caName, caAttribute =
"X86_64", "x86_bmi2",
diff --git a/lang/check/type.go b/lang/check/type.go
index 1781bb6..d515bc1 100644
--- a/lang/check/type.go
+++ b/lang/check/type.go
@@ -28,6 +28,7 @@
cpuArchBitsARMCRC32 = cpuArchBits(0x00000001)
cpuArchBitsARMNeon = cpuArchBits(0x00000002)
cpuArchBitsX86SSE42 = cpuArchBits(0x00000004)
+ cpuArchBitsX86AVX2 = cpuArchBits(0x00000008)
)
func calcCPUArchBits(n *a.Func) (ret cpuArchBits) {
@@ -43,6 +44,8 @@
ret |= cpuArchBitsARMNeon
case t.IDX86SSE42:
ret |= cpuArchBitsX86SSE42
+ case t.IDX86AVX2:
+ ret |= cpuArchBitsX86SSE42 | cpuArchBitsX86AVX2
}
}
return ret
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index d9756cc..682ae05 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -188,6 +188,33 @@
}
static inline bool //
+wuffs_base__cpu_arch__have_x86_avx2() {
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+ // GCC defines these macros but MSVC does not.
+ // - bit_AVX2 = (1 << 5)
+ const unsigned int avx2_ebx7 = 0x00000020;
+
+ // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).
+#if defined(__GNUC__)
+ unsigned int eax7 = 0;
+ unsigned int ebx7 = 0;
+ unsigned int ecx7 = 0;
+ unsigned int edx7 = 0;
+ if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7)) {
+ return (ebx7 & avx2_ebx7) == avx2_ebx7;
+ }
+#elif defined(_MSC_VER) // defined(__GNUC__)
+ int x[4];
+ __cpuidex(x, 7, 0);
+ return (((unsigned int)(x[1])) & avx2_ebx7) == avx2_ebx7;
+#else
+#error "WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler"
+#endif // defined(__GNUC__); defined(_MSC_VER)
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+ return false;
+}
+
+static inline bool //
wuffs_base__cpu_arch__have_x86_bmi2() {
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
// GCC defines these macros but MSVC does not.
@@ -24687,6 +24714,13 @@
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
static wuffs_base__empty_struct
+wuffs_crc32__ieee_hasher__up_x86_avx2(
+ wuffs_crc32__ieee_hasher* self,
+ wuffs_base__slice_u8 a_x);
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+static wuffs_base__empty_struct
wuffs_crc32__ieee_hasher__up_x86_sse42(
wuffs_crc32__ieee_hasher* self,
wuffs_base__slice_u8 a_x);
@@ -24805,6 +24839,9 @@
wuffs_base__cpu_arch__have_arm_crc32() ? &wuffs_crc32__ieee_hasher__up_arm_crc32 :
#endif
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+ wuffs_base__cpu_arch__have_x86_avx2() ? &wuffs_crc32__ieee_hasher__up_x86_avx2 :
+#endif
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
wuffs_base__cpu_arch__have_x86_sse42() ? &wuffs_crc32__ieee_hasher__up_x86_sse42 :
#endif
self->private_impl.choosy_up);
@@ -24991,6 +25028,125 @@
#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)
// ‼ WUFFS MULTI-FILE SECTION -arm_crc32
+// ‼ WUFFS MULTI-FILE SECTION +x86_avx2
+// -------- func crc32.ieee_hasher.up_x86_avx2
+
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2,avx2")
+static wuffs_base__empty_struct
+wuffs_crc32__ieee_hasher__up_x86_avx2(
+ wuffs_crc32__ieee_hasher* self,
+ wuffs_base__slice_u8 a_x) {
+ uint32_t v_s = 0;
+ wuffs_base__slice_u8 v_p = {0};
+ __m128i v_k = {0};
+ __m128i v_x0 = {0};
+ __m128i v_x1 = {0};
+ __m128i v_x2 = {0};
+ __m128i v_x3 = {0};
+ __m128i v_y0 = {0};
+ __m128i v_y1 = {0};
+ __m128i v_y2 = {0};
+ __m128i v_y3 = {0};
+ uint64_t v_tail_index = 0;
+
+ v_s = (4294967295 ^ self->private_impl.f_state);
+ while ((((uint64_t)(a_x.len)) > 0) && ((15 & ((uint32_t)(0xFFF & (uintptr_t)(a_x.ptr)))) != 0)) {
+ v_s = (WUFFS_CRC32__IEEE_TABLE[0][(((uint8_t)((v_s & 255))) ^ a_x.ptr[0])] ^ (v_s >> 8));
+ a_x = wuffs_base__slice_u8__subslice_i(a_x, 1);
+ }
+ if (((uint64_t)(a_x.len)) < 64) {
+ {
+ wuffs_base__slice_u8 i_slice_p = a_x;
+ v_p.ptr = i_slice_p.ptr;
+ v_p.len = 1;
+ uint8_t* i_end0_p = i_slice_p.ptr + i_slice_p.len;
+ while (v_p.ptr < i_end0_p) {
+ v_s = (WUFFS_CRC32__IEEE_TABLE[0][(((uint8_t)((v_s & 255))) ^ v_p.ptr[0])] ^ (v_s >> 8));
+ v_p.ptr += 1;
+ }
+ v_p.len = 0;
+ }
+ self->private_impl.f_state = (4294967295 ^ v_s);
+ return wuffs_base__make_empty_struct();
+ }
+ v_x0 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0));
+ v_x1 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16));
+ v_x2 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 32));
+ v_x3 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 48));
+ v_x0 = _mm_xor_si128(v_x0, _mm_cvtsi32_si128((int32_t)(v_s)));
+ v_k = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K1K2));
+ {
+ wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, 64);
+ v_p.ptr = i_slice_p.ptr;
+ v_p.len = 64;
+ uint8_t* i_end0_p = v_p.ptr + (((i_slice_p.len - (size_t)(v_p.ptr - i_slice_p.ptr)) / 64) * 64);
+ while (v_p.ptr < i_end0_p) {
+ v_y0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(0));
+ v_y1 = _mm_clmulepi64_si128(v_x1, v_k, (int32_t)(0));
+ v_y2 = _mm_clmulepi64_si128(v_x2, v_k, (int32_t)(0));
+ v_y3 = _mm_clmulepi64_si128(v_x3, v_k, (int32_t)(0));
+ v_x0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(17));
+ v_x1 = _mm_clmulepi64_si128(v_x1, v_k, (int32_t)(17));
+ v_x2 = _mm_clmulepi64_si128(v_x2, v_k, (int32_t)(17));
+ v_x3 = _mm_clmulepi64_si128(v_x3, v_k, (int32_t)(17));
+ v_x0 = _mm_xor_si128(_mm_xor_si128(v_x0, v_y0), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 0)));
+ v_x1 = _mm_xor_si128(_mm_xor_si128(v_x1, v_y1), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 16)));
+ v_x2 = _mm_xor_si128(_mm_xor_si128(v_x2, v_y2), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 32)));
+ v_x3 = _mm_xor_si128(_mm_xor_si128(v_x3, v_y3), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 48)));
+ v_p.ptr += 64;
+ }
+ v_p.len = 0;
+ }
+ v_k = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K3K4));
+ v_y0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(0));
+ v_x0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(17));
+ v_x0 = _mm_xor_si128(v_x0, v_x1);
+ v_x0 = _mm_xor_si128(v_x0, v_y0);
+ v_y0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(0));
+ v_x0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(17));
+ v_x0 = _mm_xor_si128(v_x0, v_x2);
+ v_x0 = _mm_xor_si128(v_x0, v_y0);
+ v_y0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(0));
+ v_x0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(17));
+ v_x0 = _mm_xor_si128(v_x0, v_x3);
+ v_x0 = _mm_xor_si128(v_x0, v_y0);
+ v_x1 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(16));
+ v_x2 = _mm_set_epi32((int32_t)(0), (int32_t)(4294967295), (int32_t)(0), (int32_t)(4294967295));
+ v_x0 = _mm_srli_si128(v_x0, (int32_t)(8));
+ v_x0 = _mm_xor_si128(v_x0, v_x1);
+ v_k = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K5ZZ));
+ v_x1 = _mm_srli_si128(v_x0, (int32_t)(4));
+ v_x0 = _mm_and_si128(v_x0, v_x2);
+ v_x0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(0));
+ v_x0 = _mm_xor_si128(v_x0, v_x1);
+ v_k = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_PXMU));
+ v_x1 = _mm_and_si128(v_x0, v_x2);
+ v_x1 = _mm_clmulepi64_si128(v_x1, v_k, (int32_t)(16));
+ v_x1 = _mm_and_si128(v_x1, v_x2);
+ v_x1 = _mm_clmulepi64_si128(v_x1, v_k, (int32_t)(0));
+ v_x0 = _mm_xor_si128(v_x0, v_x1);
+ v_s = ((uint32_t)(_mm_extract_epi32(v_x0, (int32_t)(1))));
+ v_tail_index = (((uint64_t)(a_x.len)) & 18446744073709551552u);
+ if (v_tail_index < ((uint64_t)(a_x.len))) {
+ {
+ wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, v_tail_index);
+ v_p.ptr = i_slice_p.ptr;
+ v_p.len = 1;
+ uint8_t* i_end0_p = i_slice_p.ptr + i_slice_p.len;
+ while (v_p.ptr < i_end0_p) {
+ v_s = (WUFFS_CRC32__IEEE_TABLE[0][(((uint8_t)((v_s & 255))) ^ v_p.ptr[0])] ^ (v_s >> 8));
+ v_p.ptr += 1;
+ }
+ v_p.len = 0;
+ }
+ }
+ self->private_impl.f_state = (4294967295 ^ v_s);
+ return wuffs_base__make_empty_struct();
+}
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+// ‼ WUFFS MULTI-FILE SECTION -x86_avx2
+
// ‼ WUFFS MULTI-FILE SECTION +x86_sse42
// -------- func crc32.ieee_hasher.up_x86_sse42
diff --git a/std/crc32/common_crc32.wuffs b/std/crc32/common_crc32.wuffs
index ae48e95..c7a94e9 100644
--- a/std/crc32/common_crc32.wuffs
+++ b/std/crc32/common_crc32.wuffs
@@ -27,6 +27,7 @@
if this.state == 0 {
choose up = [
up_arm_crc32,
+ up_x86_avx2,
up_x86_sse42]
}
this.up!(x: args.x)
diff --git a/std/crc32/common_up_x86_avx2.wuffs b/std/crc32/common_up_x86_avx2.wuffs
new file mode 100644
index 0000000..27c0722
--- /dev/null
+++ b/std/crc32/common_up_x86_avx2.wuffs
@@ -0,0 +1,138 @@
+// Copyright 2021 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// --------
+
+// See "SIMD Implementations" in README.md for a link to Gopal et al. "Fast CRC
+// Computation for Generic Polynomials Using PCLMULQDQ Instruction".
+
+// up_x86_avx2 is exactly the same as up_x86_sse42 except for the "choose
+// cpu_arch >= x86_avx2". With AVX, PCLMULQDQ has a three-operand form, not
+// just a two-operand form: https://www.felixcloutier.com/x86/pclmulqdq
+pri func ieee_hasher.up_x86_avx2!(x: slice base.u8),
+ choose cpu_arch >= x86_avx2,
+{
+ var s : base.u32
+ var p : slice base.u8
+
+ var util : base.x86_sse42_utility
+ var k : base.x86_m128i
+ var x0 : base.x86_m128i
+ var x1 : base.x86_m128i
+ var x2 : base.x86_m128i
+ var x3 : base.x86_m128i
+ var y0 : base.x86_m128i
+ var y1 : base.x86_m128i
+ var y2 : base.x86_m128i
+ var y3 : base.x86_m128i
+
+ var tail_index : base.u64
+
+ s = 0xFFFF_FFFF ^ this.state
+
+ // Align to a 16-byte boundary.
+ while (args.x.length() > 0) and ((15 & args.x.uintptr_low_12_bits()) <> 0) {
+ s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ args.x[0]] ^ (s >> 8)
+ args.x = args.x[1 ..]
+ } endwhile
+
+ // For short inputs, just do a simple loop.
+ if args.x.length() < 64 {
+ iterate (p = args.x)(length: 1, advance: 1, unroll: 1) {
+ s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
+ }
+ this.state = 0xFFFF_FFFF ^ s
+ return nothing
+ }
+
+ // Load 128×4 = 512 bits from the first 64-byte chunk.
+ x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])
+ x1 = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
+ x2 = util.make_m128i_slice128(a: args.x[0x20 .. 0x30])
+ x3 = util.make_m128i_slice128(a: args.x[0x30 .. 0x40])
+
+ // Combine with the initial state.
+ x0 = x0._mm_xor_si128(b: util.make_m128i_single_u32(a: s))
+
+ // Process the remaining 64-byte chunks.
+ k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K1K2[.. 16])
+ iterate (p = args.x[64 ..])(length: 64, advance: 64, unroll: 1) {
+ y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
+ y1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x00)
+ y2 = x2._mm_clmulepi64_si128(b: k, imm8: 0x00)
+ y3 = x3._mm_clmulepi64_si128(b: k, imm8: 0x00)
+
+ x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11)
+ x1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x11)
+ x2 = x2._mm_clmulepi64_si128(b: k, imm8: 0x11)
+ x3 = x3._mm_clmulepi64_si128(b: k, imm8: 0x11)
+
+ x0 = x0._mm_xor_si128(b: y0)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x00 .. 0x10]))
+ x1 = x1._mm_xor_si128(b: y1)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x10 .. 0x20]))
+ x2 = x2._mm_xor_si128(b: y2)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x20 .. 0x30]))
+ x3 = x3._mm_xor_si128(b: y3)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x30 .. 0x40]))
+ }
+
+ // Reduce 128×4 = 512 bits to 128 bits.
+ k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K3K4[.. 16])
+ y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
+ x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11)
+ x0 = x0._mm_xor_si128(b: x1)
+ x0 = x0._mm_xor_si128(b: y0)
+ y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
+ x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11)
+ x0 = x0._mm_xor_si128(b: x2)
+ x0 = x0._mm_xor_si128(b: y0)
+ y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
+ x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11)
+ x0 = x0._mm_xor_si128(b: x3)
+ x0 = x0._mm_xor_si128(b: y0)
+
+ // Reduce 128 bits to 64 bits.
+ x1 = x0._mm_clmulepi64_si128(b: k, imm8: 0x10)
+ x2 = util.make_m128i_multiple_u32(
+ a00: 0xFFFF_FFFF,
+ a01: 0x0000_0000,
+ a02: 0xFFFF_FFFF,
+ a03: 0x0000_0000)
+ x0 = x0._mm_srli_si128(imm8: 8)
+ x0 = x0._mm_xor_si128(b: x1)
+ k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K5ZZ[.. 16])
+ x1 = x0._mm_srli_si128(imm8: 4)
+ x0 = x0._mm_and_si128(b: x2)
+ x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
+ x0 = x0._mm_xor_si128(b: x1)
+
+ // Reduce 64 bits to 32 bits (Barrett Reduction) and extract.
+ //
+ // Barrett Reduction is Algorithm 1 (page 14) of Gopal et al., after
+ // adjusting for bit-reflection as per Figure 12 (page 21).
+ k = util.make_m128i_slice128(a: IEEE_X86_SSE42_PXMU[.. 16])
+ x1 = x0._mm_and_si128(b: x2)
+ x1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x10)
+ x1 = x1._mm_and_si128(b: x2)
+ x1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x00)
+ x0 = x0._mm_xor_si128(b: x1)
+ s = x0._mm_extract_epi32(imm8: 1)
+
+ // Handle the tail of args.x that wasn't a complete 64-byte chunk.
+ tail_index = args.x.length() & 0xFFFF_FFFF_FFFF_FFC0 // And-not 63.
+ if tail_index < args.x.length() {
+ iterate (p = args.x[tail_index ..])(length: 1, advance: 1, unroll: 1) {
+ s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
+ }
+ }
+
+ this.state = 0xFFFF_FFFF ^ s
+}