Add crc32.ieee_hasher.up_x86_avx2
name old speed new speed delta
wuffs_crc32_ieee_10k/clang9 8.42GB/s ± 1% 8.12GB/s ± 1% -3.55% (p=0.000 n=9+8)
wuffs_crc32_ieee_100k/clang9 11.0GB/s ± 1% 10.9GB/s ± 1% ~ (p=0.931 n=9+9)
wuffs_crc32_ieee_10k/gcc10 9.20GB/s ± 0% 9.71GB/s ± 0% +5.54% (p=0.000 n=8+10)
wuffs_crc32_ieee_100k/gcc10 11.9GB/s ± 2% 13.1GB/s ± 1% +10.22% (p=0.000 n=10+8)
wuffs_gzip_decode_10k/clang9 219MB/s ± 1% 218MB/s ± 2% ~ (p=0.497 n=9+10)
wuffs_gzip_decode_100k/clang9 283MB/s ± 2% 281MB/s ± 1% ~ (p=0.063 n=10+10)
wuffs_gzip_decode_10k/gcc10 224MB/s ± 1% 223MB/s ± 1% ~ (p=0.489 n=9+9)
wuffs_gzip_decode_100k/gcc10 285MB/s ± 4% 290MB/s ± 0% +1.68% (p=0.004 n=10+8)
wuffs_png_decode_image_19k_8bpp/clang9 138MB/s ± 2% 138MB/s ± 1% ~ (p=0.156 n=10+9)
wuffs_png_decode_image_40k_24bpp/clang9 171MB/s ± 2% 172MB/s ± 1% ~ (p=0.971 n=10+10)
wuffs_png_decode_image_77k_8bpp/clang9 495MB/s ± 4% 504MB/s ± 1% +1.79% (p=0.000 n=9+9)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/clang9 457MB/s ± 4% 463MB/s ± 0% +1.35% (p=0.002 n=9+9)
wuffs_png_decode_image_552k_32bpp_verify_checksum/clang9 435MB/s ± 5% 445MB/s ± 1% +2.31% (p=0.000 n=10+9)
wuffs_png_decode_image_4002k_24bpp/clang9 172MB/s ± 2% 174MB/s ± 1% ~ (p=0.053 n=10+9)
wuffs_png_decode_image_19k_8bpp/gcc10 157MB/s ± 1% 157MB/s ± 1% ~ (p=0.549 n=9+10)
wuffs_png_decode_image_40k_24bpp/gcc10 187MB/s ± 1% 186MB/s ± 1% ~ (p=0.182 n=10+9)
wuffs_png_decode_image_77k_8bpp/gcc10 554MB/s ± 1% 559MB/s ± 0% +0.84% (p=0.000 n=9+9)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/gcc10 502MB/s ± 1% 500MB/s ± 0% -0.38% (p=0.011 n=10+10)
wuffs_png_decode_image_552k_32bpp_verify_checksum/gcc10 481MB/s ± 1% 478MB/s ± 0% -0.46% (p=0.002 n=8+9)
wuffs_png_decode_image_4002k_24bpp/gcc10 189MB/s ± 2% 190MB/s ± 0% +0.62% (p=0.008 n=10+9)
diff --git a/internal/cgen/base/fundamental-public.h b/internal/cgen/base/fundamental-public.h
index d260979..7b714ed 100644
--- a/internal/cgen/base/fundamental-public.h
+++ b/internal/cgen/base/fundamental-public.h
@@ -149,6 +149,33 @@
}
static inline bool //
+wuffs_base__cpu_arch__have_x86_avx2() {
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+ // GCC defines these macros but MSVC does not.
+ // - bit_AVX2 = (1 << 5)
+ const unsigned int avx2_ebx7 = 0x00000020;
+
+ // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).
+#if defined(__GNUC__)
+ unsigned int eax7 = 0;
+ unsigned int ebx7 = 0;
+ unsigned int ecx7 = 0;
+ unsigned int edx7 = 0;
+ if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7)) {
+ return (ebx7 & avx2_ebx7) == avx2_ebx7;
+ }
+#elif defined(_MSC_VER) // defined(__GNUC__)
+ int x[4];
+ __cpuidex(x, 7, 0);
+ return (((unsigned int)(x[1])) & avx2_ebx7) == avx2_ebx7;
+#else
+#error "WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler"
+#endif // defined(__GNUC__); defined(_MSC_VER)
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+ return false;
+}
+
+static inline bool //
wuffs_base__cpu_arch__have_x86_bmi2() {
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
// GCC defines these macros but MSVC does not.
diff --git a/internal/cgen/data/data.go b/internal/cgen/data/data.go
index ada5cde..1806430 100644
--- a/internal/cgen/data/data.go
+++ b/internal/cgen/data/data.go
@@ -60,9 +60,10 @@
"" +
"// --------\n\n// Define WUFFS_CONFIG__STATIC_FUNCTIONS to make all of Wuffs' functions have\n// static storage. The motivation is discussed in the \"ALLOW STATIC\n// IMPLEMENTATION\" section of\n// https://raw.githubusercontent.com/nothings/stb/master/docs/stb_howto.txt\n#if defined(WUFFS_CONFIG__STATIC_FUNCTIONS)\n#define WUFFS_BASE__MAYBE_STATIC static\n#else\n#define WUFFS_BASE__MAYBE_STATIC\n#endif // defined(WUFFS_CONFIG__STATIC_FUNCTIONS)\n\n" +
"" +
- "// ---------------- CPU Architecture\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_arm_crc32() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n return true;\n#else\n return false;\n#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_arm_neon() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n return true;\n#else\n return false;\n#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_x86_bmi2() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n // GCC defines these macros but MSVC does not.\n // - bit_BMI2 = (1 << 8)\n const unsigned int bmi2_ebx7 = 0x00000100;\n\n // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n unsigned int eax7 = 0;\n unsigned int ebx7 = 0;\n unsigned int ecx7 = 0;\n unsigned int edx7 = 0;\n if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7)) {\n return (ebx7 & bmi2_ebx7) == bmi2_ebx7;\n }\n#elif defined(_MSC_VER) // defined(__GNUC__)\n i" +
- "nt x[4];\n __cpuidex(x, 7, 0);\n return (((unsigned int)(x[1])) & bmi2_ebx7) == bmi2_ebx7;\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif // defined(__GNUC__); defined(_MSC_VER)\n#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n return false;\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_x86_sse42() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n // GCC defines these macros but MSVC does not.\n // - bit_PCLMUL = (1 << 1)\n // - bit_POPCNT = (1 << 23)\n // - bit_SSE4_2 = (1 << 20)\n const unsigned int sse42_ecx1 = 0x00900002;\n\n // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n unsigned int eax1 = 0;\n unsigned int ebx1 = 0;\n unsigned int ecx1 = 0;\n unsigned int edx1 = 0;\n if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1)) {\n return (ecx1 & sse42_ecx1) == sse42_ecx1;\n }\n#elif defined(_MSC_VER) // defined(__GNUC__)\n int x[4];\n __cpuid(x, 1);\n return (((unsigned int)(x[2])) & sse42_ecx1) == sse42_ecx1;\n#els" +
- "e\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif // defined(__GNUC__); defined(_MSC_VER)\n#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n return false;\n}\n\n" +
+ "// ---------------- CPU Architecture\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_arm_crc32() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n return true;\n#else\n return false;\n#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_arm_neon() {\n#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n return true;\n#else\n return false;\n#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_x86_avx2() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n // GCC defines these macros but MSVC does not.\n // - bit_AVX2 = (1 << 5)\n const unsigned int avx2_ebx7 = 0x00000020;\n\n // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n unsigned int eax7 = 0;\n unsigned int ebx7 = 0;\n unsigned int ecx7 = 0;\n unsigned int edx7 = 0;\n if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7)) {\n return (ebx7 & avx2_ebx7) == avx2_ebx7;\n }\n#elif defined(_MSC_VER) // defined(__GNUC__)\n i" +
+ "nt x[4];\n __cpuidex(x, 7, 0);\n return (((unsigned int)(x[1])) & avx2_ebx7) == avx2_ebx7;\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif // defined(__GNUC__); defined(_MSC_VER)\n#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n return false;\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_x86_bmi2() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n // GCC defines these macros but MSVC does not.\n // - bit_BMI2 = (1 << 8)\n const unsigned int bmi2_ebx7 = 0x00000100;\n\n // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n unsigned int eax7 = 0;\n unsigned int ebx7 = 0;\n unsigned int ecx7 = 0;\n unsigned int edx7 = 0;\n if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7)) {\n return (ebx7 & bmi2_ebx7) == bmi2_ebx7;\n }\n#elif defined(_MSC_VER) // defined(__GNUC__)\n int x[4];\n __cpuidex(x, 7, 0);\n return (((unsigned int)(x[1])) & bmi2_ebx7) == bmi2_ebx7;\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an uns" +
+ "upported compiler\"\n#endif // defined(__GNUC__); defined(_MSC_VER)\n#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)\n return false;\n}\n\nstatic inline bool //\nwuffs_base__cpu_arch__have_x86_sse42() {\n#if defined(WUFFS_BASE__CPU_ARCH__X86_64)\n // GCC defines these macros but MSVC does not.\n // - bit_PCLMUL = (1 << 1)\n // - bit_POPCNT = (1 << 23)\n // - bit_SSE4_2 = (1 << 20)\n const unsigned int sse42_ecx1 = 0x00900002;\n\n // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).\n#if defined(__GNUC__)\n unsigned int eax1 = 0;\n unsigned int ebx1 = 0;\n unsigned int ecx1 = 0;\n unsigned int edx1 = 0;\n if (__get_cpuid(1, &eax1, &ebx1, &ecx1, &edx1)) {\n return (ecx1 & sse42_ecx1) == sse42_ecx1;\n }\n#elif defined(_MSC_VER) // defined(__GNUC__)\n int x[4];\n __cpuid(x, 1);\n return (((unsigned int)(x[2])) & sse42_ecx1) == sse42_ecx1;\n#else\n#error \"WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler\"\n#endif // defined(__GNUC__); defined(_MSC_VER)\n#endif // defined(WUFFS_BAS" +
+ "E__CPU_ARCH__X86_64)\n return false;\n}\n\n" +
"" +
"// ---------------- Fundamentals\n\n// Wuffs assumes that:\n// - converting a uint32_t to a size_t will never overflow.\n// - converting a size_t to a uint64_t will never overflow.\n#if defined(__WORDSIZE)\n#if (__WORDSIZE != 32) && (__WORDSIZE != 64)\n#error \"Wuffs requires a word size of either 32 or 64 bits\"\n#endif\n#endif\n\n// Clang also defines \"__GNUC__\".\n#if defined(__GNUC__)\n#define WUFFS_BASE__POTENTIALLY_UNUSED __attribute__((unused))\n#define WUFFS_BASE__WARN_UNUSED_RESULT __attribute__((warn_unused_result))\n#else\n#define WUFFS_BASE__POTENTIALLY_UNUSED\n#define WUFFS_BASE__WARN_UNUSED_RESULT\n#endif\n\n" +
"" +
diff --git a/internal/cgen/statement.go b/internal/cgen/statement.go
index 8343623..cf5c0f8 100644
--- a/internal/cgen/statement.go
+++ b/internal/cgen/statement.go
@@ -279,6 +279,10 @@
caMacro, caName, caAttribute =
"X86_64", "x86_sse42",
"WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET(\"pclmul,popcnt,sse4.2\")"
+ case t.IDX86AVX2:
+ caMacro, caName, caAttribute =
+ "X86_64", "x86_avx2",
+ "WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET(\"pclmul,popcnt,sse4.2,avx2\")"
case t.IDX86BMI2:
caMacro, caName, caAttribute =
"X86_64", "x86_bmi2",
diff --git a/lang/check/type.go b/lang/check/type.go
index 1781bb6..d515bc1 100644
--- a/lang/check/type.go
+++ b/lang/check/type.go
@@ -28,6 +28,7 @@
cpuArchBitsARMCRC32 = cpuArchBits(0x00000001)
cpuArchBitsARMNeon = cpuArchBits(0x00000002)
cpuArchBitsX86SSE42 = cpuArchBits(0x00000004)
+ cpuArchBitsX86AVX2 = cpuArchBits(0x00000008)
)
func calcCPUArchBits(n *a.Func) (ret cpuArchBits) {
@@ -43,6 +44,8 @@
ret |= cpuArchBitsARMNeon
case t.IDX86SSE42:
ret |= cpuArchBitsX86SSE42
+ case t.IDX86AVX2:
+ ret |= cpuArchBitsX86SSE42 | cpuArchBitsX86AVX2
}
}
return ret
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index d9756cc..682ae05 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -188,6 +188,33 @@
}
static inline bool //
+wuffs_base__cpu_arch__have_x86_avx2() {
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+ // GCC defines these macros but MSVC does not.
+ // - bit_AVX2 = (1 << 5)
+ const unsigned int avx2_ebx7 = 0x00000020;
+
+ // clang defines __GNUC__ and clang-cl defines _MSC_VER (but not __GNUC__).
+#if defined(__GNUC__)
+ unsigned int eax7 = 0;
+ unsigned int ebx7 = 0;
+ unsigned int ecx7 = 0;
+ unsigned int edx7 = 0;
+ if (__get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7)) {
+ return (ebx7 & avx2_ebx7) == avx2_ebx7;
+ }
+#elif defined(_MSC_VER) // defined(__GNUC__)
+ int x[4];
+ __cpuidex(x, 7, 0);
+ return (((unsigned int)(x[1])) & avx2_ebx7) == avx2_ebx7;
+#else
+#error "WUFFS_BASE__CPU_ARCH__ETC combined with an unsupported compiler"
+#endif // defined(__GNUC__); defined(_MSC_VER)
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+ return false;
+}
+
+static inline bool //
wuffs_base__cpu_arch__have_x86_bmi2() {
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
// GCC defines these macros but MSVC does not.
@@ -24687,6 +24714,13 @@
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
static wuffs_base__empty_struct
+wuffs_crc32__ieee_hasher__up_x86_avx2(
+ wuffs_crc32__ieee_hasher* self,
+ wuffs_base__slice_u8 a_x);
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+static wuffs_base__empty_struct
wuffs_crc32__ieee_hasher__up_x86_sse42(
wuffs_crc32__ieee_hasher* self,
wuffs_base__slice_u8 a_x);
@@ -24805,6 +24839,9 @@
wuffs_base__cpu_arch__have_arm_crc32() ? &wuffs_crc32__ieee_hasher__up_arm_crc32 :
#endif
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+ wuffs_base__cpu_arch__have_x86_avx2() ? &wuffs_crc32__ieee_hasher__up_x86_avx2 :
+#endif
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
wuffs_base__cpu_arch__have_x86_sse42() ? &wuffs_crc32__ieee_hasher__up_x86_sse42 :
#endif
self->private_impl.choosy_up);
@@ -24991,6 +25028,125 @@
#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)
// ‼ WUFFS MULTI-FILE SECTION -arm_crc32
+// ‼ WUFFS MULTI-FILE SECTION +x86_avx2
+// -------- func crc32.ieee_hasher.up_x86_avx2
+
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2,avx2")
+static wuffs_base__empty_struct
+wuffs_crc32__ieee_hasher__up_x86_avx2(
+ wuffs_crc32__ieee_hasher* self,
+ wuffs_base__slice_u8 a_x) {
+ uint32_t v_s = 0;
+ wuffs_base__slice_u8 v_p = {0};
+ __m128i v_k = {0};
+ __m128i v_x0 = {0};
+ __m128i v_x1 = {0};
+ __m128i v_x2 = {0};
+ __m128i v_x3 = {0};
+ __m128i v_y0 = {0};
+ __m128i v_y1 = {0};
+ __m128i v_y2 = {0};
+ __m128i v_y3 = {0};
+ uint64_t v_tail_index = 0;
+
+ v_s = (4294967295 ^ self->private_impl.f_state);
+ while ((((uint64_t)(a_x.len)) > 0) && ((15 & ((uint32_t)(0xFFF & (uintptr_t)(a_x.ptr)))) != 0)) {
+ v_s = (WUFFS_CRC32__IEEE_TABLE[0][(((uint8_t)((v_s & 255))) ^ a_x.ptr[0])] ^ (v_s >> 8));
+ a_x = wuffs_base__slice_u8__subslice_i(a_x, 1);
+ }
+ if (((uint64_t)(a_x.len)) < 64) {
+ {
+ wuffs_base__slice_u8 i_slice_p = a_x;
+ v_p.ptr = i_slice_p.ptr;
+ v_p.len = 1;
+ uint8_t* i_end0_p = i_slice_p.ptr + i_slice_p.len;
+ while (v_p.ptr < i_end0_p) {
+ v_s = (WUFFS_CRC32__IEEE_TABLE[0][(((uint8_t)((v_s & 255))) ^ v_p.ptr[0])] ^ (v_s >> 8));
+ v_p.ptr += 1;
+ }
+ v_p.len = 0;
+ }
+ self->private_impl.f_state = (4294967295 ^ v_s);
+ return wuffs_base__make_empty_struct();
+ }
+ v_x0 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0));
+ v_x1 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16));
+ v_x2 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 32));
+ v_x3 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 48));
+ v_x0 = _mm_xor_si128(v_x0, _mm_cvtsi32_si128((int32_t)(v_s)));
+ v_k = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K1K2));
+ {
+ wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, 64);
+ v_p.ptr = i_slice_p.ptr;
+ v_p.len = 64;
+ uint8_t* i_end0_p = v_p.ptr + (((i_slice_p.len - (size_t)(v_p.ptr - i_slice_p.ptr)) / 64) * 64);
+ while (v_p.ptr < i_end0_p) {
+ v_y0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(0));
+ v_y1 = _mm_clmulepi64_si128(v_x1, v_k, (int32_t)(0));
+ v_y2 = _mm_clmulepi64_si128(v_x2, v_k, (int32_t)(0));
+ v_y3 = _mm_clmulepi64_si128(v_x3, v_k, (int32_t)(0));
+ v_x0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(17));
+ v_x1 = _mm_clmulepi64_si128(v_x1, v_k, (int32_t)(17));
+ v_x2 = _mm_clmulepi64_si128(v_x2, v_k, (int32_t)(17));
+ v_x3 = _mm_clmulepi64_si128(v_x3, v_k, (int32_t)(17));
+ v_x0 = _mm_xor_si128(_mm_xor_si128(v_x0, v_y0), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 0)));
+ v_x1 = _mm_xor_si128(_mm_xor_si128(v_x1, v_y1), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 16)));
+ v_x2 = _mm_xor_si128(_mm_xor_si128(v_x2, v_y2), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 32)));
+ v_x3 = _mm_xor_si128(_mm_xor_si128(v_x3, v_y3), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 48)));
+ v_p.ptr += 64;
+ }
+ v_p.len = 0;
+ }
+ v_k = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K3K4));
+ v_y0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(0));
+ v_x0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(17));
+ v_x0 = _mm_xor_si128(v_x0, v_x1);
+ v_x0 = _mm_xor_si128(v_x0, v_y0);
+ v_y0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(0));
+ v_x0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(17));
+ v_x0 = _mm_xor_si128(v_x0, v_x2);
+ v_x0 = _mm_xor_si128(v_x0, v_y0);
+ v_y0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(0));
+ v_x0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(17));
+ v_x0 = _mm_xor_si128(v_x0, v_x3);
+ v_x0 = _mm_xor_si128(v_x0, v_y0);
+ v_x1 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(16));
+ v_x2 = _mm_set_epi32((int32_t)(0), (int32_t)(4294967295), (int32_t)(0), (int32_t)(4294967295));
+ v_x0 = _mm_srli_si128(v_x0, (int32_t)(8));
+ v_x0 = _mm_xor_si128(v_x0, v_x1);
+ v_k = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K5ZZ));
+ v_x1 = _mm_srli_si128(v_x0, (int32_t)(4));
+ v_x0 = _mm_and_si128(v_x0, v_x2);
+ v_x0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(0));
+ v_x0 = _mm_xor_si128(v_x0, v_x1);
+ v_k = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_PXMU));
+ v_x1 = _mm_and_si128(v_x0, v_x2);
+ v_x1 = _mm_clmulepi64_si128(v_x1, v_k, (int32_t)(16));
+ v_x1 = _mm_and_si128(v_x1, v_x2);
+ v_x1 = _mm_clmulepi64_si128(v_x1, v_k, (int32_t)(0));
+ v_x0 = _mm_xor_si128(v_x0, v_x1);
+ v_s = ((uint32_t)(_mm_extract_epi32(v_x0, (int32_t)(1))));
+ v_tail_index = (((uint64_t)(a_x.len)) & 18446744073709551552u);
+ if (v_tail_index < ((uint64_t)(a_x.len))) {
+ {
+ wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, v_tail_index);
+ v_p.ptr = i_slice_p.ptr;
+ v_p.len = 1;
+ uint8_t* i_end0_p = i_slice_p.ptr + i_slice_p.len;
+ while (v_p.ptr < i_end0_p) {
+ v_s = (WUFFS_CRC32__IEEE_TABLE[0][(((uint8_t)((v_s & 255))) ^ v_p.ptr[0])] ^ (v_s >> 8));
+ v_p.ptr += 1;
+ }
+ v_p.len = 0;
+ }
+ }
+ self->private_impl.f_state = (4294967295 ^ v_s);
+ return wuffs_base__make_empty_struct();
+}
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+// ‼ WUFFS MULTI-FILE SECTION -x86_avx2
+
// ‼ WUFFS MULTI-FILE SECTION +x86_sse42
// -------- func crc32.ieee_hasher.up_x86_sse42
diff --git a/std/crc32/common_crc32.wuffs b/std/crc32/common_crc32.wuffs
index ae48e95..c7a94e9 100644
--- a/std/crc32/common_crc32.wuffs
+++ b/std/crc32/common_crc32.wuffs
@@ -27,6 +27,7 @@
if this.state == 0 {
choose up = [
up_arm_crc32,
+ up_x86_avx2,
up_x86_sse42]
}
this.up!(x: args.x)
diff --git a/std/crc32/common_up_x86_avx2.wuffs b/std/crc32/common_up_x86_avx2.wuffs
new file mode 100644
index 0000000..27c0722
--- /dev/null
+++ b/std/crc32/common_up_x86_avx2.wuffs
@@ -0,0 +1,138 @@
+// Copyright 2021 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// --------
+
+// See "SIMD Implementations" in README.md for a link to Gopal et al. "Fast CRC
+// Computation for Generic Polynomials Using PCLMULQDQ Instruction".
+
+// up_x86_avx2 is exactly the same as up_x86_sse42 except for the "choose
+// cpu_arch >= x86_avx2". With AVX, PCLMULQDQ has a three-operand form, not
+// just a two-operand form: https://www.felixcloutier.com/x86/pclmulqdq
+pri func ieee_hasher.up_x86_avx2!(x: slice base.u8),
+ choose cpu_arch >= x86_avx2,
+{
+ var s : base.u32
+ var p : slice base.u8
+
+ var util : base.x86_sse42_utility
+ var k : base.x86_m128i
+ var x0 : base.x86_m128i
+ var x1 : base.x86_m128i
+ var x2 : base.x86_m128i
+ var x3 : base.x86_m128i
+ var y0 : base.x86_m128i
+ var y1 : base.x86_m128i
+ var y2 : base.x86_m128i
+ var y3 : base.x86_m128i
+
+ var tail_index : base.u64
+
+ s = 0xFFFF_FFFF ^ this.state
+
+ // Align to a 16-byte boundary.
+ while (args.x.length() > 0) and ((15 & args.x.uintptr_low_12_bits()) <> 0) {
+ s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ args.x[0]] ^ (s >> 8)
+ args.x = args.x[1 ..]
+ } endwhile
+
+ // For short inputs, just do a simple loop.
+ if args.x.length() < 64 {
+ iterate (p = args.x)(length: 1, advance: 1, unroll: 1) {
+ s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
+ }
+ this.state = 0xFFFF_FFFF ^ s
+ return nothing
+ }
+
+ // Load 128×4 = 512 bits from the first 64-byte chunk.
+ x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])
+ x1 = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
+ x2 = util.make_m128i_slice128(a: args.x[0x20 .. 0x30])
+ x3 = util.make_m128i_slice128(a: args.x[0x30 .. 0x40])
+
+ // Combine with the initial state.
+ x0 = x0._mm_xor_si128(b: util.make_m128i_single_u32(a: s))
+
+ // Process the remaining 64-byte chunks.
+ k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K1K2[.. 16])
+ iterate (p = args.x[64 ..])(length: 64, advance: 64, unroll: 1) {
+ y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
+ y1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x00)
+ y2 = x2._mm_clmulepi64_si128(b: k, imm8: 0x00)
+ y3 = x3._mm_clmulepi64_si128(b: k, imm8: 0x00)
+
+ x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11)
+ x1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x11)
+ x2 = x2._mm_clmulepi64_si128(b: k, imm8: 0x11)
+ x3 = x3._mm_clmulepi64_si128(b: k, imm8: 0x11)
+
+ x0 = x0._mm_xor_si128(b: y0)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x00 .. 0x10]))
+ x1 = x1._mm_xor_si128(b: y1)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x10 .. 0x20]))
+ x2 = x2._mm_xor_si128(b: y2)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x20 .. 0x30]))
+ x3 = x3._mm_xor_si128(b: y3)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x30 .. 0x40]))
+ }
+
+ // Reduce 128×4 = 512 bits to 128 bits.
+ k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K3K4[.. 16])
+ y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
+ x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11)
+ x0 = x0._mm_xor_si128(b: x1)
+ x0 = x0._mm_xor_si128(b: y0)
+ y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
+ x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11)
+ x0 = x0._mm_xor_si128(b: x2)
+ x0 = x0._mm_xor_si128(b: y0)
+ y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
+ x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11)
+ x0 = x0._mm_xor_si128(b: x3)
+ x0 = x0._mm_xor_si128(b: y0)
+
+ // Reduce 128 bits to 64 bits.
+ x1 = x0._mm_clmulepi64_si128(b: k, imm8: 0x10)
+ x2 = util.make_m128i_multiple_u32(
+ a00: 0xFFFF_FFFF,
+ a01: 0x0000_0000,
+ a02: 0xFFFF_FFFF,
+ a03: 0x0000_0000)
+ x0 = x0._mm_srli_si128(imm8: 8)
+ x0 = x0._mm_xor_si128(b: x1)
+ k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K5ZZ[.. 16])
+ x1 = x0._mm_srli_si128(imm8: 4)
+ x0 = x0._mm_and_si128(b: x2)
+ x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
+ x0 = x0._mm_xor_si128(b: x1)
+
+ // Reduce 64 bits to 32 bits (Barrett Reduction) and extract.
+ //
+ // Barrett Reduction is Algorithm 1 (page 14) of Gopal et al., after
+ // adjusting for bit-reflection as per Figure 12 (page 21).
+ k = util.make_m128i_slice128(a: IEEE_X86_SSE42_PXMU[.. 16])
+ x1 = x0._mm_and_si128(b: x2)
+ x1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x10)
+ x1 = x1._mm_and_si128(b: x2)
+ x1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x00)
+ x0 = x0._mm_xor_si128(b: x1)
+ s = x0._mm_extract_epi32(imm8: 1)
+
+ // Handle the tail of args.x that wasn't a complete 64-byte chunk.
+ tail_index = args.x.length() & 0xFFFF_FFFF_FFFF_FFC0 // And-not 63.
+ if tail_index < args.x.length() {
+ iterate (p = args.x[tail_index ..])(length: 1, advance: 1, unroll: 1) {
+ s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
+ }
+ }
+
+ this.state = 0xFFFF_FFFF ^ s
+}