std/crc32: re-write x86_sse42 implementation
name old speed new speed delta
wuffs_crc32_ieee_10k/clang14 13.5GB/s ± 0% 21.9GB/s ± 0% +62.09% (p=0.016 n=5+4)
wuffs_crc32_ieee_100k/clang14 22.4GB/s ± 9% 29.7GB/s ± 0% +32.76% (p=0.016 n=5+4)
wuffs_crc32_ieee_10k/gcc12 14.2GB/s ± 2% 22.2GB/s ± 0% +56.92% (p=0.008 n=5+5)
wuffs_crc32_ieee_100k/gcc12 21.3GB/s ± 3% 29.6GB/s ± 1% +39.18% (p=0.008 n=5+5)
wuffs_gzip_decode_10k/clang14 366MB/s ± 0% 372MB/s ± 0% +1.41% (p=0.008 n=5+5)
wuffs_gzip_decode_100k/clang14 482MB/s ± 0% 494MB/s ± 0% +2.57% (p=0.008 n=5+5)
wuffs_gzip_decode_10k/gcc12 398MB/s ± 0% 418MB/s ± 0% +5.19% (p=0.008 n=5+5)
wuffs_gzip_decode_100k/gcc12 510MB/s ± 0% 537MB/s ± 0% +5.21% (p=0.008 n=5+5)
wuffs_png_decode_image_19k_8bpp/clang14 263MB/s ± 0% 264MB/s ± 0% +0.39% (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/clang14 297MB/s ± 0% 298MB/s ± 0% +0.16% (p=0.008 n=5+5)
wuffs_png_decode_image_77k_8bpp/clang14 932MB/s ± 0% 945MB/s ± 0% +1.33% (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/clang14 831MB/s ± 0% 833MB/s ± 0% +0.25% (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/clang14 799MB/s ± 0% 802MB/s ± 0% +0.37% (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/clang14 306MB/s ± 0% 307MB/s ± 0% +0.56% (p=0.008 n=5+5)
wuffs_png_decode_image_19k_8bpp/gcc12 283MB/s ± 0% 270MB/s ± 0% -4.84% (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/gcc12 330MB/s ± 0% 329MB/s ± 0% -0.29% (p=0.016 n=5+4)
wuffs_png_decode_image_77k_8bpp/gcc12 992MB/s ± 0% 957MB/s ± 0% -3.56% (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/gcc12 920MB/s ± 0% 908MB/s ± 0% -1.30% (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/gcc12 875MB/s ± 0% 867MB/s ± 0% -0.95% (p=0.008 n=5+5)
wuffs_png_decode_image_4002k_24bpp/gcc12 341MB/s ± 0% 342MB/s ± 0% +0.25% (p=0.008 n=5+5)
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index e2adb67..863e3e3 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -35634,30 +35634,6 @@
},
};
-static const uint8_t
-WUFFS_CRC32__IEEE_X86_SSE42_K1K2[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
- 212u, 43u, 68u, 84u, 1u, 0u, 0u, 0u,
- 150u, 21u, 228u, 198u, 1u, 0u, 0u, 0u,
-};
-
-static const uint8_t
-WUFFS_CRC32__IEEE_X86_SSE42_K3K4[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
- 208u, 151u, 25u, 117u, 1u, 0u, 0u, 0u,
- 158u, 0u, 170u, 204u, 0u, 0u, 0u, 0u,
-};
-
-static const uint8_t
-WUFFS_CRC32__IEEE_X86_SSE42_K5ZZ[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
- 36u, 97u, 205u, 99u, 1u, 0u, 0u, 0u,
- 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u,
-};
-
-static const uint8_t
-WUFFS_CRC32__IEEE_X86_SSE42_PXMU[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
- 65u, 6u, 113u, 219u, 1u, 0u, 0u, 0u,
- 65u, 22u, 1u, 247u, 1u, 0u, 0u, 0u,
-};
-
// ---------------- Private Initializer Prototypes
// ---------------- Private Function Prototypes
@@ -35685,14 +35661,6 @@
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
WUFFS_BASE__GENERATED_C_CODE
static wuffs_base__empty_struct
-wuffs_crc32__ieee_hasher__up_x86_avx2(
- wuffs_crc32__ieee_hasher* self,
- wuffs_base__slice_u8 a_x);
-#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
-
-#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-WUFFS_BASE__GENERATED_C_CODE
-static wuffs_base__empty_struct
wuffs_crc32__ieee_hasher__up_x86_sse42(
wuffs_crc32__ieee_hasher* self,
wuffs_base__slice_u8 a_x);
@@ -35846,9 +35814,6 @@
wuffs_base__cpu_arch__have_arm_crc32() ? &wuffs_crc32__ieee_hasher__up_arm_crc32 :
#endif
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
- wuffs_base__cpu_arch__have_x86_avx2() ? &wuffs_crc32__ieee_hasher__up_x86_avx2 :
-#endif
-#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
wuffs_base__cpu_arch__have_x86_sse42() ? &wuffs_crc32__ieee_hasher__up_x86_sse42 :
#endif
self->private_impl.choosy_up);
@@ -36073,129 +36038,6 @@
#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_CRC32)
// ‼ WUFFS MULTI-FILE SECTION -arm_crc32
-// ‼ WUFFS MULTI-FILE SECTION +x86_avx2
-// -------- func crc32.ieee_hasher.up_x86_avx2
-
-#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2,avx2")
-WUFFS_BASE__GENERATED_C_CODE
-static wuffs_base__empty_struct
-wuffs_crc32__ieee_hasher__up_x86_avx2(
- wuffs_crc32__ieee_hasher* self,
- wuffs_base__slice_u8 a_x) {
- uint32_t v_s = 0;
- wuffs_base__slice_u8 v_p = {0};
- __m128i v_k1k2 = {0};
- __m128i v_k3k4 = {0};
- __m128i v_k5zz = {0};
- __m128i v_pxmu = {0};
- __m128i v_x0 = {0};
- __m128i v_x1 = {0};
- __m128i v_x2 = {0};
- __m128i v_x3 = {0};
- __m128i v_y0 = {0};
- __m128i v_y1 = {0};
- __m128i v_y2 = {0};
- __m128i v_y3 = {0};
- uint64_t v_tail_index = 0;
-
- v_s = (4294967295u ^ self->private_impl.f_state);
- while ((((uint64_t)(a_x.len)) > 0u) && ((15u & ((uint32_t)(0xFFFu & (uintptr_t)(a_x.ptr)))) != 0u)) {
- v_s = (WUFFS_CRC32__IEEE_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ a_x.ptr[0u]))] ^ (v_s >> 8u));
- a_x = wuffs_base__slice_u8__subslice_i(a_x, 1u);
- }
- if (((uint64_t)(a_x.len)) < 64u) {
- {
- wuffs_base__slice_u8 i_slice_p = a_x;
- v_p.ptr = i_slice_p.ptr;
- v_p.len = 1;
- const uint8_t* i_end0_p = wuffs_private_impl__ptr_u8_plus_len(i_slice_p.ptr, i_slice_p.len);
- while (v_p.ptr < i_end0_p) {
- v_s = (WUFFS_CRC32__IEEE_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ v_p.ptr[0u]))] ^ (v_s >> 8u));
- v_p.ptr += 1;
- }
- v_p.len = 0;
- }
- self->private_impl.f_state = (4294967295u ^ v_s);
- return wuffs_base__make_empty_struct();
- }
- v_x0 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u));
- v_x1 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16u));
- v_x2 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 32u));
- v_x3 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 48u));
- v_x0 = _mm_xor_si128(v_x0, _mm_cvtsi32_si128((int32_t)(v_s)));
- v_k1k2 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K1K2));
- {
- wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, 64u);
- v_p.ptr = i_slice_p.ptr;
- v_p.len = 64;
- const uint8_t* i_end0_p = wuffs_private_impl__ptr_u8_plus_len(v_p.ptr, (((i_slice_p.len - (size_t)(v_p.ptr - i_slice_p.ptr)) / 64) * 64));
- while (v_p.ptr < i_end0_p) {
- v_y0 = _mm_clmulepi64_si128(v_x0, v_k1k2, (int32_t)(0u));
- v_y1 = _mm_clmulepi64_si128(v_x1, v_k1k2, (int32_t)(0u));
- v_y2 = _mm_clmulepi64_si128(v_x2, v_k1k2, (int32_t)(0u));
- v_y3 = _mm_clmulepi64_si128(v_x3, v_k1k2, (int32_t)(0u));
- v_x0 = _mm_clmulepi64_si128(v_x0, v_k1k2, (int32_t)(17u));
- v_x1 = _mm_clmulepi64_si128(v_x1, v_k1k2, (int32_t)(17u));
- v_x2 = _mm_clmulepi64_si128(v_x2, v_k1k2, (int32_t)(17u));
- v_x3 = _mm_clmulepi64_si128(v_x3, v_k1k2, (int32_t)(17u));
- v_x0 = _mm_xor_si128(_mm_xor_si128(v_x0, v_y0), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 0u)));
- v_x1 = _mm_xor_si128(_mm_xor_si128(v_x1, v_y1), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 16u)));
- v_x2 = _mm_xor_si128(_mm_xor_si128(v_x2, v_y2), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 32u)));
- v_x3 = _mm_xor_si128(_mm_xor_si128(v_x3, v_y3), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 48u)));
- v_p.ptr += 64;
- }
- v_p.len = 0;
- }
- v_k3k4 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K3K4));
- v_y0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(0u));
- v_x0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(17u));
- v_x0 = _mm_xor_si128(v_x0, v_x1);
- v_x0 = _mm_xor_si128(v_x0, v_y0);
- v_y0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(0u));
- v_x0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(17u));
- v_x0 = _mm_xor_si128(v_x0, v_x2);
- v_x0 = _mm_xor_si128(v_x0, v_y0);
- v_y0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(0u));
- v_x0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(17u));
- v_x0 = _mm_xor_si128(v_x0, v_x3);
- v_x0 = _mm_xor_si128(v_x0, v_y0);
- v_x1 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(16u));
- v_x2 = _mm_set_epi32((int32_t)(0u), (int32_t)(4294967295u), (int32_t)(0u), (int32_t)(4294967295u));
- v_x0 = _mm_srli_si128(v_x0, (int32_t)(8u));
- v_x0 = _mm_xor_si128(v_x0, v_x1);
- v_k5zz = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K5ZZ));
- v_x1 = _mm_srli_si128(v_x0, (int32_t)(4u));
- v_x0 = _mm_and_si128(v_x0, v_x2);
- v_x0 = _mm_clmulepi64_si128(v_x0, v_k5zz, (int32_t)(0u));
- v_x0 = _mm_xor_si128(v_x0, v_x1);
- v_pxmu = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_PXMU));
- v_x1 = _mm_and_si128(v_x0, v_x2);
- v_x1 = _mm_clmulepi64_si128(v_x1, v_pxmu, (int32_t)(16u));
- v_x1 = _mm_and_si128(v_x1, v_x2);
- v_x1 = _mm_clmulepi64_si128(v_x1, v_pxmu, (int32_t)(0u));
- v_x1 = _mm_xor_si128(v_x1, v_x0);
- v_s = ((uint32_t)(_mm_extract_epi32(v_x1, (int32_t)(1u))));
- v_tail_index = (((uint64_t)(a_x.len)) & 18446744073709551552u);
- if (v_tail_index < ((uint64_t)(a_x.len))) {
- {
- wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, v_tail_index);
- v_p.ptr = i_slice_p.ptr;
- v_p.len = 1;
- const uint8_t* i_end0_p = wuffs_private_impl__ptr_u8_plus_len(i_slice_p.ptr, i_slice_p.len);
- while (v_p.ptr < i_end0_p) {
- v_s = (WUFFS_CRC32__IEEE_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ v_p.ptr[0u]))] ^ (v_s >> 8u));
- v_p.ptr += 1;
- }
- v_p.len = 0;
- }
- }
- self->private_impl.f_state = (4294967295u ^ v_s);
- return wuffs_base__make_empty_struct();
-}
-#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
-// ‼ WUFFS MULTI-FILE SECTION -x86_avx2
-
// ‼ WUFFS MULTI-FILE SECTION +x86_sse42
// -------- func crc32.ieee_hasher.up_x86_sse42
@@ -36207,111 +36049,120 @@
wuffs_crc32__ieee_hasher* self,
wuffs_base__slice_u8 a_x) {
uint32_t v_s = 0;
- wuffs_base__slice_u8 v_p = {0};
- __m128i v_k1k2 = {0};
- __m128i v_k3k4 = {0};
- __m128i v_k5zz = {0};
- __m128i v_pxmu = {0};
+ __m128i v_kk = {0};
__m128i v_x0 = {0};
__m128i v_x1 = {0};
__m128i v_x2 = {0};
__m128i v_x3 = {0};
+ __m128i v_x4 = {0};
+ __m128i v_x5 = {0};
+ __m128i v_x6 = {0};
+ __m128i v_x7 = {0};
__m128i v_y0 = {0};
__m128i v_y1 = {0};
__m128i v_y2 = {0};
__m128i v_y3 = {0};
- uint64_t v_tail_index = 0;
+ __m128i v_y4 = {0};
+ __m128i v_y5 = {0};
+ __m128i v_y6 = {0};
+ __m128i v_y7 = {0};
v_s = (4294967295u ^ self->private_impl.f_state);
while ((((uint64_t)(a_x.len)) > 0u) && ((15u & ((uint32_t)(0xFFFu & (uintptr_t)(a_x.ptr)))) != 0u)) {
v_s = (WUFFS_CRC32__IEEE_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ a_x.ptr[0u]))] ^ (v_s >> 8u));
a_x = wuffs_base__slice_u8__subslice_i(a_x, 1u);
}
- if (((uint64_t)(a_x.len)) < 64u) {
- {
- wuffs_base__slice_u8 i_slice_p = a_x;
- v_p.ptr = i_slice_p.ptr;
- v_p.len = 1;
- const uint8_t* i_end0_p = wuffs_private_impl__ptr_u8_plus_len(i_slice_p.ptr, i_slice_p.len);
- while (v_p.ptr < i_end0_p) {
- v_s = (WUFFS_CRC32__IEEE_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ v_p.ptr[0u]))] ^ (v_s >> 8u));
- v_p.ptr += 1;
- }
- v_p.len = 0;
+ if (((uint64_t)(a_x.len)) >= 128u) {
+ v_x0 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u));
+ v_x1 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16u));
+ v_x2 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 32u));
+ v_x3 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 48u));
+ v_x4 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 64u));
+ v_x5 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 80u));
+ v_x6 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 96u));
+ v_x7 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 112u));
+ v_kk = _mm_set_epi32((int32_t)(0u), (int32_t)(2433674945u), (int32_t)(0u), (int32_t)(872412467u));
+ v_x0 = _mm_xor_si128(v_x0, _mm_cvtsi32_si128((int32_t)(v_s)));
+ a_x = wuffs_base__slice_u8__subslice_i(a_x, 128u);
+ while (((uint64_t)(a_x.len)) >= 128u) {
+ v_y0 = _mm_clmulepi64_si128(v_x0, v_kk, (int32_t)(0u));
+ v_x0 = _mm_clmulepi64_si128(v_x0, v_kk, (int32_t)(17u));
+ v_y1 = _mm_clmulepi64_si128(v_x1, v_kk, (int32_t)(0u));
+ v_x1 = _mm_clmulepi64_si128(v_x1, v_kk, (int32_t)(17u));
+ v_y2 = _mm_clmulepi64_si128(v_x2, v_kk, (int32_t)(0u));
+ v_x2 = _mm_clmulepi64_si128(v_x2, v_kk, (int32_t)(17u));
+ v_y3 = _mm_clmulepi64_si128(v_x3, v_kk, (int32_t)(0u));
+ v_x3 = _mm_clmulepi64_si128(v_x3, v_kk, (int32_t)(17u));
+ v_y4 = _mm_clmulepi64_si128(v_x4, v_kk, (int32_t)(0u));
+ v_x4 = _mm_clmulepi64_si128(v_x4, v_kk, (int32_t)(17u));
+ v_y5 = _mm_clmulepi64_si128(v_x5, v_kk, (int32_t)(0u));
+ v_x5 = _mm_clmulepi64_si128(v_x5, v_kk, (int32_t)(17u));
+ v_y6 = _mm_clmulepi64_si128(v_x6, v_kk, (int32_t)(0u));
+ v_x6 = _mm_clmulepi64_si128(v_x6, v_kk, (int32_t)(17u));
+ v_y7 = _mm_clmulepi64_si128(v_x7, v_kk, (int32_t)(0u));
+ v_x7 = _mm_clmulepi64_si128(v_x7, v_kk, (int32_t)(17u));
+ v_y0 = _mm_xor_si128(v_y0, _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u)));
+ v_x0 = _mm_xor_si128(v_x0, v_y0);
+ v_y1 = _mm_xor_si128(v_y1, _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16u)));
+ v_x1 = _mm_xor_si128(v_x1, v_y1);
+ v_y2 = _mm_xor_si128(v_y2, _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 32u)));
+ v_x2 = _mm_xor_si128(v_x2, v_y2);
+ v_y3 = _mm_xor_si128(v_y3, _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 48u)));
+ v_x3 = _mm_xor_si128(v_x3, v_y3);
+ v_y4 = _mm_xor_si128(v_y4, _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 64u)));
+ v_x4 = _mm_xor_si128(v_x4, v_y4);
+ v_y5 = _mm_xor_si128(v_y5, _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 80u)));
+ v_x5 = _mm_xor_si128(v_x5, v_y5);
+ v_y6 = _mm_xor_si128(v_y6, _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 96u)));
+ v_x6 = _mm_xor_si128(v_x6, v_y6);
+ v_y7 = _mm_xor_si128(v_y7, _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 112u)));
+ v_x7 = _mm_xor_si128(v_x7, v_y7);
+ a_x = wuffs_base__slice_u8__subslice_i(a_x, 128u);
}
- self->private_impl.f_state = (4294967295u ^ v_s);
- return wuffs_base__make_empty_struct();
+ v_kk = _mm_set_epi32((int32_t)(0u), (int32_t)(3433693342u), (int32_t)(0u), (int32_t)(2926088593u));
+ v_y0 = _mm_clmulepi64_si128(v_x0, v_kk, (int32_t)(0u));
+ v_x0 = _mm_clmulepi64_si128(v_x0, v_kk, (int32_t)(17u));
+ v_y2 = _mm_clmulepi64_si128(v_x2, v_kk, (int32_t)(0u));
+ v_x2 = _mm_clmulepi64_si128(v_x2, v_kk, (int32_t)(17u));
+ v_y4 = _mm_clmulepi64_si128(v_x4, v_kk, (int32_t)(0u));
+ v_x4 = _mm_clmulepi64_si128(v_x4, v_kk, (int32_t)(17u));
+ v_y6 = _mm_clmulepi64_si128(v_x6, v_kk, (int32_t)(0u));
+ v_x6 = _mm_clmulepi64_si128(v_x6, v_kk, (int32_t)(17u));
+ v_y0 = _mm_xor_si128(v_y0, v_x1);
+ v_x0 = _mm_xor_si128(v_x0, v_y0);
+ v_y2 = _mm_xor_si128(v_y2, v_x3);
+ v_x2 = _mm_xor_si128(v_x2, v_y2);
+ v_y4 = _mm_xor_si128(v_y4, v_x5);
+ v_x4 = _mm_xor_si128(v_x4, v_y4);
+ v_y6 = _mm_xor_si128(v_y6, v_x7);
+ v_x6 = _mm_xor_si128(v_x6, v_y6);
+ v_kk = _mm_set_epi32((int32_t)(0u), (int32_t)(2166711591u), (int32_t)(0u), (int32_t)(4057597354u));
+ v_y0 = _mm_clmulepi64_si128(v_x0, v_kk, (int32_t)(0u));
+ v_x0 = _mm_clmulepi64_si128(v_x0, v_kk, (int32_t)(17u));
+ v_y4 = _mm_clmulepi64_si128(v_x4, v_kk, (int32_t)(0u));
+ v_x4 = _mm_clmulepi64_si128(v_x4, v_kk, (int32_t)(17u));
+ v_y0 = _mm_xor_si128(v_y0, v_x2);
+ v_x0 = _mm_xor_si128(v_x0, v_y0);
+ v_y4 = _mm_xor_si128(v_y4, v_x6);
+ v_x4 = _mm_xor_si128(v_x4, v_y4);
+ v_kk = _mm_set_epi32((int32_t)(0u), (int32_t)(496309207u), (int32_t)(0u), (int32_t)(2402626965u));
+ v_y0 = _mm_clmulepi64_si128(v_x0, v_kk, (int32_t)(0u));
+ v_x0 = _mm_clmulepi64_si128(v_x0, v_kk, (int32_t)(17u));
+ v_y0 = _mm_xor_si128(v_y0, v_x4);
+ v_x0 = _mm_xor_si128(v_x0, v_y0);
+ v_kk = _mm_set_epi32((int32_t)(1u), (int32_t)(3681617473u), (int32_t)(3034951717u), (int32_t)(4144043585u));
+ v_s = ((uint32_t)(_mm_extract_epi32(_mm_clmulepi64_si128(_mm_clmulepi64_si128(_mm_cvtsi64_si128((int64_t)(((uint64_t)(_mm_extract_epi64(v_x0, (int32_t)(0u)))))), v_kk, (int32_t)(0u)), v_kk, (int32_t)(16u)), (int32_t)(2u))));
+ v_kk = _mm_set_epi32((int32_t)(1u), (int32_t)(3681617473u), (int32_t)(3034951717u), (int32_t)(4144043585u));
+ v_s = ((uint32_t)(_mm_extract_epi32(_mm_clmulepi64_si128(_mm_clmulepi64_si128(_mm_cvtsi64_si128((int64_t)((((uint64_t)(_mm_extract_epi64(v_x0, (int32_t)(1u)))) ^ ((uint64_t)(v_s))))), v_kk, (int32_t)(0u)), v_kk, (int32_t)(16u)), (int32_t)(2u))));
}
- v_x0 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u));
- v_x1 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16u));
- v_x2 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 32u));
- v_x3 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 48u));
- v_x0 = _mm_xor_si128(v_x0, _mm_cvtsi32_si128((int32_t)(v_s)));
- v_k1k2 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K1K2));
- {
- wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, 64u);
- v_p.ptr = i_slice_p.ptr;
- v_p.len = 64;
- const uint8_t* i_end0_p = wuffs_private_impl__ptr_u8_plus_len(v_p.ptr, (((i_slice_p.len - (size_t)(v_p.ptr - i_slice_p.ptr)) / 64) * 64));
- while (v_p.ptr < i_end0_p) {
- v_y0 = _mm_clmulepi64_si128(v_x0, v_k1k2, (int32_t)(0u));
- v_y1 = _mm_clmulepi64_si128(v_x1, v_k1k2, (int32_t)(0u));
- v_y2 = _mm_clmulepi64_si128(v_x2, v_k1k2, (int32_t)(0u));
- v_y3 = _mm_clmulepi64_si128(v_x3, v_k1k2, (int32_t)(0u));
- v_x0 = _mm_clmulepi64_si128(v_x0, v_k1k2, (int32_t)(17u));
- v_x1 = _mm_clmulepi64_si128(v_x1, v_k1k2, (int32_t)(17u));
- v_x2 = _mm_clmulepi64_si128(v_x2, v_k1k2, (int32_t)(17u));
- v_x3 = _mm_clmulepi64_si128(v_x3, v_k1k2, (int32_t)(17u));
- v_x0 = _mm_xor_si128(_mm_xor_si128(v_x0, v_y0), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 0u)));
- v_x1 = _mm_xor_si128(_mm_xor_si128(v_x1, v_y1), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 16u)));
- v_x2 = _mm_xor_si128(_mm_xor_si128(v_x2, v_y2), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 32u)));
- v_x3 = _mm_xor_si128(_mm_xor_si128(v_x3, v_y3), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 48u)));
- v_p.ptr += 64;
- }
- v_p.len = 0;
+ while (((uint64_t)(a_x.len)) >= 8u) {
+ v_kk = _mm_set_epi32((int32_t)(1u), (int32_t)(3681617473u), (int32_t)(3034951717u), (int32_t)(4144043585u));
+ v_s = ((uint32_t)(_mm_extract_epi32(_mm_clmulepi64_si128(_mm_clmulepi64_si128(_mm_cvtsi64_si128((int64_t)((wuffs_base__peek_u64le__no_bounds_check(a_x.ptr) ^ ((uint64_t)(v_s))))), v_kk, (int32_t)(0u)), v_kk, (int32_t)(16u)), (int32_t)(2u))));
+ a_x = wuffs_base__slice_u8__subslice_i(a_x, 8u);
}
- v_k3k4 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K3K4));
- v_y0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(0u));
- v_x0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(17u));
- v_x0 = _mm_xor_si128(v_x0, v_x1);
- v_x0 = _mm_xor_si128(v_x0, v_y0);
- v_y0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(0u));
- v_x0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(17u));
- v_x0 = _mm_xor_si128(v_x0, v_x2);
- v_x0 = _mm_xor_si128(v_x0, v_y0);
- v_y0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(0u));
- v_x0 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(17u));
- v_x0 = _mm_xor_si128(v_x0, v_x3);
- v_x0 = _mm_xor_si128(v_x0, v_y0);
- v_x1 = _mm_clmulepi64_si128(v_x0, v_k3k4, (int32_t)(16u));
- v_x2 = _mm_set_epi32((int32_t)(0u), (int32_t)(4294967295u), (int32_t)(0u), (int32_t)(4294967295u));
- v_x0 = _mm_srli_si128(v_x0, (int32_t)(8u));
- v_x0 = _mm_xor_si128(v_x0, v_x1);
- v_k5zz = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K5ZZ));
- v_x1 = _mm_srli_si128(v_x0, (int32_t)(4u));
- v_x0 = _mm_and_si128(v_x0, v_x2);
- v_x0 = _mm_clmulepi64_si128(v_x0, v_k5zz, (int32_t)(0u));
- v_x0 = _mm_xor_si128(v_x0, v_x1);
- v_pxmu = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_PXMU));
- v_x1 = _mm_and_si128(v_x0, v_x2);
- v_x1 = _mm_clmulepi64_si128(v_x1, v_pxmu, (int32_t)(16u));
- v_x1 = _mm_and_si128(v_x1, v_x2);
- v_x1 = _mm_clmulepi64_si128(v_x1, v_pxmu, (int32_t)(0u));
- v_x1 = _mm_xor_si128(v_x1, v_x0);
- v_s = ((uint32_t)(_mm_extract_epi32(v_x1, (int32_t)(1u))));
- v_tail_index = (((uint64_t)(a_x.len)) & 18446744073709551552u);
- if (v_tail_index < ((uint64_t)(a_x.len))) {
- {
- wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, v_tail_index);
- v_p.ptr = i_slice_p.ptr;
- v_p.len = 1;
- const uint8_t* i_end0_p = wuffs_private_impl__ptr_u8_plus_len(i_slice_p.ptr, i_slice_p.len);
- while (v_p.ptr < i_end0_p) {
- v_s = (WUFFS_CRC32__IEEE_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ v_p.ptr[0u]))] ^ (v_s >> 8u));
- v_p.ptr += 1;
- }
- v_p.len = 0;
- }
+ while (((uint64_t)(a_x.len)) > 0u) {
+ v_s = (WUFFS_CRC32__IEEE_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ a_x.ptr[0u]))] ^ (v_s >> 8u));
+ a_x = wuffs_base__slice_u8__subslice_i(a_x, 1u);
}
self->private_impl.f_state = (4294967295u ^ v_s);
return wuffs_base__make_empty_struct();
diff --git a/script/print-crc32-x86-sse42-code.go b/script/print-crc32-x86-sse42-code.go
new file mode 100644
index 0000000..f6c5db0
--- /dev/null
+++ b/script/print-crc32-x86-sse42-code.go
@@ -0,0 +1,167 @@
+// Copyright 2024 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+//
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+//go:build ignore
+// +build ignore
+
+package main
+
+// print-crc32-x86-sse42-code.go prints the std/crc32 x86/SSE4.2 Wuffs code
+// based on some C code generated by https://github.com/corsix/fast-crc32/
+//
+// Usage: go run print-crc32-x86-sse42-code.go
+
+import (
+ "fmt"
+ "regexp"
+ "strconv"
+ "strings"
+)
+
+func main() {
+ var (
+ reXEqLoadu = regexp.MustCompile(`^__m128i x(\d+) = _mm_loadu_si128`)
+ reKEqSetr = regexp.MustCompile(`^k = _mm_setr_epi32\(([^,]+), ([^,]+), ([^,]+), ([^\)]+)\);$`)
+ reYEqClmul = regexp.MustCompile(`^y(\d+) = clmul_lo\(x(\d+), k\), x(\d+) = clmul_hi\(x(\d+), k\);$`)
+ reYEqXorLoadu = regexp.MustCompile(`^y(\d+) = _mm_xor_si128\(y(\d+), _mm_loadu_si128`)
+ reYEqXorYX = regexp.MustCompile(`^y(\d+) = _mm_xor_si128\(y(\d+), x(\d+)\), x(\d+) = _mm_xor_si128\(x(\d+), y(\d+)\);$`)
+ )
+
+ fmt.Println("// BEGIN script/print-crc32-x86-sse42-code.go generated code.")
+ for src := srcSSECRC32V8; src != ""; {
+ i := strings.IndexByte(src, '\n')
+ line := strings.TrimSpace(src[:i])
+ src = src[i+1:]
+
+ if (line == "") || strings.HasPrefix(line, "/*") {
+ continue
+
+ } else if s := reXEqLoadu.FindStringSubmatch(line); len(s) > 0 {
+ n, _ := strconv.Atoi(s[1])
+ fmt.Printf("x%d = util.make_m128i_slice128(a: args.x[0x%02X .. 0x%02X])\n", n, 16*(n), 16*(n+1))
+
+ } else if line == "__m128i k;" {
+ continue
+
+ } else if s := reKEqSetr.FindStringSubmatch(line); len(s) > 0 {
+ fmt.Printf("kk = util.make_m128i_multiple_u32(a00: %s, a01: %s, a02: %s, a03: %s)\n", s[1], s[2], s[3], s[4])
+
+ } else if line == "x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);" {
+ fmt.Printf("x0 = x0._mm_xor_si128(b: util.make_m128i_single_u32(a: s))\n")
+
+ } else if line == "buf += 128;" {
+ fmt.Printf("args.x = args.x[128 ..]\n")
+
+ } else if line == "len -= 128;" {
+ continue
+
+ } else if line == "while (len >= 128) {" {
+ fmt.Printf("while args.x.length() >= 128 {\n")
+
+ } else if line == "}" {
+ fmt.Printf("} endwhile\n")
+
+ } else if s := reYEqClmul.FindStringSubmatch(line); len(s) > 0 {
+ fmt.Printf("y%s = x%s._mm_clmulepi64_si128(b: kk, imm8: 0x00)\n", s[1], s[2])
+ fmt.Printf("x%s = x%s._mm_clmulepi64_si128(b: kk, imm8: 0x11)\n", s[3], s[4])
+
+ } else if s := reYEqXorLoadu.FindStringSubmatch(line); len(s) > 0 {
+ n, _ := strconv.Atoi(s[1])
+ fmt.Printf("y%d = y%d._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x%02X .. 0x%02X]))\n", n, n, 16*(n), 16*(n+1))
+ fmt.Printf("x%d = x%d._mm_xor_si128(b: y%d)\n", n, n, n)
+
+ } else if s := reYEqXorYX.FindStringSubmatch(line); len(s) > 0 {
+ fmt.Printf("y%s = y%s._mm_xor_si128(b: x%s)\n", s[1], s[2], s[3])
+ fmt.Printf("x%s = x%s._mm_xor_si128(b: y%s)\n", s[4], s[5], s[6])
+
+ } else if line == "crc0 = crc_u64(0, _mm_extract_epi64(x0, 0));" {
+ fmt.Printf("kk = util.make_m128i_multiple_u32(a00: 0xF701_1641, a01: 0xB4E5_B025, a02: 0xDB71_0641, a03: 1)\n")
+ fmt.Printf("s = util.make_m128i_single_u64(a: x0._mm_extract_epi64(imm8: 0)).\n")
+ fmt.Printf(" _mm_clmulepi64_si128(b: kk, imm8: 0x00).\n")
+ fmt.Printf(" _mm_clmulepi64_si128(b: kk, imm8: 0x10).\n")
+ fmt.Printf(" _mm_extract_epi32(imm8: 2)\n")
+
+ // fmt.Printf("s = util.make_m128i_single_u64(a: (s as base.u64) ^ args.x.peek_u64le()).\n")
+ } else if line == "crc0 = crc_u64(crc0, _mm_extract_epi64(x0, 1));" {
+ fmt.Printf("kk = util.make_m128i_multiple_u32(a00: 0xF701_1641, a01: 0xB4E5_B025, a02: 0xDB71_0641, a03: 1)\n")
+ fmt.Printf("s = util.make_m128i_single_u64(a: x0._mm_extract_epi64(imm8: 1) ^ (s as base.u64)).\n")
+ fmt.Printf(" _mm_clmulepi64_si128(b: kk, imm8: 0x00).\n")
+ fmt.Printf(" _mm_clmulepi64_si128(b: kk, imm8: 0x10).\n")
+ fmt.Printf(" _mm_extract_epi32(imm8: 2)\n")
+
+ } else {
+ fmt.Printf("// Could not process %q.\n", line)
+ break
+ }
+ }
+ fmt.Println("// END script/print-crc32-x86-sse42-code.go generated code.")
+}
+
+// This is the core (inside "if (len >= 128)") of the code produced by
+// generate.c in https://github.com/corsix/fast-crc32/ when parameterized by
+// "./generate -i sse -p crc32 -a v8".
+const srcSSECRC32V8 = `
+ /* First vector chunk. */
+ __m128i x0 = _mm_loadu_si128((const __m128i*)buf), y0;
+ __m128i x1 = _mm_loadu_si128((const __m128i*)(buf + 16)), y1;
+ __m128i x2 = _mm_loadu_si128((const __m128i*)(buf + 32)), y2;
+ __m128i x3 = _mm_loadu_si128((const __m128i*)(buf + 48)), y3;
+ __m128i x4 = _mm_loadu_si128((const __m128i*)(buf + 64)), y4;
+ __m128i x5 = _mm_loadu_si128((const __m128i*)(buf + 80)), y5;
+ __m128i x6 = _mm_loadu_si128((const __m128i*)(buf + 96)), y6;
+ __m128i x7 = _mm_loadu_si128((const __m128i*)(buf + 112)), y7;
+ __m128i k;
+ k = _mm_setr_epi32(0x33fff533, 0, 0x910eeec1, 0);
+ x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);
+ buf += 128;
+ len -= 128;
+ /* Main loop. */
+ while (len >= 128) {
+ y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+ y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
+ y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+ y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
+ y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
+ y5 = clmul_lo(x5, k), x5 = clmul_hi(x5, k);
+ y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
+ y7 = clmul_lo(x7, k), x7 = clmul_hi(x7, k);
+ y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i*)buf)), x0 = _mm_xor_si128(x0, y0);
+ y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i*)(buf + 16))), x1 = _mm_xor_si128(x1, y1);
+ y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i*)(buf + 32))), x2 = _mm_xor_si128(x2, y2);
+ y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i*)(buf + 48))), x3 = _mm_xor_si128(x3, y3);
+ y4 = _mm_xor_si128(y4, _mm_loadu_si128((const __m128i*)(buf + 64))), x4 = _mm_xor_si128(x4, y4);
+ y5 = _mm_xor_si128(y5, _mm_loadu_si128((const __m128i*)(buf + 80))), x5 = _mm_xor_si128(x5, y5);
+ y6 = _mm_xor_si128(y6, _mm_loadu_si128((const __m128i*)(buf + 96))), x6 = _mm_xor_si128(x6, y6);
+ y7 = _mm_xor_si128(y7, _mm_loadu_si128((const __m128i*)(buf + 112))), x7 = _mm_xor_si128(x7, y7);
+ buf += 128;
+ len -= 128;
+ }
+ /* Reduce x0 ... x7 to just x0. */
+ k = _mm_setr_epi32(0xae689191, 0, 0xccaa009e, 0);
+ y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+ y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+ y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
+ y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
+ y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0);
+ y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2);
+ y4 = _mm_xor_si128(y4, x5), x4 = _mm_xor_si128(x4, y4);
+ y6 = _mm_xor_si128(y6, x7), x6 = _mm_xor_si128(x6, y6);
+ k = _mm_setr_epi32(0xf1da05aa, 0, 0x81256527, 0);
+ y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+ y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
+ y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0);
+ y4 = _mm_xor_si128(y4, x6), x4 = _mm_xor_si128(x4, y4);
+ k = _mm_setr_epi32(0x8f352d95, 0, 0x1d9513d7, 0);
+ y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+ y0 = _mm_xor_si128(y0, x4), x0 = _mm_xor_si128(x0, y0);
+ /* Reduce 128 bits to 32 bits, and multiply by x^32. */
+ crc0 = crc_u64(0, _mm_extract_epi64(x0, 0));
+ crc0 = crc_u64(crc0, _mm_extract_epi64(x0, 1));
+`
diff --git a/script/print-crc32-x86-sse42-magic-numbers.go b/script/print-crc32-x86-sse42-magic-numbers.go
index 226c790..fd866c4 100644
--- a/script/print-crc32-x86-sse42-magic-numbers.go
+++ b/script/print-crc32-x86-sse42-magic-numbers.go
@@ -13,6 +13,8 @@
package main
+// This program is obsolete.
+//
// print-crc32-x86-sse42-magic-numbers.go prints the std/crc32
// IEEE_X86_SSE42_ETC magic number tables.
//
diff --git a/std/crc32/common_crc32.wuffs b/std/crc32/common_crc32.wuffs
index 13c1d8c..4ab951c 100644
--- a/std/crc32/common_crc32.wuffs
+++ b/std/crc32/common_crc32.wuffs
@@ -28,7 +28,6 @@
if this.state == 0 {
choose up = [
up_arm_crc32,
- up_x86_avx2,
up_x86_sse42]
}
this.up!(x: args.x)
diff --git a/std/crc32/common_up_x86_avx2.wuffs b/std/crc32/common_up_x86_avx2.wuffs
deleted file mode 100644
index 108beb7..0000000
--- a/std/crc32/common_up_x86_avx2.wuffs
+++ /dev/null
@@ -1,137 +0,0 @@
-// Copyright 2021 The Wuffs Authors.
-//
-// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
-// option. This file may not be copied, modified, or distributed
-// except according to those terms.
-//
-// SPDX-License-Identifier: Apache-2.0 OR MIT
-
-// --------
-
-// See "SIMD Implementations" in README.md for a link to Gopal et al. "Fast CRC
-// Computation for Generic Polynomials Using PCLMULQDQ Instruction".
-
-// up_x86_avx2 is exactly the same as up_x86_sse42 except for the "choose
-// cpu_arch >= x86_avx2". With AVX, PCLMULQDQ has a three-operand form, not
-// just a two-operand form: https://www.felixcloutier.com/x86/pclmulqdq
-pri func ieee_hasher.up_x86_avx2!(x: roslice base.u8),
- choose cpu_arch >= x86_avx2,
-{
- var s : base.u32
- var p : roslice base.u8
-
- var util : base.x86_sse42_utility
- var k1k2 : base.x86_m128i
- var k3k4 : base.x86_m128i
- var k5zz : base.x86_m128i
- var pxmu : base.x86_m128i
- var x0 : base.x86_m128i
- var x1 : base.x86_m128i
- var x2 : base.x86_m128i
- var x3 : base.x86_m128i
- var y0 : base.x86_m128i
- var y1 : base.x86_m128i
- var y2 : base.x86_m128i
- var y3 : base.x86_m128i
-
- var tail_index : base.u64
-
- s = 0xFFFF_FFFF ^ this.state
-
- // Align to a 16-byte boundary.
- while (args.x.length() > 0) and ((15 & args.x.uintptr_low_12_bits()) <> 0) {
- s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ args.x[0]] ^ (s >> 8)
- args.x = args.x[1 ..]
- } endwhile
-
- // For short inputs, just do a simple loop.
- if args.x.length() < 64 {
- iterate (p = args.x)(length: 1, advance: 1, unroll: 1) {
- s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
- }
- this.state = 0xFFFF_FFFF ^ s
- return nothing
- }
-
- // Load 128×4 = 512 bits from the first 64-byte chunk.
- x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])
- x1 = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
- x2 = util.make_m128i_slice128(a: args.x[0x20 .. 0x30])
- x3 = util.make_m128i_slice128(a: args.x[0x30 .. 0x40])
-
- // Combine with the initial state.
- x0 = x0._mm_xor_si128(b: util.make_m128i_single_u32(a: s))
-
- // Process the remaining 64-byte chunks.
- k1k2 = util.make_m128i_slice128(a: IEEE_X86_SSE42_K1K2[.. 16])
- iterate (p = args.x[64 ..])(length: 64, advance: 64, unroll: 1) {
- y0 = x0._mm_clmulepi64_si128(b: k1k2, imm8: 0x00)
- y1 = x1._mm_clmulepi64_si128(b: k1k2, imm8: 0x00)
- y2 = x2._mm_clmulepi64_si128(b: k1k2, imm8: 0x00)
- y3 = x3._mm_clmulepi64_si128(b: k1k2, imm8: 0x00)
-
- x0 = x0._mm_clmulepi64_si128(b: k1k2, imm8: 0x11)
- x1 = x1._mm_clmulepi64_si128(b: k1k2, imm8: 0x11)
- x2 = x2._mm_clmulepi64_si128(b: k1k2, imm8: 0x11)
- x3 = x3._mm_clmulepi64_si128(b: k1k2, imm8: 0x11)
-
- x0 = x0._mm_xor_si128(b: y0)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x00 .. 0x10]))
- x1 = x1._mm_xor_si128(b: y1)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x10 .. 0x20]))
- x2 = x2._mm_xor_si128(b: y2)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x20 .. 0x30]))
- x3 = x3._mm_xor_si128(b: y3)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x30 .. 0x40]))
- }
-
- // Reduce 128×4 = 512 bits to 128 bits.
- k3k4 = util.make_m128i_slice128(a: IEEE_X86_SSE42_K3K4[.. 16])
- y0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x00)
- x0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x11)
- x0 = x0._mm_xor_si128(b: x1)
- x0 = x0._mm_xor_si128(b: y0)
- y0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x00)
- x0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x11)
- x0 = x0._mm_xor_si128(b: x2)
- x0 = x0._mm_xor_si128(b: y0)
- y0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x00)
- x0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x11)
- x0 = x0._mm_xor_si128(b: x3)
- x0 = x0._mm_xor_si128(b: y0)
-
- // Reduce 128 bits to 64 bits.
- x1 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x10)
- x2 = util.make_m128i_multiple_u32(
- a00: 0xFFFF_FFFF,
- a01: 0x0000_0000,
- a02: 0xFFFF_FFFF,
- a03: 0x0000_0000)
- x0 = x0._mm_srli_si128(imm8: 8)
- x0 = x0._mm_xor_si128(b: x1)
- k5zz = util.make_m128i_slice128(a: IEEE_X86_SSE42_K5ZZ[.. 16])
- x1 = x0._mm_srli_si128(imm8: 4)
- x0 = x0._mm_and_si128(b: x2)
- x0 = x0._mm_clmulepi64_si128(b: k5zz, imm8: 0x00)
- x0 = x0._mm_xor_si128(b: x1)
-
- // Reduce 64 bits to 32 bits (Barrett Reduction) and extract.
- //
- // Barrett Reduction is Algorithm 1 (page 14) of Gopal et al., after
- // adjusting for bit-reflection as per Figure 12 (page 21).
- pxmu = util.make_m128i_slice128(a: IEEE_X86_SSE42_PXMU[.. 16])
- x1 = x0._mm_and_si128(b: x2)
- x1 = x1._mm_clmulepi64_si128(b: pxmu, imm8: 0x10)
- x1 = x1._mm_and_si128(b: x2)
- x1 = x1._mm_clmulepi64_si128(b: pxmu, imm8: 0x00)
- x1 = x1._mm_xor_si128(b: x0)
- s = x1._mm_extract_epi32(imm8: 1)
-
- // Handle the tail of args.x that wasn't a complete 64-byte chunk.
- tail_index = args.x.length() & 0xFFFF_FFFF_FFFF_FFC0 // And-not 64.
- if tail_index < args.x.length() {
- iterate (p = args.x[tail_index ..])(length: 1, advance: 1, unroll: 1) {
- s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
- }
- }
-
- this.state = 0xFFFF_FFFF ^ s
-}
diff --git a/std/crc32/common_up_x86_sse42.wuffs b/std/crc32/common_up_x86_sse42.wuffs
index b1fb4e0..53a1f6d 100644
--- a/std/crc32/common_up_x86_sse42.wuffs
+++ b/std/crc32/common_up_x86_sse42.wuffs
@@ -8,32 +8,29 @@
//
// SPDX-License-Identifier: Apache-2.0 OR MIT
-// --------
-
-// See "SIMD Implementations" in README.md for a link to Gopal et al. "Fast CRC
-// Computation for Generic Polynomials Using PCLMULQDQ Instruction".
-
pri func ieee_hasher.up_x86_sse42!(x: roslice base.u8),
choose cpu_arch >= x86_sse42,
{
var s : base.u32
- var p : roslice base.u8
var util : base.x86_sse42_utility
- var k1k2 : base.x86_m128i
- var k3k4 : base.x86_m128i
- var k5zz : base.x86_m128i
- var pxmu : base.x86_m128i
+ var kk : base.x86_m128i
var x0 : base.x86_m128i
var x1 : base.x86_m128i
var x2 : base.x86_m128i
var x3 : base.x86_m128i
+ var x4 : base.x86_m128i
+ var x5 : base.x86_m128i
+ var x6 : base.x86_m128i
+ var x7 : base.x86_m128i
var y0 : base.x86_m128i
var y1 : base.x86_m128i
var y2 : base.x86_m128i
var y3 : base.x86_m128i
-
- var tail_index : base.u64
+ var y4 : base.x86_m128i
+ var y5 : base.x86_m128i
+ var y6 : base.x86_m128i
+ var y7 : base.x86_m128i
s = 0xFFFF_FFFF ^ this.state
@@ -43,125 +40,111 @@
args.x = args.x[1 ..]
} endwhile
- // For short inputs, just do a simple loop.
- if args.x.length() < 64 {
- iterate (p = args.x)(length: 1, advance: 1, unroll: 1) {
- s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
- }
- this.state = 0xFFFF_FFFF ^ s
- return nothing
+ if args.x.length() >= 128 {
+ // BEGIN script/print-crc32-x86-sse42-code.go generated code.
+ x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])
+ x1 = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
+ x2 = util.make_m128i_slice128(a: args.x[0x20 .. 0x30])
+ x3 = util.make_m128i_slice128(a: args.x[0x30 .. 0x40])
+ x4 = util.make_m128i_slice128(a: args.x[0x40 .. 0x50])
+ x5 = util.make_m128i_slice128(a: args.x[0x50 .. 0x60])
+ x6 = util.make_m128i_slice128(a: args.x[0x60 .. 0x70])
+ x7 = util.make_m128i_slice128(a: args.x[0x70 .. 0x80])
+ kk = util.make_m128i_multiple_u32(a00: 0x33FF_F533, a01: 0, a02: 0x910E_EEC1, a03: 0)
+ x0 = x0._mm_xor_si128(b: util.make_m128i_single_u32(a: s))
+ args.x = args.x[128 ..]
+ while args.x.length() >= 128 {
+ y0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x00)
+ x0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x11)
+ y1 = x1._mm_clmulepi64_si128(b: kk, imm8: 0x00)
+ x1 = x1._mm_clmulepi64_si128(b: kk, imm8: 0x11)
+ y2 = x2._mm_clmulepi64_si128(b: kk, imm8: 0x00)
+ x2 = x2._mm_clmulepi64_si128(b: kk, imm8: 0x11)
+ y3 = x3._mm_clmulepi64_si128(b: kk, imm8: 0x00)
+ x3 = x3._mm_clmulepi64_si128(b: kk, imm8: 0x11)
+ y4 = x4._mm_clmulepi64_si128(b: kk, imm8: 0x00)
+ x4 = x4._mm_clmulepi64_si128(b: kk, imm8: 0x11)
+ y5 = x5._mm_clmulepi64_si128(b: kk, imm8: 0x00)
+ x5 = x5._mm_clmulepi64_si128(b: kk, imm8: 0x11)
+ y6 = x6._mm_clmulepi64_si128(b: kk, imm8: 0x00)
+ x6 = x6._mm_clmulepi64_si128(b: kk, imm8: 0x11)
+ y7 = x7._mm_clmulepi64_si128(b: kk, imm8: 0x00)
+ x7 = x7._mm_clmulepi64_si128(b: kk, imm8: 0x11)
+ y0 = y0._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x00 .. 0x10]))
+ x0 = x0._mm_xor_si128(b: y0)
+ y1 = y1._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x10 .. 0x20]))
+ x1 = x1._mm_xor_si128(b: y1)
+ y2 = y2._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x20 .. 0x30]))
+ x2 = x2._mm_xor_si128(b: y2)
+ y3 = y3._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x30 .. 0x40]))
+ x3 = x3._mm_xor_si128(b: y3)
+ y4 = y4._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x40 .. 0x50]))
+ x4 = x4._mm_xor_si128(b: y4)
+ y5 = y5._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x50 .. 0x60]))
+ x5 = x5._mm_xor_si128(b: y5)
+ y6 = y6._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x60 .. 0x70]))
+ x6 = x6._mm_xor_si128(b: y6)
+ y7 = y7._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x70 .. 0x80]))
+ x7 = x7._mm_xor_si128(b: y7)
+ args.x = args.x[128 ..]
+ } endwhile
+ kk = util.make_m128i_multiple_u32(a00: 0xAE68_9191, a01: 0, a02: 0xCCAA_009E, a03: 0)
+ y0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x00)
+ x0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x11)
+ y2 = x2._mm_clmulepi64_si128(b: kk, imm8: 0x00)
+ x2 = x2._mm_clmulepi64_si128(b: kk, imm8: 0x11)
+ y4 = x4._mm_clmulepi64_si128(b: kk, imm8: 0x00)
+ x4 = x4._mm_clmulepi64_si128(b: kk, imm8: 0x11)
+ y6 = x6._mm_clmulepi64_si128(b: kk, imm8: 0x00)
+ x6 = x6._mm_clmulepi64_si128(b: kk, imm8: 0x11)
+ y0 = y0._mm_xor_si128(b: x1)
+ x0 = x0._mm_xor_si128(b: y0)
+ y2 = y2._mm_xor_si128(b: x3)
+ x2 = x2._mm_xor_si128(b: y2)
+ y4 = y4._mm_xor_si128(b: x5)
+ x4 = x4._mm_xor_si128(b: y4)
+ y6 = y6._mm_xor_si128(b: x7)
+ x6 = x6._mm_xor_si128(b: y6)
+ kk = util.make_m128i_multiple_u32(a00: 0xF1DA_05AA, a01: 0, a02: 0x8125_6527, a03: 0)
+ y0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x00)
+ x0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x11)
+ y4 = x4._mm_clmulepi64_si128(b: kk, imm8: 0x00)
+ x4 = x4._mm_clmulepi64_si128(b: kk, imm8: 0x11)
+ y0 = y0._mm_xor_si128(b: x2)
+ x0 = x0._mm_xor_si128(b: y0)
+ y4 = y4._mm_xor_si128(b: x6)
+ x4 = x4._mm_xor_si128(b: y4)
+ kk = util.make_m128i_multiple_u32(a00: 0x8F35_2D95, a01: 0, a02: 0x1D95_13D7, a03: 0)
+ y0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x00)
+ x0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x11)
+ y0 = y0._mm_xor_si128(b: x4)
+ x0 = x0._mm_xor_si128(b: y0)
+ kk = util.make_m128i_multiple_u32(a00: 0xF701_1641, a01: 0xB4E5_B025, a02: 0xDB71_0641, a03: 1)
+ s = util.make_m128i_single_u64(a: x0._mm_extract_epi64(imm8: 0)).
+ _mm_clmulepi64_si128(b: kk, imm8: 0x00).
+ _mm_clmulepi64_si128(b: kk, imm8: 0x10).
+ _mm_extract_epi32(imm8: 2)
+ kk = util.make_m128i_multiple_u32(a00: 0xF701_1641, a01: 0xB4E5_B025, a02: 0xDB71_0641, a03: 1)
+ s = util.make_m128i_single_u64(a: x0._mm_extract_epi64(imm8: 1) ^ (s as base.u64)).
+ _mm_clmulepi64_si128(b: kk, imm8: 0x00).
+ _mm_clmulepi64_si128(b: kk, imm8: 0x10).
+ _mm_extract_epi32(imm8: 2)
+ // END script/print-crc32-x86-sse42-code.go generated code.
}
- // Load 128×4 = 512 bits from the first 64-byte chunk.
- x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])
- x1 = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
- x2 = util.make_m128i_slice128(a: args.x[0x20 .. 0x30])
- x3 = util.make_m128i_slice128(a: args.x[0x30 .. 0x40])
+ while args.x.length() >= 8 {
+ kk = util.make_m128i_multiple_u32(a00: 0xF701_1641, a01: 0xB4E5_B025, a02: 0xDB71_0641, a03: 1)
+ s = util.make_m128i_single_u64(a: args.x.peek_u64le() ^ (s as base.u64)).
+ _mm_clmulepi64_si128(b: kk, imm8: 0x00).
+ _mm_clmulepi64_si128(b: kk, imm8: 0x10).
+ _mm_extract_epi32(imm8: 2)
+ args.x = args.x[8 ..]
+ } endwhile
- // Combine with the initial state.
- x0 = x0._mm_xor_si128(b: util.make_m128i_single_u32(a: s))
-
- // Process the remaining 64-byte chunks.
- k1k2 = util.make_m128i_slice128(a: IEEE_X86_SSE42_K1K2[.. 16])
- iterate (p = args.x[64 ..])(length: 64, advance: 64, unroll: 1) {
- y0 = x0._mm_clmulepi64_si128(b: k1k2, imm8: 0x00)
- y1 = x1._mm_clmulepi64_si128(b: k1k2, imm8: 0x00)
- y2 = x2._mm_clmulepi64_si128(b: k1k2, imm8: 0x00)
- y3 = x3._mm_clmulepi64_si128(b: k1k2, imm8: 0x00)
-
- x0 = x0._mm_clmulepi64_si128(b: k1k2, imm8: 0x11)
- x1 = x1._mm_clmulepi64_si128(b: k1k2, imm8: 0x11)
- x2 = x2._mm_clmulepi64_si128(b: k1k2, imm8: 0x11)
- x3 = x3._mm_clmulepi64_si128(b: k1k2, imm8: 0x11)
-
- x0 = x0._mm_xor_si128(b: y0)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x00 .. 0x10]))
- x1 = x1._mm_xor_si128(b: y1)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x10 .. 0x20]))
- x2 = x2._mm_xor_si128(b: y2)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x20 .. 0x30]))
- x3 = x3._mm_xor_si128(b: y3)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x30 .. 0x40]))
- }
-
- // Reduce 128×4 = 512 bits to 128 bits.
- k3k4 = util.make_m128i_slice128(a: IEEE_X86_SSE42_K3K4[.. 16])
- y0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x00)
- x0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x11)
- x0 = x0._mm_xor_si128(b: x1)
- x0 = x0._mm_xor_si128(b: y0)
- y0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x00)
- x0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x11)
- x0 = x0._mm_xor_si128(b: x2)
- x0 = x0._mm_xor_si128(b: y0)
- y0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x00)
- x0 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x11)
- x0 = x0._mm_xor_si128(b: x3)
- x0 = x0._mm_xor_si128(b: y0)
-
- // Reduce 128 bits to 64 bits.
- x1 = x0._mm_clmulepi64_si128(b: k3k4, imm8: 0x10)
- x2 = util.make_m128i_multiple_u32(
- a00: 0xFFFF_FFFF,
- a01: 0x0000_0000,
- a02: 0xFFFF_FFFF,
- a03: 0x0000_0000)
- x0 = x0._mm_srli_si128(imm8: 8)
- x0 = x0._mm_xor_si128(b: x1)
- k5zz = util.make_m128i_slice128(a: IEEE_X86_SSE42_K5ZZ[.. 16])
- x1 = x0._mm_srli_si128(imm8: 4)
- x0 = x0._mm_and_si128(b: x2)
- x0 = x0._mm_clmulepi64_si128(b: k5zz, imm8: 0x00)
- x0 = x0._mm_xor_si128(b: x1)
-
- // Reduce 64 bits to 32 bits (Barrett Reduction) and extract.
- //
- // Barrett Reduction is Algorithm 1 (page 14) of Gopal et al., after
- // adjusting for bit-reflection as per Figure 12 (page 21).
- pxmu = util.make_m128i_slice128(a: IEEE_X86_SSE42_PXMU[.. 16])
- x1 = x0._mm_and_si128(b: x2)
- x1 = x1._mm_clmulepi64_si128(b: pxmu, imm8: 0x10)
- x1 = x1._mm_and_si128(b: x2)
- x1 = x1._mm_clmulepi64_si128(b: pxmu, imm8: 0x00)
- x1 = x1._mm_xor_si128(b: x0)
- s = x1._mm_extract_epi32(imm8: 1)
-
- // Handle the tail of args.x that wasn't a complete 64-byte chunk.
- tail_index = args.x.length() & 0xFFFF_FFFF_FFFF_FFC0 // And-not 64.
- if tail_index < args.x.length() {
- iterate (p = args.x[tail_index ..])(length: 1, advance: 1, unroll: 1) {
- s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
- }
- }
+ while args.x.length() > 0 {
+ s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ args.x[0]] ^ (s >> 8)
+ args.x = args.x[1 ..]
+ } endwhile
this.state = 0xFFFF_FFFF ^ s
}
-
-// These constants come from page 22 of Gopal et al. They are also reproduced
-// by script/print-crc32-x86-sse42-magic-numbers.go which is runnable online at
-// https://play.golang.org/p/wH1q6GfhKOE
-//
-// The k6' constant from the Gopal paper is unused.
-//
-// The rkN names match the numbers at
-// https://github.com/intel/isa-l/blob/7b30857e20b84e5afab1a28291189b9dc571110d/crc/crc32_gzip_refl_by16_10.asm#L475-L499
-//
-// The "+§" means a harmless off-by-one difference compared to Intel's numbers.
-// https://danlark.org/2021/03/08/how-a-bug-in-the-linux-crc-32-checksum-turned-out-not-to-be-a-bug/
-// https://github.com/google/wuffs/commit/b24e046670396d7ef22ccf499051340b9288419b
-
-pri const IEEE_X86_SSE42_K1K2 : roarray[16] base.u8 = [
- 0xD4, 0x2B, 0x44, 0x54, 0x01, 0x00, 0x00, 0x00, // k1' = 0x1_5444_2BD4 = rk16
- 0x96, 0x15, 0xE4, 0xC6, 0x01, 0x00, 0x00, 0x00, // k2' = 0x1_C6E4_1596 = rk15
-]
-
-pri const IEEE_X86_SSE42_K3K4 : roarray[16] base.u8 = [
- 0xD0, 0x97, 0x19, 0x75, 0x01, 0x00, 0x00, 0x00, // k3' = 0x1_7519_97D0 = rk2
- 0x9E, 0x00, 0xAA, 0xCC, 0x00, 0x00, 0x00, 0x00, // k4' = 0x0_CCAA_009E = rk1
-]
-
-pri const IEEE_X86_SSE42_K5ZZ : roarray[16] base.u8 = [
- 0x24, 0x61, 0xCD, 0x63, 0x01, 0x00, 0x00, 0x00, // k5' = 0x1_63CD_6124 = rk6
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Unused
-]
-
-pri const IEEE_X86_SSE42_PXMU : roarray[16] base.u8 = [
- 0x41, 0x06, 0x71, 0xDB, 0x01, 0x00, 0x00, 0x00, // Px' = 0x1_DB71_0641 = rk8+§
- 0x41, 0x16, 0x01, 0xF7, 0x01, 0x00, 0x00, 0x00, // μ' = 0x1_F701_1641 = rk7+§
-]