std/crc64: re-write x86_sse42 implementation
name old speed new speed delta
wuffs_crc64_ecma_10k/clang14 5.84GB/s ± 0% 23.43GB/s ± 0% +301.23% (p=0.008 n=5+5)
wuffs_crc64_ecma_100k/clang14 6.18GB/s ± 0% 30.23GB/s ± 1% +389.45% (p=0.008 n=5+5)
wuffs_crc64_ecma_10k/gcc12 5.87GB/s ± 1% 24.02GB/s ± 0% +309.21% (p=0.008 n=5+5)
wuffs_crc64_ecma_100k/gcc12 6.21GB/s ± 0% 30.68GB/s ± 0% +394.22% (p=0.008 n=5+5)
$ time example-mzcat < linux-6.8.2.tar.xz > /dev/null
Before: user 0m7.604s
After: user 0m7.432s
diff --git a/lang/check/optimize.go b/lang/check/optimize.go
index 42458e7..86974d5 100644
--- a/lang/check/optimize.go
+++ b/lang/check/optimize.go
@@ -49,8 +49,7 @@
return q.optimizeIOMethodAdvanceExpr(receiver, advanceExpr, update)
}
- // Check if receiver looks like "a[i .. j]" where i and j are constants and
- // ((j - i) >= advance).
+ // Check if the receiver looks like "a[i .. j]".
if _, i, j, ok := receiver.IsSlice(); ok {
icv := (*big.Int)(nil)
if i == nil {
@@ -64,11 +63,25 @@
jcv = j.ConstValue()
}
- if (icv != nil) && (jcv != nil) {
+ switch {
+ case (icv != nil) && (jcv != nil):
+ // OK if i and j are constants and ((j - i) >= advance).
n := big.NewInt(0).Sub(jcv, icv)
if n.Cmp(advance) >= 0 {
retOK = true
}
+
+ case (i != nil) && (j != nil) && (j.Operator() == t.IDXBinaryPlus):
+ // OK if j is (i + constant) and (constant >= advance).
+ n := (*big.Int)(nil)
+ if j.LHS().AsExpr().Eq(i) {
+ n = j.RHS().AsExpr().ConstValue()
+ } else if j.RHS().AsExpr().Eq(i) {
+ n = j.LHS().AsExpr().ConstValue()
+ }
+ if n != nil && (n.Cmp(advance) >= 0) {
+ retOK = true
+ }
}
}
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 63f1bf8..e2adb67 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -36597,27 +36597,33 @@
};
static const uint8_t
-WUFFS_CRC64__SHUFFLE_707F[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
- 112u, 113u, 114u, 115u, 116u, 117u, 118u, 119u,
- 120u, 121u, 122u, 123u, 124u, 125u, 126u, 127u,
-};
-
-static const uint8_t
-WUFFS_CRC64__SHUFFLE_8F80[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
- 143u, 142u, 141u, 140u, 139u, 138u, 137u, 136u,
- 135u, 134u, 133u, 132u, 131u, 130u, 129u, 128u,
-};
-
-static const uint8_t
-WUFFS_CRC64__ECMA_X86_SSE42_K3K4[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
+WUFFS_CRC64__ECMA_X86_SSE42_FOLD1[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
228u, 58u, 57u, 202u, 151u, 212u, 93u, 224u,
64u, 95u, 135u, 199u, 175u, 149u, 190u, 218u,
};
static const uint8_t
-WUFFS_CRC64__ECMA_X86_SSE42_PXMU[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
- 133u, 30u, 14u, 175u, 43u, 175u, 216u, 146u,
+WUFFS_CRC64__ECMA_X86_SSE42_FOLD2[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
+ 68u, 250u, 158u, 138u, 0u, 91u, 9u, 96u,
+ 81u, 175u, 225u, 15u, 163u, 83u, 230u, 59u,
+};
+
+static const uint8_t
+WUFFS_CRC64__ECMA_X86_SSE42_FOLD4[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
+ 243u, 65u, 212u, 157u, 187u, 239u, 227u, 106u,
+ 244u, 45u, 132u, 167u, 84u, 96u, 31u, 8u,
+};
+
+static const uint8_t
+WUFFS_CRC64__ECMA_X86_SSE42_FOLD8[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
+ 0u, 16u, 204u, 79u, 29u, 215u, 87u, 135u,
+ 64u, 231u, 61u, 247u, 42u, 107u, 216u, 215u,
+};
+
+static const uint8_t
+WUFFS_CRC64__ECMA_X86_SSE42_MUPX[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
213u, 99u, 41u, 23u, 108u, 70u, 62u, 156u,
+ 133u, 30u, 14u, 175u, 43u, 175u, 216u, 146u,
};
// ---------------- Private Initializer Prototypes
@@ -36899,89 +36905,122 @@
wuffs_base__slice_u8 a_x) {
uint64_t v_s = 0;
wuffs_base__slice_u8 v_p = {0};
- __m128i v_s0 = {0};
- __m128i v_s0_707F = {0};
- __m128i v_s0_8F80 = {0};
- __m128i v_x0 = {0};
- __m128i v_aa = {0};
- __m128i v_k3k4 = {0};
- __m128i v_t0 = {0};
- __m128i v_t1 = {0};
- __m128i v_t2 = {0};
- __m128i v_u0 = {0};
- __m128i v_u1 = {0};
- __m128i v_u2 = {0};
- __m128i v_v0 = {0};
- __m128i v_v1 = {0};
- __m128i v_pxmu = {0};
- __m128i v_w1 = {0};
- __m128i v_w2 = {0};
- uint64_t v_tail_index = 0;
+ uint8_t v_buf[48] = {0};
+ __m128i v_xa = {0};
+ __m128i v_xb = {0};
+ __m128i v_xc = {0};
+ __m128i v_xd = {0};
+ __m128i v_xe = {0};
+ __m128i v_xf = {0};
+ __m128i v_xg = {0};
+ __m128i v_xh = {0};
+ __m128i v_mu1 = {0};
+ __m128i v_mu2 = {0};
+ __m128i v_mu4 = {0};
+ __m128i v_mu8 = {0};
+ __m128i v_mupx = {0};
v_s = (18446744073709551615u ^ self->private_impl.f_state);
while ((((uint64_t)(a_x.len)) > 0u) && ((15u & ((uint32_t)(0xFFFu & (uintptr_t)(a_x.ptr)))) != 0u)) {
v_s = (WUFFS_CRC64__ECMA_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ a_x.ptr[0u]))] ^ (v_s >> 8u));
a_x = wuffs_base__slice_u8__subslice_i(a_x, 1u);
}
- if (((uint64_t)(a_x.len)) < 32u) {
- {
- wuffs_base__slice_u8 i_slice_p = a_x;
- v_p.ptr = i_slice_p.ptr;
- v_p.len = 1;
- const uint8_t* i_end0_p = wuffs_private_impl__ptr_u8_plus_len(i_slice_p.ptr, i_slice_p.len);
- while (v_p.ptr < i_end0_p) {
- v_s = (WUFFS_CRC64__ECMA_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ v_p.ptr[0u]))] ^ (v_s >> 8u));
- v_p.ptr += 1;
+ do {
+ do {
+ if (((uint64_t)(a_x.len)) >= 128u) {
+ } else if (((uint64_t)(a_x.len)) >= 64u) {
+ v_xa = _mm_xor_si128(_mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u)), _mm_cvtsi64_si128((int64_t)(v_s)));
+ v_xb = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16u));
+ v_xc = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 32u));
+ v_xd = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 48u));
+ a_x = wuffs_base__slice_u8__subslice_i(a_x, 64u);
+ break;
+ } else if (((uint64_t)(a_x.len)) >= 32u) {
+ v_xa = _mm_xor_si128(_mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u)), _mm_cvtsi64_si128((int64_t)(v_s)));
+ v_xb = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16u));
+ a_x = wuffs_base__slice_u8__subslice_i(a_x, 32u);
+ goto label__chain2__break;
+ } else {
+ {
+ wuffs_base__slice_u8 i_slice_p = a_x;
+ v_p.ptr = i_slice_p.ptr;
+ v_p.len = 1;
+ const uint8_t* i_end0_p = wuffs_private_impl__ptr_u8_plus_len(i_slice_p.ptr, i_slice_p.len);
+ while (v_p.ptr < i_end0_p) {
+ v_s = (WUFFS_CRC64__ECMA_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ v_p.ptr[0u]))] ^ (v_s >> 8u));
+ v_p.ptr += 1;
+ }
+ v_p.len = 0;
+ }
+ self->private_impl.f_state = (18446744073709551615u ^ v_s);
+ return wuffs_base__make_empty_struct();
}
- v_p.len = 0;
+ v_xa = _mm_xor_si128(_mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u)), _mm_cvtsi64_si128((int64_t)(v_s)));
+ v_xb = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16u));
+ v_xc = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 32u));
+ v_xd = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 48u));
+ v_xe = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 64u));
+ v_xf = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 80u));
+ v_xg = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 96u));
+ v_xh = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 112u));
+ a_x = wuffs_base__slice_u8__subslice_i(a_x, 128u);
+ v_mu8 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC64__ECMA_X86_SSE42_FOLD8));
+ while (((uint64_t)(a_x.len)) >= 128u) {
+ v_xa = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xa, v_mu8, (int32_t)(0u)), _mm_clmulepi64_si128(v_xa, v_mu8, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u)));
+ v_xb = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xb, v_mu8, (int32_t)(0u)), _mm_clmulepi64_si128(v_xb, v_mu8, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16u)));
+ v_xc = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xc, v_mu8, (int32_t)(0u)), _mm_clmulepi64_si128(v_xc, v_mu8, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 32u)));
+ v_xd = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xd, v_mu8, (int32_t)(0u)), _mm_clmulepi64_si128(v_xd, v_mu8, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 48u)));
+ v_xe = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xe, v_mu8, (int32_t)(0u)), _mm_clmulepi64_si128(v_xe, v_mu8, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 64u)));
+ v_xf = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xf, v_mu8, (int32_t)(0u)), _mm_clmulepi64_si128(v_xf, v_mu8, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 80u)));
+ v_xg = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xg, v_mu8, (int32_t)(0u)), _mm_clmulepi64_si128(v_xg, v_mu8, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 96u)));
+ v_xh = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xh, v_mu8, (int32_t)(0u)), _mm_clmulepi64_si128(v_xh, v_mu8, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 112u)));
+ a_x = wuffs_base__slice_u8__subslice_i(a_x, 128u);
+ }
+ v_mu4 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC64__ECMA_X86_SSE42_FOLD4));
+ v_xa = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xa, v_mu4, (int32_t)(0u)), _mm_clmulepi64_si128(v_xa, v_mu4, (int32_t)(17u))), v_xe);
+ v_xb = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xb, v_mu4, (int32_t)(0u)), _mm_clmulepi64_si128(v_xb, v_mu4, (int32_t)(17u))), v_xf);
+ v_xc = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xc, v_mu4, (int32_t)(0u)), _mm_clmulepi64_si128(v_xc, v_mu4, (int32_t)(17u))), v_xg);
+ v_xd = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xd, v_mu4, (int32_t)(0u)), _mm_clmulepi64_si128(v_xd, v_mu4, (int32_t)(17u))), v_xh);
+ if (((uint64_t)(a_x.len)) > 64u) {
+ v_xa = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xa, v_mu4, (int32_t)(0u)), _mm_clmulepi64_si128(v_xa, v_mu4, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u)));
+ v_xb = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xb, v_mu4, (int32_t)(0u)), _mm_clmulepi64_si128(v_xb, v_mu4, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16u)));
+ v_xc = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xc, v_mu4, (int32_t)(0u)), _mm_clmulepi64_si128(v_xc, v_mu4, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 32u)));
+ v_xd = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xd, v_mu4, (int32_t)(0u)), _mm_clmulepi64_si128(v_xd, v_mu4, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 48u)));
+ a_x = wuffs_base__slice_u8__subslice_i(a_x, 64u);
+ }
+ } while (0);
+ v_mu2 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC64__ECMA_X86_SSE42_FOLD2));
+ v_xa = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xa, v_mu2, (int32_t)(0u)), _mm_clmulepi64_si128(v_xa, v_mu2, (int32_t)(17u))), v_xc);
+ v_xb = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xb, v_mu2, (int32_t)(0u)), _mm_clmulepi64_si128(v_xb, v_mu2, (int32_t)(17u))), v_xd);
+ if (((uint64_t)(a_x.len)) > 32u) {
+ v_xa = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xa, v_mu2, (int32_t)(0u)), _mm_clmulepi64_si128(v_xa, v_mu2, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u)));
+ v_xb = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xb, v_mu2, (int32_t)(0u)), _mm_clmulepi64_si128(v_xb, v_mu2, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16u)));
+ a_x = wuffs_base__slice_u8__subslice_i(a_x, 32u);
}
- self->private_impl.f_state = (18446744073709551615u ^ v_s);
- return wuffs_base__make_empty_struct();
- }
- v_s0 = _mm_cvtsi64_si128((int64_t)(v_s));
- v_s0_707F = _mm_shuffle_epi8(v_s0, _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC64__SHUFFLE_707F)));
- v_s0_8F80 = _mm_shuffle_epi8(v_s0, _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC64__SHUFFLE_8F80)));
- v_x0 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u));
- a_x = wuffs_base__slice_u8__subslice_i(a_x, 16u);
- v_k3k4 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC64__ECMA_X86_SSE42_K3K4));
- v_t0 = _mm_xor_si128(v_s0_707F, v_x0);
- v_t1 = _mm_clmulepi64_si128(v_t0, v_k3k4, (int32_t)(0u));
- v_t2 = _mm_clmulepi64_si128(v_t0, v_k3k4, (int32_t)(17u));
- v_aa = _mm_xor_si128(_mm_xor_si128(v_t1, v_t2), v_s0_8F80);
- while (((uint64_t)(a_x.len)) >= 32u) {
- v_x0 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u));
+ } while (0);
+ label__chain2__break:;
+ v_mu1 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC64__ECMA_X86_SSE42_FOLD1));
+ v_xa = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xa, v_mu1, (int32_t)(0u)), _mm_clmulepi64_si128(v_xa, v_mu1, (int32_t)(17u))), v_xb);
+ if (((uint64_t)(a_x.len)) > 24u) {
+ v_xa = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(v_xa, v_mu1, (int32_t)(0u)), _mm_clmulepi64_si128(v_xa, v_mu1, (int32_t)(17u))), _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u)));
a_x = wuffs_base__slice_u8__subslice_i(a_x, 16u);
- v_u0 = _mm_xor_si128(v_aa, v_x0);
- v_u1 = _mm_clmulepi64_si128(v_u0, v_k3k4, (int32_t)(0u));
- v_u2 = _mm_clmulepi64_si128(v_u0, v_k3k4, (int32_t)(17u));
- v_aa = _mm_xor_si128(v_u1, v_u2);
- }
- if (((uint64_t)(a_x.len)) < 16u) {
- return wuffs_base__make_empty_struct();
- }
- v_x0 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0u));
- a_x = wuffs_base__slice_u8__subslice_i(a_x, 16u);
- v_v0 = _mm_xor_si128(v_aa, v_x0);
- v_v1 = _mm_clmulepi64_si128(v_v0, v_k3k4, (int32_t)(16u));
- v_aa = _mm_xor_si128(v_v1, _mm_srli_si128(v_v0, (int32_t)(8u)));
- v_pxmu = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC64__ECMA_X86_SSE42_PXMU));
- v_w1 = _mm_clmulepi64_si128(v_aa, v_pxmu, (int32_t)(16u));
- v_w2 = _mm_clmulepi64_si128(v_w1, v_pxmu, (int32_t)(0u));
- v_s = ((uint64_t)(_mm_extract_epi64(_mm_xor_si128(v_aa, _mm_xor_si128(v_w2, _mm_slli_si128(v_w1, (int32_t)(8u)))), (int32_t)(1u))));
- v_tail_index = (((uint64_t)(a_x.len)) & 18446744073709551600u);
- if (v_tail_index < ((uint64_t)(a_x.len))) {
- {
- wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, v_tail_index);
- v_p.ptr = i_slice_p.ptr;
- v_p.len = 1;
- const uint8_t* i_end0_p = wuffs_private_impl__ptr_u8_plus_len(i_slice_p.ptr, i_slice_p.len);
- while (v_p.ptr < i_end0_p) {
- v_s = (WUFFS_CRC64__ECMA_TABLE[0u][((uint8_t)(((uint8_t)(v_s)) ^ v_p.ptr[0u]))] ^ (v_s >> 8u));
- v_p.ptr += 1;
- }
- v_p.len = 0;
+ if (((uint64_t)(a_x.len)) > 24u) {
+ return wuffs_base__make_empty_struct();
}
}
+ _mm_storeu_si128((__m128i*)(void*)(v_buf + (24u - ((uint64_t)(a_x.len)))), v_xa);
+ wuffs_private_impl__slice_u8__copy_from_slice(wuffs_base__make_slice_u8_ij(v_buf, ((24u - ((uint64_t)(a_x.len))) + 16u), 48), a_x);
+ v_mu2 = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC64__ECMA_X86_SSE42_FOLD2));
+ v_xa = _mm_lddqu_si128((const __m128i*)(const void*)(v_buf + 0u));
+ v_xb = _mm_lddqu_si128((const __m128i*)(const void*)(v_buf + 16u));
+ v_xc = _mm_lddqu_si128((const __m128i*)(const void*)(v_buf + 32u));
+ v_xd = _mm_xor_si128(_mm_clmulepi64_si128(v_xa, v_mu2, (int32_t)(0u)), _mm_clmulepi64_si128(v_xa, v_mu2, (int32_t)(17u)));
+ v_xe = _mm_xor_si128(_mm_clmulepi64_si128(v_xb, v_mu1, (int32_t)(0u)), _mm_clmulepi64_si128(v_xb, v_mu1, (int32_t)(17u)));
+ v_xa = _mm_xor_si128(v_xd, _mm_xor_si128(v_xe, v_xc));
+ v_mupx = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC64__ECMA_X86_SSE42_MUPX));
+ v_xb = _mm_clmulepi64_si128(v_xa, v_mupx, (int32_t)(0u));
+ v_xc = _mm_clmulepi64_si128(v_xb, v_mupx, (int32_t)(16u));
+ v_s = ((uint64_t)(_mm_extract_epi64(_mm_xor_si128(_mm_xor_si128(v_xc, _mm_slli_si128(v_xb, (int32_t)(8u))), v_xa), (int32_t)(1u))));
self->private_impl.f_state = (18446744073709551615u ^ v_s);
return wuffs_base__make_empty_struct();
}
diff --git a/script/print-crc64-x86-sse42-magic-numbers.go b/script/print-crc64-x86-sse42-magic-numbers.go
index ff296d8..2fc76c2 100644
--- a/script/print-crc64-x86-sse42-magic-numbers.go
+++ b/script/print-crc64-x86-sse42-magic-numbers.go
@@ -105,10 +105,14 @@
}
func main() {
- show("Px'", px)
- calcKn("k1'", 512+64)
- calcKn("k2'", 512)
- calcKn("k3'", 128+64)
- calcKn("k4'", 128)
+ calcKn("fold1a'", 128+64)
+ calcKn("fold1b'", 128)
+ calcKn("fold2a'", 256+64)
+ calcKn("fold2b'", 256)
+ calcKn("fold4a'", 512+64)
+ calcKn("fold4b'", 512)
+ calcKn("fold8a'", 1024+64)
+ calcKn("fold8b'", 1024)
calcMu("μ' ")
+ show("Px'", px)
}
diff --git a/std/crc64/common_up_x86_sse42.wuffs b/std/crc64/common_up_x86_sse42.wuffs
index 0fac73b..9639769 100644
--- a/std/crc64/common_up_x86_sse42.wuffs
+++ b/std/crc64/common_up_x86_sse42.wuffs
@@ -12,6 +12,8 @@
// Like std/crc32's x86 SIMD implementation, this is based on Gopal et al.
// "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction".
+// It is further optimized a la
+// https://github.com/JoernEngel/joernblog/blob/778f0007b9a580e477608691f1aa86369f0efdd2/crc64.c
pri func ecma_hasher.up_x86_sse42!(x: roslice base.u8),
choose cpu_arch >= x86_sse42,
@@ -19,26 +21,22 @@
var s : base.u64
var p : roslice base.u8
- var util : base.x86_sse42_utility
- var s0 : base.x86_m128i
- var s0_707F : base.x86_m128i
- var s0_8F80 : base.x86_m128i
- var x0 : base.x86_m128i
- var aa : base.x86_m128i
- var k3k4 : base.x86_m128i
- var t0 : base.x86_m128i
- var t1 : base.x86_m128i
- var t2 : base.x86_m128i
- var u0 : base.x86_m128i
- var u1 : base.x86_m128i
- var u2 : base.x86_m128i
- var v0 : base.x86_m128i
- var v1 : base.x86_m128i
- var pxmu : base.x86_m128i
- var w1 : base.x86_m128i
- var w2 : base.x86_m128i
+ var buf : array[48] base.u8
- var tail_index : base.u64
+ var util : base.x86_sse42_utility
+ var xa : base.x86_m128i
+ var xb : base.x86_m128i
+ var xc : base.x86_m128i
+ var xd : base.x86_m128i
+ var xe : base.x86_m128i
+ var xf : base.x86_m128i
+ var xg : base.x86_m128i
+ var xh : base.x86_m128i
+ var mu1 : base.x86_m128i
+ var mu2 : base.x86_m128i
+ var mu4 : base.x86_m128i
+ var mu8 : base.x86_m128i
+ var mupx : base.x86_m128i
s = 0xFFFF_FFFF_FFFF_FFFF ^ this.state
@@ -48,8 +46,31 @@
args.x = args.x[1 ..]
} endwhile
- // For short inputs, just do a simple loop.
- if args.x.length() < 0x20 {
+ while.chain2 true {{
+ while.chain4 true {{
+ if args.x.length() >= 0x80 {
+ // No-op.
+
+ } else if args.x.length() >= 0x40 {
+ // Set up 4 accumulators.
+ xa = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])._mm_xor_si128(b:
+ util.make_m128i_single_u64(a: s))
+ xb = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
+ xc = util.make_m128i_slice128(a: args.x[0x20 .. 0x30])
+ xd = util.make_m128i_slice128(a: args.x[0x30 .. 0x40])
+ args.x = args.x[0x40 ..]
+ break.chain4
+
+ } else if args.x.length() >= 0x20 {
+ // Set up 2 accumulators.
+ xa = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])._mm_xor_si128(b:
+ util.make_m128i_single_u64(a: s))
+ xb = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
+ args.x = args.x[0x20 ..]
+ break.chain2
+
+ } else {
+ // For short inputs, just do a simple loop.
iterate (p = args.x)(length: 1, advance: 1, unroll: 1) {
s = ECMA_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
}
@@ -57,97 +78,174 @@
return nothing
}
- // Process N 16-byte chunks (1 on ramp, N-2 main loop, 1 off ramp). The
- // algorithm is loosely based on https://github.com/rawrunprotected/crc but
- // modified to always start and end on 16-byte alignment boundaries. It
- // also doesn't crash on zero-length input.
-
- // On ramp.
-
- s0 = util.make_m128i_single_u64(a: s)
- s0_707F = s0._mm_shuffle_epi8(b: util.make_m128i_slice128(a: SHUFFLE_707F[.. 16]))
- s0_8F80 = s0._mm_shuffle_epi8(b: util.make_m128i_slice128(a: SHUFFLE_8F80[.. 16]))
-
- x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])
- args.x = args.x[0x10 ..]
-
- k3k4 = util.make_m128i_slice128(a: ECMA_X86_SSE42_K3K4[.. 16])
- t0 = s0_707F._mm_xor_si128(b: x0)
- t1 = t0._mm_clmulepi64_si128(b: k3k4, imm8: 0x00)
- t2 = t0._mm_clmulepi64_si128(b: k3k4, imm8: 0x11)
- aa = t1._mm_xor_si128(b: t2)._mm_xor_si128(b: s0_8F80)
+ // Set up 8 accumulators.
+ xa = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])._mm_xor_si128(b:
+ util.make_m128i_single_u64(a: s))
+ xb = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
+ xc = util.make_m128i_slice128(a: args.x[0x20 .. 0x30])
+ xd = util.make_m128i_slice128(a: args.x[0x30 .. 0x40])
+ xe = util.make_m128i_slice128(a: args.x[0x40 .. 0x50])
+ xf = util.make_m128i_slice128(a: args.x[0x50 .. 0x60])
+ xg = util.make_m128i_slice128(a: args.x[0x60 .. 0x70])
+ xh = util.make_m128i_slice128(a: args.x[0x70 .. 0x80])
+ args.x = args.x[0x80 ..]
// Main loop.
-
- while args.x.length() >= 0x20 {
- x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])
- args.x = args.x[0x10 ..]
-
- u0 = aa._mm_xor_si128(b: x0)
- u1 = u0._mm_clmulepi64_si128(b: k3k4, imm8: 0x00)
- u2 = u0._mm_clmulepi64_si128(b: k3k4, imm8: 0x11)
- aa = u1._mm_xor_si128(b: u2)
+ mu8 = util.make_m128i_slice128(a: ECMA_X86_SSE42_FOLD8[.. 16])
+ while args.x.length() >= 0x80 {
+ xa = xa._mm_clmulepi64_si128(b: mu8, imm8: 0x00)._mm_xor_si128(b:
+ xa._mm_clmulepi64_si128(b: mu8, imm8: 0x11))._mm_xor_si128(b:
+ util.make_m128i_slice128(a: args.x[0x00 .. 0x10]))
+ xb = xb._mm_clmulepi64_si128(b: mu8, imm8: 0x00)._mm_xor_si128(b:
+ xb._mm_clmulepi64_si128(b: mu8, imm8: 0x11))._mm_xor_si128(b:
+ util.make_m128i_slice128(a: args.x[0x10 .. 0x20]))
+ xc = xc._mm_clmulepi64_si128(b: mu8, imm8: 0x00)._mm_xor_si128(b:
+ xc._mm_clmulepi64_si128(b: mu8, imm8: 0x11))._mm_xor_si128(b:
+ util.make_m128i_slice128(a: args.x[0x20 .. 0x30]))
+ xd = xd._mm_clmulepi64_si128(b: mu8, imm8: 0x00)._mm_xor_si128(b:
+ xd._mm_clmulepi64_si128(b: mu8, imm8: 0x11))._mm_xor_si128(b:
+ util.make_m128i_slice128(a: args.x[0x30 .. 0x40]))
+ xe = xe._mm_clmulepi64_si128(b: mu8, imm8: 0x00)._mm_xor_si128(b:
+ xe._mm_clmulepi64_si128(b: mu8, imm8: 0x11))._mm_xor_si128(b:
+ util.make_m128i_slice128(a: args.x[0x40 .. 0x50]))
+ xf = xf._mm_clmulepi64_si128(b: mu8, imm8: 0x00)._mm_xor_si128(b:
+ xf._mm_clmulepi64_si128(b: mu8, imm8: 0x11))._mm_xor_si128(b:
+ util.make_m128i_slice128(a: args.x[0x50 .. 0x60]))
+ xg = xg._mm_clmulepi64_si128(b: mu8, imm8: 0x00)._mm_xor_si128(b:
+ xg._mm_clmulepi64_si128(b: mu8, imm8: 0x11))._mm_xor_si128(b:
+ util.make_m128i_slice128(a: args.x[0x60 .. 0x70]))
+ xh = xh._mm_clmulepi64_si128(b: mu8, imm8: 0x00)._mm_xor_si128(b:
+ xh._mm_clmulepi64_si128(b: mu8, imm8: 0x11))._mm_xor_si128(b:
+ util.make_m128i_slice128(a: args.x[0x70 .. 0x80]))
+ args.x = args.x[0x80 ..]
} endwhile
- // Off ramp.
-
- if args.x.length() < 0x10 {
- // Unreachable, but the "if" is needed by the bounds checker.
- return nothing
+ // Reduce 8 chains down to 4.
+ mu4 = util.make_m128i_slice128(a: ECMA_X86_SSE42_FOLD4[.. 16])
+ xa = xa._mm_clmulepi64_si128(b: mu4, imm8: 0x00)._mm_xor_si128(b:
+ xa._mm_clmulepi64_si128(b: mu4, imm8: 0x11))._mm_xor_si128(b:
+ xe)
+ xb = xb._mm_clmulepi64_si128(b: mu4, imm8: 0x00)._mm_xor_si128(b:
+ xb._mm_clmulepi64_si128(b: mu4, imm8: 0x11))._mm_xor_si128(b:
+ xf)
+ xc = xc._mm_clmulepi64_si128(b: mu4, imm8: 0x00)._mm_xor_si128(b:
+ xc._mm_clmulepi64_si128(b: mu4, imm8: 0x11))._mm_xor_si128(b:
+ xg)
+ xd = xd._mm_clmulepi64_si128(b: mu4, imm8: 0x00)._mm_xor_si128(b:
+ xd._mm_clmulepi64_si128(b: mu4, imm8: 0x11))._mm_xor_si128(b:
+ xh)
+ if args.x.length() > 0x40 {
+ xa = xa._mm_clmulepi64_si128(b: mu4, imm8: 0x00)._mm_xor_si128(b:
+ xa._mm_clmulepi64_si128(b: mu4, imm8: 0x11))._mm_xor_si128(b:
+ util.make_m128i_slice128(a: args.x[0x00 .. 0x10]))
+ xb = xb._mm_clmulepi64_si128(b: mu4, imm8: 0x00)._mm_xor_si128(b:
+ xb._mm_clmulepi64_si128(b: mu4, imm8: 0x11))._mm_xor_si128(b:
+ util.make_m128i_slice128(a: args.x[0x10 .. 0x20]))
+ xc = xc._mm_clmulepi64_si128(b: mu4, imm8: 0x00)._mm_xor_si128(b:
+ xc._mm_clmulepi64_si128(b: mu4, imm8: 0x11))._mm_xor_si128(b:
+ util.make_m128i_slice128(a: args.x[0x20 .. 0x30]))
+ xd = xd._mm_clmulepi64_si128(b: mu4, imm8: 0x00)._mm_xor_si128(b:
+ xd._mm_clmulepi64_si128(b: mu4, imm8: 0x11))._mm_xor_si128(b:
+ util.make_m128i_slice128(a: args.x[0x30 .. 0x40]))
+ args.x = args.x[0x40 ..]
}
- x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])
- args.x = args.x[0x10 ..]
- v0 = aa._mm_xor_si128(b: x0)
- v1 = v0._mm_clmulepi64_si128(b: k3k4, imm8: 0x10)
- aa = v1._mm_xor_si128(b: v0._mm_srli_si128(imm8: 8))
+ break.chain4
+ }} endwhile.chain4
+ // Reduce 4 chains down to 2.
+ mu2 = util.make_m128i_slice128(a: ECMA_X86_SSE42_FOLD2[.. 16])
+ xa = xa._mm_clmulepi64_si128(b: mu2, imm8: 0x00)._mm_xor_si128(b:
+ xa._mm_clmulepi64_si128(b: mu2, imm8: 0x11))._mm_xor_si128(b:
+ xc)
+ xb = xb._mm_clmulepi64_si128(b: mu2, imm8: 0x00)._mm_xor_si128(b:
+ xb._mm_clmulepi64_si128(b: mu2, imm8: 0x11))._mm_xor_si128(b:
+ xd)
+ if args.x.length() > 0x20 {
+ xa = xa._mm_clmulepi64_si128(b: mu2, imm8: 0x00)._mm_xor_si128(b:
+ xa._mm_clmulepi64_si128(b: mu2, imm8: 0x11))._mm_xor_si128(b:
+ util.make_m128i_slice128(a: args.x[0x00 .. 0x10]))
+ xb = xb._mm_clmulepi64_si128(b: mu2, imm8: 0x00)._mm_xor_si128(b:
+ xb._mm_clmulepi64_si128(b: mu2, imm8: 0x11))._mm_xor_si128(b:
+ util.make_m128i_slice128(a: args.x[0x10 .. 0x20]))
+ args.x = args.x[0x20 ..]
+ }
- // Barrett reduction.
- pxmu = util.make_m128i_slice128(a: ECMA_X86_SSE42_PXMU[.. 16])
- w1 = aa._mm_clmulepi64_si128(b: pxmu, imm8: 0x10)
- w2 = w1._mm_clmulepi64_si128(b: pxmu, imm8: 0x00)
- s = aa._mm_xor_si128(b:
- w2._mm_xor_si128(b:
- w1._mm_slli_si128(imm8: 8)))._mm_extract_epi64(imm8: 1)
-
- // Handle the tail of args.x that wasn't a complete 16-byte chunk.
- tail_index = args.x.length() & 0xFFFF_FFFF_FFFF_FFF0 // And-not 16.
- if tail_index < args.x.length() {
- iterate (p = args.x[tail_index ..])(length: 1, advance: 1, unroll: 1) {
- s = ECMA_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
+ break.chain2
+ }} endwhile.chain2
+ // Reduce 2 chains down to 1.
+ mu1 = util.make_m128i_slice128(a: ECMA_X86_SSE42_FOLD1[.. 16])
+ xa = xa._mm_clmulepi64_si128(b: mu1, imm8: 0x00)._mm_xor_si128(b:
+ xa._mm_clmulepi64_si128(b: mu1, imm8: 0x11))._mm_xor_si128(b:
+ xb)
+ if args.x.length() > 24 {
+ xa = xa._mm_clmulepi64_si128(b: mu1, imm8: 0x00)._mm_xor_si128(b:
+ xa._mm_clmulepi64_si128(b: mu1, imm8: 0x11))._mm_xor_si128(b:
+ util.make_m128i_slice128(a: args.x[0x00 .. 0x10]))
+ args.x = args.x[0x10 ..]
+ if args.x.length() > 24 {
+ return nothing // Unreachable.
}
}
+ assert args.x.length() <= 24
+
+ // Copy our accumulator xa and the remaining data to buf, a 48-byte buffer.
+ // The buffer includes the implicitly appended 8 bytes for the CRC.
+ //
+ // | 0-padding | xa | args.x | 0-padding |
+// | 0-24 bytes | 16 bytes | 0-24 bytes | 8 bytes  |
+ //
+ assert (24 - args.x.length()) <= ((24 - args.x.length()) + 16) via "a <= (a + b): 0 <= b"()
+ xa.store_slice128!(a: buf[24 - args.x.length() .. (24 - args.x.length()) + 16])
+ buf[(24 - args.x.length()) + 16 ..].copy_from_slice!(s: args.x)
+
+ // Reduce the 48-byte buffer down to a 16-byte (128-bit) accumulator.
+ mu2 = util.make_m128i_slice128(a: ECMA_X86_SSE42_FOLD2[.. 16])
+ xa = util.make_m128i_slice128(a: buf[0x00 .. 0x10])
+ xb = util.make_m128i_slice128(a: buf[0x10 .. 0x20])
+ xc = util.make_m128i_slice128(a: buf[0x20 .. 0x30])
+ xd = xa._mm_clmulepi64_si128(b: mu2, imm8: 0x00)._mm_xor_si128(b:
+ xa._mm_clmulepi64_si128(b: mu2, imm8: 0x11))
+ xe = xb._mm_clmulepi64_si128(b: mu1, imm8: 0x00)._mm_xor_si128(b:
+ xb._mm_clmulepi64_si128(b: mu1, imm8: 0x11))
+ xa = xd._mm_xor_si128(b: xe._mm_xor_si128(b: xc))
+
+ // Reduce 128-bit to 64-bit via Barrett reduction.
+ mupx = util.make_m128i_slice128(a: ECMA_X86_SSE42_MUPX[.. 16])
+ xb = xa._mm_clmulepi64_si128(b: mupx, imm8: 0x00)
+ xc = xb._mm_clmulepi64_si128(b: mupx, imm8: 0x10)
+ s = xc._mm_xor_si128(b: xb._mm_slli_si128(imm8: 8)).
+ _mm_xor_si128(b: xa)._mm_extract_epi64(imm8: 1)
this.state = 0xFFFF_FFFF_FFFF_FFFF ^ s
}
-pri const SHUFFLE_707F : roarray[16] base.u8 = [
- 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
- 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
-]
-
-pri const SHUFFLE_8F80 : roarray[16] base.u8 = [
- 0x8F, 0x8E, 0x8D, 0x8C, 0x8B, 0x8A, 0x89, 0x88,
- 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80,
-]
-
// These constants are reproduced by
// script/print-crc64-x86-sse42-magic-numbers.go
//
// The rkN names match the numbers at
// https://github.com/intel/isa-l/blob/7b30857e20b84e5afab1a28291189b9dc571110d/crc/crc64_ecma_refl_by16_10.asm#L33-L56
-// pri const ECMA_X86_SSE42_K1K2 : roarray[16] base.u8 = [
-// 0xF3, 0x41, 0xD4, 0x9D, 0xBB, 0xEF, 0xE3, 0x6A, // k1' = 0x6AE3_EFBB_9DD4_41F3 = rk16
-// 0xF4, 0x2D, 0x84, 0xA7, 0x54, 0x60, 0x1F, 0x08, // k2' = 0x081F_6054_A784_2DF4 = rk15
-// ]
-
-pri const ECMA_X86_SSE42_K3K4 : roarray[16] base.u8 = [
- 0xE4, 0x3A, 0x39, 0xCA, 0x97, 0xD4, 0x5D, 0xE0, // k3' = 0xE05D_D497_CA39_3AE4 = rk2
- 0x40, 0x5F, 0x87, 0xC7, 0xAF, 0x95, 0xBE, 0xDA, // k4' = 0xDABE_95AF_C787_5F40 = rk1
+pri const ECMA_X86_SSE42_FOLD1 : roarray[16] base.u8 = [
+ 0xE4, 0x3A, 0x39, 0xCA, 0x97, 0xD4, 0x5D, 0xE0, // fold1a' = 0xE05D_D497_CA39_3AE4 = rk2
+ 0x40, 0x5F, 0x87, 0xC7, 0xAF, 0x95, 0xBE, 0xDA, // fold1b' = 0xDABE_95AF_C787_5F40 = rk1
]
-pri const ECMA_X86_SSE42_PXMU : roarray[16] base.u8 = [
- 0x85, 0x1E, 0x0E, 0xAF, 0x2B, 0xAF, 0xD8, 0x92, // Px' = 0x92D8_AF2B_AF0E_1E85 = rk8
+pri const ECMA_X86_SSE42_FOLD2 : roarray[16] base.u8 = [
+ 0x44, 0xFA, 0x9E, 0x8A, 0x00, 0x5B, 0x09, 0x60, // fold2a' = 0x6009_5B00_8A9E_FA44 = rk20
+ 0x51, 0xAF, 0xE1, 0x0F, 0xA3, 0x53, 0xE6, 0x3B, // fold2b' = 0x3BE6_53A3_0FE1_AF51 = rk19
+]
+
+pri const ECMA_X86_SSE42_FOLD4 : roarray[16] base.u8 = [
+ 0xF3, 0x41, 0xD4, 0x9D, 0xBB, 0xEF, 0xE3, 0x6A, // fold4a' = 0x6AE3_EFBB_9DD4_41F3 = rk16
+ 0xF4, 0x2D, 0x84, 0xA7, 0x54, 0x60, 0x1F, 0x08, // fold4b' = 0x081F_6054_A784_2DF4 = rk15
+]
+
+pri const ECMA_X86_SSE42_FOLD8 : roarray[16] base.u8 = [
+ 0x00, 0x10, 0xCC, 0x4F, 0x1D, 0xD7, 0x57, 0x87, // fold8a' = 0x8757_D71D_4FCC_1000 = rk4
+ 0x40, 0xE7, 0x3D, 0xF7, 0x2A, 0x6B, 0xD8, 0xD7, // fold8b' = 0xD7D8_6B2A_F73D_E740 = rk3
+]
+
+pri const ECMA_X86_SSE42_MUPX : roarray[16] base.u8 = [
0xD5, 0x63, 0x29, 0x17, 0x6C, 0x46, 0x3E, 0x9C, // μ' = 0x9C3E_466C_1729_63D5 = rk7
+ 0x85, 0x1E, 0x0E, 0xAF, 0x2B, 0xAF, 0xD8, 0x92, // Px' = 0x92D8_AF2B_AF0E_1E85 = rk8
]
diff --git a/test/c/std/crc64.c b/test/c/std/crc64.c
index 49869df..23e21b0 100644
--- a/test/c/std/crc64.c
+++ b/test/c/std/crc64.c
@@ -377,7 +377,7 @@
return do_bench_io_buffers(
wuffs_bench_crc64_ecma,
WUFFS_INITIALIZE__LEAVE_INTERNAL_BUFFERS_UNINITIALIZED, tcounter_src,
- &g_crc64_midsummer_gt, UINT64_MAX, UINT64_MAX, 200);
+ &g_crc64_midsummer_gt, UINT64_MAX, UINT64_MAX, 2000);
}
const char* //
@@ -386,7 +386,7 @@
return do_bench_io_buffers(
wuffs_bench_crc64_ecma,
WUFFS_INITIALIZE__LEAVE_INTERNAL_BUFFERS_UNINITIALIZED, tcounter_src,
- &g_crc64_pi_gt, UINT64_MAX, UINT64_MAX, 20);
+ &g_crc64_pi_gt, UINT64_MAX, UINT64_MAX, 200);
}
// ---------------- Mimic Benches