Implement std/crc32 ieee_hasher.up_x86_sse42
On a mid-range x86_64 laptop:
----
$ ls -lh linux-5.11.3.tar.gz | awk '{print $5 " " $9}'
178M linux-5.11.3.tar.gz
$ # Debian's /usr/bin/crc32
$ time /usr/bin/crc32 /dev/stdin < linux-5.11.3.tar.gz > /dev/null
real 0m0.411s
$ # Wuffs; before
$ time gen/bin/example-crc32 < linux-5.11.3.tar.gz > /dev/null
real 0m0.137s
$ # Wuffs; after
$ time gen/bin/example-crc32 < linux-5.11.3.tar.gz > /dev/null
real 0m0.060s
----
Wuffs benchmarks:
wuffs_crc32_ieee_10k/clang9 1.71GB/s ± 0% 8.40GB/s ± 3% +391.97% (p=0.008 n=5+5)
wuffs_crc32_ieee_100k/clang9 1.73GB/s ± 1% 10.67GB/s ± 0% +518.47% (p=0.008 n=5+5)
wuffs_crc32_ieee_10k/gcc10 2.08GB/s ± 0% 8.77GB/s ± 0% +322.05% (p=0.008 n=5+5)
wuffs_crc32_ieee_100k/gcc10 2.10GB/s ± 0% 11.23GB/s ± 0% +435.56% (p=0.008 n=5+5)
mimic_crc32_ieee_10k 774MB/s ± 0% 776MB/s ± 0% +0.23% (p=0.016 n=5+5)
mimic_crc32_ieee_100k 773MB/s ± 0% 775MB/s ± 0% +0.24% (p=0.032 n=5+5)
wuffs_gzip_decode_10k/clang9 151MB/s ± 0% 166MB/s ± 0% +9.81% (p=0.008 n=5+5)
wuffs_gzip_decode_100k/clang9 155MB/s ± 0% 178MB/s ± 0% +14.53% (p=0.008 n=5+5)
wuffs_gzip_decode_10k/gcc10 150MB/s ± 0% 172MB/s ± 0% +14.92% (p=0.008 n=5+5)
wuffs_gzip_decode_100k/gcc10 188MB/s ± 0% 198MB/s ± 0% +5.37% (p=0.008 n=5+5)
mimic_gzip_decode_10k 121MB/s ± 0% 121MB/s ± 0% ~ (p=1.000 n=5+5)
mimic_gzip_decode_100k 143MB/s ± 0% 144MB/s ± 0% ~ (p=0.056 n=5+5)
wuffs_png_decode_image_19k_8bpp/clang9 111MB/s ± 0% 115MB/s ± 0% +3.77% (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/clang9 147MB/s ± 0% 153MB/s ± 0% +4.66% (p=0.008 n=5+5)
wuffs_png_decode_image_77k_8bpp/clang9 404MB/s ± 0% 415MB/s ± 0% +2.75% (p=0.016 n=5+4)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/clang9 344MB/s ± 0% 345MB/s ± 1% ~ (p=0.151 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/clang9 329MB/s ± 0% 335MB/s ± 0% +1.78% (p=0.016 n=5+4)
wuffs_png_decode_image_4002k_24bpp/clang9 148MB/s ± 0% 153MB/s ± 0% +3.38% (p=0.008 n=5+5)
wuffs_png_decode_image_19k_8bpp/gcc10 120MB/s ± 0% 123MB/s ± 0% +2.62% (p=0.008 n=5+5)
wuffs_png_decode_image_40k_24bpp/gcc10 153MB/s ± 0% 159MB/s ± 1% +3.54% (p=0.008 n=5+5)
wuffs_png_decode_image_77k_8bpp/gcc10 440MB/s ± 0% 452MB/s ± 0% +2.82% (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_ignore_checksum/gcc10 357MB/s ± 0% 353MB/s ± 0% -0.95% (p=0.008 n=5+5)
wuffs_png_decode_image_552k_32bpp_verify_checksum/gcc10 341MB/s ± 0% 341MB/s ± 0% ~ (p=0.151 n=5+5)
wuffs_png_decode_image_4002k_24bpp/gcc10 157MB/s ± 0% 161MB/s ± 0% +2.85% (p=0.016 n=5+4)
mimic_png_decode_image_19k_8bpp 58.1MB/s ± 0% 58.1MB/s ± 0% ~ (p=0.389 n=5+5)
mimic_png_decode_image_40k_24bpp 73.2MB/s ± 0% 73.2MB/s ± 0% ~ (p=0.548 n=5+5)
mimic_png_decode_image_77k_8bpp 177MB/s ± 0% 177MB/s ± 0% +0.14% (p=0.008 n=5+5)
mimic_png_decode_image_552k_32bpp_ignore_checksum skipped
mimic_png_decode_image_552k_32bpp_verify_checksum 146MB/s ± 0% 147MB/s ± 0% +0.13% (p=0.016 n=5+5)
mimic_png_decode_image_4002k_24bpp 104MB/s ± 0% 104MB/s ± 0% ~ (p=0.556 n=5+4)
diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go
index ffdf27c..be32c70 100644
--- a/internal/cgen/builtin.go
+++ b/internal/cgen/builtin.go
@@ -720,7 +720,9 @@
if err := g.writeExpr(b, n.LHS().AsExpr(), sideEffectsOnly, depth); err != nil {
return err
}
- b.writes(".ptr")
+ if n.LHS().AsExpr().MType().IsSliceType() {
+ b.writes(".ptr")
+ }
if n.MHS() != nil {
b.writes(" + ")
if err := g.writeExpr(b, n.MHS().AsExpr(), sideEffectsOnly, depth); err != nil {
@@ -733,7 +735,9 @@
if err := g.writeExpr(b, n, sideEffectsOnly, depth); err != nil {
return err
}
- b.writes(".ptr")
+ if n.MType().IsSliceType() {
+ b.writes(".ptr")
+ }
return nil
}
diff --git a/internal/cgen/statement.go b/internal/cgen/statement.go
index c91a957..a60a578 100644
--- a/internal/cgen/statement.go
+++ b/internal/cgen/statement.go
@@ -277,7 +277,7 @@
caMacro, caName, caAttribute = "ARM_NEON", "arm_neon", ""
case t.IDX86SSE42:
caMacro, caName, caAttribute =
- "X86_64", "x86_sse42", "__attribute__((target(\"sse4.2\")))"
+ "X86_64", "x86_sse42", "__attribute__((target(\"pclmul,popcnt,sse4.2\")))"
}
}
}
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index c5269d1..375014e 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -711,10 +711,15 @@
"x86_m128i._mm_avg_epu8(b: x86_m128i) x86_m128i",
"x86_m128i._mm_blend_epi16(b: x86_m128i, imm8: u32) x86_m128i",
"x86_m128i._mm_blendv_epi8(b: x86_m128i, mask: x86_m128i) x86_m128i",
+ "x86_m128i._mm_clmulepi64_si128(b: x86_m128i, imm8: u32) x86_m128i",
"x86_m128i._mm_cmpeq_epi16(b: x86_m128i) x86_m128i",
"x86_m128i._mm_cmpeq_epi32(b: x86_m128i) x86_m128i",
"x86_m128i._mm_cmpeq_epi64(b: x86_m128i) x86_m128i",
"x86_m128i._mm_cmpeq_epi8(b: x86_m128i) x86_m128i",
+ "x86_m128i._mm_extract_epi16(imm8: u32) u16",
+ "x86_m128i._mm_extract_epi32(imm8: u32) u32",
+ "x86_m128i._mm_extract_epi64(imm8: u32) u64",
+ "x86_m128i._mm_extract_epi8(imm8: u32) u8",
"x86_m128i._mm_madd_epi16(b: x86_m128i) x86_m128i",
"x86_m128i._mm_maddubs_epi16(b: x86_m128i) x86_m128i",
"x86_m128i._mm_max_epi16(b: x86_m128i) x86_m128i",
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 2b7eb1d..b0f7e8a 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -18325,7 +18325,7 @@
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
#if defined(__GNUC__)
-__attribute__((target("sse4.2")))
+__attribute__((target("pclmul,popcnt,sse4.2")))
#endif
static wuffs_base__empty_struct
wuffs_adler32__hasher__up_x86_sse42(
@@ -22153,6 +22153,30 @@
},
};
+static const uint8_t
+WUFFS_CRC32__IEEE_X86_SSE42_K1K2[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
+ 212, 43, 68, 84, 1, 0, 0, 0,
+ 150, 21, 228, 198, 1, 0, 0, 0,
+};
+
+static const uint8_t
+WUFFS_CRC32__IEEE_X86_SSE42_K3K4[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
+ 208, 151, 25, 117, 1, 0, 0, 0,
+ 158, 0, 170, 204, 0, 0, 0, 0,
+};
+
+static const uint8_t
+WUFFS_CRC32__IEEE_X86_SSE42_K5ZZ[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
+ 36, 97, 205, 99, 1, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+static const uint8_t
+WUFFS_CRC32__IEEE_X86_SSE42_K6MU[16] WUFFS_BASE__POTENTIALLY_UNUSED = {
+ 65, 6, 113, 219, 1, 0, 0, 0,
+ 65, 22, 1, 247, 1, 0, 0, 0,
+};
+
// ---------------- Private Initializer Prototypes
// ---------------- Private Function Prototypes
@@ -22482,7 +22506,7 @@
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
#if defined(__GNUC__)
-__attribute__((target("sse4.2")))
+__attribute__((target("pclmul,popcnt,sse4.2")))
#endif
static wuffs_base__empty_struct
wuffs_crc32__ieee_hasher__up_x86_sse42(
@@ -22490,6 +22514,16 @@
wuffs_base__slice_u8 a_x) {
uint32_t v_s = 0;
wuffs_base__slice_u8 v_p = {0};
+ __m128i v_k = {0};
+ __m128i v_x0 = {0};
+ __m128i v_x1 = {0};
+ __m128i v_x2 = {0};
+ __m128i v_x3 = {0};
+ __m128i v_y0 = {0};
+ __m128i v_y1 = {0};
+ __m128i v_y2 = {0};
+ __m128i v_y3 = {0};
+ uint64_t v_tail_index = 0;
v_s = (4294967295 ^ self->private_impl.f_state);
while ((((uint64_t)(a_x.len)) > 0) && ((15 & ((uint32_t)(0xFFF & (uintptr_t)(a_x.ptr)))) != 0)) {
@@ -22511,17 +22545,77 @@
self->private_impl.f_state = (4294967295 ^ v_s);
return wuffs_base__make_empty_struct();
}
+ v_x0 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 0));
+ v_x1 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 16));
+ v_x2 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 32));
+ v_x3 = _mm_lddqu_si128((const __m128i*)(const void*)(a_x.ptr + 48));
+ v_x0 = _mm_xor_si128(v_x0, _mm_cvtsi32_si128((int32_t)(v_s)));
+ v_k = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K1K2));
{
- wuffs_base__slice_u8 i_slice_p = a_x;
+ wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, 64);
v_p.ptr = i_slice_p.ptr;
- v_p.len = 1;
- uint8_t* i_end0_p = i_slice_p.ptr + i_slice_p.len;
+ v_p.len = 64;
+ uint8_t* i_end0_p = v_p.ptr + (((i_slice_p.len - (size_t)(v_p.ptr - i_slice_p.ptr)) / 64) * 64);
while (v_p.ptr < i_end0_p) {
- v_s = (WUFFS_CRC32__IEEE_TABLE[0][(((uint8_t)((v_s & 255))) ^ v_p.ptr[0])] ^ (v_s >> 8));
- v_p.ptr += 1;
+ v_y0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(0));
+ v_y1 = _mm_clmulepi64_si128(v_x1, v_k, (int32_t)(0));
+ v_y2 = _mm_clmulepi64_si128(v_x2, v_k, (int32_t)(0));
+ v_y3 = _mm_clmulepi64_si128(v_x3, v_k, (int32_t)(0));
+ v_x0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(17));
+ v_x1 = _mm_clmulepi64_si128(v_x1, v_k, (int32_t)(17));
+ v_x2 = _mm_clmulepi64_si128(v_x2, v_k, (int32_t)(17));
+ v_x3 = _mm_clmulepi64_si128(v_x3, v_k, (int32_t)(17));
+ v_x0 = _mm_xor_si128(_mm_xor_si128(v_x0, v_y0), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 0)));
+ v_x1 = _mm_xor_si128(_mm_xor_si128(v_x1, v_y1), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 16)));
+ v_x2 = _mm_xor_si128(_mm_xor_si128(v_x2, v_y2), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 32)));
+ v_x3 = _mm_xor_si128(_mm_xor_si128(v_x3, v_y3), _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 48)));
+ v_p.ptr += 64;
}
v_p.len = 0;
}
+ v_k = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K3K4));
+ v_y0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(0));
+ v_x0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(17));
+ v_x0 = _mm_xor_si128(v_x0, v_x1);
+ v_x0 = _mm_xor_si128(v_x0, v_y0);
+ v_y0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(0));
+ v_x0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(17));
+ v_x0 = _mm_xor_si128(v_x0, v_x2);
+ v_x0 = _mm_xor_si128(v_x0, v_y0);
+ v_y0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(0));
+ v_x0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(17));
+ v_x0 = _mm_xor_si128(v_x0, v_x3);
+ v_x0 = _mm_xor_si128(v_x0, v_y0);
+ v_x1 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(16));
+ v_x2 = _mm_set_epi32((int32_t)(0), (int32_t)(4294967295), (int32_t)(0), (int32_t)(4294967295));
+ v_x0 = _mm_srli_si128(v_x0, (int32_t)(8));
+ v_x0 = _mm_xor_si128(v_x0, v_x1);
+ v_k = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K5ZZ));
+ v_x1 = _mm_srli_si128(v_x0, (int32_t)(4));
+ v_x0 = _mm_and_si128(v_x0, v_x2);
+ v_x0 = _mm_clmulepi64_si128(v_x0, v_k, (int32_t)(0));
+ v_x0 = _mm_xor_si128(v_x0, v_x1);
+ v_k = _mm_lddqu_si128((const __m128i*)(const void*)(WUFFS_CRC32__IEEE_X86_SSE42_K6MU));
+ v_x1 = _mm_and_si128(v_x0, v_x2);
+ v_x1 = _mm_clmulepi64_si128(v_x1, v_k, (int32_t)(16));
+ v_x1 = _mm_and_si128(v_x1, v_x2);
+ v_x1 = _mm_clmulepi64_si128(v_x1, v_k, (int32_t)(0));
+ v_x0 = _mm_xor_si128(v_x0, v_x1);
+ v_s = _mm_extract_epi32(v_x0, (int32_t)(1));
+ v_tail_index = (((uint64_t)(a_x.len)) & 18446744073709551552u);
+ if (v_tail_index < ((uint64_t)(a_x.len))) {
+ {
+ wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, v_tail_index);
+ v_p.ptr = i_slice_p.ptr;
+ v_p.len = 1;
+ uint8_t* i_end0_p = i_slice_p.ptr + i_slice_p.len;
+ while (v_p.ptr < i_end0_p) {
+ v_s = (WUFFS_CRC32__IEEE_TABLE[0][(((uint8_t)((v_s & 255))) ^ v_p.ptr[0])] ^ (v_s >> 8));
+ v_p.ptr += 1;
+ }
+ v_p.len = 0;
+ }
+ }
self->private_impl.f_state = (4294967295 ^ v_s);
return wuffs_base__make_empty_struct();
}
@@ -33056,7 +33150,7 @@
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
#if defined(__GNUC__)
-__attribute__((target("sse4.2")))
+__attribute__((target("pclmul,popcnt,sse4.2")))
#endif
static wuffs_base__empty_struct
wuffs_png__decoder__filter_1_distance_4_x86_sse42(
@@ -33102,7 +33196,7 @@
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
#if defined(__GNUC__)
-__attribute__((target("sse4.2")))
+__attribute__((target("pclmul,popcnt,sse4.2")))
#endif
static wuffs_base__empty_struct
wuffs_png__decoder__filter_3_distance_4_x86_sse42(
@@ -33207,7 +33301,7 @@
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
#if defined(__GNUC__)
-__attribute__((target("sse4.2")))
+__attribute__((target("pclmul,popcnt,sse4.2")))
#endif
static wuffs_base__empty_struct
wuffs_png__decoder__filter_4_distance_3_x86_sse42(
@@ -33333,7 +33427,7 @@
#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
#if defined(__GNUC__)
-__attribute__((target("sse4.2")))
+__attribute__((target("pclmul,popcnt,sse4.2")))
#endif
static wuffs_base__empty_struct
wuffs_png__decoder__filter_4_distance_4_x86_sse42(
diff --git a/std/crc32/README.md b/std/crc32/README.md
index 182ed8a..13006e2 100644
--- a/std/crc32/README.md
+++ b/std/crc32/README.md
@@ -78,10 +78,11 @@
`0xC` is all you need to specify the polynomial.
For 32 bit CRCs, the two popular polynomials (presented in `LSB` order) are
-`0xEDB88320` and `0x82F63B78`, also called the IEEE and Castagnoli polynomials,
-also called CRC-32 and CRC-32C. For example, the bit string representation of
-the IEEE polynomial `0xEDB88320`, un-reversed and with the implicit high bit,
-is `0b1_00000100_11000001_00011101_10110111`.
+`0xEDB8_8320` and `0x82F6_3B78`, also called the IEEE and Castagnoli
+polynomials, also called CRC-32 and CRC-32C. For example, the bit string
+representation of the IEEE polynomial `0xEDB8_8320`, un-reversed and with the
+implicit high bit, is `0b1_00000100_11000001_00011101_10110111 =
+0x1_04C1_1DB7`.
# Worked Example
@@ -138,8 +139,8 @@
hex A 9 C 3 2 2 5 D
```
-The final line says that the CRC-32 checksum of "Hi\n" is 0xD5223C9A. This can
-be verified by running the `/usr/bin/crc32` program:
+The final line says that the CRC-32 checksum of "Hi\n" is `0xD522_3C9A`. This
+can be verified by running the `/usr/bin/crc32` program:
```
$ echo Hi | xxd /dev/stdin
@@ -226,8 +227,6 @@
by Gopal, Ozturk, Guilford, Wolrich, Feghali and Dixon of Intel Corporation and
Karakoyunlu of the Worcester Polytechnic Institute.
-Wuffs does not currently implement the SIMD algorithm.
-
# Further Reading
diff --git a/std/crc32/common_up_x86_sse42.wuffs b/std/crc32/common_up_x86_sse42.wuffs
index 8237382..3126056 100644
--- a/std/crc32/common_up_x86_sse42.wuffs
+++ b/std/crc32/common_up_x86_sse42.wuffs
@@ -12,12 +12,30 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+// --------
+
+// See "SIMD Implementations" in README.md for a link to Gopal et al. "Fast CRC
+// Computation for Generic Polynomials Using PCLMULQDQ Instruction".
+
pri func ieee_hasher.up_x86_sse42!(x: slice base.u8),
choose cpu_arch >= x86_sse42,
{
var s : base.u32
var p : slice base.u8
+ var util : base.x86_sse42_utility
+ var k : base.x86_m128i
+ var x0 : base.x86_m128i
+ var x1 : base.x86_m128i
+ var x2 : base.x86_m128i
+ var x3 : base.x86_m128i
+ var y0 : base.x86_m128i
+ var y1 : base.x86_m128i
+ var y2 : base.x86_m128i
+ var y3 : base.x86_m128i
+
+ var tail_index : base.u64
+
s = 0xFFFF_FFFF ^ this.state
// Align to a 16-byte boundary.
@@ -35,10 +53,104 @@
return nothing
}
- // Placeholder.
- iterate (p = args.x)(length: 1, advance: 1, unroll: 1) {
- s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
+ // Load 128×4 = 512 bits from the first 64-byte chunk.
+ x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])
+ x1 = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
+ x2 = util.make_m128i_slice128(a: args.x[0x20 .. 0x30])
+ x3 = util.make_m128i_slice128(a: args.x[0x30 .. 0x40])
+
+ // Combine with the initial state.
+ x0 = x0._mm_xor_si128(b: util.make_m128i_single_u32(a: s))
+
+ // Process the remaining 64-byte chunks.
+ k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K1K2[.. 16])
+ iterate (p = args.x[64 ..])(length: 64, advance: 64, unroll: 1) {
+ y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
+ y1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x00)
+ y2 = x2._mm_clmulepi64_si128(b: k, imm8: 0x00)
+ y3 = x3._mm_clmulepi64_si128(b: k, imm8: 0x00)
+
+ x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11)
+ x1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x11)
+ x2 = x2._mm_clmulepi64_si128(b: k, imm8: 0x11)
+ x3 = x3._mm_clmulepi64_si128(b: k, imm8: 0x11)
+
+ x0 = x0._mm_xor_si128(b: y0)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x00 .. 0x10]))
+ x1 = x1._mm_xor_si128(b: y1)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x10 .. 0x20]))
+ x2 = x2._mm_xor_si128(b: y2)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x20 .. 0x30]))
+ x3 = x3._mm_xor_si128(b: y3)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x30 .. 0x40]))
+ }
+
+ // Reduce 128×4 = 512 bits to 128 bits.
+ k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K3K4[.. 16])
+ y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
+ x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11)
+ x0 = x0._mm_xor_si128(b: x1)
+ x0 = x0._mm_xor_si128(b: y0)
+ y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
+ x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11)
+ x0 = x0._mm_xor_si128(b: x2)
+ x0 = x0._mm_xor_si128(b: y0)
+ y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
+ x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11)
+ x0 = x0._mm_xor_si128(b: x3)
+ x0 = x0._mm_xor_si128(b: y0)
+
+ // Reduce 128 bits to 64 bits.
+ x1 = x0._mm_clmulepi64_si128(b: k, imm8: 0x10)
+ x2 = util.make_m128i_multiple_u32(
+ a00: 0xFFFF_FFFF,
+ a01: 0x0000_0000,
+ a02: 0xFFFF_FFFF,
+ a03: 0x0000_0000)
+ x0 = x0._mm_srli_si128(imm8: 8)
+ x0 = x0._mm_xor_si128(b: x1)
+ k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K5ZZ[.. 16])
+ x1 = x0._mm_srli_si128(imm8: 4)
+ x0 = x0._mm_and_si128(b: x2)
+ x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
+ x0 = x0._mm_xor_si128(b: x1)
+
+ // Reduce 64 bits to 32 bits and extract.
+ k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K6MU[.. 16])
+ x1 = x0._mm_and_si128(b: x2)
+ x1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x10)
+ x1 = x1._mm_and_si128(b: x2)
+ x1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x00)
+ x0 = x0._mm_xor_si128(b: x1)
+ s = x0._mm_extract_epi32(imm8: 1)
+
+ // Handle the tail of args.x that wasn't a complete 64-byte chunk.
+ tail_index = args.x.length() & 0xFFFF_FFFF_FFFF_FFC0 // And-not 64.
+ if tail_index < args.x.length() {
+ iterate (p = args.x[tail_index ..])(length: 1, advance: 1, unroll: 1) {
+ s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
+ }
}
this.state = 0xFFFF_FFFF ^ s
}
+
+// These constants come from page 22 of Gopal et al.
+//
+// (†) TODO: explain the discrepancy.
+
+pri const IEEE_X86_SSE42_K1K2 : array[16] base.u8 = [
+ 0xD4, 0x2B, 0x44, 0x54, 0x01, 0x00, 0x00, 0x00, // k1' = 0x1_5444_2BD4
+ 0x96, 0x15, 0xE4, 0xC6, 0x01, 0x00, 0x00, 0x00, // k2' = 0x1_C6E4_1596
+]
+
+pri const IEEE_X86_SSE42_K3K4 : array[16] base.u8 = [
+ 0xD0, 0x97, 0x19, 0x75, 0x01, 0x00, 0x00, 0x00, // k3' = 0x1_7519_97D0
+ 0x9E, 0x00, 0xAA, 0xCC, 0x00, 0x00, 0x00, 0x00, // k4' = 0x0_CCAA_009E
+]
+
+pri const IEEE_X86_SSE42_K5ZZ : array[16] base.u8 = [
+ 0x24, 0x61, 0xCD, 0x63, 0x01, 0x00, 0x00, 0x00, // k5' = 0x1_63CD_6124
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Unused
+]
+
+pri const IEEE_X86_SSE42_K6MU : array[16] base.u8 = [
+ 0x41, 0x06, 0x71, 0xDB, 0x01, 0x00, 0x00, 0x00, // k6' = 0x1_DB71_0640 (†)
+ 0x41, 0x16, 0x01, 0xF7, 0x01, 0x00, 0x00, 0x00, // μ' = 0x1_F701_1641
+]