std/crc32/common_up_x86_sse42.wuffs - external/github.com/google/wuffs - Git at Google

 // Copyright 2021 The Wuffs Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //    https://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // --------

 // See "SIMD Implementations" in README.md for a link to Gopal et al. "Fast CRC
 // Computation for Generic Polynomials Using PCLMULQDQ Instruction".

 pri func ieee_hasher.up_x86_sse42!(x: slice base.u8),
 	choose cpu_arch >= x86_sse42,
 {
 	var s : base.u32
 	var p : slice base.u8

 	var util : base.x86_sse42_utility
 	var k    : base.x86_m128i
 	var x0   : base.x86_m128i
 	var x1   : base.x86_m128i
 	var x2   : base.x86_m128i
 	var x3   : base.x86_m128i
 	var y0   : base.x86_m128i
 	var y1   : base.x86_m128i
 	var y2   : base.x86_m128i
 	var y3   : base.x86_m128i

 	var tail_index : base.u64

 	s = 0xFFFF_FFFF ^ this.state

 	// Align to a 16-byte boundary.
 	while (args.x.length() > 0) and ((15 & args.x.uintptr_low_12_bits()) <> 0) {
 		s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ args.x[0]] ^ (s >> 8)
 		args.x = args.x[1 ..]
 	} endwhile

 	// For short inputs, just do a simple loop.
 	if args.x.length() < 64 {
 		iterate (p = args.x)(length: 1, advance: 1, unroll: 1) {
 			s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
 		}
 		this.state = 0xFFFF_FFFF ^ s
 		return nothing
 	}

 	// Load 128×4 = 512 bits from the first 64-byte chunk.
 	x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])
 	x1 = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
 	x2 = util.make_m128i_slice128(a: args.x[0x20 .. 0x30])
 	x3 = util.make_m128i_slice128(a: args.x[0x30 .. 0x40])

 	// Combine with the initial state.
 	x0 = x0._mm_xor_si128(b: util.make_m128i_single_u32(a: s))

 	// Process the remaining 64-byte chunks.
 	k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K1K2[.. 16])
 	iterate (p = args.x[64 ..])(length: 64, advance: 64, unroll: 1) {
 		y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
 		y1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x00)
 		y2 = x2._mm_clmulepi64_si128(b: k, imm8: 0x00)
 		y3 = x3._mm_clmulepi64_si128(b: k, imm8: 0x00)

 		x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11)
 		x1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x11)
 		x2 = x2._mm_clmulepi64_si128(b: k, imm8: 0x11)
 		x3 = x3._mm_clmulepi64_si128(b: k, imm8: 0x11)

 		x0 = x0._mm_xor_si128(b: y0)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x00 .. 0x10]))
 		x1 = x1._mm_xor_si128(b: y1)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x10 .. 0x20]))
 		x2 = x2._mm_xor_si128(b: y2)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x20 .. 0x30]))
 		x3 = x3._mm_xor_si128(b: y3)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x30 .. 0x40]))
 	}

 	// Reduce 128×4 = 512 bits to 128 bits.
 	k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K3K4[.. 16])
 	y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
 	x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11)
 	x0 = x0._mm_xor_si128(b: x1)
 	x0 = x0._mm_xor_si128(b: y0)
 	y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
 	x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11)
 	x0 = x0._mm_xor_si128(b: x2)
 	x0 = x0._mm_xor_si128(b: y0)
 	y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
 	x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11)
 	x0 = x0._mm_xor_si128(b: x3)
 	x0 = x0._mm_xor_si128(b: y0)

 	// Reduce 128 bits to 64 bits.
 	x1 = x0._mm_clmulepi64_si128(b: k, imm8: 0x10)
 	x2 = util.make_m128i_multiple_u32(
 		a00: 0xFFFF_FFFF,
 		a01: 0x0000_0000,
 		a02: 0xFFFF_FFFF,
 		a03: 0x0000_0000)
 	x0 = x0._mm_srli_si128(imm8: 8)
 	x0 = x0._mm_xor_si128(b: x1)
 	k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K5ZZ[.. 16])
 	x1 = x0._mm_srli_si128(imm8: 4)
 	x0 = x0._mm_and_si128(b: x2)
 	x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
 	x0 = x0._mm_xor_si128(b: x1)

 	// Reduce 64 bits to 32 bits and extract.
 	k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K6MU[.. 16])
 	x1 = x0._mm_and_si128(b: x2)
 	x1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x10)
 	x1 = x1._mm_and_si128(b: x2)
 	x1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x00)
 	x0 = x0._mm_xor_si128(b: x1)
 	s = x0._mm_extract_epi32(imm8: 1)

 	// Handle the tail of args.x that wasn't a complete 64-byte chunk.
 	tail_index = args.x.length() & 0xFFFF_FFFF_FFFF_FFC0  // And-not 64.
 	if tail_index < args.x.length() {
 		iterate (p = args.x[tail_index ..])(length: 1, advance: 1, unroll: 1) {
 			s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
 		}
 	}

 	this.state = 0xFFFF_FFFF ^ s
 }

 // These constants come from page 22 of Gopal et al. They are also reproduced
 // by script/print-crc32-x86-sse42-magic-numbers.go which is runnable online at
 // https://play.golang.org/p/wH1q6GfhKOE
 //
 // (†) There is a discrepancy in the k6' constant. The Gopal paper gives the
 // least significant byte as 0x40 and this is confirmed by the reproduction
 // script. Indeed, the formula on page 22 ends with a left-shift by 1, so the
 // final hex digit must be even.
 //
 // Nonetheless, we use 0x41 here, the same as the Linux kernel
 // (https://github.com/torvalds/linux/blob/v5.11/arch/x86/crypto/crc32-pclmul_asm.S#L72).
 // It's there in 2021 unchanged since that code was introduced in 2013
 // (https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=78c37d191dd6899d8c219fee597a17d6e3c5d288).
 //
 // The Linux kernel's 0x41 (instead of 0x40) also turns up in e.g. Chromium
 // (https://source.chromium.org/chromium/chromium/src/+/master:third_party/zlib/crc32_simd.c;l=36;drc=9d4ec9349a1bf609eedb917c44c69eb0df9ff6bb)
 // and Go
 // (https://github.com/golang/go/blob/414fa8c35e7c2f65e2c767d6db2f25791e53b5c1/src/hash/crc32/crc32_amd64.s#L143).
 // The 0x41 has surely been copy/pasted to other software too.
 //
 // Code (0x41) and documentation (0x40) disagree. We went with code here, for
 // interoperability. Even though it's mathematical documentation, it's very
 // widely used code.
 //
 // For academic curiosity, there are at least two possible sources of the k6'
 // constant discrepancy.
 //
 // One is a transcription error. The Gopal paper has two consecutive lines:
 //  k6' = ... = 0x1DB710640
 //  μ'  = ... = 0x1F7011641
 // The final hex digits are 640 and 641.
 //
 // Two is that there's also this line a little earlier on the same page:
 //  P(x)'     = 0x1DB710641
 // This P(x)' number differs from k6' only in the final bit: 0x41 vs 0x40. This
 // isn't coincidental. Calculating k6' (in the Galois Field GF(2) space) means
 // dividing (i.e. bitwise XOR-ing) a 33-bit polynomial (x³², with only one 'on'
 // bit) by the 33-bit polynomial P(x)'.
 //
 // The discrepancy might not matter in practice because Barrett reduction is
 // already an integer approximation to a fraction. 0x40 vs 0x41 is essentially
 // rounding down vs up. The approximation diverges for large inputs but might
 // not matter for 'small' inputs that fit into e.g. a uint32 or uint64. The
 // tests seem to pass either way (and fail with 0x3F or 0x42).

 pri const IEEE_X86_SSE42_K1K2 : array[16] base.u8 = [
 	0xD4, 0x2B, 0x44, 0x54, 0x01, 0x00, 0x00, 0x00,  // k1' = 0x1_5444_2BD4
 	0x96, 0x15, 0xE4, 0xC6, 0x01, 0x00, 0x00, 0x00,  // k2' = 0x1_C6E4_1596
 ]

 pri const IEEE_X86_SSE42_K3K4 : array[16] base.u8 = [
 	0xD0, 0x97, 0x19, 0x75, 0x01, 0x00, 0x00, 0x00,  // k3' = 0x1_7519_97D0
 	0x9E, 0x00, 0xAA, 0xCC, 0x00, 0x00, 0x00, 0x00,  // k4' = 0x0_CCAA_009E
 ]

 pri const IEEE_X86_SSE42_K5ZZ : array[16] base.u8 = [
 	0x24, 0x61, 0xCD, 0x63, 0x01, 0x00, 0x00, 0x00,  // k5' = 0x1_63CD_6124
 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  // Unused
 ]

 pri const IEEE_X86_SSE42_K6MU : array[16] base.u8 = [
 	0x41, 0x06, 0x71, 0xDB, 0x01, 0x00, 0x00, 0x00,  // k6' = 0x1_DB71_0640 (†)
 	0x41, 0x16, 0x01, 0xF7, 0x01, 0x00, 0x00, 0x00,  // μ'  = 0x1_F701_1641
 ]
	// Copyright 2021 The Wuffs Authors.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// https://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	// --------

	// See "SIMD Implementations" in README.md for a link to Gopal et al. "Fast CRC
	// Computation for Generic Polynomials Using PCLMULQDQ Instruction".

	pri func ieee_hasher.up_x86_sse42!(x: slice base.u8),
	choose cpu_arch >= x86_sse42,
	{
	var s : base.u32
	var p : slice base.u8

	var util : base.x86_sse42_utility
	var k : base.x86_m128i
	var x0 : base.x86_m128i
	var x1 : base.x86_m128i
	var x2 : base.x86_m128i
	var x3 : base.x86_m128i
	var y0 : base.x86_m128i
	var y1 : base.x86_m128i
	var y2 : base.x86_m128i
	var y3 : base.x86_m128i

	var tail_index : base.u64

	s = 0xFFFF_FFFF ^ this.state

	// Align to a 16-byte boundary.
	while (args.x.length() > 0) and ((15 & args.x.uintptr_low_12_bits()) <> 0) {
	s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ args.x[0]] ^ (s >> 8)
	args.x = args.x[1 ..]
	} endwhile

	// For short inputs, just do a simple loop.
	if args.x.length() < 64 {
	iterate (p = args.x)(length: 1, advance: 1, unroll: 1) {
	s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
	}
	this.state = 0xFFFF_FFFF ^ s
	return nothing
	}

	// Load 128×4 = 512 bits from the first 64-byte chunk.
	x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])
	x1 = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
	x2 = util.make_m128i_slice128(a: args.x[0x20 .. 0x30])
	x3 = util.make_m128i_slice128(a: args.x[0x30 .. 0x40])

	// Combine with the initial state.
	x0 = x0._mm_xor_si128(b: util.make_m128i_single_u32(a: s))

	// Process the remaining 64-byte chunks.
	k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K1K2[.. 16])
	iterate (p = args.x[64 ..])(length: 64, advance: 64, unroll: 1) {
	y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
	y1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x00)
	y2 = x2._mm_clmulepi64_si128(b: k, imm8: 0x00)
	y3 = x3._mm_clmulepi64_si128(b: k, imm8: 0x00)

	x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11)
	x1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x11)
	x2 = x2._mm_clmulepi64_si128(b: k, imm8: 0x11)
	x3 = x3._mm_clmulepi64_si128(b: k, imm8: 0x11)

	x0 = x0._mm_xor_si128(b: y0)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x00 .. 0x10]))
	x1 = x1._mm_xor_si128(b: y1)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x10 .. 0x20]))
	x2 = x2._mm_xor_si128(b: y2)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x20 .. 0x30]))
	x3 = x3._mm_xor_si128(b: y3)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x30 .. 0x40]))
	}

	// Reduce 128×4 = 512 bits to 128 bits.
	k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K3K4[.. 16])
	y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
	x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11)
	x0 = x0._mm_xor_si128(b: x1)
	x0 = x0._mm_xor_si128(b: y0)
	y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
	x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11)
	x0 = x0._mm_xor_si128(b: x2)
	x0 = x0._mm_xor_si128(b: y0)
	y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
	x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11)
	x0 = x0._mm_xor_si128(b: x3)
	x0 = x0._mm_xor_si128(b: y0)

	// Reduce 128 bits to 64 bits.
	x1 = x0._mm_clmulepi64_si128(b: k, imm8: 0x10)
	x2 = util.make_m128i_multiple_u32(
	a00: 0xFFFF_FFFF,
	a01: 0x0000_0000,
	a02: 0xFFFF_FFFF,
	a03: 0x0000_0000)
	x0 = x0._mm_srli_si128(imm8: 8)
	x0 = x0._mm_xor_si128(b: x1)
	k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K5ZZ[.. 16])
	x1 = x0._mm_srli_si128(imm8: 4)
	x0 = x0._mm_and_si128(b: x2)
	x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
	x0 = x0._mm_xor_si128(b: x1)

	// Reduce 64 bits to 32 bits and extract.
	k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K6MU[.. 16])
	x1 = x0._mm_and_si128(b: x2)
	x1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x10)
	x1 = x1._mm_and_si128(b: x2)
	x1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x00)
	x0 = x0._mm_xor_si128(b: x1)
	s = x0._mm_extract_epi32(imm8: 1)

	// Handle the tail of args.x that wasn't a complete 64-byte chunk.
	tail_index = args.x.length() & 0xFFFF_FFFF_FFFF_FFC0 // And-not 64.
	if tail_index < args.x.length() {
	iterate (p = args.x[tail_index ..])(length: 1, advance: 1, unroll: 1) {
	s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
	}
	}

	this.state = 0xFFFF_FFFF ^ s
	}

	// These constants come from page 22 of Gopal et al. They are also reproduced
	// by script/print-crc32-x86-sse42-magic-numbers.go which is runnable online at
	// https://play.golang.org/p/wH1q6GfhKOE
	//
	// (†) There is a discrepancy in the k6' constant. The Gopal paper gives the
	// least significant byte as 0x40 and this is confirmed by the reproduction
	// script. Indeed, the formula on page 22 ends with a left-shift by 1, so the
	// final hex digit must be even.
	//
	// Nonetheless, we use 0x41 here, the same as the Linux kernel
	// (https://github.com/torvalds/linux/blob/v5.11/arch/x86/crypto/crc32-pclmul_asm.S#L72).
	// It's there in 2021 unchanged since that code was introduced in 2013
	// (https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=78c37d191dd6899d8c219fee597a17d6e3c5d288).
	//
	// The Linux kernel's 0x41 (instead of 0x40) also turns up in e.g. Chromium
	// (https://source.chromium.org/chromium/chromium/src/+/master:third_party/zlib/crc32_simd.c;l=36;drc=9d4ec9349a1bf609eedb917c44c69eb0df9ff6bb)
	// and Go
	// (https://github.com/golang/go/blob/414fa8c35e7c2f65e2c767d6db2f25791e53b5c1/src/hash/crc32/crc32_amd64.s#L143).
	// The 0x41 has surely been copy/pasted to other software too.
	//
	// Code (0x41) and documentation (0x40) disagree. We went with code here, for
	// interoperability. Even though it's mathematical documentation, it's very
	// widely used code.
	//
	// For academic curiosity, there are at least two possible sources of the k6'
	// constant discrepancy.
	//
	// One is a transcription error. The Gopal paper has two consecutive lines:
	// k6' = ... = 0x1DB710640
	// μ' = ... = 0x1F7011641
	// The final hex digits are 640 and 641.
	//
	// Two is that there's also this line a little earlier on the same page:
	// P(x)' = 0x1DB710641
	// This P(x)' number differs from k6' only in the final bit: 0x41 vs 0x40. This
	// isn't coincidental. Calculating k6' (in the Galois Field GF(2) space) means
	// dividing (i.e. bitwise XOR-ing) a 33-bit polynomial (x³², with only one 'on'
	// bit) by the 33-bit polynomial P(x)'.
	//
	// The discrepancy might not matter in practice because Barrett reduction is
	// already an integer approximation to a fraction. 0x40 vs 0x41 is essentially
	// rounding down vs up. The approximation diverges for large inputs but might
	// not matter for 'small' inputs that fit into e.g. a uint32 or uint64. The
	// tests seem to pass either way (and fail with 0x3F or 0x42).

	pri const IEEE_X86_SSE42_K1K2 : array[16] base.u8 = [
	0xD4, 0x2B, 0x44, 0x54, 0x01, 0x00, 0x00, 0x00, // k1' = 0x1_5444_2BD4
	0x96, 0x15, 0xE4, 0xC6, 0x01, 0x00, 0x00, 0x00, // k2' = 0x1_C6E4_1596
	]

	pri const IEEE_X86_SSE42_K3K4 : array[16] base.u8 = [
	0xD0, 0x97, 0x19, 0x75, 0x01, 0x00, 0x00, 0x00, // k3' = 0x1_7519_97D0
	0x9E, 0x00, 0xAA, 0xCC, 0x00, 0x00, 0x00, 0x00, // k4' = 0x0_CCAA_009E
	]

	pri const IEEE_X86_SSE42_K5ZZ : array[16] base.u8 = [
	0x24, 0x61, 0xCD, 0x63, 0x01, 0x00, 0x00, 0x00, // k5' = 0x1_63CD_6124
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Unused
	]

	pri const IEEE_X86_SSE42_K6MU : array[16] base.u8 = [
	0x41, 0x06, 0x71, 0xDB, 0x01, 0x00, 0x00, 0x00, // k6' = 0x1_DB71_0640 (†)
	0x41, 0x16, 0x01, 0xF7, 0x01, 0x00, 0x00, 0x00, // μ' = 0x1_F701_1641
	]