// Copyright 2021 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// --------

// See "SIMD Implementations" in README.md for a link to Gopal et al. "Fast CRC
// Computation for Generic Polynomials Using PCLMULQDQ Instruction".

// up_x86_avx2 is exactly the same as up_x86_sse42 except for the "choose
// cpu_arch >= x86_avx2". With AVX, PCLMULQDQ has a three-operand form, not
// just a two-operand form: https://www.felixcloutier.com/x86/pclmulqdq
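// The VEX-encoded three-operand form lets the compiler avoid the extra
// register-copy instructions that the two-operand SSE encoding would need.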
pri func ieee_hasher.up_x86_avx2!(x: slice base.u8),
	choose cpu_arch >= x86_avx2,
{
	var s : base.u32
	var p : slice base.u8

	var util : base.x86_sse42_utility
	var k    : base.x86_m128i
	var x0   : base.x86_m128i
	var x1   : base.x86_m128i
	var x2   : base.x86_m128i
	var x3   : base.x86_m128i
	var y0   : base.x86_m128i
	var y1   : base.x86_m128i
	var y2   : base.x86_m128i
	var y3   : base.x86_m128i

	var tail_index : base.u64

	s = 0xFFFF_FFFF ^ this.state
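	// (CRC-32/IEEE works on a bit-inverted register: invert on entry, invert
	// again on exit, so that this.state always holds the presentable CRC.)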

	// Align to a 16-byte boundary.
	while (args.x.length() > 0) and ((15 & args.x.uintptr_low_12_bits()) <> 0) {
		s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ args.x[0]] ^ (s >> 8)
		args.x = args.x[1 ..]
	} endwhile
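	// (After this prologue, the 16-byte SIMD loads below all start at
	// 16-byte aligned addresses.)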

	// For short inputs, just do a simple loop.
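	// (The SIMD path below needs at least one whole 64-byte chunk. This is
	// the same byte-at-a-time, table-driven update as the alignment loop
	// above.)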
	if args.x.length() < 64 {
		iterate (p = args.x)(length: 1, advance: 1, unroll: 1) {
			s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
		}
		this.state = 0xFFFF_FFFF ^ s
		return nothing
	}

	// Load 128×4 = 512 bits from the first 64-byte chunk.
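	// Together, x0 .. x3 hold the 512-bit running state that the main loop
	// below folds forward.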
	x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])
	x1 = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
	x2 = util.make_m128i_slice128(a: args.x[0x20 .. 0x30])
	x3 = util.make_m128i_slice128(a: args.x[0x30 .. 0x40])

	// Combine with the initial state.
	x0 = x0._mm_xor_si128(b: util.make_m128i_single_u32(a: s))
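	// (make_m128i_single_u32 places s in the low 32 bits, with zeroes
	// elsewhere, so only x0's low lane changes.)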

	// Process the remaining 64-byte chunks.
	k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K1K2[.. 16])
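	// Each iteration folds the 512-bit state forward by 64 bytes. Per
	// 128-bit lane: x' = clmul(x.lo, K1) ^ clmul(x.hi, K2) ^ next_16_bytes,
	// where clmul is a carry-less multiply. PCLMULQDQ's imm8 argument picks
	// the 64-bit halves: 0x00 takes both operands' low halves, 0x11 both
	// high halves.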
	iterate (p = args.x[64 ..])(length: 64, advance: 64, unroll: 1) {
		y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
		y1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x00)
		y2 = x2._mm_clmulepi64_si128(b: k, imm8: 0x00)
		y3 = x3._mm_clmulepi64_si128(b: k, imm8: 0x00)

		x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11)
		x1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x11)
		x2 = x2._mm_clmulepi64_si128(b: k, imm8: 0x11)
		x3 = x3._mm_clmulepi64_si128(b: k, imm8: 0x11)

		x0 = x0._mm_xor_si128(b: y0)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x00 .. 0x10]))
		x1 = x1._mm_xor_si128(b: y1)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x10 .. 0x20]))
		x2 = x2._mm_xor_si128(b: y2)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x20 .. 0x30]))
		x3 = x3._mm_xor_si128(b: y3)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x30 .. 0x40]))
	}

	// Reduce 128×4 = 512 bits to 128 bits.
	k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K3K4[.. 16])
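	// Fold x0 into x1, that result into x2 and then into x3, re-using the
	// main loop's clmul-and-xor pattern but with the shorter-distance
	// constants K3 and K4 (a 16-byte fold instead of a 64-byte one).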
	y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
	x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11)
	x0 = x0._mm_xor_si128(b: x1)
	x0 = x0._mm_xor_si128(b: y0)
	y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
	x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11)
	x0 = x0._mm_xor_si128(b: x2)
	x0 = x0._mm_xor_si128(b: y0)
	y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
	x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11)
	x0 = x0._mm_xor_si128(b: x3)
	x0 = x0._mm_xor_si128(b: y0)

	// Reduce 128 bits to 64 bits.
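	//
	// The x2 mask built below keeps only the low 32 bits of each 64-bit
	// half, so that the subsequent carry-less multiplies see 32-bit
	// operands.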
	x1 = x0._mm_clmulepi64_si128(b: k, imm8: 0x10)
	x2 = util.make_m128i_multiple_u32(
		a00: 0xFFFF_FFFF,
		a01: 0x0000_0000,
		a02: 0xFFFF_FFFF,
		a03: 0x0000_0000)
	x0 = x0._mm_srli_si128(imm8: 8)
	x0 = x0._mm_xor_si128(b: x1)
	k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K5ZZ[.. 16])
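	// (The "ZZ" in the constant's name suggests K5 in the low 64 bits and
	// zeroes in the high 64 bits.)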
	x1 = x0._mm_srli_si128(imm8: 4)
	x0 = x0._mm_and_si128(b: x2)
	x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00)
	x0 = x0._mm_xor_si128(b: x1)

	// Reduce 64 bits to 32 bits (Barrett Reduction) and extract.
	//
	// Barrett Reduction is Algorithm 1 (page 14) of Gopal et al., after
	// adjusting for bit-reflection as per Figure 12 (page 21).
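	//
	// In outline, with P(x) in IEEE_X86_SSE42_PXMU's low half and the
	// Barrett constant mu in its high half (as the imm8 selectors below
	// imply): t = clmul(x mod x^32, mu), then x ^= clmul(t mod x^32, P(x)).
	// The 32-bit remainder lands in bits 32 ..= 63 of x0, hence the
	// _mm_extract_epi32(imm8: 1) below.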
	k = util.make_m128i_slice128(a: IEEE_X86_SSE42_PXMU[.. 16])
	x1 = x0._mm_and_si128(b: x2)
	x1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x10)
	x1 = x1._mm_and_si128(b: x2)
	x1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x00)
	x0 = x0._mm_xor_si128(b: x1)
	s = x0._mm_extract_epi32(imm8: 1)

	// Handle the tail of args.x that wasn't a complete 64-byte chunk.
	tail_index = args.x.length() & 0xFFFF_FFFF_FFFF_FFC0  // And-not 63: round down to a multiple of 64.
	if tail_index < args.x.length() {
		iterate (p = args.x[tail_index ..])(length: 1, advance: 1, unroll: 1) {
			s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
		}
	}

	this.state = 0xFFFF_FFFF ^ s
}