blob: 963976931f9dd162c40dd4c7921dca94b00581c7 [file] [log] [blame]
// Copyright 2024 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//
// SPDX-License-Identifier: Apache-2.0 OR MIT
// --------
// Like std/crc32's x86 SIMD implementation, this is based on Gopal et al.
// "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction".
// and further optimized a la
// https://github.com/JoernEngel/joernblog/blob/778f0007b9a580e477608691f1aa86369f0efdd2/crc64.c
pri func ecma_hasher.up_x86_sse42!(x: roslice base.u8),
choose cpu_arch >= x86_sse42,
{
var s : base.u64
var p : roslice base.u8
var buf : array[48] base.u8
var util : base.x86_sse42_utility
var xa : base.x86_m128i
var xb : base.x86_m128i
var xc : base.x86_m128i
var xd : base.x86_m128i
var xe : base.x86_m128i
var xf : base.x86_m128i
var xg : base.x86_m128i
var xh : base.x86_m128i
var mu1 : base.x86_m128i
var mu2 : base.x86_m128i
var mu4 : base.x86_m128i
var mu8 : base.x86_m128i
var mupx : base.x86_m128i
s = 0xFFFF_FFFF_FFFF_FFFF ^ this.state
// Align to a 16-byte boundary.
while (args.x.length() > 0) and ((15 & args.x.uintptr_low_12_bits()) <> 0) {
s = ECMA_TABLE[0][((s & 0xFF) as base.u8) ^ args.x[0]] ^ (s >> 8)
args.x = args.x[1 ..]
} endwhile
while.chain2 true {{
while.chain4 true {{
if args.x.length() >= 0x80 {
// No-op.
} else if args.x.length() >= 0x40 {
// Set up 4 accumulators.
xa = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])._mm_xor_si128(b:
util.make_m128i_single_u64(a: s))
xb = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
xc = util.make_m128i_slice128(a: args.x[0x20 .. 0x30])
xd = util.make_m128i_slice128(a: args.x[0x30 .. 0x40])
args.x = args.x[0x40 ..]
break.chain4
} else if args.x.length() >= 0x20 {
// Set up 2 accumulators.
xa = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])._mm_xor_si128(b:
util.make_m128i_single_u64(a: s))
xb = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
args.x = args.x[0x20 ..]
break.chain2
} else {
// For short inputs, just do a simple loop.
iterate (p = args.x)(length: 1, advance: 1, unroll: 1) {
s = ECMA_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8)
}
this.state = 0xFFFF_FFFF_FFFF_FFFF ^ s
return nothing
}
// Set up 8 accumulators.
xa = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])._mm_xor_si128(b:
util.make_m128i_single_u64(a: s))
xb = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
xc = util.make_m128i_slice128(a: args.x[0x20 .. 0x30])
xd = util.make_m128i_slice128(a: args.x[0x30 .. 0x40])
xe = util.make_m128i_slice128(a: args.x[0x40 .. 0x50])
xf = util.make_m128i_slice128(a: args.x[0x50 .. 0x60])
xg = util.make_m128i_slice128(a: args.x[0x60 .. 0x70])
xh = util.make_m128i_slice128(a: args.x[0x70 .. 0x80])
args.x = args.x[0x80 ..]
// Main loop.
mu8 = util.make_m128i_slice128(a: ECMA_X86_SSE42_FOLD8[.. 16])
while args.x.length() >= 0x80 {
xa = xa._mm_clmulepi64_si128(b: mu8, imm8: 0x00)._mm_xor_si128(b:
xa._mm_clmulepi64_si128(b: mu8, imm8: 0x11))._mm_xor_si128(b:
util.make_m128i_slice128(a: args.x[0x00 .. 0x10]))
xb = xb._mm_clmulepi64_si128(b: mu8, imm8: 0x00)._mm_xor_si128(b:
xb._mm_clmulepi64_si128(b: mu8, imm8: 0x11))._mm_xor_si128(b:
util.make_m128i_slice128(a: args.x[0x10 .. 0x20]))
xc = xc._mm_clmulepi64_si128(b: mu8, imm8: 0x00)._mm_xor_si128(b:
xc._mm_clmulepi64_si128(b: mu8, imm8: 0x11))._mm_xor_si128(b:
util.make_m128i_slice128(a: args.x[0x20 .. 0x30]))
xd = xd._mm_clmulepi64_si128(b: mu8, imm8: 0x00)._mm_xor_si128(b:
xd._mm_clmulepi64_si128(b: mu8, imm8: 0x11))._mm_xor_si128(b:
util.make_m128i_slice128(a: args.x[0x30 .. 0x40]))
xe = xe._mm_clmulepi64_si128(b: mu8, imm8: 0x00)._mm_xor_si128(b:
xe._mm_clmulepi64_si128(b: mu8, imm8: 0x11))._mm_xor_si128(b:
util.make_m128i_slice128(a: args.x[0x40 .. 0x50]))
xf = xf._mm_clmulepi64_si128(b: mu8, imm8: 0x00)._mm_xor_si128(b:
xf._mm_clmulepi64_si128(b: mu8, imm8: 0x11))._mm_xor_si128(b:
util.make_m128i_slice128(a: args.x[0x50 .. 0x60]))
xg = xg._mm_clmulepi64_si128(b: mu8, imm8: 0x00)._mm_xor_si128(b:
xg._mm_clmulepi64_si128(b: mu8, imm8: 0x11))._mm_xor_si128(b:
util.make_m128i_slice128(a: args.x[0x60 .. 0x70]))
xh = xh._mm_clmulepi64_si128(b: mu8, imm8: 0x00)._mm_xor_si128(b:
xh._mm_clmulepi64_si128(b: mu8, imm8: 0x11))._mm_xor_si128(b:
util.make_m128i_slice128(a: args.x[0x70 .. 0x80]))
args.x = args.x[0x80 ..]
} endwhile
// Reduce 8 chains down to 4.
mu4 = util.make_m128i_slice128(a: ECMA_X86_SSE42_FOLD4[.. 16])
xa = xa._mm_clmulepi64_si128(b: mu4, imm8: 0x00)._mm_xor_si128(b:
xa._mm_clmulepi64_si128(b: mu4, imm8: 0x11))._mm_xor_si128(b:
xe)
xb = xb._mm_clmulepi64_si128(b: mu4, imm8: 0x00)._mm_xor_si128(b:
xb._mm_clmulepi64_si128(b: mu4, imm8: 0x11))._mm_xor_si128(b:
xf)
xc = xc._mm_clmulepi64_si128(b: mu4, imm8: 0x00)._mm_xor_si128(b:
xc._mm_clmulepi64_si128(b: mu4, imm8: 0x11))._mm_xor_si128(b:
xg)
xd = xd._mm_clmulepi64_si128(b: mu4, imm8: 0x00)._mm_xor_si128(b:
xd._mm_clmulepi64_si128(b: mu4, imm8: 0x11))._mm_xor_si128(b:
xh)
if args.x.length() > 0x40 {
xa = xa._mm_clmulepi64_si128(b: mu4, imm8: 0x00)._mm_xor_si128(b:
xa._mm_clmulepi64_si128(b: mu4, imm8: 0x11))._mm_xor_si128(b:
util.make_m128i_slice128(a: args.x[0x00 .. 0x10]))
xb = xb._mm_clmulepi64_si128(b: mu4, imm8: 0x00)._mm_xor_si128(b:
xb._mm_clmulepi64_si128(b: mu4, imm8: 0x11))._mm_xor_si128(b:
util.make_m128i_slice128(a: args.x[0x10 .. 0x20]))
xc = xc._mm_clmulepi64_si128(b: mu4, imm8: 0x00)._mm_xor_si128(b:
xc._mm_clmulepi64_si128(b: mu4, imm8: 0x11))._mm_xor_si128(b:
util.make_m128i_slice128(a: args.x[0x20 .. 0x30]))
xd = xd._mm_clmulepi64_si128(b: mu4, imm8: 0x00)._mm_xor_si128(b:
xd._mm_clmulepi64_si128(b: mu4, imm8: 0x11))._mm_xor_si128(b:
util.make_m128i_slice128(a: args.x[0x30 .. 0x40]))
args.x = args.x[0x40 ..]
}
break.chain4
}} endwhile.chain4
// Reduce 4 chains down to 2.
mu2 = util.make_m128i_slice128(a: ECMA_X86_SSE42_FOLD2[.. 16])
xa = xa._mm_clmulepi64_si128(b: mu2, imm8: 0x00)._mm_xor_si128(b:
xa._mm_clmulepi64_si128(b: mu2, imm8: 0x11))._mm_xor_si128(b:
xc)
xb = xb._mm_clmulepi64_si128(b: mu2, imm8: 0x00)._mm_xor_si128(b:
xb._mm_clmulepi64_si128(b: mu2, imm8: 0x11))._mm_xor_si128(b:
xd)
if args.x.length() > 0x20 {
xa = xa._mm_clmulepi64_si128(b: mu2, imm8: 0x00)._mm_xor_si128(b:
xa._mm_clmulepi64_si128(b: mu2, imm8: 0x11))._mm_xor_si128(b:
util.make_m128i_slice128(a: args.x[0x00 .. 0x10]))
xb = xb._mm_clmulepi64_si128(b: mu2, imm8: 0x00)._mm_xor_si128(b:
xb._mm_clmulepi64_si128(b: mu2, imm8: 0x11))._mm_xor_si128(b:
util.make_m128i_slice128(a: args.x[0x10 .. 0x20]))
args.x = args.x[0x20 ..]
}
break.chain2
}} endwhile.chain2
// Reduce 2 chains down to 1.
mu1 = util.make_m128i_slice128(a: ECMA_X86_SSE42_FOLD1[.. 16])
xa = xa._mm_clmulepi64_si128(b: mu1, imm8: 0x00)._mm_xor_si128(b:
xa._mm_clmulepi64_si128(b: mu1, imm8: 0x11))._mm_xor_si128(b:
xb)
if args.x.length() > 24 {
xa = xa._mm_clmulepi64_si128(b: mu1, imm8: 0x00)._mm_xor_si128(b:
xa._mm_clmulepi64_si128(b: mu1, imm8: 0x11))._mm_xor_si128(b:
util.make_m128i_slice128(a: args.x[0x00 .. 0x10]))
args.x = args.x[0x10 ..]
if args.x.length() > 24 {
return nothing // Unreachable.
}
}
assert args.x.length() <= 24
// Copy our accumulator xa and the remaining data to buf, a 48-byte buffer.
// The buffer includes the implicitly appended 8 bytes for the CRC.
//
// | 0-padding | xa | args.x | 0-padding |
// | 0-24 bytes | 16 bytes | 0-24 bytes| 8 bytes |
//
assert (24 - args.x.length()) <= ((24 - args.x.length()) + 16) via "a <= (a + b): 0 <= b"()
xa.store_slice128!(a: buf[24 - args.x.length() .. (24 - args.x.length()) + 16])
buf[(24 - args.x.length()) + 16 ..].copy_from_slice!(s: args.x)
// Reduce the 48-byte buffer down to a 16-byte (128-bit) accumulator.
mu2 = util.make_m128i_slice128(a: ECMA_X86_SSE42_FOLD2[.. 16])
xa = util.make_m128i_slice128(a: buf[0x00 .. 0x10])
xb = util.make_m128i_slice128(a: buf[0x10 .. 0x20])
xc = util.make_m128i_slice128(a: buf[0x20 .. 0x30])
xd = xa._mm_clmulepi64_si128(b: mu2, imm8: 0x00)._mm_xor_si128(b:
xa._mm_clmulepi64_si128(b: mu2, imm8: 0x11))
xe = xb._mm_clmulepi64_si128(b: mu1, imm8: 0x00)._mm_xor_si128(b:
xb._mm_clmulepi64_si128(b: mu1, imm8: 0x11))
xa = xd._mm_xor_si128(b: xe._mm_xor_si128(b: xc))
// Reduce 128-bit to 64-bit via Barrett reduction.
mupx = util.make_m128i_slice128(a: ECMA_X86_SSE42_MUPX[.. 16])
xb = xa._mm_clmulepi64_si128(b: mupx, imm8: 0x00)
xc = xb._mm_clmulepi64_si128(b: mupx, imm8: 0x10)
s = xc._mm_xor_si128(b: xb._mm_slli_si128(imm8: 8)).
_mm_xor_si128(b: xa)._mm_extract_epi64(imm8: 1)
this.state = 0xFFFF_FFFF_FFFF_FFFF ^ s
}
// These constants are reproduced by
// script/print-crc64-x86-sse42-magic-numbers.go
//
// The rkN names match the numbers at
// https://github.com/intel/isa-l/blob/7b30857e20b84e5afab1a28291189b9dc571110d/crc/crc64_ecma_refl_by16_10.asm#L33-L56
pri const ECMA_X86_SSE42_FOLD1 : roarray[16] base.u8 = [
0xE4, 0x3A, 0x39, 0xCA, 0x97, 0xD4, 0x5D, 0xE0, // fold1a' = 0xE05D_D497_CA39_3AE4 = rk2
0x40, 0x5F, 0x87, 0xC7, 0xAF, 0x95, 0xBE, 0xDA, // fold1b' = 0xDABE_95AF_C787_5F40 = rk1
]
pri const ECMA_X86_SSE42_FOLD2 : roarray[16] base.u8 = [
0x44, 0xFA, 0x9E, 0x8A, 0x00, 0x5B, 0x09, 0x60, // fold2a' = 0x6009_5B00_8A9E_FA44 = rk20
0x51, 0xAF, 0xE1, 0x0F, 0xA3, 0x53, 0xE6, 0x3B, // fold2b' = 0x3BE6_53A3_0FE1_AF51 = rk19
]
pri const ECMA_X86_SSE42_FOLD4 : roarray[16] base.u8 = [
0xF3, 0x41, 0xD4, 0x9D, 0xBB, 0xEF, 0xE3, 0x6A, // fold4a' = 0x6AE3_EFBB_9DD4_41F3 = rk16
0xF4, 0x2D, 0x84, 0xA7, 0x54, 0x60, 0x1F, 0x08, // fold4b' = 0x081F_6054_A784_2DF4 = rk15
]
pri const ECMA_X86_SSE42_FOLD8 : roarray[16] base.u8 = [
0x00, 0x10, 0xCC, 0x4F, 0x1D, 0xD7, 0x57, 0x87, // fold8a' = 0x8757_D71D_4FCC_1000 = rk4
0x40, 0xE7, 0x3D, 0xF7, 0x2A, 0x6B, 0xD8, 0xD7, // fold8b' = 0xD7D8_6B2A_F73D_E740 = rk3
]
pri const ECMA_X86_SSE42_MUPX : roarray[16] base.u8 = [
0xD5, 0x63, 0x29, 0x17, 0x6C, 0x46, 0x3E, 0x9C, // μ' = 0x9C3E_466C_1729_63D5 = rk7
0x85, 0x1E, 0x0E, 0xAF, 0x2B, 0xAF, 0xD8, 0x92, // Px' = 0x92D8_AF2B_AF0E_1E85 = rk8
]