blob: f6c5db0258fe1b9ea6f1a1b49bafc72b95d3ec1a [file] [log] [blame]
// Copyright 2024 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//
// SPDX-License-Identifier: Apache-2.0 OR MIT
//go:build ignore
// +build ignore
package main
// print-crc32-x86-sse42-code.go prints the std/crc32 x86/SSE4.2 Wuffs code
// based on some C code generated by https://github.com/corsix/fast-crc32/
//
// Usage: go run print-crc32-x86-sse42-code.go
import (
"fmt"
"regexp"
"strconv"
"strings"
)
// Patterns for the C statements whose Wuffs translation depends on the
// register indexes or constants that they mention. Statements that are
// translated verbatim are matched by plain string comparison instead.
var (
	reXEqLoadu    = regexp.MustCompile(`^__m128i x(\d+) = _mm_loadu_si128`)
	reKEqSetr     = regexp.MustCompile(`^k = _mm_setr_epi32\(([^,]+), ([^,]+), ([^,]+), ([^\)]+)\);$`)
	reYEqClmul    = regexp.MustCompile(`^y(\d+) = clmul_lo\(x(\d+), k\), x(\d+) = clmul_hi\(x(\d+), k\);$`)
	reYEqXorLoadu = regexp.MustCompile(`^y(\d+) = _mm_xor_si128\(y(\d+), _mm_loadu_si128`)
	reYEqXorYX    = regexp.MustCompile(`^y(\d+) = _mm_xor_si128\(y(\d+), x(\d+)\), x(\d+) = _mm_xor_si128\(x(\d+), y(\d+)\);$`)
)

// main prints, to standard output, the Wuffs translation of srcSSECRC32V8
// between a BEGIN and an END marker line.
//
// The C code is translated one line at a time. Translation stops at the first
// line that is not recognized: a "Could not process" comment naming that line
// is printed, followed by the END marker.
func main() {
	fmt.Println("// BEGIN script/print-crc32-x86-sse42-code.go generated code.")
	// Splitting up front (instead of repeatedly slicing at the next '\n')
	// also copes with a final line that has no trailing newline.
	for _, raw := range strings.Split(srcSSECRC32V8, "\n") {
		line := strings.TrimSpace(raw)
		if !printWuffsLine(line) {
			fmt.Printf("// Could not process %q.\n", line)
			break
		}
	}
	fmt.Println("// END script/print-crc32-x86-sse42-code.go generated code.")
}

// printWuffsLine prints the Wuffs translation of one whitespace-trimmed line
// of C code and reports whether the line was recognized.
//
// Blank lines, lines starting with "/*", the declaration of k and the
// "len -= 128;" bookkeeping are recognized but print nothing.
func printWuffsLine(line string) bool {
	switch line {
	case "", "__m128i k;", "len -= 128;":
		return true
	case "x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);":
		fmt.Println("x0 = x0._mm_xor_si128(b: util.make_m128i_single_u32(a: s))")
		return true
	case "buf += 128;":
		fmt.Println("args.x = args.x[128 ..]")
		return true
	case "while (len >= 128) {":
		fmt.Println("while args.x.length() >= 128 {")
		return true
	case "}":
		fmt.Println("} endwhile")
		return true
	case "crc0 = crc_u64(0, _mm_extract_epi64(x0, 0));":
		printReduce64("x0._mm_extract_epi64(imm8: 0)")
		return true
	case "crc0 = crc_u64(crc0, _mm_extract_epi64(x0, 1));":
		// The second step also folds in s, the result of the first step.
		printReduce64("x0._mm_extract_epi64(imm8: 1) ^ (s as base.u64)")
		return true
	}

	if strings.HasPrefix(line, "/*") {
		return true
	}

	if s := reXEqLoadu.FindStringSubmatch(line); s != nil {
		// The slice bounds are derived from the register index: xN is
		// loaded from the N'th 16-byte chunk.
		n, err := strconv.Atoi(s[1])
		if err != nil {
			return false
		}
		fmt.Printf("x%d = util.make_m128i_slice128(a: args.x[0x%02X .. 0x%02X])\n", n, 16*(n), 16*(n+1))
		return true
	}

	if s := reKEqSetr.FindStringSubmatch(line); s != nil {
		fmt.Printf("kk = util.make_m128i_multiple_u32(a00: %s, a01: %s, a02: %s, a03: %s)\n", s[1], s[2], s[3], s[4])
		return true
	}

	if s := reYEqClmul.FindStringSubmatch(line); s != nil {
		// One C line holds two comma-separated assignments; Wuffs gets one
		// statement per assignment.
		fmt.Printf("y%s = x%s._mm_clmulepi64_si128(b: kk, imm8: 0x00)\n", s[1], s[2])
		fmt.Printf("x%s = x%s._mm_clmulepi64_si128(b: kk, imm8: 0x11)\n", s[3], s[4])
		return true
	}

	if s := reYEqXorLoadu.FindStringSubmatch(line); s != nil {
		// Only the first register index is used: the chunk bounds and the
		// trailing "xN ^= yN" statement are both derived from it rather than
		// parsed from the rest of the line.
		n, err := strconv.Atoi(s[1])
		if err != nil {
			return false
		}
		fmt.Printf("y%d = y%d._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x%02X .. 0x%02X]))\n", n, n, 16*(n), 16*(n+1))
		fmt.Printf("x%d = x%d._mm_xor_si128(b: y%d)\n", n, n, n)
		return true
	}

	if s := reYEqXorYX.FindStringSubmatch(line); s != nil {
		fmt.Printf("y%s = y%s._mm_xor_si128(b: x%s)\n", s[1], s[2], s[3])
		fmt.Printf("x%s = x%s._mm_xor_si128(b: y%s)\n", s[4], s[5], s[6])
		return true
	}

	return false
}

// printReduce64 prints the Wuffs statements that stand in for one C crc_u64
// call: it sets kk to a fixed set of constants and then assigns s from
// operand (a Wuffs expression) passed through two carry-less multiplications
// by kk and a 32-bit extraction.
func printReduce64(operand string) {
	fmt.Println("kk = util.make_m128i_multiple_u32(a00: 0xF701_1641, a01: 0xB4E5_B025, a02: 0xDB71_0641, a03: 1)")
	fmt.Printf("s = util.make_m128i_single_u64(a: %s).\n", operand)
	fmt.Println(" _mm_clmulepi64_si128(b: kk, imm8: 0x00).")
	fmt.Println(" _mm_clmulepi64_si128(b: kk, imm8: 0x10).")
	fmt.Println(" _mm_extract_epi32(imm8: 2)")
}
// srcSSECRC32V8 is the C code that main translates, line by line, into Wuffs
// code. It is the core (inside "if (len >= 128)") of the code produced by
// generate.c in https://github.com/corsix/fast-crc32/ when parameterized by
// "./generate -i sse -p crc32 -a v8".
//
// main trims each line, skips blank lines and lines starting with "/*", and
// stops at the first line it does not recognize, so changes to this text may
// need matching changes to main.
const srcSSECRC32V8 = `
/* First vector chunk. */
__m128i x0 = _mm_loadu_si128((const __m128i*)buf), y0;
__m128i x1 = _mm_loadu_si128((const __m128i*)(buf + 16)), y1;
__m128i x2 = _mm_loadu_si128((const __m128i*)(buf + 32)), y2;
__m128i x3 = _mm_loadu_si128((const __m128i*)(buf + 48)), y3;
__m128i x4 = _mm_loadu_si128((const __m128i*)(buf + 64)), y4;
__m128i x5 = _mm_loadu_si128((const __m128i*)(buf + 80)), y5;
__m128i x6 = _mm_loadu_si128((const __m128i*)(buf + 96)), y6;
__m128i x7 = _mm_loadu_si128((const __m128i*)(buf + 112)), y7;
__m128i k;
k = _mm_setr_epi32(0x33fff533, 0, 0x910eeec1, 0);
x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);
buf += 128;
len -= 128;
/* Main loop. */
while (len >= 128) {
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
y5 = clmul_lo(x5, k), x5 = clmul_hi(x5, k);
y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
y7 = clmul_lo(x7, k), x7 = clmul_hi(x7, k);
y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i*)buf)), x0 = _mm_xor_si128(x0, y0);
y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i*)(buf + 16))), x1 = _mm_xor_si128(x1, y1);
y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i*)(buf + 32))), x2 = _mm_xor_si128(x2, y2);
y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i*)(buf + 48))), x3 = _mm_xor_si128(x3, y3);
y4 = _mm_xor_si128(y4, _mm_loadu_si128((const __m128i*)(buf + 64))), x4 = _mm_xor_si128(x4, y4);
y5 = _mm_xor_si128(y5, _mm_loadu_si128((const __m128i*)(buf + 80))), x5 = _mm_xor_si128(x5, y5);
y6 = _mm_xor_si128(y6, _mm_loadu_si128((const __m128i*)(buf + 96))), x6 = _mm_xor_si128(x6, y6);
y7 = _mm_xor_si128(y7, _mm_loadu_si128((const __m128i*)(buf + 112))), x7 = _mm_xor_si128(x7, y7);
buf += 128;
len -= 128;
}
/* Reduce x0 ... x7 to just x0. */
k = _mm_setr_epi32(0xae689191, 0, 0xccaa009e, 0);
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0);
y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2);
y4 = _mm_xor_si128(y4, x5), x4 = _mm_xor_si128(x4, y4);
y6 = _mm_xor_si128(y6, x7), x6 = _mm_xor_si128(x6, y6);
k = _mm_setr_epi32(0xf1da05aa, 0, 0x81256527, 0);
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0);
y4 = _mm_xor_si128(y4, x6), x4 = _mm_xor_si128(x4, y4);
k = _mm_setr_epi32(0x8f352d95, 0, 0x1d9513d7, 0);
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
y0 = _mm_xor_si128(y0, x4), x0 = _mm_xor_si128(x0, y0);
/* Reduce 128 bits to 32 bits, and multiply by x^32. */
crc0 = crc_u64(0, _mm_extract_epi64(x0, 0));
crc0 = crc_u64(crc0, _mm_extract_epi64(x0, 1));
`