| // Copyright 2024 The Wuffs Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
| // option. This file may not be copied, modified, or distributed |
| // except according to those terms. |
| // |
| // SPDX-License-Identifier: Apache-2.0 OR MIT |
| |
| //go:build ignore |
| // +build ignore |
| |
| package main |
| |
| // print-crc32-x86-sse42-code.go prints the std/crc32 x86/SSE4.2 Wuffs code |
| // based on some C code generated by https://github.com/corsix/fast-crc32/ |
| // |
| // Usage: go run print-crc32-x86-sse42-code.go |
| |
| import ( |
| "fmt" |
| "regexp" |
| "strconv" |
| "strings" |
| ) |
| |
| func main() { |
| var ( |
| reXEqLoadu = regexp.MustCompile(`^__m128i x(\d+) = _mm_loadu_si128`) |
| reKEqSetr = regexp.MustCompile(`^k = _mm_setr_epi32\(([^,]+), ([^,]+), ([^,]+), ([^\)]+)\);$`) |
| reYEqClmul = regexp.MustCompile(`^y(\d+) = clmul_lo\(x(\d+), k\), x(\d+) = clmul_hi\(x(\d+), k\);$`) |
| reYEqXorLoadu = regexp.MustCompile(`^y(\d+) = _mm_xor_si128\(y(\d+), _mm_loadu_si128`) |
| reYEqXorYX = regexp.MustCompile(`^y(\d+) = _mm_xor_si128\(y(\d+), x(\d+)\), x(\d+) = _mm_xor_si128\(x(\d+), y(\d+)\);$`) |
| ) |
| |
| fmt.Println("// BEGIN script/print-crc32-x86-sse42-code.go generated code.") |
| for src := srcSSECRC32V8; src != ""; { |
| i := strings.IndexByte(src, '\n') |
| line := strings.TrimSpace(src[:i]) |
| src = src[i+1:] |
| |
| if (line == "") || strings.HasPrefix(line, "/*") { |
| continue |
| |
| } else if s := reXEqLoadu.FindStringSubmatch(line); len(s) > 0 { |
| n, _ := strconv.Atoi(s[1]) |
| fmt.Printf("x%d = util.make_m128i_slice128(a: args.x[0x%02X .. 0x%02X])\n", n, 16*(n), 16*(n+1)) |
| |
| } else if line == "__m128i k;" { |
| continue |
| |
| } else if s := reKEqSetr.FindStringSubmatch(line); len(s) > 0 { |
| fmt.Printf("kk = util.make_m128i_multiple_u32(a00: %s, a01: %s, a02: %s, a03: %s)\n", s[1], s[2], s[3], s[4]) |
| |
| } else if line == "x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);" { |
| fmt.Printf("x0 = x0._mm_xor_si128(b: util.make_m128i_single_u32(a: s))\n") |
| |
| } else if line == "buf += 128;" { |
| fmt.Printf("args.x = args.x[128 ..]\n") |
| |
| } else if line == "len -= 128;" { |
| continue |
| |
| } else if line == "while (len >= 128) {" { |
| fmt.Printf("while args.x.length() >= 128 {\n") |
| |
| } else if line == "}" { |
| fmt.Printf("} endwhile\n") |
| |
| } else if s := reYEqClmul.FindStringSubmatch(line); len(s) > 0 { |
| fmt.Printf("y%s = x%s._mm_clmulepi64_si128(b: kk, imm8: 0x00)\n", s[1], s[2]) |
| fmt.Printf("x%s = x%s._mm_clmulepi64_si128(b: kk, imm8: 0x11)\n", s[3], s[4]) |
| |
| } else if s := reYEqXorLoadu.FindStringSubmatch(line); len(s) > 0 { |
| n, _ := strconv.Atoi(s[1]) |
| fmt.Printf("y%d = y%d._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x%02X .. 0x%02X]))\n", n, n, 16*(n), 16*(n+1)) |
| fmt.Printf("x%d = x%d._mm_xor_si128(b: y%d)\n", n, n, n) |
| |
| } else if s := reYEqXorYX.FindStringSubmatch(line); len(s) > 0 { |
| fmt.Printf("y%s = y%s._mm_xor_si128(b: x%s)\n", s[1], s[2], s[3]) |
| fmt.Printf("x%s = x%s._mm_xor_si128(b: y%s)\n", s[4], s[5], s[6]) |
| |
| } else if line == "crc0 = crc_u64(0, _mm_extract_epi64(x0, 0));" { |
| fmt.Printf("kk = util.make_m128i_multiple_u32(a00: 0xF701_1641, a01: 0xB4E5_B025, a02: 0xDB71_0641, a03: 1)\n") |
| fmt.Printf("s = util.make_m128i_single_u64(a: x0._mm_extract_epi64(imm8: 0)).\n") |
| fmt.Printf(" _mm_clmulepi64_si128(b: kk, imm8: 0x00).\n") |
| fmt.Printf(" _mm_clmulepi64_si128(b: kk, imm8: 0x10).\n") |
| fmt.Printf(" _mm_extract_epi32(imm8: 2)\n") |
| |
| // fmt.Printf("s = util.make_m128i_single_u64(a: (s as base.u64) ^ args.x.peek_u64le()).\n") |
| } else if line == "crc0 = crc_u64(crc0, _mm_extract_epi64(x0, 1));" { |
| fmt.Printf("kk = util.make_m128i_multiple_u32(a00: 0xF701_1641, a01: 0xB4E5_B025, a02: 0xDB71_0641, a03: 1)\n") |
| fmt.Printf("s = util.make_m128i_single_u64(a: x0._mm_extract_epi64(imm8: 1) ^ (s as base.u64)).\n") |
| fmt.Printf(" _mm_clmulepi64_si128(b: kk, imm8: 0x00).\n") |
| fmt.Printf(" _mm_clmulepi64_si128(b: kk, imm8: 0x10).\n") |
| fmt.Printf(" _mm_extract_epi32(imm8: 2)\n") |
| |
| } else { |
| fmt.Printf("// Could not process %q.\n", line) |
| break |
| } |
| } |
| fmt.Println("// END script/print-crc32-x86-sse42-code.go generated code.") |
| } |
| |
// This is the core (inside "if (len >= 128)") of the code produced by
// generate.c in https://github.com/corsix/fast-crc32/ when parameterized by
// "./generate -i sse -p crc32 -a v8".
//
// Keep this text byte-for-byte as generated: main() recognizes each line by
// exact string comparison or by regexps tied to these precise statement
// shapes, and its line-splitting loop relies on every line (including the
// last) ending in '\n'. Any reformatting here would surface as
// "// Could not process" in the output.
const srcSSECRC32V8 = `
/* First vector chunk. */
__m128i x0 = _mm_loadu_si128((const __m128i*)buf), y0;
__m128i x1 = _mm_loadu_si128((const __m128i*)(buf + 16)), y1;
__m128i x2 = _mm_loadu_si128((const __m128i*)(buf + 32)), y2;
__m128i x3 = _mm_loadu_si128((const __m128i*)(buf + 48)), y3;
__m128i x4 = _mm_loadu_si128((const __m128i*)(buf + 64)), y4;
__m128i x5 = _mm_loadu_si128((const __m128i*)(buf + 80)), y5;
__m128i x6 = _mm_loadu_si128((const __m128i*)(buf + 96)), y6;
__m128i x7 = _mm_loadu_si128((const __m128i*)(buf + 112)), y7;
__m128i k;
k = _mm_setr_epi32(0x33fff533, 0, 0x910eeec1, 0);
x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);
buf += 128;
len -= 128;
/* Main loop. */
while (len >= 128) {
  y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
  y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
  y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
  y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
  y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
  y5 = clmul_lo(x5, k), x5 = clmul_hi(x5, k);
  y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
  y7 = clmul_lo(x7, k), x7 = clmul_hi(x7, k);
  y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i*)buf)), x0 = _mm_xor_si128(x0, y0);
  y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i*)(buf + 16))), x1 = _mm_xor_si128(x1, y1);
  y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i*)(buf + 32))), x2 = _mm_xor_si128(x2, y2);
  y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i*)(buf + 48))), x3 = _mm_xor_si128(x3, y3);
  y4 = _mm_xor_si128(y4, _mm_loadu_si128((const __m128i*)(buf + 64))), x4 = _mm_xor_si128(x4, y4);
  y5 = _mm_xor_si128(y5, _mm_loadu_si128((const __m128i*)(buf + 80))), x5 = _mm_xor_si128(x5, y5);
  y6 = _mm_xor_si128(y6, _mm_loadu_si128((const __m128i*)(buf + 96))), x6 = _mm_xor_si128(x6, y6);
  y7 = _mm_xor_si128(y7, _mm_loadu_si128((const __m128i*)(buf + 112))), x7 = _mm_xor_si128(x7, y7);
  buf += 128;
  len -= 128;
}
/* Reduce x0 ... x7 to just x0. */
k = _mm_setr_epi32(0xae689191, 0, 0xccaa009e, 0);
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0);
y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2);
y4 = _mm_xor_si128(y4, x5), x4 = _mm_xor_si128(x4, y4);
y6 = _mm_xor_si128(y6, x7), x6 = _mm_xor_si128(x6, y6);
k = _mm_setr_epi32(0xf1da05aa, 0, 0x81256527, 0);
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0);
y4 = _mm_xor_si128(y4, x6), x4 = _mm_xor_si128(x4, y4);
k = _mm_setr_epi32(0x8f352d95, 0, 0x1d9513d7, 0);
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
y0 = _mm_xor_si128(y0, x4), x0 = _mm_xor_si128(x0, y0);
/* Reduce 128 bits to 32 bits, and multiply by x^32. */
crc0 = crc_u64(0, _mm_extract_epi64(x0, 0));
crc0 = crc_u64(crc0, _mm_extract_epi64(x0, 1));
`