blob: 9c14eb490457db2124de673c5cb444310b48c82b [file] [log] [blame]
// Copyright 2023 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//
// SPDX-License-Identifier: Apache-2.0 OR MIT
pri func decoder.decode_idct_x86_avx2!(dst_buffer: slice base.u8, dst_stride: base.u64, q: base.u32[..= 3]),
choose cpu_arch >= x86_avx2,
{
var sse42_util : base.x86_sse42_utility
var util : base.x86_avx2_utility
// SIMD constants.
var k_0000 : base.x86_m256i
var k_8080 : base.x86_m256i
var k_0000_0002 : base.x86_m256i
var k_0001_FFFF : base.x86_m256i
var k_0400_0000 : base.x86_m256i
var k_29CF_1151_D630_1151 : base.x86_m256i
var k_E333_133E_ADFD_1051 : base.x86_m256i
var k_E6DC_25A1_1925_25A1 : base.x86_m256i
var k_ECC1_E333_EFB0_ADFD : base.x86_m256i
// "Are the AC terms zero" variable names start with "az_".
var az_coeffs : base.x86_m128i
var az_ah00 : base.x86_m256i
var az_ad00 : base.x86_m256i
var az_eh00 : base.x86_m256i
var az_adeh : base.x86_m256i
// Set up variables.
var rows01 : base.x86_m256i
var rows23 : base.x86_m256i
var rows45 : base.x86_m256i
var rows67 : base.x86_m256i
var quants01 : base.x86_m256i
var quants23 : base.x86_m256i
var quants45 : base.x86_m256i
var quants67 : base.x86_m256i
var rows04 : base.x86_m256i
var rows31 : base.x86_m256i
var rows26 : base.x86_m256i
var rows75 : base.x86_m256i
// First pass variable names start with "fp_".
var fp_rows62 : base.x86_m256i
var fp_bq2662ad : base.x86_m256i
var fp_bq2662eh : base.x86_m256i
var fp_cb26ad : base.x86_m256i
var fp_cb26eh : base.x86_m256i
var fp_rows40pos : base.x86_m256i
var fp_rows04neg : base.x86_m256i
var fp_rows0pm4 : base.x86_m256i
var fp_ccpmad : base.x86_m256i
var fp_ccpmeh : base.x86_m256i
var fp_cd01ad : base.x86_m256i
var fp_cd01eh : base.x86_m256i
var fp_cd32ad : base.x86_m256i
var fp_cd32eh : base.x86_m256i
var fp_sums7351 : base.x86_m256i
var fp_sums5173 : base.x86_m256i
var fp_ci73515173ad : base.x86_m256i
var fp_ci73515173eh : base.x86_m256i
var fp_cl7351ad : base.x86_m256i
var fp_cl7351eh : base.x86_m256i
var fp_rows13 : base.x86_m256i
var fp_bq7153ad : base.x86_m256i
var fp_bq7153eh : base.x86_m256i
var fp_ck75ad : base.x86_m256i
var fp_ck75eh : base.x86_m256i
var fp_cl5173ad : base.x86_m256i
var fp_cl5173eh : base.x86_m256i
var fp_ck13ad : base.x86_m256i
var fp_ck13eh : base.x86_m256i
// Intermediate variables between first and second pass.
var intermediate01ad : base.x86_m256i
var intermediate01eh : base.x86_m256i
var intermediate01 : base.x86_m256i
var intermediate32ad : base.x86_m256i
var intermediate32eh : base.x86_m256i
var intermediate32 : base.x86_m256i
var intermediate45ad : base.x86_m256i
var intermediate45eh : base.x86_m256i
var intermediate45 : base.x86_m256i
var intermediate76ad : base.x86_m256i
var intermediate76eh : base.x86_m256i
var intermediate76 : base.x86_m256i
var ita0a1e0e1 : base.x86_m256i
var ita2a3e2e3 : base.x86_m256i
var ita4a5e4e5 : base.x86_m256i
var ita6a7e6e7 : base.x86_m256i
var ita0c0e0g0 : base.x86_m256i
var ita1c1e1g1 : base.x86_m256i
var ita4c4e4g4 : base.x86_m256i
var ita5c5e5g5 : base.x86_m256i
var ita0b0e0f0 : base.x86_m256i
var ita4b4e4f4 : base.x86_m256i
var itc0d0g0h0 : base.x86_m256i
var itc4d4g4h4 : base.x86_m256i
var intermediateae : base.x86_m256i
var intermediatebf : base.x86_m256i
var intermediatecg : base.x86_m256i
var intermediatedh : base.x86_m256i
var intermediatedb : base.x86_m256i
var intermediatehf : base.x86_m256i
// Second pass variable names start with "sp_".
var sp_cols62 : base.x86_m256i
var sp_bq2662ad : base.x86_m256i
var sp_bq2662eh : base.x86_m256i
var sp_rb26ad : base.x86_m256i
var sp_rb26eh : base.x86_m256i
var sp_cols40pos : base.x86_m256i
var sp_cols04neg : base.x86_m256i
var sp_cols0pm4 : base.x86_m256i
var sp_rcpmad : base.x86_m256i
var sp_rcpmeh : base.x86_m256i
var sp_rd01ad : base.x86_m256i
var sp_rd01eh : base.x86_m256i
var sp_rd32ad : base.x86_m256i
var sp_rd32eh : base.x86_m256i
var sp_sums7351 : base.x86_m256i
var sp_sums5173 : base.x86_m256i
var sp_ri73515173ad : base.x86_m256i
var sp_ri73515173eh : base.x86_m256i
var sp_rl7351ad : base.x86_m256i
var sp_rl7351eh : base.x86_m256i
var sp_cols13 : base.x86_m256i
var sp_bq7153ad : base.x86_m256i
var sp_bq7153eh : base.x86_m256i
var sp_rk75ad : base.x86_m256i
var sp_rk75eh : base.x86_m256i
var sp_rl5173ad : base.x86_m256i
var sp_rl5173eh : base.x86_m256i
var sp_rk13ad : base.x86_m256i
var sp_rk13eh : base.x86_m256i
// Final variables.
var final01ad : base.x86_m256i
var final01eh : base.x86_m256i
var final01 : base.x86_m256i
var final32ad : base.x86_m256i
var final32eh : base.x86_m256i
var final32 : base.x86_m256i
var final45ad : base.x86_m256i
var final45eh : base.x86_m256i
var final45 : base.x86_m256i
var final76ad : base.x86_m256i
var final76eh : base.x86_m256i
var final76 : base.x86_m256i
var fta0a1e0e1 : base.x86_m256i
var fta2a3e2e3 : base.x86_m256i
var fta4a5e4e5 : base.x86_m256i
var fta6a7e6e7 : base.x86_m256i
var fta0c0e0g0 : base.x86_m256i
var fta1c1e1g1 : base.x86_m256i
var fta4c4e4g4 : base.x86_m256i
var fta5c5e5g5 : base.x86_m256i
var fta0b0e0f0 : base.x86_m256i
var ftc0d0g0h0 : base.x86_m256i
var fta4b4e4f4 : base.x86_m256i
var ftc4d4g4h4 : base.x86_m256i
var finalae : base.x86_m256i
var finalbf : base.x86_m256i
var finalcg : base.x86_m256i
var finaldh : base.x86_m256i
var final0145 : base.x86_m256i
var final2367 : base.x86_m256i
var final0 : base.u64
var final1 : base.u64
var final2 : base.u64
var final3 : base.u64
var final4 : base.u64
var final5 : base.u64
var final6 : base.u64
var final7 : base.u64
var remaining : slice base.u8
// ----
if 8 > args.dst_stride {
return nothing
}
k_0000 = util.make_m256i_multiple_u16(
a00: 0x0000, a01: 0x0000, a02: 0x0000, a03: 0x0000,
a04: 0x0000, a05: 0x0000, a06: 0x0000, a07: 0x0000,
a08: 0x0000, a09: 0x0000, a10: 0x0000, a11: 0x0000,
a12: 0x0000, a13: 0x0000, a14: 0x0000, a15: 0x0000)
k_8080 = util.make_m256i_multiple_u16(
a00: 0x8080, a01: 0x8080, a02: 0x8080, a03: 0x8080,
a04: 0x8080, a05: 0x8080, a06: 0x8080, a07: 0x8080,
a08: 0x8080, a09: 0x8080, a10: 0x8080, a11: 0x8080,
a12: 0x8080, a13: 0x8080, a14: 0x8080, a15: 0x8080)
k_0000_0002 = util.make_m256i_multiple_u16(
a00: 0x0000, a01: 0x0002, a02: 0x0000, a03: 0x0002,
a04: 0x0000, a05: 0x0002, a06: 0x0000, a07: 0x0002,
a08: 0x0000, a09: 0x0002, a10: 0x0000, a11: 0x0002,
a12: 0x0000, a13: 0x0002, a14: 0x0000, a15: 0x0002)
k_0001_FFFF = util.make_m256i_multiple_u16(
a00: 0x0001, a01: 0x0001, a02: 0x0001, a03: 0x0001,
a04: 0x0001, a05: 0x0001, a06: 0x0001, a07: 0x0001,
a08: 0xFFFF, a09: 0xFFFF, a10: 0xFFFF, a11: 0xFFFF,
a12: 0xFFFF, a13: 0xFFFF, a14: 0xFFFF, a15: 0xFFFF)
k_0400_0000 = util.make_m256i_multiple_u16(
a00: 0x0400, a01: 0x0000, a02: 0x0400, a03: 0x0000,
a04: 0x0400, a05: 0x0000, a06: 0x0400, a07: 0x0000,
a08: 0x0400, a09: 0x0000, a10: 0x0400, a11: 0x0000,
a12: 0x0400, a13: 0x0000, a14: 0x0400, a15: 0x0000)
k_29CF_1151_D630_1151 = util.make_m256i_multiple_u16(
a00: 0x29CF, a01: 0x1151, a02: 0x29CF, a03: 0x1151,
a04: 0x29CF, a05: 0x1151, a06: 0x29CF, a07: 0x1151,
a08: 0xD630, a09: 0x1151, a10: 0xD630, a11: 0x1151,
a12: 0xD630, a13: 0x1151, a14: 0xD630, a15: 0x1151)
k_E333_133E_ADFD_1051 = util.make_m256i_multiple_u16(
a00: 0xE333, a01: 0x133E, a02: 0xE333, a03: 0x133E,
a04: 0xE333, a05: 0x133E, a06: 0xE333, a07: 0x133E,
a08: 0xADFD, a09: 0x1051, a10: 0xADFD, a11: 0x1051,
a12: 0xADFD, a13: 0x1051, a14: 0xADFD, a15: 0x1051)
k_E6DC_25A1_1925_25A1 = util.make_m256i_multiple_u16(
a00: 0xE6DC, a01: 0x25A1, a02: 0xE6DC, a03: 0x25A1,
a04: 0xE6DC, a05: 0x25A1, a06: 0xE6DC, a07: 0x25A1,
a08: 0x1925, a09: 0x25A1, a10: 0x1925, a11: 0x25A1,
a12: 0x1925, a13: 0x25A1, a14: 0x1925, a15: 0x25A1)
k_ECC1_E333_EFB0_ADFD = util.make_m256i_multiple_u16(
a00: 0xECC1, a01: 0xE333, a02: 0xECC1, a03: 0xE333,
a04: 0xECC1, a05: 0xE333, a06: 0xECC1, a07: 0xE333,
a08: 0xEFB0, a09: 0xADFD, a10: 0xEFB0, a11: 0xADFD,
a12: 0xEFB0, a13: 0xADFD, a14: 0xEFB0, a15: 0xADFD)
// ----
while.goto_do_second_pass true,
inv 8 <= args.dst_stride,
{{
// Name the columns a ..= h and the rows 0 ..= 7.
//
// Check if the AC terms are zero. First, a relatively cheap check of the
// (a1, b1, c1, d1, a2, b2, c2, d2) coefficients - two half-rows.
if 0 == (util.make_u64_slice_u16lex4(a: this.mcu_blocks[0][0x08 .. 0x0C]) |
util.make_u64_slice_u16lex4(a: this.mcu_blocks[0][0x10 .. 0x14])) {
// A more thorough check of every coefficient in rows 1 ..= 7.
az_coeffs = sse42_util.make_m128i_slice_u16lex8(a: this.mcu_blocks[0][0x08 .. 0x10])._mm_or_si128(b:
sse42_util.make_m128i_slice_u16lex8(a: this.mcu_blocks[0][0x10 .. 0x18]))._mm_or_si128(b:
sse42_util.make_m128i_slice_u16lex8(a: this.mcu_blocks[0][0x18 .. 0x20]))._mm_or_si128(b:
sse42_util.make_m128i_slice_u16lex8(a: this.mcu_blocks[0][0x20 .. 0x28]))._mm_or_si128(b:
sse42_util.make_m128i_slice_u16lex8(a: this.mcu_blocks[0][0x28 .. 0x30]))._mm_or_si128(b:
sse42_util.make_m128i_slice_u16lex8(a: this.mcu_blocks[0][0x30 .. 0x38]))._mm_or_si128(b:
sse42_util.make_m128i_slice_u16lex8(a: this.mcu_blocks[0][0x38 .. 0x40]))
// Pack u16x8 into u8x8 in the low 64 bits (using signed saturation)
// and bulk-compare to zero. Only "zero versus non-zero" matters.
if 0 == az_coeffs._mm_packs_epi16(b: az_coeffs).truncate_u64() {
// Dequantize, similar to the "Set up" code path below.
rows01 = util.make_m256i_slice_u16lex16(a: this.mcu_blocks[0][0x00 .. 0x10])
quants01 = util.make_m256i_slice_u16lex16(a: this.quant_tables[args.q][0x00 .. 0x10])
rows01 = rows01._mm256_mullo_epi16(b: quants01)
// az_ah00 = i16x16 [a b c d e f g h 0 0 0 0 0 0 0 0]
// az_ad00 = i16x16 [a a b b c c d d 0 0 0 0 0 0 0 0]
// az_eh00 = i16x16 [e e f f g g h h 0 0 0 0 0 0 0 0]
// az_adeh = i16x16 [a a b b c c d d e e f f g g h h]
az_ah00 = rows01._mm256_slli_epi16(imm8: 2) // PASS1_BITS is 2.
az_ad00 = az_ah00._mm256_unpacklo_epi16(b: az_ah00)
az_eh00 = az_ah00._mm256_unpackhi_epi16(b: az_ah00)
az_adeh = az_ad00._mm256_inserti128_si256(b: az_eh00._mm256_castsi256_si128(), imm8: 1)
// intermediateae = i16x16 [a a a a a a a a e e e e e e e e]
// intermediatebf = i16x16 [b b b b b b b b f f f f f f f f]
// intermediatecg = i16x16 [c c c c c c c c g g g g g g g g]
// intermediatedh = i16x16 [d d d d d d d d h h h h h h h h]
intermediateae = az_adeh._mm256_shuffle_epi32(imm8: 0x00)
intermediatebf = az_adeh._mm256_shuffle_epi32(imm8: 0x55)
intermediatecg = az_adeh._mm256_shuffle_epi32(imm8: 0xAA)
intermediatedh = az_adeh._mm256_shuffle_epi32(imm8: 0xFF)
break.goto_do_second_pass
}
}
// ----
// Set up.
// Dequantize:
//
// rows01 = i16x16 [a0 b0 .. g0 h0
// a1 b1 .. g1 h1]
// rows23 = i16x16 [a2 b2 .. g2 h2
// a3 b3 .. g3 h3]
// rows45 = i16x16 [a4 b4 .. g4 h4
// a5 b5 .. g5 h5]
// rows67 = i16x16 [a6 b6 .. g6 h6
// a7 b7 .. g7 h7]
rows01 = util.make_m256i_slice_u16lex16(a: this.mcu_blocks[0][0x00 .. 0x10])
rows23 = util.make_m256i_slice_u16lex16(a: this.mcu_blocks[0][0x10 .. 0x20])
rows45 = util.make_m256i_slice_u16lex16(a: this.mcu_blocks[0][0x20 .. 0x30])
rows67 = util.make_m256i_slice_u16lex16(a: this.mcu_blocks[0][0x30 .. 0x40])
quants01 = util.make_m256i_slice_u16lex16(a: this.quant_tables[args.q][0x00 .. 0x10])
quants23 = util.make_m256i_slice_u16lex16(a: this.quant_tables[args.q][0x10 .. 0x20])
quants45 = util.make_m256i_slice_u16lex16(a: this.quant_tables[args.q][0x20 .. 0x30])
quants67 = util.make_m256i_slice_u16lex16(a: this.quant_tables[args.q][0x30 .. 0x40])
rows01 = rows01._mm256_mullo_epi16(b: quants01)
rows23 = rows23._mm256_mullo_epi16(b: quants23)
rows45 = rows45._mm256_mullo_epi16(b: quants45)
rows67 = rows67._mm256_mullo_epi16(b: quants67)
// Permute:
//
// rows04 = i16x16 [a0 b0 .. g0 h0
// a4 b4 .. g4 h4]
// rows31 = i16x16 [a3 b3 .. g3 h3
// a1 b1 .. g1 h1]
// rows26 = i16x16 [a2 b2 .. g2 h2
// a6 b6 .. g6 h6]
// rows75 = i16x16 [a7 b7 .. g7 h7
// a5 b5 .. g5 h5]
rows04 = rows01._mm256_permute2x128_si256(b: rows45, imm8: 0x20)
rows31 = rows23._mm256_permute2x128_si256(b: rows01, imm8: 0x31)
rows26 = rows23._mm256_permute2x128_si256(b: rows67, imm8: 0x20)
rows75 = rows67._mm256_permute2x128_si256(b: rows45, imm8: 0x31)
// ----
// First pass (even rows).
// This non-SIMD code:
//
// cb2 = (bq2 ~mod* 0x0000_29CF) ~mod+ (bq6 ~mod* 0x0000_1151)
// cb6 = (bq2 ~mod* 0x0000_1151) ~mod+ (bq6 ~mod* 0xFFFF_D630)
//
// becomes:
//
// fp_rows62 = i16x16 [a6 b6 .. g6 h6
// a2 b2 .. g2 h2]
// fp_bq2662ad = i16x16 [a2 a6 .. d2 d6
// a6 a2 .. d6 d2]
// fp_bq2662eh = i16x16 [e2 e6 .. h2 h6
// e6 e2 .. h6 h2]
// fp_cb26ad = i32x8 [cb2.a .. cb2.d
// cb6.a .. cb6.d]
// = i32x8 [a2*0x29CF+a6*0x1151 .. d2*0x29CF+d6*0x1151
// a6*0xD630+a2*0x1151 .. d6*0xD630+d2*0x1151]
// fp_cb26eh = i32x8 [cb2.e .. cb2.h
// cb6.e .. cb6.h]
// = i32x8 [e2*0x29CF+e6*0x1151 .. h2*0x29CF+h6*0x1151
// e6*0xD630+e2*0x1151 .. h6*0xD630+h2*0x1151]
fp_rows62 = rows26._mm256_permute2x128_si256(b: rows26, imm8: 0x01)
fp_bq2662ad = rows26._mm256_unpacklo_epi16(b: fp_rows62)
fp_bq2662eh = rows26._mm256_unpackhi_epi16(b: fp_rows62)
fp_cb26ad = fp_bq2662ad._mm256_madd_epi16(b: k_29CF_1151_D630_1151)
fp_cb26eh = fp_bq2662eh._mm256_madd_epi16(b: k_29CF_1151_D630_1151)
// This non-SIMD code:
//
// ccp = (bq0 ~mod+ bq4) ~mod<< 13
// ccm = (bq0 ~mod- bq4) ~mod<< 13
//
// becomes:
//
// fp_rows40pos = i16x16 [+a4 +b4 .. +g4 +h4
// +a0 +b0 .. +g0 +h0]
// fp_rows04neg = i16x16 [+a0 +b0 .. +g0 +h0
// -a4 -b4 .. -g4 -h4]
// fp_rows0pm4 = i16x16 [a0+a4 b0+b4 .. g0+g4 h0+h4
// a0-a4 b0-b4 .. g0-g4 h0-h4]
// fp_ccpmad = i32x8 [ccp.a .. ccp.d
// ccm.a .. ccm.d]
// = i32x8 [(a0+a4)<<13 .. (d0+d4)<<13
// (a0-a4)<<13 .. (d0-d4)<<13]
// fp_ccpmeh = i32x8 [ccp.e .. ccp.h
// ccm.e .. ccm.h]
// = i32x8 [(e0+e4)<<13 .. (h0+h4)<<13
// (e0-e4)<<13 .. (h0-h4)<<13]
fp_rows40pos = rows04._mm256_permute2x128_si256(b: rows04, imm8: 0x01)
fp_rows04neg = rows04._mm256_sign_epi16(b: k_0001_FFFF)
fp_rows0pm4 = fp_rows40pos._mm256_add_epi16(b: fp_rows04neg)
fp_ccpmad = k_0000._mm256_unpacklo_epi16(b: fp_rows0pm4)._mm256_srai_epi32(imm8: 16 - 13)
fp_ccpmeh = k_0000._mm256_unpackhi_epi16(b: fp_rows0pm4)._mm256_srai_epi32(imm8: 16 - 13)
// This non-SIMD code:
//
// cd0 = ccp ~mod+ cb2
// cd1 = ccm ~mod+ cb6
// cd2 = ccm ~mod- cb6
// cd3 = ccp ~mod- cb2
//
// becomes:
//
// fp_cd01ad = i32x8 [cd0.a .. cd0.d
// cd1.a .. cd1.d]
// = i32x8 [ccp.a+cb2.a .. ccp.d+cb2.d
// ccm.a+cb6.a .. ccm.d+cb6.d]
// fp_cd01eh = i32x8 [cd0.e .. cd0.h
// cd1.e .. cd1.h]
// = i32x8 [ccp.e+cb2.e .. ccp.h+cb2.h
// ccm.e+cb6.e .. ccm.h+cb6.h]
// fp_cd32ad = i32x8 [cd3.a .. cd3.d
// cd2.a .. cd2.d]
// = i32x8 [ccp.a-cb2.a .. ccp.d-cb2.d
// ccm.a-cb6.a .. ccm.d-cb6.d]
// fp_cd32eh = i32x8 [cd3.e .. cd3.h
// cd2.e .. cd2.h]
// = i32x8 [ccp.e-cb2.e .. ccp.h-cb2.h
// ccm.e-cb6.e .. ccm.h-cb6.h]
fp_cd01ad = fp_ccpmad._mm256_add_epi32(b: fp_cb26ad)
fp_cd01eh = fp_ccpmeh._mm256_add_epi32(b: fp_cb26eh)
fp_cd32ad = fp_ccpmad._mm256_sub_epi32(b: fp_cb26ad)
fp_cd32eh = fp_ccpmeh._mm256_sub_epi32(b: fp_cb26eh)
// ----
// First pass (odd rows).
// This non-SIMD code:
//
// ci73 = bq7 ~mod+ bq3
// ci51 = bq5 ~mod+ bq1
// cl73 = (ci73 ~mod* 0xFFFF_E6DC) ~mod+ (ci51 ~mod* 0x0000_25A1)
// cl51 = (ci73 ~mod* 0x0000_25A1) ~mod+ (ci51 ~mod* 0x0000_1925)
//
// becomes:
//
// fp_sums7351 = i16x16 [a7+a3 b7+b3 .. g7+g3 h7+h3
// a5+a1 b5+b1 .. g5+g1 h5+h1]
// fp_sums5173 = i16x16 [a5+a1 b5+b1 .. g5+g1 h5+h1
// a7+a3 b7+b3 .. g7+g3 h7+h3]
// fp_ci73515173ad = i16x16 [a7+a3 a5+a1 .. d7+d3 d5+d1
// a5+a1 a7+a3 .. d5+d1 d7+d3]
// fp_ci73515173eh = i16x16 [e7+e3 e5+e1 .. h7+h3 h5+h1
// e5+e1 e7+e3 .. h5+h1 h7+h3]
// fp_cl7351ad = i32x8 [cl73.a .. cl73.d
// cl51.a .. cl51.d]
// = i32x8 [(a7+a3)*0xE6DC+(a5+a1)*0x25A1 .. (d7+d3)*0xE6DC+(d5+d1)*0x25A1
// (a5+a1)*0x1925+(a7+a3)*0x25A1 .. (d5+d1)*0x1925+(d7+d3)*0x25A1]
// fp_cl7351eh = i32x8 [cl73.e .. cl73.h
// cl51.e .. cl51.h]
// = i32x8 [(e7+e3)*0xE6DC+(e5+e1)*0x25A1 .. (h7+h3)*0xE6DC+(h5+h1)*0x25A1
// (e5+e1)*0x1925+(e7+e3)*0x25A1 .. (h5+h1)*0x1925+(h7+h3)*0x25A1]
fp_sums7351 = rows75._mm256_add_epi16(b: rows31)
fp_sums5173 = fp_sums7351._mm256_permute2x128_si256(b: fp_sums7351, imm8: 0x01)
fp_ci73515173ad = fp_sums7351._mm256_unpacklo_epi16(b: fp_sums5173)
fp_ci73515173eh = fp_sums7351._mm256_unpackhi_epi16(b: fp_sums5173)
fp_cl7351ad = fp_ci73515173ad._mm256_madd_epi16(b: k_E6DC_25A1_1925_25A1)
fp_cl7351eh = fp_ci73515173eh._mm256_madd_epi16(b: k_E6DC_25A1_1925_25A1)
// This non-SIMD code:
//
// ck5 = cl51 ~mod+ ((bq3 ~mod* 0xFFFF_ADFD) ~mod+ (bq5 ~mod* 0xFFFF_EFB0))
// ck7 = cl73 ~mod+ ((bq1 ~mod* 0xFFFF_E333) ~mod+ (bq7 ~mod* 0xFFFF_ECC1))
//
// becomes:
//
// fp_rows13 = i16x16 [a1 b1 .. g1 h1 a3 b3 .. g3 h3]
// fp_bq7153ad = i16x16 [a7 a1 .. d7 d1 a5 a3 .. d5 d3]
// fp_bq7153eh = i16x16 [e7 e1 .. h7 h1 e5 e3 .. h5 h3]
// fp_ck75ad = i32x8 [ck7.a .. ck7.d
// ck5.a .. ck5.d]
// = i32x8 [cl73.a+a7*0xECC1+a1*0xE333 .. cl73.d+d7*0xECC1+d1*0xE333
// cl51.a+a5*0xEFB0+a3*0xADFD .. cl51.d+d5*0xEFB0+d3*0xADFD]
// fp_ck75eh = i32x8 [ck7.e .. ck7.h
// ck5.e .. ck5.h]
// = i32x8 [cl73.e+e7*0xECC1+e1*0xE333 .. cl73.h+h7*0xECC1+h1*0xE333
// cl51.e+e5*0xEFB0+e3*0xADFD .. cl51.h+h5*0xEFB0+h3*0xADFD]
fp_rows13 = rows31._mm256_permute2x128_si256(b: rows31, imm8: 0x01)
fp_bq7153ad = rows75._mm256_unpacklo_epi16(b: fp_rows13)
fp_bq7153eh = rows75._mm256_unpackhi_epi16(b: fp_rows13)
fp_ck75ad = fp_bq7153ad._mm256_madd_epi16(b: k_ECC1_E333_EFB0_ADFD)._mm256_add_epi32(b: fp_cl7351ad)
fp_ck75eh = fp_bq7153eh._mm256_madd_epi16(b: k_ECC1_E333_EFB0_ADFD)._mm256_add_epi32(b: fp_cl7351eh)
// This non-SIMD code:
//
// ck1 = cl51 ~mod+ ((bq1 ~mod* 0x0000_133E) ~mod+ (bq7 ~mod* 0xFFFF_E333))
// ck3 = cl73 ~mod+ ((bq3 ~mod* 0x0000_1051) ~mod+ (bq5 ~mod* 0xFFFF_ADFD))
//
// becomes:
//
// fp_cl5173ad = i32x8 [cl51.a .. cl51.d
// cl73.a .. cl73.d]
// fp_cl5173eh = i32x8 [cl51.e .. cl51.h
// cl73.e .. cl73.h]
// fp_ck13ad = i32x8 [ck1.a .. ck1.d
// ck3.a .. ck3.d]
// = i32x8 [cl51.a+a7*0xE333+a1*0x133E .. cl51.d+d7*0xE333+d1*0x133E
// cl73.a+a5*0xADFD+a3*0x1051 .. cl73.d+d5*0xADFD+d3*0x1051]
// fp_ck13eh = i32x8 [ck1.e .. ck1.h
// ck3.e .. ck3.h]
// = i32x8 [cl51.e+e7*0xE333+e1*0x133E .. cl51.h+h7*0xE333+h1*0x133E
// cl73.e+e5*0xADFD+e3*0x1051 .. cl73.h+h5*0xADFD+h3*0x1051]
fp_cl5173ad = fp_cl7351ad._mm256_permute2x128_si256(b: fp_cl7351ad, imm8: 0x01)
fp_cl5173eh = fp_cl7351eh._mm256_permute2x128_si256(b: fp_cl7351eh, imm8: 0x01)
fp_ck13ad = fp_cl5173ad._mm256_add_epi32(b: fp_bq7153ad._mm256_madd_epi16(b: k_E333_133E_ADFD_1051))
fp_ck13eh = fp_cl5173eh._mm256_add_epi32(b: fp_bq7153eh._mm256_madd_epi16(b: k_E333_133E_ADFD_1051))
// ----
// First pass (combine rows).
// We have now calculated:
//
// fp_ck13ad = i32x8 [ck1.a .. ck1.d ck3.a .. ck3.d]
// fp_ck13eh = i32x8 [ck1.e .. ck1.h ck3.e .. ck3.h]
// fp_ck75ad = i32x8 [ck7.a .. ck7.d ck5.a .. ck5.d]
// fp_ck75eh = i32x8 [ck7.e .. ck7.h ck5.e .. ck5.h]
// fp_cd01ad = i32x8 [cd0.a .. cd0.d cd1.a .. cd1.d]
// fp_cd01eh = i32x8 [cd0.e .. cd0.h cd1.e .. cd1.h]
// fp_cd32ad = i32x8 [cd3.a .. cd3.d cd2.a .. cd2.d]
// fp_cd32eh = i32x8 [cd3.e .. cd3.h cd2.e .. cd2.h]
//
// This non-SIMD code:
//
// intermediate0 = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11)
// intermediate1 = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11)
// intermediate2 = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11)
// intermediate3 = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11)
// intermediate4 = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11)
// intermediate5 = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11)
// intermediate6 = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11)
// intermediate7 = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11)
//
// becomes a mix of adds, subtracts, shifts and packs.
intermediate01ad = fp_cd01ad._mm256_add_epi32(b: fp_ck13ad)._mm256_add_epi32(b: k_0400_0000)._mm256_srai_epi32(imm8: 11)
intermediate01eh = fp_cd01eh._mm256_add_epi32(b: fp_ck13eh)._mm256_add_epi32(b: k_0400_0000)._mm256_srai_epi32(imm8: 11)
intermediate01 = intermediate01ad._mm256_packs_epi32(b: intermediate01eh)
intermediate32ad = fp_cd32ad._mm256_add_epi32(b: fp_ck75ad)._mm256_add_epi32(b: k_0400_0000)._mm256_srai_epi32(imm8: 11)
intermediate32eh = fp_cd32eh._mm256_add_epi32(b: fp_ck75eh)._mm256_add_epi32(b: k_0400_0000)._mm256_srai_epi32(imm8: 11)
intermediate32 = intermediate32ad._mm256_packs_epi32(b: intermediate32eh)
intermediate45ad = fp_cd32ad._mm256_sub_epi32(b: fp_ck75ad)._mm256_add_epi32(b: k_0400_0000)._mm256_srai_epi32(imm8: 11)
intermediate45eh = fp_cd32eh._mm256_sub_epi32(b: fp_ck75eh)._mm256_add_epi32(b: k_0400_0000)._mm256_srai_epi32(imm8: 11)
intermediate45 = intermediate45ad._mm256_packs_epi32(b: intermediate45eh)
intermediate76ad = fp_cd01ad._mm256_sub_epi32(b: fp_ck13ad)._mm256_add_epi32(b: k_0400_0000)._mm256_srai_epi32(imm8: 11)
intermediate76eh = fp_cd01eh._mm256_sub_epi32(b: fp_ck13eh)._mm256_add_epi32(b: k_0400_0000)._mm256_srai_epi32(imm8: 11)
intermediate76 = intermediate76ad._mm256_packs_epi32(b: intermediate76eh)
// ----
// Transpose/re-order the intermediate rows and columns. We start with:
//
// intermediate01 = i16x16 [in.a0 .. in.d0 in.e0 .. in.h0 in.a1 .. in.d1 in.e1 .. in.h1]
// intermediate32 = i16x16 [in.a3 .. in.d3 in.e3 .. in.h3 in.a2 .. in.d2 in.e2 .. in.h2]
// intermediate45 = i16x16 [in.a4 .. in.d4 in.e4 .. in.h4 in.a5 .. in.d5 in.e5 .. in.h5]
// intermediate76 = i16x16 [in.a7 .. in.d7 in.e7 .. in.h7 in.a6 .. in.d6 in.e6 .. in.h6]
ita0a1e0e1 = intermediate01._mm256_permute4x64_epi64(imm8: 0xD8)
ita2a3e2e3 = intermediate32._mm256_permute4x64_epi64(imm8: 0x72)
ita4a5e4e5 = intermediate45._mm256_permute4x64_epi64(imm8: 0xD8)
ita6a7e6e7 = intermediate76._mm256_permute4x64_epi64(imm8: 0x72)
// We now have:
//
// ita0a1e0e1 = i16x16 [in.a0 .. in.d0 in.a1 .. in.d1 in.e0 .. in.h0 in.e1 .. in.h1]
// ita2a3e2e3 = i16x16 [in.a2 .. in.d2 in.a3 .. in.d3 in.e2 .. in.h2 in.e3 .. in.h3]
// ita4a5e4e5 = i16x16 [in.a4 .. in.d4 in.a5 .. in.d5 in.e4 .. in.h4 in.e5 .. in.h5]
// ita6a7e6e7 = i16x16 [in.a6 .. in.d6 in.a7 .. in.d7 in.e6 .. in.h6 in.e7 .. in.h7]
ita0c0e0g0 = ita0a1e0e1._mm256_unpacklo_epi16(b: ita2a3e2e3)
ita1c1e1g1 = ita0a1e0e1._mm256_unpackhi_epi16(b: ita2a3e2e3)
ita4c4e4g4 = ita4a5e4e5._mm256_unpacklo_epi16(b: ita6a7e6e7)
ita5c5e5g5 = ita4a5e4e5._mm256_unpackhi_epi16(b: ita6a7e6e7)
// We now have:
//
// ita0c0e0g0 = i16x16 [in.a0 in.a2 .. in.d0 in.d2 in.e0 in.e2 .. in.h0 in.h2]
// ita1c1e1g1 = i16x16 [in.a1 in.a3 .. in.d1 in.d3 in.e1 in.e3 .. in.h1 in.h3]
// ita4c4e4g4 = i16x16 [in.a4 in.a6 .. in.d4 in.d6 in.e4 in.e6 .. in.h4 in.h6]
// ita5c5e5g5 = i16x16 [in.a5 in.a7 .. in.d5 in.d7 in.e5 in.e7 .. in.h5 in.h7]
ita0b0e0f0 = ita0c0e0g0._mm256_unpacklo_epi16(b: ita1c1e1g1)
itc0d0g0h0 = ita0c0e0g0._mm256_unpackhi_epi16(b: ita1c1e1g1)
ita4b4e4f4 = ita4c4e4g4._mm256_unpacklo_epi16(b: ita5c5e5g5)
itc4d4g4h4 = ita4c4e4g4._mm256_unpackhi_epi16(b: ita5c5e5g5)
// We now have:
//
// ita0b0e0f0 = i16x16 [in.a0 .. in.a3 in.b0 .. in.b3 in.e0 .. in.e3 in.f0 .. in.f3]
// itc0d0g0h0 = i16x16 [in.c0 .. in.c3 in.d0 .. in.d3 in.g0 .. in.g3 in.h0 .. in.h3]
// ita4b4e4f4 = i16x16 [in.a4 .. in.a7 in.b4 .. in.b7 in.e4 .. in.e7 in.f4 .. in.f7]
// itc4d4g4h4 = i16x16 [in.c4 .. in.c7 in.d4 .. in.d7 in.g4 .. in.g7 in.h4 .. in.h7]
intermediateae = ita0b0e0f0._mm256_unpacklo_epi64(b: ita4b4e4f4)
intermediatebf = ita0b0e0f0._mm256_unpackhi_epi64(b: ita4b4e4f4)
intermediatecg = itc0d0g0h0._mm256_unpacklo_epi64(b: itc4d4g4h4)
intermediatedh = itc0d0g0h0._mm256_unpackhi_epi64(b: itc4d4g4h4)
// We now have:
//
// intermediateae = i16x16 [in.a0 .. in.a3 in.a4 .. in.a7 in.e0 .. in.e3 in.e4 .. in.e7]
// intermediatebf = i16x16 [in.b0 .. in.b3 in.b4 .. in.b7 in.f0 .. in.f3 in.f4 .. in.f7]
// intermediatecg = i16x16 [in.c0 .. in.c3 in.c4 .. in.c7 in.g0 .. in.g3 in.g4 .. in.g7]
// intermediatedh = i16x16 [in.d0 .. in.d3 in.d4 .. in.d7 in.h0 .. in.h3 in.h4 .. in.h7]
break.goto_do_second_pass
}} endwhile.goto_do_second_pass
// ----
// To recap, we have:
//
// intermediateae = i16x16 [in.a0 in.a1 .. in.a6 in.a7
// in.e0 in.e1 .. in.e6 in.e7]
// intermediatebf = i16x16 [in.b0 in.b1 .. in.b6 in.b7
// in.f0 in.f1 .. in.f6 in.f7]
// intermediatecg = i16x16 [in.c0 in.c1 .. in.c6 in.c7
// in.g0 in.g1 .. in.g6 in.g7]
// intermediatedh = i16x16 [in.d0 in.d1 .. in.d6 in.d7
// in.h0 in.h1 .. in.h6 in.h7]
//
// Which is similar to this, but note the transposition and re-ordering:
//
// rows01 = i16x16 [a0 b0 .. g0 h0
// a1 b1 .. g1 h1]
// rows23 = i16x16 [a2 b2 .. g2 h2
// a3 b3 .. g3 h3]
// rows45 = i16x16 [a4 b4 .. g4 h4
// a5 b5 .. g5 h5]
// rows67 = i16x16 [a6 b6 .. g6 h6
// a7 b7 .. g7 h7]
// Permute:
//
// intermediatedb = i16x16 [in.d0 in.d1 .. in.d6 in.d7
// in.b0 in.b1 .. in.b6 in.b7]
// intermediatehf = i16x16 [in.h0 in.h1 .. in.h6 in.h7
// in.f0 in.f1 .. in.f6 in.f7]
//
// The row/column analogies are:
//
// [a0 .. h0 a4 .. h4] = rows04 <=> intermediateae = [in.a0 .. in.a7 in.e0 .. in.e7]
// [a3 .. h3 a1 .. h1] = rows31 <=> intermediatedb = [in.d0 .. in.d7 in.b0 .. in.b7]
// [a2 .. h2 a6 .. h6] = rows26 <=> intermediatecg = [in.c0 .. in.c7 in.g0 .. in.g7]
// [a7 .. h7 a5 .. h5] = rows75 <=> intermediatehf = [in.h0 .. in.h7 in.f0 .. in.f7]
intermediatedb = intermediatedh._mm256_permute2x128_si256(b: intermediatebf, imm8: 0x20)
intermediatehf = intermediatedh._mm256_permute2x128_si256(b: intermediatebf, imm8: 0x31)
// ----
// Second pass (even columns).
// This non-SIMD code:
//
// rb2 = (in2 ~mod* 0x0000_29CF) ~mod+ (in6 ~mod* 0x0000_1151)
// rb6 = (in2 ~mod* 0x0000_1151) ~mod+ (in6 ~mod* 0xFFFF_D630)
//
// becomes:
//
// sp_rb26ad = i32x8 [rb2.a .. rb2.d rb6.a .. rb6.d] transposed
// sp_rb26eh = i32x8 [rb2.e .. rb2.h rb6.e .. rb6.h] transposed
//
// 'Transposed' means that the rows and column names should be swapped.
// rb2.a, rb6.h, etc should really be rbc.0, rbg.7, etc but the former
// notation lines up closer with the first pass' comments.
sp_cols62 = intermediatecg._mm256_permute2x128_si256(b: intermediatecg, imm8: 0x01)
sp_bq2662ad = intermediatecg._mm256_unpacklo_epi16(b: sp_cols62)
sp_bq2662eh = intermediatecg._mm256_unpackhi_epi16(b: sp_cols62)
sp_rb26ad = sp_bq2662ad._mm256_madd_epi16(b: k_29CF_1151_D630_1151)
sp_rb26eh = sp_bq2662eh._mm256_madd_epi16(b: k_29CF_1151_D630_1151)
// This non-SIMD code:
//
// rcp = (in0 ~mod+ in4) ~mod<< 13
// rcm = (in0 ~mod- in4) ~mod<< 13
//
// becomes:
//
// sp_rcpmad = i32x8 [rcp.a .. rcp.d rcm.a .. rcm.d] transposed
// sp_rcpmeh = i32x8 [rcp.e .. rcp.h rcm.e .. rcm.h] transposed
sp_cols40pos = intermediateae._mm256_permute2x128_si256(b: intermediateae, imm8: 0x01)
sp_cols04neg = intermediateae._mm256_sign_epi16(b: k_0001_FFFF)
sp_cols0pm4 = sp_cols40pos._mm256_add_epi16(b: sp_cols04neg)
sp_rcpmad = k_0000._mm256_unpacklo_epi16(b: sp_cols0pm4)._mm256_srai_epi32(imm8: 16 - 13)
sp_rcpmeh = k_0000._mm256_unpackhi_epi16(b: sp_cols0pm4)._mm256_srai_epi32(imm8: 16 - 13)
// This non-SIMD code:
//
// rd0 = rcp ~mod+ rb2
// rd1 = rcm ~mod+ rb6
// rd2 = rcm ~mod- rb6
// rd3 = rcp ~mod- rb2
//
// becomes:
//
// sp_rd01ad = i32x8 [rd0.a .. rd0.d rd1.a .. rd1.d] transposed
// sp_rd01eh = i32x8 [rd0.e .. rd0.h rd1.e .. rd1.h] transposed
// sp_rd32ad = i32x8 [rd3.a .. rd3.d rd2.a .. rd2.d] transposed
// sp_rd32eh = i32x8 [rd3.e .. rd3.h rd2.e .. rd2.h] transposed
sp_rd01ad = sp_rcpmad._mm256_add_epi32(b: sp_rb26ad)
sp_rd01eh = sp_rcpmeh._mm256_add_epi32(b: sp_rb26eh)
sp_rd32ad = sp_rcpmad._mm256_sub_epi32(b: sp_rb26ad)
sp_rd32eh = sp_rcpmeh._mm256_sub_epi32(b: sp_rb26eh)
// ----
// Second pass (odd columns).
// This non-SIMD code:
//
// ri73 = in7 ~mod+ in3
// ri51 = in5 ~mod+ in1
// rl73 = (ri73 ~mod* 0xFFFF_E6DC) ~mod+ (ri51 ~mod* 0x0000_25A1)
// rl51 = (ri73 ~mod* 0x0000_25A1) ~mod+ (ri51 ~mod* 0x0000_1925)
//
// becomes:
//
// sp_rl7351ad = i32x8 [rl73.a .. rl73.d rl51.a .. rl51.d] transposed
// sp_rl7351eh = i32x8 [rl73.e .. rl73.h rl51.e .. rl51.h] transposed
sp_sums7351 = intermediatehf._mm256_add_epi16(b: intermediatedb)
sp_sums5173 = sp_sums7351._mm256_permute2x128_si256(b: sp_sums7351, imm8: 0x01)
sp_ri73515173ad = sp_sums7351._mm256_unpacklo_epi16(b: sp_sums5173)
sp_ri73515173eh = sp_sums7351._mm256_unpackhi_epi16(b: sp_sums5173)
sp_rl7351ad = sp_ri73515173ad._mm256_madd_epi16(b: k_E6DC_25A1_1925_25A1)
sp_rl7351eh = sp_ri73515173eh._mm256_madd_epi16(b: k_E6DC_25A1_1925_25A1)
// This non-SIMD code:
//
// rk5 = rl51 ~mod+ ((in3 ~mod* 0xFFFF_ADFD) ~mod+ (in5 ~mod* 0xFFFF_EFB0))
// rk7 = rl73 ~mod+ ((in1 ~mod* 0xFFFF_E333) ~mod+ (in7 ~mod* 0xFFFF_ECC1))
//
// becomes:
//
// sp_rk75ad = i32x8 [rk7.a .. rk7.d rk5.a .. rk5.d] transposed
// sp_rk75eh = i32x8 [rk7.e .. rk7.h rk5.e .. rk5.h] transposed
sp_cols13 = intermediatedb._mm256_permute2x128_si256(b: intermediatedb, imm8: 0x01)
sp_bq7153ad = intermediatehf._mm256_unpacklo_epi16(b: sp_cols13)
sp_bq7153eh = intermediatehf._mm256_unpackhi_epi16(b: sp_cols13)
sp_rk75ad = sp_bq7153ad._mm256_madd_epi16(b: k_ECC1_E333_EFB0_ADFD)._mm256_add_epi32(b: sp_rl7351ad)
sp_rk75eh = sp_bq7153eh._mm256_madd_epi16(b: k_ECC1_E333_EFB0_ADFD)._mm256_add_epi32(b: sp_rl7351eh)
// This non-SIMD code:
//
// rk1 = rl51 ~mod+ ((in1 ~mod* 0x0000_133E) ~mod+ (in7 ~mod* 0xFFFF_E333))
// rk3 = rl73 ~mod+ ((in3 ~mod* 0x0000_1051) ~mod+ (in5 ~mod* 0xFFFF_ADFD))
//
// becomes:
//
// sp_rk13ad = i32x8 [rk1.a .. rk1.d rk3.a .. rk3.d] transposed
// sp_rk13eh = i32x8 [rk1.e .. rk1.h rk3.e .. rk3.h] transposed
sp_rl5173ad = sp_rl7351ad._mm256_permute2x128_si256(b: sp_rl7351ad, imm8: 0x01)
sp_rl5173eh = sp_rl7351eh._mm256_permute2x128_si256(b: sp_rl7351eh, imm8: 0x01)
sp_rk13ad = sp_rl5173ad._mm256_add_epi32(b: sp_bq7153ad._mm256_madd_epi16(b: k_E333_133E_ADFD_1051))
sp_rk13eh = sp_rl5173eh._mm256_add_epi32(b: sp_bq7153eh._mm256_madd_epi16(b: k_E333_133E_ADFD_1051))
// ----
// Second pass (combine columns).
// We have now calculated:
//
// sp_rk13ad = i32x8 [rk1.a .. rk1.d rk3.a .. rk3.d] transposed
// sp_rk13eh = i32x8 [rk1.e .. rk1.h rk3.e .. rk3.h] transposed
// sp_rk75ad = i32x8 [rk7.a .. rk7.d rk5.a .. rk5.d] transposed
// sp_rk75eh = i32x8 [rk7.e .. rk7.h rk5.e .. rk5.h] transposed
// sp_rd01ad = i32x8 [rd0.a .. rd0.d rd1.a .. rd1.d] transposed
// sp_rd01eh = i32x8 [rd0.e .. rd0.h rd1.e .. rd1.h] transposed
// sp_rd32ad = i32x8 [rd3.a .. rd3.d rd2.a .. rd2.d] transposed
// sp_rd32eh = i32x8 [rd3.e .. rd3.h rd2.e .. rd2.h] transposed
//
// This non-SIMD code:
//
// final0 = this.util.sign_extend_rshift_u32(a: (rd0 ~mod+ rk1) ~mod+ (1 << 17), n: 18)
// final1 = this.util.sign_extend_rshift_u32(a: (rd1 ~mod+ rk3) ~mod+ (1 << 17), n: 18)
// final2 = this.util.sign_extend_rshift_u32(a: (rd2 ~mod+ rk5) ~mod+ (1 << 17), n: 18)
// final3 = this.util.sign_extend_rshift_u32(a: (rd3 ~mod+ rk7) ~mod+ (1 << 17), n: 18)
// final4 = this.util.sign_extend_rshift_u32(a: (rd3 ~mod- rk7) ~mod+ (1 << 17), n: 18)
// final5 = this.util.sign_extend_rshift_u32(a: (rd2 ~mod- rk5) ~mod+ (1 << 17), n: 18)
// final6 = this.util.sign_extend_rshift_u32(a: (rd1 ~mod- rk3) ~mod+ (1 << 17), n: 18)
// final7 = this.util.sign_extend_rshift_u32(a: (rd0 ~mod- rk1) ~mod+ (1 << 17), n: 18)
//
// becomes a mix of adds, subtracts, shifts and packs.
final01ad = sp_rd01ad._mm256_add_epi32(b: sp_rk13ad)._mm256_add_epi32(b: k_0000_0002)._mm256_srai_epi32(imm8: 18)
final01eh = sp_rd01eh._mm256_add_epi32(b: sp_rk13eh)._mm256_add_epi32(b: k_0000_0002)._mm256_srai_epi32(imm8: 18)
final01 = final01ad._mm256_packs_epi32(b: final01eh)
final32ad = sp_rd32ad._mm256_add_epi32(b: sp_rk75ad)._mm256_add_epi32(b: k_0000_0002)._mm256_srai_epi32(imm8: 18)
final32eh = sp_rd32eh._mm256_add_epi32(b: sp_rk75eh)._mm256_add_epi32(b: k_0000_0002)._mm256_srai_epi32(imm8: 18)
final32 = final32ad._mm256_packs_epi32(b: final32eh)
final45ad = sp_rd32ad._mm256_sub_epi32(b: sp_rk75ad)._mm256_add_epi32(b: k_0000_0002)._mm256_srai_epi32(imm8: 18)
final45eh = sp_rd32eh._mm256_sub_epi32(b: sp_rk75eh)._mm256_add_epi32(b: k_0000_0002)._mm256_srai_epi32(imm8: 18)
final45 = final45ad._mm256_packs_epi32(b: final45eh)
final76ad = sp_rd01ad._mm256_sub_epi32(b: sp_rk13ad)._mm256_add_epi32(b: k_0000_0002)._mm256_srai_epi32(imm8: 18)
final76eh = sp_rd01eh._mm256_sub_epi32(b: sp_rk13eh)._mm256_add_epi32(b: k_0000_0002)._mm256_srai_epi32(imm8: 18)
final76 = final76ad._mm256_packs_epi32(b: final76eh)
// ----
// Transpose/re-order the final rows and columns.
fta0a1e0e1 = final01._mm256_permute4x64_epi64(imm8: 0xD8)
fta2a3e2e3 = final32._mm256_permute4x64_epi64(imm8: 0x72)
fta4a5e4e5 = final45._mm256_permute4x64_epi64(imm8: 0xD8)
fta6a7e6e7 = final76._mm256_permute4x64_epi64(imm8: 0x72)
fta0c0e0g0 = fta0a1e0e1._mm256_unpacklo_epi16(b: fta2a3e2e3)
fta1c1e1g1 = fta0a1e0e1._mm256_unpackhi_epi16(b: fta2a3e2e3)
fta4c4e4g4 = fta4a5e4e5._mm256_unpacklo_epi16(b: fta6a7e6e7)
fta5c5e5g5 = fta4a5e4e5._mm256_unpackhi_epi16(b: fta6a7e6e7)
fta0b0e0f0 = fta0c0e0g0._mm256_unpacklo_epi16(b: fta1c1e1g1)
ftc0d0g0h0 = fta0c0e0g0._mm256_unpackhi_epi16(b: fta1c1e1g1)
fta4b4e4f4 = fta4c4e4g4._mm256_unpacklo_epi16(b: fta5c5e5g5)
ftc4d4g4h4 = fta4c4e4g4._mm256_unpackhi_epi16(b: fta5c5e5g5)
finalae = fta0b0e0f0._mm256_unpacklo_epi64(b: fta4b4e4f4)
finalbf = fta0b0e0f0._mm256_unpackhi_epi64(b: fta4b4e4f4)
finalcg = ftc0d0g0h0._mm256_unpacklo_epi64(b: ftc4d4g4h4)
finaldh = ftc0d0g0h0._mm256_unpackhi_epi64(b: ftc4d4g4h4)
// ----
// Pack i16x16 into i8x32 (using signed saturation; i8 ranges from -128 ..=
// +127), bias by +0x80 (u8 ranges from 0 ..= 255) and extract the rows.
//
// We also drop the "'Transposed' means that the rows and column names
// should be swapped". Rows are back to 0 ..= 7 and columns are a ..= h.
final0145 = finalae._mm256_packs_epi16(b: finalbf)._mm256_add_epi8(b: k_8080)
final2367 = finalcg._mm256_packs_epi16(b: finaldh)._mm256_add_epi8(b: k_8080)
final0 = final0145._mm256_extract_epi64(index: 0)
final1 = final0145._mm256_extract_epi64(index: 1)
final2 = final2367._mm256_extract_epi64(index: 0)
final3 = final2367._mm256_extract_epi64(index: 1)
final4 = final0145._mm256_extract_epi64(index: 2)
final5 = final0145._mm256_extract_epi64(index: 3)
final6 = final2367._mm256_extract_epi64(index: 2)
final7 = final2367._mm256_extract_epi64(index: 3)
// ----
// Write to the args.dst_buffer.
if args.dst_stride > args.dst_buffer.length() {
return nothing
}
assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
assert args.dst_buffer.length() >= 8 via "a >= b: b <= a"()
remaining = args.dst_buffer[args.dst_stride ..]
args.dst_buffer.poke_u64le!(a: final0)
args.dst_buffer = remaining
if args.dst_stride > args.dst_buffer.length() {
return nothing
}
assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
assert args.dst_buffer.length() >= 8 via "a >= b: b <= a"()
remaining = args.dst_buffer[args.dst_stride ..]
args.dst_buffer.poke_u64le!(a: final1)
args.dst_buffer = remaining
if args.dst_stride > args.dst_buffer.length() {
return nothing
}
assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
assert args.dst_buffer.length() >= 8 via "a >= b: b <= a"()
remaining = args.dst_buffer[args.dst_stride ..]
args.dst_buffer.poke_u64le!(a: final2)
args.dst_buffer = remaining
if args.dst_stride > args.dst_buffer.length() {
return nothing
}
assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
assert args.dst_buffer.length() >= 8 via "a >= b: b <= a"()
remaining = args.dst_buffer[args.dst_stride ..]
args.dst_buffer.poke_u64le!(a: final3)
args.dst_buffer = remaining
if args.dst_stride > args.dst_buffer.length() {
return nothing
}
assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
assert args.dst_buffer.length() >= 8 via "a >= b: b <= a"()
remaining = args.dst_buffer[args.dst_stride ..]
args.dst_buffer.poke_u64le!(a: final4)
args.dst_buffer = remaining
if args.dst_stride > args.dst_buffer.length() {
return nothing
}
assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
assert args.dst_buffer.length() >= 8 via "a >= b: b <= a"()
remaining = args.dst_buffer[args.dst_stride ..]
args.dst_buffer.poke_u64le!(a: final5)
args.dst_buffer = remaining
if args.dst_stride > args.dst_buffer.length() {
return nothing
}
assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
assert args.dst_buffer.length() >= 8 via "a >= b: b <= a"()
remaining = args.dst_buffer[args.dst_stride ..]
args.dst_buffer.poke_u64le!(a: final6)
args.dst_buffer = remaining
if 8 > args.dst_buffer.length() {
return nothing
}
assert args.dst_buffer.length() >= 8 via "a >= b: b <= a"()
args.dst_buffer.poke_u64le!(a: final7)
}