// blob: 59283e2c6c65185213b3b470f1c2bdd3af91b0af [file] [log] [blame]
// Copyright 2023 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//
// SPDX-License-Identifier: Apache-2.0 OR MIT
pri func decoder.decode_idct!(dst_buffer: slice base.u8, dst_stride: base.u64, q: base.u32[..= 3]),
choosy,
{
// This method implements the same algorithm as libjpeg-turbo's jidctint.c,
// which defines CONST_BITS = 13 and PASS1_BITS = 2. A right-shift by 11 is
// therefore a right-shift by (CONST_BITS - PASS1_BITS).
var bq0 : base.u32
var bq2 : base.u32
var bq4 : base.u32
var bq6 : base.u32
var ca : base.u32
var cb2 : base.u32
var cb6 : base.u32
var ccp : base.u32
var ccm : base.u32
var cd0 : base.u32
var cd1 : base.u32
var cd2 : base.u32
var cd3 : base.u32
var bq1 : base.u32
var bq3 : base.u32
var bq5 : base.u32
var bq7 : base.u32
var ci51 : base.u32
var ci53 : base.u32
var ci71 : base.u32
var ci73 : base.u32
var cj : base.u32
var ck1 : base.u32
var ck3 : base.u32
var ck5 : base.u32
var ck7 : base.u32
var cl51 : base.u32
var cl73 : base.u32
var in0 : base.u32
var in2 : base.u32
var in4 : base.u32
var in6 : base.u32
var ra : base.u32
var rb2 : base.u32
var rb6 : base.u32
var rcp : base.u32
var rcm : base.u32
var rd0 : base.u32
var rd1 : base.u32
var rd2 : base.u32
var rd3 : base.u32
var in1 : base.u32
var in3 : base.u32
var in5 : base.u32
var in7 : base.u32
var ri51 : base.u32
var ri53 : base.u32
var ri71 : base.u32
var ri73 : base.u32
var rj : base.u32
var rk1 : base.u32
var rk3 : base.u32
var rk5 : base.u32
var rk7 : base.u32
var rl51 : base.u32
var rl73 : base.u32
var intermediate : array[64] base.u32
if 8 > args.dst_stride {
return nothing
}
// -------- BEGIN generated by script/print-jpeg-idct-code.go
// p0_298631336 = 0x0000_098E = 2446
// p0_390180644 = 0x0000_0C7C = 3196
// p0_509795579 = 0x0000_1051 = 4177
// p0_541196100 = 0x0000_1151 = 4433
// p0_601344887 = 0x0000_133E = 4926
// p0_765366865 = 0x0000_187E = 6270
// p0_785694958 = 0x0000_1925 = 6437
// p0_899976223 = 0x0000_1CCD = 7373
// p1_175875602 = 0x0000_25A1 = 9633
// p1_306562965 = 0x0000_29CF = 10703
// p1_501321110 = 0x0000_300B = 12299
// p1_847759065 = 0x0000_3B21 = 15137
// p1_961570560 = 0x0000_3EC5 = 16069
// p2_053119869 = 0x0000_41B3 = 16819
// p2_562915447 = 0x0000_5203 = 20995
// p3_072711026 = 0x0000_6254 = 25172
//
// m0_390180644 = 0xFFFF_F384 = 4294964100
// m0_509795579 = 0xFFFF_EFB0 = 4294963120
// m0_601344887 = 0xFFFF_ECC1 = 4294962369
// m0_785694958 = 0xFFFF_E6DC = 4294960860
// m0_899976223 = 0xFFFF_E333 = 4294959923
// m1_306562965 = 0xFFFF_D630 = 4294956592
// m1_961570560 = 0xFFFF_C13B = 4294951227
// m2_562915447 = 0xFFFF_ADFD = 4294946301
// ==== First pass, column 0.
if (0 == (
this.mcu_blocks[0][0x08] |
this.mcu_blocks[0][0x10] |
this.mcu_blocks[0][0x18] |
this.mcu_blocks[0][0x20] |
this.mcu_blocks[0][0x28] |
this.mcu_blocks[0][0x30] |
this.mcu_blocks[0][0x38])) {
// Fast path when the 1-dimensional AC terms are all zero.
intermediate[0x00] =
(this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x00]) ~mod*
(this.quant_tables[args.q][0x00] as base.u32)) ~mod<< 2
intermediate[0x08] = intermediate[0x00]
intermediate[0x10] = intermediate[0x00]
intermediate[0x18] = intermediate[0x00]
intermediate[0x20] = intermediate[0x00]
intermediate[0x28] = intermediate[0x00]
intermediate[0x30] = intermediate[0x00]
intermediate[0x38] = intermediate[0x00]
} else {
// Even rows.
bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x10]) ~mod* (this.quant_tables[args.q][0x10] as base.u32)
bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x30]) ~mod* (this.quant_tables[args.q][0x30] as base.u32)
// This code...
ca = (bq2 ~mod+ bq6) ~mod* 0x0000_1151
cb2 = ca ~mod+ (bq2 ~mod* 0x0000_187E)
cb6 = ca ~mod- (bq6 ~mod* 0x0000_3B21)
// ...is equivalent to this more-SIMD-like code.
//
// cb2 = (bq2 ~mod* 0x0000_29CF) ~mod+ (bq6 ~mod* 0x0000_1151)
// cb6 = (bq2 ~mod* 0x0000_1151) ~mod+ (bq6 ~mod* 0xFFFF_D630)
bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x00]) ~mod* (this.quant_tables[args.q][0x00] as base.u32)
bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x20]) ~mod* (this.quant_tables[args.q][0x20] as base.u32)
ccp = (bq0 ~mod+ bq4) ~mod<< 13
ccm = (bq0 ~mod- bq4) ~mod<< 13
cd0 = ccp ~mod+ cb2
cd1 = ccm ~mod+ cb6
cd2 = ccm ~mod- cb6
cd3 = ccp ~mod- cb2
// Odd rows.
bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x08]) ~mod* (this.quant_tables[args.q][0x08] as base.u32)
bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x18]) ~mod* (this.quant_tables[args.q][0x18] as base.u32)
bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x28]) ~mod* (this.quant_tables[args.q][0x28] as base.u32)
bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x38]) ~mod* (this.quant_tables[args.q][0x38] as base.u32)
ci51 = bq5 ~mod+ bq1
ci53 = bq5 ~mod+ bq3
ci71 = bq7 ~mod+ bq1
ci73 = bq7 ~mod+ bq3
// This code...
cj = (ci73 ~mod+ ci51) ~mod* 0x0000_25A1
ck1 = bq1 ~mod* 0x0000_300B
ck3 = bq3 ~mod* 0x0000_6254
ck5 = bq5 ~mod* 0x0000_41B3
ck7 = bq7 ~mod* 0x0000_098E
ci51 ~mod*= 0xFFFF_F384
ci53 ~mod*= 0xFFFF_ADFD
ci71 ~mod*= 0xFFFF_E333
ci73 ~mod*= 0xFFFF_C13B
cl51 = ci51 ~mod+ cj
cl73 = ci73 ~mod+ cj
ck1 ~mod+= ci71 ~mod+ cl51
ck3 ~mod+= ci53 ~mod+ cl73
ck5 ~mod+= ci53 ~mod+ cl51
ck7 ~mod+= ci71 ~mod+ cl73
// ...is equivalent to this more-SIMD-like code.
//
// cl73 = (ci73 ~mod* 0xFFFF_E6DC) ~mod+ (ci51 ~mod* 0x0000_25A1)
// cl51 = (ci73 ~mod* 0x0000_25A1) ~mod+ (ci51 ~mod* 0x0000_1925)
// ck1 = cl51 ~mod+ ((bq1 ~mod* 0x0000_133E) ~mod+ (bq7 ~mod* 0xFFFF_E333))
// ck3 = cl73 ~mod+ ((bq3 ~mod* 0x0000_1051) ~mod+ (bq5 ~mod* 0xFFFF_ADFD))
// ck5 = cl51 ~mod+ ((bq3 ~mod* 0xFFFF_ADFD) ~mod+ (bq5 ~mod* 0xFFFF_EFB0))
// ck7 = cl73 ~mod+ ((bq1 ~mod* 0xFFFF_E333) ~mod+ (bq7 ~mod* 0xFFFF_ECC1))
// Combine rows.
intermediate[0x00] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11)
intermediate[0x38] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11)
intermediate[0x08] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11)
intermediate[0x30] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11)
intermediate[0x10] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11)
intermediate[0x28] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11)
intermediate[0x18] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11)
intermediate[0x20] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11)
}
// ==== First pass, column 1.
if (0 == (
this.mcu_blocks[0][0x09] |
this.mcu_blocks[0][0x11] |
this.mcu_blocks[0][0x19] |
this.mcu_blocks[0][0x21] |
this.mcu_blocks[0][0x29] |
this.mcu_blocks[0][0x31] |
this.mcu_blocks[0][0x39])) {
// Fast path when the 1-dimensional AC terms are all zero.
intermediate[0x01] =
(this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x01]) ~mod*
(this.quant_tables[args.q][0x01] as base.u32)) ~mod<< 2
intermediate[0x09] = intermediate[0x01]
intermediate[0x11] = intermediate[0x01]
intermediate[0x19] = intermediate[0x01]
intermediate[0x21] = intermediate[0x01]
intermediate[0x29] = intermediate[0x01]
intermediate[0x31] = intermediate[0x01]
intermediate[0x39] = intermediate[0x01]
} else {
// Even rows.
bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x11]) ~mod* (this.quant_tables[args.q][0x11] as base.u32)
bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x31]) ~mod* (this.quant_tables[args.q][0x31] as base.u32)
// This code...
ca = (bq2 ~mod+ bq6) ~mod* 0x0000_1151
cb2 = ca ~mod+ (bq2 ~mod* 0x0000_187E)
cb6 = ca ~mod- (bq6 ~mod* 0x0000_3B21)
// ...is equivalent to this more-SIMD-like code.
//
// cb2 = (bq2 ~mod* 0x0000_29CF) ~mod+ (bq6 ~mod* 0x0000_1151)
// cb6 = (bq2 ~mod* 0x0000_1151) ~mod+ (bq6 ~mod* 0xFFFF_D630)
bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x01]) ~mod* (this.quant_tables[args.q][0x01] as base.u32)
bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x21]) ~mod* (this.quant_tables[args.q][0x21] as base.u32)
ccp = (bq0 ~mod+ bq4) ~mod<< 13
ccm = (bq0 ~mod- bq4) ~mod<< 13
cd0 = ccp ~mod+ cb2
cd1 = ccm ~mod+ cb6
cd2 = ccm ~mod- cb6
cd3 = ccp ~mod- cb2
// Odd rows.
bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x09]) ~mod* (this.quant_tables[args.q][0x09] as base.u32)
bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x19]) ~mod* (this.quant_tables[args.q][0x19] as base.u32)
bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x29]) ~mod* (this.quant_tables[args.q][0x29] as base.u32)
bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x39]) ~mod* (this.quant_tables[args.q][0x39] as base.u32)
ci51 = bq5 ~mod+ bq1
ci53 = bq5 ~mod+ bq3
ci71 = bq7 ~mod+ bq1
ci73 = bq7 ~mod+ bq3
// This code...
cj = (ci73 ~mod+ ci51) ~mod* 0x0000_25A1
ck1 = bq1 ~mod* 0x0000_300B
ck3 = bq3 ~mod* 0x0000_6254
ck5 = bq5 ~mod* 0x0000_41B3
ck7 = bq7 ~mod* 0x0000_098E
ci51 ~mod*= 0xFFFF_F384
ci53 ~mod*= 0xFFFF_ADFD
ci71 ~mod*= 0xFFFF_E333
ci73 ~mod*= 0xFFFF_C13B
cl51 = ci51 ~mod+ cj
cl73 = ci73 ~mod+ cj
ck1 ~mod+= ci71 ~mod+ cl51
ck3 ~mod+= ci53 ~mod+ cl73
ck5 ~mod+= ci53 ~mod+ cl51
ck7 ~mod+= ci71 ~mod+ cl73
// ...is equivalent to this more-SIMD-like code.
//
// cl73 = (ci73 ~mod* 0xFFFF_E6DC) ~mod+ (ci51 ~mod* 0x0000_25A1)
// cl51 = (ci73 ~mod* 0x0000_25A1) ~mod+ (ci51 ~mod* 0x0000_1925)
// ck1 = cl51 ~mod+ ((bq1 ~mod* 0x0000_133E) ~mod+ (bq7 ~mod* 0xFFFF_E333))
// ck3 = cl73 ~mod+ ((bq3 ~mod* 0x0000_1051) ~mod+ (bq5 ~mod* 0xFFFF_ADFD))
// ck5 = cl51 ~mod+ ((bq3 ~mod* 0xFFFF_ADFD) ~mod+ (bq5 ~mod* 0xFFFF_EFB0))
// ck7 = cl73 ~mod+ ((bq1 ~mod* 0xFFFF_E333) ~mod+ (bq7 ~mod* 0xFFFF_ECC1))
// Combine rows.
intermediate[0x01] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11)
intermediate[0x39] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11)
intermediate[0x09] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11)
intermediate[0x31] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11)
intermediate[0x11] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11)
intermediate[0x29] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11)
intermediate[0x19] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11)
intermediate[0x21] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11)
}
// ==== First pass, column 2.
if (0 == (
this.mcu_blocks[0][0x0A] |
this.mcu_blocks[0][0x12] |
this.mcu_blocks[0][0x1A] |
this.mcu_blocks[0][0x22] |
this.mcu_blocks[0][0x2A] |
this.mcu_blocks[0][0x32] |
this.mcu_blocks[0][0x3A])) {
// Fast path when the 1-dimensional AC terms are all zero.
intermediate[0x02] =
(this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x02]) ~mod*
(this.quant_tables[args.q][0x02] as base.u32)) ~mod<< 2
intermediate[0x0A] = intermediate[0x02]
intermediate[0x12] = intermediate[0x02]
intermediate[0x1A] = intermediate[0x02]
intermediate[0x22] = intermediate[0x02]
intermediate[0x2A] = intermediate[0x02]
intermediate[0x32] = intermediate[0x02]
intermediate[0x3A] = intermediate[0x02]
} else {
// Even rows.
bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x12]) ~mod* (this.quant_tables[args.q][0x12] as base.u32)
bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x32]) ~mod* (this.quant_tables[args.q][0x32] as base.u32)
// This code...
ca = (bq2 ~mod+ bq6) ~mod* 0x0000_1151
cb2 = ca ~mod+ (bq2 ~mod* 0x0000_187E)
cb6 = ca ~mod- (bq6 ~mod* 0x0000_3B21)
// ...is equivalent to this more-SIMD-like code.
//
// cb2 = (bq2 ~mod* 0x0000_29CF) ~mod+ (bq6 ~mod* 0x0000_1151)
// cb6 = (bq2 ~mod* 0x0000_1151) ~mod+ (bq6 ~mod* 0xFFFF_D630)
bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x02]) ~mod* (this.quant_tables[args.q][0x02] as base.u32)
bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x22]) ~mod* (this.quant_tables[args.q][0x22] as base.u32)
ccp = (bq0 ~mod+ bq4) ~mod<< 13
ccm = (bq0 ~mod- bq4) ~mod<< 13
cd0 = ccp ~mod+ cb2
cd1 = ccm ~mod+ cb6
cd2 = ccm ~mod- cb6
cd3 = ccp ~mod- cb2
// Odd rows.
bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x0A]) ~mod* (this.quant_tables[args.q][0x0A] as base.u32)
bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x1A]) ~mod* (this.quant_tables[args.q][0x1A] as base.u32)
bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x2A]) ~mod* (this.quant_tables[args.q][0x2A] as base.u32)
bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x3A]) ~mod* (this.quant_tables[args.q][0x3A] as base.u32)
ci51 = bq5 ~mod+ bq1
ci53 = bq5 ~mod+ bq3
ci71 = bq7 ~mod+ bq1
ci73 = bq7 ~mod+ bq3
// This code...
cj = (ci73 ~mod+ ci51) ~mod* 0x0000_25A1
ck1 = bq1 ~mod* 0x0000_300B
ck3 = bq3 ~mod* 0x0000_6254
ck5 = bq5 ~mod* 0x0000_41B3
ck7 = bq7 ~mod* 0x0000_098E
ci51 ~mod*= 0xFFFF_F384
ci53 ~mod*= 0xFFFF_ADFD
ci71 ~mod*= 0xFFFF_E333
ci73 ~mod*= 0xFFFF_C13B
cl51 = ci51 ~mod+ cj
cl73 = ci73 ~mod+ cj
ck1 ~mod+= ci71 ~mod+ cl51
ck3 ~mod+= ci53 ~mod+ cl73
ck5 ~mod+= ci53 ~mod+ cl51
ck7 ~mod+= ci71 ~mod+ cl73
// ...is equivalent to this more-SIMD-like code.
//
// cl73 = (ci73 ~mod* 0xFFFF_E6DC) ~mod+ (ci51 ~mod* 0x0000_25A1)
// cl51 = (ci73 ~mod* 0x0000_25A1) ~mod+ (ci51 ~mod* 0x0000_1925)
// ck1 = cl51 ~mod+ ((bq1 ~mod* 0x0000_133E) ~mod+ (bq7 ~mod* 0xFFFF_E333))
// ck3 = cl73 ~mod+ ((bq3 ~mod* 0x0000_1051) ~mod+ (bq5 ~mod* 0xFFFF_ADFD))
// ck5 = cl51 ~mod+ ((bq3 ~mod* 0xFFFF_ADFD) ~mod+ (bq5 ~mod* 0xFFFF_EFB0))
// ck7 = cl73 ~mod+ ((bq1 ~mod* 0xFFFF_E333) ~mod+ (bq7 ~mod* 0xFFFF_ECC1))
// Combine rows.
intermediate[0x02] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11)
intermediate[0x3A] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11)
intermediate[0x0A] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11)
intermediate[0x32] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11)
intermediate[0x12] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11)
intermediate[0x2A] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11)
intermediate[0x1A] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11)
intermediate[0x22] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11)
}
// ==== First pass, column 3.
if (0 == (
this.mcu_blocks[0][0x0B] |
this.mcu_blocks[0][0x13] |
this.mcu_blocks[0][0x1B] |
this.mcu_blocks[0][0x23] |
this.mcu_blocks[0][0x2B] |
this.mcu_blocks[0][0x33] |
this.mcu_blocks[0][0x3B])) {
// Fast path when the 1-dimensional AC terms are all zero.
intermediate[0x03] =
(this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x03]) ~mod*
(this.quant_tables[args.q][0x03] as base.u32)) ~mod<< 2
intermediate[0x0B] = intermediate[0x03]
intermediate[0x13] = intermediate[0x03]
intermediate[0x1B] = intermediate[0x03]
intermediate[0x23] = intermediate[0x03]
intermediate[0x2B] = intermediate[0x03]
intermediate[0x33] = intermediate[0x03]
intermediate[0x3B] = intermediate[0x03]
} else {
// Even rows.
bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x13]) ~mod* (this.quant_tables[args.q][0x13] as base.u32)
bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x33]) ~mod* (this.quant_tables[args.q][0x33] as base.u32)
// This code...
ca = (bq2 ~mod+ bq6) ~mod* 0x0000_1151
cb2 = ca ~mod+ (bq2 ~mod* 0x0000_187E)
cb6 = ca ~mod- (bq6 ~mod* 0x0000_3B21)
// ...is equivalent to this more-SIMD-like code.
//
// cb2 = (bq2 ~mod* 0x0000_29CF) ~mod+ (bq6 ~mod* 0x0000_1151)
// cb6 = (bq2 ~mod* 0x0000_1151) ~mod+ (bq6 ~mod* 0xFFFF_D630)
bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x03]) ~mod* (this.quant_tables[args.q][0x03] as base.u32)
bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x23]) ~mod* (this.quant_tables[args.q][0x23] as base.u32)
ccp = (bq0 ~mod+ bq4) ~mod<< 13
ccm = (bq0 ~mod- bq4) ~mod<< 13
cd0 = ccp ~mod+ cb2
cd1 = ccm ~mod+ cb6
cd2 = ccm ~mod- cb6
cd3 = ccp ~mod- cb2
// Odd rows.
bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x0B]) ~mod* (this.quant_tables[args.q][0x0B] as base.u32)
bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x1B]) ~mod* (this.quant_tables[args.q][0x1B] as base.u32)
bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x2B]) ~mod* (this.quant_tables[args.q][0x2B] as base.u32)
bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x3B]) ~mod* (this.quant_tables[args.q][0x3B] as base.u32)
ci51 = bq5 ~mod+ bq1
ci53 = bq5 ~mod+ bq3
ci71 = bq7 ~mod+ bq1
ci73 = bq7 ~mod+ bq3
// This code...
cj = (ci73 ~mod+ ci51) ~mod* 0x0000_25A1
ck1 = bq1 ~mod* 0x0000_300B
ck3 = bq3 ~mod* 0x0000_6254
ck5 = bq5 ~mod* 0x0000_41B3
ck7 = bq7 ~mod* 0x0000_098E
ci51 ~mod*= 0xFFFF_F384
ci53 ~mod*= 0xFFFF_ADFD
ci71 ~mod*= 0xFFFF_E333
ci73 ~mod*= 0xFFFF_C13B
cl51 = ci51 ~mod+ cj
cl73 = ci73 ~mod+ cj
ck1 ~mod+= ci71 ~mod+ cl51
ck3 ~mod+= ci53 ~mod+ cl73
ck5 ~mod+= ci53 ~mod+ cl51
ck7 ~mod+= ci71 ~mod+ cl73
// ...is equivalent to this more-SIMD-like code.
//
// cl73 = (ci73 ~mod* 0xFFFF_E6DC) ~mod+ (ci51 ~mod* 0x0000_25A1)
// cl51 = (ci73 ~mod* 0x0000_25A1) ~mod+ (ci51 ~mod* 0x0000_1925)
// ck1 = cl51 ~mod+ ((bq1 ~mod* 0x0000_133E) ~mod+ (bq7 ~mod* 0xFFFF_E333))
// ck3 = cl73 ~mod+ ((bq3 ~mod* 0x0000_1051) ~mod+ (bq5 ~mod* 0xFFFF_ADFD))
// ck5 = cl51 ~mod+ ((bq3 ~mod* 0xFFFF_ADFD) ~mod+ (bq5 ~mod* 0xFFFF_EFB0))
// ck7 = cl73 ~mod+ ((bq1 ~mod* 0xFFFF_E333) ~mod+ (bq7 ~mod* 0xFFFF_ECC1))
// Combine rows.
intermediate[0x03] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11)
intermediate[0x3B] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11)
intermediate[0x0B] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11)
intermediate[0x33] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11)
intermediate[0x13] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11)
intermediate[0x2B] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11)
intermediate[0x1B] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11)
intermediate[0x23] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11)
}
// ==== First pass, column 4.
if (0 == (
this.mcu_blocks[0][0x0C] |
this.mcu_blocks[0][0x14] |
this.mcu_blocks[0][0x1C] |
this.mcu_blocks[0][0x24] |
this.mcu_blocks[0][0x2C] |
this.mcu_blocks[0][0x34] |
this.mcu_blocks[0][0x3C])) {
// Fast path when the 1-dimensional AC terms are all zero.
intermediate[0x04] =
(this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x04]) ~mod*
(this.quant_tables[args.q][0x04] as base.u32)) ~mod<< 2
intermediate[0x0C] = intermediate[0x04]
intermediate[0x14] = intermediate[0x04]
intermediate[0x1C] = intermediate[0x04]
intermediate[0x24] = intermediate[0x04]
intermediate[0x2C] = intermediate[0x04]
intermediate[0x34] = intermediate[0x04]
intermediate[0x3C] = intermediate[0x04]
} else {
// Even rows.
bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x14]) ~mod* (this.quant_tables[args.q][0x14] as base.u32)
bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x34]) ~mod* (this.quant_tables[args.q][0x34] as base.u32)
// This code...
ca = (bq2 ~mod+ bq6) ~mod* 0x0000_1151
cb2 = ca ~mod+ (bq2 ~mod* 0x0000_187E)
cb6 = ca ~mod- (bq6 ~mod* 0x0000_3B21)
// ...is equivalent to this more-SIMD-like code.
//
// cb2 = (bq2 ~mod* 0x0000_29CF) ~mod+ (bq6 ~mod* 0x0000_1151)
// cb6 = (bq2 ~mod* 0x0000_1151) ~mod+ (bq6 ~mod* 0xFFFF_D630)
bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x04]) ~mod* (this.quant_tables[args.q][0x04] as base.u32)
bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x24]) ~mod* (this.quant_tables[args.q][0x24] as base.u32)
ccp = (bq0 ~mod+ bq4) ~mod<< 13
ccm = (bq0 ~mod- bq4) ~mod<< 13
cd0 = ccp ~mod+ cb2
cd1 = ccm ~mod+ cb6
cd2 = ccm ~mod- cb6
cd3 = ccp ~mod- cb2
// Odd rows.
bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x0C]) ~mod* (this.quant_tables[args.q][0x0C] as base.u32)
bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x1C]) ~mod* (this.quant_tables[args.q][0x1C] as base.u32)
bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x2C]) ~mod* (this.quant_tables[args.q][0x2C] as base.u32)
bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x3C]) ~mod* (this.quant_tables[args.q][0x3C] as base.u32)
ci51 = bq5 ~mod+ bq1
ci53 = bq5 ~mod+ bq3
ci71 = bq7 ~mod+ bq1
ci73 = bq7 ~mod+ bq3
// This code...
cj = (ci73 ~mod+ ci51) ~mod* 0x0000_25A1
ck1 = bq1 ~mod* 0x0000_300B
ck3 = bq3 ~mod* 0x0000_6254
ck5 = bq5 ~mod* 0x0000_41B3
ck7 = bq7 ~mod* 0x0000_098E
ci51 ~mod*= 0xFFFF_F384
ci53 ~mod*= 0xFFFF_ADFD
ci71 ~mod*= 0xFFFF_E333
ci73 ~mod*= 0xFFFF_C13B
cl51 = ci51 ~mod+ cj
cl73 = ci73 ~mod+ cj
ck1 ~mod+= ci71 ~mod+ cl51
ck3 ~mod+= ci53 ~mod+ cl73
ck5 ~mod+= ci53 ~mod+ cl51
ck7 ~mod+= ci71 ~mod+ cl73
// ...is equivalent to this more-SIMD-like code.
//
// cl73 = (ci73 ~mod* 0xFFFF_E6DC) ~mod+ (ci51 ~mod* 0x0000_25A1)
// cl51 = (ci73 ~mod* 0x0000_25A1) ~mod+ (ci51 ~mod* 0x0000_1925)
// ck1 = cl51 ~mod+ ((bq1 ~mod* 0x0000_133E) ~mod+ (bq7 ~mod* 0xFFFF_E333))
// ck3 = cl73 ~mod+ ((bq3 ~mod* 0x0000_1051) ~mod+ (bq5 ~mod* 0xFFFF_ADFD))
// ck5 = cl51 ~mod+ ((bq3 ~mod* 0xFFFF_ADFD) ~mod+ (bq5 ~mod* 0xFFFF_EFB0))
// ck7 = cl73 ~mod+ ((bq1 ~mod* 0xFFFF_E333) ~mod+ (bq7 ~mod* 0xFFFF_ECC1))
// Combine rows.
intermediate[0x04] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11)
intermediate[0x3C] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11)
intermediate[0x0C] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11)
intermediate[0x34] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11)
intermediate[0x14] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11)
intermediate[0x2C] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11)
intermediate[0x1C] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11)
intermediate[0x24] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11)
}
// ==== First pass, column 5.
if (0 == (
this.mcu_blocks[0][0x0D] |
this.mcu_blocks[0][0x15] |
this.mcu_blocks[0][0x1D] |
this.mcu_blocks[0][0x25] |
this.mcu_blocks[0][0x2D] |
this.mcu_blocks[0][0x35] |
this.mcu_blocks[0][0x3D])) {
// Fast path when the 1-dimensional AC terms are all zero.
intermediate[0x05] =
(this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x05]) ~mod*
(this.quant_tables[args.q][0x05] as base.u32)) ~mod<< 2
intermediate[0x0D] = intermediate[0x05]
intermediate[0x15] = intermediate[0x05]
intermediate[0x1D] = intermediate[0x05]
intermediate[0x25] = intermediate[0x05]
intermediate[0x2D] = intermediate[0x05]
intermediate[0x35] = intermediate[0x05]
intermediate[0x3D] = intermediate[0x05]
} else {
// Even rows.
bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x15]) ~mod* (this.quant_tables[args.q][0x15] as base.u32)
bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x35]) ~mod* (this.quant_tables[args.q][0x35] as base.u32)
// This code...
ca = (bq2 ~mod+ bq6) ~mod* 0x0000_1151
cb2 = ca ~mod+ (bq2 ~mod* 0x0000_187E)
cb6 = ca ~mod- (bq6 ~mod* 0x0000_3B21)
// ...is equivalent to this more-SIMD-like code.
//
// cb2 = (bq2 ~mod* 0x0000_29CF) ~mod+ (bq6 ~mod* 0x0000_1151)
// cb6 = (bq2 ~mod* 0x0000_1151) ~mod+ (bq6 ~mod* 0xFFFF_D630)
bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x05]) ~mod* (this.quant_tables[args.q][0x05] as base.u32)
bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x25]) ~mod* (this.quant_tables[args.q][0x25] as base.u32)
ccp = (bq0 ~mod+ bq4) ~mod<< 13
ccm = (bq0 ~mod- bq4) ~mod<< 13
cd0 = ccp ~mod+ cb2
cd1 = ccm ~mod+ cb6
cd2 = ccm ~mod- cb6
cd3 = ccp ~mod- cb2
// Odd rows.
bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x0D]) ~mod* (this.quant_tables[args.q][0x0D] as base.u32)
bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x1D]) ~mod* (this.quant_tables[args.q][0x1D] as base.u32)
bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x2D]) ~mod* (this.quant_tables[args.q][0x2D] as base.u32)
bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x3D]) ~mod* (this.quant_tables[args.q][0x3D] as base.u32)
ci51 = bq5 ~mod+ bq1
ci53 = bq5 ~mod+ bq3
ci71 = bq7 ~mod+ bq1
ci73 = bq7 ~mod+ bq3
// This code...
cj = (ci73 ~mod+ ci51) ~mod* 0x0000_25A1
ck1 = bq1 ~mod* 0x0000_300B
ck3 = bq3 ~mod* 0x0000_6254
ck5 = bq5 ~mod* 0x0000_41B3
ck7 = bq7 ~mod* 0x0000_098E
ci51 ~mod*= 0xFFFF_F384
ci53 ~mod*= 0xFFFF_ADFD
ci71 ~mod*= 0xFFFF_E333
ci73 ~mod*= 0xFFFF_C13B
cl51 = ci51 ~mod+ cj
cl73 = ci73 ~mod+ cj
ck1 ~mod+= ci71 ~mod+ cl51
ck3 ~mod+= ci53 ~mod+ cl73
ck5 ~mod+= ci53 ~mod+ cl51
ck7 ~mod+= ci71 ~mod+ cl73
// ...is equivalent to this more-SIMD-like code.
//
// cl73 = (ci73 ~mod* 0xFFFF_E6DC) ~mod+ (ci51 ~mod* 0x0000_25A1)
// cl51 = (ci73 ~mod* 0x0000_25A1) ~mod+ (ci51 ~mod* 0x0000_1925)
// ck1 = cl51 ~mod+ ((bq1 ~mod* 0x0000_133E) ~mod+ (bq7 ~mod* 0xFFFF_E333))
// ck3 = cl73 ~mod+ ((bq3 ~mod* 0x0000_1051) ~mod+ (bq5 ~mod* 0xFFFF_ADFD))
// ck5 = cl51 ~mod+ ((bq3 ~mod* 0xFFFF_ADFD) ~mod+ (bq5 ~mod* 0xFFFF_EFB0))
// ck7 = cl73 ~mod+ ((bq1 ~mod* 0xFFFF_E333) ~mod+ (bq7 ~mod* 0xFFFF_ECC1))
// Combine rows.
intermediate[0x05] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11)
intermediate[0x3D] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11)
intermediate[0x0D] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11)
intermediate[0x35] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11)
intermediate[0x15] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11)
intermediate[0x2D] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11)
intermediate[0x1D] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11)
intermediate[0x25] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11)
}
// ==== First pass, column 6.
if (0 == (
this.mcu_blocks[0][0x0E] |
this.mcu_blocks[0][0x16] |
this.mcu_blocks[0][0x1E] |
this.mcu_blocks[0][0x26] |
this.mcu_blocks[0][0x2E] |
this.mcu_blocks[0][0x36] |
this.mcu_blocks[0][0x3E])) {
// Fast path when the 1-dimensional AC terms are all zero.
intermediate[0x06] =
(this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x06]) ~mod*
(this.quant_tables[args.q][0x06] as base.u32)) ~mod<< 2
intermediate[0x0E] = intermediate[0x06]
intermediate[0x16] = intermediate[0x06]
intermediate[0x1E] = intermediate[0x06]
intermediate[0x26] = intermediate[0x06]
intermediate[0x2E] = intermediate[0x06]
intermediate[0x36] = intermediate[0x06]
intermediate[0x3E] = intermediate[0x06]
} else {
// Even rows.
bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x16]) ~mod* (this.quant_tables[args.q][0x16] as base.u32)
bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x36]) ~mod* (this.quant_tables[args.q][0x36] as base.u32)
// This code...
ca = (bq2 ~mod+ bq6) ~mod* 0x0000_1151
cb2 = ca ~mod+ (bq2 ~mod* 0x0000_187E)
cb6 = ca ~mod- (bq6 ~mod* 0x0000_3B21)
// ...is equivalent to this more-SIMD-like code.
//
// cb2 = (bq2 ~mod* 0x0000_29CF) ~mod+ (bq6 ~mod* 0x0000_1151)
// cb6 = (bq2 ~mod* 0x0000_1151) ~mod+ (bq6 ~mod* 0xFFFF_D630)
bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x06]) ~mod* (this.quant_tables[args.q][0x06] as base.u32)
bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x26]) ~mod* (this.quant_tables[args.q][0x26] as base.u32)
ccp = (bq0 ~mod+ bq4) ~mod<< 13
ccm = (bq0 ~mod- bq4) ~mod<< 13
cd0 = ccp ~mod+ cb2
cd1 = ccm ~mod+ cb6
cd2 = ccm ~mod- cb6
cd3 = ccp ~mod- cb2
// Odd rows.
bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x0E]) ~mod* (this.quant_tables[args.q][0x0E] as base.u32)
bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x1E]) ~mod* (this.quant_tables[args.q][0x1E] as base.u32)
bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x2E]) ~mod* (this.quant_tables[args.q][0x2E] as base.u32)
bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x3E]) ~mod* (this.quant_tables[args.q][0x3E] as base.u32)
ci51 = bq5 ~mod+ bq1
ci53 = bq5 ~mod+ bq3
ci71 = bq7 ~mod+ bq1
ci73 = bq7 ~mod+ bq3
// This code...
cj = (ci73 ~mod+ ci51) ~mod* 0x0000_25A1
ck1 = bq1 ~mod* 0x0000_300B
ck3 = bq3 ~mod* 0x0000_6254
ck5 = bq5 ~mod* 0x0000_41B3
ck7 = bq7 ~mod* 0x0000_098E
ci51 ~mod*= 0xFFFF_F384
ci53 ~mod*= 0xFFFF_ADFD
ci71 ~mod*= 0xFFFF_E333
ci73 ~mod*= 0xFFFF_C13B
cl51 = ci51 ~mod+ cj
cl73 = ci73 ~mod+ cj
ck1 ~mod+= ci71 ~mod+ cl51
ck3 ~mod+= ci53 ~mod+ cl73
ck5 ~mod+= ci53 ~mod+ cl51
ck7 ~mod+= ci71 ~mod+ cl73
// ...is equivalent to this more-SIMD-like code.
//
// cl73 = (ci73 ~mod* 0xFFFF_E6DC) ~mod+ (ci51 ~mod* 0x0000_25A1)
// cl51 = (ci73 ~mod* 0x0000_25A1) ~mod+ (ci51 ~mod* 0x0000_1925)
// ck1 = cl51 ~mod+ ((bq1 ~mod* 0x0000_133E) ~mod+ (bq7 ~mod* 0xFFFF_E333))
// ck3 = cl73 ~mod+ ((bq3 ~mod* 0x0000_1051) ~mod+ (bq5 ~mod* 0xFFFF_ADFD))
// ck5 = cl51 ~mod+ ((bq3 ~mod* 0xFFFF_ADFD) ~mod+ (bq5 ~mod* 0xFFFF_EFB0))
// ck7 = cl73 ~mod+ ((bq1 ~mod* 0xFFFF_E333) ~mod+ (bq7 ~mod* 0xFFFF_ECC1))
// Combine rows.
intermediate[0x06] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11)
intermediate[0x3E] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11)
intermediate[0x0E] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11)
intermediate[0x36] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11)
intermediate[0x16] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11)
intermediate[0x2E] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11)
intermediate[0x1E] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11)
intermediate[0x26] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11)
}
// ==== First pass, column 7.
if (0 == (
this.mcu_blocks[0][0x0F] |
this.mcu_blocks[0][0x17] |
this.mcu_blocks[0][0x1F] |
this.mcu_blocks[0][0x27] |
this.mcu_blocks[0][0x2F] |
this.mcu_blocks[0][0x37] |
this.mcu_blocks[0][0x3F])) {
// Fast path when the 1-dimensional AC terms are all zero.
intermediate[0x07] =
(this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x07]) ~mod*
(this.quant_tables[args.q][0x07] as base.u32)) ~mod<< 2
intermediate[0x0F] = intermediate[0x07]
intermediate[0x17] = intermediate[0x07]
intermediate[0x1F] = intermediate[0x07]
intermediate[0x27] = intermediate[0x07]
intermediate[0x2F] = intermediate[0x07]
intermediate[0x37] = intermediate[0x07]
intermediate[0x3F] = intermediate[0x07]
} else {
// Even rows.
bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x17]) ~mod* (this.quant_tables[args.q][0x17] as base.u32)
bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x37]) ~mod* (this.quant_tables[args.q][0x37] as base.u32)
// This code...
ca = (bq2 ~mod+ bq6) ~mod* 0x0000_1151
cb2 = ca ~mod+ (bq2 ~mod* 0x0000_187E)
cb6 = ca ~mod- (bq6 ~mod* 0x0000_3B21)
// ...is equivalent to this more-SIMD-like code.
//
// cb2 = (bq2 ~mod* 0x0000_29CF) ~mod+ (bq6 ~mod* 0x0000_1151)
// cb6 = (bq2 ~mod* 0x0000_1151) ~mod+ (bq6 ~mod* 0xFFFF_D630)
bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x07]) ~mod* (this.quant_tables[args.q][0x07] as base.u32)
bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x27]) ~mod* (this.quant_tables[args.q][0x27] as base.u32)
ccp = (bq0 ~mod+ bq4) ~mod<< 13
ccm = (bq0 ~mod- bq4) ~mod<< 13
cd0 = ccp ~mod+ cb2
cd1 = ccm ~mod+ cb6
cd2 = ccm ~mod- cb6
cd3 = ccp ~mod- cb2
// Odd rows.
bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x0F]) ~mod* (this.quant_tables[args.q][0x0F] as base.u32)
bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x1F]) ~mod* (this.quant_tables[args.q][0x1F] as base.u32)
bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x2F]) ~mod* (this.quant_tables[args.q][0x2F] as base.u32)
bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x3F]) ~mod* (this.quant_tables[args.q][0x3F] as base.u32)
ci51 = bq5 ~mod+ bq1
ci53 = bq5 ~mod+ bq3
ci71 = bq7 ~mod+ bq1
ci73 = bq7 ~mod+ bq3
// This code...
cj = (ci73 ~mod+ ci51) ~mod* 0x0000_25A1
ck1 = bq1 ~mod* 0x0000_300B
ck3 = bq3 ~mod* 0x0000_6254
ck5 = bq5 ~mod* 0x0000_41B3
ck7 = bq7 ~mod* 0x0000_098E
ci51 ~mod*= 0xFFFF_F384
ci53 ~mod*= 0xFFFF_ADFD
ci71 ~mod*= 0xFFFF_E333
ci73 ~mod*= 0xFFFF_C13B
cl51 = ci51 ~mod+ cj
cl73 = ci73 ~mod+ cj
ck1 ~mod+= ci71 ~mod+ cl51
ck3 ~mod+= ci53 ~mod+ cl73
ck5 ~mod+= ci53 ~mod+ cl51
ck7 ~mod+= ci71 ~mod+ cl73
// ...is equivalent to this more-SIMD-like code.
//
// cl73 = (ci73 ~mod* 0xFFFF_E6DC) ~mod+ (ci51 ~mod* 0x0000_25A1)
// cl51 = (ci73 ~mod* 0x0000_25A1) ~mod+ (ci51 ~mod* 0x0000_1925)
// ck1 = cl51 ~mod+ ((bq1 ~mod* 0x0000_133E) ~mod+ (bq7 ~mod* 0xFFFF_E333))
// ck3 = cl73 ~mod+ ((bq3 ~mod* 0x0000_1051) ~mod+ (bq5 ~mod* 0xFFFF_ADFD))
// ck5 = cl51 ~mod+ ((bq3 ~mod* 0xFFFF_ADFD) ~mod+ (bq5 ~mod* 0xFFFF_EFB0))
// ck7 = cl73 ~mod+ ((bq1 ~mod* 0xFFFF_E333) ~mod+ (bq7 ~mod* 0xFFFF_ECC1))
// Combine rows.
intermediate[0x07] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11)
intermediate[0x3F] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11)
intermediate[0x0F] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11)
intermediate[0x37] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11)
intermediate[0x17] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11)
intermediate[0x2F] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11)
intermediate[0x1F] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11)
intermediate[0x27] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11)
}
// ==== Second pass, row 0.
if (0 == (
intermediate[0x01] |
intermediate[0x02] |
intermediate[0x03] |
intermediate[0x04] |
intermediate[0x05] |
intermediate[0x06] |
intermediate[0x07])) {
// Fast path when the 1-dimensional AC terms are all zero.
if args.dst_stride > args.dst_buffer.length() {
return nothing
}
assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
args.dst_buffer[0] = BIAS_AND_CLAMP[((intermediate[0x00] ~mod+ (1 << 4)) >> 5) & 1023]
args.dst_buffer[1] = args.dst_buffer[0]
args.dst_buffer[2] = args.dst_buffer[0]
args.dst_buffer[3] = args.dst_buffer[0]
args.dst_buffer[4] = args.dst_buffer[0]
args.dst_buffer[5] = args.dst_buffer[0]
args.dst_buffer[6] = args.dst_buffer[0]
args.dst_buffer[7] = args.dst_buffer[0]
args.dst_buffer = args.dst_buffer[args.dst_stride ..]
} else {
// Even columns.
in2 = intermediate[0x02]
in6 = intermediate[0x06]
// This code...
ra = (in2 ~mod+ in6) ~mod* 0x0000_1151
rb2 = ra ~mod+ (in2 ~mod* 0x0000_187E)
rb6 = ra ~mod- (in6 ~mod* 0x0000_3B21)
// ...is equivalent to this more-SIMD-like code.
//
// rb2 = (in2 ~mod* 0x0000_29CF) ~mod+ (in6 ~mod* 0x0000_1151)
// rb6 = (in2 ~mod* 0x0000_1151) ~mod+ (in6 ~mod* 0xFFFF_D630)
in0 = intermediate[0x00]
in4 = intermediate[0x04]
rcp = (in0 ~mod+ in4) ~mod<< 13
rcm = (in0 ~mod- in4) ~mod<< 13
rd0 = rcp ~mod+ rb2
rd1 = rcm ~mod+ rb6
rd2 = rcm ~mod- rb6
rd3 = rcp ~mod- rb2
// Odd columns.
in1 = intermediate[0x01]
in3 = intermediate[0x03]
in5 = intermediate[0x05]
in7 = intermediate[0x07]
ri51 = in5 ~mod+ in1
ri53 = in5 ~mod+ in3
ri71 = in7 ~mod+ in1
ri73 = in7 ~mod+ in3
// This code...
rj = (ri73 ~mod+ ri51) ~mod* 0x0000_25A1
rk1 = in1 ~mod* 0x0000_300B
rk3 = in3 ~mod* 0x0000_6254
rk5 = in5 ~mod* 0x0000_41B3
rk7 = in7 ~mod* 0x0000_098E
ri51 ~mod*= 0xFFFF_F384
ri53 ~mod*= 0xFFFF_ADFD
ri71 ~mod*= 0xFFFF_E333
ri73 ~mod*= 0xFFFF_C13B
rl51 = ri51 ~mod+ rj
rl73 = ri73 ~mod+ rj
rk1 ~mod+= ri71 ~mod+ rl51
rk3 ~mod+= ri53 ~mod+ rl73
rk5 ~mod+= ri53 ~mod+ rl51
rk7 ~mod+= ri71 ~mod+ rl73
// ...is equivalent to this more-SIMD-like code.
//
// rl73 = (ri73 ~mod* 0xFFFF_E6DC) ~mod+ (ri51 ~mod* 0x0000_25A1)
// rl51 = (ri73 ~mod* 0x0000_25A1) ~mod+ (ri51 ~mod* 0x0000_1925)
// rk1 = rl51 ~mod+ ((in1 ~mod* 0x0000_133E) ~mod+ (in7 ~mod* 0xFFFF_E333))
// rk3 = rl73 ~mod+ ((in3 ~mod* 0x0000_1051) ~mod+ (in5 ~mod* 0xFFFF_ADFD))
// rk5 = rl51 ~mod+ ((in3 ~mod* 0xFFFF_ADFD) ~mod+ (in5 ~mod* 0xFFFF_EFB0))
// rk7 = rl73 ~mod+ ((in1 ~mod* 0xFFFF_E333) ~mod+ (in7 ~mod* 0xFFFF_ECC1))
// Combine columns.
if args.dst_stride > args.dst_buffer.length() {
return nothing
}
assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer = args.dst_buffer[args.dst_stride ..]
}
// ==== Second pass, row 1.
if (0 == (
intermediate[0x09] |
intermediate[0x0A] |
intermediate[0x0B] |
intermediate[0x0C] |
intermediate[0x0D] |
intermediate[0x0E] |
intermediate[0x0F])) {
// Fast path when the 1-dimensional AC terms are all zero.
if args.dst_stride > args.dst_buffer.length() {
return nothing
}
assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
args.dst_buffer[0] = BIAS_AND_CLAMP[((intermediate[0x08] ~mod+ (1 << 4)) >> 5) & 1023]
args.dst_buffer[1] = args.dst_buffer[0]
args.dst_buffer[2] = args.dst_buffer[0]
args.dst_buffer[3] = args.dst_buffer[0]
args.dst_buffer[4] = args.dst_buffer[0]
args.dst_buffer[5] = args.dst_buffer[0]
args.dst_buffer[6] = args.dst_buffer[0]
args.dst_buffer[7] = args.dst_buffer[0]
args.dst_buffer = args.dst_buffer[args.dst_stride ..]
} else {
// Even columns.
in2 = intermediate[0x0A]
in6 = intermediate[0x0E]
// This code...
ra = (in2 ~mod+ in6) ~mod* 0x0000_1151
rb2 = ra ~mod+ (in2 ~mod* 0x0000_187E)
rb6 = ra ~mod- (in6 ~mod* 0x0000_3B21)
// ...is equivalent to this more-SIMD-like code.
//
// rb2 = (in2 ~mod* 0x0000_29CF) ~mod+ (in6 ~mod* 0x0000_1151)
// rb6 = (in2 ~mod* 0x0000_1151) ~mod+ (in6 ~mod* 0xFFFF_D630)
in0 = intermediate[0x08]
in4 = intermediate[0x0C]
rcp = (in0 ~mod+ in4) ~mod<< 13
rcm = (in0 ~mod- in4) ~mod<< 13
rd0 = rcp ~mod+ rb2
rd1 = rcm ~mod+ rb6
rd2 = rcm ~mod- rb6
rd3 = rcp ~mod- rb2
// Odd columns.
in1 = intermediate[0x09]
in3 = intermediate[0x0B]
in5 = intermediate[0x0D]
in7 = intermediate[0x0F]
ri51 = in5 ~mod+ in1
ri53 = in5 ~mod+ in3
ri71 = in7 ~mod+ in1
ri73 = in7 ~mod+ in3
// This code...
rj = (ri73 ~mod+ ri51) ~mod* 0x0000_25A1
rk1 = in1 ~mod* 0x0000_300B
rk3 = in3 ~mod* 0x0000_6254
rk5 = in5 ~mod* 0x0000_41B3
rk7 = in7 ~mod* 0x0000_098E
ri51 ~mod*= 0xFFFF_F384
ri53 ~mod*= 0xFFFF_ADFD
ri71 ~mod*= 0xFFFF_E333
ri73 ~mod*= 0xFFFF_C13B
rl51 = ri51 ~mod+ rj
rl73 = ri73 ~mod+ rj
rk1 ~mod+= ri71 ~mod+ rl51
rk3 ~mod+= ri53 ~mod+ rl73
rk5 ~mod+= ri53 ~mod+ rl51
rk7 ~mod+= ri71 ~mod+ rl73
// ...is equivalent to this more-SIMD-like code.
//
// rl73 = (ri73 ~mod* 0xFFFF_E6DC) ~mod+ (ri51 ~mod* 0x0000_25A1)
// rl51 = (ri73 ~mod* 0x0000_25A1) ~mod+ (ri51 ~mod* 0x0000_1925)
// rk1 = rl51 ~mod+ ((in1 ~mod* 0x0000_133E) ~mod+ (in7 ~mod* 0xFFFF_E333))
// rk3 = rl73 ~mod+ ((in3 ~mod* 0x0000_1051) ~mod+ (in5 ~mod* 0xFFFF_ADFD))
// rk5 = rl51 ~mod+ ((in3 ~mod* 0xFFFF_ADFD) ~mod+ (in5 ~mod* 0xFFFF_EFB0))
// rk7 = rl73 ~mod+ ((in1 ~mod* 0xFFFF_E333) ~mod+ (in7 ~mod* 0xFFFF_ECC1))
// Combine columns.
if args.dst_stride > args.dst_buffer.length() {
return nothing
}
assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer = args.dst_buffer[args.dst_stride ..]
}
// ==== Second pass, row 2.
if (0 == (
intermediate[0x11] |
intermediate[0x12] |
intermediate[0x13] |
intermediate[0x14] |
intermediate[0x15] |
intermediate[0x16] |
intermediate[0x17])) {
// Fast path when the 1-dimensional AC terms are all zero.
if args.dst_stride > args.dst_buffer.length() {
return nothing
}
assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
args.dst_buffer[0] = BIAS_AND_CLAMP[((intermediate[0x10] ~mod+ (1 << 4)) >> 5) & 1023]
args.dst_buffer[1] = args.dst_buffer[0]
args.dst_buffer[2] = args.dst_buffer[0]
args.dst_buffer[3] = args.dst_buffer[0]
args.dst_buffer[4] = args.dst_buffer[0]
args.dst_buffer[5] = args.dst_buffer[0]
args.dst_buffer[6] = args.dst_buffer[0]
args.dst_buffer[7] = args.dst_buffer[0]
args.dst_buffer = args.dst_buffer[args.dst_stride ..]
} else {
// Even columns.
in2 = intermediate[0x12]
in6 = intermediate[0x16]
// This code...
ra = (in2 ~mod+ in6) ~mod* 0x0000_1151
rb2 = ra ~mod+ (in2 ~mod* 0x0000_187E)
rb6 = ra ~mod- (in6 ~mod* 0x0000_3B21)
// ...is equivalent to this more-SIMD-like code.
//
// rb2 = (in2 ~mod* 0x0000_29CF) ~mod+ (in6 ~mod* 0x0000_1151)
// rb6 = (in2 ~mod* 0x0000_1151) ~mod+ (in6 ~mod* 0xFFFF_D630)
in0 = intermediate[0x10]
in4 = intermediate[0x14]
rcp = (in0 ~mod+ in4) ~mod<< 13
rcm = (in0 ~mod- in4) ~mod<< 13
rd0 = rcp ~mod+ rb2
rd1 = rcm ~mod+ rb6
rd2 = rcm ~mod- rb6
rd3 = rcp ~mod- rb2
// Odd columns.
in1 = intermediate[0x11]
in3 = intermediate[0x13]
in5 = intermediate[0x15]
in7 = intermediate[0x17]
ri51 = in5 ~mod+ in1
ri53 = in5 ~mod+ in3
ri71 = in7 ~mod+ in1
ri73 = in7 ~mod+ in3
// This code...
rj = (ri73 ~mod+ ri51) ~mod* 0x0000_25A1
rk1 = in1 ~mod* 0x0000_300B
rk3 = in3 ~mod* 0x0000_6254
rk5 = in5 ~mod* 0x0000_41B3
rk7 = in7 ~mod* 0x0000_098E
ri51 ~mod*= 0xFFFF_F384
ri53 ~mod*= 0xFFFF_ADFD
ri71 ~mod*= 0xFFFF_E333
ri73 ~mod*= 0xFFFF_C13B
rl51 = ri51 ~mod+ rj
rl73 = ri73 ~mod+ rj
rk1 ~mod+= ri71 ~mod+ rl51
rk3 ~mod+= ri53 ~mod+ rl73
rk5 ~mod+= ri53 ~mod+ rl51
rk7 ~mod+= ri71 ~mod+ rl73
// ...is equivalent to this more-SIMD-like code.
//
// rl73 = (ri73 ~mod* 0xFFFF_E6DC) ~mod+ (ri51 ~mod* 0x0000_25A1)
// rl51 = (ri73 ~mod* 0x0000_25A1) ~mod+ (ri51 ~mod* 0x0000_1925)
// rk1 = rl51 ~mod+ ((in1 ~mod* 0x0000_133E) ~mod+ (in7 ~mod* 0xFFFF_E333))
// rk3 = rl73 ~mod+ ((in3 ~mod* 0x0000_1051) ~mod+ (in5 ~mod* 0xFFFF_ADFD))
// rk5 = rl51 ~mod+ ((in3 ~mod* 0xFFFF_ADFD) ~mod+ (in5 ~mod* 0xFFFF_EFB0))
// rk7 = rl73 ~mod+ ((in1 ~mod* 0xFFFF_E333) ~mod+ (in7 ~mod* 0xFFFF_ECC1))
// Combine columns.
if args.dst_stride > args.dst_buffer.length() {
return nothing
}
assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer = args.dst_buffer[args.dst_stride ..]
}
// ==== Second pass, row 3.
if (0 == (
intermediate[0x19] |
intermediate[0x1A] |
intermediate[0x1B] |
intermediate[0x1C] |
intermediate[0x1D] |
intermediate[0x1E] |
intermediate[0x1F])) {
// Fast path when the 1-dimensional AC terms are all zero.
if args.dst_stride > args.dst_buffer.length() {
return nothing
}
assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
args.dst_buffer[0] = BIAS_AND_CLAMP[((intermediate[0x18] ~mod+ (1 << 4)) >> 5) & 1023]
args.dst_buffer[1] = args.dst_buffer[0]
args.dst_buffer[2] = args.dst_buffer[0]
args.dst_buffer[3] = args.dst_buffer[0]
args.dst_buffer[4] = args.dst_buffer[0]
args.dst_buffer[5] = args.dst_buffer[0]
args.dst_buffer[6] = args.dst_buffer[0]
args.dst_buffer[7] = args.dst_buffer[0]
args.dst_buffer = args.dst_buffer[args.dst_stride ..]
} else {
// Even columns.
in2 = intermediate[0x1A]
in6 = intermediate[0x1E]
// This code...
ra = (in2 ~mod+ in6) ~mod* 0x0000_1151
rb2 = ra ~mod+ (in2 ~mod* 0x0000_187E)
rb6 = ra ~mod- (in6 ~mod* 0x0000_3B21)
// ...is equivalent to this more-SIMD-like code.
//
// rb2 = (in2 ~mod* 0x0000_29CF) ~mod+ (in6 ~mod* 0x0000_1151)
// rb6 = (in2 ~mod* 0x0000_1151) ~mod+ (in6 ~mod* 0xFFFF_D630)
in0 = intermediate[0x18]
in4 = intermediate[0x1C]
rcp = (in0 ~mod+ in4) ~mod<< 13
rcm = (in0 ~mod- in4) ~mod<< 13
rd0 = rcp ~mod+ rb2
rd1 = rcm ~mod+ rb6
rd2 = rcm ~mod- rb6
rd3 = rcp ~mod- rb2
// Odd columns.
in1 = intermediate[0x19]
in3 = intermediate[0x1B]
in5 = intermediate[0x1D]
in7 = intermediate[0x1F]
ri51 = in5 ~mod+ in1
ri53 = in5 ~mod+ in3
ri71 = in7 ~mod+ in1
ri73 = in7 ~mod+ in3
// This code...
rj = (ri73 ~mod+ ri51) ~mod* 0x0000_25A1
rk1 = in1 ~mod* 0x0000_300B
rk3 = in3 ~mod* 0x0000_6254
rk5 = in5 ~mod* 0x0000_41B3
rk7 = in7 ~mod* 0x0000_098E
ri51 ~mod*= 0xFFFF_F384
ri53 ~mod*= 0xFFFF_ADFD
ri71 ~mod*= 0xFFFF_E333
ri73 ~mod*= 0xFFFF_C13B
rl51 = ri51 ~mod+ rj
rl73 = ri73 ~mod+ rj
rk1 ~mod+= ri71 ~mod+ rl51
rk3 ~mod+= ri53 ~mod+ rl73
rk5 ~mod+= ri53 ~mod+ rl51
rk7 ~mod+= ri71 ~mod+ rl73
// ...is equivalent to this more-SIMD-like code.
//
// rl73 = (ri73 ~mod* 0xFFFF_E6DC) ~mod+ (ri51 ~mod* 0x0000_25A1)
// rl51 = (ri73 ~mod* 0x0000_25A1) ~mod+ (ri51 ~mod* 0x0000_1925)
// rk1 = rl51 ~mod+ ((in1 ~mod* 0x0000_133E) ~mod+ (in7 ~mod* 0xFFFF_E333))
// rk3 = rl73 ~mod+ ((in3 ~mod* 0x0000_1051) ~mod+ (in5 ~mod* 0xFFFF_ADFD))
// rk5 = rl51 ~mod+ ((in3 ~mod* 0xFFFF_ADFD) ~mod+ (in5 ~mod* 0xFFFF_EFB0))
// rk7 = rl73 ~mod+ ((in1 ~mod* 0xFFFF_E333) ~mod+ (in7 ~mod* 0xFFFF_ECC1))
// Combine columns.
if args.dst_stride > args.dst_buffer.length() {
return nothing
}
assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer = args.dst_buffer[args.dst_stride ..]
}
// ==== Second pass, row 4.
if (0 == (
intermediate[0x21] |
intermediate[0x22] |
intermediate[0x23] |
intermediate[0x24] |
intermediate[0x25] |
intermediate[0x26] |
intermediate[0x27])) {
// Fast path when the 1-dimensional AC terms are all zero.
if args.dst_stride > args.dst_buffer.length() {
return nothing
}
assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
args.dst_buffer[0] = BIAS_AND_CLAMP[((intermediate[0x20] ~mod+ (1 << 4)) >> 5) & 1023]
args.dst_buffer[1] = args.dst_buffer[0]
args.dst_buffer[2] = args.dst_buffer[0]
args.dst_buffer[3] = args.dst_buffer[0]
args.dst_buffer[4] = args.dst_buffer[0]
args.dst_buffer[5] = args.dst_buffer[0]
args.dst_buffer[6] = args.dst_buffer[0]
args.dst_buffer[7] = args.dst_buffer[0]
args.dst_buffer = args.dst_buffer[args.dst_stride ..]
} else {
// Even columns.
in2 = intermediate[0x22]
in6 = intermediate[0x26]
// This code...
ra = (in2 ~mod+ in6) ~mod* 0x0000_1151
rb2 = ra ~mod+ (in2 ~mod* 0x0000_187E)
rb6 = ra ~mod- (in6 ~mod* 0x0000_3B21)
// ...is equivalent to this more-SIMD-like code.
//
// rb2 = (in2 ~mod* 0x0000_29CF) ~mod+ (in6 ~mod* 0x0000_1151)
// rb6 = (in2 ~mod* 0x0000_1151) ~mod+ (in6 ~mod* 0xFFFF_D630)
in0 = intermediate[0x20]
in4 = intermediate[0x24]
rcp = (in0 ~mod+ in4) ~mod<< 13
rcm = (in0 ~mod- in4) ~mod<< 13
rd0 = rcp ~mod+ rb2
rd1 = rcm ~mod+ rb6
rd2 = rcm ~mod- rb6
rd3 = rcp ~mod- rb2
// Odd columns.
in1 = intermediate[0x21]
in3 = intermediate[0x23]
in5 = intermediate[0x25]
in7 = intermediate[0x27]
ri51 = in5 ~mod+ in1
ri53 = in5 ~mod+ in3
ri71 = in7 ~mod+ in1
ri73 = in7 ~mod+ in3
// This code...
rj = (ri73 ~mod+ ri51) ~mod* 0x0000_25A1
rk1 = in1 ~mod* 0x0000_300B
rk3 = in3 ~mod* 0x0000_6254
rk5 = in5 ~mod* 0x0000_41B3
rk7 = in7 ~mod* 0x0000_098E
ri51 ~mod*= 0xFFFF_F384
ri53 ~mod*= 0xFFFF_ADFD
ri71 ~mod*= 0xFFFF_E333
ri73 ~mod*= 0xFFFF_C13B
rl51 = ri51 ~mod+ rj
rl73 = ri73 ~mod+ rj
rk1 ~mod+= ri71 ~mod+ rl51
rk3 ~mod+= ri53 ~mod+ rl73
rk5 ~mod+= ri53 ~mod+ rl51
rk7 ~mod+= ri71 ~mod+ rl73
// ...is equivalent to this more-SIMD-like code.
//
// rl73 = (ri73 ~mod* 0xFFFF_E6DC) ~mod+ (ri51 ~mod* 0x0000_25A1)
// rl51 = (ri73 ~mod* 0x0000_25A1) ~mod+ (ri51 ~mod* 0x0000_1925)
// rk1 = rl51 ~mod+ ((in1 ~mod* 0x0000_133E) ~mod+ (in7 ~mod* 0xFFFF_E333))
// rk3 = rl73 ~mod+ ((in3 ~mod* 0x0000_1051) ~mod+ (in5 ~mod* 0xFFFF_ADFD))
// rk5 = rl51 ~mod+ ((in3 ~mod* 0xFFFF_ADFD) ~mod+ (in5 ~mod* 0xFFFF_EFB0))
// rk7 = rl73 ~mod+ ((in1 ~mod* 0xFFFF_E333) ~mod+ (in7 ~mod* 0xFFFF_ECC1))
// Combine columns.
if args.dst_stride > args.dst_buffer.length() {
return nothing
}
assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer = args.dst_buffer[args.dst_stride ..]
}
// ==== Second pass, row 5.
if (0 == (
intermediate[0x29] |
intermediate[0x2A] |
intermediate[0x2B] |
intermediate[0x2C] |
intermediate[0x2D] |
intermediate[0x2E] |
intermediate[0x2F])) {
// Fast path when the 1-dimensional AC terms are all zero.
if args.dst_stride > args.dst_buffer.length() {
return nothing
}
assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
args.dst_buffer[0] = BIAS_AND_CLAMP[((intermediate[0x28] ~mod+ (1 << 4)) >> 5) & 1023]
args.dst_buffer[1] = args.dst_buffer[0]
args.dst_buffer[2] = args.dst_buffer[0]
args.dst_buffer[3] = args.dst_buffer[0]
args.dst_buffer[4] = args.dst_buffer[0]
args.dst_buffer[5] = args.dst_buffer[0]
args.dst_buffer[6] = args.dst_buffer[0]
args.dst_buffer[7] = args.dst_buffer[0]
args.dst_buffer = args.dst_buffer[args.dst_stride ..]
} else {
// Even columns.
in2 = intermediate[0x2A]
in6 = intermediate[0x2E]
// This code...
ra = (in2 ~mod+ in6) ~mod* 0x0000_1151
rb2 = ra ~mod+ (in2 ~mod* 0x0000_187E)
rb6 = ra ~mod- (in6 ~mod* 0x0000_3B21)
// ...is equivalent to this more-SIMD-like code.
//
// rb2 = (in2 ~mod* 0x0000_29CF) ~mod+ (in6 ~mod* 0x0000_1151)
// rb6 = (in2 ~mod* 0x0000_1151) ~mod+ (in6 ~mod* 0xFFFF_D630)
in0 = intermediate[0x28]
in4 = intermediate[0x2C]
rcp = (in0 ~mod+ in4) ~mod<< 13
rcm = (in0 ~mod- in4) ~mod<< 13
rd0 = rcp ~mod+ rb2
rd1 = rcm ~mod+ rb6
rd2 = rcm ~mod- rb6
rd3 = rcp ~mod- rb2
// Odd columns.
in1 = intermediate[0x29]
in3 = intermediate[0x2B]
in5 = intermediate[0x2D]
in7 = intermediate[0x2F]
ri51 = in5 ~mod+ in1
ri53 = in5 ~mod+ in3
ri71 = in7 ~mod+ in1
ri73 = in7 ~mod+ in3
// This code...
rj = (ri73 ~mod+ ri51) ~mod* 0x0000_25A1
rk1 = in1 ~mod* 0x0000_300B
rk3 = in3 ~mod* 0x0000_6254
rk5 = in5 ~mod* 0x0000_41B3
rk7 = in7 ~mod* 0x0000_098E
ri51 ~mod*= 0xFFFF_F384
ri53 ~mod*= 0xFFFF_ADFD
ri71 ~mod*= 0xFFFF_E333
ri73 ~mod*= 0xFFFF_C13B
rl51 = ri51 ~mod+ rj
rl73 = ri73 ~mod+ rj
rk1 ~mod+= ri71 ~mod+ rl51
rk3 ~mod+= ri53 ~mod+ rl73
rk5 ~mod+= ri53 ~mod+ rl51
rk7 ~mod+= ri71 ~mod+ rl73
// ...is equivalent to this more-SIMD-like code.
//
// rl73 = (ri73 ~mod* 0xFFFF_E6DC) ~mod+ (ri51 ~mod* 0x0000_25A1)
// rl51 = (ri73 ~mod* 0x0000_25A1) ~mod+ (ri51 ~mod* 0x0000_1925)
// rk1 = rl51 ~mod+ ((in1 ~mod* 0x0000_133E) ~mod+ (in7 ~mod* 0xFFFF_E333))
// rk3 = rl73 ~mod+ ((in3 ~mod* 0x0000_1051) ~mod+ (in5 ~mod* 0xFFFF_ADFD))
// rk5 = rl51 ~mod+ ((in3 ~mod* 0xFFFF_ADFD) ~mod+ (in5 ~mod* 0xFFFF_EFB0))
// rk7 = rl73 ~mod+ ((in1 ~mod* 0xFFFF_E333) ~mod+ (in7 ~mod* 0xFFFF_ECC1))
// Combine columns.
if args.dst_stride > args.dst_buffer.length() {
return nothing
}
assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer = args.dst_buffer[args.dst_stride ..]
}
// ==== Second pass, row 6.
if (0 == (
intermediate[0x31] |
intermediate[0x32] |
intermediate[0x33] |
intermediate[0x34] |
intermediate[0x35] |
intermediate[0x36] |
intermediate[0x37])) {
// Fast path when the 1-dimensional AC terms are all zero.
if args.dst_stride > args.dst_buffer.length() {
return nothing
}
assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
args.dst_buffer[0] = BIAS_AND_CLAMP[((intermediate[0x30] ~mod+ (1 << 4)) >> 5) & 1023]
args.dst_buffer[1] = args.dst_buffer[0]
args.dst_buffer[2] = args.dst_buffer[0]
args.dst_buffer[3] = args.dst_buffer[0]
args.dst_buffer[4] = args.dst_buffer[0]
args.dst_buffer[5] = args.dst_buffer[0]
args.dst_buffer[6] = args.dst_buffer[0]
args.dst_buffer[7] = args.dst_buffer[0]
args.dst_buffer = args.dst_buffer[args.dst_stride ..]
} else {
// Even columns.
in2 = intermediate[0x32]
in6 = intermediate[0x36]
// This code...
ra = (in2 ~mod+ in6) ~mod* 0x0000_1151
rb2 = ra ~mod+ (in2 ~mod* 0x0000_187E)
rb6 = ra ~mod- (in6 ~mod* 0x0000_3B21)
// ...is equivalent to this more-SIMD-like code.
//
// rb2 = (in2 ~mod* 0x0000_29CF) ~mod+ (in6 ~mod* 0x0000_1151)
// rb6 = (in2 ~mod* 0x0000_1151) ~mod+ (in6 ~mod* 0xFFFF_D630)
in0 = intermediate[0x30]
in4 = intermediate[0x34]
rcp = (in0 ~mod+ in4) ~mod<< 13
rcm = (in0 ~mod- in4) ~mod<< 13
rd0 = rcp ~mod+ rb2
rd1 = rcm ~mod+ rb6
rd2 = rcm ~mod- rb6
rd3 = rcp ~mod- rb2
// Odd columns.
in1 = intermediate[0x31]
in3 = intermediate[0x33]
in5 = intermediate[0x35]
in7 = intermediate[0x37]
ri51 = in5 ~mod+ in1
ri53 = in5 ~mod+ in3
ri71 = in7 ~mod+ in1
ri73 = in7 ~mod+ in3
// This code...
rj = (ri73 ~mod+ ri51) ~mod* 0x0000_25A1
rk1 = in1 ~mod* 0x0000_300B
rk3 = in3 ~mod* 0x0000_6254
rk5 = in5 ~mod* 0x0000_41B3
rk7 = in7 ~mod* 0x0000_098E
ri51 ~mod*= 0xFFFF_F384
ri53 ~mod*= 0xFFFF_ADFD
ri71 ~mod*= 0xFFFF_E333
ri73 ~mod*= 0xFFFF_C13B
rl51 = ri51 ~mod+ rj
rl73 = ri73 ~mod+ rj
rk1 ~mod+= ri71 ~mod+ rl51
rk3 ~mod+= ri53 ~mod+ rl73
rk5 ~mod+= ri53 ~mod+ rl51
rk7 ~mod+= ri71 ~mod+ rl73
// ...is equivalent to this more-SIMD-like code.
//
// rl73 = (ri73 ~mod* 0xFFFF_E6DC) ~mod+ (ri51 ~mod* 0x0000_25A1)
// rl51 = (ri73 ~mod* 0x0000_25A1) ~mod+ (ri51 ~mod* 0x0000_1925)
// rk1 = rl51 ~mod+ ((in1 ~mod* 0x0000_133E) ~mod+ (in7 ~mod* 0xFFFF_E333))
// rk3 = rl73 ~mod+ ((in3 ~mod* 0x0000_1051) ~mod+ (in5 ~mod* 0xFFFF_ADFD))
// rk5 = rl51 ~mod+ ((in3 ~mod* 0xFFFF_ADFD) ~mod+ (in5 ~mod* 0xFFFF_EFB0))
// rk7 = rl73 ~mod+ ((in1 ~mod* 0xFFFF_E333) ~mod+ (in7 ~mod* 0xFFFF_ECC1))
// Combine columns.
if args.dst_stride > args.dst_buffer.length() {
return nothing
}
assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride)
args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer = args.dst_buffer[args.dst_stride ..]
}
// ==== Second pass, row 7.
if (0 == (
intermediate[0x39] |
intermediate[0x3A] |
intermediate[0x3B] |
intermediate[0x3C] |
intermediate[0x3D] |
intermediate[0x3E] |
intermediate[0x3F])) {
// Fast path when the 1-dimensional AC terms are all zero.
if 8 > args.dst_buffer.length() {
return nothing
}
args.dst_buffer[0] = BIAS_AND_CLAMP[((intermediate[0x38] ~mod+ (1 << 4)) >> 5) & 1023]
args.dst_buffer[1] = args.dst_buffer[0]
args.dst_buffer[2] = args.dst_buffer[0]
args.dst_buffer[3] = args.dst_buffer[0]
args.dst_buffer[4] = args.dst_buffer[0]
args.dst_buffer[5] = args.dst_buffer[0]
args.dst_buffer[6] = args.dst_buffer[0]
args.dst_buffer[7] = args.dst_buffer[0]
} else {
// Even columns.
in2 = intermediate[0x3A]
in6 = intermediate[0x3E]
// This code...
ra = (in2 ~mod+ in6) ~mod* 0x0000_1151
rb2 = ra ~mod+ (in2 ~mod* 0x0000_187E)
rb6 = ra ~mod- (in6 ~mod* 0x0000_3B21)
// ...is equivalent to this more-SIMD-like code.
//
// rb2 = (in2 ~mod* 0x0000_29CF) ~mod+ (in6 ~mod* 0x0000_1151)
// rb6 = (in2 ~mod* 0x0000_1151) ~mod+ (in6 ~mod* 0xFFFF_D630)
in0 = intermediate[0x38]
in4 = intermediate[0x3C]
rcp = (in0 ~mod+ in4) ~mod<< 13
rcm = (in0 ~mod- in4) ~mod<< 13
rd0 = rcp ~mod+ rb2
rd1 = rcm ~mod+ rb6
rd2 = rcm ~mod- rb6
rd3 = rcp ~mod- rb2
// Odd columns.
in1 = intermediate[0x39]
in3 = intermediate[0x3B]
in5 = intermediate[0x3D]
in7 = intermediate[0x3F]
ri51 = in5 ~mod+ in1
ri53 = in5 ~mod+ in3
ri71 = in7 ~mod+ in1
ri73 = in7 ~mod+ in3
// This code...
rj = (ri73 ~mod+ ri51) ~mod* 0x0000_25A1
rk1 = in1 ~mod* 0x0000_300B
rk3 = in3 ~mod* 0x0000_6254
rk5 = in5 ~mod* 0x0000_41B3
rk7 = in7 ~mod* 0x0000_098E
ri51 ~mod*= 0xFFFF_F384
ri53 ~mod*= 0xFFFF_ADFD
ri71 ~mod*= 0xFFFF_E333
ri73 ~mod*= 0xFFFF_C13B
rl51 = ri51 ~mod+ rj
rl73 = ri73 ~mod+ rj
rk1 ~mod+= ri71 ~mod+ rl51
rk3 ~mod+= ri53 ~mod+ rl73
rk5 ~mod+= ri53 ~mod+ rl51
rk7 ~mod+= ri71 ~mod+ rl73
// ...is equivalent to this more-SIMD-like code.
//
// rl73 = (ri73 ~mod* 0xFFFF_E6DC) ~mod+ (ri51 ~mod* 0x0000_25A1)
// rl51 = (ri73 ~mod* 0x0000_25A1) ~mod+ (ri51 ~mod* 0x0000_1925)
// rk1 = rl51 ~mod+ ((in1 ~mod* 0x0000_133E) ~mod+ (in7 ~mod* 0xFFFF_E333))
// rk3 = rl73 ~mod+ ((in3 ~mod* 0x0000_1051) ~mod+ (in5 ~mod* 0xFFFF_ADFD))
// rk5 = rl51 ~mod+ ((in3 ~mod* 0xFFFF_ADFD) ~mod+ (in5 ~mod* 0xFFFF_EFB0))
// rk7 = rl73 ~mod+ ((in1 ~mod* 0xFFFF_E333) ~mod+ (in7 ~mod* 0xFFFF_ECC1))
// Combine columns.
if 8 > args.dst_buffer.length() {
return nothing
}
args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023]
args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023]
}
// -------- END generated by script/print-jpeg-idct-code.go
}