| // Copyright 2023 The Wuffs Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
| // option. This file may not be copied, modified, or distributed |
| // except according to those terms. |
| // |
| // SPDX-License-Identifier: Apache-2.0 OR MIT |
| |
| pri func decoder.decode_idct!(dst_buffer: slice base.u8, dst_stride: base.u64, q: base.u32[..= 3]), |
| choosy, |
| { |
| // This method implements the same algorithm as libjpeg-turbo's jidctint.c. |
| // That file defines CONST_BITS = 13 and PASS1_BITS = 2, so each right-shift |
| // by 11 below is a shift by (CONST_BITS - PASS1_BITS). |
| |
| var bq0 : base.u32 |
| var bq2 : base.u32 |
| var bq4 : base.u32 |
| var bq6 : base.u32 |
| |
| var ca : base.u32 |
| |
| var cb2 : base.u32 |
| var cb6 : base.u32 |
| |
| var ccp : base.u32 |
| var ccm : base.u32 |
| |
| var cd0 : base.u32 |
| var cd1 : base.u32 |
| var cd2 : base.u32 |
| var cd3 : base.u32 |
| |
| var bq1 : base.u32 |
| var bq3 : base.u32 |
| var bq5 : base.u32 |
| var bq7 : base.u32 |
| |
| var ci51 : base.u32 |
| var ci53 : base.u32 |
| var ci71 : base.u32 |
| var ci73 : base.u32 |
| |
| var cj : base.u32 |
| |
| var ck1 : base.u32 |
| var ck3 : base.u32 |
| var ck5 : base.u32 |
| var ck7 : base.u32 |
| |
| var cl51 : base.u32 |
| var cl73 : base.u32 |
| |
| var in0 : base.u32 |
| var in2 : base.u32 |
| var in4 : base.u32 |
| var in6 : base.u32 |
| |
| var ra : base.u32 |
| |
| var rb2 : base.u32 |
| var rb6 : base.u32 |
| |
| var rcp : base.u32 |
| var rcm : base.u32 |
| |
| var rd0 : base.u32 |
| var rd1 : base.u32 |
| var rd2 : base.u32 |
| var rd3 : base.u32 |
| |
| var in1 : base.u32 |
| var in3 : base.u32 |
| var in5 : base.u32 |
| var in7 : base.u32 |
| |
| var ri51 : base.u32 |
| var ri53 : base.u32 |
| var ri71 : base.u32 |
| var ri73 : base.u32 |
| |
| var rj : base.u32 |
| |
| var rk1 : base.u32 |
| var rk3 : base.u32 |
| var rk5 : base.u32 |
| var rk7 : base.u32 |
| |
| var rl51 : base.u32 |
| var rl73 : base.u32 |
| |
| var intermediate : array[64] base.u32 |
| |
| if 8 > args.dst_stride { |
| return nothing |
| } |
| |
| // -------- BEGIN generated by script/print-jpeg-idct-code.go |
| |
| // p0_298631336 = 0x0000_098E = 2446 |
| // p0_390180644 = 0x0000_0C7C = 3196 |
| // p0_509795579 = 0x0000_1051 = 4177 |
| // p0_541196100 = 0x0000_1151 = 4433 |
| // p0_601344887 = 0x0000_133E = 4926 |
| // p0_765366865 = 0x0000_187E = 6270 |
| // p0_785694958 = 0x0000_1925 = 6437 |
| // p0_899976223 = 0x0000_1CCD = 7373 |
| // p1_175875602 = 0x0000_25A1 = 9633 |
| // p1_306562965 = 0x0000_29CF = 10703 |
| // p1_501321110 = 0x0000_300B = 12299 |
| // p1_847759065 = 0x0000_3B21 = 15137 |
| // p1_961570560 = 0x0000_3EC5 = 16069 |
| // p2_053119869 = 0x0000_41B3 = 16819 |
| // p2_562915447 = 0x0000_5203 = 20995 |
| // p3_072711026 = 0x0000_6254 = 25172 |
| // |
| // m0_390180644 = 0xFFFF_F384 = 4294964100 |
| // m0_509795579 = 0xFFFF_EFB0 = 4294963120 |
| // m0_601344887 = 0xFFFF_ECC1 = 4294962369 |
| // m0_785694958 = 0xFFFF_E6DC = 4294960860 |
| // m0_899976223 = 0xFFFF_E333 = 4294959923 |
| // m1_306562965 = 0xFFFF_D630 = 4294956592 |
| // m1_961570560 = 0xFFFF_C13B = 4294951227 |
| // m2_562915447 = 0xFFFF_ADFD = 4294946301 |
| |
| // ==== First pass, column 0. |
| |
| if (0 == ( |
| this.mcu_blocks[0][0x08] | |
| this.mcu_blocks[0][0x10] | |
| this.mcu_blocks[0][0x18] | |
| this.mcu_blocks[0][0x20] | |
| this.mcu_blocks[0][0x28] | |
| this.mcu_blocks[0][0x30] | |
| this.mcu_blocks[0][0x38])) { |
| // Fast path when the 1-dimensional AC terms are all zero. |
| |
| intermediate[0x00] = |
| (this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x00]) ~mod* |
| (this.quant_tables[args.q][0x00] as base.u32)) ~mod<< 2 |
| intermediate[0x08] = intermediate[0x00] |
| intermediate[0x10] = intermediate[0x00] |
| intermediate[0x18] = intermediate[0x00] |
| intermediate[0x20] = intermediate[0x00] |
| intermediate[0x28] = intermediate[0x00] |
| intermediate[0x30] = intermediate[0x00] |
| intermediate[0x38] = intermediate[0x00] |
| |
| } else { |
| // Even rows. |
| |
| bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x10]) ~mod* (this.quant_tables[args.q][0x10] as base.u32) |
| bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x30]) ~mod* (this.quant_tables[args.q][0x30] as base.u32) |
| |
| // This code... |
| ca = (bq2 ~mod+ bq6) ~mod* 0x0000_1151 |
| cb2 = ca ~mod+ (bq2 ~mod* 0x0000_187E) |
| cb6 = ca ~mod- (bq6 ~mod* 0x0000_3B21) |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // cb2 = (bq2 ~mod* 0x0000_29CF) ~mod+ (bq6 ~mod* 0x0000_1151) |
| // cb6 = (bq2 ~mod* 0x0000_1151) ~mod+ (bq6 ~mod* 0xFFFF_D630) |
| |
| bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x00]) ~mod* (this.quant_tables[args.q][0x00] as base.u32) |
| bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x20]) ~mod* (this.quant_tables[args.q][0x20] as base.u32) |
| |
| ccp = (bq0 ~mod+ bq4) ~mod<< 13 |
| ccm = (bq0 ~mod- bq4) ~mod<< 13 |
| |
| cd0 = ccp ~mod+ cb2 |
| cd1 = ccm ~mod+ cb6 |
| cd2 = ccm ~mod- cb6 |
| cd3 = ccp ~mod- cb2 |
| |
| // Odd rows. |
| |
| bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x08]) ~mod* (this.quant_tables[args.q][0x08] as base.u32) |
| bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x18]) ~mod* (this.quant_tables[args.q][0x18] as base.u32) |
| bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x28]) ~mod* (this.quant_tables[args.q][0x28] as base.u32) |
| bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x38]) ~mod* (this.quant_tables[args.q][0x38] as base.u32) |
| |
| ci51 = bq5 ~mod+ bq1 |
| ci53 = bq5 ~mod+ bq3 |
| ci71 = bq7 ~mod+ bq1 |
| ci73 = bq7 ~mod+ bq3 |
| |
| // This code... |
| cj = (ci73 ~mod+ ci51) ~mod* 0x0000_25A1 |
| ck1 = bq1 ~mod* 0x0000_300B |
| ck3 = bq3 ~mod* 0x0000_6254 |
| ck5 = bq5 ~mod* 0x0000_41B3 |
| ck7 = bq7 ~mod* 0x0000_098E |
| ci51 ~mod*= 0xFFFF_F384 |
| ci53 ~mod*= 0xFFFF_ADFD |
| ci71 ~mod*= 0xFFFF_E333 |
| ci73 ~mod*= 0xFFFF_C13B |
| cl51 = ci51 ~mod+ cj |
| cl73 = ci73 ~mod+ cj |
| ck1 ~mod+= ci71 ~mod+ cl51 |
| ck3 ~mod+= ci53 ~mod+ cl73 |
| ck5 ~mod+= ci53 ~mod+ cl51 |
| ck7 ~mod+= ci71 ~mod+ cl73 |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // cl73 = (ci73 ~mod* 0xFFFF_E6DC) ~mod+ (ci51 ~mod* 0x0000_25A1) |
| // cl51 = (ci73 ~mod* 0x0000_25A1) ~mod+ (ci51 ~mod* 0x0000_1925) |
| // ck1 = cl51 ~mod+ ((bq1 ~mod* 0x0000_133E) ~mod+ (bq7 ~mod* 0xFFFF_E333)) |
| // ck3 = cl73 ~mod+ ((bq3 ~mod* 0x0000_1051) ~mod+ (bq5 ~mod* 0xFFFF_ADFD)) |
| // ck5 = cl51 ~mod+ ((bq3 ~mod* 0xFFFF_ADFD) ~mod+ (bq5 ~mod* 0xFFFF_EFB0)) |
| // ck7 = cl73 ~mod+ ((bq1 ~mod* 0xFFFF_E333) ~mod+ (bq7 ~mod* 0xFFFF_ECC1)) |
| |
| // Combine rows. |
| |
| intermediate[0x00] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11) |
| intermediate[0x38] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11) |
| intermediate[0x08] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11) |
| intermediate[0x30] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11) |
| intermediate[0x10] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11) |
| intermediate[0x28] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11) |
| intermediate[0x18] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11) |
| intermediate[0x20] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11) |
| } |
| |
| // ==== First pass, column 1. |
| |
| if (0 == ( |
| this.mcu_blocks[0][0x09] | |
| this.mcu_blocks[0][0x11] | |
| this.mcu_blocks[0][0x19] | |
| this.mcu_blocks[0][0x21] | |
| this.mcu_blocks[0][0x29] | |
| this.mcu_blocks[0][0x31] | |
| this.mcu_blocks[0][0x39])) { |
| // Fast path when the 1-dimensional AC terms are all zero. |
| |
| intermediate[0x01] = |
| (this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x01]) ~mod* |
| (this.quant_tables[args.q][0x01] as base.u32)) ~mod<< 2 |
| intermediate[0x09] = intermediate[0x01] |
| intermediate[0x11] = intermediate[0x01] |
| intermediate[0x19] = intermediate[0x01] |
| intermediate[0x21] = intermediate[0x01] |
| intermediate[0x29] = intermediate[0x01] |
| intermediate[0x31] = intermediate[0x01] |
| intermediate[0x39] = intermediate[0x01] |
| |
| } else { |
| // Even rows. |
| |
| bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x11]) ~mod* (this.quant_tables[args.q][0x11] as base.u32) |
| bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x31]) ~mod* (this.quant_tables[args.q][0x31] as base.u32) |
| |
| // This code... |
| ca = (bq2 ~mod+ bq6) ~mod* 0x0000_1151 |
| cb2 = ca ~mod+ (bq2 ~mod* 0x0000_187E) |
| cb6 = ca ~mod- (bq6 ~mod* 0x0000_3B21) |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // cb2 = (bq2 ~mod* 0x0000_29CF) ~mod+ (bq6 ~mod* 0x0000_1151) |
| // cb6 = (bq2 ~mod* 0x0000_1151) ~mod+ (bq6 ~mod* 0xFFFF_D630) |
| |
| bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x01]) ~mod* (this.quant_tables[args.q][0x01] as base.u32) |
| bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x21]) ~mod* (this.quant_tables[args.q][0x21] as base.u32) |
| |
| ccp = (bq0 ~mod+ bq4) ~mod<< 13 |
| ccm = (bq0 ~mod- bq4) ~mod<< 13 |
| |
| cd0 = ccp ~mod+ cb2 |
| cd1 = ccm ~mod+ cb6 |
| cd2 = ccm ~mod- cb6 |
| cd3 = ccp ~mod- cb2 |
| |
| // Odd rows. |
| |
| bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x09]) ~mod* (this.quant_tables[args.q][0x09] as base.u32) |
| bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x19]) ~mod* (this.quant_tables[args.q][0x19] as base.u32) |
| bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x29]) ~mod* (this.quant_tables[args.q][0x29] as base.u32) |
| bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x39]) ~mod* (this.quant_tables[args.q][0x39] as base.u32) |
| |
| ci51 = bq5 ~mod+ bq1 |
| ci53 = bq5 ~mod+ bq3 |
| ci71 = bq7 ~mod+ bq1 |
| ci73 = bq7 ~mod+ bq3 |
| |
| // This code... |
| cj = (ci73 ~mod+ ci51) ~mod* 0x0000_25A1 |
| ck1 = bq1 ~mod* 0x0000_300B |
| ck3 = bq3 ~mod* 0x0000_6254 |
| ck5 = bq5 ~mod* 0x0000_41B3 |
| ck7 = bq7 ~mod* 0x0000_098E |
| ci51 ~mod*= 0xFFFF_F384 |
| ci53 ~mod*= 0xFFFF_ADFD |
| ci71 ~mod*= 0xFFFF_E333 |
| ci73 ~mod*= 0xFFFF_C13B |
| cl51 = ci51 ~mod+ cj |
| cl73 = ci73 ~mod+ cj |
| ck1 ~mod+= ci71 ~mod+ cl51 |
| ck3 ~mod+= ci53 ~mod+ cl73 |
| ck5 ~mod+= ci53 ~mod+ cl51 |
| ck7 ~mod+= ci71 ~mod+ cl73 |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // cl73 = (ci73 ~mod* 0xFFFF_E6DC) ~mod+ (ci51 ~mod* 0x0000_25A1) |
| // cl51 = (ci73 ~mod* 0x0000_25A1) ~mod+ (ci51 ~mod* 0x0000_1925) |
| // ck1 = cl51 ~mod+ ((bq1 ~mod* 0x0000_133E) ~mod+ (bq7 ~mod* 0xFFFF_E333)) |
| // ck3 = cl73 ~mod+ ((bq3 ~mod* 0x0000_1051) ~mod+ (bq5 ~mod* 0xFFFF_ADFD)) |
| // ck5 = cl51 ~mod+ ((bq3 ~mod* 0xFFFF_ADFD) ~mod+ (bq5 ~mod* 0xFFFF_EFB0)) |
| // ck7 = cl73 ~mod+ ((bq1 ~mod* 0xFFFF_E333) ~mod+ (bq7 ~mod* 0xFFFF_ECC1)) |
| |
| // Combine rows. |
| |
| intermediate[0x01] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11) |
| intermediate[0x39] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11) |
| intermediate[0x09] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11) |
| intermediate[0x31] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11) |
| intermediate[0x11] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11) |
| intermediate[0x29] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11) |
| intermediate[0x19] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11) |
| intermediate[0x21] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11) |
| } |
| |
| // ==== First pass, column 2. |
| |
| if (0 == ( |
| this.mcu_blocks[0][0x0A] | |
| this.mcu_blocks[0][0x12] | |
| this.mcu_blocks[0][0x1A] | |
| this.mcu_blocks[0][0x22] | |
| this.mcu_blocks[0][0x2A] | |
| this.mcu_blocks[0][0x32] | |
| this.mcu_blocks[0][0x3A])) { |
| // Fast path when the 1-dimensional AC terms are all zero. |
| |
| intermediate[0x02] = |
| (this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x02]) ~mod* |
| (this.quant_tables[args.q][0x02] as base.u32)) ~mod<< 2 |
| intermediate[0x0A] = intermediate[0x02] |
| intermediate[0x12] = intermediate[0x02] |
| intermediate[0x1A] = intermediate[0x02] |
| intermediate[0x22] = intermediate[0x02] |
| intermediate[0x2A] = intermediate[0x02] |
| intermediate[0x32] = intermediate[0x02] |
| intermediate[0x3A] = intermediate[0x02] |
| |
| } else { |
| // Even rows. |
| |
| bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x12]) ~mod* (this.quant_tables[args.q][0x12] as base.u32) |
| bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x32]) ~mod* (this.quant_tables[args.q][0x32] as base.u32) |
| |
| // This code... |
| ca = (bq2 ~mod+ bq6) ~mod* 0x0000_1151 |
| cb2 = ca ~mod+ (bq2 ~mod* 0x0000_187E) |
| cb6 = ca ~mod- (bq6 ~mod* 0x0000_3B21) |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // cb2 = (bq2 ~mod* 0x0000_29CF) ~mod+ (bq6 ~mod* 0x0000_1151) |
| // cb6 = (bq2 ~mod* 0x0000_1151) ~mod+ (bq6 ~mod* 0xFFFF_D630) |
| |
| bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x02]) ~mod* (this.quant_tables[args.q][0x02] as base.u32) |
| bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x22]) ~mod* (this.quant_tables[args.q][0x22] as base.u32) |
| |
| ccp = (bq0 ~mod+ bq4) ~mod<< 13 |
| ccm = (bq0 ~mod- bq4) ~mod<< 13 |
| |
| cd0 = ccp ~mod+ cb2 |
| cd1 = ccm ~mod+ cb6 |
| cd2 = ccm ~mod- cb6 |
| cd3 = ccp ~mod- cb2 |
| |
| // Odd rows. |
| |
| bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x0A]) ~mod* (this.quant_tables[args.q][0x0A] as base.u32) |
| bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x1A]) ~mod* (this.quant_tables[args.q][0x1A] as base.u32) |
| bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x2A]) ~mod* (this.quant_tables[args.q][0x2A] as base.u32) |
| bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x3A]) ~mod* (this.quant_tables[args.q][0x3A] as base.u32) |
| |
| ci51 = bq5 ~mod+ bq1 |
| ci53 = bq5 ~mod+ bq3 |
| ci71 = bq7 ~mod+ bq1 |
| ci73 = bq7 ~mod+ bq3 |
| |
| // This code... |
| cj = (ci73 ~mod+ ci51) ~mod* 0x0000_25A1 |
| ck1 = bq1 ~mod* 0x0000_300B |
| ck3 = bq3 ~mod* 0x0000_6254 |
| ck5 = bq5 ~mod* 0x0000_41B3 |
| ck7 = bq7 ~mod* 0x0000_098E |
| ci51 ~mod*= 0xFFFF_F384 |
| ci53 ~mod*= 0xFFFF_ADFD |
| ci71 ~mod*= 0xFFFF_E333 |
| ci73 ~mod*= 0xFFFF_C13B |
| cl51 = ci51 ~mod+ cj |
| cl73 = ci73 ~mod+ cj |
| ck1 ~mod+= ci71 ~mod+ cl51 |
| ck3 ~mod+= ci53 ~mod+ cl73 |
| ck5 ~mod+= ci53 ~mod+ cl51 |
| ck7 ~mod+= ci71 ~mod+ cl73 |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // cl73 = (ci73 ~mod* 0xFFFF_E6DC) ~mod+ (ci51 ~mod* 0x0000_25A1) |
| // cl51 = (ci73 ~mod* 0x0000_25A1) ~mod+ (ci51 ~mod* 0x0000_1925) |
| // ck1 = cl51 ~mod+ ((bq1 ~mod* 0x0000_133E) ~mod+ (bq7 ~mod* 0xFFFF_E333)) |
| // ck3 = cl73 ~mod+ ((bq3 ~mod* 0x0000_1051) ~mod+ (bq5 ~mod* 0xFFFF_ADFD)) |
| // ck5 = cl51 ~mod+ ((bq3 ~mod* 0xFFFF_ADFD) ~mod+ (bq5 ~mod* 0xFFFF_EFB0)) |
| // ck7 = cl73 ~mod+ ((bq1 ~mod* 0xFFFF_E333) ~mod+ (bq7 ~mod* 0xFFFF_ECC1)) |
| |
| // Combine rows. |
| |
| intermediate[0x02] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11) |
| intermediate[0x3A] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11) |
| intermediate[0x0A] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11) |
| intermediate[0x32] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11) |
| intermediate[0x12] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11) |
| intermediate[0x2A] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11) |
| intermediate[0x1A] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11) |
| intermediate[0x22] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11) |
| } |
| |
| // ==== First pass, column 3. |
| |
| if (0 == ( |
| this.mcu_blocks[0][0x0B] | |
| this.mcu_blocks[0][0x13] | |
| this.mcu_blocks[0][0x1B] | |
| this.mcu_blocks[0][0x23] | |
| this.mcu_blocks[0][0x2B] | |
| this.mcu_blocks[0][0x33] | |
| this.mcu_blocks[0][0x3B])) { |
| // Fast path when the 1-dimensional AC terms are all zero. |
| |
| intermediate[0x03] = |
| (this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x03]) ~mod* |
| (this.quant_tables[args.q][0x03] as base.u32)) ~mod<< 2 |
| intermediate[0x0B] = intermediate[0x03] |
| intermediate[0x13] = intermediate[0x03] |
| intermediate[0x1B] = intermediate[0x03] |
| intermediate[0x23] = intermediate[0x03] |
| intermediate[0x2B] = intermediate[0x03] |
| intermediate[0x33] = intermediate[0x03] |
| intermediate[0x3B] = intermediate[0x03] |
| |
| } else { |
| // Even rows. |
| |
| bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x13]) ~mod* (this.quant_tables[args.q][0x13] as base.u32) |
| bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x33]) ~mod* (this.quant_tables[args.q][0x33] as base.u32) |
| |
| // This code... |
| ca = (bq2 ~mod+ bq6) ~mod* 0x0000_1151 |
| cb2 = ca ~mod+ (bq2 ~mod* 0x0000_187E) |
| cb6 = ca ~mod- (bq6 ~mod* 0x0000_3B21) |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // cb2 = (bq2 ~mod* 0x0000_29CF) ~mod+ (bq6 ~mod* 0x0000_1151) |
| // cb6 = (bq2 ~mod* 0x0000_1151) ~mod+ (bq6 ~mod* 0xFFFF_D630) |
| |
| bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x03]) ~mod* (this.quant_tables[args.q][0x03] as base.u32) |
| bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x23]) ~mod* (this.quant_tables[args.q][0x23] as base.u32) |
| |
| ccp = (bq0 ~mod+ bq4) ~mod<< 13 |
| ccm = (bq0 ~mod- bq4) ~mod<< 13 |
| |
| cd0 = ccp ~mod+ cb2 |
| cd1 = ccm ~mod+ cb6 |
| cd2 = ccm ~mod- cb6 |
| cd3 = ccp ~mod- cb2 |
| |
| // Odd rows. |
| |
| bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x0B]) ~mod* (this.quant_tables[args.q][0x0B] as base.u32) |
| bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x1B]) ~mod* (this.quant_tables[args.q][0x1B] as base.u32) |
| bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x2B]) ~mod* (this.quant_tables[args.q][0x2B] as base.u32) |
| bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x3B]) ~mod* (this.quant_tables[args.q][0x3B] as base.u32) |
| |
| ci51 = bq5 ~mod+ bq1 |
| ci53 = bq5 ~mod+ bq3 |
| ci71 = bq7 ~mod+ bq1 |
| ci73 = bq7 ~mod+ bq3 |
| |
| // This code... |
| cj = (ci73 ~mod+ ci51) ~mod* 0x0000_25A1 |
| ck1 = bq1 ~mod* 0x0000_300B |
| ck3 = bq3 ~mod* 0x0000_6254 |
| ck5 = bq5 ~mod* 0x0000_41B3 |
| ck7 = bq7 ~mod* 0x0000_098E |
| ci51 ~mod*= 0xFFFF_F384 |
| ci53 ~mod*= 0xFFFF_ADFD |
| ci71 ~mod*= 0xFFFF_E333 |
| ci73 ~mod*= 0xFFFF_C13B |
| cl51 = ci51 ~mod+ cj |
| cl73 = ci73 ~mod+ cj |
| ck1 ~mod+= ci71 ~mod+ cl51 |
| ck3 ~mod+= ci53 ~mod+ cl73 |
| ck5 ~mod+= ci53 ~mod+ cl51 |
| ck7 ~mod+= ci71 ~mod+ cl73 |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // cl73 = (ci73 ~mod* 0xFFFF_E6DC) ~mod+ (ci51 ~mod* 0x0000_25A1) |
| // cl51 = (ci73 ~mod* 0x0000_25A1) ~mod+ (ci51 ~mod* 0x0000_1925) |
| // ck1 = cl51 ~mod+ ((bq1 ~mod* 0x0000_133E) ~mod+ (bq7 ~mod* 0xFFFF_E333)) |
| // ck3 = cl73 ~mod+ ((bq3 ~mod* 0x0000_1051) ~mod+ (bq5 ~mod* 0xFFFF_ADFD)) |
| // ck5 = cl51 ~mod+ ((bq3 ~mod* 0xFFFF_ADFD) ~mod+ (bq5 ~mod* 0xFFFF_EFB0)) |
| // ck7 = cl73 ~mod+ ((bq1 ~mod* 0xFFFF_E333) ~mod+ (bq7 ~mod* 0xFFFF_ECC1)) |
| |
| // Combine rows. |
| |
| intermediate[0x03] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11) |
| intermediate[0x3B] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11) |
| intermediate[0x0B] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11) |
| intermediate[0x33] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11) |
| intermediate[0x13] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11) |
| intermediate[0x2B] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11) |
| intermediate[0x1B] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11) |
| intermediate[0x23] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11) |
| } |
| |
| // ==== First pass, column 4. |
| |
| if (0 == ( |
| this.mcu_blocks[0][0x0C] | |
| this.mcu_blocks[0][0x14] | |
| this.mcu_blocks[0][0x1C] | |
| this.mcu_blocks[0][0x24] | |
| this.mcu_blocks[0][0x2C] | |
| this.mcu_blocks[0][0x34] | |
| this.mcu_blocks[0][0x3C])) { |
| // Fast path when the 1-dimensional AC terms are all zero. |
| |
| intermediate[0x04] = |
| (this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x04]) ~mod* |
| (this.quant_tables[args.q][0x04] as base.u32)) ~mod<< 2 |
| intermediate[0x0C] = intermediate[0x04] |
| intermediate[0x14] = intermediate[0x04] |
| intermediate[0x1C] = intermediate[0x04] |
| intermediate[0x24] = intermediate[0x04] |
| intermediate[0x2C] = intermediate[0x04] |
| intermediate[0x34] = intermediate[0x04] |
| intermediate[0x3C] = intermediate[0x04] |
| |
| } else { |
| // Even rows. |
| |
| bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x14]) ~mod* (this.quant_tables[args.q][0x14] as base.u32) |
| bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x34]) ~mod* (this.quant_tables[args.q][0x34] as base.u32) |
| |
| // This code... |
| ca = (bq2 ~mod+ bq6) ~mod* 0x0000_1151 |
| cb2 = ca ~mod+ (bq2 ~mod* 0x0000_187E) |
| cb6 = ca ~mod- (bq6 ~mod* 0x0000_3B21) |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // cb2 = (bq2 ~mod* 0x0000_29CF) ~mod+ (bq6 ~mod* 0x0000_1151) |
| // cb6 = (bq2 ~mod* 0x0000_1151) ~mod+ (bq6 ~mod* 0xFFFF_D630) |
| |
| bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x04]) ~mod* (this.quant_tables[args.q][0x04] as base.u32) |
| bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x24]) ~mod* (this.quant_tables[args.q][0x24] as base.u32) |
| |
| ccp = (bq0 ~mod+ bq4) ~mod<< 13 |
| ccm = (bq0 ~mod- bq4) ~mod<< 13 |
| |
| cd0 = ccp ~mod+ cb2 |
| cd1 = ccm ~mod+ cb6 |
| cd2 = ccm ~mod- cb6 |
| cd3 = ccp ~mod- cb2 |
| |
| // Odd rows. |
| |
| bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x0C]) ~mod* (this.quant_tables[args.q][0x0C] as base.u32) |
| bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x1C]) ~mod* (this.quant_tables[args.q][0x1C] as base.u32) |
| bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x2C]) ~mod* (this.quant_tables[args.q][0x2C] as base.u32) |
| bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x3C]) ~mod* (this.quant_tables[args.q][0x3C] as base.u32) |
| |
| ci51 = bq5 ~mod+ bq1 |
| ci53 = bq5 ~mod+ bq3 |
| ci71 = bq7 ~mod+ bq1 |
| ci73 = bq7 ~mod+ bq3 |
| |
| // This code... |
| cj = (ci73 ~mod+ ci51) ~mod* 0x0000_25A1 |
| ck1 = bq1 ~mod* 0x0000_300B |
| ck3 = bq3 ~mod* 0x0000_6254 |
| ck5 = bq5 ~mod* 0x0000_41B3 |
| ck7 = bq7 ~mod* 0x0000_098E |
| ci51 ~mod*= 0xFFFF_F384 |
| ci53 ~mod*= 0xFFFF_ADFD |
| ci71 ~mod*= 0xFFFF_E333 |
| ci73 ~mod*= 0xFFFF_C13B |
| cl51 = ci51 ~mod+ cj |
| cl73 = ci73 ~mod+ cj |
| ck1 ~mod+= ci71 ~mod+ cl51 |
| ck3 ~mod+= ci53 ~mod+ cl73 |
| ck5 ~mod+= ci53 ~mod+ cl51 |
| ck7 ~mod+= ci71 ~mod+ cl73 |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // cl73 = (ci73 ~mod* 0xFFFF_E6DC) ~mod+ (ci51 ~mod* 0x0000_25A1) |
| // cl51 = (ci73 ~mod* 0x0000_25A1) ~mod+ (ci51 ~mod* 0x0000_1925) |
| // ck1 = cl51 ~mod+ ((bq1 ~mod* 0x0000_133E) ~mod+ (bq7 ~mod* 0xFFFF_E333)) |
| // ck3 = cl73 ~mod+ ((bq3 ~mod* 0x0000_1051) ~mod+ (bq5 ~mod* 0xFFFF_ADFD)) |
| // ck5 = cl51 ~mod+ ((bq3 ~mod* 0xFFFF_ADFD) ~mod+ (bq5 ~mod* 0xFFFF_EFB0)) |
| // ck7 = cl73 ~mod+ ((bq1 ~mod* 0xFFFF_E333) ~mod+ (bq7 ~mod* 0xFFFF_ECC1)) |
| |
| // Combine rows. |
| |
| intermediate[0x04] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11) |
| intermediate[0x3C] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11) |
| intermediate[0x0C] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11) |
| intermediate[0x34] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11) |
| intermediate[0x14] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11) |
| intermediate[0x2C] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11) |
| intermediate[0x1C] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11) |
| intermediate[0x24] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11) |
| } |
| |
| // ==== First pass, column 5. |
| |
| if (0 == ( |
| this.mcu_blocks[0][0x0D] | |
| this.mcu_blocks[0][0x15] | |
| this.mcu_blocks[0][0x1D] | |
| this.mcu_blocks[0][0x25] | |
| this.mcu_blocks[0][0x2D] | |
| this.mcu_blocks[0][0x35] | |
| this.mcu_blocks[0][0x3D])) { |
| // Fast path when the 1-dimensional AC terms are all zero. |
| |
| intermediate[0x05] = |
| (this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x05]) ~mod* |
| (this.quant_tables[args.q][0x05] as base.u32)) ~mod<< 2 |
| intermediate[0x0D] = intermediate[0x05] |
| intermediate[0x15] = intermediate[0x05] |
| intermediate[0x1D] = intermediate[0x05] |
| intermediate[0x25] = intermediate[0x05] |
| intermediate[0x2D] = intermediate[0x05] |
| intermediate[0x35] = intermediate[0x05] |
| intermediate[0x3D] = intermediate[0x05] |
| |
| } else { |
| // Even rows. |
| |
| bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x15]) ~mod* (this.quant_tables[args.q][0x15] as base.u32) |
| bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x35]) ~mod* (this.quant_tables[args.q][0x35] as base.u32) |
| |
| // This code... |
| ca = (bq2 ~mod+ bq6) ~mod* 0x0000_1151 |
| cb2 = ca ~mod+ (bq2 ~mod* 0x0000_187E) |
| cb6 = ca ~mod- (bq6 ~mod* 0x0000_3B21) |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // cb2 = (bq2 ~mod* 0x0000_29CF) ~mod+ (bq6 ~mod* 0x0000_1151) |
| // cb6 = (bq2 ~mod* 0x0000_1151) ~mod+ (bq6 ~mod* 0xFFFF_D630) |
| |
| bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x05]) ~mod* (this.quant_tables[args.q][0x05] as base.u32) |
| bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x25]) ~mod* (this.quant_tables[args.q][0x25] as base.u32) |
| |
| ccp = (bq0 ~mod+ bq4) ~mod<< 13 |
| ccm = (bq0 ~mod- bq4) ~mod<< 13 |
| |
| cd0 = ccp ~mod+ cb2 |
| cd1 = ccm ~mod+ cb6 |
| cd2 = ccm ~mod- cb6 |
| cd3 = ccp ~mod- cb2 |
| |
| // Odd rows. |
| |
| bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x0D]) ~mod* (this.quant_tables[args.q][0x0D] as base.u32) |
| bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x1D]) ~mod* (this.quant_tables[args.q][0x1D] as base.u32) |
| bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x2D]) ~mod* (this.quant_tables[args.q][0x2D] as base.u32) |
| bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x3D]) ~mod* (this.quant_tables[args.q][0x3D] as base.u32) |
| |
| ci51 = bq5 ~mod+ bq1 |
| ci53 = bq5 ~mod+ bq3 |
| ci71 = bq7 ~mod+ bq1 |
| ci73 = bq7 ~mod+ bq3 |
| |
| // This code... |
| cj = (ci73 ~mod+ ci51) ~mod* 0x0000_25A1 |
| ck1 = bq1 ~mod* 0x0000_300B |
| ck3 = bq3 ~mod* 0x0000_6254 |
| ck5 = bq5 ~mod* 0x0000_41B3 |
| ck7 = bq7 ~mod* 0x0000_098E |
| ci51 ~mod*= 0xFFFF_F384 |
| ci53 ~mod*= 0xFFFF_ADFD |
| ci71 ~mod*= 0xFFFF_E333 |
| ci73 ~mod*= 0xFFFF_C13B |
| cl51 = ci51 ~mod+ cj |
| cl73 = ci73 ~mod+ cj |
| ck1 ~mod+= ci71 ~mod+ cl51 |
| ck3 ~mod+= ci53 ~mod+ cl73 |
| ck5 ~mod+= ci53 ~mod+ cl51 |
| ck7 ~mod+= ci71 ~mod+ cl73 |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // cl73 = (ci73 ~mod* 0xFFFF_E6DC) ~mod+ (ci51 ~mod* 0x0000_25A1) |
| // cl51 = (ci73 ~mod* 0x0000_25A1) ~mod+ (ci51 ~mod* 0x0000_1925) |
| // ck1 = cl51 ~mod+ ((bq1 ~mod* 0x0000_133E) ~mod+ (bq7 ~mod* 0xFFFF_E333)) |
| // ck3 = cl73 ~mod+ ((bq3 ~mod* 0x0000_1051) ~mod+ (bq5 ~mod* 0xFFFF_ADFD)) |
| // ck5 = cl51 ~mod+ ((bq3 ~mod* 0xFFFF_ADFD) ~mod+ (bq5 ~mod* 0xFFFF_EFB0)) |
| // ck7 = cl73 ~mod+ ((bq1 ~mod* 0xFFFF_E333) ~mod+ (bq7 ~mod* 0xFFFF_ECC1)) |
| |
| // Combine rows. |
| |
| intermediate[0x05] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11) |
| intermediate[0x3D] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11) |
| intermediate[0x0D] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11) |
| intermediate[0x35] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11) |
| intermediate[0x15] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11) |
| intermediate[0x2D] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11) |
| intermediate[0x1D] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11) |
| intermediate[0x25] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11) |
| } |
| |
| // ==== First pass, column 6. |
| |
| if (0 == ( |
| this.mcu_blocks[0][0x0E] | |
| this.mcu_blocks[0][0x16] | |
| this.mcu_blocks[0][0x1E] | |
| this.mcu_blocks[0][0x26] | |
| this.mcu_blocks[0][0x2E] | |
| this.mcu_blocks[0][0x36] | |
| this.mcu_blocks[0][0x3E])) { |
| // Fast path when the 1-dimensional AC terms are all zero. |
| |
| intermediate[0x06] = |
| (this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x06]) ~mod* |
| (this.quant_tables[args.q][0x06] as base.u32)) ~mod<< 2 |
| intermediate[0x0E] = intermediate[0x06] |
| intermediate[0x16] = intermediate[0x06] |
| intermediate[0x1E] = intermediate[0x06] |
| intermediate[0x26] = intermediate[0x06] |
| intermediate[0x2E] = intermediate[0x06] |
| intermediate[0x36] = intermediate[0x06] |
| intermediate[0x3E] = intermediate[0x06] |
| |
| } else { |
| // Even rows. |
| |
| bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x16]) ~mod* (this.quant_tables[args.q][0x16] as base.u32) |
| bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x36]) ~mod* (this.quant_tables[args.q][0x36] as base.u32) |
| |
| // This code... |
| ca = (bq2 ~mod+ bq6) ~mod* 0x0000_1151 |
| cb2 = ca ~mod+ (bq2 ~mod* 0x0000_187E) |
| cb6 = ca ~mod- (bq6 ~mod* 0x0000_3B21) |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // cb2 = (bq2 ~mod* 0x0000_29CF) ~mod+ (bq6 ~mod* 0x0000_1151) |
| // cb6 = (bq2 ~mod* 0x0000_1151) ~mod+ (bq6 ~mod* 0xFFFF_D630) |
| |
| bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x06]) ~mod* (this.quant_tables[args.q][0x06] as base.u32) |
| bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x26]) ~mod* (this.quant_tables[args.q][0x26] as base.u32) |
| |
| ccp = (bq0 ~mod+ bq4) ~mod<< 13 |
| ccm = (bq0 ~mod- bq4) ~mod<< 13 |
| |
| cd0 = ccp ~mod+ cb2 |
| cd1 = ccm ~mod+ cb6 |
| cd2 = ccm ~mod- cb6 |
| cd3 = ccp ~mod- cb2 |
| |
| // Odd rows. |
| |
| bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x0E]) ~mod* (this.quant_tables[args.q][0x0E] as base.u32) |
| bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x1E]) ~mod* (this.quant_tables[args.q][0x1E] as base.u32) |
| bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x2E]) ~mod* (this.quant_tables[args.q][0x2E] as base.u32) |
| bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x3E]) ~mod* (this.quant_tables[args.q][0x3E] as base.u32) |
| |
| ci51 = bq5 ~mod+ bq1 |
| ci53 = bq5 ~mod+ bq3 |
| ci71 = bq7 ~mod+ bq1 |
| ci73 = bq7 ~mod+ bq3 |
| |
| // This code... |
| cj = (ci73 ~mod+ ci51) ~mod* 0x0000_25A1 |
| ck1 = bq1 ~mod* 0x0000_300B |
| ck3 = bq3 ~mod* 0x0000_6254 |
| ck5 = bq5 ~mod* 0x0000_41B3 |
| ck7 = bq7 ~mod* 0x0000_098E |
| ci51 ~mod*= 0xFFFF_F384 |
| ci53 ~mod*= 0xFFFF_ADFD |
| ci71 ~mod*= 0xFFFF_E333 |
| ci73 ~mod*= 0xFFFF_C13B |
| cl51 = ci51 ~mod+ cj |
| cl73 = ci73 ~mod+ cj |
| ck1 ~mod+= ci71 ~mod+ cl51 |
| ck3 ~mod+= ci53 ~mod+ cl73 |
| ck5 ~mod+= ci53 ~mod+ cl51 |
| ck7 ~mod+= ci71 ~mod+ cl73 |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // cl73 = (ci73 ~mod* 0xFFFF_E6DC) ~mod+ (ci51 ~mod* 0x0000_25A1) |
| // cl51 = (ci73 ~mod* 0x0000_25A1) ~mod+ (ci51 ~mod* 0x0000_1925) |
| // ck1 = cl51 ~mod+ ((bq1 ~mod* 0x0000_133E) ~mod+ (bq7 ~mod* 0xFFFF_E333)) |
| // ck3 = cl73 ~mod+ ((bq3 ~mod* 0x0000_1051) ~mod+ (bq5 ~mod* 0xFFFF_ADFD)) |
| // ck5 = cl51 ~mod+ ((bq3 ~mod* 0xFFFF_ADFD) ~mod+ (bq5 ~mod* 0xFFFF_EFB0)) |
| // ck7 = cl73 ~mod+ ((bq1 ~mod* 0xFFFF_E333) ~mod+ (bq7 ~mod* 0xFFFF_ECC1)) |
| |
| // Combine rows. |
| |
| intermediate[0x06] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11) |
| intermediate[0x3E] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11) |
| intermediate[0x0E] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11) |
| intermediate[0x36] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11) |
| intermediate[0x16] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11) |
| intermediate[0x2E] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11) |
| intermediate[0x1E] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11) |
| intermediate[0x26] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11) |
| } |
| |
| // ==== First pass, column 7. |
| |
| if (0 == ( |
| this.mcu_blocks[0][0x0F] | |
| this.mcu_blocks[0][0x17] | |
| this.mcu_blocks[0][0x1F] | |
| this.mcu_blocks[0][0x27] | |
| this.mcu_blocks[0][0x2F] | |
| this.mcu_blocks[0][0x37] | |
| this.mcu_blocks[0][0x3F])) { |
| // Fast path when the 1-dimensional AC terms are all zero. |
| |
| intermediate[0x07] = |
| (this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x07]) ~mod* |
| (this.quant_tables[args.q][0x07] as base.u32)) ~mod<< 2 |
| intermediate[0x0F] = intermediate[0x07] |
| intermediate[0x17] = intermediate[0x07] |
| intermediate[0x1F] = intermediate[0x07] |
| intermediate[0x27] = intermediate[0x07] |
| intermediate[0x2F] = intermediate[0x07] |
| intermediate[0x37] = intermediate[0x07] |
| intermediate[0x3F] = intermediate[0x07] |
| |
| } else { |
| // Even rows. |
| |
| bq2 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x17]) ~mod* (this.quant_tables[args.q][0x17] as base.u32) |
| bq6 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x37]) ~mod* (this.quant_tables[args.q][0x37] as base.u32) |
| |
| // This code... |
| ca = (bq2 ~mod+ bq6) ~mod* 0x0000_1151 |
| cb2 = ca ~mod+ (bq2 ~mod* 0x0000_187E) |
| cb6 = ca ~mod- (bq6 ~mod* 0x0000_3B21) |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // cb2 = (bq2 ~mod* 0x0000_29CF) ~mod+ (bq6 ~mod* 0x0000_1151) |
| // cb6 = (bq2 ~mod* 0x0000_1151) ~mod+ (bq6 ~mod* 0xFFFF_D630) |
| |
| bq0 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x07]) ~mod* (this.quant_tables[args.q][0x07] as base.u32) |
| bq4 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x27]) ~mod* (this.quant_tables[args.q][0x27] as base.u32) |
| |
| ccp = (bq0 ~mod+ bq4) ~mod<< 13 |
| ccm = (bq0 ~mod- bq4) ~mod<< 13 |
| |
| cd0 = ccp ~mod+ cb2 |
| cd1 = ccm ~mod+ cb6 |
| cd2 = ccm ~mod- cb6 |
| cd3 = ccp ~mod- cb2 |
| |
| // Odd rows. |
| |
| bq1 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x0F]) ~mod* (this.quant_tables[args.q][0x0F] as base.u32) |
| bq3 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x1F]) ~mod* (this.quant_tables[args.q][0x1F] as base.u32) |
| bq5 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x2F]) ~mod* (this.quant_tables[args.q][0x2F] as base.u32) |
| bq7 = this.util.sign_extend_convert_u16_u32(a: this.mcu_blocks[0][0x3F]) ~mod* (this.quant_tables[args.q][0x3F] as base.u32) |
| |
| ci51 = bq5 ~mod+ bq1 |
| ci53 = bq5 ~mod+ bq3 |
| ci71 = bq7 ~mod+ bq1 |
| ci73 = bq7 ~mod+ bq3 |
| |
| // This code... |
| cj = (ci73 ~mod+ ci51) ~mod* 0x0000_25A1 |
| ck1 = bq1 ~mod* 0x0000_300B |
| ck3 = bq3 ~mod* 0x0000_6254 |
| ck5 = bq5 ~mod* 0x0000_41B3 |
| ck7 = bq7 ~mod* 0x0000_098E |
| ci51 ~mod*= 0xFFFF_F384 |
| ci53 ~mod*= 0xFFFF_ADFD |
| ci71 ~mod*= 0xFFFF_E333 |
| ci73 ~mod*= 0xFFFF_C13B |
| cl51 = ci51 ~mod+ cj |
| cl73 = ci73 ~mod+ cj |
| ck1 ~mod+= ci71 ~mod+ cl51 |
| ck3 ~mod+= ci53 ~mod+ cl73 |
| ck5 ~mod+= ci53 ~mod+ cl51 |
| ck7 ~mod+= ci71 ~mod+ cl73 |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // cl73 = (ci73 ~mod* 0xFFFF_E6DC) ~mod+ (ci51 ~mod* 0x0000_25A1) |
| // cl51 = (ci73 ~mod* 0x0000_25A1) ~mod+ (ci51 ~mod* 0x0000_1925) |
| // ck1 = cl51 ~mod+ ((bq1 ~mod* 0x0000_133E) ~mod+ (bq7 ~mod* 0xFFFF_E333)) |
| // ck3 = cl73 ~mod+ ((bq3 ~mod* 0x0000_1051) ~mod+ (bq5 ~mod* 0xFFFF_ADFD)) |
| // ck5 = cl51 ~mod+ ((bq3 ~mod* 0xFFFF_ADFD) ~mod+ (bq5 ~mod* 0xFFFF_EFB0)) |
| // ck7 = cl73 ~mod+ ((bq1 ~mod* 0xFFFF_E333) ~mod+ (bq7 ~mod* 0xFFFF_ECC1)) |
| |
| // Combine rows. |
| |
| intermediate[0x07] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod+ ck1) ~mod+ (1 << 10), n: 11) |
| intermediate[0x3F] = this.util.sign_extend_rshift_u32(a: (cd0 ~mod- ck1) ~mod+ (1 << 10), n: 11) |
| intermediate[0x0F] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod+ ck3) ~mod+ (1 << 10), n: 11) |
| intermediate[0x37] = this.util.sign_extend_rshift_u32(a: (cd1 ~mod- ck3) ~mod+ (1 << 10), n: 11) |
| intermediate[0x17] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod+ ck5) ~mod+ (1 << 10), n: 11) |
| intermediate[0x2F] = this.util.sign_extend_rshift_u32(a: (cd2 ~mod- ck5) ~mod+ (1 << 10), n: 11) |
| intermediate[0x1F] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod+ ck7) ~mod+ (1 << 10), n: 11) |
| intermediate[0x27] = this.util.sign_extend_rshift_u32(a: (cd3 ~mod- ck7) ~mod+ (1 << 10), n: 11) |
| } |
| |
| // ==== Second pass, row 0. |
| |
| if (0 == ( |
| intermediate[0x01] | |
| intermediate[0x02] | |
| intermediate[0x03] | |
| intermediate[0x04] | |
| intermediate[0x05] | |
| intermediate[0x06] | |
| intermediate[0x07])) { |
| // Fast path when the 1-dimensional AC terms are all zero. |
| |
| if args.dst_stride > args.dst_buffer.length() { |
| return nothing |
| } |
| assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride) |
| |
| args.dst_buffer[0] = BIAS_AND_CLAMP[((intermediate[0x00] ~mod+ (1 << 4)) >> 5) & 1023] |
| args.dst_buffer[1] = args.dst_buffer[0] |
| args.dst_buffer[2] = args.dst_buffer[0] |
| args.dst_buffer[3] = args.dst_buffer[0] |
| args.dst_buffer[4] = args.dst_buffer[0] |
| args.dst_buffer[5] = args.dst_buffer[0] |
| args.dst_buffer[6] = args.dst_buffer[0] |
| args.dst_buffer[7] = args.dst_buffer[0] |
| |
| args.dst_buffer = args.dst_buffer[args.dst_stride ..] |
| |
| } else { |
| // Even columns. |
| |
| in2 = intermediate[0x02] |
| in6 = intermediate[0x06] |
| |
| // This code... |
| ra = (in2 ~mod+ in6) ~mod* 0x0000_1151 |
| rb2 = ra ~mod+ (in2 ~mod* 0x0000_187E) |
| rb6 = ra ~mod- (in6 ~mod* 0x0000_3B21) |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // rb2 = (in2 ~mod* 0x0000_29CF) ~mod+ (in6 ~mod* 0x0000_1151) |
| // rb6 = (in2 ~mod* 0x0000_1151) ~mod+ (in6 ~mod* 0xFFFF_D630) |
| |
| in0 = intermediate[0x00] |
| in4 = intermediate[0x04] |
| |
| rcp = (in0 ~mod+ in4) ~mod<< 13 |
| rcm = (in0 ~mod- in4) ~mod<< 13 |
| |
| rd0 = rcp ~mod+ rb2 |
| rd1 = rcm ~mod+ rb6 |
| rd2 = rcm ~mod- rb6 |
| rd3 = rcp ~mod- rb2 |
| |
| // Odd columns. |
| |
| in1 = intermediate[0x01] |
| in3 = intermediate[0x03] |
| in5 = intermediate[0x05] |
| in7 = intermediate[0x07] |
| |
| ri51 = in5 ~mod+ in1 |
| ri53 = in5 ~mod+ in3 |
| ri71 = in7 ~mod+ in1 |
| ri73 = in7 ~mod+ in3 |
| |
| // This code... |
| rj = (ri73 ~mod+ ri51) ~mod* 0x0000_25A1 |
| rk1 = in1 ~mod* 0x0000_300B |
| rk3 = in3 ~mod* 0x0000_6254 |
| rk5 = in5 ~mod* 0x0000_41B3 |
| rk7 = in7 ~mod* 0x0000_098E |
| ri51 ~mod*= 0xFFFF_F384 |
| ri53 ~mod*= 0xFFFF_ADFD |
| ri71 ~mod*= 0xFFFF_E333 |
| ri73 ~mod*= 0xFFFF_C13B |
| rl51 = ri51 ~mod+ rj |
| rl73 = ri73 ~mod+ rj |
| rk1 ~mod+= ri71 ~mod+ rl51 |
| rk3 ~mod+= ri53 ~mod+ rl73 |
| rk5 ~mod+= ri53 ~mod+ rl51 |
| rk7 ~mod+= ri71 ~mod+ rl73 |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // rl73 = (ri73 ~mod* 0xFFFF_E6DC) ~mod+ (ri51 ~mod* 0x0000_25A1) |
| // rl51 = (ri73 ~mod* 0x0000_25A1) ~mod+ (ri51 ~mod* 0x0000_1925) |
| // rk1 = rl51 ~mod+ ((in1 ~mod* 0x0000_133E) ~mod+ (in7 ~mod* 0xFFFF_E333)) |
| // rk3 = rl73 ~mod+ ((in3 ~mod* 0x0000_1051) ~mod+ (in5 ~mod* 0xFFFF_ADFD)) |
| // rk5 = rl51 ~mod+ ((in3 ~mod* 0xFFFF_ADFD) ~mod+ (in5 ~mod* 0xFFFF_EFB0)) |
| // rk7 = rl73 ~mod+ ((in1 ~mod* 0xFFFF_E333) ~mod+ (in7 ~mod* 0xFFFF_ECC1)) |
| |
| // Combine columns. |
| |
| if args.dst_stride > args.dst_buffer.length() { |
| return nothing |
| } |
| assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride) |
| |
| args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023] |
| |
| args.dst_buffer = args.dst_buffer[args.dst_stride ..] |
| } |
| |
| // ==== Second pass, row 1. |
| |
| if (0 == ( |
| intermediate[0x09] | |
| intermediate[0x0A] | |
| intermediate[0x0B] | |
| intermediate[0x0C] | |
| intermediate[0x0D] | |
| intermediate[0x0E] | |
| intermediate[0x0F])) { |
| // Fast path when the 1-dimensional AC terms are all zero. |
| |
| if args.dst_stride > args.dst_buffer.length() { |
| return nothing |
| } |
| assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride) |
| |
| args.dst_buffer[0] = BIAS_AND_CLAMP[((intermediate[0x08] ~mod+ (1 << 4)) >> 5) & 1023] |
| args.dst_buffer[1] = args.dst_buffer[0] |
| args.dst_buffer[2] = args.dst_buffer[0] |
| args.dst_buffer[3] = args.dst_buffer[0] |
| args.dst_buffer[4] = args.dst_buffer[0] |
| args.dst_buffer[5] = args.dst_buffer[0] |
| args.dst_buffer[6] = args.dst_buffer[0] |
| args.dst_buffer[7] = args.dst_buffer[0] |
| |
| args.dst_buffer = args.dst_buffer[args.dst_stride ..] |
| |
| } else { |
| // Even columns. |
| |
| in2 = intermediate[0x0A] |
| in6 = intermediate[0x0E] |
| |
| // This code... |
| ra = (in2 ~mod+ in6) ~mod* 0x0000_1151 |
| rb2 = ra ~mod+ (in2 ~mod* 0x0000_187E) |
| rb6 = ra ~mod- (in6 ~mod* 0x0000_3B21) |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // rb2 = (in2 ~mod* 0x0000_29CF) ~mod+ (in6 ~mod* 0x0000_1151) |
| // rb6 = (in2 ~mod* 0x0000_1151) ~mod+ (in6 ~mod* 0xFFFF_D630) |
| |
| in0 = intermediate[0x08] |
| in4 = intermediate[0x0C] |
| |
| rcp = (in0 ~mod+ in4) ~mod<< 13 |
| rcm = (in0 ~mod- in4) ~mod<< 13 |
| |
| rd0 = rcp ~mod+ rb2 |
| rd1 = rcm ~mod+ rb6 |
| rd2 = rcm ~mod- rb6 |
| rd3 = rcp ~mod- rb2 |
| |
| // Odd columns. |
| |
| in1 = intermediate[0x09] |
| in3 = intermediate[0x0B] |
| in5 = intermediate[0x0D] |
| in7 = intermediate[0x0F] |
| |
| ri51 = in5 ~mod+ in1 |
| ri53 = in5 ~mod+ in3 |
| ri71 = in7 ~mod+ in1 |
| ri73 = in7 ~mod+ in3 |
| |
| // This code... |
| rj = (ri73 ~mod+ ri51) ~mod* 0x0000_25A1 |
| rk1 = in1 ~mod* 0x0000_300B |
| rk3 = in3 ~mod* 0x0000_6254 |
| rk5 = in5 ~mod* 0x0000_41B3 |
| rk7 = in7 ~mod* 0x0000_098E |
| ri51 ~mod*= 0xFFFF_F384 |
| ri53 ~mod*= 0xFFFF_ADFD |
| ri71 ~mod*= 0xFFFF_E333 |
| ri73 ~mod*= 0xFFFF_C13B |
| rl51 = ri51 ~mod+ rj |
| rl73 = ri73 ~mod+ rj |
| rk1 ~mod+= ri71 ~mod+ rl51 |
| rk3 ~mod+= ri53 ~mod+ rl73 |
| rk5 ~mod+= ri53 ~mod+ rl51 |
| rk7 ~mod+= ri71 ~mod+ rl73 |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // rl73 = (ri73 ~mod* 0xFFFF_E6DC) ~mod+ (ri51 ~mod* 0x0000_25A1) |
| // rl51 = (ri73 ~mod* 0x0000_25A1) ~mod+ (ri51 ~mod* 0x0000_1925) |
| // rk1 = rl51 ~mod+ ((in1 ~mod* 0x0000_133E) ~mod+ (in7 ~mod* 0xFFFF_E333)) |
| // rk3 = rl73 ~mod+ ((in3 ~mod* 0x0000_1051) ~mod+ (in5 ~mod* 0xFFFF_ADFD)) |
| // rk5 = rl51 ~mod+ ((in3 ~mod* 0xFFFF_ADFD) ~mod+ (in5 ~mod* 0xFFFF_EFB0)) |
| // rk7 = rl73 ~mod+ ((in1 ~mod* 0xFFFF_E333) ~mod+ (in7 ~mod* 0xFFFF_ECC1)) |
| |
| // Combine columns. |
| |
| if args.dst_stride > args.dst_buffer.length() { |
| return nothing |
| } |
| assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride) |
| |
| args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023] |
| |
| args.dst_buffer = args.dst_buffer[args.dst_stride ..] |
| } |
| |
| // ==== Second pass, row 2. |
| |
| if (0 == ( |
| intermediate[0x11] | |
| intermediate[0x12] | |
| intermediate[0x13] | |
| intermediate[0x14] | |
| intermediate[0x15] | |
| intermediate[0x16] | |
| intermediate[0x17])) { |
| // Fast path when the 1-dimensional AC terms are all zero. |
| |
| if args.dst_stride > args.dst_buffer.length() { |
| return nothing |
| } |
| assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride) |
| |
| args.dst_buffer[0] = BIAS_AND_CLAMP[((intermediate[0x10] ~mod+ (1 << 4)) >> 5) & 1023] |
| args.dst_buffer[1] = args.dst_buffer[0] |
| args.dst_buffer[2] = args.dst_buffer[0] |
| args.dst_buffer[3] = args.dst_buffer[0] |
| args.dst_buffer[4] = args.dst_buffer[0] |
| args.dst_buffer[5] = args.dst_buffer[0] |
| args.dst_buffer[6] = args.dst_buffer[0] |
| args.dst_buffer[7] = args.dst_buffer[0] |
| |
| args.dst_buffer = args.dst_buffer[args.dst_stride ..] |
| |
| } else { |
| // Even columns. |
| |
| in2 = intermediate[0x12] |
| in6 = intermediate[0x16] |
| |
| // This code... |
| ra = (in2 ~mod+ in6) ~mod* 0x0000_1151 |
| rb2 = ra ~mod+ (in2 ~mod* 0x0000_187E) |
| rb6 = ra ~mod- (in6 ~mod* 0x0000_3B21) |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // rb2 = (in2 ~mod* 0x0000_29CF) ~mod+ (in6 ~mod* 0x0000_1151) |
| // rb6 = (in2 ~mod* 0x0000_1151) ~mod+ (in6 ~mod* 0xFFFF_D630) |
| |
| in0 = intermediate[0x10] |
| in4 = intermediate[0x14] |
| |
| rcp = (in0 ~mod+ in4) ~mod<< 13 |
| rcm = (in0 ~mod- in4) ~mod<< 13 |
| |
| rd0 = rcp ~mod+ rb2 |
| rd1 = rcm ~mod+ rb6 |
| rd2 = rcm ~mod- rb6 |
| rd3 = rcp ~mod- rb2 |
| |
| // Odd columns. |
| |
| in1 = intermediate[0x11] |
| in3 = intermediate[0x13] |
| in5 = intermediate[0x15] |
| in7 = intermediate[0x17] |
| |
| ri51 = in5 ~mod+ in1 |
| ri53 = in5 ~mod+ in3 |
| ri71 = in7 ~mod+ in1 |
| ri73 = in7 ~mod+ in3 |
| |
| // This code... |
| rj = (ri73 ~mod+ ri51) ~mod* 0x0000_25A1 |
| rk1 = in1 ~mod* 0x0000_300B |
| rk3 = in3 ~mod* 0x0000_6254 |
| rk5 = in5 ~mod* 0x0000_41B3 |
| rk7 = in7 ~mod* 0x0000_098E |
| ri51 ~mod*= 0xFFFF_F384 |
| ri53 ~mod*= 0xFFFF_ADFD |
| ri71 ~mod*= 0xFFFF_E333 |
| ri73 ~mod*= 0xFFFF_C13B |
| rl51 = ri51 ~mod+ rj |
| rl73 = ri73 ~mod+ rj |
| rk1 ~mod+= ri71 ~mod+ rl51 |
| rk3 ~mod+= ri53 ~mod+ rl73 |
| rk5 ~mod+= ri53 ~mod+ rl51 |
| rk7 ~mod+= ri71 ~mod+ rl73 |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // rl73 = (ri73 ~mod* 0xFFFF_E6DC) ~mod+ (ri51 ~mod* 0x0000_25A1) |
| // rl51 = (ri73 ~mod* 0x0000_25A1) ~mod+ (ri51 ~mod* 0x0000_1925) |
| // rk1 = rl51 ~mod+ ((in1 ~mod* 0x0000_133E) ~mod+ (in7 ~mod* 0xFFFF_E333)) |
| // rk3 = rl73 ~mod+ ((in3 ~mod* 0x0000_1051) ~mod+ (in5 ~mod* 0xFFFF_ADFD)) |
| // rk5 = rl51 ~mod+ ((in3 ~mod* 0xFFFF_ADFD) ~mod+ (in5 ~mod* 0xFFFF_EFB0)) |
| // rk7 = rl73 ~mod+ ((in1 ~mod* 0xFFFF_E333) ~mod+ (in7 ~mod* 0xFFFF_ECC1)) |
| |
| // Combine columns. |
| |
| if args.dst_stride > args.dst_buffer.length() { |
| return nothing |
| } |
| assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride) |
| |
| args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023] |
| |
| args.dst_buffer = args.dst_buffer[args.dst_stride ..] |
| } |
| |
| // ==== Second pass, row 3. |
| |
| if (0 == ( |
| intermediate[0x19] | |
| intermediate[0x1A] | |
| intermediate[0x1B] | |
| intermediate[0x1C] | |
| intermediate[0x1D] | |
| intermediate[0x1E] | |
| intermediate[0x1F])) { |
| // Fast path when the 1-dimensional AC terms are all zero. |
| |
| if args.dst_stride > args.dst_buffer.length() { |
| return nothing |
| } |
| assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride) |
| |
| args.dst_buffer[0] = BIAS_AND_CLAMP[((intermediate[0x18] ~mod+ (1 << 4)) >> 5) & 1023] |
| args.dst_buffer[1] = args.dst_buffer[0] |
| args.dst_buffer[2] = args.dst_buffer[0] |
| args.dst_buffer[3] = args.dst_buffer[0] |
| args.dst_buffer[4] = args.dst_buffer[0] |
| args.dst_buffer[5] = args.dst_buffer[0] |
| args.dst_buffer[6] = args.dst_buffer[0] |
| args.dst_buffer[7] = args.dst_buffer[0] |
| |
| args.dst_buffer = args.dst_buffer[args.dst_stride ..] |
| |
| } else { |
| // Even columns. |
| |
| in2 = intermediate[0x1A] |
| in6 = intermediate[0x1E] |
| |
| // This code... |
| ra = (in2 ~mod+ in6) ~mod* 0x0000_1151 |
| rb2 = ra ~mod+ (in2 ~mod* 0x0000_187E) |
| rb6 = ra ~mod- (in6 ~mod* 0x0000_3B21) |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // rb2 = (in2 ~mod* 0x0000_29CF) ~mod+ (in6 ~mod* 0x0000_1151) |
| // rb6 = (in2 ~mod* 0x0000_1151) ~mod+ (in6 ~mod* 0xFFFF_D630) |
| |
| in0 = intermediate[0x18] |
| in4 = intermediate[0x1C] |
| |
| rcp = (in0 ~mod+ in4) ~mod<< 13 |
| rcm = (in0 ~mod- in4) ~mod<< 13 |
| |
| rd0 = rcp ~mod+ rb2 |
| rd1 = rcm ~mod+ rb6 |
| rd2 = rcm ~mod- rb6 |
| rd3 = rcp ~mod- rb2 |
| |
| // Odd columns. |
| |
| in1 = intermediate[0x19] |
| in3 = intermediate[0x1B] |
| in5 = intermediate[0x1D] |
| in7 = intermediate[0x1F] |
| |
| ri51 = in5 ~mod+ in1 |
| ri53 = in5 ~mod+ in3 |
| ri71 = in7 ~mod+ in1 |
| ri73 = in7 ~mod+ in3 |
| |
| // This code... |
| rj = (ri73 ~mod+ ri51) ~mod* 0x0000_25A1 |
| rk1 = in1 ~mod* 0x0000_300B |
| rk3 = in3 ~mod* 0x0000_6254 |
| rk5 = in5 ~mod* 0x0000_41B3 |
| rk7 = in7 ~mod* 0x0000_098E |
| ri51 ~mod*= 0xFFFF_F384 |
| ri53 ~mod*= 0xFFFF_ADFD |
| ri71 ~mod*= 0xFFFF_E333 |
| ri73 ~mod*= 0xFFFF_C13B |
| rl51 = ri51 ~mod+ rj |
| rl73 = ri73 ~mod+ rj |
| rk1 ~mod+= ri71 ~mod+ rl51 |
| rk3 ~mod+= ri53 ~mod+ rl73 |
| rk5 ~mod+= ri53 ~mod+ rl51 |
| rk7 ~mod+= ri71 ~mod+ rl73 |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // rl73 = (ri73 ~mod* 0xFFFF_E6DC) ~mod+ (ri51 ~mod* 0x0000_25A1) |
| // rl51 = (ri73 ~mod* 0x0000_25A1) ~mod+ (ri51 ~mod* 0x0000_1925) |
| // rk1 = rl51 ~mod+ ((in1 ~mod* 0x0000_133E) ~mod+ (in7 ~mod* 0xFFFF_E333)) |
| // rk3 = rl73 ~mod+ ((in3 ~mod* 0x0000_1051) ~mod+ (in5 ~mod* 0xFFFF_ADFD)) |
| // rk5 = rl51 ~mod+ ((in3 ~mod* 0xFFFF_ADFD) ~mod+ (in5 ~mod* 0xFFFF_EFB0)) |
| // rk7 = rl73 ~mod+ ((in1 ~mod* 0xFFFF_E333) ~mod+ (in7 ~mod* 0xFFFF_ECC1)) |
| |
| // Combine columns. |
| |
| if args.dst_stride > args.dst_buffer.length() { |
| return nothing |
| } |
| assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride) |
| |
| args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023] |
| |
| args.dst_buffer = args.dst_buffer[args.dst_stride ..] |
| } |
| |
| // ==== Second pass, row 4. |
| |
| if (0 == ( |
| intermediate[0x21] | |
| intermediate[0x22] | |
| intermediate[0x23] | |
| intermediate[0x24] | |
| intermediate[0x25] | |
| intermediate[0x26] | |
| intermediate[0x27])) { |
| // Fast path when the 1-dimensional AC terms are all zero. |
| |
| if args.dst_stride > args.dst_buffer.length() { |
| return nothing |
| } |
| assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride) |
| |
| args.dst_buffer[0] = BIAS_AND_CLAMP[((intermediate[0x20] ~mod+ (1 << 4)) >> 5) & 1023] |
| args.dst_buffer[1] = args.dst_buffer[0] |
| args.dst_buffer[2] = args.dst_buffer[0] |
| args.dst_buffer[3] = args.dst_buffer[0] |
| args.dst_buffer[4] = args.dst_buffer[0] |
| args.dst_buffer[5] = args.dst_buffer[0] |
| args.dst_buffer[6] = args.dst_buffer[0] |
| args.dst_buffer[7] = args.dst_buffer[0] |
| |
| args.dst_buffer = args.dst_buffer[args.dst_stride ..] |
| |
| } else { |
| // Even columns. |
| |
| in2 = intermediate[0x22] |
| in6 = intermediate[0x26] |
| |
| // This code... |
| ra = (in2 ~mod+ in6) ~mod* 0x0000_1151 |
| rb2 = ra ~mod+ (in2 ~mod* 0x0000_187E) |
| rb6 = ra ~mod- (in6 ~mod* 0x0000_3B21) |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // rb2 = (in2 ~mod* 0x0000_29CF) ~mod+ (in6 ~mod* 0x0000_1151) |
| // rb6 = (in2 ~mod* 0x0000_1151) ~mod+ (in6 ~mod* 0xFFFF_D630) |
| |
| in0 = intermediate[0x20] |
| in4 = intermediate[0x24] |
| |
| rcp = (in0 ~mod+ in4) ~mod<< 13 |
| rcm = (in0 ~mod- in4) ~mod<< 13 |
| |
| rd0 = rcp ~mod+ rb2 |
| rd1 = rcm ~mod+ rb6 |
| rd2 = rcm ~mod- rb6 |
| rd3 = rcp ~mod- rb2 |
| |
| // Odd columns. |
| |
| in1 = intermediate[0x21] |
| in3 = intermediate[0x23] |
| in5 = intermediate[0x25] |
| in7 = intermediate[0x27] |
| |
| ri51 = in5 ~mod+ in1 |
| ri53 = in5 ~mod+ in3 |
| ri71 = in7 ~mod+ in1 |
| ri73 = in7 ~mod+ in3 |
| |
| // This code... |
| rj = (ri73 ~mod+ ri51) ~mod* 0x0000_25A1 |
| rk1 = in1 ~mod* 0x0000_300B |
| rk3 = in3 ~mod* 0x0000_6254 |
| rk5 = in5 ~mod* 0x0000_41B3 |
| rk7 = in7 ~mod* 0x0000_098E |
| ri51 ~mod*= 0xFFFF_F384 |
| ri53 ~mod*= 0xFFFF_ADFD |
| ri71 ~mod*= 0xFFFF_E333 |
| ri73 ~mod*= 0xFFFF_C13B |
| rl51 = ri51 ~mod+ rj |
| rl73 = ri73 ~mod+ rj |
| rk1 ~mod+= ri71 ~mod+ rl51 |
| rk3 ~mod+= ri53 ~mod+ rl73 |
| rk5 ~mod+= ri53 ~mod+ rl51 |
| rk7 ~mod+= ri71 ~mod+ rl73 |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // rl73 = (ri73 ~mod* 0xFFFF_E6DC) ~mod+ (ri51 ~mod* 0x0000_25A1) |
| // rl51 = (ri73 ~mod* 0x0000_25A1) ~mod+ (ri51 ~mod* 0x0000_1925) |
| // rk1 = rl51 ~mod+ ((in1 ~mod* 0x0000_133E) ~mod+ (in7 ~mod* 0xFFFF_E333)) |
| // rk3 = rl73 ~mod+ ((in3 ~mod* 0x0000_1051) ~mod+ (in5 ~mod* 0xFFFF_ADFD)) |
| // rk5 = rl51 ~mod+ ((in3 ~mod* 0xFFFF_ADFD) ~mod+ (in5 ~mod* 0xFFFF_EFB0)) |
| // rk7 = rl73 ~mod+ ((in1 ~mod* 0xFFFF_E333) ~mod+ (in7 ~mod* 0xFFFF_ECC1)) |
| |
| // Combine columns. |
| |
| if args.dst_stride > args.dst_buffer.length() { |
| return nothing |
| } |
| assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride) |
| |
| args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023] |
| |
| args.dst_buffer = args.dst_buffer[args.dst_stride ..] |
| } |
| |
| // ==== Second pass, row 5. |
| |
| if (0 == ( |
| intermediate[0x29] | |
| intermediate[0x2A] | |
| intermediate[0x2B] | |
| intermediate[0x2C] | |
| intermediate[0x2D] | |
| intermediate[0x2E] | |
| intermediate[0x2F])) { |
| // Fast path when the 1-dimensional AC terms are all zero. |
| |
| if args.dst_stride > args.dst_buffer.length() { |
| return nothing |
| } |
| assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride) |
| |
| args.dst_buffer[0] = BIAS_AND_CLAMP[((intermediate[0x28] ~mod+ (1 << 4)) >> 5) & 1023] |
| args.dst_buffer[1] = args.dst_buffer[0] |
| args.dst_buffer[2] = args.dst_buffer[0] |
| args.dst_buffer[3] = args.dst_buffer[0] |
| args.dst_buffer[4] = args.dst_buffer[0] |
| args.dst_buffer[5] = args.dst_buffer[0] |
| args.dst_buffer[6] = args.dst_buffer[0] |
| args.dst_buffer[7] = args.dst_buffer[0] |
| |
| args.dst_buffer = args.dst_buffer[args.dst_stride ..] |
| |
| } else { |
| // Even columns. |
| |
| in2 = intermediate[0x2A] |
| in6 = intermediate[0x2E] |
| |
| // This code... |
| ra = (in2 ~mod+ in6) ~mod* 0x0000_1151 |
| rb2 = ra ~mod+ (in2 ~mod* 0x0000_187E) |
| rb6 = ra ~mod- (in6 ~mod* 0x0000_3B21) |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // rb2 = (in2 ~mod* 0x0000_29CF) ~mod+ (in6 ~mod* 0x0000_1151) |
| // rb6 = (in2 ~mod* 0x0000_1151) ~mod+ (in6 ~mod* 0xFFFF_D630) |
| |
| in0 = intermediate[0x28] |
| in4 = intermediate[0x2C] |
| |
| rcp = (in0 ~mod+ in4) ~mod<< 13 |
| rcm = (in0 ~mod- in4) ~mod<< 13 |
| |
| rd0 = rcp ~mod+ rb2 |
| rd1 = rcm ~mod+ rb6 |
| rd2 = rcm ~mod- rb6 |
| rd3 = rcp ~mod- rb2 |
| |
| // Odd columns. |
| |
| in1 = intermediate[0x29] |
| in3 = intermediate[0x2B] |
| in5 = intermediate[0x2D] |
| in7 = intermediate[0x2F] |
| |
| ri51 = in5 ~mod+ in1 |
| ri53 = in5 ~mod+ in3 |
| ri71 = in7 ~mod+ in1 |
| ri73 = in7 ~mod+ in3 |
| |
| // This code... |
| rj = (ri73 ~mod+ ri51) ~mod* 0x0000_25A1 |
| rk1 = in1 ~mod* 0x0000_300B |
| rk3 = in3 ~mod* 0x0000_6254 |
| rk5 = in5 ~mod* 0x0000_41B3 |
| rk7 = in7 ~mod* 0x0000_098E |
| ri51 ~mod*= 0xFFFF_F384 |
| ri53 ~mod*= 0xFFFF_ADFD |
| ri71 ~mod*= 0xFFFF_E333 |
| ri73 ~mod*= 0xFFFF_C13B |
| rl51 = ri51 ~mod+ rj |
| rl73 = ri73 ~mod+ rj |
| rk1 ~mod+= ri71 ~mod+ rl51 |
| rk3 ~mod+= ri53 ~mod+ rl73 |
| rk5 ~mod+= ri53 ~mod+ rl51 |
| rk7 ~mod+= ri71 ~mod+ rl73 |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // rl73 = (ri73 ~mod* 0xFFFF_E6DC) ~mod+ (ri51 ~mod* 0x0000_25A1) |
| // rl51 = (ri73 ~mod* 0x0000_25A1) ~mod+ (ri51 ~mod* 0x0000_1925) |
| // rk1 = rl51 ~mod+ ((in1 ~mod* 0x0000_133E) ~mod+ (in7 ~mod* 0xFFFF_E333)) |
| // rk3 = rl73 ~mod+ ((in3 ~mod* 0x0000_1051) ~mod+ (in5 ~mod* 0xFFFF_ADFD)) |
| // rk5 = rl51 ~mod+ ((in3 ~mod* 0xFFFF_ADFD) ~mod+ (in5 ~mod* 0xFFFF_EFB0)) |
| // rk7 = rl73 ~mod+ ((in1 ~mod* 0xFFFF_E333) ~mod+ (in7 ~mod* 0xFFFF_ECC1)) |
| |
| // Combine columns. |
| |
| if args.dst_stride > args.dst_buffer.length() { |
| return nothing |
| } |
| assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride) |
| |
| args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023] |
| |
| args.dst_buffer = args.dst_buffer[args.dst_stride ..] |
| } |
| |
| // ==== Second pass, row 6. |
| |
| if (0 == ( |
| intermediate[0x31] | |
| intermediate[0x32] | |
| intermediate[0x33] | |
| intermediate[0x34] | |
| intermediate[0x35] | |
| intermediate[0x36] | |
| intermediate[0x37])) { |
| // Fast path when the 1-dimensional AC terms are all zero. |
| |
| if args.dst_stride > args.dst_buffer.length() { |
| return nothing |
| } |
| assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride) |
| |
| args.dst_buffer[0] = BIAS_AND_CLAMP[((intermediate[0x30] ~mod+ (1 << 4)) >> 5) & 1023] |
| args.dst_buffer[1] = args.dst_buffer[0] |
| args.dst_buffer[2] = args.dst_buffer[0] |
| args.dst_buffer[3] = args.dst_buffer[0] |
| args.dst_buffer[4] = args.dst_buffer[0] |
| args.dst_buffer[5] = args.dst_buffer[0] |
| args.dst_buffer[6] = args.dst_buffer[0] |
| args.dst_buffer[7] = args.dst_buffer[0] |
| |
| args.dst_buffer = args.dst_buffer[args.dst_stride ..] |
| |
| } else { |
| // Even columns. |
| |
| in2 = intermediate[0x32] |
| in6 = intermediate[0x36] |
| |
| // This code... |
| ra = (in2 ~mod+ in6) ~mod* 0x0000_1151 |
| rb2 = ra ~mod+ (in2 ~mod* 0x0000_187E) |
| rb6 = ra ~mod- (in6 ~mod* 0x0000_3B21) |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // rb2 = (in2 ~mod* 0x0000_29CF) ~mod+ (in6 ~mod* 0x0000_1151) |
| // rb6 = (in2 ~mod* 0x0000_1151) ~mod+ (in6 ~mod* 0xFFFF_D630) |
| |
| in0 = intermediate[0x30] |
| in4 = intermediate[0x34] |
| |
| rcp = (in0 ~mod+ in4) ~mod<< 13 |
| rcm = (in0 ~mod- in4) ~mod<< 13 |
| |
| rd0 = rcp ~mod+ rb2 |
| rd1 = rcm ~mod+ rb6 |
| rd2 = rcm ~mod- rb6 |
| rd3 = rcp ~mod- rb2 |
| |
| // Odd columns. |
| |
| in1 = intermediate[0x31] |
| in3 = intermediate[0x33] |
| in5 = intermediate[0x35] |
| in7 = intermediate[0x37] |
| |
| ri51 = in5 ~mod+ in1 |
| ri53 = in5 ~mod+ in3 |
| ri71 = in7 ~mod+ in1 |
| ri73 = in7 ~mod+ in3 |
| |
| // This code... |
| rj = (ri73 ~mod+ ri51) ~mod* 0x0000_25A1 |
| rk1 = in1 ~mod* 0x0000_300B |
| rk3 = in3 ~mod* 0x0000_6254 |
| rk5 = in5 ~mod* 0x0000_41B3 |
| rk7 = in7 ~mod* 0x0000_098E |
| ri51 ~mod*= 0xFFFF_F384 |
| ri53 ~mod*= 0xFFFF_ADFD |
| ri71 ~mod*= 0xFFFF_E333 |
| ri73 ~mod*= 0xFFFF_C13B |
| rl51 = ri51 ~mod+ rj |
| rl73 = ri73 ~mod+ rj |
| rk1 ~mod+= ri71 ~mod+ rl51 |
| rk3 ~mod+= ri53 ~mod+ rl73 |
| rk5 ~mod+= ri53 ~mod+ rl51 |
| rk7 ~mod+= ri71 ~mod+ rl73 |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // rl73 = (ri73 ~mod* 0xFFFF_E6DC) ~mod+ (ri51 ~mod* 0x0000_25A1) |
| // rl51 = (ri73 ~mod* 0x0000_25A1) ~mod+ (ri51 ~mod* 0x0000_1925) |
| // rk1 = rl51 ~mod+ ((in1 ~mod* 0x0000_133E) ~mod+ (in7 ~mod* 0xFFFF_E333)) |
| // rk3 = rl73 ~mod+ ((in3 ~mod* 0x0000_1051) ~mod+ (in5 ~mod* 0xFFFF_ADFD)) |
| // rk5 = rl51 ~mod+ ((in3 ~mod* 0xFFFF_ADFD) ~mod+ (in5 ~mod* 0xFFFF_EFB0)) |
| // rk7 = rl73 ~mod+ ((in1 ~mod* 0xFFFF_E333) ~mod+ (in7 ~mod* 0xFFFF_ECC1)) |
| |
| // Combine columns. |
| |
| if args.dst_stride > args.dst_buffer.length() { |
| return nothing |
| } |
| assert 8 <= args.dst_buffer.length() via "a <= b: a <= c; c <= b"(c: args.dst_stride) |
| |
| args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023] |
| |
| args.dst_buffer = args.dst_buffer[args.dst_stride ..] |
| } |
| |
| // ==== Second pass, row 7. |
| |
| if (0 == ( |
| intermediate[0x39] | |
| intermediate[0x3A] | |
| intermediate[0x3B] | |
| intermediate[0x3C] | |
| intermediate[0x3D] | |
| intermediate[0x3E] | |
| intermediate[0x3F])) { |
| // Fast path when the 1-dimensional AC terms are all zero. |
| |
| if 8 > args.dst_buffer.length() { |
| return nothing |
| } |
| |
| args.dst_buffer[0] = BIAS_AND_CLAMP[((intermediate[0x38] ~mod+ (1 << 4)) >> 5) & 1023] |
| args.dst_buffer[1] = args.dst_buffer[0] |
| args.dst_buffer[2] = args.dst_buffer[0] |
| args.dst_buffer[3] = args.dst_buffer[0] |
| args.dst_buffer[4] = args.dst_buffer[0] |
| args.dst_buffer[5] = args.dst_buffer[0] |
| args.dst_buffer[6] = args.dst_buffer[0] |
| args.dst_buffer[7] = args.dst_buffer[0] |
| |
| } else { |
| // Even columns. |
| |
| in2 = intermediate[0x3A] |
| in6 = intermediate[0x3E] |
| |
| // This code... |
| ra = (in2 ~mod+ in6) ~mod* 0x0000_1151 |
| rb2 = ra ~mod+ (in2 ~mod* 0x0000_187E) |
| rb6 = ra ~mod- (in6 ~mod* 0x0000_3B21) |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // rb2 = (in2 ~mod* 0x0000_29CF) ~mod+ (in6 ~mod* 0x0000_1151) |
| // rb6 = (in2 ~mod* 0x0000_1151) ~mod+ (in6 ~mod* 0xFFFF_D630) |
| |
| in0 = intermediate[0x38] |
| in4 = intermediate[0x3C] |
| |
| rcp = (in0 ~mod+ in4) ~mod<< 13 |
| rcm = (in0 ~mod- in4) ~mod<< 13 |
| |
| rd0 = rcp ~mod+ rb2 |
| rd1 = rcm ~mod+ rb6 |
| rd2 = rcm ~mod- rb6 |
| rd3 = rcp ~mod- rb2 |
| |
| // Odd columns. |
| |
| in1 = intermediate[0x39] |
| in3 = intermediate[0x3B] |
| in5 = intermediate[0x3D] |
| in7 = intermediate[0x3F] |
| |
| ri51 = in5 ~mod+ in1 |
| ri53 = in5 ~mod+ in3 |
| ri71 = in7 ~mod+ in1 |
| ri73 = in7 ~mod+ in3 |
| |
| // This code... |
| rj = (ri73 ~mod+ ri51) ~mod* 0x0000_25A1 |
| rk1 = in1 ~mod* 0x0000_300B |
| rk3 = in3 ~mod* 0x0000_6254 |
| rk5 = in5 ~mod* 0x0000_41B3 |
| rk7 = in7 ~mod* 0x0000_098E |
| ri51 ~mod*= 0xFFFF_F384 |
| ri53 ~mod*= 0xFFFF_ADFD |
| ri71 ~mod*= 0xFFFF_E333 |
| ri73 ~mod*= 0xFFFF_C13B |
| rl51 = ri51 ~mod+ rj |
| rl73 = ri73 ~mod+ rj |
| rk1 ~mod+= ri71 ~mod+ rl51 |
| rk3 ~mod+= ri53 ~mod+ rl73 |
| rk5 ~mod+= ri53 ~mod+ rl51 |
| rk7 ~mod+= ri71 ~mod+ rl73 |
| // ...is equivalent to this more-SIMD-like code. |
| // |
| // rl73 = (ri73 ~mod* 0xFFFF_E6DC) ~mod+ (ri51 ~mod* 0x0000_25A1) |
| // rl51 = (ri73 ~mod* 0x0000_25A1) ~mod+ (ri51 ~mod* 0x0000_1925) |
| // rk1 = rl51 ~mod+ ((in1 ~mod* 0x0000_133E) ~mod+ (in7 ~mod* 0xFFFF_E333)) |
| // rk3 = rl73 ~mod+ ((in3 ~mod* 0x0000_1051) ~mod+ (in5 ~mod* 0xFFFF_ADFD)) |
| // rk5 = rl51 ~mod+ ((in3 ~mod* 0xFFFF_ADFD) ~mod+ (in5 ~mod* 0xFFFF_EFB0)) |
| // rk7 = rl73 ~mod+ ((in1 ~mod* 0xFFFF_E333) ~mod+ (in7 ~mod* 0xFFFF_ECC1)) |
| |
| // Combine columns. |
| |
| if 8 > args.dst_buffer.length() { |
| return nothing |
| } |
| |
| args.dst_buffer[0] = BIAS_AND_CLAMP[(((rd0 ~mod+ rk1) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[7] = BIAS_AND_CLAMP[(((rd0 ~mod- rk1) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[1] = BIAS_AND_CLAMP[(((rd1 ~mod+ rk3) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[6] = BIAS_AND_CLAMP[(((rd1 ~mod- rk3) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[2] = BIAS_AND_CLAMP[(((rd2 ~mod+ rk5) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[5] = BIAS_AND_CLAMP[(((rd2 ~mod- rk5) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[3] = BIAS_AND_CLAMP[(((rd3 ~mod+ rk7) ~mod+ (1 << 17)) >> 18) & 1023] |
| args.dst_buffer[4] = BIAS_AND_CLAMP[(((rd3 ~mod- rk7) ~mod+ (1 << 17)) >> 18) & 1023] |
| |
| } |
| |
| // -------- END generated by script/print-jpeg-idct-code.go |
| } |