Neon: Intrinsics impl. of int sample conv./quant.
The previous AArch32 and AArch64 GAS implementations have been removed,
since the intrinsics implementation provides the same or better
performance.
diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt
index 89653c8..23eb871 100644
--- a/simd/CMakeLists.txt
+++ b/simd/CMakeLists.txt
@@ -265,7 +265,7 @@
file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/gastest.S)
-set(SIMD_SOURCES arm/jcgray-neon.c arm/jcsample-neon.c)
+set(SIMD_SOURCES arm/jcgray-neon.c arm/jcsample-neon.c arm/jquanti-neon.c)
if(NEON_INTRINSICS)
set(SIMD_SOURCES ${SIMD_SOURCES} arm/jccolor-neon.c)
endif()
diff --git a/simd/arm/aarch32/jsimd_neon.S b/simd/arm/aarch32/jsimd_neon.S
index c85801a..ef0132a 100644
--- a/simd/arm/aarch32/jsimd_neon.S
+++ b/simd/arm/aarch32/jsimd_neon.S
@@ -1924,69 +1924,6 @@
/*****************************************************************************/
/*
- * Load data into workspace, applying unsigned->signed conversion
- *
- * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
- * rid of VST1.16 instructions
- */
-
-asm_function jsimd_convsamp_neon
- SAMPLE_DATA .req r0
- START_COL .req r1
- WORKSPACE .req r2
- TMP1 .req r3
- TMP2 .req r4
- TMP3 .req r5
- TMP4 .req ip
-
- push {r4, r5}
- vmov.u8 d0, #128
-
- ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
- add TMP1, TMP1, START_COL
- add TMP2, TMP2, START_COL
- add TMP3, TMP3, START_COL
- add TMP4, TMP4, START_COL
- vld1.8 {d16}, [TMP1]
- vsubl.u8 q8, d16, d0
- vld1.8 {d18}, [TMP2]
- vsubl.u8 q9, d18, d0
- vld1.8 {d20}, [TMP3]
- vsubl.u8 q10, d20, d0
- vld1.8 {d22}, [TMP4]
- ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
- vsubl.u8 q11, d22, d0
- vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]!
- add TMP1, TMP1, START_COL
- add TMP2, TMP2, START_COL
- vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]!
- add TMP3, TMP3, START_COL
- add TMP4, TMP4, START_COL
- vld1.8 {d24}, [TMP1]
- vsubl.u8 q12, d24, d0
- vld1.8 {d26}, [TMP2]
- vsubl.u8 q13, d26, d0
- vld1.8 {d28}, [TMP3]
- vsubl.u8 q14, d28, d0
- vld1.8 {d30}, [TMP4]
- vsubl.u8 q15, d30, d0
- vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]!
- vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]!
- pop {r4, r5}
- bx lr
-
- .unreq SAMPLE_DATA
- .unreq START_COL
- .unreq WORKSPACE
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
-
-
-/*****************************************************************************/
-
-/*
* jsimd_fdct_ifast_neon
*
* This function contains a fast, not so accurate integer implementation of
@@ -2111,107 +2048,6 @@
/*
* GLOBAL(void)
- * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
- * DCTELEM *workspace);
- *
- * Note: the code uses 2 stage pipelining in order to improve instructions
- * scheduling and eliminate stalls (this provides ~15% better
- * performance for this function on both Arm Cortex-A8 and
- * Arm Cortex-A9 when compared to the non-pipelined variant).
- * The instructions which belong to the second stage use different
- * indentation for better readiability.
- */
-asm_function jsimd_quantize_neon
-
- COEF_BLOCK .req r0
- DIVISORS .req r1
- WORKSPACE .req r2
-
- RECIPROCAL .req DIVISORS
- CORRECTION .req r3
- SHIFT .req ip
- LOOP_COUNT .req r4
-
- vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
- vabs.s16 q12, q0
- add CORRECTION, DIVISORS, #(64 * 2)
- add SHIFT, DIVISORS, #(64 * 6)
- vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
- vabs.s16 q13, q1
- vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
- vadd.u16 q12, q12, q10 /* add correction */
- vadd.u16 q13, q13, q11
- vmull.u16 q10, d24, d16 /* multiply by reciprocal */
- vmull.u16 q11, d25, d17
- vmull.u16 q8, d26, d18
- vmull.u16 q9, d27, d19
- vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
- vshrn.u32 d20, q10, #16
- vshrn.u32 d21, q11, #16
- vshrn.u32 d22, q8, #16
- vshrn.u32 d23, q9, #16
- vneg.s16 q12, q12
- vneg.s16 q13, q13
- vshr.s16 q2, q0, #15 /* extract sign */
- vshr.s16 q3, q1, #15
- vshl.u16 q14, q10, q12 /* shift */
- vshl.u16 q15, q11, q13
-
- push {r4, r5}
- mov LOOP_COUNT, #3
-1:
- vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
- veor.u16 q14, q14, q2 /* restore sign */
- vabs.s16 q12, q0
- vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
- vabs.s16 q13, q1
- veor.u16 q15, q15, q3
- vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
- vadd.u16 q12, q12, q10 /* add correction */
- vadd.u16 q13, q13, q11
- vmull.u16 q10, d24, d16 /* multiply by reciprocal */
- vmull.u16 q11, d25, d17
- vmull.u16 q8, d26, d18
- vmull.u16 q9, d27, d19
- vsub.u16 q14, q14, q2
- vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
- vsub.u16 q15, q15, q3
- vshrn.u32 d20, q10, #16
- vshrn.u32 d21, q11, #16
- vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
- vshrn.u32 d22, q8, #16
- vshrn.u32 d23, q9, #16
- vneg.s16 q12, q12
- vneg.s16 q13, q13
- vshr.s16 q2, q0, #15 /* extract sign */
- vshr.s16 q3, q1, #15
- vshl.u16 q14, q10, q12 /* shift */
- vshl.u16 q15, q11, q13
- subs LOOP_COUNT, LOOP_COUNT, #1
- bne 1b
- pop {r4, r5}
-
- veor.u16 q14, q14, q2 /* restore sign */
- veor.u16 q15, q15, q3
- vsub.u16 q14, q14, q2
- vsub.u16 q15, q15, q3
- vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
-
- bx lr /* return */
-
- .unreq COEF_BLOCK
- .unreq DIVISORS
- .unreq WORKSPACE
- .unreq RECIPROCAL
- .unreq CORRECTION
- .unreq SHIFT
- .unreq LOOP_COUNT
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(void)
* jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
* JDIMENSION downsampled_width,
* JSAMPARRAY input_data,
diff --git a/simd/arm/aarch64/jsimd_neon.S b/simd/arm/aarch64/jsimd_neon.S
index c03387b..1d732e7 100644
--- a/simd/arm/aarch64/jsimd_neon.S
+++ b/simd/arm/aarch64/jsimd_neon.S
@@ -2266,82 +2266,6 @@
/*****************************************************************************/
/*
- * Load data into workspace, applying unsigned->signed conversion
- *
- * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
- * rid of VST1.16 instructions
- */
-
-asm_function jsimd_convsamp_neon
- SAMPLE_DATA .req x0
- START_COL .req x1
- WORKSPACE .req x2
- TMP1 .req x9
- TMP2 .req x10
- TMP3 .req x11
- TMP4 .req x12
- TMP5 .req x13
- TMP6 .req x14
- TMP7 .req x15
- TMP8 .req x4
- TMPDUP .req w3
-
- /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
- guarantee that the upper (unused) 32 bits of x1 are valid. This
- instruction ensures that those bits are set to zero. */
- uxtw x1, w1
-
- mov TMPDUP, #128
- ldp TMP1, TMP2, [SAMPLE_DATA], 16
- ldp TMP3, TMP4, [SAMPLE_DATA], 16
- dup v0.8b, TMPDUP
- add TMP1, TMP1, START_COL
- add TMP2, TMP2, START_COL
- ldp TMP5, TMP6, [SAMPLE_DATA], 16
- add TMP3, TMP3, START_COL
- add TMP4, TMP4, START_COL
- ldp TMP7, TMP8, [SAMPLE_DATA], 16
- add TMP5, TMP5, START_COL
- add TMP6, TMP6, START_COL
- ld1 {v16.8b}, [TMP1]
- add TMP7, TMP7, START_COL
- add TMP8, TMP8, START_COL
- ld1 {v17.8b}, [TMP2]
- usubl v16.8h, v16.8b, v0.8b
- ld1 {v18.8b}, [TMP3]
- usubl v17.8h, v17.8b, v0.8b
- ld1 {v19.8b}, [TMP4]
- usubl v18.8h, v18.8b, v0.8b
- ld1 {v20.8b}, [TMP5]
- usubl v19.8h, v19.8b, v0.8b
- ld1 {v21.8b}, [TMP6]
- st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
- usubl v20.8h, v20.8b, v0.8b
- ld1 {v22.8b}, [TMP7]
- usubl v21.8h, v21.8b, v0.8b
- ld1 {v23.8b}, [TMP8]
- usubl v22.8h, v22.8b, v0.8b
- usubl v23.8h, v23.8b, v0.8b
- st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64
-
- br x30
-
- .unreq SAMPLE_DATA
- .unreq START_COL
- .unreq WORKSPACE
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
- .unreq TMP5
- .unreq TMP6
- .unreq TMP7
- .unreq TMP8
- .unreq TMPDUP
-
-/*****************************************************************************/
-
-/*
* jsimd_fdct_islow_neon
*
* This file contains a slower but more accurate integer implementation of the
@@ -2743,94 +2667,6 @@
/*****************************************************************************/
/*
- * GLOBAL(void)
- * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
- * DCTELEM *workspace);
- *
- */
-asm_function jsimd_quantize_neon
-
- COEF_BLOCK .req x0
- DIVISORS .req x1
- WORKSPACE .req x2
-
- RECIPROCAL .req DIVISORS
- CORRECTION .req x9
- SHIFT .req x10
- LOOP_COUNT .req x11
-
- mov LOOP_COUNT, #2
- add CORRECTION, DIVISORS, #(64 * 2)
- add SHIFT, DIVISORS, #(64 * 6)
-1:
- subs LOOP_COUNT, LOOP_COUNT, #1
- ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
- ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
- abs v20.8h, v0.8h
- abs v21.8h, v1.8h
- abs v22.8h, v2.8h
- abs v23.8h, v3.8h
- ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
- add v20.8h, v20.8h, v4.8h /* add correction */
- add v21.8h, v21.8h, v5.8h
- add v22.8h, v22.8h, v6.8h
- add v23.8h, v23.8h, v7.8h
- umull v4.4s, v20.4h, v28.4h /* multiply by reciprocal */
- umull2 v16.4s, v20.8h, v28.8h
- umull v5.4s, v21.4h, v29.4h
- umull2 v17.4s, v21.8h, v29.8h
- umull v6.4s, v22.4h, v30.4h /* multiply by reciprocal */
- umull2 v18.4s, v22.8h, v30.8h
- umull v7.4s, v23.4h, v31.4h
- umull2 v19.4s, v23.8h, v31.8h
- ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
- shrn v4.4h, v4.4s, #16
- shrn v5.4h, v5.4s, #16
- shrn v6.4h, v6.4s, #16
- shrn v7.4h, v7.4s, #16
- shrn2 v4.8h, v16.4s, #16
- shrn2 v5.8h, v17.4s, #16
- shrn2 v6.8h, v18.4s, #16
- shrn2 v7.8h, v19.4s, #16
- neg v24.8h, v24.8h
- neg v25.8h, v25.8h
- neg v26.8h, v26.8h
- neg v27.8h, v27.8h
- sshr v0.8h, v0.8h, #15 /* extract sign */
- sshr v1.8h, v1.8h, #15
- sshr v2.8h, v2.8h, #15
- sshr v3.8h, v3.8h, #15
- ushl v4.8h, v4.8h, v24.8h /* shift */
- ushl v5.8h, v5.8h, v25.8h
- ushl v6.8h, v6.8h, v26.8h
- ushl v7.8h, v7.8h, v27.8h
-
- eor v4.16b, v4.16b, v0.16b /* restore sign */
- eor v5.16b, v5.16b, v1.16b
- eor v6.16b, v6.16b, v2.16b
- eor v7.16b, v7.16b, v3.16b
- sub v4.8h, v4.8h, v0.8h
- sub v5.8h, v5.8h, v1.8h
- sub v6.8h, v6.8h, v2.8h
- sub v7.8h, v7.8h, v3.8h
- st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64
-
- b.ne 1b
-
- br x30 /* return */
-
- .unreq COEF_BLOCK
- .unreq DIVISORS
- .unreq WORKSPACE
- .unreq RECIPROCAL
- .unreq CORRECTION
- .unreq SHIFT
- .unreq LOOP_COUNT
-
-
-/*****************************************************************************/
-
-/*
* GLOBAL(JOCTET *)
* jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
* JCOEFPTR block, int last_dc_val,
diff --git a/simd/arm/jquanti-neon.c b/simd/arm/jquanti-neon.c
new file mode 100644
index 0000000..a7eb6f1
--- /dev/null
+++ b/simd/arm/jquanti-neon.c
@@ -0,0 +1,190 @@
+/*
+ * jquanti-neon.c - sample data conversion and quantization (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* After downsampling, the resulting sample values are in the range [0, 255],
+ * but the Discrete Cosine Transform (DCT) operates on values centered around
+ * 0.  To prepare sample values for the DCT, this function loads 8 samples from
+ * each of the rows sample_data[0] .. sample_data[7], starting at column
+ * start_col, and subtracts CENTERJSAMPLE (128) using a widening 8- to 16-bit
+ * unsigned subtract; reinterpreting the wrapped result as signed gives values
+ * in the range [-128, 127].  Row r is stored at workspace + r * DCTSIZE.
+ *
+ * The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
+ */
+
+void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
+                         DCTELEM *workspace)
+{
+  uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
+  uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
+  uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
+  uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
+  uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
+  uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
+  uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
+  uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
+
+  int16x8_t row0 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row1 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row2 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row3 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row4 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row5 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row6 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row7 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));
+
+  vst1q_s16(workspace + 0 * DCTSIZE, row0);
+  vst1q_s16(workspace + 1 * DCTSIZE, row1);
+  vst1q_s16(workspace + 2 * DCTSIZE, row2);
+  vst1q_s16(workspace + 3 * DCTSIZE, row3);
+  vst1q_s16(workspace + 4 * DCTSIZE, row4);
+  vst1q_s16(workspace + 5 * DCTSIZE, row5);
+  vst1q_s16(workspace + 6 * DCTSIZE, row6);
+  vst1q_s16(workspace + 7 * DCTSIZE, row7);
+}
+
+
+/* After the DCT, the resulting array of coefficient values needs to be divided
+ * by an array of quantization values.  To avoid a slow division operation,
+ * each coefficient c in workspace is instead quantized as
+ *   q = (((|c| + corr) * recip) >> 16) >> shift,  result = (c < 0) ? -q : q
+ * where recip, corr and shift are read from divisors, divisors + DCTSIZE2 and
+ * divisors + 3 * DCTSIZE2 respectively.  The result is stored to coef_block.
+ * The equivalent scalar C function quantize() can be found in jcdctmgr.c.
+ */
+
+void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
+                         DCTELEM *workspace)
+{
+  JCOEFPTR out_ptr = coef_block;
+  UDCTELEM *recip_ptr = (UDCTELEM *)divisors;
+  UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;
+  DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
+  int i;
+
+  for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
+    /* Load four rows of DCT coefficients (rows i .. i + 3). */
+    int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
+    int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
+    int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
+    int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
+    /* Load the matching reciprocals, corrections, and shift counts. */
+    uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
+    uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
+    uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
+    uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
+    uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
+    uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
+    uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
+    uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
+    int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
+    int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
+    int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
+    int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);
+
+    /* Extract sign from coefficients (all ones if negative, else zero). */
+    int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
+    int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
+    int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
+    int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
+    /* Get absolute value of DCT coefficients. */
+    uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
+    uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
+    uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
+    uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
+    /* Add correction. */
+    abs_row0 = vaddq_u16(abs_row0, corr0);
+    abs_row1 = vaddq_u16(abs_row1, corr1);
+    abs_row2 = vaddq_u16(abs_row2, corr2);
+    abs_row3 = vaddq_u16(abs_row3, corr3);
+
+    /* Widening multiply (16x16 -> 32-bit) by the quantization reciprocals. */
+    int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
+                                                       vget_low_u16(recip0)));
+    int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
+                                                       vget_high_u16(recip0)));
+    int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
+                                                       vget_low_u16(recip1)));
+    int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
+                                                       vget_high_u16(recip1)));
+    int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
+                                                       vget_low_u16(recip2)));
+    int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
+                                                       vget_high_u16(recip2)));
+    int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
+                                                       vget_low_u16(recip3)));
+    int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
+                                                       vget_high_u16(recip3)));
+    /* Narrow back to 16-bit, keeping the upper half of each product. */
+    row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
+    row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
+    row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
+    row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));
+
+    /* Since VSHR only supports an immediate as its second argument, negate the
+     * shift value and shift left.
+     */
+    row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
+                                           vnegq_s16(shift0)));
+    row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
+                                           vnegq_s16(shift1)));
+    row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
+                                           vnegq_s16(shift2)));
+    row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
+                                           vnegq_s16(shift3)));
+
+    /* Restore sign: (x ^ s) - s negates x when s is -1, keeps it when 0. */
+    row0 = veorq_s16(row0, sign_row0);
+    row0 = vsubq_s16(row0, sign_row0);
+    row1 = veorq_s16(row1, sign_row1);
+    row1 = vsubq_s16(row1, sign_row1);
+    row2 = veorq_s16(row2, sign_row2);
+    row2 = vsubq_s16(row2, sign_row2);
+    row3 = veorq_s16(row3, sign_row3);
+    row3 = vsubq_s16(row3, sign_row3);
+
+    /* Store quantized coefficients to memory. */
+    vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
+    vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
+    vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
+    vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);
+  }
+}