Neon: Intrinsics impl. of prog. Huffman encoding
The previous AArch64 GAS implementation has been removed, since the
intrinsics implementation provides the same or better performance.
There was no previous AArch32 GAS implementation.
diff --git a/ChangeLog.md b/ChangeLog.md
index c4f9490..2b89d83 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -61,10 +61,9 @@
each iMCU row based on which scan generated the pixels in that row, rather than
always using the block smoothing parameters for the most recent scan.
-7. Added SIMD acceleration for progressive Huffman encoding on Arm 64-bit
-(Armv8) platforms. This speeds up the compression of full-color progressive
-JPEGs by about 30-40% on average (relative to libjpeg-turbo 2.0.x) when using
-modern Armv8 CPUs.
+7. Added SIMD acceleration for progressive Huffman encoding on Arm platforms.
+This speeds up the compression of full-color progressive JPEGs by about 30-40%
+on average (relative to libjpeg-turbo 2.0.x) when using modern Arm CPUs.
8. Added configure-time and run-time auto-detection of Loongson MMI SIMD
instructions, so that the Loongson MMI SIMD extensions can be included in any
diff --git a/README.md b/README.md
index 924ebd8..01e391e 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and
-MIPS systems, as well as progressive JPEG compression on x86, x86-64, and Armv8
+MIPS systems, as well as progressive JPEG compression on x86, x86-64, and Arm
systems. On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg,
all else being equal. On other types of systems, libjpeg-turbo can still
outperform libjpeg by a significant amount, by virtue of its highly-optimized
diff --git a/release/ReadMe.txt b/release/ReadMe.txt
index 64fc294..446ce46 100644
--- a/release/ReadMe.txt
+++ b/release/ReadMe.txt
@@ -1,4 +1,4 @@
-libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and MIPS systems, as well as progressive JPEG compression on x86, x86-64, and Armv8 systems. On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg, all else being equal. On other types of systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue of its highly-optimized Huffman coding routines. In many cases, the performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
+libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and MIPS systems, as well as progressive JPEG compression on x86, x86-64, and Arm systems. On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg, all else being equal. On other types of systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue of its highly-optimized Huffman coding routines. In many cases, the performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
libjpeg-turbo implements both the traditional libjpeg API as well as the less powerful but more straightforward TurboJPEG API. libjpeg-turbo also features colorspace extensions that allow it to compress from/decompress to 32-bit and big-endian pixel buffers (RGBX, XBGR, etc.), as well as a full-featured Java interface.
diff --git a/release/deb-control.in b/release/deb-control.in
index b59f8f9..72bceec 100644
--- a/release/deb-control.in
+++ b/release/deb-control.in
@@ -11,7 +11,7 @@
libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and
MIPS systems, as well as progressive JPEG compression on x86, x86-64, and
- Armv8 systems. On such systems, libjpeg-turbo is generally 2-6x as fast as
+ Arm systems. On such systems, libjpeg-turbo is generally 2-6x as fast as
libjpeg, all else being equal. On other types of systems, libjpeg-turbo can
still outperform libjpeg by a significant amount, by virtue of its
highly-optimized Huffman coding routines. In many cases, the performance of
diff --git a/release/rpm.spec.in b/release/rpm.spec.in
index 0cd1825..05410a6 100644
--- a/release/rpm.spec.in
+++ b/release/rpm.spec.in
@@ -53,7 +53,7 @@
%description
libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and
-MIPS systems, as well as progressive JPEG compression on x86, x86-64, and Armv8
+MIPS systems, as well as progressive JPEG compression on x86, x86-64, and Arm
systems. On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg,
all else being equal. On other types of systems, libjpeg-turbo can still
outperform libjpeg by a significant amount, by virtue of its highly-optimized
diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt
index b9dfdcb..f976417 100644
--- a/simd/CMakeLists.txt
+++ b/simd/CMakeLists.txt
@@ -265,8 +265,8 @@
file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/gastest.S)
-set(SIMD_SOURCES arm/jcgray-neon.c arm/jcsample-neon.c arm/jdsample-neon.c
- arm/jfdctfst-neon.c arm/jquanti-neon.c)
+set(SIMD_SOURCES arm/jcgray-neon.c arm/jcphuff-neon.c arm/jcsample-neon.c
+ arm/jdsample-neon.c arm/jfdctfst-neon.c arm/jquanti-neon.c)
if(NEON_INTRINSICS)
set(SIMD_SOURCES ${SIMD_SOURCES} arm/jccolor-neon.c arm/jidctint-neon.c)
endif()
diff --git a/simd/arm/aarch32/jsimd.c b/simd/arm/aarch32/jsimd.c
index cd90c63..e054a45 100644
--- a/simd/arm/aarch32/jsimd.c
+++ b/simd/arm/aarch32/jsimd.c
@@ -825,6 +825,16 @@
GLOBAL(int)
jsimd_can_encode_mcu_AC_first_prepare(void)
{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
return 0;
}
@@ -833,11 +843,23 @@
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *values, size_t *zerobits)
{
+ jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
+ Sl, Al, values, zerobits);
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
return 0;
}
@@ -846,5 +868,7 @@
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *absvalues, size_t *bits)
{
- return 0;
+ return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+ jpeg_natural_order_start, Sl,
+ Al, absvalues, bits);
}
diff --git a/simd/arm/aarch64/jsimd_neon.S b/simd/arm/aarch64/jsimd_neon.S
index a99703e..9ded729 100644
--- a/simd/arm/aarch64/jsimd_neon.S
+++ b/simd/arm/aarch64/jsimd_neon.S
@@ -235,22 +235,6 @@
#endif
-/* Constants for jsimd_encode_mcu_AC_first_prepare_neon() */
-
-.balign 16
-Ljsimd_encode_mcu_AC_first_prepare_neon_consts:
- .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
- 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
-
-/* Constants for jsimd_encode_mcu_AC_refine_prepare_neon() */
-
-.balign 16
-Ljsimd_encode_mcu_AC_refine_prepare_neon_consts:
- .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
- 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
-
-.text
-
#define RESPECT_STRICT_ALIGNMENT 1
@@ -2754,625 +2738,3 @@
.purgem checkbuf47
#endif /* NEON_INTRINSICS */
-
-
-/*****************************************************************************/
-
-/*
- * Macros to load data for jsimd_encode_mcu_AC_first_prepare_neon() and
- * jsimd_encode_mcu_AC_refine_prepare_neon()
- */
-
-.macro LOAD16
- ldr T0d, [LUT, #(0 * 4)]
- ldr T1d, [LUT, #(8 * 4)]
- add T0, BLOCK, T0, lsl #1
- add T1, BLOCK, T1, lsl #1
- ld1 {Y0.h}[0], [T0]
- ld1 {Y1.h}[0], [T1]
-
- ldr T0d, [LUT, #(1 * 4)]
- ldr T1d, [LUT, #(9 * 4)]
- add T0, BLOCK, T0, lsl #1
- add T1, BLOCK, T1, lsl #1
- ld1 {Y0.h}[1], [T0]
- ld1 {Y1.h}[1], [T1]
-
- ldr T0d, [LUT, #(2 * 4)]
- ldr T1d, [LUT, #(10 * 4)]
- add T0, BLOCK, T0, lsl #1
- add T1, BLOCK, T1, lsl #1
- ld1 {Y0.h}[2], [T0]
- ld1 {Y1.h}[2], [T1]
-
- ldr T0d, [LUT, #(3 * 4)]
- ldr T1d, [LUT, #(11 * 4)]
- add T0, BLOCK, T0, lsl #1
- add T1, BLOCK, T1, lsl #1
- ld1 {Y0.h}[3], [T0]
- ld1 {Y1.h}[3], [T1]
-
- ldr T0d, [LUT, #(4 * 4)]
- ldr T1d, [LUT, #(12 * 4)]
- add T0, BLOCK, T0, lsl #1
- add T1, BLOCK, T1, lsl #1
- ld1 {Y0.h}[4], [T0]
- ld1 {Y1.h}[4], [T1]
-
- ldr T0d, [LUT, #(5 * 4)]
- ldr T1d, [LUT, #(13 * 4)]
- add T0, BLOCK, T0, lsl #1
- add T1, BLOCK, T1, lsl #1
- ld1 {Y0.h}[5], [T0]
- ld1 {Y1.h}[5], [T1]
-
- ldr T0d, [LUT, #(6 * 4)]
- ldr T1d, [LUT, #(14 * 4)]
- add T0, BLOCK, T0, lsl #1
- add T1, BLOCK, T1, lsl #1
- ld1 {Y0.h}[6], [T0]
- ld1 {Y1.h}[6], [T1]
-
- ldr T0d, [LUT, #(7 * 4)]
- ldr T1d, [LUT, #(15 * 4)]
- add T0, BLOCK, T0, lsl #1
- add T1, BLOCK, T1, lsl #1
- ld1 {Y0.h}[7], [T0]
- ld1 {Y1.h}[7], [T1]
-
- add LUT, LUT, #(16 * 4)
-.endm
-
-.macro LOAD15
- eor Y1.16b, Y1.16b, Y1.16b
-
- ldr T0d, [LUT, #(0 * 4)]
- ldr T1d, [LUT, #(8 * 4)]
- add T0, BLOCK, T0, lsl #1
- add T1, BLOCK, T1, lsl #1
- ld1 {Y0.h}[0], [T0]
- ld1 {Y1.h}[0], [T1]
-
- ldr T0d, [LUT, #(1 * 4)]
- add T0, BLOCK, T0, lsl #1
- ld1 {Y0.h}[1], [T0]
-
- ldr T0d, [LUT, #(2 * 4)]
- add T0, BLOCK, T0, lsl #1
- ld1 {Y0.h}[2], [T0]
-
- ldr T0d, [LUT, #(3 * 4)]
- add T0, BLOCK, T0, lsl #1
- ld1 {Y0.h}[3], [T0]
-
- ldr T0d, [LUT, #(4 * 4)]
- add T0, BLOCK, T0, lsl #1
- ld1 {Y0.h}[4], [T0]
-
- ldr T0d, [LUT, #(5 * 4)]
- add T0, BLOCK, T0, lsl #1
- ld1 {Y0.h}[5], [T0]
-
- ldr T0d, [LUT, #(6 * 4)]
- add T0, BLOCK, T0, lsl #1
- ld1 {Y0.h}[6], [T0]
-
- ldr T0d, [LUT, #(7 * 4)]
- add T0, BLOCK, T0, lsl #1
- ld1 {Y0.h}[7], [T0]
-
- cmp LENEND, #2
- b.lt 1515f
- ldr T1d, [LUT, #(9 * 4)]
- add T1, BLOCK, T1, lsl #1
- ld1 {Y1.h}[1], [T1]
-
- cmp LENEND, #3
- b.lt 1515f
- ldr T1d, [LUT, #(10 * 4)]
- add T1, BLOCK, T1, lsl #1
- ld1 {Y1.h}[2], [T1]
-
- cmp LENEND, #4
- b.lt 1515f
- ldr T1d, [LUT, #(11 * 4)]
- add T1, BLOCK, T1, lsl #1
- ld1 {Y1.h}[3], [T1]
-
- cmp LENEND, #5
- b.lt 1515f
- ldr T1d, [LUT, #(12 * 4)]
- add T1, BLOCK, T1, lsl #1
- ld1 {Y1.h}[4], [T1]
-
- cmp LENEND, #6
- b.lt 1515f
- ldr T1d, [LUT, #(13 * 4)]
- add T1, BLOCK, T1, lsl #1
- ld1 {Y1.h}[5], [T1]
-
- cmp LENEND, #7
- b.lt 1515f
- ldr T1d, [LUT, #(14 * 4)]
- add T1, BLOCK, T1, lsl #1
- ld1 {Y1.h}[6], [T1]
-
-1515:
-.endm
-
-.macro LOAD8
- ldr T0d, [LUT, #(0 * 4)]
- add T0, BLOCK, T0, lsl #1
- ld1 {Y0.h}[0], [T0]
-
- ldr T0d, [LUT, #(1 * 4)]
- add T0, BLOCK, T0, lsl #1
- ld1 {Y0.h}[1], [T0]
-
- ldr T0d, [LUT, #(2 * 4)]
- add T0, BLOCK, T0, lsl #1
- ld1 {Y0.h}[2], [T0]
-
- ldr T0d, [LUT, #(3 * 4)]
- add T0, BLOCK, T0, lsl #1
- ld1 {Y0.h}[3], [T0]
-
- ldr T0d, [LUT, #(4 * 4)]
- add T0, BLOCK, T0, lsl #1
- ld1 {Y0.h}[4], [T0]
-
- ldr T0d, [LUT, #(5 * 4)]
- add T0, BLOCK, T0, lsl #1
- ld1 {Y0.h}[5], [T0]
-
- ldr T0d, [LUT, #(6 * 4)]
- add T0, BLOCK, T0, lsl #1
- ld1 {Y0.h}[6], [T0]
-
- ldr T0d, [LUT, #(7 * 4)]
- add T0, BLOCK, T0, lsl #1
- ld1 {Y0.h}[7], [T0]
-.endm
-
-.macro LOAD7
- eor Y0.16b, Y0.16b, Y0.16b
-
- ldr T0d, [LUT, #(0 * 4)]
- add T0, BLOCK, T0, lsl #1
- ld1 {Y0.h}[0], [T0]
-
- cmp LENEND, #2
- b.lt 77f
- ldr T1d, [LUT, #(1 * 4)]
- add T1, BLOCK, T1, lsl #1
- ld1 {Y0.h}[1], [T1]
-
- cmp LENEND, #3
- b.lt 77f
- ldr T1d, [LUT, #(2 * 4)]
- add T1, BLOCK, T1, lsl #1
- ld1 {Y0.h}[2], [T1]
-
- cmp LENEND, #4
- b.lt 77f
- ldr T1d, [LUT, #(3 * 4)]
- add T1, BLOCK, T1, lsl #1
- ld1 {Y0.h}[3], [T1]
-
- cmp LENEND, #5
- b.lt 77f
- ldr T1d, [LUT, #(4 * 4)]
- add T1, BLOCK, T1, lsl #1
- ld1 {Y0.h}[4], [T1]
-
- cmp LENEND, #6
- b.lt 77f
- ldr T1d, [LUT, #(5 * 4)]
- add T1, BLOCK, T1, lsl #1
- ld1 {Y0.h}[5], [T1]
-
- cmp LENEND, #7
- b.lt 77f
- ldr T1d, [LUT, #(6 * 4)]
- add T1, BLOCK, T1, lsl #1
- ld1 {Y0.h}[6], [T1]
-
-77:
-.endm
-
-.macro REDUCE0
- ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [VALUES], #64
- ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [VALUES], #64
-
- cmeq v0.8h, v0.8h, #0
- cmeq v1.8h, v1.8h, #0
- cmeq v2.8h, v2.8h, #0
- cmeq v3.8h, v3.8h, #0
- cmeq v4.8h, v4.8h, #0
- cmeq v5.8h, v5.8h, #0
- cmeq v6.8h, v6.8h, #0
- cmeq v7.8h, v7.8h, #0
-
- xtn v0.8b, v0.8h
- xtn v2.8b, v2.8h
- xtn v4.8b, v4.8h
- xtn v6.8b, v6.8h
- xtn2 v0.16b, v1.8h
- xtn2 v2.16b, v3.8h
- xtn2 v4.16b, v5.8h
- xtn2 v6.16b, v7.8h
-
- and v0.16b, v0.16b, ANDMASK.16b
- and v2.16b, v2.16b, ANDMASK.16b
- and v4.16b, v4.16b, ANDMASK.16b
- and v6.16b, v6.16b, ANDMASK.16b
- addp v0.16b, v0.16b, v2.16b
- addp v4.16b, v4.16b, v6.16b
- addp v0.16b, v0.16b, v4.16b
- addp v0.16b, v0.16b, v0.16b
- umov T0, v0.D[0]
- mvn T0, T0
- str T0, [BITS]
-.endm
-
-/*
- * Prepare data for jsimd_encode_mcu_AC_first().
- *
- * GLOBAL(int)
- * jsimd_encode_mcu_AC_first_prepare_neon(const JCOEF *block,
- * const int *jpeg_natural_order_start,
- * int Sl, int Al, JCOEF *values,
- * size_t *zerobits)
- *
- * x0 = const JCOEF *block
- * x1 = const int *jpeg_natural_order_start
- * w2 = int Sl
- * w3 = int Al
- * x4 = JCOEF *values
- * x5 = size_t *zerobits
- *
- */
-
- ZERO .req v0
- Y0 .req v2
- Y1 .req v3
- N0 .req v4
- N1 .req v5
- AL .req v6
- ANDMASK .req v20
- K .req w12
- LUT .req x1
- T0 .req x10
- T0d .req w10
- T1 .req x11
- T1d .req w11
- BLOCK .req x0
- VALUES .req x4
- XORVALUES .req x14
- LEN .req w2
- LENEND .req w9
- BITS .req x5
-
-asm_function jsimd_encode_mcu_AC_first_prepare_neon
- get_symbol_loc T0, Ljsimd_encode_mcu_AC_first_prepare_neon_consts
- neg w3, w3 /* Al = -Al */
- eor ZERO.16b, ZERO.16b, ZERO.16b
- ld1 {ANDMASK.16b}, [T0]
- dup AL.8h, w3
- add XORVALUES, VALUES, #(/*DCTSIZE2*/ 64 * 2)
- and LENEND, LEN, 7
- lsr K, LEN, 4
- cbz K, 3f
-1:
- LOAD16
- cmlt N0.8h, Y0.8h, #0
- cmlt N1.8h, Y1.8h, #0
- abs Y0.8h, Y0.8h
- abs Y1.8h, Y1.8h
- ushl Y0.8h, Y0.8h, AL.8h
- ushl Y1.8h, Y1.8h, AL.8h
- eor N0.16b, N0.16b, Y0.16b
- eor N1.16b, N1.16b, Y1.16b
- st1 {Y0.8h, Y1.8h}, [VALUES], #32
- st1 {N0.8h, N1.8h}, [XORVALUES], #32
- subs K, K, #1
- b.ne 1b
-3:
- tst LEN, #8
- b.eq 3f
- tst LEN, #7
- b.eq 2f
-
- LOAD15
- cmlt N0.8h, Y0.8h, #0
- cmlt N1.8h, Y1.8h, #0
- abs Y0.8h, Y0.8h
- abs Y1.8h, Y1.8h
- ushl Y0.8h, Y0.8h, AL.8h
- ushl Y1.8h, Y1.8h, AL.8h
- eor N0.16b, N0.16b, Y0.16b
- eor N1.16b, N1.16b, Y1.16b
- st1 {Y0.8h, Y1.8h}, [VALUES], #32
- st1 {N0.8h, N1.8h}, [XORVALUES], #32
- b 4f
-2:
- LOAD8
- cmlt N0.8h, Y0.8h, #0
- abs Y0.8h, Y0.8h
- ushl Y0.8h, Y0.8h, AL.8h
- eor N0.16b, N0.16b, Y0.16b
- st1 {Y0.8h}, [VALUES], #16
- st1 {N0.8h}, [XORVALUES], #16
- b 4f
-3:
- cbz LENEND, 4f
- LOAD7
- cmlt N0.8h, Y0.8h, #0
- abs Y0.8h, Y0.8h
- ushl Y0.8h, Y0.8h, AL.8h
- eor N0.16b, N0.16b, Y0.16b
- st1 {Y0.8h}, [VALUES], #16
- st1 {N0.8h}, [XORVALUES], #16
- /* b 4f */
- /* fallthrough */
-4:
- add K, LEN, #7
- lsr K, K, #3
- subs K, K, #(/*DCTSIZE2*/ 64 / 8)
- b.eq 5f
-1:
- st1 {ZERO.8h}, [VALUES], #16
- st1 {ZERO.8h}, [XORVALUES], #16
- adds K, K, #1
- b.ne 1b
-5:
- sub VALUES, VALUES, #(/*DCTSIZE2*/ 64 * 2)
-
- REDUCE0
-
- br x30
-
- .unreq ZERO
- .unreq Y0
- .unreq Y1
- .unreq N0
- .unreq N1
- .unreq AL
- .unreq ANDMASK
- .unreq K
- .unreq LUT
- .unreq T0
- .unreq T0d
- .unreq T1
- .unreq T1d
- .unreq BLOCK
- .unreq VALUES
- .unreq XORVALUES
- .unreq LEN
- .unreq LENEND
- .unreq BITS
-
-/*
- * Prepare data for jsimd_encode_mcu_AC_refine.
- *
- * GLOBAL(int)
- * jsimd_encode_mcu_AC_refine_prepare_neon(const JCOEF *block,
- * const int *jpeg_natural_order_start,
- * int Sl, int Al, JCOEF *absvalues,
- * size_t *bits)
- *
- * x0 = const JCOEF *block
- * x1 = const int *jpeg_natural_order_start
- * w2 = int Sl
- * w3 = int Al
- * x4 = JCOEF *absvalues
- * x5 = size_t *bits
- *
- */
-
- ZERO .req v0
- ONE .req v1
- Y0 .req v2
- Y1 .req v3
- N0 .req v4
- N1 .req v5
- AL .req v6
- ANDMASK .req v20
- K .req w12
- KK .req w13
- EOB .req w14
- SIGN .req x15
- LUT .req x1
- T0 .req x10
- T0d .req w10
- T1 .req x11
- T1d .req w11
- BLOCK .req x0
- VALUES .req x4
- LEN .req w2
- LENEND .req w9
- BITS .req x5
-
-asm_function jsimd_encode_mcu_AC_refine_prepare_neon
- get_symbol_loc T0, Ljsimd_encode_mcu_AC_refine_prepare_neon_consts
- neg w3, w3 /* Al = -Al */
- movi ONE.8h, #1
- eor SIGN, SIGN, SIGN
- eor ZERO.16b, ZERO.16b, ZERO.16b
- eor EOB, EOB, EOB
- ld1 {ANDMASK.16b}, [T0]
- eor KK, KK, KK
- dup AL.8h, w3
- and LENEND, LEN, 7
- lsr K, LEN, 4
- cbz K, 3f
-1:
- LOAD16
- cmlt N0.8h, Y0.8h, #0
- cmlt N1.8h, Y1.8h, #0
- abs Y0.8h, Y0.8h
- abs Y1.8h, Y1.8h
- ushl Y0.8h, Y0.8h, AL.8h
- ushl Y1.8h, Y1.8h, AL.8h
- st1 {Y0.8h, Y1.8h}, [VALUES], #32
- xtn N0.8b, N0.8h
- xtn N1.8b, N1.8h
- cmeq Y0.8h, Y0.8h, ONE.8h
- cmeq Y1.8h, Y1.8h, ONE.8h
- xtn Y0.8b, Y0.8h
- xtn Y1.8b, Y1.8h
- and N0.8b, N0.8b, ANDMASK.8b
- and N1.8b, N1.8b, ANDMASK.8b
- and Y0.8b, Y0.8b, ANDMASK.8b
- and Y1.8b, Y1.8b, ANDMASK.8b
- addv B28, N0.8b
- addv B29, N1.8b
- addv B30, Y0.8b
- addv B31, Y1.8b
- ins v28.b[1], v29.b[0]
- ins v30.b[1], v31.b[0]
- umov T0d, v28.h[0] /* lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); */
- umov T1d, v30.h[0] /* idx = _mm_movemask_epi8(x1); */
- lsr SIGN, SIGN, #16 /* make room for sizebits */
- orr SIGN, SIGN, T0, lsl #48
- cbz T1d, 2f
- rbit T1d, T1d
- clz T1d, T1d
- add EOB, KK, T1d /* EOB = k + idx; */
-2:
- add KK, KK, #16
- subs K, K, #1
- b.ne 1b
-3:
- tst LEN, #8
- b.eq 3f
- tst LEN, #7
- b.eq 2f
-
- LOAD15
- cmlt N0.8h, Y0.8h, #0
- cmlt N1.8h, Y1.8h, #0
- abs Y0.8h, Y0.8h
- abs Y1.8h, Y1.8h
- ushl Y0.8h, Y0.8h, AL.8h
- ushl Y1.8h, Y1.8h, AL.8h
- st1 {Y0.8h, Y1.8h}, [VALUES], #32
- xtn N0.8b, N0.8h
- xtn N1.8b, N1.8h
- cmeq Y0.8h, Y0.8h, ONE.8h
- cmeq Y1.8h, Y1.8h, ONE.8h
- xtn Y0.8b, Y0.8h
- xtn Y1.8b, Y1.8h
- and N0.8b, N0.8b, ANDMASK.8b
- and N1.8b, N1.8b, ANDMASK.8b
- and Y0.8b, Y0.8b, ANDMASK.8b
- and Y1.8b, Y1.8b, ANDMASK.8b
- addv B28, N0.8b
- addv B29, N1.8b
- addv B30, Y0.8b
- addv B31, Y1.8b
- ins v28.b[1], v29.b[0]
- ins v30.b[1], v31.b[0]
- umov T0d, v28.h[0] /* lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); */
- umov T1d, v30.h[0] /* idx = _mm_movemask_epi8(x1); */
- lsr SIGN, SIGN, #16 /* make room for sizebits */
- orr SIGN, SIGN, T0, lsl #48
- cbz T1d, 4f
- rbit T1d, T1d
- clz T1d, T1d
- add EOB, KK, T1d /* EOB = k + idx; */
- b 4f
-2:
- LOAD8
- cmlt N0.8h, Y0.8h, #0
- abs Y0.8h, Y0.8h
- ushl Y0.8h, Y0.8h, AL.8h
- st1 {Y0.8h}, [VALUES], #16
- xtn N0.8b, N0.8h
- cmeq Y0.8h, Y0.8h, ONE.8h
- xtn Y0.8b, Y0.8h
- and N0.8b, N0.8b, ANDMASK.8b
- and Y0.8b, Y0.8b, ANDMASK.8b
- addv B28, N0.8b
- addv B30, Y0.8b
- umov T0d, v28.b[0] /* lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); */
- umov T1d, v30.b[0] /* idx = _mm_movemask_epi8(x1); */
- lsr SIGN, SIGN, #8 /* make room for sizebits */
- orr SIGN, SIGN, T0, lsl #56
- cbz T1d, 4f
- rbit T1d, T1d
- clz T1d, T1d
- add EOB, KK, T1d /* EOB = k + idx; */
- b 4f
-3:
- cbz LENEND, 4f
- LOAD7
- cmlt N0.8h, Y0.8h, #0
- abs Y0.8h, Y0.8h
- ushl Y0.8h, Y0.8h, AL.8h
- st1 {Y0.8h}, [VALUES], #16
- xtn N0.8b, N0.8h
- cmeq Y0.8h, Y0.8h, ONE.8h
- xtn Y0.8b, Y0.8h
- and N0.8b, N0.8b, ANDMASK.8b
- and Y0.8b, Y0.8b, ANDMASK.8b
- addv B28, N0.8b
- addv B30, Y0.8b
- umov T0d, v28.b[0] /* lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); */
- umov T1d, v30.b[0] /* idx = _mm_movemask_epi8(x1); */
- lsr SIGN, SIGN, #8 /* make room for sizebits */
- orr SIGN, SIGN, T0, lsl #56
- cbz T1d, 4f
- rbit T1d, T1d
- clz T1d, T1d
- add EOB, KK, T1d /* EOB = k + idx; */
- /* b 4f */
- /* fallthrough */
-4:
- add K, LEN, #7
- lsr K, K, #3
- subs K, K, #(/*DCTSIZE2*/ 64 / 8)
- b.eq 5f
-1:
- st1 {ZERO.8h}, [VALUES], #16
- lsr SIGN, SIGN, #8
- adds K, K, #1
- b.ne 1b
-5:
- mvn SIGN, SIGN
- sub VALUES, VALUES, #(/*DCTSIZE2*/ 64 * 2)
- str SIGN, [BITS, #8]
-
- REDUCE0
-
- mov w0, EOB
- br x30
-
- .unreq ZERO
- .unreq ONE
- .unreq Y0
- .unreq Y1
- .unreq N0
- .unreq N1
- .unreq AL
- .unreq ANDMASK
- .unreq K
- .unreq KK
- .unreq EOB
- .unreq SIGN
- .unreq LUT
- .unreq T0
- .unreq T0d
- .unreq T1
- .unreq T1d
- .unreq BLOCK
- .unreq VALUES
- .unreq LEN
- .unreq LENEND
- .unreq BITS
-
-.purgem LOAD16
-.purgem LOAD15
-.purgem LOAD8
-.purgem LOAD7
-.purgem REDUCE0
diff --git a/simd/arm/jcphuff-neon.c b/simd/arm/jcphuff-neon.c
new file mode 100644
index 0000000..61f94c2
--- /dev/null
+++ b/simd/arm/jcphuff-neon.c
@@ -0,0 +1,588 @@
+/*
+ * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* Data preparation for encode_mcu_AC_first().
+ *
+ * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
+ * found in jcphuff.c.
+ */
+
+void jsimd_encode_mcu_AC_first_prepare_neon
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *values, size_t *zerobits)
+{
+ JCOEF *values_ptr = values;
+ JCOEF *diff_values_ptr = values + DCTSIZE2;
+
+ /* Rows of coefficients to zero (since they haven't been processed) */
+ int i, rows_to_zero = 8;
+
+ for (i = 0; i < Sl / 16; i++) {
+ int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+ int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+ /* Isolate sign of coefficients. */
+ int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
+ int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+ int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+ coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+ coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+
+ /* Compute diff values. */
+ int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
+ int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+
+ /* Store transformed coefficients and diff values. */
+ vst1q_s16(values_ptr, coefs1);
+ vst1q_s16(values_ptr + DCTSIZE, coefs2);
+ vst1q_s16(diff_values_ptr, diff1);
+ vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+ values_ptr += 16;
+ diff_values_ptr += 16;
+ jpeg_natural_order_start += 16;
+ rows_to_zero -= 2;
+ }
+
+ /* Same operation but for remaining partial vector */
+ int remaining_coefs = Sl % 16;
+ if (remaining_coefs > 8) {
+ int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+ int16x8_t coefs2 = vdupq_n_s16(0);
+ switch (remaining_coefs) {
+ case 15:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+ case 14:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+ case 13:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+ case 12:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+ case 11:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+ case 10:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+ case 9:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+ default:
+ break;
+ }
+
+ /* Isolate sign of coefficients. */
+ int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
+ int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+ int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+ coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+ coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+
+ /* Compute diff values. */
+ int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
+ int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+
+ /* Store transformed coefficients and diff values. */
+ vst1q_s16(values_ptr, coefs1);
+ vst1q_s16(values_ptr + DCTSIZE, coefs2);
+ vst1q_s16(diff_values_ptr, diff1);
+ vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+ values_ptr += 16;
+ diff_values_ptr += 16;
+ rows_to_zero -= 2;
+
+ } else if (remaining_coefs > 0) {
+ int16x8_t coefs = vdupq_n_s16(0);
+
+ switch (remaining_coefs) {
+ case 8:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+ case 7:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+ case 6:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+ case 5:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+ case 4:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+ case 3:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+ case 2:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+ case 1:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+ default:
+ break;
+ }
+
+ /* Isolate sign of coefficients. */
+ int16x8_t sign_coefs = vshrq_n_s16(coefs, 15);
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs = vabsq_s16(coefs);
+ coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
+
+ /* Compute diff values. */
+ int16x8_t diff = veorq_s16(coefs, sign_coefs);
+
+ /* Store transformed coefficients and diff values. */
+ vst1q_s16(values_ptr, coefs);
+ vst1q_s16(diff_values_ptr, diff);
+ values_ptr += 8;
+ diff_values_ptr += 8;
+ rows_to_zero--;
+ }
+
+ /* Zero remaining memory in the values and diff_values blocks. */
+ for (i = 0; i < rows_to_zero; i++) {
+ vst1q_s16(values_ptr, vdupq_n_s16(0));
+ vst1q_s16(diff_values_ptr, vdupq_n_s16(0));
+ values_ptr += 8;
+ diff_values_ptr += 8;
+ }
+
+ /* Construct zerobits bitmap. A set bit means that the corresponding
+ * coefficient != 0.
+ */
+ int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE);
+ int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE);
+ int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE);
+ int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE);
+ int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE);
+ int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE);
+
+ uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0)));
+ uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0)));
+ uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0)));
+ uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0)));
+ uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0)));
+ uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0)));
+ uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0)));
+ uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0)));
+
+ const uint8x8_t bitmap_mask =
+ { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
+
+ row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
+ row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
+ row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
+ row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
+ row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
+ row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
+ row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
+ row7_eq0 = vand_u8(row7_eq0, bitmap_mask);
+
+ uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
+ uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
+ uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
+ uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
+ uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+ uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+ uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__)
+ /* Move bitmap to a 64-bit scalar register. */
+ uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+ /* Store zerobits bitmap. */
+ *zerobits = ~bitmap;
+#else
+ /* Move bitmap to two 32-bit scalar registers. */
+ uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+ uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+ /* Store zerobits bitmap. */
+ zerobits[0] = ~bitmap0;
+ zerobits[1] = ~bitmap1;
+#endif
+}
+
+
+/* Data preparation for encode_mcu_AC_refine().
+ *
+ * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
+ * found in jcphuff.c.
+ *
+ * block                     coefficients for one DCT block, in storage order
+ * jpeg_natural_order_start  natural-order index table, offset to the first
+ *                           coefficient in this scan
+ * Sl                        number of coefficients in this scan
+ * Al                        successive-approximation low bit (point transform)
+ * absvalues                 [out] |coefficient| >> Al for each coefficient,
+ *                           in scan order (unused tail rows zeroed)
+ * bits                      [out] inverted zerobits bitmap followed by the
+ *                           inverted signbits bitmap (one size_t each on
+ *                           64-bit builds; two each on 32-bit builds)
+ *
+ * Returns the EOB position: the scan-order index of the last coefficient
+ * whose transformed absolute value is 1, or 0 if there is none.
+ */
+
+int jsimd_encode_mcu_AC_refine_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *absvalues, size_t *bits)
+{
+  /* Temporary storage buffers for data used to compute the signbits bitmap and
+   * the end-of-block (EOB) position
+   */
+  uint8_t coef_sign_bits[64];
+  uint8_t coef_eq1_bits[64];
+
+  JCOEF *absvalues_ptr = absvalues;
+  uint8_t *coef_sign_bits_ptr = coef_sign_bits;
+  uint8_t *eq1_bits_ptr = coef_eq1_bits;
+
+  /* Rows of coefficients to zero (since they haven't been processed) */
+  int i, rows_to_zero = 8;
+
+  for (i = 0; i < Sl / 16; i++) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+    /* Compute and store data for signbits bitmap. */
+    uint8x8_t sign_coefs1 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+    uint8x8_t sign_coefs2 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+    /* Compute absolute value of coefficients and apply point transform Al.
+     * (vshlq_s16() with a negative shift count shifts right.)
+     */
+    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+    vst1q_s16(absvalues_ptr, coefs1);
+    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+
+    /* Test whether transformed coefficient values == 1 (used to find EOB
+     * position.)
+     */
+    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
+    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+    vst1_u8(eq1_bits_ptr, coefs_eq11);
+    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+    absvalues_ptr += 16;
+    coef_sign_bits_ptr += 16;
+    eq1_bits_ptr += 16;
+    jpeg_natural_order_start += 16;
+    rows_to_zero -= 2;
+  }
+
+  /* Same operation but for remaining partial vector */
+  int remaining_coefs = Sl % 16;
+  if (remaining_coefs > 8) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vdupq_n_s16(0);
+    /* Deliberate fall-through: each case loads its lane and continues into
+     * the cases below it, so lanes [remaining_coefs-9 .. 0] are all loaded.
+     */
+    switch (remaining_coefs) {
+    case 15:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+      /*FALLTHROUGH*/
+    case 14:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+      /*FALLTHROUGH*/
+    case 13:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+      /*FALLTHROUGH*/
+    case 12:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+      /*FALLTHROUGH*/
+    case 11:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+      /*FALLTHROUGH*/
+    case 10:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+      /*FALLTHROUGH*/
+    case 9:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+      /*FALLTHROUGH*/
+    default:
+      break;
+    }
+
+    /* Compute and store data for signbits bitmap. */
+    uint8x8_t sign_coefs1 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+    uint8x8_t sign_coefs2 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+    vst1q_s16(absvalues_ptr, coefs1);
+    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+
+    /* Test whether transformed coefficient values == 1 (used to find EOB
+     * position.)
+     */
+    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
+    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+    vst1_u8(eq1_bits_ptr, coefs_eq11);
+    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+    absvalues_ptr += 16;
+    coef_sign_bits_ptr += 16;
+    eq1_bits_ptr += 16;
+    jpeg_natural_order_start += 16;
+    rows_to_zero -= 2;
+
+  } else if (remaining_coefs > 0) {
+    int16x8_t coefs = vdupq_n_s16(0);
+
+    /* Deliberate fall-through, as above: lanes [remaining_coefs-1 .. 0] are
+     * loaded; the rest stay zero.
+     */
+    switch (remaining_coefs) {
+    case 8:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+      /*FALLTHROUGH*/
+    case 7:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+      /*FALLTHROUGH*/
+    case 6:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+      /*FALLTHROUGH*/
+    case 5:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+      /*FALLTHROUGH*/
+    case 4:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+      /*FALLTHROUGH*/
+    case 3:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+      /*FALLTHROUGH*/
+    case 2:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+      /*FALLTHROUGH*/
+    case 1:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+      /*FALLTHROUGH*/
+    default:
+      break;
+    }
+
+    /* Compute and store data for signbits bitmap. */
+    uint8x8_t sign_coefs =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
+    vst1_u8(coef_sign_bits_ptr, sign_coefs);
+
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs = vabsq_s16(coefs);
+    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
+    vst1q_s16(absvalues_ptr, coefs);
+
+    /* Test whether transformed coefficient values == 1 (used to find EOB
+     * position.)
+     */
+    uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1)));
+    vst1_u8(eq1_bits_ptr, coefs_eq1);
+
+    absvalues_ptr += 8;
+    coef_sign_bits_ptr += 8;
+    eq1_bits_ptr += 8;
+    rows_to_zero--;
+  }
+
+  /* Zero remaining memory in blocks. */
+  for (i = 0; i < rows_to_zero; i++) {
+    vst1q_s16(absvalues_ptr, vdupq_n_s16(0));
+    vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
+    vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
+    absvalues_ptr += 8;
+    coef_sign_bits_ptr += 8;
+    eq1_bits_ptr += 8;
+  }
+
+  /* Construct zerobits bitmap. */
+  int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE);
+  int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE);
+  int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE);
+  int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE);
+  int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE);
+  int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE);
+  int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE);
+  int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE);
+
+  uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0)))
+  uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0)));
+  uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0)));
+  uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0)));
+  uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0)));
+  uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0)));
+  uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0)));
+  uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0)));
+  uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0)));
+
+  /* One bit per coefficient; pairwise adds below pack eight bytes of 0x01,
+   * 0x02, ... masks into one bitmap byte per row, then one 64-bit bitmap.
+   */
+  const uint8x8_t bitmap_mask =
+    { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
+
+  abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
+  abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
+  abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
+  abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
+  abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
+  abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
+  abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
+  abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);
+
+  uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
+  uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
+  uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
+  uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
+  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__)
+  /* Move bitmap to a 64-bit scalar register. */
+  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+  /* Store zerobits bitmap. */
+  bits[0] = ~bitmap;
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+  /* Store zerobits bitmap. */
+  bits[0] = ~bitmap0;
+  bits[1] = ~bitmap1;
+#endif
+
+  /* Construct signbits bitmap. */
+  uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
+  uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
+  uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
+  uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
+  uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
+  uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
+  uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
+  uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);
+
+  signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
+  signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
+  signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
+  signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
+  signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
+  signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
+  signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
+  signbits_row7 = vand_u8(signbits_row7, bitmap_mask);
+
+  bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
+  bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
+  bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
+  bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
+  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__)
+  /* Move bitmap to a 64-bit scalar register. */
+  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+  /* Store signbits bitmap. */
+  bits[1] = ~bitmap;
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+  /* Store signbits bitmap. */
+  bits[2] = ~bitmap0;
+  bits[3] = ~bitmap1;
+#endif
+
+  /* Construct bitmap to find EOB position (the index of the last coefficient
+   * equal to 1.)
+   */
+  uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
+  uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
+  uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
+  uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
+  uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
+  uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
+  uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
+  uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);
+
+  row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
+  row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
+  row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
+  row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
+  row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
+  row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
+  row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
+  row7_eq1 = vand_u8(row7_eq1, bitmap_mask);
+
+  bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
+  bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
+  bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
+  bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
+  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__)
+  /* Move bitmap to a 64-bit scalar register. */
+  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+
+  /* Return EOB position. */
+  if (bitmap == 0) {
+    /* EOB position is defined to be 0 if all coefficients != 1. */
+    return 0;
+  } else {
+    /* Use __builtin_clzll(), not __builtin_clzl(): bitmap is 64 bits wide,
+     * but long is only 32 bits on LLP64 platforms (e.g. Windows on Arm), so
+     * __builtin_clzl() would truncate the high half of the bitmap there.
+     */
+    return 63 - __builtin_clzll(bitmap);
+  }
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+
+  /* Return EOB position. */
+  if (bitmap0 == 0 && bitmap1 == 0) {
+    return 0;
+  } else if (bitmap1 != 0) {
+    return 63 - __builtin_clz(bitmap1);
+  } else {
+    return 31 - __builtin_clz(bitmap0);
+  }
+#endif
+}