Neon: Intrinsics impl. of prog. Huffman encoding

The previous AArch64 GAS implementation has been removed, since the
intrinsics implementation provides the same or better performance.
There was no previous AArch32 GAS implementation, so this commit also
extends SIMD acceleration of progressive Huffman encoding to 32-bit
Arm (AArch32) platforms.
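
For reference, the central trick in both the removed assembly and the
new intrinsics file is the bitmap reduction: Neon has no direct
equivalent of SSE2's _mm_movemask_epi8, so each row of comparison
results is ANDed with the per-lane constants {0x01..0x80} and folded
with pairwise adds.  The following standalone sketch (not part of the
patch; the nonzero_bitmap() helper and the AArch64 assumption are mine)
illustrates how jsimd_encode_mcu_AC_first_prepare_neon() builds the
zerobits bitmap from a 64-coefficient buffer.

/* Standalone sketch of the Neon bitmap reduction: collapse
 * "coefficient != 0" results for a 64-entry buffer into one 64-bit
 * bitmap.  Assumes AArch64; nonzero_bitmap() is a hypothetical helper
 * for illustration only.
 */
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t nonzero_bitmap(const int16_t *values /* 64 elements */)
{
  /* Per-lane bit values; lane n contributes bit n of its row's byte. */
  const uint8x8_t bitmap_mask =
    { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
  uint8x8_t row_bits[8];

  for (int row = 0; row < 8; row++) {
    int16x8_t coefs = vld1q_s16(values + row * 8);
    /* 0xFFFF where coefficient == 0, narrowed to 0xFF per byte lane */
    uint8x8_t eq0 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(0)));
    row_bits[row] = vand_u8(eq0, bitmap_mask);
  }

  /* Pairwise adds fold the 8 masked bytes of each row into a single
   * byte, then pack the 8 row bytes into one 64-bit lane (byte k of
   * the result holds the bitmap byte for row k).
   */
  uint8x8_t rows_01 = vpadd_u8(row_bits[0], row_bits[1]);
  uint8x8_t rows_23 = vpadd_u8(row_bits[2], row_bits[3]);
  uint8x8_t rows_45 = vpadd_u8(row_bits[4], row_bits[5]);
  uint8x8_t rows_67 = vpadd_u8(row_bits[6], row_bits[7]);
  uint8x8_t rows_0123 = vpadd_u8(rows_01, rows_23);
  uint8x8_t rows_4567 = vpadd_u8(rows_45, rows_67);
  uint8x8_t all = vpadd_u8(rows_0123, rows_4567);

  uint64_t eq0_bitmap = vget_lane_u64(vreinterpret_u64_u8(all), 0);
  return ~eq0_bitmap;  /* set bit => coefficient != 0 */
}

int main(void)
{
  int16_t values[64] = { 0 };
  values[0] = 3;    /* sets bit 0 */
  values[63] = -1;  /* sets bit 63 */
  printf("%016llx\n", (unsigned long long)nonzero_bitmap(values));
  return 0;
}

With values[0] = 3 and values[63] = -1 the sketch prints
8000000000000001, i.e. only bits 0 and 63 are set.  The same
AND-with-powers-of-two plus vpadd_u8 folding is why the {0x01..0x80}
constant appears both in the deleted jsimd_neon.S code and in the new
jcphuff-neon.c below.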
diff --git a/ChangeLog.md b/ChangeLog.md
index c4f9490..2b89d83 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -61,10 +61,9 @@
 each iMCU row based on which scan generated the pixels in that row, rather than
 always using the block smoothing parameters for the most recent scan.
 
-7. Added SIMD acceleration for progressive Huffman encoding on Arm 64-bit
-(Armv8) platforms.  This speeds up the compression of full-color progressive
-JPEGs by about 30-40% on average (relative to libjpeg-turbo 2.0.x) when using
-modern Armv8 CPUs.
+7. Added SIMD acceleration for progressive Huffman encoding on Arm platforms.
+This speeds up the compression of full-color progressive JPEGs by about 30-40%
+on average (relative to libjpeg-turbo 2.0.x) when using modern Arm CPUs.
 
 8. Added configure-time and run-time auto-detection of Loongson MMI SIMD
 instructions, so that the Loongson MMI SIMD extensions can be included in any
diff --git a/README.md b/README.md
index 924ebd8..01e391e 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 
 libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
 baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and
-MIPS systems, as well as progressive JPEG compression on x86, x86-64, and Armv8
+MIPS systems, as well as progressive JPEG compression on x86, x86-64, and Arm
 systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg,
 all else being equal.  On other types of systems, libjpeg-turbo can still
 outperform libjpeg by a significant amount, by virtue of its highly-optimized
diff --git a/release/ReadMe.txt b/release/ReadMe.txt
index 64fc294..446ce46 100644
--- a/release/ReadMe.txt
+++ b/release/ReadMe.txt
@@ -1,4 +1,4 @@
-libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and MIPS systems, as well as progressive JPEG compression on x86, x86-64, and Armv8 systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg, all else being equal.  On other types of systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue of its highly-optimized Huffman coding routines.  In many cases, the performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
+libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and MIPS systems, as well as progressive JPEG compression on x86, x86-64, and Arm systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg, all else being equal.  On other types of systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue of its highly-optimized Huffman coding routines.  In many cases, the performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
 
 libjpeg-turbo implements both the traditional libjpeg API as well as the less powerful but more straightforward TurboJPEG API.  libjpeg-turbo also features colorspace extensions that allow it to compress from/decompress to 32-bit and big-endian pixel buffers (RGBX, XBGR, etc.), as well as a full-featured Java interface.
 
diff --git a/release/deb-control.in b/release/deb-control.in
index b59f8f9..72bceec 100644
--- a/release/deb-control.in
+++ b/release/deb-control.in
@@ -11,7 +11,7 @@
  libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
  baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and
  MIPS systems, as well as progressive JPEG compression on x86, x86-64, and
- Armv8 systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as
+ Arm systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as
  libjpeg, all else being equal.  On other types of systems, libjpeg-turbo can
  still outperform libjpeg by a significant amount, by virtue of its
  highly-optimized Huffman coding routines.  In many cases, the performance of
diff --git a/release/rpm.spec.in b/release/rpm.spec.in
index 0cd1825..05410a6 100644
--- a/release/rpm.spec.in
+++ b/release/rpm.spec.in
@@ -53,7 +53,7 @@
 %description
 libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
 baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and
-MIPS systems, as well as progressive JPEG compression on x86, x86-64, and Armv8
+MIPS systems, as well as progressive JPEG compression on x86, x86-64, and Arm
 systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg,
 all else being equal.  On other types of systems, libjpeg-turbo can still
 outperform libjpeg by a significant amount, by virtue of its highly-optimized
diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt
index b9dfdcb..f976417 100644
--- a/simd/CMakeLists.txt
+++ b/simd/CMakeLists.txt
@@ -265,8 +265,8 @@
 
 file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/gastest.S)
 
-set(SIMD_SOURCES arm/jcgray-neon.c arm/jcsample-neon.c arm/jdsample-neon.c
-  arm/jfdctfst-neon.c arm/jquanti-neon.c)
+set(SIMD_SOURCES arm/jcgray-neon.c arm/jcphuff-neon.c arm/jcsample-neon.c
+  arm/jdsample-neon.c arm/jfdctfst-neon.c arm/jquanti-neon.c)
 if(NEON_INTRINSICS)
   set(SIMD_SOURCES ${SIMD_SOURCES} arm/jccolor-neon.c arm/jidctint-neon.c)
 endif()
diff --git a/simd/arm/aarch32/jsimd.c b/simd/arm/aarch32/jsimd.c
index cd90c63..e054a45 100644
--- a/simd/arm/aarch32/jsimd.c
+++ b/simd/arm/aarch32/jsimd.c
@@ -825,6 +825,16 @@
 GLOBAL(int)
 jsimd_can_encode_mcu_AC_first_prepare(void)
 {
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -833,11 +843,23 @@
                                   const int *jpeg_natural_order_start, int Sl,
                                   int Al, JCOEF *values, size_t *zerobits)
 {
+  jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
+                                         Sl, Al, values, zerobits);
 }
 
 GLOBAL(int)
 jsimd_can_encode_mcu_AC_refine_prepare(void)
 {
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -846,5 +868,7 @@
                                    const int *jpeg_natural_order_start, int Sl,
                                    int Al, JCOEF *absvalues, size_t *bits)
 {
-  return 0;
+  return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+                                                 jpeg_natural_order_start, Sl,
+                                                 Al, absvalues, bits);
 }
diff --git a/simd/arm/aarch64/jsimd_neon.S b/simd/arm/aarch64/jsimd_neon.S
index a99703e..9ded729 100644
--- a/simd/arm/aarch64/jsimd_neon.S
+++ b/simd/arm/aarch64/jsimd_neon.S
@@ -235,22 +235,6 @@
 
 #endif
 
-/* Constants for jsimd_encode_mcu_AC_first_prepare_neon() */
-
-.balign 16
-Ljsimd_encode_mcu_AC_first_prepare_neon_consts:
-    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
-          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
-
-/* Constants for jsimd_encode_mcu_AC_refine_prepare_neon() */
-
-.balign 16
-Ljsimd_encode_mcu_AC_refine_prepare_neon_consts:
-    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
-          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
-
-.text
-
 
 #define RESPECT_STRICT_ALIGNMENT  1
 
@@ -2754,625 +2738,3 @@
 .purgem checkbuf47
 
 #endif  /* NEON_INTRINSICS */
-
-
-/*****************************************************************************/
-
-/*
- * Macros to load data for jsimd_encode_mcu_AC_first_prepare_neon() and
- * jsimd_encode_mcu_AC_refine_prepare_neon()
- */
-
-.macro LOAD16
-    ldr             T0d, [LUT, #(0 * 4)]
-    ldr             T1d, [LUT, #(8 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    add             T1, BLOCK, T1, lsl #1
-    ld1             {Y0.h}[0], [T0]
-    ld1             {Y1.h}[0], [T1]
-
-    ldr             T0d, [LUT, #(1 * 4)]
-    ldr             T1d, [LUT, #(9 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    add             T1, BLOCK, T1, lsl #1
-    ld1             {Y0.h}[1], [T0]
-    ld1             {Y1.h}[1], [T1]
-
-    ldr             T0d, [LUT, #(2 * 4)]
-    ldr             T1d, [LUT, #(10 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    add             T1, BLOCK, T1, lsl #1
-    ld1             {Y0.h}[2], [T0]
-    ld1             {Y1.h}[2], [T1]
-
-    ldr             T0d, [LUT, #(3 * 4)]
-    ldr             T1d, [LUT, #(11 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    add             T1, BLOCK, T1, lsl #1
-    ld1             {Y0.h}[3], [T0]
-    ld1             {Y1.h}[3], [T1]
-
-    ldr             T0d, [LUT, #(4 * 4)]
-    ldr             T1d, [LUT, #(12 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    add             T1, BLOCK, T1, lsl #1
-    ld1             {Y0.h}[4], [T0]
-    ld1             {Y1.h}[4], [T1]
-
-    ldr             T0d, [LUT, #(5 * 4)]
-    ldr             T1d, [LUT, #(13 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    add             T1, BLOCK, T1, lsl #1
-    ld1             {Y0.h}[5], [T0]
-    ld1             {Y1.h}[5], [T1]
-
-    ldr             T0d, [LUT, #(6 * 4)]
-    ldr             T1d, [LUT, #(14 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    add             T1, BLOCK, T1, lsl #1
-    ld1             {Y0.h}[6], [T0]
-    ld1             {Y1.h}[6], [T1]
-
-    ldr             T0d, [LUT, #(7 * 4)]
-    ldr             T1d, [LUT, #(15 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    add             T1, BLOCK, T1, lsl #1
-    ld1             {Y0.h}[7], [T0]
-    ld1             {Y1.h}[7], [T1]
-
-    add             LUT, LUT, #(16 * 4)
-.endm
-
-.macro LOAD15
-    eor             Y1.16b, Y1.16b, Y1.16b
-
-    ldr             T0d, [LUT, #(0 * 4)]
-    ldr             T1d, [LUT, #(8 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    add             T1, BLOCK, T1, lsl #1
-    ld1             {Y0.h}[0], [T0]
-    ld1             {Y1.h}[0], [T1]
-
-    ldr             T0d, [LUT, #(1 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    ld1             {Y0.h}[1], [T0]
-
-    ldr             T0d, [LUT, #(2 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    ld1             {Y0.h}[2], [T0]
-
-    ldr             T0d, [LUT, #(3 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    ld1             {Y0.h}[3], [T0]
-
-    ldr             T0d, [LUT, #(4 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    ld1             {Y0.h}[4], [T0]
-
-    ldr             T0d, [LUT, #(5 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    ld1             {Y0.h}[5], [T0]
-
-    ldr             T0d, [LUT, #(6 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    ld1             {Y0.h}[6], [T0]
-
-    ldr             T0d, [LUT, #(7 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    ld1             {Y0.h}[7], [T0]
-
-    cmp             LENEND, #2
-    b.lt            1515f
-    ldr             T1d, [LUT, #(9 * 4)]
-    add             T1, BLOCK, T1, lsl #1
-    ld1             {Y1.h}[1], [T1]
-
-    cmp             LENEND, #3
-    b.lt            1515f
-    ldr             T1d, [LUT, #(10 * 4)]
-    add             T1, BLOCK, T1, lsl #1
-    ld1             {Y1.h}[2], [T1]
-
-    cmp             LENEND, #4
-    b.lt            1515f
-    ldr             T1d, [LUT, #(11 * 4)]
-    add             T1, BLOCK, T1, lsl #1
-    ld1             {Y1.h}[3], [T1]
-
-    cmp             LENEND, #5
-    b.lt            1515f
-    ldr             T1d, [LUT, #(12 * 4)]
-    add             T1, BLOCK, T1, lsl #1
-    ld1             {Y1.h}[4], [T1]
-
-    cmp             LENEND, #6
-    b.lt            1515f
-    ldr             T1d, [LUT, #(13 * 4)]
-    add             T1, BLOCK, T1, lsl #1
-    ld1             {Y1.h}[5], [T1]
-
-    cmp             LENEND, #7
-    b.lt            1515f
-    ldr             T1d, [LUT, #(14 * 4)]
-    add             T1, BLOCK, T1, lsl #1
-    ld1             {Y1.h}[6], [T1]
-
-1515:
-.endm
-
-.macro LOAD8
-    ldr             T0d, [LUT, #(0 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    ld1             {Y0.h}[0], [T0]
-
-    ldr             T0d, [LUT, #(1 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    ld1             {Y0.h}[1], [T0]
-
-    ldr             T0d, [LUT, #(2 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    ld1             {Y0.h}[2], [T0]
-
-    ldr             T0d, [LUT, #(3 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    ld1             {Y0.h}[3], [T0]
-
-    ldr             T0d, [LUT, #(4 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    ld1             {Y0.h}[4], [T0]
-
-    ldr             T0d, [LUT, #(5 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    ld1             {Y0.h}[5], [T0]
-
-    ldr             T0d, [LUT, #(6 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    ld1             {Y0.h}[6], [T0]
-
-    ldr             T0d, [LUT, #(7 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    ld1             {Y0.h}[7], [T0]
-.endm
-
-.macro LOAD7
-    eor             Y0.16b, Y0.16b, Y0.16b
-
-    ldr             T0d, [LUT, #(0 * 4)]
-    add             T0, BLOCK, T0, lsl #1
-    ld1             {Y0.h}[0], [T0]
-
-    cmp             LENEND, #2
-    b.lt            77f
-    ldr             T1d, [LUT, #(1 * 4)]
-    add             T1, BLOCK, T1, lsl #1
-    ld1             {Y0.h}[1], [T1]
-
-    cmp             LENEND, #3
-    b.lt            77f
-    ldr             T1d, [LUT, #(2 * 4)]
-    add             T1, BLOCK, T1, lsl #1
-    ld1             {Y0.h}[2], [T1]
-
-    cmp             LENEND, #4
-    b.lt            77f
-    ldr             T1d, [LUT, #(3 * 4)]
-    add             T1, BLOCK, T1, lsl #1
-    ld1             {Y0.h}[3], [T1]
-
-    cmp             LENEND, #5
-    b.lt            77f
-    ldr             T1d, [LUT, #(4 * 4)]
-    add             T1, BLOCK, T1, lsl #1
-    ld1             {Y0.h}[4], [T1]
-
-    cmp             LENEND, #6
-    b.lt            77f
-    ldr             T1d, [LUT, #(5 * 4)]
-    add             T1, BLOCK, T1, lsl #1
-    ld1             {Y0.h}[5], [T1]
-
-    cmp             LENEND, #7
-    b.lt            77f
-    ldr             T1d, [LUT, #(6 * 4)]
-    add             T1, BLOCK, T1, lsl #1
-    ld1             {Y0.h}[6], [T1]
-
-77:
-.endm
-
-.macro REDUCE0
-    ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [VALUES], #64
-    ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [VALUES], #64
-
-    cmeq            v0.8h, v0.8h, #0
-    cmeq            v1.8h, v1.8h, #0
-    cmeq            v2.8h, v2.8h, #0
-    cmeq            v3.8h, v3.8h, #0
-    cmeq            v4.8h, v4.8h, #0
-    cmeq            v5.8h, v5.8h, #0
-    cmeq            v6.8h, v6.8h, #0
-    cmeq            v7.8h, v7.8h, #0
-
-    xtn             v0.8b, v0.8h
-    xtn             v2.8b, v2.8h
-    xtn             v4.8b, v4.8h
-    xtn             v6.8b, v6.8h
-    xtn2            v0.16b, v1.8h
-    xtn2            v2.16b, v3.8h
-    xtn2            v4.16b, v5.8h
-    xtn2            v6.16b, v7.8h
-
-    and             v0.16b, v0.16b, ANDMASK.16b
-    and             v2.16b, v2.16b, ANDMASK.16b
-    and             v4.16b, v4.16b, ANDMASK.16b
-    and             v6.16b, v6.16b, ANDMASK.16b
-    addp            v0.16b, v0.16b, v2.16b
-    addp            v4.16b, v4.16b, v6.16b
-    addp            v0.16b, v0.16b, v4.16b
-    addp            v0.16b, v0.16b, v0.16b
-    umov            T0, v0.D[0]
-    mvn             T0, T0
-    str             T0, [BITS]
-.endm
-
-/*
- * Prepare data for jsimd_encode_mcu_AC_first().
- *
- * GLOBAL(int)
- * jsimd_encode_mcu_AC_first_prepare_neon(const JCOEF *block,
- *                                        const int *jpeg_natural_order_start,
- *                                        int Sl, int Al, JCOEF *values,
- *                                        size_t *zerobits)
- *
- * x0 = const JCOEF *block
- * x1 = const int *jpeg_natural_order_start
- * w2 = int Sl
- * w3 = int Al
- * x4 = JCOEF *values
- * x5 = size_t *zerobits
- *
- */
-
-    ZERO            .req v0
-    Y0              .req v2
-    Y1              .req v3
-    N0              .req v4
-    N1              .req v5
-    AL              .req v6
-    ANDMASK         .req v20
-    K               .req w12
-    LUT             .req x1
-    T0              .req x10
-    T0d             .req w10
-    T1              .req x11
-    T1d             .req w11
-    BLOCK           .req x0
-    VALUES          .req x4
-    XORVALUES       .req x14
-    LEN             .req w2
-    LENEND          .req w9
-    BITS            .req x5
-
-asm_function jsimd_encode_mcu_AC_first_prepare_neon
-    get_symbol_loc  T0, Ljsimd_encode_mcu_AC_first_prepare_neon_consts
-    neg             w3, w3                        /* Al = -Al */
-    eor             ZERO.16b, ZERO.16b, ZERO.16b
-    ld1             {ANDMASK.16b}, [T0]
-    dup             AL.8h, w3
-    add             XORVALUES, VALUES, #(/*DCTSIZE2*/ 64 * 2)
-    and             LENEND, LEN, 7
-    lsr             K, LEN, 4
-    cbz             K, 3f
-1:
-    LOAD16
-    cmlt            N0.8h, Y0.8h, #0
-    cmlt            N1.8h, Y1.8h, #0
-    abs             Y0.8h, Y0.8h
-    abs             Y1.8h, Y1.8h
-    ushl            Y0.8h, Y0.8h, AL.8h
-    ushl            Y1.8h, Y1.8h, AL.8h
-    eor             N0.16b, N0.16b, Y0.16b
-    eor             N1.16b, N1.16b, Y1.16b
-    st1             {Y0.8h, Y1.8h}, [VALUES], #32
-    st1             {N0.8h, N1.8h}, [XORVALUES], #32
-    subs            K, K, #1
-    b.ne            1b
-3:
-    tst             LEN, #8
-    b.eq            3f
-    tst             LEN, #7
-    b.eq            2f
-
-    LOAD15
-    cmlt            N0.8h, Y0.8h, #0
-    cmlt            N1.8h, Y1.8h, #0
-    abs             Y0.8h, Y0.8h
-    abs             Y1.8h, Y1.8h
-    ushl            Y0.8h, Y0.8h, AL.8h
-    ushl            Y1.8h, Y1.8h, AL.8h
-    eor             N0.16b, N0.16b, Y0.16b
-    eor             N1.16b, N1.16b, Y1.16b
-    st1             {Y0.8h, Y1.8h}, [VALUES], #32
-    st1             {N0.8h, N1.8h}, [XORVALUES], #32
-    b               4f
-2:
-    LOAD8
-    cmlt            N0.8h, Y0.8h, #0
-    abs             Y0.8h, Y0.8h
-    ushl            Y0.8h, Y0.8h, AL.8h
-    eor             N0.16b, N0.16b, Y0.16b
-    st1             {Y0.8h}, [VALUES], #16
-    st1             {N0.8h}, [XORVALUES], #16
-    b               4f
-3:
-    cbz             LENEND, 4f
-    LOAD7
-    cmlt            N0.8h, Y0.8h, #0
-    abs             Y0.8h, Y0.8h
-    ushl            Y0.8h, Y0.8h, AL.8h
-    eor             N0.16b, N0.16b, Y0.16b
-    st1             {Y0.8h}, [VALUES], #16
-    st1             {N0.8h}, [XORVALUES], #16
-    /* b               4f */
-    /* fallthrough */
-4:
-    add             K, LEN, #7
-    lsr             K, K, #3
-    subs            K, K, #(/*DCTSIZE2*/ 64 / 8)
-    b.eq            5f
-1:
-    st1             {ZERO.8h}, [VALUES], #16
-    st1             {ZERO.8h}, [XORVALUES], #16
-    adds            K, K, #1
-    b.ne            1b
-5:
-    sub             VALUES, VALUES, #(/*DCTSIZE2*/ 64 * 2)
-
-    REDUCE0
-
-    br              x30
-
-    .unreq          ZERO
-    .unreq          Y0
-    .unreq          Y1
-    .unreq          N0
-    .unreq          N1
-    .unreq          AL
-    .unreq          ANDMASK
-    .unreq          K
-    .unreq          LUT
-    .unreq          T0
-    .unreq          T0d
-    .unreq          T1
-    .unreq          T1d
-    .unreq          BLOCK
-    .unreq          VALUES
-    .unreq          XORVALUES
-    .unreq          LEN
-    .unreq          LENEND
-    .unreq          BITS
-
-/*
- * Prepare data for jsimd_encode_mcu_AC_refine.
- *
- * GLOBAL(int)
- * jsimd_encode_mcu_AC_refine_prepare_neon(const JCOEF *block,
- *                                         const int *jpeg_natural_order_start,
- *                                         int Sl, int Al, JCOEF *absvalues,
- *                                         size_t *bits)
- *
- * x0 = const JCOEF *block
- * x1 = const int *jpeg_natural_order_start
- * w2 = int Sl
- * w3 = int Al
- * x4 = JCOEF *absvalues
- * x5 = size_t *bits
- *
- */
-
-    ZERO            .req v0
-    ONE             .req v1
-    Y0              .req v2
-    Y1              .req v3
-    N0              .req v4
-    N1              .req v5
-    AL              .req v6
-    ANDMASK         .req v20
-    K               .req w12
-    KK              .req w13
-    EOB             .req w14
-    SIGN            .req x15
-    LUT             .req x1
-    T0              .req x10
-    T0d             .req w10
-    T1              .req x11
-    T1d             .req w11
-    BLOCK           .req x0
-    VALUES          .req x4
-    LEN             .req w2
-    LENEND          .req w9
-    BITS            .req x5
-
-asm_function jsimd_encode_mcu_AC_refine_prepare_neon
-    get_symbol_loc  T0, Ljsimd_encode_mcu_AC_refine_prepare_neon_consts
-    neg             w3, w3                        /* Al = -Al */
-    movi            ONE.8h, #1
-    eor             SIGN, SIGN, SIGN
-    eor             ZERO.16b, ZERO.16b, ZERO.16b
-    eor             EOB, EOB, EOB
-    ld1             {ANDMASK.16b}, [T0]
-    eor             KK, KK, KK
-    dup             AL.8h, w3
-    and             LENEND, LEN, 7
-    lsr             K, LEN, 4
-    cbz             K, 3f
-1:
-    LOAD16
-    cmlt            N0.8h, Y0.8h, #0
-    cmlt            N1.8h, Y1.8h, #0
-    abs             Y0.8h, Y0.8h
-    abs             Y1.8h, Y1.8h
-    ushl            Y0.8h, Y0.8h, AL.8h
-    ushl            Y1.8h, Y1.8h, AL.8h
-    st1             {Y0.8h, Y1.8h}, [VALUES], #32
-    xtn             N0.8b, N0.8h
-    xtn             N1.8b, N1.8h
-    cmeq            Y0.8h, Y0.8h, ONE.8h
-    cmeq            Y1.8h, Y1.8h, ONE.8h
-    xtn             Y0.8b, Y0.8h
-    xtn             Y1.8b, Y1.8h
-    and             N0.8b, N0.8b, ANDMASK.8b
-    and             N1.8b, N1.8b, ANDMASK.8b
-    and             Y0.8b, Y0.8b, ANDMASK.8b
-    and             Y1.8b, Y1.8b, ANDMASK.8b
-    addv            B28, N0.8b
-    addv            B29, N1.8b
-    addv            B30, Y0.8b
-    addv            B31, Y1.8b
-    ins             v28.b[1], v29.b[0]
-    ins             v30.b[1], v31.b[0]
-    umov            T0d, v28.h[0]    /* lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); */
-    umov            T1d, v30.h[0]    /* idx = _mm_movemask_epi8(x1); */
-    lsr             SIGN, SIGN, #16  /* make room for sizebits */
-    orr             SIGN, SIGN, T0, lsl #48
-    cbz             T1d, 2f
-    rbit            T1d, T1d
-    clz             T1d, T1d
-    add             EOB, KK, T1d     /* EOB = k + idx; */
-2:
-    add             KK, KK, #16
-    subs            K, K, #1
-    b.ne            1b
-3:
-    tst             LEN, #8
-    b.eq            3f
-    tst             LEN, #7
-    b.eq            2f
-
-    LOAD15
-    cmlt            N0.8h, Y0.8h, #0
-    cmlt            N1.8h, Y1.8h, #0
-    abs             Y0.8h, Y0.8h
-    abs             Y1.8h, Y1.8h
-    ushl            Y0.8h, Y0.8h, AL.8h
-    ushl            Y1.8h, Y1.8h, AL.8h
-    st1             {Y0.8h, Y1.8h}, [VALUES], #32
-    xtn             N0.8b, N0.8h
-    xtn             N1.8b, N1.8h
-    cmeq            Y0.8h, Y0.8h, ONE.8h
-    cmeq            Y1.8h, Y1.8h, ONE.8h
-    xtn             Y0.8b, Y0.8h
-    xtn             Y1.8b, Y1.8h
-    and             N0.8b, N0.8b, ANDMASK.8b
-    and             N1.8b, N1.8b, ANDMASK.8b
-    and             Y0.8b, Y0.8b, ANDMASK.8b
-    and             Y1.8b, Y1.8b, ANDMASK.8b
-    addv            B28, N0.8b
-    addv            B29, N1.8b
-    addv            B30, Y0.8b
-    addv            B31, Y1.8b
-    ins             v28.b[1], v29.b[0]
-    ins             v30.b[1], v31.b[0]
-    umov            T0d, v28.h[0]    /* lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); */
-    umov            T1d, v30.h[0]    /* idx = _mm_movemask_epi8(x1); */
-    lsr             SIGN, SIGN, #16  /* make room for sizebits */
-    orr             SIGN, SIGN, T0, lsl #48
-    cbz             T1d, 4f
-    rbit            T1d, T1d
-    clz             T1d, T1d
-    add             EOB, KK, T1d     /* EOB = k + idx; */
-    b               4f
-2:
-    LOAD8
-    cmlt            N0.8h, Y0.8h, #0
-    abs             Y0.8h, Y0.8h
-    ushl            Y0.8h, Y0.8h, AL.8h
-    st1             {Y0.8h}, [VALUES], #16
-    xtn             N0.8b, N0.8h
-    cmeq            Y0.8h, Y0.8h, ONE.8h
-    xtn             Y0.8b, Y0.8h
-    and             N0.8b, N0.8b, ANDMASK.8b
-    and             Y0.8b, Y0.8b, ANDMASK.8b
-    addv            B28, N0.8b
-    addv            B30, Y0.8b
-    umov            T0d, v28.b[0]    /* lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); */
-    umov            T1d, v30.b[0]    /* idx = _mm_movemask_epi8(x1); */
-    lsr             SIGN, SIGN, #8   /* make room for sizebits */
-    orr             SIGN, SIGN, T0, lsl #56
-    cbz             T1d, 4f
-    rbit            T1d, T1d
-    clz             T1d, T1d
-    add             EOB, KK, T1d     /* EOB = k + idx; */
-    b               4f
-3:
-    cbz             LENEND, 4f
-    LOAD7
-    cmlt            N0.8h, Y0.8h, #0
-    abs             Y0.8h, Y0.8h
-    ushl            Y0.8h, Y0.8h, AL.8h
-    st1             {Y0.8h}, [VALUES], #16
-    xtn             N0.8b, N0.8h
-    cmeq            Y0.8h, Y0.8h, ONE.8h
-    xtn             Y0.8b, Y0.8h
-    and             N0.8b, N0.8b, ANDMASK.8b
-    and             Y0.8b, Y0.8b, ANDMASK.8b
-    addv            B28, N0.8b
-    addv            B30, Y0.8b
-    umov            T0d, v28.b[0]    /* lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); */
-    umov            T1d, v30.b[0]    /* idx = _mm_movemask_epi8(x1); */
-    lsr             SIGN, SIGN, #8   /* make room for sizebits */
-    orr             SIGN, SIGN, T0, lsl #56
-    cbz             T1d, 4f
-    rbit            T1d, T1d
-    clz             T1d, T1d
-    add             EOB, KK, T1d     /* EOB = k + idx; */
-    /* b               4f */
-    /* fallthrough */
-4:
-    add             K, LEN, #7
-    lsr             K, K, #3
-    subs            K, K, #(/*DCTSIZE2*/ 64 / 8)
-    b.eq            5f
-1:
-    st1             {ZERO.8h}, [VALUES], #16
-    lsr             SIGN, SIGN, #8
-    adds            K, K, #1
-    b.ne            1b
-5:
-    mvn             SIGN, SIGN
-    sub             VALUES, VALUES, #(/*DCTSIZE2*/ 64 * 2)
-    str             SIGN, [BITS, #8]
-
-    REDUCE0
-
-    mov             w0, EOB
-    br              x30
-
-    .unreq          ZERO
-    .unreq          ONE
-    .unreq          Y0
-    .unreq          Y1
-    .unreq          N0
-    .unreq          N1
-    .unreq          AL
-    .unreq          ANDMASK
-    .unreq          K
-    .unreq          KK
-    .unreq          EOB
-    .unreq          SIGN
-    .unreq          LUT
-    .unreq          T0
-    .unreq          T0d
-    .unreq          T1
-    .unreq          T1d
-    .unreq          BLOCK
-    .unreq          VALUES
-    .unreq          LEN
-    .unreq          LENEND
-    .unreq          BITS
-
-.purgem LOAD16
-.purgem LOAD15
-.purgem LOAD8
-.purgem LOAD7
-.purgem REDUCE0
diff --git a/simd/arm/jcphuff-neon.c b/simd/arm/jcphuff-neon.c
new file mode 100644
index 0000000..61f94c2
--- /dev/null
+++ b/simd/arm/jcphuff-neon.c
@@ -0,0 +1,588 @@
+/*
+ * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* Data preparation for encode_mcu_AC_first().
+ *
+ * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
+ * found in jcphuff.c.
+ */
+
+void jsimd_encode_mcu_AC_first_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *values, size_t *zerobits)
+{
+  JCOEF *values_ptr = values;
+  JCOEF *diff_values_ptr = values + DCTSIZE2;
+
+  /* Rows of coefficients to zero (since they haven't been processed) */
+  int i, rows_to_zero = 8;
+
+  for (i = 0; i < Sl / 16; i++) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+    /* Isolate sign of coefficients. */
+    int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
+    int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+
+    /* Compute diff values. */
+    int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
+    int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+
+    /* Store transformed coefficients and diff values. */
+    vst1q_s16(values_ptr, coefs1);
+    vst1q_s16(values_ptr + DCTSIZE, coefs2);
+    vst1q_s16(diff_values_ptr, diff1);
+    vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+    values_ptr += 16;
+    diff_values_ptr += 16;
+    jpeg_natural_order_start += 16;
+    rows_to_zero -= 2;
+  }
+
+  /* Same operation but for remaining partial vector */
+  int remaining_coefs = Sl % 16;
+  if (remaining_coefs > 8) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vdupq_n_s16(0);
+    switch (remaining_coefs) {
+    case 15:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+    case 14:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+    case 13:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+    case 12:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+    case 11:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+    case 10:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+    case 9:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+    default:
+      break;
+    }
+
+    /* Isolate sign of coefficients. */
+    int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
+    int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+
+    /* Compute diff values. */
+    int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
+    int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+
+    /* Store transformed coefficients and diff values. */
+    vst1q_s16(values_ptr, coefs1);
+    vst1q_s16(values_ptr + DCTSIZE, coefs2);
+    vst1q_s16(diff_values_ptr, diff1);
+    vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+    values_ptr += 16;
+    diff_values_ptr += 16;
+    rows_to_zero -= 2;
+
+  } else if (remaining_coefs > 0) {
+    int16x8_t coefs = vdupq_n_s16(0);
+
+    switch (remaining_coefs) {
+    case 8:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+    case 7:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+    case 6:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+    case 5:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+    case 4:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+    case 3:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+    case 2:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+    case 1:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+    default:
+      break;
+    }
+
+    /* Isolate sign of coefficients. */
+    int16x8_t sign_coefs = vshrq_n_s16(coefs, 15);
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs = vabsq_s16(coefs);
+    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
+
+    /* Compute diff values. */
+    int16x8_t diff = veorq_s16(coefs, sign_coefs);
+
+    /* Store transformed coefficients and diff values. */
+    vst1q_s16(values_ptr, coefs);
+    vst1q_s16(diff_values_ptr, diff);
+    values_ptr += 8;
+    diff_values_ptr += 8;
+    rows_to_zero--;
+  }
+
+  /* Zero remaining memory in the values and diff_values blocks. */
+  for (i = 0; i < rows_to_zero; i++) {
+    vst1q_s16(values_ptr, vdupq_n_s16(0));
+    vst1q_s16(diff_values_ptr, vdupq_n_s16(0));
+    values_ptr += 8;
+    diff_values_ptr += 8;
+  }
+
+  /* Construct zerobits bitmap.  A set bit means that the corresponding
+   * coefficient != 0.
+   */
+  int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE);
+  int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE);
+  int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE);
+  int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE);
+  int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE);
+  int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE);
+  int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE);
+  int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE);
+
+  uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0)));
+  uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0)));
+  uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0)));
+  uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0)));
+  uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0)));
+  uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0)));
+  uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0)));
+  uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0)));
+
+  const uint8x8_t bitmap_mask =
+    { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
+
+  row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
+  row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
+  row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
+  row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
+  row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
+  row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
+  row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
+  row7_eq0 = vand_u8(row7_eq0, bitmap_mask);
+
+  uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
+  uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
+  uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
+  uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
+  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__)
+  /* Move bitmap to a 64-bit scalar register. */
+  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+  /* Store zerobits bitmap. */
+  *zerobits = ~bitmap;
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+  /* Store zerobits bitmap. */
+  zerobits[0] = ~bitmap0;
+  zerobits[1] = ~bitmap1;
+#endif
+}
+
+
+/* Data preparation for encode_mcu_AC_refine().
+ *
+ * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
+ * found in jcphuff.c.
+ */
+
+int jsimd_encode_mcu_AC_refine_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *absvalues, size_t *bits)
+{
+  /* Temporary storage buffers for data used to compute the signbits bitmap and
+   * the end-of-block (EOB) position
+   */
+  uint8_t coef_sign_bits[64];
+  uint8_t coef_eq1_bits[64];
+
+  JCOEF *absvalues_ptr = absvalues;
+  uint8_t *coef_sign_bits_ptr = coef_sign_bits;
+  uint8_t *eq1_bits_ptr = coef_eq1_bits;
+
+  /* Rows of coefficients to zero (since they haven't been processed) */
+  int i, rows_to_zero = 8;
+
+  for (i = 0; i < Sl / 16; i++) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+    /* Compute and store data for signbits bitmap. */
+    uint8x8_t sign_coefs1 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+    uint8x8_t sign_coefs2 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+    vst1q_s16(absvalues_ptr, coefs1);
+    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+
+    /* Test whether transformed coefficient values == 1 (used to find EOB
+     * position.)
+     */
+    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
+    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+    vst1_u8(eq1_bits_ptr, coefs_eq11);
+    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+    absvalues_ptr += 16;
+    coef_sign_bits_ptr += 16;
+    eq1_bits_ptr += 16;
+    jpeg_natural_order_start += 16;
+    rows_to_zero -= 2;
+  }
+
+  /* Same operation but for remaining partial vector */
+  int remaining_coefs = Sl % 16;
+  if (remaining_coefs > 8) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vdupq_n_s16(0);
+    switch (remaining_coefs) {
+    case 15:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+    case 14:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+    case 13:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+    case 12:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+    case 11:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+    case 10:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+    case 9:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+    default:
+      break;
+    }
+
+    /* Compute and store data for signbits bitmap. */
+    uint8x8_t sign_coefs1 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+    uint8x8_t sign_coefs2 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+    vst1q_s16(absvalues_ptr, coefs1);
+    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+
+    /* Test whether transformed coefficient values == 1 (used to find EOB
+     * position.)
+     */
+    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
+    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+    vst1_u8(eq1_bits_ptr, coefs_eq11);
+    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+    absvalues_ptr += 16;
+    coef_sign_bits_ptr += 16;
+    eq1_bits_ptr += 16;
+    jpeg_natural_order_start += 16;
+    rows_to_zero -= 2;
+
+  } else if (remaining_coefs > 0) {
+    int16x8_t coefs = vdupq_n_s16(0);
+
+    switch (remaining_coefs) {
+    case 8:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+    case 7:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+    case 6:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+    case 5:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+    case 4:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+    case 3:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+    case 2:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+    case 1:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+    default:
+      break;
+    }
+
+    /* Compute and store data for signbits bitmap. */
+    uint8x8_t sign_coefs =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
+    vst1_u8(coef_sign_bits_ptr, sign_coefs);
+
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs = vabsq_s16(coefs);
+    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
+    vst1q_s16(absvalues_ptr, coefs);
+
+    /* Test whether transformed coefficient values == 1 (used to find EOB
+     * position.)
+     */
+    uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1)));
+    vst1_u8(eq1_bits_ptr, coefs_eq1);
+
+    absvalues_ptr += 8;
+    coef_sign_bits_ptr += 8;
+    eq1_bits_ptr += 8;
+    rows_to_zero--;
+  }
+
+  /* Zero remaining memory in blocks. */
+  for (i = 0; i < rows_to_zero; i++) {
+    vst1q_s16(absvalues_ptr, vdupq_n_s16(0));
+    vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
+    vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
+    absvalues_ptr += 8;
+    coef_sign_bits_ptr += 8;
+    eq1_bits_ptr += 8;
+  }
+
+  /* Construct zerobits bitmap. */
+  int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE);
+  int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE);
+  int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE);
+  int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE);
+  int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE);
+  int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE);
+  int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE);
+  int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE);
+
+  uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0)));
+  uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0)));
+  uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0)));
+  uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0)));
+  uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0)));
+  uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0)));
+  uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0)));
+  uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0)));
+
+  const uint8x8_t bitmap_mask =
+    { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
+
+  abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
+  abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
+  abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
+  abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
+  abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
+  abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
+  abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
+  abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);
+
+  uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
+  uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
+  uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
+  uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
+  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__)
+  /* Move bitmap to a 64-bit scalar register. */
+  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+  /* Store zerobits bitmap. */
+  bits[0] = ~bitmap;
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+  /* Store zerobits bitmap. */
+  bits[0] = ~bitmap0;
+  bits[1] = ~bitmap1;
+#endif
+
+  /* Construct signbits bitmap. */
+  uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
+  uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
+  uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
+  uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
+  uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
+  uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
+  uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
+  uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);
+
+  signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
+  signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
+  signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
+  signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
+  signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
+  signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
+  signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
+  signbits_row7 = vand_u8(signbits_row7, bitmap_mask);
+
+  bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
+  bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
+  bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
+  bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
+  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__)
+  /* Move bitmap to a 64-bit scalar register. */
+  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+  /* Store signbits bitmap. */
+  bits[1] = ~bitmap;
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+  /* Store signbits bitmap. */
+  bits[2] = ~bitmap0;
+  bits[3] = ~bitmap1;
+#endif
+
+  /* Construct bitmap to find EOB position (the index of the last coefficient
+   * equal to 1.)
+   */
+  uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
+  uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
+  uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
+  uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
+  uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
+  uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
+  uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
+  uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);
+
+  row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
+  row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
+  row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
+  row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
+  row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
+  row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
+  row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
+  row7_eq1 = vand_u8(row7_eq1, bitmap_mask);
+
+  bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
+  bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
+  bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
+  bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
+  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__)
+  /* Move bitmap to a 64-bit scalar register. */
+  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+
+  /* Return EOB position. */
+  if (bitmap == 0) {
+    /* EOB position is defined to be 0 if all coefficients != 1. */
+    return 0;
+  } else {
+    return 63 - __builtin_clzl(bitmap);
+  }
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+
+  /* Return EOB position. */
+  if (bitmap0 == 0 && bitmap1 == 0) {
+    return 0;
+  } else if (bitmap1 != 0) {
+    return 63 - __builtin_clz(bitmap1);
+  } else {
+    return 31 - __builtin_clz(bitmap0);
+  }
+#endif
+}