Neon: Intrinsics implementation of YCbCr->RGB565

The previous AArch64 GAS implementation is retained by default when
using GCC, in order to avoid a performance regression.  The intrinsics
implementation can be forced on or off using the new NEON_INTRINSICS
CMake variable.  The previous AArch32 GAS implementation has been
removed, since the intrinsics implementation provides the same or better
performance.
diff --git a/simd/arm/aarch32/jsimd_neon.S b/simd/arm/aarch32/jsimd_neon.S
index cfebe1b..c45f63c 100644
--- a/simd/arm/aarch32/jsimd_neon.S
+++ b/simd/arm/aarch32/jsimd_neon.S
@@ -1267,282 +1267,6 @@
 .purgem idct_helper
 
 
-/*****************************************************************************/
-
-/*
- * jsimd_ycc_rgb565_convert_neon
- *
- * Colorspace conversion YCbCr -> RGB565
- */
-
-
-.macro do_load size
-  .if \size == 8
-    vld1.8          {d4}, [U, :64]!
-    vld1.8          {d5}, [V, :64]!
-    vld1.8          {d0}, [Y, :64]!
-    pld             [U, #64]
-    pld             [V, #64]
-    pld             [Y, #64]
-  .elseif \size == 4
-    vld1.8          {d4[0]}, [U]!
-    vld1.8          {d4[1]}, [U]!
-    vld1.8          {d4[2]}, [U]!
-    vld1.8          {d4[3]}, [U]!
-    vld1.8          {d5[0]}, [V]!
-    vld1.8          {d5[1]}, [V]!
-    vld1.8          {d5[2]}, [V]!
-    vld1.8          {d5[3]}, [V]!
-    vld1.8          {d0[0]}, [Y]!
-    vld1.8          {d0[1]}, [Y]!
-    vld1.8          {d0[2]}, [Y]!
-    vld1.8          {d0[3]}, [Y]!
-  .elseif \size == 2
-    vld1.8          {d4[4]}, [U]!
-    vld1.8          {d4[5]}, [U]!
-    vld1.8          {d5[4]}, [V]!
-    vld1.8          {d5[5]}, [V]!
-    vld1.8          {d0[4]}, [Y]!
-    vld1.8          {d0[5]}, [Y]!
-  .elseif \size == 1
-    vld1.8          {d4[6]}, [U]!
-    vld1.8          {d5[6]}, [V]!
-    vld1.8          {d0[6]}, [Y]!
-  .else
-    .error unsupported macroblock size
-  .endif
-.endm
-
-.macro do_store bpp, size
-  .if \bpp == 16
-    .if \size == 8
-      vst1.16       {q15}, [RGB]!
-    .elseif \size == 4
-      vst1.16       {d30}, [RGB]!
-    .elseif \size == 2
-      vst1.16       {d31[0]}, [RGB]!
-      vst1.16       {d31[1]}, [RGB]!
-    .elseif \size == 1
-      vst1.16       {d31[2]}, [RGB]!
-    .else
-      .error unsupported macroblock size
-    .endif
-  .else
-    .error unsupported bpp
-  .endif
-.endm
-
-.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
-
-/*
- * 2-stage pipelined YCbCr->RGB conversion
- */
-
-.macro do_yuv_to_rgb_stage1
-    vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
-    vaddw.u8        q4, q1, d5      /* q2 = v - 128 */
-    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
-    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
-    vmull.s16       q11, d7, d1[1]  /* multiply by -11277 */
-    vmlal.s16       q11, d9, d1[2]  /* multiply by -23401 */
-    vmull.s16       q12, d8, d1[0]  /* multiply by 22971 */
-    vmull.s16       q13, d9, d1[0]  /* multiply by 22971 */
-    vmull.s16       q14, d6, d1[3]  /* multiply by 29033 */
-    vmull.s16       q15, d7, d1[3]  /* multiply by 29033 */
-.endm
-
-.macro do_yuv_to_rgb_stage2
-    vrshrn.s32      d20, q10, #15
-    vrshrn.s32      d21, q11, #15
-    vrshrn.s32      d24, q12, #14
-    vrshrn.s32      d25, q13, #14
-    vrshrn.s32      d28, q14, #14
-    vrshrn.s32      d29, q15, #14
-    vaddw.u8        q11, q10, d0
-    vaddw.u8        q12, q12, d0
-    vaddw.u8        q14, q14, d0
-    vqshlu.s16      q13, q11, #8
-    vqshlu.s16      q15, q12, #8
-    vqshlu.s16      q14, q14, #8
-    vsri.u16        q15, q13, #5
-    vsri.u16        q15, q14, #11
-.endm
-
-.macro do_yuv_to_rgb_stage2_store_load_stage1
-                                       /* "do_yuv_to_rgb_stage2" and "store" */
-                                       vrshrn.s32      d20, q10, #15
-    /* "load" and "do_yuv_to_rgb_stage1" */
-    pld             [U, #64]
-                                       vrshrn.s32      d21, q11, #15
-    pld             [V, #64]
-                                       vrshrn.s32      d24, q12, #14
-                                       vrshrn.s32      d25, q13, #14
-    vld1.8          {d4}, [U, :64]!
-                                       vrshrn.s32      d28, q14, #14
-    vld1.8          {d5}, [V, :64]!
-                                       vrshrn.s32      d29, q15, #14
-    vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
-    vaddw.u8        q4, q1, d5      /* q2 = v - 128 */
-                                       vaddw.u8        q11, q10, d0
-    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
-    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
-                                       vaddw.u8        q12, q12, d0
-                                       vaddw.u8        q14, q14, d0
-                                       vqshlu.s16      q13, q11, #8
-    pld             [Y, #64]
-                                       vqshlu.s16      q15, q12, #8
-                                       vqshlu.s16      q14, q14, #8
-    vld1.8          {d0}, [Y, :64]!
-    vmull.s16       q11, d7, d1[1]
-    vmlal.s16       q11, d9, d1[2]
-                                       vsri.u16        q15, q13, #5
-    vmull.s16       q12, d8, d1[0]
-                                       vsri.u16        q15, q14, #11
-    vmull.s16       q13, d9, d1[0]
-    vmull.s16       q14, d6, d1[3]
-                                       do_store        \bpp, 8
-    vmull.s16       q15, d7, d1[3]
-.endm
-
-.macro do_yuv_to_rgb
-    do_yuv_to_rgb_stage1
-    do_yuv_to_rgb_stage2
-.endm
-
-/* Apple gas crashes on adrl, work around that by using adr.
- * But this requires a copy of these constants for each function.
- */
-
-.balign 16
-jsimd_ycc_\colorid\()_neon_consts:
-  .short 0,      0,     0,      0
-  .short 22971, -11277, -23401, 29033
-  .short -128,  -128,   -128,   -128
-  .short -128,  -128,   -128,   -128
-
-asm_function jsimd_ycc_\colorid\()_convert_neon
-    OUTPUT_WIDTH    .req r0
-    INPUT_BUF       .req r1
-    INPUT_ROW       .req r2
-    OUTPUT_BUF      .req r3
-    NUM_ROWS        .req r4
-
-    INPUT_BUF0      .req r5
-    INPUT_BUF1      .req r6
-    INPUT_BUF2      .req INPUT_BUF
-
-    RGB             .req r7
-    Y               .req r8
-    U               .req r9
-    V               .req r10
-    N               .req ip
-
-    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
-    adr             ip, jsimd_ycc_\colorid\()_neon_consts
-    vld1.16         {d0, d1, d2, d3}, [ip, :128]
-
-    /* Save Arm registers and handle input arguments */
-    push            {r4, r5, r6, r7, r8, r9, r10, lr}
-    ldr             NUM_ROWS, [sp, #(4 * 8)]
-    ldr             INPUT_BUF0, [INPUT_BUF]
-    ldr             INPUT_BUF1, [INPUT_BUF, #4]
-    ldr             INPUT_BUF2, [INPUT_BUF, #8]
-    .unreq          INPUT_BUF
-
-    /* Save Neon registers */
-    vpush           {d8 - d15}
-
-    /* Initially set d10, d11, d12, d13 to 0xFF */
-    vmov.u8         q5, #255
-    vmov.u8         q6, #255
-
-    /* Outer loop over scanlines */
-    cmp             NUM_ROWS, #1
-    blt             9f
-0:
-    ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
-    ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]
-    mov             N, OUTPUT_WIDTH
-    ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]
-    add             INPUT_ROW, INPUT_ROW, #1
-    ldr             RGB, [OUTPUT_BUF], #4
-
-    /* Inner loop over pixels */
-    subs            N, N, #8
-    blt             3f
-    do_load         8
-    do_yuv_to_rgb_stage1
-    subs            N, N, #8
-    blt             2f
-1:
-    do_yuv_to_rgb_stage2_store_load_stage1
-    subs            N, N, #8
-    bge             1b
-2:
-    do_yuv_to_rgb_stage2
-    do_store        \bpp, 8
-    tst             N, #7
-    beq             8f
-3:
-    tst             N, #4
-    beq             3f
-    do_load         4
-3:
-    tst             N, #2
-    beq             4f
-    do_load         2
-4:
-    tst             N, #1
-    beq             5f
-    do_load         1
-5:
-    do_yuv_to_rgb
-    tst             N, #4
-    beq             6f
-    do_store        \bpp, 4
-6:
-    tst             N, #2
-    beq             7f
-    do_store        \bpp, 2
-7:
-    tst             N, #1
-    beq             8f
-    do_store        \bpp, 1
-8:
-    subs            NUM_ROWS, NUM_ROWS, #1
-    bgt             0b
-9:
-    /* Restore all registers and return */
-    vpop            {d8 - d15}
-    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
-
-    .unreq          OUTPUT_WIDTH
-    .unreq          INPUT_ROW
-    .unreq          OUTPUT_BUF
-    .unreq          NUM_ROWS
-    .unreq          INPUT_BUF0
-    .unreq          INPUT_BUF1
-    .unreq          INPUT_BUF2
-    .unreq          RGB
-    .unreq          Y
-    .unreq          U
-    .unreq          V
-    .unreq          N
-
-.purgem do_yuv_to_rgb
-.purgem do_yuv_to_rgb_stage1
-.purgem do_yuv_to_rgb_stage2
-.purgem do_yuv_to_rgb_stage2_store_load_stage1
-
-.endm
-
-/*--------------------------------- id ----- bpp R  G  B */
-generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, 0, 0
-
-.purgem do_load
-.purgem do_store
-
-
 #ifndef NEON_INTRINSICS
 
 /*****************************************************************************/
diff --git a/simd/arm/aarch64/jsimd_neon.S b/simd/arm/aarch64/jsimd_neon.S
index dd97aae..335b0f5 100644
--- a/simd/arm/aarch64/jsimd_neon.S
+++ b/simd/arm/aarch64/jsimd_neon.S
@@ -136,6 +136,8 @@
   .short -FIX_1_272758580  /* v14[2] */
   .short FIX_3_624509785   /* v14[3] */
 
+#ifndef NEON_INTRINSICS
+
 /* Constants for jsimd_ycc_*_neon() */
 
 .balign 16
@@ -145,8 +147,6 @@
   .short -128,  -128,   -128,   -128
   .short -128,  -128,   -128,   -128
 
-#ifndef NEON_INTRINSICS
-
 /* Constants for jsimd_*_ycc_neon() */
 
 .balign 16
@@ -1570,6 +1570,8 @@
 .purgem idct_helper
 
 
+#ifndef NEON_INTRINSICS
+
 /*****************************************************************************/
 
 /*
@@ -1929,27 +1931,21 @@
 .endm
 
 /*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize fast_st3*/
-#ifndef NEON_INTRINSICS
 generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
 generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,  2, .4h,  1, .4h,  .8b,    1
 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,  2, .4h,  3, .4h,  .8b,    1
-#endif
 generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,  0, .4h,  0, .4h,  .8b,    1
 
-#ifndef NEON_INTRINSICS
 generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    0
 generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    0
-#endif
 
 .purgem do_load
 .purgem do_store
 
 
-#ifndef NEON_INTRINSICS
-
 /*****************************************************************************/
 
 /*
diff --git a/simd/arm/jdcolext-neon.c b/simd/arm/jdcolext-neon.c
index 5e4baa9..ae440f4 100644
--- a/simd/arm/jdcolext-neon.c
+++ b/simd/arm/jdcolext-neon.c
@@ -135,7 +135,7 @@
         vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h),
                                        vget_high_u8(y)));
 
-#ifdef RGB_ALPHA
+#if RGB_PIXELSIZE == 4
       uint8x16x4_t rgba;
       /* Convert each component to unsigned and narrow, clamping to [0-255]. */
       rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
@@ -145,7 +145,7 @@
       rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
       /* Store RGBA pixel data to memory. */
       vst4q_u8(outptr, rgba);
-#else
+#elif RGB_PIXELSIZE == 3
       uint8x16x3_t rgb;
       /* Convert each component to unsigned and narrow, clamping to [0-255]. */
       rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
@@ -153,7 +153,19 @@
       rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
       /* Store RGB pixel data to memory. */
       vst3q_u8(outptr, rgb);
+#else
+      /* Pack R, G, and B values in ratio 5:6:5. */
+      uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8);
+      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5);
+      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11);
+      uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8);
+      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5);
+      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11);
+      /* Store RGB pixel data to memory. */
+      vst1q_u16((uint16_t *)outptr, rgb565_l);
+      vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h);
 #endif
+
       /* Increment pointers. */
       inptr0 += 16;
       inptr1 += 16;
@@ -192,7 +204,7 @@
       int16x8_t g =
         vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
 
-#ifdef RGB_ALPHA
+#if RGB_PIXELSIZE == 4
       uint8x8x4_t rgba;
       /* Convert each component to unsigned and narrow, clamping to [0-255]. */
       rgba.val[RGB_RED] = vqmovun_s16(r);
@@ -202,7 +214,7 @@
       rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
       /* Store RGBA pixel data to memory. */
       vst4_u8(outptr, rgba);
-#else
+#elif RGB_PIXELSIZE == 3
       uint8x8x3_t rgb;
       /* Convert each component to unsigned and narrow, clamping to [0-255]. */
       rgb.val[RGB_RED] = vqmovun_s16(r);
@@ -210,7 +222,15 @@
       rgb.val[RGB_BLUE] = vqmovun_s16(b);
       /* Store RGB pixel data to memory. */
       vst3_u8(outptr, rgb);
+#else
+      /* Pack R, G, and B values in ratio 5:6:5. */
+      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+      /* Store RGB pixel data to memory. */
+      vst1q_u16((uint16_t *)outptr, rgb565);
 #endif
+
       /* Increment pointers. */
       inptr0 += 8;
       inptr1 += 8;
@@ -251,7 +271,7 @@
       int16x8_t g =
         vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
 
-#ifdef RGB_ALPHA
+#if RGB_PIXELSIZE == 4
       uint8x8x4_t rgba;
       /* Convert each component to unsigned and narrow, clamping to [0-255]. */
       rgba.val[RGB_RED] = vqmovun_s16(r);
@@ -278,7 +298,7 @@
       default:
         break;
       }
-#else
+#elif RGB_PIXELSIZE == 3
       uint8x8x3_t rgb;
       /* Convert each component to unsigned and narrow, clamping to [0-255]. */
       rgb.val[RGB_RED] = vqmovun_s16(r);
@@ -303,6 +323,30 @@
       default:
         break;
       }
+#else
+      /* Pack R, G, and B values in ratio 5:6:5. */
+      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+      /* Store RGB565 pixel data to memory. */
+      switch (cols_remaining) {
+      case 7:
+        vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
+      case 6:
+        vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
+      case 5:
+        vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
+      case 4:
+        vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
+      case 3:
+        vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
+      case 2:
+        vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
+      case 1:
+        vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
+      default:
+        break;
+      }
 #endif
     }
   }
diff --git a/simd/arm/jdcolor-neon.c b/simd/arm/jdcolor-neon.c
index abbef8c..28dbc57 100644
--- a/simd/arm/jdcolor-neon.c
+++ b/simd/arm/jdcolor-neon.c
@@ -131,3 +131,11 @@
 #undef RGB_ALPHA
 #undef RGB_PIXELSIZE
 #undef jsimd_ycc_rgb_convert_neon
+
+/* YCbCr -> RGB565 Conversion */
+
+#define RGB_PIXELSIZE  2
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_rgb565_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon