Neon: Intrinsics impl. of h2v1 & h2v2 plain upsamp

There was no previous GAS implementation.

NOTE: This doesn't produce much of a speedup when using -O3, because -O3
already enables Neon autovectorization, which works well for the scalar
C implementation of plain upsampling.  However, the Neon SIMD
implementation will benefit other optimization levels.
diff --git a/simd/arm/aarch32/jsimd.c b/simd/arm/aarch32/jsimd.c
index df13c11..fac55df 100644
--- a/simd/arm/aarch32/jsimd.c
+++ b/simd/arm/aarch32/jsimd.c
@@ -399,12 +399,33 @@
 GLOBAL(int)
 jsimd_can_h2v2_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -412,12 +433,16 @@
 jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
 }
 
 GLOBAL(void)
 jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
 }
 
 GLOBAL(int)
diff --git a/simd/arm/aarch64/jsimd.c b/simd/arm/aarch64/jsimd.c
index 74cd6cc..8570b82 100644
--- a/simd/arm/aarch64/jsimd.c
+++ b/simd/arm/aarch64/jsimd.c
@@ -467,12 +467,33 @@
 GLOBAL(int)
 jsimd_can_h2v2_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -480,12 +501,16 @@
 jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
 }
 
 GLOBAL(void)
 jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
 }
 
 GLOBAL(int)
diff --git a/simd/arm/jdsample-neon.c b/simd/arm/jdsample-neon.c
index b31d6cf..90ec678 100644
--- a/simd/arm/jdsample-neon.c
+++ b/simd/arm/jdsample-neon.c
@@ -465,3 +465,105 @@
     }
   }
 }
+
+
+/* The diagram below shows a row of samples produced by h2v1 downsampling.
+ *
+ *                s0        s1
+ *            +---------+---------+
+ *            |         |         |
+ *            | p0   p1 | p2   p3 |
+ *            |         |         |
+ *            +---------+---------+
+ *
+ * Samples s0 and s1 were created by averaging the original pixel component
+ * values centered at positions p0-p3 above.  To approximate those original
+ * pixel component values, we duplicate the samples horizontally:
+ *     p0(upsampled) = p1(upsampled) = s0
+ *     p2(upsampled) = p3(upsampled) = s1
+ */
+
+void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
+                              JSAMPARRAY input_data,
+                              JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr;
+  int inrow;
+  unsigned colctr;
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+    for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
+      uint8x16_t samples = vld1q_u8(inptr + colctr);
+      /* Duplicate the samples.  The store operation below interleaves them so
+       * that adjacent pixel component values take on the same sample value,
+       * per above.
+       */
+      uint8x16x2_t output_pixels = { { samples, samples } };
+      /* Store pixel component values to memory.
+       * Due to the way sample buffers are allocated, we don't need to worry
+       * about tail cases when output_width is not a multiple of 32.  See
+       * "Creation of 2-D sample arrays" in jmemmgr.c for details.
+       */
+      vst2q_u8(outptr + 2 * colctr, output_pixels);
+    }
+  }
+}
+
+
+/* The diagram below shows an array of samples produced by h2v2 downsampling.
+ *
+ *                s0        s1
+ *            +---------+---------+
+ *            | p0   p1 | p2   p3 |
+ *       sA   |         |         |
+ *            | p4   p5 | p6   p7 |
+ *            +---------+---------+
+ *            | p8   p9 | p10  p11|
+ *       sB   |         |         |
+ *            | p12  p13| p14  p15|
+ *            +---------+---------+
+ *
+ * Samples s0A-s1B were created by averaging the original pixel component
+ * values centered at positions p0-p15 above.  To approximate those original
+ * pixel component values, we duplicate the samples both horizontally and
+ * vertically:
+ *     p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
+ *     p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
+ *     p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
+ *     p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
+ */
+
+void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
+                              JSAMPARRAY input_data,
+                              JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr0, outptr1;
+  int inrow, outrow;
+  unsigned colctr;
+
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
+      uint8x16_t samples = vld1q_u8(inptr + colctr);
+      /* Duplicate the samples.  The store operation below interleaves them so
+       * that adjacent pixel component values take on the same sample value,
+       * per above.
+       */
+      uint8x16x2_t output_pixels = { { samples, samples } };
+      /* Store pixel component values for both output rows to memory.
+       * Due to the way sample buffers are allocated, we don't need to worry
+       * about tail cases when output_width is not a multiple of 32.  See
+       * "Creation of 2-D sample arrays" in jmemmgr.c for details.
+       */
+      vst2q_u8(outptr0 + 2 * colctr, output_pixels);
+      vst2q_u8(outptr1 + 2 * colctr, output_pixels);
+    }
+  }
+}
diff --git a/simd/jsimd.h b/simd/jsimd.h
index ccc9a10..64747c6 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -615,6 +615,13 @@
   (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
    JSAMPARRAY *output_data_ptr);
 
+EXTERN(void) jsimd_h2v1_upsample_neon
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_neon
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
 EXTERN(void) jsimd_h2v1_upsample_dspr2
   (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
    JSAMPARRAY *output_data_ptr);