Neon: Intrinsics impl. of h1v2 fancy upsamling

There was no previous GAS implementation.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6c11f03..8dac532 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -720,6 +720,8 @@
   set(MD5_PPM_RGB_ISLOW f3301d2219783b8b3d942b7239fa50c0)
   set(MD5_JPEG_422_IFAST_OPT 7322e3bd2f127f7de4b40d4480ce60e4)
   set(MD5_PPM_422_IFAST 79807fa552899e66a04708f533e16950)
+  set(MD5_JPEG_440_ISLOW e25c1912e38367be505a89c410c1c2d2)
+  set(MD5_PPM_440_ISLOW e7d2e26288870cfcb30f3114ad01e380)
   set(MD5_PPM_422M_IFAST 07737bfe8a7c1c87aaa393a0098d16b0)
   set(MD5_JPEG_420_IFAST_Q100_PROG 008ab68d6ddbba04a8f01deee4e0f9f8)
   set(MD5_PPM_420_Q100_IFAST 1b3730122709f53d007255e8dfd3305e)
@@ -768,6 +770,8 @@
   set(MD5_BMP_RGB_ISLOW_565D 4cfa0928ef3e6bb626d7728c924cfda4)
   set(MD5_JPEG_422_IFAST_OPT 2540287b79d913f91665e660303ab2c8)
   set(MD5_PPM_422_IFAST 35bd6b3f833bad23de82acea847129fa)
+  set(MD5_JPEG_440_ISLOW 538bc02bd4b4658fd85de6ece6cbeda6)
+  set(MD5_PPM_440_ISLOW 11e7eab7ef7ef3276934bb7e7b6bb377)
   set(MD5_PPM_422M_IFAST 8dbc65323d62cca7c91ba02dd1cfa81d)
   set(MD5_BMP_422M_IFAST_565 3294bd4d9a1f2b3d08ea6020d0db7065)
   set(MD5_BMP_422M_IFAST_565D da98c9c7b6039511be4a79a878a9abc1)
@@ -1101,6 +1105,16 @@
     testout_422_ifast.ppm testout_422_ifast_opt.jpg
     ${MD5_PPM_422_IFAST} cjpeg-${libtype}-422-ifast-opt)
 
+  # CC: RGB->YCC  SAMP: fullsize/h1v2  FDCT: islow  ENT: huff
+  add_bittest(cjpeg 440-islow "-sample;1x2;-dct;int"
+    testout_440_islow.jpg ${TESTIMAGES}/testorig.ppm
+    ${MD5_JPEG_440_ISLOW})
+
+  # CC: YCC->RGB  SAMP: fullsize/h1v2 fancy  IDCT: islow  ENT: huff
+  add_bittest(djpeg 440-islow "-dct;int"
+    testout_440_islow.ppm testout_440_islow.jpg
+    ${MD5_PPM_440_ISLOW} cjpeg-${libtype}-440-islow)
+
   # CC: YCC->RGB  SAMP: h2v1 merged  IDCT: ifast  ENT: huff
   add_bittest(djpeg 422m-ifast "-dct;fast;-nosmooth"
     testout_422m_ifast.ppm testout_422_ifast_opt.jpg
diff --git a/jdsample.c b/jdsample.c
index da8f151..2d34710 100644
--- a/jdsample.c
+++ b/jdsample.c
@@ -8,7 +8,7 @@
  * Copyright (C) 2010, 2015-2016, D. R. Commander.
  * Copyright (C) 2014, MIPS Technologies, Inc., California.
  * Copyright (C) 2015, Google, Inc.
- * Copyright (C) 2019, Arm Limited.
+ * Copyright (C) 2019-2020, Arm Limited.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -477,7 +477,12 @@
     } else if (h_in_group == h_out_group &&
                v_in_group * 2 == v_out_group && do_fancy) {
       /* Non-fancy upsampling is handled by the generic method */
-      upsample->methods[ci] = h1v2_fancy_upsample;
+#if defined(__arm__) || defined(__aarch64__)
+      if (jsimd_can_h1v2_fancy_upsample())
+        upsample->methods[ci] = jsimd_h1v2_fancy_upsample;
+      else
+#endif
+        upsample->methods[ci] = h1v2_fancy_upsample;
       upsample->pub.need_context_rows = TRUE;
     } else if (h_in_group * 2 == h_out_group &&
                v_in_group * 2 == v_out_group) {
diff --git a/jsimd.h b/jsimd.h
index 51e2b8c..6c20365 100644
--- a/jsimd.h
+++ b/jsimd.h
@@ -4,6 +4,7 @@
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2011, 2014, D. R. Commander.
  * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -75,6 +76,7 @@
 
 EXTERN(int) jsimd_can_h2v2_fancy_upsample(void);
 EXTERN(int) jsimd_can_h2v1_fancy_upsample(void);
+EXTERN(int) jsimd_can_h1v2_fancy_upsample(void);
 
 EXTERN(void) jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo,
                                        jpeg_component_info *compptr,
@@ -84,6 +86,10 @@
                                        jpeg_component_info *compptr,
                                        JSAMPARRAY input_data,
                                        JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo,
+                                       jpeg_component_info *compptr,
+                                       JSAMPARRAY input_data,
+                                       JSAMPARRAY *output_data_ptr);
 
 EXTERN(int) jsimd_can_h2v2_merged_upsample(void);
 EXTERN(int) jsimd_can_h2v1_merged_upsample(void);
diff --git a/jsimd_none.c b/jsimd_none.c
index 3cb6c80..5b38a9f 100644
--- a/jsimd_none.c
+++ b/jsimd_none.c
@@ -4,6 +4,7 @@
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2009-2011, 2014, D. R. Commander.
  * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -169,6 +170,12 @@
   return 0;
 }
 
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+  return 0;
+}
+
 GLOBAL(void)
 jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
@@ -181,6 +188,12 @@
 {
 }
 
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
 GLOBAL(int)
 jsimd_can_h2v2_merged_upsample(void)
 {
diff --git a/simd/arm/aarch32/jsimd.c b/simd/arm/aarch32/jsimd.c
index 3c9a3f6..cd90c63 100644
--- a/simd/arm/aarch32/jsimd.c
+++ b/simd/arm/aarch32/jsimd.c
@@ -454,6 +454,23 @@
   return 0;
 }
 
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
 GLOBAL(void)
 jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
@@ -472,6 +489,15 @@
                                  output_data_ptr);
 }
 
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
 GLOBAL(int)
 jsimd_can_h2v2_merged_upsample(void)
 {
diff --git a/simd/arm/aarch64/jsimd.c b/simd/arm/aarch64/jsimd.c
index 5040d50..a507c2c 100644
--- a/simd/arm/aarch64/jsimd.c
+++ b/simd/arm/aarch64/jsimd.c
@@ -522,6 +522,23 @@
   return 0;
 }
 
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
 GLOBAL(void)
 jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
@@ -540,6 +557,15 @@
                                  output_data_ptr);
 }
 
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
 GLOBAL(int)
 jsimd_can_h2v2_merged_upsample(void)
 {
diff --git a/simd/arm/jdsample-neon.c b/simd/arm/jdsample-neon.c
index 742ca58..b31d6cf 100644
--- a/simd/arm/jdsample-neon.c
+++ b/simd/arm/jdsample-neon.c
@@ -372,3 +372,96 @@
     inrow++;
   }
 }
+
+
+/* The diagram below shows a column of samples produced by h1v2 downsampling
+ * (or by losslessly rotating or transposing an h2v1-downsampled image.)
+ *
+ *            +---------+
+ *            |   p0    |
+ *     sA     |         |
+ *            |   p1    |
+ *            +---------+
+ *            |   p2    |
+ *     sB     |         |
+ *            |   p3    |
+ *            +---------+
+ *            |   p4    |
+ *     sC     |         |
+ *            |   p5    |
+ *            +---------+
+ *
+ * Samples sA-sC were created by averaging the original pixel component values
+ * centered at positions p0-p5 above.  To approximate those original pixel
+ * component values, we proportionally blend the adjacent samples in each
+ * column.
+ *
+ * An upsampled pixel component value is computed by blending the sample
+ * containing the pixel center with the nearest neighboring sample, in the
+ * ratio 3:1.  For example:
+ *     p1(upsampled) = 3/4 * sA + 1/4 * sB
+ *     p2(upsampled) = 3/4 * sB + 1/4 * sA
+ * When computing the first and last pixel component values in the column,
+ * there is no adjacent sample to blend, so:
+ *     p0(upsampled) = sA
+ *     p5(upsampled) = sC
+ */
+
+void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
+                                    JDIMENSION downsampled_width,
+                                    JSAMPARRAY input_data,
+                                    JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+  int inrow, outrow;
+  unsigned colctr;
+  /* Set up constants. */
+  const uint16x8_t one_u16 = vdupq_n_u16(1);
+  const uint8x8_t three_u8 = vdup_n_u8(3);
+
+  inrow = outrow = 0;
+  while (outrow < max_v_samp_factor) {
+    inptr0 = input_data[inrow - 1];
+    inptr1 = input_data[inrow];
+    inptr2 = input_data[inrow + 1];
+    /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
+     * respectively.
+     */
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+    inrow++;
+
+    /* The size of the input and output buffers is always a multiple of 32
+     * bytes => no need to worry about buffer overflow when reading/writing
+     * memory.  See "Creation of 2-D sample arrays" in jmemmgr.c for more
+     * details.
+     */
+    for (colctr = 0; colctr < downsampled_width; colctr += 16) {
+      /* Load samples. */
+      uint8x16_t sA = vld1q_u8(inptr0 + colctr);
+      uint8x16_t sB = vld1q_u8(inptr1 + colctr);
+      uint8x16_t sC = vld1q_u8(inptr2 + colctr);
+      /* Blend samples vertically. */
+      uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
+                                      vget_low_u8(sB), three_u8);
+      uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
+                                      vget_high_u8(sB), three_u8);
+      uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
+                                      vget_low_u8(sB), three_u8);
+      uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
+                                      vget_high_u8(sB), three_u8);
+      /* Add ordered dithering bias to pixel values in even output rows. */
+      colsum0_l = vaddq_u16(colsum0_l, one_u16);
+      colsum0_h = vaddq_u16(colsum0_h, one_u16);
+      /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+      uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
+                                              vshrn_n_u16(colsum0_h, 2));
+      uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
+                                              vrshrn_n_u16(colsum1_h, 2));
+      /* Store pixel component values to memory. */
+      vst1q_u8(outptr0 + colctr, output_pixels0);
+      vst1q_u8(outptr1 + colctr, output_pixels1);
+    }
+  }
+}
diff --git a/simd/jsimd.h b/simd/jsimd.h
index 29920a0..053ea3c 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -664,6 +664,9 @@
 EXTERN(void) jsimd_h2v2_fancy_upsample_neon
   (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
    JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h1v2_fancy_upsample_neon
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 
 EXTERN(void) jsimd_h2v1_fancy_upsample_dspr2
   (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,