Neon: Intrinsics impl. of h1v2 fancy upsampling
There was no previous GAS implementation.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6c11f03..8dac532 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -720,6 +720,8 @@
set(MD5_PPM_RGB_ISLOW f3301d2219783b8b3d942b7239fa50c0)
set(MD5_JPEG_422_IFAST_OPT 7322e3bd2f127f7de4b40d4480ce60e4)
set(MD5_PPM_422_IFAST 79807fa552899e66a04708f533e16950)
+ set(MD5_JPEG_440_ISLOW e25c1912e38367be505a89c410c1c2d2)
+ set(MD5_PPM_440_ISLOW e7d2e26288870cfcb30f3114ad01e380)
set(MD5_PPM_422M_IFAST 07737bfe8a7c1c87aaa393a0098d16b0)
set(MD5_JPEG_420_IFAST_Q100_PROG 008ab68d6ddbba04a8f01deee4e0f9f8)
set(MD5_PPM_420_Q100_IFAST 1b3730122709f53d007255e8dfd3305e)
@@ -768,6 +770,8 @@
set(MD5_BMP_RGB_ISLOW_565D 4cfa0928ef3e6bb626d7728c924cfda4)
set(MD5_JPEG_422_IFAST_OPT 2540287b79d913f91665e660303ab2c8)
set(MD5_PPM_422_IFAST 35bd6b3f833bad23de82acea847129fa)
+ set(MD5_JPEG_440_ISLOW 538bc02bd4b4658fd85de6ece6cbeda6)
+ set(MD5_PPM_440_ISLOW 11e7eab7ef7ef3276934bb7e7b6bb377)
set(MD5_PPM_422M_IFAST 8dbc65323d62cca7c91ba02dd1cfa81d)
set(MD5_BMP_422M_IFAST_565 3294bd4d9a1f2b3d08ea6020d0db7065)
set(MD5_BMP_422M_IFAST_565D da98c9c7b6039511be4a79a878a9abc1)
@@ -1101,6 +1105,16 @@
testout_422_ifast.ppm testout_422_ifast_opt.jpg
${MD5_PPM_422_IFAST} cjpeg-${libtype}-422-ifast-opt)
+ # CC: RGB->YCC SAMP: fullsize/h1v2 FDCT: islow ENT: huff
+ add_bittest(cjpeg 440-islow "-sample;1x2;-dct;int"
+ testout_440_islow.jpg ${TESTIMAGES}/testorig.ppm
+ ${MD5_JPEG_440_ISLOW})
+
+ # CC: YCC->RGB SAMP: fullsize/h1v2 fancy IDCT: islow ENT: huff
+ add_bittest(djpeg 440-islow "-dct;int"
+ testout_440_islow.ppm testout_440_islow.jpg
+ ${MD5_PPM_440_ISLOW} cjpeg-${libtype}-440-islow)
+
# CC: YCC->RGB SAMP: h2v1 merged IDCT: ifast ENT: huff
add_bittest(djpeg 422m-ifast "-dct;fast;-nosmooth"
testout_422m_ifast.ppm testout_422_ifast_opt.jpg
diff --git a/jdsample.c b/jdsample.c
index da8f151..2d34710 100644
--- a/jdsample.c
+++ b/jdsample.c
@@ -8,7 +8,7 @@
* Copyright (C) 2010, 2015-2016, D. R. Commander.
* Copyright (C) 2014, MIPS Technologies, Inc., California.
* Copyright (C) 2015, Google, Inc.
- * Copyright (C) 2019, Arm Limited.
+ * Copyright (C) 2019-2020, Arm Limited.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
@@ -477,7 +477,12 @@
} else if (h_in_group == h_out_group &&
v_in_group * 2 == v_out_group && do_fancy) {
/* Non-fancy upsampling is handled by the generic method */
- upsample->methods[ci] = h1v2_fancy_upsample;
+#if defined(__arm__) || defined(__aarch64__)
+ if (jsimd_can_h1v2_fancy_upsample())
+ upsample->methods[ci] = jsimd_h1v2_fancy_upsample;
+ else
+#endif
+ upsample->methods[ci] = h1v2_fancy_upsample;
upsample->pub.need_context_rows = TRUE;
} else if (h_in_group * 2 == h_out_group &&
v_in_group * 2 == v_out_group) {
diff --git a/jsimd.h b/jsimd.h
index 51e2b8c..6c20365 100644
--- a/jsimd.h
+++ b/jsimd.h
@@ -4,6 +4,7 @@
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2011, 2014, D. R. Commander.
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -75,6 +76,7 @@
EXTERN(int) jsimd_can_h2v2_fancy_upsample(void);
EXTERN(int) jsimd_can_h2v1_fancy_upsample(void);
+EXTERN(int) jsimd_can_h1v2_fancy_upsample(void);
EXTERN(void) jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo,
jpeg_component_info *compptr,
@@ -84,6 +86,10 @@
jpeg_component_info *compptr,
JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo,
+ jpeg_component_info *compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
EXTERN(int) jsimd_can_h2v2_merged_upsample(void);
EXTERN(int) jsimd_can_h2v1_merged_upsample(void);
diff --git a/jsimd_none.c b/jsimd_none.c
index 3cb6c80..5b38a9f 100644
--- a/jsimd_none.c
+++ b/jsimd_none.c
@@ -4,6 +4,7 @@
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2009-2011, 2014, D. R. Commander.
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -169,6 +170,12 @@
return 0;
}
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+ return 0;
+}
+
GLOBAL(void)
jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
@@ -181,6 +188,12 @@
{
}
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
GLOBAL(int)
jsimd_can_h2v2_merged_upsample(void)
{
diff --git a/simd/arm/aarch32/jsimd.c b/simd/arm/aarch32/jsimd.c
index 3c9a3f6..cd90c63 100644
--- a/simd/arm/aarch32/jsimd.c
+++ b/simd/arm/aarch32/jsimd.c
@@ -454,6 +454,23 @@
return 0;
}
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* Report no SIMD support unless samples are 8-bit and JDIMENSION is 4 bytes */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
GLOBAL(void)
jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
@@ -472,6 +489,15 @@
output_data_ptr);
}
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
GLOBAL(int)
jsimd_can_h2v2_merged_upsample(void)
{
diff --git a/simd/arm/aarch64/jsimd.c b/simd/arm/aarch64/jsimd.c
index 5040d50..a507c2c 100644
--- a/simd/arm/aarch64/jsimd.c
+++ b/simd/arm/aarch64/jsimd.c
@@ -522,6 +522,23 @@
return 0;
}
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* Report no SIMD support unless samples are 8-bit and JDIMENSION is 4 bytes */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
GLOBAL(void)
jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
@@ -540,6 +557,15 @@
output_data_ptr);
}
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
GLOBAL(int)
jsimd_can_h2v2_merged_upsample(void)
{
diff --git a/simd/arm/jdsample-neon.c b/simd/arm/jdsample-neon.c
index 742ca58..b31d6cf 100644
--- a/simd/arm/jdsample-neon.c
+++ b/simd/arm/jdsample-neon.c
@@ -372,3 +372,96 @@
inrow++;
}
}
+
+
+/* The diagram below shows a column of samples produced by h1v2 downsampling
+ * (or by losslessly rotating or transposing an h2v1-downsampled image.)
+ *
+ * +---------+
+ * | p0 |
+ * sA | |
+ * | p1 |
+ * +---------+
+ * | p2 |
+ * sB | |
+ * | p3 |
+ * +---------+
+ * | p4 |
+ * sC | |
+ * | p5 |
+ * +---------+
+ *
+ * Samples sA-sC were created by averaging the original pixel component values
+ * centered at positions p0-p5 above. To approximate those original pixel
+ * component values, we proportionally blend the adjacent samples in each
+ * column.
+ *
+ * An upsampled pixel component value is computed by blending the sample
+ * containing the pixel center with the nearest neighboring sample, in the
+ * ratio 3:1. For example:
+ * p1(upsampled) = 3/4 * sA + 1/4 * sB
+ * p2(upsampled) = 3/4 * sB + 1/4 * sA
+ * When computing the first and last pixel component values in the column,
+ * there is no adjacent sample to blend, so:
+ * p0(upsampled) = sA
+ * p5(upsampled) = sC
+ */
+
+void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+ int inrow, outrow;
+ unsigned colctr;
+ /* Constants: the +1 bias for the upper output row and the x3 blend weight. */
+ const uint16x8_t one_u16 = vdupq_n_u16(1);
+ const uint8x8_t three_u8 = vdup_n_u8(3);
+
+ inrow = outrow = 0;
+ while (outrow < max_v_samp_factor) {
+ inptr0 = input_data[inrow - 1];
+ inptr1 = input_data[inrow];
+ inptr2 = input_data[inrow + 1];
+ /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
+ * respectively.
+ */
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+ inrow++;
+
+ /* The size of the input and output buffers is always a multiple of 32
+ * bytes => no need to worry about buffer overflow when reading/writing
+ * memory. See "Creation of 2-D sample arrays" in jmemmgr.c for more
+ * details.
+ */
+ for (colctr = 0; colctr < downsampled_width; colctr += 16) {
+ /* Load 16 samples from each of the three adjacent input rows. */
+ uint8x16_t sA = vld1q_u8(inptr0 + colctr);
+ uint8x16_t sB = vld1q_u8(inptr1 + colctr);
+ uint8x16_t sC = vld1q_u8(inptr2 + colctr);
+ /* Blend vertically: sA + 3 * sB (upper row), sC + 3 * sB (lower row). */
+ uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
+ vget_low_u8(sB), three_u8);
+ uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
+ vget_high_u8(sB), three_u8);
+ uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
+ vget_low_u8(sB), three_u8);
+ uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
+ vget_high_u8(sB), three_u8);
+ /* Add ordered dithering bias (1) to pixel values in even output rows. */
+ colsum0_l = vaddq_u16(colsum0_l, one_u16);
+ colsum0_h = vaddq_u16(colsum0_h, one_u16);
+ /* Divide by 4 and narrow to 8-bit; vrshrn rounds (adds 2 before shifting). */
+ uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
+ vshrn_n_u16(colsum0_h, 2));
+ uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
+ vrshrn_n_u16(colsum1_h, 2));
+ /* Store pixel component values to memory. */
+ vst1q_u8(outptr0 + colctr, output_pixels0);
+ vst1q_u8(outptr1 + colctr, output_pixels1);
+ }
+ }
+}
diff --git a/simd/jsimd.h b/simd/jsimd.h
index 29920a0..053ea3c 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -664,6 +664,9 @@
EXTERN(void) jsimd_h2v2_fancy_upsample_neon
(int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h1v2_fancy_upsample_neon
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v1_fancy_upsample_dspr2
(int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,