/*
 * jcsample-neon.c - downsampling (Arm Neon)
 *
 * Copyright (C) 2020, Arm Limited. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"

#include <arm_neon.h>

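/* Table of byte-permute indices used to pad the final DCT block of each row.
 * The "Pad i" entry passes the first 16 - i samples through unchanged and
 * replicates the last valid sample into the remaining i lanes.  This performs
 * in registers the same right-edge replication that the scalar downsampling
 * routines achieve by calling expand_right_edge() on the input buffer.
 */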
ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 0 */
  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 1 */
  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 2 */
  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 3 */
  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 4 */
  0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 5 */
  0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 6 */
  0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 7 */
  0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 8 */
  0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, /* Pad 9 */
  0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, /* Pad 10 */
  0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, /* Pad 11 */
  0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
  0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, /* Pad 12 */
  0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
  0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* Pad 13 */
  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
  0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* Pad 14 */
  0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Pad 15 */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};

/* Downsample pixel values of a single component.
 * This version handles the common case of 2:1 horizontal and 1:1 vertical,
 * without smoothing.
 */
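/* Each output sample is the average of two horizontally adjacent input
 * samples, computed as (s0 + s1 + bias) >> 1, where the bias alternates
 * between 0 and 1 from column to column.  The alternating bias spreads the
 * rounding error evenly, matching the scalar h2v1_downsample() routine in
 * jcsample.c.
 */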
void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
                                JDIMENSION v_samp_factor,
                                JDIMENSION width_in_blocks,
                                JSAMPARRAY input_data, JSAMPARRAY output_data)
{
  JSAMPROW inptr, outptr;

  /* Load expansion mask to pad remaining elements of last DCT block. */
  const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
  const uint8x16_t expand_mask =
    vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
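  /* For example, with image_width = 6 and width_in_blocks = 1, mask_offset =
   * 16 * (16 - 6) = 160, which selects the "Pad 10" row above: the last 10
   * lanes of the final block are filled with copies of input sample 5, the
   * rightmost sample inside the image.
   */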
  /* Load bias pattern (alternating every pixel). */
  /* { 0, 1, 0, 1, 0, 1, 0, 1 } */
  const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));
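  /* On a little-endian target, vdupq_n_u32(0x00010000) viewed as eight
   * 16-bit lanes is { 0, 1, 0, 1, 0, 1, 0, 1 }, i.e. the bias alternates
   * between adjacent output columns.
   */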
  unsigned i, outrow;

  for (outrow = 0; outrow < v_samp_factor; outrow++) {
    outptr = output_data[outrow];
    inptr = input_data[outrow];

    /* Downsample all but the last DCT block of pixels. */
    for (i = 0; i < width_in_blocks - 1; i++) {
      uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE);
      /* Add adjacent pixel values, widen to 16-bit, and add bias. */
      uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
      /* Divide total by 2 and narrow to 8-bit. */
      uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
      /* Store samples to memory. */
      vst1_u8(outptr + i * DCTSIZE, samples_u8);
    }
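
    /* The final DCT block of each row may extend beyond image_width, so any
     * lanes past the right edge of the image are overwritten with copies of
     * the last valid sample before averaging.
     */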
    /* Load pixels in last DCT block into a table. */
    uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);
#if defined(__aarch64__) || defined(_M_ARM64)
    /* Pad the empty elements with the value of the last pixel. */
    pixels = vqtbl1q_u8(pixels, expand_mask);
#else
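    /* Pad the empty elements with the value of the last pixel.  AArch32 Neon
     * has no 128-bit table-lookup intrinsic, so split the vector into a
     * two-register table and look up each half of the index vector with
     * vtbl2_u8().
     */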
    uint8x8x2_t table = { { vget_low_u8(pixels), vget_high_u8(pixels) } };
    pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)),
                         vtbl2_u8(table, vget_high_u8(expand_mask)));
#endif

    /* Add adjacent pixel values, widen to 16-bit, and add bias. */
    uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
    /* Divide total by 2, narrow to 8-bit, and store. */
    uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
    vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
  }
}
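
/* Illustrative sketch (not part of the library): downsampling a single
 * 16-sample row to 8 samples.  The buffers and values below are hypothetical;
 * callers normally reach this function through the jsimd_h2v1_downsample()
 * dispatcher.
 *
 *   JSAMPLE in_row[2 * DCTSIZE] = { 10, 11, 20, 21, ... };
 *   JSAMPLE out_row[DCTSIZE];
 *   JSAMPROW in_rows[1] = { in_row };
 *   JSAMPROW out_rows[1] = { out_row };
 *   jsimd_h2v1_downsample_neon(16, 1, 1, 1, in_rows, out_rows);
 *
 * With these values, out_row[0] = (10 + 11 + 0) >> 1 = 10 and
 * out_row[1] = (20 + 21 + 1) >> 1 = 21.
 */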

/* Downsample pixel values of a single component.
 * This version handles the standard case of 2:1 horizontal and 2:1 vertical,
 * without smoothing.
 */
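/* Each output sample is the average of a 2x2 block of input samples,
 * computed as (r0_0 + r0_1 + r1_0 + r1_1 + bias) >> 2, where the bias
 * alternates between 1 and 2 from column to column, matching the rounding
 * behavior of the scalar h2v2_downsample() routine in jcsample.c.
 */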
void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
                                JDIMENSION v_samp_factor,
                                JDIMENSION width_in_blocks,
                                JSAMPARRAY input_data, JSAMPARRAY output_data)
{
  JSAMPROW inptr0, inptr1, outptr;

  /* Load expansion mask to pad remaining elements of last DCT block. */
  const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
  const uint8x16_t expand_mask =
    vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
  /* Load bias pattern (alternating every pixel). */
  /* { 1, 2, 1, 2, 1, 2, 1, 2 } */
  const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));
  unsigned i, outrow;

  for (outrow = 0; outrow < v_samp_factor; outrow++) {
    outptr = output_data[outrow];
    inptr0 = input_data[outrow];
    inptr1 = input_data[outrow + 1];

    /* Downsample all but the last DCT block of pixels. */
    for (i = 0; i < width_in_blocks - 1; i++) {
      uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE);
      uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE);
      /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
      uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
      /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate.
       */
      samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
      /* Divide total by 4 and narrow to 8-bit. */
      uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
      /* Store samples to memory. */
      vst1_u8(outptr + i * DCTSIZE, samples_u8);
    }
    /* Load pixels in last DCT block into a table. */
    uint8x16_t pixels_r0 =
      vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
    uint8x16_t pixels_r1 =
      vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
#if defined(__aarch64__) || defined(_M_ARM64)
    /* Pad the empty elements with the value of the last pixel. */
    pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
    pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
#else
    uint8x8x2_t table_r0 =
      { { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) } };
    uint8x8x2_t table_r1 =
      { { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) } };
    pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),
                            vtbl2_u8(table_r0, vget_high_u8(expand_mask)));
    pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),
                            vtbl2_u8(table_r1, vget_high_u8(expand_mask)));
#endif

    /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
    uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
    /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate. */
    samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
    /* Divide total by 4, narrow to 8-bit, and store. */
    uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
    vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
  }
}