/* src/video/arm/pixman-arm-neon-asm.S - external/github.com/libsdl-org/SDL - Git at Google */

 /*
  * Copyright © 2009 Nokia Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the next
  * paragraph) shall be included in all copies or substantial portions of the
  * Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  *
  * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
  */

 /*
  * Copyright (c) 2018 RISC OS Open Ltd
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
  * arising from the use of this software.
  *
  * Permission is granted to anyone to use this software for any purpose,
  * including commercial applications, and to alter it and redistribute it
  * freely, subject to the following restrictions:
  *
  * 1. The origin of this software must not be misrepresented; you must not
  *    claim that you wrote the original software. If you use this software
  *    in a product, an acknowledgment in the product documentation would be
  *    appreciated but is not required.
  * 2. Altered source versions must be plainly marked as such, and must not be
  *    misrepresented as being the original software.
  * 3. This notice may not be removed or altered from any source distribution.
  */

 /* Prevent the stack from becoming executable for no reason... */
 #if defined(__linux__) && defined(__ELF__)
 .section .note.GNU-stack,"",%progbits
 #endif

     /*
      * Assembler set-up for the rest of the file: emit into the code
      * section, accept NEON and ARMv7-A instructions, and use ARM (not
      * Thumb) encoding. .object_arch overrides the architecture recorded in
      * the object file, and the two .eabi_attribute lines suppress the
      * FP/SIMD build attributes (presumably so that the object can be linked
      * into builds that only select this code at run time - confirm).
      * .altmacro switches on the alternate macro syntax and .p2align 2
      * aligns what follows to a 4-byte boundary.
      */
     .text
     .fpu neon
     .arch armv7a
     .object_arch armv4
     .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
     .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
     .arm
     .altmacro
     .p2align 2

 #include "pixman-arm-asm.h"
 #include "pixman-arm-neon-asm.h"

 /* Global configuration options and preferences */

 /*
  * The code can optionally make use of unaligned memory accesses to improve
  * performance of handling leading/trailing pixels for each scanline.
  * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0, for
  * example on Linux, if unaligned memory accesses are not configured to
  * generate exceptions.
  */
 .set RESPECT_STRICT_ALIGNMENT, 1

 /*
  * Set default prefetch type. There is a choice between the following options:
  *
  * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
  * as NOP to work around some HW bugs or for whatever other reason)
  *
  * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
  * advanced prefetch introduces heavy overhead)
  *
  * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
  * which can run ARM and NEON instructions simultaneously so that extra ARM
  * instructions do not add (many) extra cycles, but improve prefetch efficiency)
  *
  * Note: some types of function can't support advanced prefetch and fall back
  *       to the simple one (those which handle 24bpp pixels)
  */
 .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED

 /* Prefetch distance in pixels for simple prefetch */
 .set PREFETCH_DISTANCE_SIMPLE, 64

 /******************************************************************************/

 /* We can actually do significantly better than the Pixman macros, at least for
  * the case of fills, by using a carefully scheduled inner loop. Cortex-A53
  * shows an improvement of up to 78% in ideal cases (large fills to L1 cache).
  */

 .macro generate_fillrect_function name, bpp, log2Bpp
 /*
  * Emits a solid-fill routine for one pixel size: bpp is the pixel size in
  * bits (8, 16 or 32) and log2Bpp is log2 of the pixel size in bytes.
  *
  * void name(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
  * On entry:
  * a1 = width, pixels
  * a2 = height, rows
  * a3 = pointer to top-left destination pixel
  * a4 = stride, pixels
  * [sp] = pixel value to fill with (one bpp-bit element is read from here)
  * Within the function:
  * a4 = stride minus width, pixels (gap from the end of a row to the next row)
  * v1 = width remaining
  * v2 = vst offset
  * v3 = alternate pointer
  * ip = data ARM register
  * q0, q1 = pixel value replicated into every lane
  *
  * The row loops are bottom-tested, so one row is processed before the row
  * count in a2 is examined.
  */
 pixman_asm_function name
     vld1.\bpp   {d0[],d1[]}, [sp]
     sub         a4, a1       /* a4 = stride - width: pixels to skip after each row */
     vld1.\bpp   {d2[],d3[]}, [sp]
     cmp         a1, #(15+64) >> \log2Bpp
     push        {v1-v3,lr}
     vmov        ip, s0
     blo         51f          /* flags still come from the cmp above */

     /* Long-row case: even after up to 15 bytes of leading alignment at
      * least 64 bytes remain, so the inner loop can run at least once */
     mov         v2, #64
 1:  mov         v1, a1
     ands        v3, a3, #15
     beq         2f
     /* Leading pixels */
     rsb         v3, v3, #16  /* number of leading bytes until 16-byte aligned */
     sub         v1, v1, v3, lsr #\log2Bpp
     rbit        v3, v3       /* byte count bits 0-3 are now bits 31-28 */
 .if \bpp <= 16
 .if \bpp == 8
     tst         a3, #1       /* bit 0 unaffected by rsb so can avoid register interlock */
     strneb      ip, [a3], #1
     tst         v3, #1<<30
 .else
     tst         a3, #2       /* bit 1 unaffected by rsb (assuming halfword alignment) so can avoid register interlock */
 .endif
     strneh      ip, [a3], #2
 .endif
     movs        v3, v3, lsl #3 /* C = 4-byte bit, N = 8-byte bit of the count */
     vstmcs      a3!, {s0}
     vstmmi      a3!, {d0}
 2:  sub         v1, v1, #64 >> \log2Bpp /* simplifies inner loop termination */
     add         v3, a3, #32
     /* Inner loop: 64 bytes per iteration as two interleaved 32-byte streams */
 3:  vst1.\bpp   {q0-q1}, [a3 :128], v2
     subs        v1, v1, #64 >> \log2Bpp
     vst1.\bpp   {q0-q1}, [v3 :128], v2
     bhs         3b
     /* Trailing pixels: the low bits of v1 still hold the leftover count;
      * shift them out in pairs so that C and N select each store size */
 4:  movs        v1, v1, lsl #27 + \log2Bpp
     bcc         5f
     vst1.\bpp   {q0-q1}, [a3 :128]!
 5:  bpl         6f
     vst1.\bpp   {q0}, [a3 :128]!
 6:  movs        v1, v1, lsl #2
     vstmcs      a3!, {d0}
     vstmmi      a3!, {s0}
 .if \bpp <= 16
     movs        v1, v1, lsl #2
     strcsh      ip, [a3], #2
 .if \bpp == 8
     strmib      ip, [a3], #1
 .endif
 .endif
     subs        a2, a2, #1
     add         a3, a3, a4, lsl #\log2Bpp
     bhi         1b
     pop         {v1-v3,pc}

     /* Short-row case: only word alignment is established before the
      * stores, which therefore carry no alignment qualifier */
 51: movs        v1, a1
 .if \bpp == 8
     tst         a3, #3
     beq         53f
 52: subs        v1, v1, #1
     blo         57f          /* width used up before reaching word alignment: row done */
     strb        ip, [a3], #1
     tst         a3, #3
     bne         52b
 .elseif \bpp == 16
     tstne       a3, #2       /* skipped when the width is zero (Z from movs) */
     subne       v1, v1, #1
     strneh      ip, [a3], #2
 .endif
 53: cmp         v1, #32 >> \log2Bpp
     bcc         54f
     vst1.\bpp   {q0-q1}, [a3]!
     sub         v1, v1, #32 >> \log2Bpp
     /* Trailing pixels */
 54: movs        v1, v1, lsl #27 + \log2Bpp
     bcc         55f
     vst1.\bpp   {q0-q1}, [a3]!
 55: bpl         56f
     vst1.\bpp   {q0}, [a3]!
 56: movs        v1, v1, lsl #2
     vstmcs      a3!, {d0}
     vstmmi      a3!, {s0}
 .if \bpp <= 16
     movs        v1, v1, lsl #2
     strcsh      ip, [a3], #2
 .if \bpp == 8
     strmib      ip, [a3], #1
 .endif
 .endif
     /* End of row. The early exit from the byte alignment loop also lands
      * here, so that the rows below a very narrow rectangle still get filled
      * instead of the function returning after the first row */
 57: subs        a2, a2, #1
     add         a3, a3, a4, lsl #\log2Bpp
     bhi         51b
     pop         {v1-v3,pc}

 .endfunc
 .endm

 /*
  * Instantiate the fill routine for 32-, 16- and 8-bit pixels. The last
  * argument is log2 of the pixel size in bytes.
  */
 generate_fillrect_function FillRect32ARMNEONAsm, 32, 2
 generate_fillrect_function FillRect16ARMNEONAsm, 16, 1
 generate_fillrect_function FillRect8ARMNEONAsm,  8,  0

 /******************************************************************************/

 /*
  * Blend one block of eight pixels using the source alpha.
  * In:  d0-d2 = source colour planes, d3 = source alpha plane,
  *      d4-d6 = destination colour planes, d7 = destination alpha plane
  * Out: d28-d30 = blended colour planes, d31 = destination alpha (copied)
  * Each colour plane is computed as
  *      (src * alpha + dst * (255 - alpha)) / 255
  * where the division is done as (x + rounded(x >> 8)) rounded-narrowed to
  * the high byte. q0-q3 are overwritten as scratch.
  */
 .macro RGBtoRGBPixelAlpha_process_pixblock_head
     vmvn        d30, d3  /* get inverted source alpha */
     vmov        d31, d7  /* dest alpha is always unchanged */
     vmull.u8    q14, d0, d3   /* plane 0: src * alpha ... */
     vmlal.u8    q14, d4, d30  /* ... + dst * inverted alpha */
     vmull.u8    q0, d1, d3    /* plane 1, same formula */
     vmlal.u8    q0, d5, d30
     vmull.u8    q1, d2, d3    /* plane 2, same formula */
     vmlal.u8    q1, d6, d30
     vrshr.u16   q2, q14, #8   /* rounded x >> 8, the correction term for /255 */
     vrshr.u16   q3, q0, #8
     vraddhn.u16 d28, q14, q2  /* high byte of x + correction, rounded */
     vrshr.u16   q2, q1, #8
     vraddhn.u16 d29, q0, q3
     vraddhn.u16 d30, q1, q2   /* d30 stops being inverted alpha here */
 .endm

 /*
  * Deliberately empty: RGBtoRGBPixelAlpha_process_pixblock_head already
  * leaves the finished pixels in d28-d31, so no tail stage is needed.
  */
 .macro RGBtoRGBPixelAlpha_process_pixblock_tail
     /* nothing */
 .endm

 /*
  * Steady-state loop body: finishes the previous block and starts the next.
  * It stores the previous results (d28-d31) through DST_W, loads eight new
  * source pixels into d0-d3 and eight destination pixels into d4-d7, and
  * then repeats the arithmetic of RGBtoRGBPixelAlpha_process_pixblock_head.
  * The PF-prefixed lines are prefetch bookkeeping slotted between the NEON
  * instructions; PF, PF_X, PF_CTL, ORIG_W and the other upper-case names
  * are defined outside this file (presumably in pixman-arm-neon-asm.h -
  * confirm). The interleaving is presumably deliberate scheduling, so keep
  * the instruction order when editing.
  */
 .macro RGBtoRGBPixelAlpha_process_pixblock_tail_head
     vld4.8      {d0-d3}, [SRC]!
                                     PF add PF_X, PF_X, #8
         vst4.8      {d28-d31}, [DST_W :128]!
                                     PF tst PF_CTL, #0xF
     vld4.8      {d4-d7}, [DST_R :128]!
                                     PF addne PF_X, PF_X, #8
     vmvn        d30, d3  /* get inverted source alpha */
     vmov        d31, d7  /* dest alpha is always unchanged */
     vmull.u8    q14, d0, d3
                                     PF subne PF_CTL, PF_CTL, #1
     vmlal.u8    q14, d4, d30
                                     PF cmp PF_X, ORIG_W
     vmull.u8    q0, d1, d3
                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
     vmlal.u8    q0, d5, d30
                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
     vmull.u8    q1, d2, d3
                                     PF subge PF_X, PF_X, ORIG_W
     vmlal.u8    q1, d6, d30
                                     PF subges PF_CTL, PF_CTL, #0x10
     vrshr.u16   q2, q14, #8
                                     PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
     vrshr.u16   q3, q0, #8
                                     PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
     vraddhn.u16 d28, q14, q2
     vrshr.u16   q2, q1, #8
     vraddhn.u16 d29, q0, q3
     vraddhn.u16 d30, q1, q2
 .endm

 /*
  * Build BlitRGBtoRGBPixelAlphaARMNEONAsm from the three pixel-block macros
  * named at the end of the argument list. generate_composite_function is
  * defined outside this file; the arguments 32, 0, 32 are presumably the
  * source, mask and destination bits per pixel - confirm against
  * pixman-arm-neon-asm.h.
  */
 generate_composite_function \
     BlitRGBtoRGBPixelAlphaARMNEONAsm, 32, 0, 32, \
     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
     8, /* number of pixels, processed in a single block */ \
     5, /* prefetch distance */ \
     default_init, \
     default_cleanup, \
     RGBtoRGBPixelAlpha_process_pixblock_head, \
     RGBtoRGBPixelAlpha_process_pixblock_tail, \
     RGBtoRGBPixelAlpha_process_pixblock_tail_head

  /******************************************************************************/

 /*
  * First half of blending eight 32-bit source pixels onto eight 16-bit
  * destination pixels.
  * In:  d0-d2 = source colour planes, d3 = source alpha plane,
  *      q2 (d4-d5) = eight packed 16-bit destination pixels
  * Out: q13 = middle field (destination bits 5-10), q14 = low field
  *      (bits 0-4), q15 = high field (bits 11-15), each holding
  *      src * alpha + dst * inverted alpha with alpha reduced to 5 bits.
  *      The sums still have to be scaled back down by the tail macro.
  * For an RGB565 destination the three fields would be green, blue and
  * red respectively - presumably; confirm the channel order with the caller.
  * d0-d3, d6, d7 and q2 are modified.
  */
 .macro ARGBto565PixelAlpha_process_pixblock_head
     vmvn        d6, d3         /* inverted source alpha, still 8 bits */
     vshr.u8     d1, #2         /* source plane 1 reduced to 6 bits */
     vshr.u8     d3, #3         /* source alpha reduced to 5 bits */
     vshr.u8     d0, #3         /* source plane 0 reduced to 5 bits */
     vshrn.u16   d7, q2, #3     /* dst bits 3-10; becomes the 6-bit middle field below */
     vshrn.u16   d25, q2, #8    /* dst bits 8-15; becomes the 5-bit high field below */
     vbic.i16    q2, #0xe0      /* clear bits 5-7 so the low byte is just bits 0-4 */
     vshr.u8     d6, #3         /* inverted alpha reduced to 5 bits */
     vshr.u8     d7, #2         /* d7 = dst bits 5-10 */
     vshr.u8     d2, #3         /* source plane 2 reduced to 5 bits */
     vmovn.u16   d24, q2        /* d24 = dst bits 0-4 */
     vshr.u8     d25, #3        /* d25 = dst bits 11-15 */
     vmull.u8    q13, d1, d3
     vmlal.u8    q13, d7, d6
     vmull.u8    q14, d0, d3
     vmlal.u8    q14, d24, d6
     vmull.u8    q15, d2, d3
     vmlal.u8    q15, d25, d6
 .endm

 /*
  * Second half of the 16-bit blend: scale the three sums in q13-q15 back
  * down and pack them into eight 16-bit pixels in q14.
  * Each sum x becomes rounded((x + (x >> 5)) >> 5), which approximates
  * division by 31 (the largest 5-bit alpha). q13 is then inserted into q14
  * from bit 5 upwards and q15 from bit 11 upwards.
  */
 .macro ARGBto565PixelAlpha_process_pixblock_tail
     vsra.u16    q13, #5        /* x += x >> 5 */
     vsra.u16    q14, #5
     vsra.u16    q15, #5
     vrshr.u16   q13, #5        /* x = rounded x >> 5 */
     vrshr.u16   q14, #5
     vrshr.u16   q15, #5
     vsli.u16    q14, q13, #5   /* middle field into bits 5 and up */
     vsli.u16    q14, q15, #11  /* high field into bits 11 and up */
 .endm

 /*
  * Steady-state loop body: the instructions of
  * ARGBto565PixelAlpha_process_pixblock_tail for the previous block (the
  * more deeply indented NEON lines) are interleaved with the loads for the
  * next block, the packed result in q14 is stored through DST_W, and then
  * the instructions of ARGBto565PixelAlpha_process_pixblock_head run on the
  * newly loaded data (source in d0-d3, destination in d4-d5).
  * The PF-prefixed lines are prefetch bookkeeping; PF, PF_X, PF_CTL, ORIG_W
  * and the other upper-case names are defined outside this file (presumably
  * in pixman-arm-neon-asm.h - confirm). The interleaving is presumably
  * deliberate scheduling, so keep the instruction order when editing.
  */
 .macro ARGBto565PixelAlpha_process_pixblock_tail_head
     vld4.8      {d0-d3}, [SRC]!
                                     PF add PF_X, PF_X, #8
         vsra.u16    q13, #5
                                     PF tst PF_CTL, #0xF
         vsra.u16    q14, #5
                                     PF addne PF_X, PF_X, #8
         vsra.u16    q15, #5
                                     PF subne PF_CTL, PF_CTL, #1
         vrshr.u16   q13, #5
                                     PF cmp PF_X, ORIG_W
         vrshr.u16   q14, #5
                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
         vrshr.u16   q15, #5
                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
     vld1.8      {d4-d5}, [DST_R]!
                                     PF subge PF_X, PF_X, ORIG_W
         vsli.u16    q14, q13, #5
                                     PF subges PF_CTL, PF_CTL, #0x10
         vsli.u16    q14, q15, #11
                                     PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
         vst1.8      {q14}, [DST_W :128]!
     vmvn        d6, d3
     vshr.u8     d1, #2
     vshr.u8     d3, #3
     vshr.u8     d0, #3
     vshrn.u16   d7, q2, #3
     vshrn.u16   d25, q2, #8
     vbic.i16    q2, #0xe0
                                     PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
     vshr.u8     d6, #3
     vshr.u8     d7, #2
     vshr.u8     d2, #3
     vmovn.u16   d24, q2
     vshr.u8     d25, #3
     vmull.u8    q13, d1, d3
     vmlal.u8    q13, d7, d6
     vmull.u8    q14, d0, d3
     vmlal.u8    q14, d24, d6
     vmull.u8    q15, d2, d3
     vmlal.u8    q15, d25, d6
 .endm

 /*
  * Build BlitARGBto565PixelAlphaARMNEONAsm from the three pixel-block
  * macros named at the end of the argument list.
  * generate_composite_function is defined outside this file; the arguments
  * 32, 0, 16 are presumably the source, mask and destination bits per
  * pixel - confirm against pixman-arm-neon-asm.h.
  */
 generate_composite_function \
     BlitARGBto565PixelAlphaARMNEONAsm, 32, 0, 16, \
     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
     8, /* number of pixels, processed in a single block */ \
     6, /* prefetch distance */ \
     default_init, \
     default_cleanup, \
     ARGBto565PixelAlpha_process_pixblock_head, \
     ARGBto565PixelAlpha_process_pixblock_tail, \
     ARGBto565PixelAlpha_process_pixblock_tail_head
	/*
	* Copyright © 2009 Nokia Corporation
	*
	* Permission is hereby granted, free of charge, to any person obtaining a
	* copy of this software and associated documentation files (the "Software"),
	* to deal in the Software without restriction, including without limitation
	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
	* and/or sell copies of the Software, and to permit persons to whom the
	* Software is furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice (including the next
	* paragraph) shall be included in all copies or substantial portions of the
	* Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
	* DEALINGS IN THE SOFTWARE.
	*
	* Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
	*/

	/*
	* Copyright (c) 2018 RISC OS Open Ltd
	*
	* This software is provided 'as-is', without any express or implied
	* warranty. In no event will the authors be held liable for any damages
	* arising from the use of this software.
	*
	* Permission is granted to anyone to use this software for any purpose,
	* including commercial applications, and to alter it and redistribute it
	* freely, subject to the following restrictions:
	*
	* 1. The origin of this software must not be misrepresented; you must not
	* claim that you wrote the original software. If you use this software
	* in a product, an acknowledgment in the product documentation would be
	* appreciated but is not required.
	* 2. Altered source versions must be plainly marked as such, and must not be
	* misrepresented as being the original software.
	* 3. This notice may not be removed or altered from any source distribution.
	*/

	/* Prevent the stack from becoming executable for no reason... */
	#if defined(__linux__) && defined(__ELF__)
	.section .note.GNU-stack,"",%progbits
	#endif

	/*
	* Assembler set-up for the rest of the file: emit into the code
	* section, accept NEON and ARMv7-A instructions, and use ARM (not
	* Thumb) encoding. .object_arch overrides the architecture recorded in
	* the object file, and the two .eabi_attribute lines suppress the
	* FP/SIMD build attributes (presumably so that the object can be linked
	* into builds that only select this code at run time - confirm).
	* .altmacro switches on the alternate macro syntax and .p2align 2
	* aligns what follows to a 4-byte boundary.
	*/
	.text
	.fpu neon
	.arch armv7a
	.object_arch armv4
	.eabi_attribute 10, 0 /* suppress Tag_FP_arch */
	.eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
	.arm
	.altmacro
	.p2align 2

	#include "pixman-arm-asm.h"
	#include "pixman-arm-neon-asm.h"

	/* Global configuration options and preferences */

	/*
	* The code can optionally make use of unaligned memory accesses to improve
	* performance of handling leading/trailing pixels for each scanline.
	* Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0, for
	* example on Linux, if unaligned memory accesses are not configured to
	* generate exceptions.
	*/
	.set RESPECT_STRICT_ALIGNMENT, 1

	/*
	* Set default prefetch type. There is a choice between the following options:
	*
	* PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
	* as NOP to work around some HW bugs or for whatever other reason)
	*
	* PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
	* advanced prefetch introduces heavy overhead)
	*
	* PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
	* which can run ARM and NEON instructions simultaneously so that extra ARM
	* instructions do not add (many) extra cycles, but improve prefetch efficiency)
	*
	* Note: some types of function can't support advanced prefetch and fall back
	* to the simple one (those which handle 24bpp pixels)
	*/
	.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED

	/* Prefetch distance in pixels for simple prefetch */
	.set PREFETCH_DISTANCE_SIMPLE, 64

	/******************************************************************************/

	/* We can actually do significantly better than the Pixman macros, at least for
	* the case of fills, by using a carefully scheduled inner loop. Cortex-A53
	* shows an improvement of up to 78% in ideal cases (large fills to L1 cache).
	*/

	.macro generate_fillrect_function name, bpp, log2Bpp
	/*
	* Emits a solid-fill routine for one pixel size: bpp is the pixel size in
	* bits (8, 16 or 32) and log2Bpp is log2 of the pixel size in bytes.
	*
	* void name(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
	* On entry:
	* a1 = width, pixels
	* a2 = height, rows
	* a3 = pointer to top-left destination pixel
	* a4 = stride, pixels
	* [sp] = pixel value to fill with (one bpp-bit element is read from here)
	* Within the function:
	* a4 = stride minus width, pixels (gap from the end of a row to the next row)
	* v1 = width remaining
	* v2 = vst offset
	* v3 = alternate pointer
	* ip = data ARM register
	* q0, q1 = pixel value replicated into every lane
	*
	* The row loops are bottom-tested, so one row is processed before the row
	* count in a2 is examined.
	*/
	pixman_asm_function name
	vld1.\bpp {d0[],d1[]}, [sp]
	sub a4, a1 /* a4 = stride - width: pixels to skip after each row */
	vld1.\bpp {d2[],d3[]}, [sp]
	cmp a1, #(15+64) >> \log2Bpp
	push {v1-v3,lr}
	vmov ip, s0
	blo 51f /* flags still come from the cmp above */

	/* Long-row case: even after up to 15 bytes of leading alignment at
	* least 64 bytes remain, so the inner loop can run at least once */
	mov v2, #64
	1: mov v1, a1
	ands v3, a3, #15
	beq 2f
	/* Leading pixels */
	rsb v3, v3, #16 /* number of leading bytes until 16-byte aligned */
	sub v1, v1, v3, lsr #\log2Bpp
	rbit v3, v3 /* byte count bits 0-3 are now bits 31-28 */
	.if \bpp <= 16
	.if \bpp == 8
	tst a3, #1 /* bit 0 unaffected by rsb so can avoid register interlock */
	strneb ip, [a3], #1
	tst v3, #1<<30
	.else
	tst a3, #2 /* bit 1 unaffected by rsb (assuming halfword alignment) so can avoid register interlock */
	.endif
	strneh ip, [a3], #2
	.endif
	movs v3, v3, lsl #3 /* C = 4-byte bit, N = 8-byte bit of the count */
	vstmcs a3!, {s0}
	vstmmi a3!, {d0}
	2: sub v1, v1, #64 >> \log2Bpp /* simplifies inner loop termination */
	add v3, a3, #32
	/* Inner loop: 64 bytes per iteration as two interleaved 32-byte streams */
	3: vst1.\bpp {q0-q1}, [a3 :128], v2
	subs v1, v1, #64 >> \log2Bpp
	vst1.\bpp {q0-q1}, [v3 :128], v2
	bhs 3b
	/* Trailing pixels: the low bits of v1 still hold the leftover count;
	* shift them out in pairs so that C and N select each store size */
	4: movs v1, v1, lsl #27 + \log2Bpp
	bcc 5f
	vst1.\bpp {q0-q1}, [a3 :128]!
	5: bpl 6f
	vst1.\bpp {q0}, [a3 :128]!
	6: movs v1, v1, lsl #2
	vstmcs a3!, {d0}
	vstmmi a3!, {s0}
	.if \bpp <= 16
	movs v1, v1, lsl #2
	strcsh ip, [a3], #2
	.if \bpp == 8
	strmib ip, [a3], #1
	.endif
	.endif
	subs a2, a2, #1
	add a3, a3, a4, lsl #\log2Bpp
	bhi 1b
	pop {v1-v3,pc}

	/* Short-row case: only word alignment is established before the
	* stores, which therefore carry no alignment qualifier */
	51: movs v1, a1
	.if \bpp == 8
	tst a3, #3
	beq 53f
	52: subs v1, v1, #1
	blo 57f /* width used up before reaching word alignment: row done */
	strb ip, [a3], #1
	tst a3, #3
	bne 52b
	.elseif \bpp == 16
	tstne a3, #2 /* skipped when the width is zero (Z from movs) */
	subne v1, v1, #1
	strneh ip, [a3], #2
	.endif
	53: cmp v1, #32 >> \log2Bpp
	bcc 54f
	vst1.\bpp {q0-q1}, [a3]!
	sub v1, v1, #32 >> \log2Bpp
	/* Trailing pixels */
	54: movs v1, v1, lsl #27 + \log2Bpp
	bcc 55f
	vst1.\bpp {q0-q1}, [a3]!
	55: bpl 56f
	vst1.\bpp {q0}, [a3]!
	56: movs v1, v1, lsl #2
	vstmcs a3!, {d0}
	vstmmi a3!, {s0}
	.if \bpp <= 16
	movs v1, v1, lsl #2
	strcsh ip, [a3], #2
	.if \bpp == 8
	strmib ip, [a3], #1
	.endif
	.endif
	/* End of row. The early exit from the byte alignment loop also lands
	* here, so that the rows below a very narrow rectangle still get filled
	* instead of the function returning after the first row */
	57: subs a2, a2, #1
	add a3, a3, a4, lsl #\log2Bpp
	bhi 51b
	pop {v1-v3,pc}

	.endfunc
	.endm

	/*
	* Instantiate the fill routine for 32-, 16- and 8-bit pixels. The last
	* argument is log2 of the pixel size in bytes.
	*/
	generate_fillrect_function FillRect32ARMNEONAsm, 32, 2
	generate_fillrect_function FillRect16ARMNEONAsm, 16, 1
	generate_fillrect_function FillRect8ARMNEONAsm, 8, 0

	/******************************************************************************/

	/*
	* Blend one block of eight pixels using the source alpha.
	* In: d0-d2 = source colour planes, d3 = source alpha plane,
	* d4-d6 = destination colour planes, d7 = destination alpha plane
	* Out: d28-d30 = blended colour planes, d31 = destination alpha (copied)
	* Each colour plane is computed as
	* (src * alpha + dst * (255 - alpha)) / 255
	* where the division is done as (x + rounded(x >> 8)) rounded-narrowed to
	* the high byte. q0-q3 are overwritten as scratch.
	*/
	.macro RGBtoRGBPixelAlpha_process_pixblock_head
	vmvn d30, d3 /* get inverted source alpha */
	vmov d31, d7 /* dest alpha is always unchanged */
	vmull.u8 q14, d0, d3 /* plane 0: src * alpha ... */
	vmlal.u8 q14, d4, d30 /* ... + dst * inverted alpha */
	vmull.u8 q0, d1, d3 /* plane 1, same formula */
	vmlal.u8 q0, d5, d30
	vmull.u8 q1, d2, d3 /* plane 2, same formula */
	vmlal.u8 q1, d6, d30
	vrshr.u16 q2, q14, #8 /* rounded x >> 8, the correction term for /255 */
	vrshr.u16 q3, q0, #8
	vraddhn.u16 d28, q14, q2 /* high byte of x + correction, rounded */
	vrshr.u16 q2, q1, #8
	vraddhn.u16 d29, q0, q3
	vraddhn.u16 d30, q1, q2 /* d30 stops being inverted alpha here */
	.endm

	/*
	* Deliberately empty: RGBtoRGBPixelAlpha_process_pixblock_head already
	* leaves the finished pixels in d28-d31, so no tail stage is needed.
	*/
	.macro RGBtoRGBPixelAlpha_process_pixblock_tail
	/* nothing */
	.endm

	/*
	* Steady-state loop body: finishes the previous block and starts the next.
	* It stores the previous results (d28-d31) through DST_W, loads eight new
	* source pixels into d0-d3 and eight destination pixels into d4-d7, and
	* then repeats the arithmetic of RGBtoRGBPixelAlpha_process_pixblock_head.
	* The PF-prefixed lines are prefetch bookkeeping slotted between the NEON
	* instructions; PF, PF_X, PF_CTL, ORIG_W and the other upper-case names
	* are defined outside this file (presumably in pixman-arm-neon-asm.h -
	* confirm). The interleaving is presumably deliberate scheduling, so keep
	* the instruction order when editing.
	*/
	.macro RGBtoRGBPixelAlpha_process_pixblock_tail_head
	vld4.8 {d0-d3}, [SRC]!
	PF add PF_X, PF_X, #8
	vst4.8 {d28-d31}, [DST_W :128]!
	PF tst PF_CTL, #0xF
	vld4.8 {d4-d7}, [DST_R :128]!
	PF addne PF_X, PF_X, #8
	vmvn d30, d3 /* get inverted source alpha */
	vmov d31, d7 /* dest alpha is always unchanged */
	vmull.u8 q14, d0, d3
	PF subne PF_CTL, PF_CTL, #1
	vmlal.u8 q14, d4, d30
	PF cmp PF_X, ORIG_W
	vmull.u8 q0, d1, d3
	PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
	vmlal.u8 q0, d5, d30
	PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
	vmull.u8 q1, d2, d3
	PF subge PF_X, PF_X, ORIG_W
	vmlal.u8 q1, d6, d30
	PF subges PF_CTL, PF_CTL, #0x10
	vrshr.u16 q2, q14, #8
	PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
	vrshr.u16 q3, q0, #8
	PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
	vraddhn.u16 d28, q14, q2
	vrshr.u16 q2, q1, #8
	vraddhn.u16 d29, q0, q3
	vraddhn.u16 d30, q1, q2
	.endm

	/*
	* Build BlitRGBtoRGBPixelAlphaARMNEONAsm from the three pixel-block macros
	* named at the end of the argument list. generate_composite_function is
	* defined outside this file; the arguments 32, 0, 32 are presumably the
	* source, mask and destination bits per pixel - confirm against
	* pixman-arm-neon-asm.h.
	* The two flags are combined with a plain bitwise OR.
	*/
	generate_composite_function \
	BlitRGBtoRGBPixelAlphaARMNEONAsm, 32, 0, 32, \
	FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
	8, /* number of pixels, processed in a single block */ \
	5, /* prefetch distance */ \
	default_init, \
	default_cleanup, \
	RGBtoRGBPixelAlpha_process_pixblock_head, \
	RGBtoRGBPixelAlpha_process_pixblock_tail, \
	RGBtoRGBPixelAlpha_process_pixblock_tail_head

	/******************************************************************************/

	/*
	* First half of blending eight 32-bit source pixels onto eight 16-bit
	* destination pixels.
	* In: d0-d2 = source colour planes, d3 = source alpha plane,
	* q2 (d4-d5) = eight packed 16-bit destination pixels
	* Out: q13 = middle field (destination bits 5-10), q14 = low field
	* (bits 0-4), q15 = high field (bits 11-15), each holding
	* src * alpha + dst * inverted alpha with alpha reduced to 5 bits.
	* The sums still have to be scaled back down by the tail macro.
	* For an RGB565 destination the three fields would be green, blue and
	* red respectively - presumably; confirm the channel order with the caller.
	* d0-d3, d6, d7 and q2 are modified.
	*/
	.macro ARGBto565PixelAlpha_process_pixblock_head
	vmvn d6, d3 /* inverted source alpha, still 8 bits */
	vshr.u8 d1, #2 /* source plane 1 reduced to 6 bits */
	vshr.u8 d3, #3 /* source alpha reduced to 5 bits */
	vshr.u8 d0, #3 /* source plane 0 reduced to 5 bits */
	vshrn.u16 d7, q2, #3 /* dst bits 3-10; becomes the 6-bit middle field below */
	vshrn.u16 d25, q2, #8 /* dst bits 8-15; becomes the 5-bit high field below */
	vbic.i16 q2, #0xe0 /* clear bits 5-7 so the low byte is just bits 0-4 */
	vshr.u8 d6, #3 /* inverted alpha reduced to 5 bits */
	vshr.u8 d7, #2 /* d7 = dst bits 5-10 */
	vshr.u8 d2, #3 /* source plane 2 reduced to 5 bits */
	vmovn.u16 d24, q2 /* d24 = dst bits 0-4 */
	vshr.u8 d25, #3 /* d25 = dst bits 11-15 */
	vmull.u8 q13, d1, d3
	vmlal.u8 q13, d7, d6
	vmull.u8 q14, d0, d3
	vmlal.u8 q14, d24, d6
	vmull.u8 q15, d2, d3
	vmlal.u8 q15, d25, d6
	.endm

	/*
	* Second half of the 16-bit blend: scale the three sums in q13-q15 back
	* down and pack them into eight 16-bit pixels in q14.
	* Each sum x becomes rounded((x + (x >> 5)) >> 5), which approximates
	* division by 31 (the largest 5-bit alpha). q13 is then inserted into q14
	* from bit 5 upwards and q15 from bit 11 upwards.
	*/
	.macro ARGBto565PixelAlpha_process_pixblock_tail
	vsra.u16 q13, #5 /* x += x >> 5 */
	vsra.u16 q14, #5
	vsra.u16 q15, #5
	vrshr.u16 q13, #5 /* x = rounded x >> 5 */
	vrshr.u16 q14, #5
	vrshr.u16 q15, #5
	vsli.u16 q14, q13, #5 /* middle field into bits 5 and up */
	vsli.u16 q14, q15, #11 /* high field into bits 11 and up */
	.endm

	/*
	* Steady-state loop body: the instructions of
	* ARGBto565PixelAlpha_process_pixblock_tail for the previous block (the
	* vsra/vrshr/vsli lines) are interleaved with the loads for the next
	* block, the packed result in q14 is stored through DST_W, and then the
	* instructions of ARGBto565PixelAlpha_process_pixblock_head run on the
	* newly loaded data (source in d0-d3, destination in d4-d5).
	* The PF-prefixed lines are prefetch bookkeeping; PF, PF_X, PF_CTL, ORIG_W
	* and the other upper-case names are defined outside this file (presumably
	* in pixman-arm-neon-asm.h - confirm). The interleaving is presumably
	* deliberate scheduling, so keep the instruction order when editing.
	*/
	.macro ARGBto565PixelAlpha_process_pixblock_tail_head
	vld4.8 {d0-d3}, [SRC]!
	PF add PF_X, PF_X, #8
	vsra.u16 q13, #5
	PF tst PF_CTL, #0xF
	vsra.u16 q14, #5
	PF addne PF_X, PF_X, #8
	vsra.u16 q15, #5
	PF subne PF_CTL, PF_CTL, #1
	vrshr.u16 q13, #5
	PF cmp PF_X, ORIG_W
	vrshr.u16 q14, #5
	PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
	vrshr.u16 q15, #5
	PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
	vld1.8 {d4-d5}, [DST_R]!
	PF subge PF_X, PF_X, ORIG_W
	vsli.u16 q14, q13, #5
	PF subges PF_CTL, PF_CTL, #0x10
	vsli.u16 q14, q15, #11
	PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
	vst1.8 {q14}, [DST_W :128]!
	vmvn d6, d3
	vshr.u8 d1, #2
	vshr.u8 d3, #3
	vshr.u8 d0, #3
	vshrn.u16 d7, q2, #3
	vshrn.u16 d25, q2, #8
	vbic.i16 q2, #0xe0
	PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
	vshr.u8 d6, #3
	vshr.u8 d7, #2
	vshr.u8 d2, #3
	vmovn.u16 d24, q2
	vshr.u8 d25, #3
	vmull.u8 q13, d1, d3
	vmlal.u8 q13, d7, d6
	vmull.u8 q14, d0, d3
	vmlal.u8 q14, d24, d6
	vmull.u8 q15, d2, d3
	vmlal.u8 q15, d25, d6
	.endm

	/*
	* Build BlitARGBto565PixelAlphaARMNEONAsm from the three pixel-block
	* macros named at the end of the argument list.
	* generate_composite_function is defined outside this file; the arguments
	* 32, 0, 16 are presumably the source, mask and destination bits per
	* pixel - confirm against pixman-arm-neon-asm.h.
	* The two flags are combined with a plain bitwise OR.
	*/
	generate_composite_function \
	BlitARGBto565PixelAlphaARMNEONAsm, 32, 0, 16, \
	FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
	8, /* number of pixels, processed in a single block */ \
	6, /* prefetch distance */ \
	default_init, \
	default_cleanup, \
	ARGBto565PixelAlpha_process_pixblock_head, \
	ARGBto565PixelAlpha_process_pixblock_tail, \
	ARGBto565PixelAlpha_process_pixblock_tail_head