| /* |
| * Copyright (c) 2016 RISC OS Open Ltd |
| * |
| * This software is provided 'as-is', without any express or implied |
| * warranty. In no event will the authors be held liable for any damages |
| * arising from the use of this software. |
| * |
| * Permission is granted to anyone to use this software for any purpose, |
| * including commercial applications, and to alter it and redistribute it |
| * freely, subject to the following restrictions: |
| * |
| * 1. The origin of this software must not be misrepresented; you must not |
| * claim that you wrote the original software. If you use this software |
| * in a product, an acknowledgment in the product documentation would be |
| * appreciated but is not required. |
| * 2. Altered source versions must be plainly marked as such, and must not be |
| * misrepresented as being the original software. |
| * 3. This notice may not be removed or altered from any source distribution. |
| */ |
| |
| /* Prevent the stack from becoming executable */ |
| #if defined(__linux__) && defined(__ELF__) |
| .section .note.GNU-stack,"",%progbits |
| #endif |
| |
| .text |
| .arch armv6 |
| .object_arch armv4 |
| .arm |
| .altmacro |
| .p2align 2 |
| |
| #include "pixman-arm-asm.h" |
| #include "pixman-arm-simd-asm.h" |
| |
| /* A head macro should do all processing which results in an output of up to |
| * 16 bytes, as far as the final load instruction. The corresponding tail macro |
| * should complete the processing of the up-to-16 bytes. The calling macro will |
| * sometimes choose to insert a preload or a decrement of X between them. |
| * cond ARM condition code for code block |
| * numbytes Number of output bytes that should be generated this time |
| * firstreg First WK register in which to place output |
| * unaligned_src Whether to use non-wordaligned loads of source image |
| * unaligned_mask Whether to use non-wordaligned loads of mask image |
| * preload If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output |
| */ |
| |
| /******************************************************************************/ |
| |
| .macro FillRect32_init /* init hook of the 32bpp fill: load the fill value and copy it into the four registers that FillRect_process_tail aliases as WK4-WK7 */ |
| ldr SRC, [sp, #ARGS_STACK_OFFSET] /* 32-bit fill value read from the stack at ARGS_STACK_OFFSET */ |
| mov STRIDE_M, SRC |
| mov MASK, SRC |
| mov STRIDE_S, SRC /* SRC, STRIDE_S, MASK and STRIDE_M now hold the same word */ |
| .endm |
| |
| .macro FillRect16_init /* init hook of the 16bpp fill: replicate the halfword fill value across a word, then copy it into the registers that FillRect_process_tail aliases as WK4-WK7 */ |
| ldrh SRC, [sp, #ARGS_STACK_OFFSET] /* zero-extended 16-bit fill value */ |
| orr SRC, SRC, SRC, lsl #16 /* duplicate it into the upper halfword */ |
| mov STRIDE_M, SRC |
| mov MASK, SRC |
| mov STRIDE_S, SRC |
| .endm |
| |
| .macro FillRect8_init /* init hook of the 8bpp fill: replicate the byte fill value across a word, then copy it into the registers that FillRect_process_tail aliases as WK4-WK7 */ |
| ldrb SRC, [sp, #ARGS_STACK_OFFSET] /* zero-extended 8-bit fill value */ |
| orr SRC, SRC, SRC, lsl #8 /* value now fills the low halfword */ |
| orr SRC, SRC, SRC, lsl #16 /* value now fills all four bytes */ |
| mov STRIDE_M, SRC |
| mov MASK, SRC |
| mov STRIDE_S, SRC |
| .endm |
| |
| .macro FillRect_process_tail cond, numbytes, firstreg /* store numbytes of fill data through DST; firstreg is ignored because the first register index handed to pixst is fixed at 4 */ |
| WK7 .req STRIDE_M /* temporarily name the four pre-filled registers WK4-WK7 */ |
| WK6 .req MASK |
| WK5 .req STRIDE_S |
| WK4 .req SRC |
| pixst cond, numbytes, 4, DST /* pixst is defined outside this file; presumably it stores starting from WK4 - confirm */ |
| .unreq WK7 /* drop the aliases again so the names do not outlive this expansion */ |
| .unreq WK6 |
| .unreq WK5 |
| .unreq WK4 |
| .endm |
| |
| generate_composite_function /* instantiate the 32bpp fill; every argument is now comma-separated, like the other invocations in this file, instead of relying on whitespace splitting */ \ |
| FillRect32ARMSIMDAsm, 0, 0, 32, \ |
| FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \ |
| 0, /* prefetch distance doesn't apply */ \ |
| FillRect32_init, \ |
| nop_macro, /* newline */ \ |
| nop_macro, /* cleanup */ \ |
| nop_macro, /* process head */ \ |
| FillRect_process_tail |
| |
| generate_composite_function /* instantiate the 16bpp fill; every argument is now comma-separated, like the other invocations in this file, instead of relying on whitespace splitting */ \ |
| FillRect16ARMSIMDAsm, 0, 0, 16, \ |
| FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \ |
| 0, /* prefetch distance doesn't apply */ \ |
| FillRect16_init, \ |
| nop_macro, /* newline */ \ |
| nop_macro, /* cleanup */ \ |
| nop_macro, /* process head */ \ |
| FillRect_process_tail |
| |
| generate_composite_function /* instantiate the 8bpp fill; every argument is now comma-separated, like the other invocations in this file, instead of relying on whitespace splitting */ \ |
| FillRect8ARMSIMDAsm, 0, 0, 8, \ |
| FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \ |
| 0, /* prefetch distance doesn't apply */ \ |
| FillRect8_init, \ |
| nop_macro, /* newline */ \ |
| nop_macro, /* cleanup */ \ |
| nop_macro, /* process head */ \ |
| FillRect_process_tail |
| |
| /******************************************************************************/ |
| |
| /* This differs from the over_8888_8888 routine in Pixman in that the destination |
| * alpha component is always left unchanged, and RGB components are not |
| * premultiplied by alpha. It differs from BlitRGBtoRGBPixelAlpha in that |
| * renormalisation is done by multiplying by 257/256 (with rounding) rather than |
| * simply shifting right by 8 bits - removing the need to special-case alpha=0xff. |
| */ |
| |
| .macro RGBtoRGBPixelAlpha_init /* init hook passed to generate_composite_function for BlitRGBtoRGBPixelAlphaARMSIMDAsm */ |
| line_saved_regs STRIDE_S, ORIG_W /* macro defined outside this file; presumably declares the per-line registers that are spilled so the process macros may use them as temporaries - confirm */ |
| mov MASK, #0x80 /* rounding constant, handed to RGBtoRGBPixelAlpha_1pixel_translucent as its half argument */ |
| .endm |
| |
| .macro RGBtoRGBPixelAlpha_1pixel_translucent s, d, tmp0, tmp1, tmp2, tmp3, half /* blend bytes 0-2 of s into d, weighted by the top byte of s; byte 3 of d is left unchanged; s and tmp0-tmp3 are clobbered, half is only read */ |
| uxtb tmp3, s /* byte 0 of s */ |
| uxtb tmp0, d /* byte 0 of d */ |
| sub tmp0, tmp3, tmp0 /* tmp0 = signed difference s - d for byte 0 */ |
| uxtb tmp3, s, ror #16 /* byte 2 of s */ |
| uxtb tmp1, d, ror #16 /* byte 2 of d */ |
| sub tmp1, tmp3, tmp1 /* tmp1 = signed difference for byte 2 */ |
| uxtb tmp3, s, ror #8 /* byte 1 of s */ |
| mov s, s, lsr #24 /* s = alpha (its top byte); the colour bytes have all been extracted by now */ |
| uxtb tmp2, d, ror #8 /* byte 1 of d */ |
| sub tmp2, tmp3, tmp2 /* tmp2 = signed difference for byte 1 */ |
| smlabb tmp0, tmp0, s, half /* difference * alpha + half, for each of the three bytes */ |
| smlabb tmp1, tmp1, s, half |
| smlabb tmp2, tmp2, s, half |
| add tmp0, tmp0, asr #8 /* x += x >> 8, i.e. the multiply by 257/256 */ |
| add tmp1, tmp1, asr #8 |
| add tmp2, tmp2, asr #8 |
| pkhbt tmp0, tmp0, tmp1, lsl #16 /* byte-0 result in the low halfword, byte-2 result in the high halfword */ |
| and tmp2, tmp2, #0xff00 /* bits 15:8 of the byte-1 result, already sitting in the byte 1 position */ |
| uxtb16 tmp0, tmp0, ror #8 /* bits 15:8 of each halfword, moved down into bytes 0 and 2 */ |
| orr tmp0, tmp0, tmp2 /* tmp0 = per-byte deltas for bytes 0-2, with byte 3 zero */ |
| uadd8 d, d, tmp0 /* byte-wise add; each byte wraps modulo 256, so a negative delta subtracts */ |
| .endm |
| |
| .macro RGBtoRGBPixelAlpha_1pixel_opaque s, d /* opaque case: d keeps its own top byte and takes the low 24 bits of s; s loses its top byte */ |
| bic s, s, #0xff000000 /* drop the source alpha byte */ |
| and d, d, #0xff000000 /* keep only the destination alpha byte */ |
| orr d, d, s /* combine the two */ |
| .endm |
| |
| .macro RGBtoRGBPixelAlpha_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload /* load source and destination pixels and test the source alpha bytes for RGBtoRGBPixelAlpha_process_tail; only numbytes is used from the parameter list */ |
| .if numbytes == 16 |
| ldm SRC!, {WK0, WK1} /* source pixels 0 and 1 */ |
| ldm SRC!, {STRIDE_S, STRIDE_M} /* source pixels 2 and 3, used here only for the alpha tests (the tail reloads them) */ |
| ldrd WK2, WK3, [DST], #16 /* destination pixels 0 and 1; DST moves past all four pixels */ |
| orr SCRATCH, WK0, WK1 /* SCRATCH gathers the OR of the four source pixels */ |
| and ORIG_W, WK0, WK1 /* ORIG_W gathers the AND of the four source pixels */ |
| orr SCRATCH, SCRATCH, STRIDE_S |
| and ORIG_W, ORIG_W, STRIDE_S |
| orr SCRATCH, SCRATCH, STRIDE_M |
| and ORIG_W, ORIG_W, STRIDE_M |
| tst SCRATCH, #0xff000000 /* Z set when every source alpha byte is zero */ |
| .elseif numbytes == 8 |
| ldm SRC!, {WK0, WK1} /* two source pixels */ |
| ldm DST!, {WK2, WK3} /* two destination pixels */ |
| orr SCRATCH, WK0, WK1 /* OR of the pair, for the all-transparent test */ |
| and ORIG_W, WK0, WK1 /* AND of the pair, for the all-opaque test in the tail */ |
| tst SCRATCH, #0xff000000 /* Z set when both alpha bytes are zero */ |
| .else // numbytes == 4 |
| ldr WK0, [SRC], #4 /* one source pixel */ |
| ldr WK2, [DST], #4 /* one destination pixel */ |
| tst WK0, #0xff000000 /* Z set when its alpha byte is zero */ |
| .endif |
| .endm |
| |
| .macro RGBtoRGBPixelAlpha_process_tail cond, numbytes, firstreg /* blend and store the pixels loaded by RGBtoRGBPixelAlpha_process_head; expects the flags from its tst and, for 8 or 16 bytes, the AND of the source pixels in ORIG_W */ |
| beq 20f @ all transparent |
| .if numbytes == 16 |
| cmp ORIG_W, #0xff000000 /* ORIG_W is the AND of the source pixels, so unsigned >= 0xff000000 means every alpha byte is 0xff */ |
| bhs 10f @ all opaque |
| RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK /* STRIDE_S, STRIDE_M, SCRATCH and ORIG_W serve as temporaries from here on */ |
| RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK |
| strd WK2, WK3, [DST, #-16] /* write back pixels 0 and 1 (DST was already advanced by 16 in the head) */ |
| ldrd WK0, WK1, [SRC, #-8] /* reload source pixels 2 and 3 */ |
| ldrd WK2, WK3, [DST, #-8] /* load destination pixels 2 and 3 */ |
| RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK |
| RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK |
| b 19f |
| 10: RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2 |
| RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3 |
| strd WK2, WK3, [DST, #-16] /* write back pixels 0 and 1 */ |
| ldrd WK0, WK1, [SRC, #-8] /* source pixels 2 and 3 */ |
| ldrd WK2, WK3, [DST, #-8] /* destination pixels 2 and 3 */ |
| RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2 |
| RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3 |
| 19: strd WK2, WK3, [DST, #-8] /* store of pixels 2 and 3, shared by both paths */ |
| .elseif numbytes == 8 |
| cmp ORIG_W, #0xff000000 /* both alpha bytes 0xff? */ |
| bhs 10f @ all opaque |
| RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK |
| RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK |
| b 19f |
| 10: RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2 |
| RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3 |
| 19: strd WK2, WK3, [DST, #-8] /* store shared by both paths */ |
| .else // numbytes == 4 |
| cmp WK0, #0xff000000 /* single pixel, so test its alpha byte directly */ |
| bhs 10f @ opaque |
| RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK |
| b 19f |
| 10: RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2 |
| 19: str WK2, [DST, #-4] /* store shared by both paths */ |
| .endif |
| 20: |
| .endm |
| |
| generate_composite_function /* instantiate the blend routine from the macros above; generate_composite_function is defined outside this file, and the 32, 0, 32 arguments are presumably source, mask and destination bits per pixel - confirm */ \ |
| BlitRGBtoRGBPixelAlphaARMSIMDAsm, 32, 0, 32, \ |
| FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \ |
| 2, /* prefetch distance */ \ |
| RGBtoRGBPixelAlpha_init, \ |
| nop_macro, /* newline */ \ |
| nop_macro, /* cleanup */ \ |
| RGBtoRGBPixelAlpha_process_head, \ |
| RGBtoRGBPixelAlpha_process_tail |
| |
| /******************************************************************************/ |
| |
| .macro ARGBto565PixelAlpha_init /* init hook for BlitARGBto565PixelAlphaARMSIMDAsm: build the two packed constants that the pixel macros take as rbmask and rbhalf */ |
| line_saved_regs STRIDE_D, STRIDE_S, ORIG_W |
| mov MASK, #0x001f |
| orr MASK, MASK, MASK, lsl #16 /* MASK = 0x001f001f (rbmask) */ |
| mov STRIDE_M, #0x0010 |
| orr STRIDE_M, STRIDE_M, STRIDE_M, lsl #16 /* STRIDE_M = 0x00100010 (rbhalf) */ |
| .endm |
| |
| .macro ARGBto565PixelAlpha_newline /* newline hook passed to generate_composite_function for BlitARGBto565PixelAlphaARMSIMDAsm */ |
| mov STRIDE_S, #0x0200 /* rounding constant that the translucent macros receive as ghalf; STRIDE_S is listed in line_saved_regs, so presumably it has to be set again for every line - confirm */ |
| .endm |
| |
| /* On entry: |
| * s holds 1 32bpp source pixel |
| * d holds 1 16bpp destination pixel |
| * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively |
| * other registers are temporaries |
| * On exit: |
| * Constant registers preserved |
| */ |
| |
| .macro ARGBto565PixelAlpha_1pixel_translucent s, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc /* blend one 32bpp pixel s into the 16bpp pixel d using the top 5 bits of the alpha byte of s; the result is the low halfword of d; s, alpha, rb, g and misc are clobbered */ |
| mov alpha, s, lsr #27 /* 5-bit alpha */ |
| and misc, s, #0xfc00 /* top 6 bits of source byte 1 */ |
| and g, d, #0x07e0 /* destination 6-bit field at bits 10:5 */ |
| pkhbt rb, d, d, lsl #5 /* low halfword = d, high halfword = d << 5, which lands destination bits 15:11 at bits 20:16 */ |
| rsb misc, g, misc, lsr #5 /* source field moved to bits 10:5, minus the destination field */ |
| and s, rbmask, s, lsr #3 /* top 5 bits of source bytes 0 and 2, at bits 4:0 and 20:16 */ |
| and rb, rbmask, rb /* destination bits 4:0 and 15:11 in the same layout */ |
| sub s, s, rb /* both 5-bit differences in a single subtraction */ |
| smlabb misc, misc, alpha, ghalf /* difference * alpha + rounding constant */ |
| mla s, s, alpha, rbhalf /* same for the packed pair */ |
| add misc, misc, misc, lsl #5 /* multiply by 33 ... */ |
| add g, g, misc, asr #10 /* ... then shift right by 10: a scale of 33/1024, roughly 1/31, added to the destination field */ |
| add s, s, s, lsl #5 /* multiply the packed pair by 33 */ |
| and g, g, #0x07e0 /* discard bits outside the 6-bit field */ |
| add rb, rb, s, asr #10 /* scale the packed pair down and add it to the destination fields */ |
| and rb, rb, rbmask /* discard bits outside the two 5-bit fields */ |
| pkhbt rb, rb, rb, lsl #11 /* low halfword keeps bits 4:0; the bits 20:16 field moves up to bits 31:27 */ |
| orr d, rb, g /* merge in the 6-bit field */ |
| orr d, d, rb, lsr #16 /* bring bits 31:27 down to bits 15:11, completing the low halfword */ |
| .endm |
| |
| /* On entry: |
| * s holds 1 32bpp source pixel |
| * d holds 1 16bpp destination pixel |
| * rbmask holds 0x001f001f |
| * On exit: |
| * Constant registers preserved |
| */ |
| |
| .macro ARGBto565PixelAlpha_1pixel_opaque s, d, rbmask /* opaque case: pack the 32bpp pixel s into the low halfword of d as 5, 6 and 5 bit fields; s is clobbered and the previous value of d is not read */ |
| and d, rbmask, s, lsr #3 /* top 5 bits of source bytes 0 and 2, at bits 4:0 and 20:16 */ |
| orr d, d, d, lsr #5 /* copy the bits 20:16 field down to bits 15:11 */ |
| and s, s, #0xfc00 /* top 6 bits of source byte 1 */ |
| orr d, d, s, lsr #5 /* place them at bits 10:5 */ |
| .endm |
| |
| /* On entry: |
| * s1, s2 hold 2 32bpp source pixels |
| * d holds 2 16bpp destination pixels |
| * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively |
| * other registers are temporaries |
| * On exit: |
| * Constant registers preserved |
| * Blended results have been written through destination pointer |
| */ |
| |
| .macro ARGBto565PixelAlpha_2pixels_translucent s1, s2, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc /* blend source pixels s1 and s2 into the low and high halfwords of d and store both results at DST-4 and DST-2; the two pixels are interleaved, and s1, s2, d, alpha, rb, g and misc are all clobbered */ |
| mov alpha, s1, lsr #27 /* pixel 1: 5-bit alpha */ |
| and misc, s1, #0xfc00 /* pixel 1: top 6 bits of source byte 1 */ |
| and g, d, #0x07e0 /* pixel 1: destination field at bits 10:5 (low halfword of d) */ |
| pkhbt rb, d, d, lsl #5 /* pixel 1: destination bits 15:11 copied to bits 20:16; stray bits are masked off below */ |
| rsb misc, g, misc, lsr #5 /* pixel 1: 6-bit field difference, source - destination */ |
| and s1, rbmask, s1, lsr #3 /* pixel 1: top 5 bits of source bytes 0 and 2 */ |
| and rb, rbmask, rb /* pixel 1: destination 5-bit fields in the same layout */ |
| sub s1, s1, rb /* pixel 1: both 5-bit differences at once */ |
| smlabb misc, misc, alpha, ghalf /* pixel 1: difference * alpha + rounding */ |
| mla s1, s1, alpha, rbhalf /* pixel 1: same for the packed pair */ |
| uxth d, d, ror #16 /* d = second destination pixel, zero-extended */ |
| add misc, misc, misc, lsl #5 /* pixel 1: multiply by 33 */ |
| mov alpha, s2, lsr #27 /* pixel 2: 5-bit alpha (the pixel 1 alpha is no longer needed) */ |
| add g, g, misc, asr #10 /* pixel 1: scale down and add to the destination field */ |
| add s1, s1, s1, lsl #5 /* pixel 1: multiply the packed pair by 33 */ |
| and g, g, #0x07e0 |
| add rb, rb, s1, asr #10 /* pixel 1: scale down and add to the destination fields */ |
| and rb, rb, rbmask |
| and misc, s2, #0xfc00 /* pixel 2: top 6 bits of source byte 1 */ |
| pkhbt rb, rb, rb, lsl #11 /* pixel 1: move the bits 20:16 field up to bits 31:27 */ |
| and s1, d, #0x07e0 /* pixel 2: s1 is reused for the destination bits 10:5 field */ |
| pkhbt d, d, d, lsl #5 /* pixel 2: d is reused for the packed destination 5-bit fields */ |
| rsb misc, s1, misc, lsr #5 /* pixel 2: 6-bit field difference */ |
| and s2, rbmask, s2, lsr #3 /* pixel 2: top 5 bits of source bytes 0 and 2 */ |
| and d, rbmask, d |
| sub s2, s2, d /* pixel 2: both 5-bit differences */ |
| smlabb misc, misc, alpha, ghalf /* pixel 2: difference * alpha + rounding */ |
| mla s2, s2, alpha, rbhalf |
| orr alpha, rb, g /* pixel 1: alpha is reused to assemble the result */ |
| add misc, misc, misc, lsl #5 /* pixel 2: multiply by 33 */ |
| orr alpha, alpha, rb, lsr #16 /* pixel 1: result complete in the low halfword of alpha */ |
| add s1, s1, misc, asr #10 /* pixel 2: scale down and add to the destination field */ |
| add s2, s2, s2, lsl #5 |
| and s1, s1, #0x07e0 |
| add d, d, s2, asr #10 /* pixel 2: scale down and add to the destination fields */ |
| and d, d, rbmask |
| strh alpha, [DST, #-4] /* store pixel 1; DST has already been advanced past both pixels */ |
| pkhbt d, d, d, lsl #11 /* pixel 2: move the bits 20:16 field up to bits 31:27 */ |
| orr alpha, d, s1 /* pixel 2: assemble the result */ |
| orr alpha, alpha, d, lsr #16 |
| strh alpha, [DST, #-2] /* store pixel 2 */ |
| .endm |
| |
| /* On entry: |
| * s1, s2 hold 2 32bpp source pixels |
| * rbmask holds 0x001f001f |
| * other registers are temporaries |
| * On exit: |
| * Constant registers preserved |
| * Blended results have been written through destination pointer |
| */ |
| |
| .macro ARGBto565PixelAlpha_2pixels_opaque s1, s2, d, rbmask, g /* opaque case for a pair: pack s1 and s2 to 16bpp and store them at DST-4 and DST-2; the previous value of d is not read, and s1, d and g are clobbered */ |
| and g, s1, #0xfc00 /* pixel 1: top 6 bits of source byte 1 */ |
| and d, rbmask, s1, lsr #3 /* pixel 1: top 5 bits of source bytes 0 and 2 */ |
| and s1, rbmask, s2, lsr #3 /* pixel 2: same fields; s1 is reused now that pixel 1 has been fully read */ |
| orr d, d, d, lsr #5 /* pixel 1: copy the bits 20:16 field down to bits 15:11 */ |
| orr d, d, g, lsr #5 /* pixel 1: 6-bit field into bits 10:5 */ |
| and g, s2, #0xfc00 /* pixel 2: top 6 bits of source byte 1 */ |
| strh d, [DST, #-4] /* store pixel 1 */ |
| orr s1, s1, s1, lsr #5 /* pixel 2: copy the bits 20:16 field down to bits 15:11 */ |
| orr s1, s1, g, lsr #5 /* pixel 2: 6-bit field into bits 10:5 */ |
| strh s1, [DST, #-2] /* store pixel 2 */ |
| .endm |
| |
| .macro ARGBto565PixelAlpha_2pixels_head /* load one pair of pixels and set up the alpha tests consumed by ARGBto565PixelAlpha_2pixels_tail */ |
| ldrd WK0, WK1, [SRC], #8 /* two 32bpp source pixels */ |
| ldr WK2, [DST], #4 /* two 16bpp destination pixels in one word */ |
| orr SCRATCH, WK0, WK1 /* OR of the pair, for the all-transparent test */ |
| and ORIG_W, WK0, WK1 /* AND of the pair, for the all-opaque test in the tail */ |
| tst SCRATCH, #0xff000000 /* Z set when both alpha bytes are zero */ |
| .endm |
| |
| .macro ARGBto565PixelAlpha_2pixels_tail /* finish the pair loaded by ARGBto565PixelAlpha_2pixels_head; expects the flags from its tst and the AND of the source pixels in ORIG_W */ |
| beq 20f @ all transparent |
| cmp ORIG_W, #0xff000000 /* unsigned >= 0xff000000 means both alpha bytes are 0xff */ |
| bhs 10f @ all opaque |
| ARGBto565PixelAlpha_2pixels_translucent WK0, WK1, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W /* performs its own stores, so there is no shared store after label 10 */ |
| b 20f |
| 10: ARGBto565PixelAlpha_2pixels_opaque WK0, WK1, WK2, MASK, SCRATCH |
| 20: |
| .endm |
| |
| .macro ARGBto565PixelAlpha_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload /* fully process every pixel pair except the last, then do only the loads and alpha test for the last pair or single pixel; numbytes is in destination bytes (2 per pixel) and is the only parameter used */ |
| .if numbytes == 16 |
| ARGBto565PixelAlpha_2pixels_head /* 16 bytes: two extra pairs completed here */ |
| ARGBto565PixelAlpha_2pixels_tail |
| ARGBto565PixelAlpha_2pixels_head |
| ARGBto565PixelAlpha_2pixels_tail |
| .endif |
| .if numbytes >= 8 |
| ARGBto565PixelAlpha_2pixels_head /* 8 or 16 bytes: one more pair completed here */ |
| ARGBto565PixelAlpha_2pixels_tail |
| .endif |
| .if numbytes >= 4 |
| ARGBto565PixelAlpha_2pixels_head /* last pair: loads and test only; ARGBto565PixelAlpha_process_tail finishes it */ |
| .else // numbytes == 2 |
| ldr WK0, [SRC], #4 /* single 32bpp source pixel */ |
| ldrh WK2, [DST], #2 /* single 16bpp destination pixel */ |
| tst WK0, #0xff000000 /* Z set when its alpha byte is zero */ |
| .endif |
| .endm |
| |
| .macro ARGBto565PixelAlpha_process_tail cond, numbytes, firstreg /* finish the last pair or single pixel loaded by ARGBto565PixelAlpha_process_head; expects the flags left by its tst */ |
| .if numbytes >= 4 |
| ARGBto565PixelAlpha_2pixels_tail |
| .else // numbytes == 2 |
| beq 20f @ all transparent |
| cmp WK0, #0xff000000 /* alpha byte equal to 0xff? */ |
| bhs 10f @ opaque |
| ARGBto565PixelAlpha_1pixel_translucent WK0, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W |
| b 19f |
| 10: ARGBto565PixelAlpha_1pixel_opaque WK0, WK2, MASK |
| 19: strh WK2, [DST, #-2] /* store shared by both paths; DST was already advanced by 2 in the head */ |
| 20: |
| .endif |
| .endm |
| |
| generate_composite_function /* instantiate the 32bpp-to-16bpp blend from the macros above; generate_composite_function is defined outside this file, and the 32, 0, 16 arguments are presumably source, mask and destination bits per pixel - confirm */ \ |
| BlitARGBto565PixelAlphaARMSIMDAsm, 32, 0, 16, \ |
| FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \ |
| 2, /* prefetch distance */ \ |
| ARGBto565PixelAlpha_init, \ |
| ARGBto565PixelAlpha_newline, \ |
| nop_macro, /* cleanup */ \ |
| ARGBto565PixelAlpha_process_head, \ |
| ARGBto565PixelAlpha_process_tail |
| |
| /******************************************************************************/ |
| |
| .macro BGR888toRGB888_1pixel cond, reg, tmp /* swap bytes 0 and 2 of the WK register whose index is reg, keeping bytes 1 and 3; cond is appended to every mnemonic and tmp is clobbered */ |
| uxtb16&cond tmp, WK&reg, ror #8 /* tmp = bytes 1 and 3, moved down to bytes 0 and 2 */ |
| uxtb16&cond WK&reg, WK&reg, ror #16 /* bytes 0 and 2 exchanged, bytes 1 and 3 cleared */ |
| orr&cond WK&reg, WK&reg, tmp, lsl #8 /* put bytes 1 and 3 back where they were */ |
| .endm |
| |
| .macro BGR888toRGB888_2pixels cond, reg1, reg2, tmp1, tmp2 /* swap bytes 0 and 2 in the two WK registers whose indices are reg1 and reg2, with the two pixels interleaved; cond is appended to every mnemonic and tmp1, tmp2 are clobbered */ |
| uxtb16&cond tmp1, WK&reg1, ror #8 /* pixel 1: bytes 1 and 3, moved down to bytes 0 and 2 */ |
| uxtb16&cond WK&reg1, WK&reg1, ror #16 /* pixel 1: bytes 0 and 2 exchanged, bytes 1 and 3 cleared */ |
| uxtb16&cond tmp2, WK&reg2, ror #8 /* pixel 2: bytes 1 and 3, moved down to bytes 0 and 2 */ |
| uxtb16&cond WK&reg2, WK&reg2, ror #16 /* pixel 2: bytes 0 and 2 exchanged, bytes 1 and 3 cleared */ |
| orr&cond WK&reg1, WK&reg1, tmp1, lsl #8 /* pixel 1: put bytes 1 and 3 back */ |
| orr&cond WK&reg2, WK&reg2, tmp2, lsl #8 /* pixel 2: put bytes 1 and 3 back */ |
| .endm |
| |
| .macro BGR888toRGB888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload /* head step of the byte-swapping blit: only fetches the source; unaligned_mask and preload are unused */ |
| pixld cond, numbytes, firstreg, SRC, unaligned_src /* pixld is defined outside this file; presumably it loads numbytes from SRC into WK registers starting at index firstreg - confirm */ |
| .endm |
| |
| .macro BGR888toRGB888_process_tail cond, numbytes, firstreg /* tail step of the byte-swapping blit: swap bytes 0 and 2 in each loaded WK register, two registers at a time where possible; MASK and STRIDE_M are used as temporaries */ |
| .if numbytes >= 8 |
| BGR888toRGB888_2pixels cond, %(firstreg+0), %(firstreg+1), MASK, STRIDE_M |
| .else @ numbytes == 4 |
| BGR888toRGB888_1pixel cond, %(firstreg+0), MASK |
| .endif |
| .if numbytes == 16 |
| BGR888toRGB888_2pixels cond, %(firstreg+2), %(firstreg+3), MASK, STRIDE_M |
| .endif |
| .endm |
| |
| generate_composite_function /* instantiate the byte-swapping blit from the macros above; generate_composite_function is defined outside this file, and the 32, 0, 32 arguments are presumably source, mask and destination bits per pixel - confirm */ \ |
| Blit_BGR888_RGB888ARMSIMDAsm, 32, 0, 32, \ |
| FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \ |
| 2, /* prefetch distance */ \ |
| nop_macro, /* init */ \ |
| nop_macro, /* newline */ \ |
| nop_macro, /* cleanup */ \ |
| BGR888toRGB888_process_head, \ |
| BGR888toRGB888_process_tail |
| |
| /******************************************************************************/ |
| |
| .macro RGB444toRGB888_init /* init hook for Blit_RGB444_RGB888ARMSIMDAsm: set up the GE flags and the nibble mask that the pixel macros depend on */ |
| /* GE[3:0] = 0101 makes SEL take bytes 0 and 2 from its first source operand and bytes 1 and 3 from its second */ |
| msr CPSR_s, #0x50000 |
| ldr MASK, =0x0f0f0f0f /* low nibble of every byte; passed to the pixel macros as mask */ |
| .endm |
| |
| .macro RGB444toRGB888_1pixel reg, mask, tmp /* expand the 4-bit-per-channel pixel in the low halfword of the WK register whose index is reg to 8 bits per channel, in place; needs GE[3:0] = 0101 as set by RGB444toRGB888_init; tmp is clobbered */ |
| pkhbt WK&reg, WK&reg, WK&reg, lsl #12 @ 0000aaaarrrrggggaaaarrrrggggbbbb |
| and WK&reg, mask, WK&reg @ 0000aaaa0000gggg0000rrrr0000bbbb |
| orr WK&reg, WK&reg, WK&reg, lsl #4 @ aaaaaaaaggggggggrrrrrrrrbbbbbbbb |
| pkhtb tmp, WK&reg, WK&reg, asr #8 @ aaaaaaaaggggggggggggggggrrrrrrrr |
| pkhbt WK&reg, WK&reg, WK&reg, lsl #8 @ ggggggggrrrrrrrrrrrrrrrrbbbbbbbb |
| sel WK&reg, WK&reg, tmp @ aaaaaaaarrrrrrrrggggggggbbbbbbbb |
| .endm |
| |
| .macro RGB444toRGB888_2pixels in, out1, out2, mask, tmp1, tmp2 /* expand the two 4-bit-per-channel pixels packed in WK register index in: the low-halfword pixel goes to WK index out1, the high-halfword pixel to WK index out2; needs GE[3:0] = 0101 as set by RGB444toRGB888_init; tmp1 and tmp2 are clobbered */ |
| and tmp1, mask, WK&in @ 0000RRRR0000BBBB0000rrrr0000bbbb |
| and tmp2, mask, WK&in, lsr #4 @ 0000AAAA0000GGGG0000aaaa0000gggg |
| orr tmp1, tmp1, tmp1, lsl #4 @ RRRRRRRRBBBBBBBBrrrrrrrrbbbbbbbb |
| orr tmp2, tmp2, tmp2, lsl #4 @ AAAAAAAAGGGGGGGGaaaaaaaagggggggg |
| pkhtb WK&out2, tmp2, tmp1, asr #16 @ AAAAAAAAGGGGGGGGRRRRRRRRBBBBBBBB |
| pkhbt WK&out1, tmp1, tmp2, lsl #16 @ aaaaaaaaggggggggrrrrrrrrbbbbbbbb |
| pkhtb tmp2, WK&out2, WK&out2, asr #8 @ AAAAAAAAGGGGGGGGGGGGGGGGRRRRRRRR |
| pkhtb tmp1, WK&out1, WK&out1, asr #8 @ aaaaaaaaggggggggggggggggrrrrrrrr |
| pkhbt WK&out1, WK&out1, WK&out1, lsl #8 @ ggggggggrrrrrrrrrrrrrrrrbbbbbbbb |
| pkhbt WK&out2, WK&out2, WK&out2, lsl #8 @ GGGGGGGGRRRRRRRRRRRRRRRRBBBBBBBB |
| sel WK&out1, WK&out1, tmp1 @ aaaaaaaarrrrrrrrggggggggbbbbbbbb |
| sel WK&out2, WK&out2, tmp2 @ AAAAAAAARRRRRRRRGGGGGGGGBBBBBBBB |
| .endm |
| |
| .macro RGB444toRGB888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload /* head step of the 16bpp-to-32bpp expansion: only fetches the source; unaligned_mask and preload are unused */ |
| pixld cond, numbytes/2, firstreg, SRC, unaligned_src /* the source is half the size of the output, hence numbytes/2; pixld is defined outside this file */ |
| .endm |
| |
| .macro RGB444toRGB888_process_tail cond, numbytes, firstreg /* tail step of the 16bpp-to-32bpp expansion: widen the loaded pixels in the WK registers; cond is unused, and STRIDE_M and SCRATCH serve as temporaries */ |
| .if numbytes >= 8 |
| .if numbytes == 16 |
| RGB444toRGB888_2pixels %(firstreg+1), %(firstreg+2), %(firstreg+3), MASK, STRIDE_M, SCRATCH /* must come first: its input is WK(firstreg+1), which the next expansion overwrites as out2 */ |
| .endif |
| RGB444toRGB888_2pixels %(firstreg+0), %(firstreg+0), %(firstreg+1), MASK, STRIDE_M, SCRATCH /* widen the pair held in WK(firstreg+0) into WK(firstreg+0) and WK(firstreg+1) */ |
| .else @ numbytes == 4 |
| RGB444toRGB888_1pixel %(firstreg+0), MASK, SCRATCH |
| .endif |
| .endm |
| |
| generate_composite_function /* instantiate the 16bpp-to-32bpp expansion blit from the macros above; generate_composite_function is defined outside this file, and the 16, 0, 32 arguments are presumably source, mask and destination bits per pixel - confirm */ \ |
| Blit_RGB444_RGB888ARMSIMDAsm, 16, 0, 32, \ |
| FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \ |
| 2, /* prefetch distance */ \ |
| RGB444toRGB888_init, \ |
| nop_macro, /* newline */ \ |
| nop_macro, /* cleanup */ \ |
| RGB444toRGB888_process_head, \ |
| RGB444toRGB888_process_tail |