| /* |
| * Copyright (c) 2012 Raspberry Pi Foundation |
| * Copyright (c) 2012 RISC OS Open Ltd |
| * |
| * This software is provided 'as-is', without any express or implied |
| * warranty. In no event will the authors be held liable for any damages |
| * arising from the use of this software. |
| * |
| * Permission is granted to anyone to use this software for any purpose, |
| * including commercial applications, and to alter it and redistribute it |
| * freely, subject to the following restrictions: |
| * |
| * 1. The origin of this software must not be misrepresented; you must not |
| * claim that you wrote the original software. If you use this software |
| * in a product, an acknowledgment in the product documentation would be |
| * appreciated but is not required. |
| * 2. Altered source versions must be plainly marked as such, and must not be |
| * misrepresented as being the original software. |
| * 3. This notice may not be removed or altered from any source distribution. |
| */ |
| |
| /* |
| * Because the alignment of pixel data to cachelines, and even the number of |
* cachelines per row, can vary from row to row, and because of the need to
| * preload each scanline once and only once, this prefetch strategy treats |
| * each row of pixels independently. When a pixel row is long enough, there |
| * are three distinct phases of prefetch: |
| * * an inner loop section, where each time a cacheline of data is |
| * processed, another cacheline is preloaded (the exact distance ahead is |
| * determined empirically using profiling results from lowlevel-blt-bench) |
| * * a leading section, where enough cachelines are preloaded to ensure no |
| * cachelines escape being preloaded when the inner loop starts |
| * * a trailing section, where a limited number (0 or more) of cachelines |
| * are preloaded to deal with data (if any) that hangs off the end of the |
| * last iteration of the inner loop, plus any trailing bytes that were not |
| * enough to make up one whole iteration of the inner loop |
| * |
| * There are (in general) three distinct code paths, selected between |
| * depending upon how long the pixel row is. If it is long enough that there |
| * is at least one iteration of the inner loop (as described above) then |
| * this is described as the "wide" case. If it is shorter than that, but |
| * there are still enough bytes output that there is at least one 16-byte- |
| * long, 16-byte-aligned write to the destination (the optimum type of |
| * write), then this is the "medium" case. If it is not even this long, then |
| * this is the "narrow" case, and there is no attempt to align writes to |
| * 16-byte boundaries. In the "medium" and "narrow" cases, all the |
| * cachelines containing data from the pixel row are prefetched up-front. |
| */ |
| |
| /* |
* Define DEBUG_PARAMS to store a copy of the arguments on the stack for debugging.
| */ |
| #undef DEBUG_PARAMS |
| |
| /* |
* Bit flags for the 'generate_composite_function' macro, which are used
* to tune the generated functions' behavior.
| */ |
| .set FLAG_DST_WRITEONLY, 0 |
| .set FLAG_DST_READWRITE, 1 |
| .set FLAG_COND_EXEC, 0 |
| .set FLAG_BRANCH_OVER, 2 |
| .set FLAG_PROCESS_PRESERVES_PSR, 0 |
| .set FLAG_PROCESS_CORRUPTS_PSR, 4 |
| .set FLAG_PROCESS_DOESNT_STORE, 0 |
.set FLAG_PROCESS_DOES_STORE, 8 /* usually because the process macros need to conditionally skip the store */
| .set FLAG_NO_SPILL_LINE_VARS, 0 |
| .set FLAG_SPILL_LINE_VARS_WIDE, 16 |
| .set FLAG_SPILL_LINE_VARS_NON_WIDE, 32 |
| .set FLAG_SPILL_LINE_VARS, 48 |
| .set FLAG_PROCESS_CORRUPTS_SCRATCH, 0 |
| .set FLAG_PROCESS_PRESERVES_SCRATCH, 64 |
| .set FLAG_PROCESS_PRESERVES_WK0, 0 |
| .set FLAG_PROCESS_CORRUPTS_WK0, 128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */ |
| .set FLAG_PRELOAD_DST, 0 |
| .set FLAG_NO_PRELOAD_DST, 256 |
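
/*
 * A hypothetical invocation, for illustration only (the names
 * example_blit_asm_armv6, example_process_head and example_process_tail
 * are placeholders; the real callers of 'generate_composite_function'
 * live in the accompanying .S files). The flags above are OR-ed together
 * to form the 'flags' argument:
 *
 *   generate_composite_function \
 *       example_blit_asm_armv6, 32, 0, 32, \
 *       FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE, \
 *       4,          (prefetch distance) \
 *       nop_macro,  (init) \
 *       nop_macro,  (newline) \
 *       nop_macro,  (cleanup) \
 *       example_process_head, \
 *       example_process_tail
 */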
| |
| /* |
| * Number of bytes by which to adjust preload offset of destination |
| * buffer (allows preload instruction to be moved before the load(s)) |
| */ |
| .set DST_PRELOAD_BIAS, 0 |
| |
| /* |
| * Offset into stack where mask and source pointer/stride can be accessed. |
| */ |
| #ifdef DEBUG_PARAMS |
| .set ARGS_STACK_OFFSET, (9*4+9*4) |
| #else |
| .set ARGS_STACK_OFFSET, (9*4) |
| #endif |
| |
| /* |
| * Offset into stack where space allocated during init macro can be accessed. |
| */ |
| .set LOCALS_STACK_OFFSET, 0 |
| |
| /* |
* Constants for selecting the preferred prefetch type.
| */ |
| .set PREFETCH_TYPE_NONE, 0 |
| .set PREFETCH_TYPE_STANDARD, 1 |
| |
| /* |
| * Definitions of macros for load/store of pixel data. |
| */ |
| |
| .macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0 |
| .if numbytes == 16 |
| .if unaligned == 1 |
op&r&cond WK&reg0, [base], #4
op&r&cond WK&reg1, [base], #4
op&r&cond WK&reg2, [base], #4
op&r&cond WK&reg3, [base], #4
.else
op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
| .endif |
| .elseif numbytes == 8 |
| .if unaligned == 1 |
op&r&cond WK&reg0, [base], #4
op&r&cond WK&reg1, [base], #4
.else
op&m&cond&ia base!, {WK&reg0,WK&reg1}
.endif
.elseif numbytes == 4
op&r&cond WK&reg0, [base], #4
.elseif numbytes == 2
op&r&cond&h WK&reg0, [base], #2
.elseif numbytes == 1
op&r&cond&b WK&reg0, [base], #1
| .else |
| .error "unsupported size: numbytes" |
| .endif |
| .endm |
| |
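/* Store behind an already-updated base pointer: used when the destination
 * is read-write, because the earlier load of the same pixels will already
 * have post-incremented the DST pointer. */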
| .macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base |
| .if numbytes == 16 |
stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
.elseif numbytes == 8
stm&cond&db base, {WK&reg0,WK&reg1}
.elseif numbytes == 4
str&cond WK&reg0, [base, #-4]
.elseif numbytes == 2
str&cond&h WK&reg0, [base, #-2]
.elseif numbytes == 1
str&cond&b WK&reg0, [base, #-1]
| .else |
| .error "unsupported size: numbytes" |
| .endif |
| .endm |
| |
| .macro pixld cond, numbytes, firstreg, base, unaligned |
| pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned |
| .endm |
| |
| .macro pixst cond, numbytes, firstreg, base |
| .if (flags) & FLAG_DST_READWRITE |
| pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base |
| .else |
| pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base |
| .endif |
| .endm |
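
/* For example, "pixld , 16, 0, SRC, 0" expands to
 * "ldmia SRC!, {WK0,WK1,WK2,WK3}", whereas "pixld , 16, 0, SRC, 1"
 * becomes four post-indexed "ldr WKn, [SRC], #4" loads so that SRC only
 * needs to be 4-byte aligned. */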
| |
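/* PF wraps any instruction that exists purely for the benefit of software
 * prefetch; such instructions are assembled only when the function was
 * generated with a non-zero prefetch distance (PREFETCH_TYPE_STANDARD)
 * and are omitted entirely otherwise. */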
| .macro PF a, x:vararg |
| .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD) |
| a x |
| .endif |
| .endm |
| |
| |
| .macro preload_leading_step1 bpp, ptr, base |
| /* If the destination is already 16-byte aligned, then we need to preload |
| * between 0 and prefetch_distance (inclusive) cache lines ahead so there |
| * are no gaps when the inner loop starts. |
| */ |
| .if bpp > 0 |
| PF bic, ptr, base, #31 |
| .set OFFSET, 0 |
| .rept prefetch_distance+1 |
| PF pld, [ptr, #OFFSET] |
| .set OFFSET, OFFSET+32 |
| .endr |
| .endif |
| .endm |
| |
| .macro preload_leading_step2 bpp, bpp_shift, ptr, base |
| /* However, if the destination is not 16-byte aligned, we may need to |
| * preload more cache lines than that. The question we need to ask is: |
| * are the bytes corresponding to the leading pixels more than the amount |
| * by which the source pointer will be rounded down for preloading, and if |
| * so, by how many cache lines? Effectively, we want to calculate |
| * leading_bytes = ((-dst)&15)*src_bpp/dst_bpp |
| * inner_loop_offset = (src+leading_bytes)&31 |
| * extra_needed = leading_bytes - inner_loop_offset |
| * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only |
| * possible when there are 4 src bytes for every 1 dst byte). |
| */ |
| .if bpp > 0 |
| .ifc base,DST |
| /* The test can be simplified further when preloading the destination */ |
| PF tst, base, #16 |
| PF beq, 61f |
| .else |
| .if bpp/dst_w_bpp == 4 |
| PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift |
| PF and, SCRATCH, SCRATCH, #31 |
| PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift |
| PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */ |
| PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */ |
| PF bcs, 61f |
| PF bpl, 60f |
| PF pld, [ptr, #32*(prefetch_distance+2)] |
| .else |
| PF mov, SCRATCH, base, lsl #32-5 |
| PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift |
| PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift |
| PF bls, 61f |
| .endif |
| .endif |
| 60: PF pld, [ptr, #32*(prefetch_distance+1)] |
| 61: |
| .endif |
| .endm |
| |
| #define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2)) |
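/* (INDEX) & ~((INDEX)+1) is a mask of the trailing one-bits of INDEX, so
 * (assuming SIZE is a power of 2) the expression is true exactly when
 * INDEX == SIZE-1 modulo SIZE - e.g. for SIZE = 4 it is true for INDEX =
 * 3, 7, 11 and so on. SIZE < 2 (which includes the SIZE = 0 result of the
 * integer division below when bpp > 2*dst_w_bpp) makes it always true. */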
| .macro preload_middle bpp, base, scratch_holds_offset |
| .if bpp > 0 |
| /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */ |
| .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp) |
| .if scratch_holds_offset |
| PF pld, [base, SCRATCH] |
| .else |
| PF bic, SCRATCH, base, #31 |
| PF pld, [SCRATCH, #32*prefetch_distance] |
| .endif |
| .endif |
| .endif |
| .endm |
| |
| .macro preload_trailing bpp, bpp_shift, base |
| .if bpp > 0 |
| .if bpp*pix_per_block > 256 |
| /* Calculations are more complex if more than one fetch per block */ |
| PF and, WK1, base, #31 |
| PF add, WK1, WK1, WK0, lsl #bpp_shift |
| PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1) |
| PF bic, SCRATCH, base, #31 |
| 80: PF pld, [SCRATCH, #32*(prefetch_distance+1)] |
| PF add, SCRATCH, SCRATCH, #32 |
| PF subs, WK1, WK1, #32 |
| PF bhi, 80b |
| .else |
| /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */ |
| PF mov, SCRATCH, base, lsl #32-5 |
| PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift |
| PF adceqs, SCRATCH, SCRATCH, #0 |
/* The instruction above has two effects: it ensures Z is only
* set if C was clear (so Z indicates that both shifted quantities
* were 0), and it clears C if Z was set (so C now indicates that the
* sum of the shifted quantities was strictly greater than 32) */
| PF beq, 82f |
| PF bic, SCRATCH, base, #31 |
| PF bcc, 81f |
| PF pld, [SCRATCH, #32*(prefetch_distance+2)] |
| 81: PF pld, [SCRATCH, #32*(prefetch_distance+1)] |
| 82: |
| .endif |
| .endif |
| .endm |
| |
| |
| .macro preload_line narrow_case, bpp, bpp_shift, base |
| /* "narrow_case" - just means that the macro was invoked from the "narrow" |
| * code path rather than the "medium" one - because in the narrow case, |
* the row of pixels is known to output no more than 30 bytes, so
* (assuming the source pixels are no wider than the destination
| * pixels) they cannot possibly straddle more than 2 32-byte cachelines, |
| * meaning there's no need for a loop. |
| * "bpp" - number of bits per pixel in the channel (source, mask or |
| * destination) that's being preloaded, or 0 if this channel is not used |
| * for reading |
| * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course) |
| * "base" - base address register of channel to preload (SRC, MASK or DST) |
| */ |
| .if bpp > 0 |
| .if narrow_case && (bpp <= dst_w_bpp) |
| /* In these cases, each line for each channel is in either 1 or 2 cache lines */ |
| PF bic, WK0, base, #31 |
| PF pld, [WK0] |
PF add, WK1, base, X, lsl #bpp_shift
| PF sub, WK1, WK1, #1 |
| PF bic, WK1, WK1, #31 |
| PF cmp, WK1, WK0 |
| PF beq, 90f |
| PF pld, [WK1] |
| 90: |
| .else |
| PF bic, WK0, base, #31 |
| PF pld, [WK0] |
| PF add, WK1, base, X, lsl #bpp_shift |
| PF sub, WK1, WK1, #1 |
| PF bic, WK1, WK1, #31 |
| PF cmp, WK1, WK0 |
| PF beq, 92f |
| 91: PF add, WK0, WK0, #32 |
| PF cmp, WK0, WK1 |
| PF pld, [WK0] |
| PF bne, 91b |
| 92: |
| .endif |
| .endif |
| .endm |
| |
| |
| .macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx |
| process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0 |
| .if decrementx |
| sub&cond X, X, #8*numbytes/dst_w_bpp |
| .endif |
| process_tail cond, numbytes, firstreg |
| .if !((flags) & FLAG_PROCESS_DOES_STORE) |
| pixst cond, numbytes, firstreg, DST |
| .endif |
| .endm |
| |
| .macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx |
| .if (flags) & FLAG_BRANCH_OVER |
| .ifc cond,mi |
| bpl 100f |
| .endif |
| .ifc cond,cs |
| bcc 100f |
| .endif |
| .ifc cond,ne |
| beq 100f |
| .endif |
| conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx |
| 100: |
| .else |
| conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx |
| .endif |
| .endm |
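
/* For example, "conditional_process1 cs, ..." processes a chunk only when
 * the carry flag is set: with FLAG_BRANCH_OVER a bcc branches around the
 * whole head/tail/store sequence, otherwise every instruction in it is
 * executed conditionally under CS. */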
| |
| .macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx |
| .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE) |
| /* Can't interleave reads and writes */ |
| test |
| conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx |
| .if (flags) & FLAG_PROCESS_CORRUPTS_PSR |
| test |
| .endif |
| conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx |
| .else |
| /* Can interleave reads and writes for better scheduling */ |
| test |
| process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0 |
| process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0 |
| .if decrementx |
| sub&cond1 X, X, #8*numbytes1/dst_w_bpp |
| sub&cond2 X, X, #8*numbytes2/dst_w_bpp |
| .endif |
| process_tail cond1, numbytes1, firstreg1 |
| process_tail cond2, numbytes2, firstreg2 |
| pixst cond1, numbytes1, firstreg1, DST |
| pixst cond2, numbytes2, firstreg2, DST |
| .endif |
| .endm |
| |
| |
| .macro test_bits_1_0_ptr |
| .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 |
| movs SCRATCH, X, lsl #32-1 /* C,N = bits 1,0 of DST */ |
| .else |
| movs SCRATCH, WK0, lsl #32-1 /* C,N = bits 1,0 of DST */ |
| .endif |
| .endm |
| |
| .macro test_bits_3_2_ptr |
| .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 |
| movs SCRATCH, X, lsl #32-3 /* C,N = bits 3, 2 of DST */ |
| .else |
| movs SCRATCH, WK0, lsl #32-3 /* C,N = bits 3, 2 of DST */ |
| .endif |
| .endm |
| |
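/* Illustrative decomposition (assuming dst_w_bpp = 8): 13 leading bytes =
 * binary 1101 are handled as 1 + 4 + 8 bytes - test_bits_1_0_ptr causes 1
 * byte to be processed and 2 to be skipped, then test_bits_3_2_ptr causes
 * 4 and then 8 bytes to be processed. */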
| .macro leading_15bytes process_head, process_tail |
| /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */ |
| .set DECREMENT_X, 1 |
| .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 |
| .set DECREMENT_X, 0 |
| sub X, X, WK0, lsr #dst_bpp_shift |
| str X, [sp, #LINE_SAVED_REG_COUNT*4] |
| mov X, WK0 |
| .endif |
| /* Use unaligned loads in all cases for simplicity */ |
| .if dst_w_bpp == 8 |
| conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X |
| .elseif dst_w_bpp == 16 |
| test_bits_1_0_ptr |
| conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X |
| .endif |
| conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X |
| .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 |
| ldr X, [sp, #LINE_SAVED_REG_COUNT*4] |
| .endif |
| .endm |
| |
| .macro test_bits_3_2_pix |
| movs SCRATCH, X, lsl #dst_bpp_shift+32-3 |
| .endm |
| |
| .macro test_bits_1_0_pix |
| .if dst_w_bpp == 8 |
| movs SCRATCH, X, lsl #dst_bpp_shift+32-1 |
| .else |
| movs SCRATCH, X, lsr #1 |
| .endif |
| .endm |
| |
| .macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask |
| conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0 |
| .if dst_w_bpp == 16 |
| test_bits_1_0_pix |
| conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0 |
| .elseif dst_w_bpp == 8 |
| conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0 |
| .endif |
| .endm |
| |
| |
| .macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment |
| 110: |
| .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */ |
| .rept pix_per_block*dst_w_bpp/128 |
| process_head , 16, 0, unaligned_src, unaligned_mask, 1 |
| .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) |
| preload_middle src_bpp, SRC, 1 |
| .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) |
| preload_middle mask_bpp, MASK, 1 |
| .else |
| preload_middle src_bpp, SRC, 0 |
| preload_middle mask_bpp, MASK, 0 |
| .endif |
| .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0) |
| /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that |
| * destination prefetches are 32-byte aligned. It's also the easiest channel to offset |
| * preloads for, to achieve staggered prefetches for multiple channels, because there are |
| * always two STMs per prefetch, so there is always an opposite STM on which to put the |
| * preload. Note, no need to BIC the base register here */ |
| PF pld, [DST, #32*prefetch_distance - dst_alignment] |
| .endif |
| process_tail , 16, 0 |
| .if !((flags) & FLAG_PROCESS_DOES_STORE) |
| pixst , 16, 0, DST |
| .endif |
| .set SUBBLOCK, SUBBLOCK+1 |
| .endr |
| subs X, X, #pix_per_block |
| bhs 110b |
| .endm |
| |
| .macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask |
| /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */ |
| .if dst_r_bpp > 0 |
| tst DST, #16 |
| bne 111f |
| process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS |
| b 112f |
| 111: |
| .endif |
| process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS |
| 112: |
| /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */ |
| .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256) |
| PF and, WK0, X, #pix_per_block-1 |
| .endif |
| preload_trailing src_bpp, src_bpp_shift, SRC |
| preload_trailing mask_bpp, mask_bpp_shift, MASK |
| .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 |
| preload_trailing dst_r_bpp, dst_bpp_shift, DST |
| .endif |
| add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp |
| /* The remainder of the line is handled identically to the medium case */ |
| medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask |
| .endm |
| |
| .macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask |
| 120: |
| process_head , 16, 0, unaligned_src, unaligned_mask, 0 |
| process_tail , 16, 0 |
| .if !((flags) & FLAG_PROCESS_DOES_STORE) |
| pixst , 16, 0, DST |
| .endif |
| subs X, X, #128/dst_w_bpp |
| bhs 120b |
| /* Trailing pixels */ |
| tst X, #128/dst_w_bpp - 1 |
| beq exit_label |
| trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask |
| .endm |
| |
| .macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask |
| tst X, #16*8/dst_w_bpp |
| conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0 |
| /* Trailing pixels */ |
| /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */ |
| trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask |
| .endm |
| |
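/* Instantiate "action" up to four times, specialised for each combination
 * of source and mask word-alignment that is possible for the bit depths
 * involved (only 8bpp and 16bpp channels can be non-word-aligned), and
 * dispatch between them at run time on the low address bits. */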
| .macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label |
| /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */ |
| .if mask_bpp == 8 || mask_bpp == 16 |
| tst MASK, #3 |
| bne 141f |
| .endif |
| .if src_bpp == 8 || src_bpp == 16 |
| tst SRC, #3 |
| bne 140f |
| .endif |
| action process_head, process_tail, process_inner_loop, exit_label, 0, 0 |
| .if src_bpp == 8 || src_bpp == 16 |
| b exit_label |
| 140: |
| action process_head, process_tail, process_inner_loop, exit_label, 1, 0 |
| .endif |
| .if mask_bpp == 8 || mask_bpp == 16 |
| b exit_label |
| 141: |
| .if src_bpp == 8 || src_bpp == 16 |
| tst SRC, #3 |
| bne 142f |
| .endif |
| action process_head, process_tail, process_inner_loop, exit_label, 0, 1 |
| .if src_bpp == 8 || src_bpp == 16 |
| b exit_label |
| 142: |
| action process_head, process_tail, process_inner_loop, exit_label, 1, 1 |
| .endif |
| .endif |
| .endm |
| |
| |
| .macro end_of_line restore_x, vars_spilled, loop_label, last_one |
| .if SINGLE_SCANLINE |
| .ifc "last_one","" |
| b 198f |
| .endif |
| .else |
| .if vars_spilled |
/* Sadly, GAS doesn't seem to have an equivalent of armasm's DCI directive */
/* This is ldmia sp, {LINE_SAVED_REGS} */
| .word 0xE89D0000 | LINE_SAVED_REGS |
| .endif |
| subs Y, Y, #1 |
| .if vars_spilled |
| .if (LINE_SAVED_REGS) & (1<<1) |
| str Y, [sp] |
| .endif |
| .endif |
| add DST, DST, STRIDE_D |
| .if src_bpp > 0 |
| add SRC, SRC, STRIDE_S |
| .endif |
| .if mask_bpp > 0 |
| add MASK, MASK, STRIDE_M |
| .endif |
| .if restore_x |
| mov X, ORIG_W |
| .endif |
| bhs loop_label |
| .ifc "last_one","" |
| .if vars_spilled |
| b 197f |
| .else |
| b 198f |
| .endif |
| .else |
| .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS) |
| b 198f |
| .endif |
| .endif |
| .endif |
| .endm |
| |
| |
| .macro generate_composite_function_common fname, \ |
| src_bpp_, \ |
| mask_bpp_, \ |
| dst_w_bpp_, \ |
| flags_, \ |
| prefetch_distance_, \ |
| init, \ |
| newline, \ |
| cleanup, \ |
| process_head, \ |
| process_tail, \ |
| process_inner_loop |
| |
| pixman_asm_function fname |
| |
| /* |
| * Make some macro arguments globally visible and accessible |
| * from other macros |
| */ |
| .set src_bpp, src_bpp_ |
| .set mask_bpp, mask_bpp_ |
| .set dst_w_bpp, dst_w_bpp_ |
| .set flags, flags_ |
| .set prefetch_distance, prefetch_distance_ |
| |
| /* |
| * Select prefetch type for this function. |
| */ |
| .if prefetch_distance == 0 |
| .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE |
| .else |
| .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD |
| .endif |
| |
| .if src_bpp == 32 |
| .set src_bpp_shift, 2 |
| .elseif src_bpp == 24 |
| .set src_bpp_shift, 0 |
| .elseif src_bpp == 16 |
| .set src_bpp_shift, 1 |
| .elseif src_bpp == 8 |
| .set src_bpp_shift, 0 |
| .elseif src_bpp == 0 |
| .set src_bpp_shift, -1 |
| .else |
| .error "requested src bpp (src_bpp) is not supported" |
| .endif |
| |
| .if mask_bpp == 32 |
| .set mask_bpp_shift, 2 |
| .elseif mask_bpp == 24 |
| .set mask_bpp_shift, 0 |
| .elseif mask_bpp == 8 |
| .set mask_bpp_shift, 0 |
| .elseif mask_bpp == 0 |
| .set mask_bpp_shift, -1 |
| .else |
| .error "requested mask bpp (mask_bpp) is not supported" |
| .endif |
| |
| .if dst_w_bpp == 32 |
| .set dst_bpp_shift, 2 |
| .elseif dst_w_bpp == 24 |
| .set dst_bpp_shift, 0 |
| .elseif dst_w_bpp == 16 |
| .set dst_bpp_shift, 1 |
| .elseif dst_w_bpp == 8 |
| .set dst_bpp_shift, 0 |
| .else |
| .error "requested dst bpp (dst_w_bpp) is not supported" |
| .endif |
| |
| .if (((flags) & FLAG_DST_READWRITE) != 0) |
| .set dst_r_bpp, dst_w_bpp |
| .else |
| .set dst_r_bpp, 0 |
| .endif |
| |
| .set pix_per_block, 16*8/dst_w_bpp |
| .if src_bpp != 0 |
| .if 32*8/src_bpp > pix_per_block |
| .set pix_per_block, 32*8/src_bpp |
| .endif |
| .endif |
| .if mask_bpp != 0 |
| .if 32*8/mask_bpp > pix_per_block |
| .set pix_per_block, 32*8/mask_bpp |
| .endif |
| .endif |
| .if dst_r_bpp != 0 |
| .if 32*8/dst_r_bpp > pix_per_block |
| .set pix_per_block, 32*8/dst_r_bpp |
| .endif |
| .endif |
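
/* pix_per_block is therefore the number of pixels in whichever is larger:
 * one 16-byte output block, or one 32-byte cacheline of the widest
 * channel. For example, with a 32bpp source and a 32bpp write-only
 * destination, it is max(128/32, 256/32) = 8 pixels. */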
| |
| /* The standard entry conditions set up by pixman-arm-common.h are: |
| * r0 = width (pixels) |
| * r1 = height (rows) |
| * r2 = pointer to top-left pixel of destination |
| * r3 = destination stride (pixels) |
| * [sp] = source pixel value, or pointer to top-left pixel of source |
| * [sp,#4] = 0 or source stride (pixels) |
| * The following arguments are unused for non-mask operations |
| * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask |
| * [sp,#12] = 0 or mask stride (pixels) |
| * |
| * or in the single-scanline case: |
| * r0 = width (pixels) |
| * r1 = pointer to top-left pixel of destination |
| * r2 = pointer to top-left pixel of source |
| * The following argument is unused for non-mask operations |
| * r3 = pointer to top-left pixel of mask |
| */ |
| |
| /* |
| * Assign symbolic names to registers |
| */ |
| X .req r0 /* pixels to go on this line */ |
| .if SINGLE_SCANLINE |
| DST .req r1 /* destination pixel pointer */ |
| SRC .req r2 /* source pixel pointer */ |
| MASK .req r3 /* mask pixel pointer (if applicable) */ |
| Y .req r4 /* temporary */ |
| STRIDE_D .req r5 /* temporary */ |
| STRIDE_S .req r6 /* temporary */ |
| STRIDE_M .req r7 /* temporary */ |
| .else |
| Y .req r1 /* lines to go */ |
| DST .req r2 /* destination pixel pointer */ |
| STRIDE_D .req r3 /* destination stride (bytes, minus width) */ |
| SRC .req r4 /* source pixel pointer */ |
| STRIDE_S .req r5 /* source stride (bytes, minus width) */ |
| MASK .req r6 /* mask pixel pointer (if applicable) */ |
| STRIDE_M .req r7 /* mask stride (bytes, minus width) */ |
| .endif |
| WK0 .req r8 /* pixel data registers */ |
| WK1 .req r9 |
| WK2 .req r10 |
| WK3 .req r11 |
| SCRATCH .req r12 |
| ORIG_W .req r14 /* width (pixels) */ |
| |
| push {r4-r11, lr} /* save all registers */ |
| |
| .if !SINGLE_SCANLINE |
| subs Y, Y, #1 |
| blo 199f |
| .endif |
| |
| #ifdef DEBUG_PARAMS |
| sub sp, sp, #9*4 |
| #endif |
| |
| .if !SINGLE_SCANLINE |
| .if src_bpp > 0 |
| ldr SRC, [sp, #ARGS_STACK_OFFSET] |
| ldr STRIDE_S, [sp, #ARGS_STACK_OFFSET+4] |
| .endif |
| .if mask_bpp > 0 |
| ldr MASK, [sp, #ARGS_STACK_OFFSET+8] |
| ldr STRIDE_M, [sp, #ARGS_STACK_OFFSET+12] |
| .endif |
| .endif |
| |
| #ifdef DEBUG_PARAMS |
| add Y, Y, #1 |
| stmia sp, {r0-r7,pc} |
| sub Y, Y, #1 |
| #endif |
| |
| init |
| |
| .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 |
| /* Reserve a word in which to store X during leading pixels */ |
| sub sp, sp, #4 |
| .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4 |
| .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4 |
| .endif |
| |
| .if !SINGLE_SCANLINE |
| lsl STRIDE_D, #dst_bpp_shift /* stride in bytes */ |
| sub STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift |
| .if src_bpp > 0 |
| lsl STRIDE_S, #src_bpp_shift |
| sub STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift |
| .endif |
| .if mask_bpp > 0 |
| lsl STRIDE_M, #mask_bpp_shift |
| sub STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift |
| .endif |
| .endif |
| |
| /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */ |
| cmp X, #2*16*8/dst_w_bpp - 1 |
| blo 170f |
| .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */ |
| /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */ |
| cmp X, #(prefetch_distance+3)*pix_per_block - 1 |
| blo 160f |
| |
| /* Wide case */ |
| /* Adjust X so that the decrement instruction can also test for |
| * inner loop termination. We want it to stop when there are |
| * (prefetch_distance+1) complete blocks to go. */ |
| sub X, X, #(prefetch_distance+2)*pix_per_block |
| .if !SINGLE_SCANLINE |
| mov ORIG_W, X |
| .if (flags) & FLAG_SPILL_LINE_VARS_WIDE |
/* This is stmdb sp!, {LINE_SAVED_REGS} */
| .word 0xE92D0000 | LINE_SAVED_REGS |
| .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 |
| .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 |
| .endif |
| .endif |
| 151: /* New line */ |
| newline |
| preload_leading_step1 src_bpp, WK1, SRC |
| preload_leading_step1 mask_bpp, WK2, MASK |
| .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 |
| preload_leading_step1 dst_r_bpp, WK3, DST |
| .endif |
| |
| ands WK0, DST, #15 |
| beq 154f |
| rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */ |
| |
| preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC |
| preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK |
| .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 |
| preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST |
| .endif |
| |
| leading_15bytes process_head, process_tail |
| |
| 154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */ |
| .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) |
| and SCRATCH, SRC, #31 |
| rsb SCRATCH, SCRATCH, #32*prefetch_distance |
| .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) |
| and SCRATCH, MASK, #31 |
| rsb SCRATCH, SCRATCH, #32*prefetch_distance |
| .endif |
| .ifc "process_inner_loop","" |
| switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f |
| .else |
| switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f |
| .endif |
| |
| 157: /* Check for another line */ |
| end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b |
| .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS_WIDE) |
| .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 |
| .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 |
| .endif |
| .endif |
| |
| .ltorg |
| |
| 160: /* Medium case */ |
| .if !SINGLE_SCANLINE |
| mov ORIG_W, X |
| .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE |
/* This is stmdb sp!, {LINE_SAVED_REGS} */
| .word 0xE92D0000 | LINE_SAVED_REGS |
| .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 |
| .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 |
| .endif |
| .endif |
| 161: /* New line */ |
| newline |
| preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ |
| preload_line 0, mask_bpp, mask_bpp_shift, MASK |
| .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 |
| preload_line 0, dst_r_bpp, dst_bpp_shift, DST |
| .endif |
| |
| sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */ |
| ands WK0, DST, #15 |
| beq 164f |
| rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */ |
| |
| leading_15bytes process_head, process_tail |
| |
| 164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */ |
| switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f |
| |
| 167: /* Check for another line */ |
| end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b |
| |
| .ltorg |
| |
| 170: /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */ |
| .if !SINGLE_SCANLINE |
| .if dst_w_bpp < 32 |
| mov ORIG_W, X |
| .endif |
| .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE |
/* This is stmdb sp!, {LINE_SAVED_REGS} */
| .word 0xE92D0000 | LINE_SAVED_REGS |
| .endif |
| .endif |
| 171: /* New line */ |
| newline |
| preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ |
| preload_line 1, mask_bpp, mask_bpp_shift, MASK |
| .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 |
| preload_line 1, dst_r_bpp, dst_bpp_shift, DST |
| .endif |
| |
| .if dst_w_bpp == 8 |
| tst DST, #3 |
| beq 174f |
| 172: subs X, X, #1 |
| blo 177f |
| process_head , 1, 0, 1, 1, 0 |
| process_tail , 1, 0 |
| .if !((flags) & FLAG_PROCESS_DOES_STORE) |
| pixst , 1, 0, DST |
| .endif |
| tst DST, #3 |
| bne 172b |
| .elseif dst_w_bpp == 16 |
| tst DST, #2 |
| beq 174f |
| subs X, X, #1 |
| blo 177f |
| process_head , 2, 0, 1, 1, 0 |
| process_tail , 2, 0 |
| .if !((flags) & FLAG_PROCESS_DOES_STORE) |
| pixst , 2, 0, DST |
| .endif |
| .endif |
| |
| 174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */ |
| switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f |
| |
| 177: /* Check for another line */ |
| end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one |
| .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE) |
| .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 |
| .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 |
| .endif |
| |
| 197: |
| .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS) |
| add sp, sp, #LINE_SAVED_REG_COUNT*4 |
| .endif |
| 198: |
| .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 |
| .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4 |
| .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4 |
| add sp, sp, #4 |
| .endif |
| |
| cleanup |
| |
| #ifdef DEBUG_PARAMS |
| add sp, sp, #9*4 /* junk the debug copy of arguments */ |
| #endif |
| 199: |
| pop {r4-r11, pc} /* exit */ |
| |
| .ltorg |
| |
| .unreq X |
| .unreq Y |
| .unreq DST |
| .unreq STRIDE_D |
| .unreq SRC |
| .unreq STRIDE_S |
| .unreq MASK |
| .unreq STRIDE_M |
| .unreq WK0 |
| .unreq WK1 |
| .unreq WK2 |
| .unreq WK3 |
| .unreq SCRATCH |
| .unreq ORIG_W |
| .endfunc |
| .endm |
| |
| .macro generate_composite_function fname, \ |
| src_bpp_, \ |
| mask_bpp_, \ |
| dst_w_bpp_, \ |
| flags_, \ |
| prefetch_distance_, \ |
| init, \ |
| newline, \ |
| cleanup, \ |
| process_head, \ |
| process_tail, \ |
| process_inner_loop |
| .set SINGLE_SCANLINE, 0 |
| generate_composite_function_common \ |
| fname, src_bpp_, mask_bpp_, dst_w_bpp_, flags_, prefetch_distance_, \ |
| init, newline, cleanup, process_head, process_tail, process_inner_loop |
| .endm |
| |
| .macro generate_composite_function_single_scanline fname, \ |
| src_bpp_, \ |
| mask_bpp_, \ |
| dst_w_bpp_, \ |
| flags_, \ |
| prefetch_distance_, \ |
| init, \ |
| newline, \ |
| cleanup, \ |
| process_head, \ |
| process_tail, \ |
| process_inner_loop |
| .set SINGLE_SCANLINE, 1 |
| generate_composite_function_common \ |
| fname, src_bpp_, mask_bpp_, dst_w_bpp_, flags_, prefetch_distance_, \ |
| init, newline, cleanup, process_head, process_tail, process_inner_loop |
| .endm |
| |
| .macro line_saved_regs x:vararg |
| .set LINE_SAVED_REGS, 0 |
| .set LINE_SAVED_REG_COUNT, 0 |
| .irp SAVED_REG,x |
| .ifc "SAVED_REG","Y" |
| .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1) |
| .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 |
| .endif |
| .ifc "SAVED_REG","STRIDE_D" |
| .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3) |
| .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 |
| .endif |
| .ifc "SAVED_REG","STRIDE_S" |
| .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5) |
| .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 |
| .endif |
| .ifc "SAVED_REG","STRIDE_M" |
| .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7) |
| .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 |
| .endif |
| .ifc "SAVED_REG","ORIG_W" |
| .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14) |
| .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 |
| .endif |
| .endr |
| .if SINGLE_SCANLINE |
| .set LINE_SAVED_REG_COUNT, 0 |
| .endif |
| .endm |
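
/* For example, "line_saved_regs STRIDE_D, ORIG_W" sets LINE_SAVED_REGS to
 * (1<<3)|(1<<14) and LINE_SAVED_REG_COUNT to 2, so the spill/reload .word
 * directives in generate_composite_function_common encode
 * stmdb sp!, {r3, lr} and ldmia sp, {r3, lr} in the multi-scanline case. */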
| |
| .macro nop_macro x:vararg |
| .endm |