blob: 7b2b19029a52edb4cd1e938bfe289b8062765c43 [file] [log] [blame]
/***************************************************************************
* Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
***************************************************************************/
/***************************************************************************
 Neon memset16: Fills a buffer with a 16-bit value, using Neon registers
 for large fills when possible.
 Inputs:
 s: The buffer to write to (r0)
 c: The 16-bit value to replicate into the buffer (r1)
 n: The size_t count of 16-bit elements (not bytes) to write (r2)
 Outputs:
 Returns s, the original buffer pointer, in r0.
***************************************************************************/
.syntax unified
.code 32                /* ARM (not Thumb) encoding */
.fpu neon
.align 4
.globl memset16_neon
/*
 * void *memset16_neon(uint16_t *s, uint16_t c, size_t n)
 * In:      r0 = s (destination), r1 = c (16-bit fill value),
 *          r2 = n (count of 16-bit elements, NOT bytes)
 * Out:     r0 = s (original pointer, preserved via push/pop)
 * Clobbers: r1, r2, r12, q0, q1, flags.
 * NOTE(review): the alignment fixup below comments that r0 is already
 * word-aligned; that is a caller-side assumption not provable here.
 * The strh tail in the fixup does handle a halfword-only-aligned r0.
 */
memset16_neon:
cmp r2, #0              @ n == 0: nothing to write
bxeq lr
/* Keep in mind that r2 -- the count argument -- is for the
 * number of 16-bit items to copy.
 */
lsl r2, r2, #1          @ convert element count to BYTE count
push {r0}               @ save s so we can return it in r0
/* If we have < 8 bytes, just do a quick loop to handle that */
cmp r2, #8
bgt memset_gt4
memset_smallcopy_loop:
strh r1, [r0], #2       @ one halfword per iteration; r2 is always even
subs r2, r2, #2
bne memset_smallcopy_loop
memset_smallcopy_done:
pop {r0}                @ restore original buffer pointer for return
bx lr
memset_gt4:
/*
 * Duplicate the r1 lowest 16-bits across r1. The idea is to have
 * a register with two 16-bit-values we can copy. We do this by
 * duplicating lowest 16-bits of r1 to upper 16-bits.
 */
orr r1, r1, r1, lsl #16
/*
 * If we're copying > 64 bytes, then we may want to get
 * onto a 16-byte boundary to improve speed even more.
 */
cmp r2, #64
blt memset_route        @ small fill: alignment overhead not worth it
ands r12, r0, #0xf
beq memset_route        @ already 16-byte aligned
/*
 * Determine the number of bytes to move forward to get to the 16-byte
 * boundary. Note that this will be a multiple of 4, since we
 * already are word-aligned.
 */
rsb r12, r12, #16       @ r12 = bytes needed to reach 16-byte boundary (1..15)
sub r2, r2, r12         @ those bytes come out of the remaining count
/*
 * Flag trick: LSLS #29 leaves bit 2 of r12 in N and shifts bit 3 of
 * r12 out into C, so the conditional stores below dispatch on the
 * binary decomposition of the fixup length without extra branches.
 */
lsls r12, r12, #29
strmi r1, [r0], #4      @ bit 2 set: store 4 bytes
strcs r1, [r0], #4      @ bit 3 set: store 8 bytes (two word stores)
strcs r1, [r0], #4
lsls r12, r12, #2       @ total shift now 31: C = bit 1 of original r12
strhcs r1, [r0], #2     @ bit 1 set: store the final fixup halfword
memset_route:
/*
 * Decide where to route for the maximum copy sizes. Note that we
 * build q0 and q1 depending on if we'll need it, so that's
 * interwoven here as well.
 */
vdup.u32 d0, r1         @ replicate the 32-bit pattern across d0 (8 bytes)
cmp r2, #16
blt memset_8
vmov d1, d0             @ q0 now holds 16 bytes of pattern
cmp r2, #64
blt memset_16
vmov q1, q0             @ q0+q1 hold 32 bytes of pattern
cmp r2, #128
blt memset_32
memset_128:
mov r12, r2, lsr #7     @ r12 = number of 128-byte chunks
memset_128_loop:
vst1.64 {q0, q1}, [r0]! @ 4 x 32 bytes = 128 bytes per iteration
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
vst1.64 {q0, q1}, [r0]!
subs r12, r12, #1
bne memset_128_loop
ands r2, r2, #0x7f      @ r2 = remaining bytes (< 128)
beq memset_end
memset_32:
movs r12, r2, lsr #5    @ r12 = number of 32-byte chunks
beq memset_16
memset_32_loop:
subs r12, r12, #1
vst1.64 {q0, q1}, [r0]! @ 32 bytes per iteration
bne memset_32_loop
ands r2, r2, #0x1f      @ r2 = remaining bytes (< 32)
beq memset_end
memset_16:
movs r12, r2, lsr #4    @ r12 = number of 16-byte chunks
beq memset_8
memset_16_loop:
subs r12, r12, #1
vst1.32 {q0}, [r0]!     @ 16 bytes per iteration
bne memset_16_loop
ands r2, r2, #0xf       @ r2 = remaining bytes (< 16)
beq memset_end
/*
 * memset_8 isn't a loop, since we try to do our loops at 16
 * bytes and above. We should loop there, then drop down here
 * to finish the <16-byte versions. Same for memset_4 and
 * memset_1.
 */
memset_8:
cmp r2, #8
blt memset_4
subs r2, r2, #8
vst1.32 {d0}, [r0]!     @ single 8-byte store
memset_4:
cmp r2, #4
blt memset_2
subs r2, r2, #4
str r1, [r0], #4        @ single 4-byte store
memset_2:
cmp r2, #0              @ r2 can only be 0 or 2 here (always even)
ble memset_end
strh r1, [r0], #2       @ final halfword
memset_end:
pop {r0}                @ return the original buffer pointer
bx lr
.end