| /* |
| * Copyright 2015 Google Inc. |
| * |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE file. |
| */ |
| |
| #include "SkBlitMask.h" |
| #include "SkColor_opts_neon.h" |
| |
| void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[], |
| SkColor color, int width, |
| SkPMColor opaqueDst) { |
| int colR = SkColorGetR(color); |
| int colG = SkColorGetG(color); |
| int colB = SkColorGetB(color); |
| |
| uint8x8_t vcolR = vdup_n_u8(colR); |
| uint8x8_t vcolG = vdup_n_u8(colG); |
| uint8x8_t vcolB = vdup_n_u8(colB); |
| uint8x8_t vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst)); |
| uint8x8_t vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst)); |
| uint8x8_t vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst)); |
| uint8x8_t vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst)); |
| |
| while (width >= 8) { |
| uint8x8x4_t vdst; |
| uint16x8_t vmask; |
| uint16x8_t vmaskR, vmaskG, vmaskB; |
| uint8x8_t vsel_trans, vsel_opq; |
| |
| vdst = vld4_u8((uint8_t*)dst); |
| vmask = vld1q_u16(src); |
| |
| // Prepare compare masks |
| vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0))); |
| vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF))); |
| |
| // Get all the color masks on 5 bits |
| vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT); |
| vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS), |
| SK_B16_BITS + SK_R16_BITS + 1); |
| vmaskB = vmask & vdupq_n_u16(SK_B16_MASK); |
| |
| // Upscale to 0..32 |
| vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4); |
| vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4); |
| vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4); |
| |
| vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF)); |
| vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]); |
| |
| vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR); |
| vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG); |
| vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB); |
| |
| vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]); |
| vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]); |
| vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]); |
| |
| vst4_u8((uint8_t*)dst, vdst); |
| |
| dst += 8; |
| src += 8; |
| width -= 8; |
| } |
| |
| // Leftovers |
| for (int i = 0; i < width; i++) { |
| dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i], |
| opaqueDst); |
| } |
| } |
| |
| void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[], |
| SkColor color, int width, SkPMColor) { |
| int colA = SkColorGetA(color); |
| int colR = SkColorGetR(color); |
| int colG = SkColorGetG(color); |
| int colB = SkColorGetB(color); |
| |
| colA = SkAlpha255To256(colA); |
| |
| uint16x8_t vcolA = vdupq_n_u16(colA); |
| uint8x8_t vcolR = vdup_n_u8(colR); |
| uint8x8_t vcolG = vdup_n_u8(colG); |
| uint8x8_t vcolB = vdup_n_u8(colB); |
| |
| while (width >= 8) { |
| uint8x8x4_t vdst; |
| uint16x8_t vmask; |
| uint16x8_t vmaskR, vmaskG, vmaskB; |
| |
| vdst = vld4_u8((uint8_t*)dst); |
| vmask = vld1q_u16(src); |
| |
| // Get all the color masks on 5 bits |
| vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT); |
| vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS), |
| SK_B16_BITS + SK_R16_BITS + 1); |
| vmaskB = vmask & vdupq_n_u16(SK_B16_MASK); |
| |
| // Upscale to 0..32 |
| vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4); |
| vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4); |
| vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4); |
| |
| vmaskR = vshrq_n_u16(vmaskR * vcolA, 8); |
| vmaskG = vshrq_n_u16(vmaskG * vcolA, 8); |
| vmaskB = vshrq_n_u16(vmaskB * vcolA, 8); |
| |
| vdst.val[NEON_A] = vdup_n_u8(0xFF); |
| vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR); |
| vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG); |
| vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB); |
| |
| vst4_u8((uint8_t*)dst, vdst); |
| |
| dst += 8; |
| src += 8; |
| width -= 8; |
| } |
| |
| for (int i = 0; i < width; i++) { |
| dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]); |
| } |
| } |
| |
| #define LOAD_LANE_16(reg, n) \ |
| reg = vld1q_lane_u16(device, reg, n); \ |
| device = (uint16_t*)((char*)device + deviceRB); |
| |
| #define STORE_LANE_16(reg, n) \ |
| vst1_lane_u16(dst, reg, n); \ |
| dst = (uint16_t*)((char*)dst + deviceRB); |
| |
| void SkRGB16BlitterBlitV_neon(uint16_t* device, |
| int height, |
| size_t deviceRB, |
| unsigned scale, |
| uint32_t src32) { |
| if (height >= 8) |
| { |
| uint16_t* dst = device; |
| |
| // prepare constants |
| uint16x8_t vdev = vdupq_n_u16(0); |
| uint16x8_t vmaskq_g16 = vdupq_n_u16(SK_G16_MASK_IN_PLACE); |
| uint16x8_t vmaskq_ng16 = vdupq_n_u16(~SK_G16_MASK_IN_PLACE); |
| uint32x4_t vsrc32 = vdupq_n_u32(src32); |
| uint32x4_t vscale5 = vdupq_n_u32((uint32_t)scale); |
| |
| while (height >= 8){ |
| LOAD_LANE_16(vdev, 0) |
| LOAD_LANE_16(vdev, 1) |
| LOAD_LANE_16(vdev, 2) |
| LOAD_LANE_16(vdev, 3) |
| LOAD_LANE_16(vdev, 4) |
| LOAD_LANE_16(vdev, 5) |
| LOAD_LANE_16(vdev, 6) |
| LOAD_LANE_16(vdev, 7) |
| |
| // Expand_rgb_16 |
| uint16x8x2_t vdst = vzipq_u16((vdev & vmaskq_ng16), (vdev & vmaskq_g16)); |
| uint32x4_t vdst32_lo = vmulq_u32(vreinterpretq_u32_u16(vdst.val[0]), vscale5); |
| uint32x4_t vdst32_hi = vmulq_u32(vreinterpretq_u32_u16(vdst.val[1]), vscale5); |
| |
| // Compact_rgb_16 |
| vdst32_lo = vaddq_u32(vdst32_lo, vsrc32); |
| vdst32_hi = vaddq_u32(vdst32_hi, vsrc32); |
| vdst32_lo = vshrq_n_u32(vdst32_lo, 5); |
| vdst32_hi = vshrq_n_u32(vdst32_hi, 5); |
| |
| uint16x4_t vtmp_lo = vmovn_u32(vdst32_lo) & vget_low_u16(vmaskq_ng16); |
| uint16x4_t vtmp_hi = vshrn_n_u32(vdst32_lo, 16) & vget_low_u16(vmaskq_g16); |
| uint16x4_t vdst16_lo = vorr_u16(vtmp_lo, vtmp_hi); |
| vtmp_lo = vmovn_u32(vdst32_hi) & vget_low_u16(vmaskq_ng16); |
| vtmp_hi = vshrn_n_u32(vdst32_hi, 16) & vget_low_u16(vmaskq_g16); |
| uint16x4_t vdst16_hi = vorr_u16(vtmp_lo, vtmp_hi); |
| |
| STORE_LANE_16(vdst16_lo, 0) |
| STORE_LANE_16(vdst16_lo, 1) |
| STORE_LANE_16(vdst16_lo, 2) |
| STORE_LANE_16(vdst16_lo, 3) |
| STORE_LANE_16(vdst16_hi, 0) |
| STORE_LANE_16(vdst16_hi, 1) |
| STORE_LANE_16(vdst16_hi, 2) |
| STORE_LANE_16(vdst16_hi, 3) |
| height -= 8; |
| } |
| } |
| while (height != 0){ |
| uint32_t dst32 = SkExpand_rgb_16(*device) * scale; |
| *device = SkCompact_rgb_16((src32 + dst32) >> 5); |
| device = (uint16_t*)((char*)device + deviceRB); |
| height--; |
| } |
| } |
| |
| #undef LOAD_LANE_16 |
| #undef STORE_LANE_16 |