blob: b37a8135945da9ecb1546c1e61db2f9df3da6839 [file] [log] [blame]
/*
* Copyright 2014 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include <arm_neon.h>
// Names of the functions generated by this header. MAKENAME is not defined
// here; it is expected to be supplied by the including file so that each
// inclusion produces its own, distinctly named set of functions.
#define SCALE_NOFILTER_NAME MAKENAME(_nofilter_scale)
#define SCALE_FILTER_NAME MAKENAME(_filter_scale)
#define PACK_FILTER_X_NAME MAKENAME(_pack_filter_x)
#define PACK_FILTER_Y_NAME MAKENAME(_pack_filter_y)
#define PACK_FILTER_X4_NAME MAKENAME(_pack_filter_x4)
#define PACK_FILTER_Y4_NAME MAKENAME(_pack_filter_y4)
// Optional hooks. If the includer did not define PREAMBLE, all five macros
// expand to nothing: the procs run no preamble code and the pack helpers
// take/receive no extra parameters.
#ifndef PREAMBLE
#define PREAMBLE(state)
#define PREAMBLE_PARAM_X
#define PREAMBLE_PARAM_Y
#define PREAMBLE_ARG_X
#define PREAMBLE_ARG_Y
#endif
// Matrix proc for an inverse matrix that only scales/translates, without
// bilinear filtering.
//
// Output layout in xy: one 32-bit tiled Y coordinate, followed by `count`
// tiled X coordinates stored as consecutive 16-bit values.
//
//   s      bitmap proc state (inverse matrix, pixmap dimensions)
//   xy     destination buffer
//   count  number of X coordinates to generate
//   x, y   device position handed to SkBitmapProcStateAutoMapper
static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s,
                                uint32_t xy[], int count, int x, int y) {
    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
                             SkMatrix::kScale_Mask)) == 0);

    PREAMBLE(s);

    const unsigned maxX = s.fPixmap.width() - 1;

    // The single Y value comes first; the mapper also yields the starting X.
    SkFractionalInt fx;
    {
        const SkBitmapProcStateAutoMapper mapper(s, x, y);
        const unsigned maxY = s.fPixmap.height() - 1;
        xy[0] = TILEY_PROCF(mapper.fixedY(), maxY);
        ++xy;
        fx = mapper.fractionalIntX();
    }

    // A one-pixel-wide source can only ever produce X == 0.
    if (0 == maxX) {
        memset(xy, 0, count * sizeof(uint16_t));
        return;
    }

    const SkFractionalInt dx = s.fInvSxFractionalInt;

#ifdef CHECK_FOR_DECAL
    // If every sample is already in range, skip the tile proc entirely.
    const SkFixed fixedFx = SkFractionalIntToFixed(fx);
    const SkFixed fixedDx = SkFractionalIntToFixed(dx);
    if (can_truncate_to_fixed_for_decal(fixedFx, fixedDx, count, maxX)) {
        decal_nofilter_scale_neon(xy, fixedFx, fixedDx, count);
        return;
    }
#endif

    uint16_t* xx = (uint16_t*)xy;

    if (count >= 8) {
        const SkFractionalInt dx2 = dx + dx;
        const SkFractionalInt dx4 = dx2 + dx2;
        const SkFractionalInt dx8 = dx4 + dx4;

        // Lanes 0..3 hold fx, fx+dx, fx+2dx, fx+3dx; the high vector is the
        // same four values advanced by 4*dx.
        const SkFractionalInt fx1 = fx + dx;
        const SkFractionalInt fx2 = fx1 + dx;
        const SkFractionalInt fx3 = fx2 + dx;

        int32x4_t lowLanes = vdupq_n_s32(SkFractionalIntToFixed(fx));
        lowLanes = vsetq_lane_s32(SkFractionalIntToFixed(fx1), lowLanes, 1);
        lowLanes = vsetq_lane_s32(SkFractionalIntToFixed(fx2), lowLanes, 2);
        lowLanes = vsetq_lane_s32(SkFractionalIntToFixed(fx3), lowLanes, 3);
        int32x4_t highLanes =
                vaddq_s32(lowLanes, vdupq_n_s32(SkFractionalIntToFixed(dx4)));

        // Every iteration advances all eight lanes by 8*dx.
        const int32x4_t step8 = vdupq_n_s32(SkFractionalIntToFixed(dx8));

        int16_t* out16 = (int16_t*)xx;
        do {
            const int16x8_t tiled = TILEX_PROCF_NEON8(lowLanes, highLanes, maxX);
            vst1q_s16(out16, tiled);

            lowLanes = vaddq_s32(lowLanes, step8);
            highLanes = vaddq_s32(highLanes, step8);
            out16 += 8;
            count -= 8;
            // Keep the scalar position in step for the tail loop below.
            fx += dx8;
        } while (count >= 8);

        xx = (uint16_t*)out16;
    }

    // Scalar tail: at most 7 remaining samples (or all of them if count < 8).
    for (int i = 0; i < count; ++i) {
        xx[i] = TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
        fx += dx;
    }
}
// Packs the two tiled Y coordinates needed for filtering plus the low bits of
// f into one 32-bit word: (((tile(f) << 4) | lowbits(f)) << 14) | tile(f + one).
static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max,
                                          SkFixed one PREAMBLE_PARAM_Y) {
    const unsigned y0 = TILEY_PROCF(f, max);
    const unsigned lowBits = EXTRACT_LOW_BITS(f, max);
    const unsigned y1 = TILEY_PROCF((f + one), max);

    const unsigned upper = (y0 << 4) | lowBits;
    return (upper << 14) | y1;
}
// Packs the two tiled X coordinates needed for filtering plus the low bits of
// f into one 32-bit word: (((tile(f) << 4) | lowbits(f)) << 14) | tile(f + one).
static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max,
                                          SkFixed one PREAMBLE_PARAM_X) {
    const unsigned x0 = TILEX_PROCF(f, max);
    const unsigned lowBits = EXTRACT_LOW_BITS(f, max);
    const unsigned x1 = TILEX_PROCF((f + one), max);

    const unsigned upper = (x0 << 4) | lowBits;
    return (upper << 14) | x1;
}
// Four-lane counterpart of PACK_FILTER_X_NAME: for each lane of f, produces
// (((tile(f) << 4) | lowbits(f)) << 14) | tile(f + one).
//
//   f    four fixed-point X positions
//   max  largest valid X index, forwarded to the tile macros
//   one  fixed-point offset to the second sample
static inline int32x4_t PACK_FILTER_X4_NAME(int32x4_t f, unsigned max,
                                            SkFixed one PREAMBLE_PARAM_X) {
    // Use the intrinsic rather than the compiler-specific vector `+`
    // operator, matching the rest of this file.
    const int32x4_t fNext = vaddq_s32(f, vdupq_n_s32(one));

    // Step 1: tiled coordinate of the first sample.
    const int32x4_t x0 = TILEX_PROCF_NEON4(f, max);

    // Step 2: start from the low bits, then insert x0 shifted left by 4.
    // vsli keeps the low 4 bits of its first operand.
    int32x4_t packed = EXTRACT_LOW_BITS_NEON4(f, max);
    packed = vsliq_n_s32(packed, x0, 4);

    // Step 3: make room for, and merge in, the second sample's coordinate.
    const int32x4_t x1 = TILEX_PROCF_NEON4(fNext, max);
    return vorrq_s32(vshlq_n_s32(packed, 14), x1);
}
// Four-lane counterpart of PACK_FILTER_Y_NAME: for each lane of f, produces
// (((tile(f) << 4) | lowbits(f)) << 14) | tile(f + one).
//
//   f    four fixed-point Y positions
//   max  largest valid Y index, forwarded to the tile macros
//   one  fixed-point offset to the second sample
//
// Takes PREAMBLE_PARAM_Y (not _X) because the body uses the Y tile macro,
// mirroring the scalar PACK_FILTER_Y_NAME.
static inline int32x4_t PACK_FILTER_Y4_NAME(int32x4_t f, unsigned max,
                                            SkFixed one PREAMBLE_PARAM_Y) {
    // Use the intrinsic rather than the compiler-specific vector `+`
    // operator, matching the rest of this file.
    const int32x4_t fNext = vaddq_s32(f, vdupq_n_s32(one));

    // Step 1: tiled coordinate of the first sample.
    const int32x4_t y0 = TILEY_PROCF_NEON4(f, max);

    // Step 2: start from the low bits, then insert y0 shifted left by 4.
    // vsli keeps the low 4 bits of its first operand.
    int32x4_t packed = EXTRACT_LOW_BITS_NEON4(f, max);
    packed = vsliq_n_s32(packed, y0, 4);

    // Step 3: make room for, and merge in, the second sample's coordinate.
    const int32x4_t y1 = TILEY_PROCF_NEON4(fNext, max);
    return vorrq_s32(vshlq_n_s32(packed, 14), y1);
}
// Matrix proc for an inverse matrix that only scales/translates, with
// bilinear filtering.
//
// Output layout in xy: one packed 32-bit Y word (see PACK_FILTER_Y_NAME),
// followed by `count` packed 32-bit X words (see PACK_FILTER_X_NAME).
//
//   s      bitmap proc state (inverse matrix, pixmap dimensions, filter steps)
//   xy     destination buffer
//   count  number of X words to generate
//   x, y   device position handed to SkBitmapProcStateAutoMapper
static void SCALE_FILTER_NAME(const SkBitmapProcState& s,
                              uint32_t xy[], int count, int x, int y) {
    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
                             SkMatrix::kScale_Mask)) == 0);
    SkASSERT(s.fInvKy == 0);

    PREAMBLE(s);

    const unsigned maxX = s.fPixmap.width() - 1;
    const SkFixed one = s.fFilterOneX;
    const SkFractionalInt dx = s.fInvSxFractionalInt;
    SkFractionalInt fx;

    {
        const SkBitmapProcStateAutoMapper mapper(s, x, y);
        const SkFixed fy = mapper.fixedY();
        const unsigned maxY = s.fPixmap.height() - 1;
        // Both Y coordinates are constant across the row: emit them once.
        *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y);
        // Starting X position for the row.
        fx = mapper.fractionalIntX();
    }

#ifdef CHECK_FOR_DECAL
    // If every sample is already in range, skip the tile proc entirely.
    const SkFixed fixedFx = SkFractionalIntToFixed(fx);
    const SkFixed fixedDx = SkFractionalIntToFixed(dx);
    if (can_truncate_to_fixed_for_decal(fixedFx, fixedDx, count, maxX)) {
        decal_filter_scale_neon(xy, fixedFx, fixedDx, count);
        return;
    }
#endif

    if (count >= 4) {
        // Lanes 0..3 hold fx, fx+dx, fx+2dx, fx+3dx.
        int32x4_t wide_fx = vdupq_n_s32(SkFractionalIntToFixed(fx));
        wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), wide_fx, 1);
        wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), wide_fx, 2);
        wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), wide_fx, 3);

        // Per-iteration advance of 4*dx, computed once outside the loop.
        const SkFractionalInt dx4 = dx+dx+dx+dx;
        const int32x4_t wide_dx4 = vdupq_n_s32(SkFractionalIntToFixed(dx4));

        while (count >= 4) {
            const int32x4_t res =
                    PACK_FILTER_X4_NAME(wide_fx, maxX, one PREAMBLE_ARG_X);
            vst1q_u32(xy, vreinterpretq_u32_s32(res));

            // Intrinsic add instead of the compiler-specific vector `+=`.
            wide_fx = vaddq_s32(wide_fx, wide_dx4);
            // Keep the scalar position in step for the tail loop below.
            fx += dx4;
            xy += 4;
            count -= 4;
        }
    }

    // Scalar tail: at most 3 remaining samples (or all of them if count < 4).
    while (--count >= 0) {
        *xy++ = PACK_FILTER_X_NAME(SkFractionalIntToFixed(fx), maxX, one PREAMBLE_ARG_X);
        fx += dx;
    }
}
// Table of the two matrix procs defined above, under the MAKENAME-decorated
// "_Procs" name: index 0 is the unfiltered scale proc, index 1 the filtered
// scale proc.
const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = {
SCALE_NOFILTER_NAME,
SCALE_FILTER_NAME,
};
// Clean up every configuration macro consumed by this header and every name
// macro it defined, presumably so the header can be included again with a
// different configuration.

// Configuration macros (not defined in this header; expected from the includer).
#undef TILEX_PROCF_NEON8
#undef TILEY_PROCF_NEON8
#undef TILEX_PROCF_NEON4
#undef TILEY_PROCF_NEON4
#undef EXTRACT_LOW_BITS_NEON4
#undef MAKENAME
#undef TILEX_PROCF
#undef TILEY_PROCF
#ifdef CHECK_FOR_DECAL
#undef CHECK_FOR_DECAL
#endif

// Name macros defined at the top of this header.
#undef SCALE_NOFILTER_NAME
#undef SCALE_FILTER_NAME
#undef PACK_FILTER_X_NAME
#undef PACK_FILTER_Y_NAME
#undef PACK_FILTER_X4_NAME
#undef PACK_FILTER_Y4_NAME

// Preamble hooks (either supplied by the includer or defaulted above).
#undef PREAMBLE
#undef PREAMBLE_PARAM_X
#undef PREAMBLE_PARAM_Y
#undef PREAMBLE_ARG_X
#undef PREAMBLE_ARG_Y
#undef EXTRACT_LOW_BITS