/*
* Copyright 2019 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/

#ifndef SkVM_opts_DEFINED
#define SkVM_opts_DEFINED

#include "src/core/SkVM.h"
#include "include/private/SkVx.h"

namespace SK_OPTS_NS {

    inline void eval(const skvm::Program::Instruction insts[], const int ninsts,
                     const int nregs, const int loop,
                     int n, void* args[], size_t strides[], const int nargs) {
        using namespace skvm;

        // We'll operate in SIMT style, knocking off K-size chunks from n while possible.
    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
        constexpr int K = 32;
    #else
        constexpr int K = 16;
    #endif
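        // K is how many items each pass of the instruction loop handles: at 4 bytes
        // per lane that's 128 bytes of data per register with AVX2, 64 bytes without.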
        using I32 = skvx::Vec<K, int>;
        using F32 = skvx::Vec<K, float>;
        using U32 = skvx::Vec<K, uint32_t>;
        using U8  = skvx::Vec<K, uint8_t>;

        using I16x2 = skvx::Vec<2*K,  int16_t>;
        using U16x2 = skvx::Vec<2*K, uint16_t>;
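
        // Each program register holds K lanes; the union lets an op view those same
        // bits as signed ints, unsigned ints, or floats without any conversion.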
        union Slot {
            I32 i32;
            U32 u32;
            F32 f32;
        };

        Slot few_regs[16];
        std::unique_ptr<char[]> many_regs;

        Slot* regs = few_regs;

        if (nregs > (int)SK_ARRAY_COUNT(few_regs)) {
            // Annoyingly we can't trust that malloc() or new will work with Slot because
            // the skvx::Vec types may have alignment stricter than malloc() and new guarantee.
            // We'll overallocate one extra register so we can align manually.
            many_regs.reset(new char[ sizeof(Slot) * (nregs + 1) ]);
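
            // Round up to the next alignof(Slot) boundary.  If addr happens to be
            // aligned already this bumps it a full alignof(Slot) forward; the extra
            // register allocated above leaves room for that.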
            uintptr_t addr = (uintptr_t)many_regs.get();
            addr += alignof(Slot) -
                    (addr & (alignof(Slot) - 1));
            SkASSERT((addr & (alignof(Slot) - 1)) == 0);
            regs = (Slot*)addr;
        }

        auto r = [&](Reg id) -> Slot& {
            SkASSERT(0 <= id && id < nregs);
            return regs[id];
        };
        auto arg = [&](int ix) {
            SkASSERT(0 <= ix && ix < nargs);
            return args[ix];
        };

        // Step each argument pointer ahead by its stride a number of times.
        auto step_args = [&](int times) {
            // Looping by marching pointers until *arg == nullptr helps the
            // compiler to keep this loop scalar.  Otherwise it'd create a
            // rather large and useless autovectorized version.
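            // (This relies on args[] carrying a nullptr sentinel at args[nargs],
            // which the SkASSERT below double-checks.)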
            void**        arg    = args;
            const size_t* stride = strides;
            for (; *arg; arg++, stride++) {
                *arg = (void*)( (char*)*arg + times * *stride );
            }
            SkASSERT(arg == args + nargs);
        };

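        // Process n items, K at a time while at least K remain, then one at a time
        // for the tail.  The first pass runs every instruction; later passes resume
        // at instruction `loop`, skipping the once-only setup instructions before it.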
        int start = 0,
            stride;
        for ( ; n > 0; start = loop, n -= stride, step_args(stride)) {
            stride = n >= K ? K : 1;

            for (int i = start; i < ninsts; i++) {
                skvm::Program::Instruction inst = insts[i];

                // d = op(x,y,z/imm)
                Reg d = inst.d,
                    x = inst.x,
                    y = inst.y,
                    z = inst.z;
                int imm = inst.imm;

                // Ops that interact with memory need to know whether we're stride=1 or stride=K,
                // but all non-memory ops can run the same code no matter the stride.
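                // The switch key folds the stride into the op: even keys (2*op) take the
                // stride=1 path and odd keys (2*op+1) take the stride=K path, so e.g.
                // Op::store32 with stride == K lands on STRIDE_K(Op::store32) below.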
                switch (2*(int)inst.op + (stride == K ? 1 : 0)) {
                    #define STRIDE_1(op) case 2*(int)op
                    #define STRIDE_K(op) case 2*(int)op + 1
                    STRIDE_1(Op::store8 ): memcpy(arg(imm), &r(x).i32, 1); break;
                    STRIDE_1(Op::store32): memcpy(arg(imm), &r(x).i32, 4); break;

                    STRIDE_K(Op::store8 ): skvx::cast<uint8_t>(r(x).i32).store(arg(imm)); break;
                    STRIDE_K(Op::store32):                    (r(x).i32).store(arg(imm)); break;

                    STRIDE_1(Op::load8 ): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 1); break;
                    STRIDE_1(Op::load32): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 4); break;

                    STRIDE_K(Op::load8 ): r(d).i32 = skvx::cast<int>(U8 ::Load(arg(imm))); break;
                    STRIDE_K(Op::load32): r(d).i32 =                 I32::Load(arg(imm)) ; break;
                    #undef STRIDE_1
                    #undef STRIDE_K

                    // Ops that don't interact with memory should never care about the stride.
                    #define CASE(op) case 2*(int)op: /*fallthrough*/ case 2*(int)op+1
                    CASE(Op::splat): r(d).i32 = imm; break;

                    CASE(Op::add_f32): r(d).f32 = r(x).f32 + r(y).f32; break;
                    CASE(Op::sub_f32): r(d).f32 = r(x).f32 - r(y).f32; break;
                    CASE(Op::mul_f32): r(d).f32 = r(x).f32 * r(y).f32; break;
                    CASE(Op::div_f32): r(d).f32 = r(x).f32 / r(y).f32; break;
                    CASE(Op::mad_f32): r(d).f32 = r(x).f32 * r(y).f32 + r(z).f32; break;

                    CASE(Op::add_i32): r(d).i32 = r(x).i32 + r(y).i32; break;
                    CASE(Op::sub_i32): r(d).i32 = r(x).i32 - r(y).i32; break;
                    CASE(Op::mul_i32): r(d).i32 = r(x).i32 * r(y).i32; break;

                    CASE(Op::sub_i16x2):
                        r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<I16x2>(r(x).i32) -
                                                      skvx::bit_pun<I16x2>(r(y).i32) ); break;
                    CASE(Op::mul_i16x2):
                        r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<I16x2>(r(x).i32) *
                                                      skvx::bit_pun<I16x2>(r(y).i32) ); break;
                    CASE(Op::shr_i16x2):
                        r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<U16x2>(r(x).i32) >> imm);
                        break;

                    CASE(Op::bit_and  ): r(d).i32 = r(x).i32 &  r(y).i32; break;
                    CASE(Op::bit_or   ): r(d).i32 = r(x).i32 |  r(y).i32; break;
                    CASE(Op::bit_xor  ): r(d).i32 = r(x).i32 ^  r(y).i32; break;
                    CASE(Op::bit_clear): r(d).i32 = r(x).i32 & ~r(y).i32; break;

                    CASE(Op::shl): r(d).i32 = r(x).i32 << imm; break;
                    CASE(Op::sra): r(d).i32 = r(x).i32 >> imm; break;
                    CASE(Op::shr): r(d).u32 = r(x).u32 >> imm; break;

                    CASE(Op::extract): r(d).u32 = (r(x).u32 >> imm) & r(y).u32; break;
                    CASE(Op::pack):    r(d).u32 = r(x).u32 | (r(y).u32 << imm); break;
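
                    // bytes is a byte shuffle: each nibble of imm selects one byte of x
                    // (1-4, low to high) or zero (0) for the corresponding output byte.
                    // E.g. imm=0x4321 reproduces x exactly, imm=0x1111 broadcasts the low
                    // byte, and imm=0x0000 produces zero.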
                    CASE(Op::bytes): {
                        const U32 table[] = {
                            0,
                            (r(x).u32      ) & 0xff,
                            (r(x).u32 >>  8) & 0xff,
                            (r(x).u32 >> 16) & 0xff,
                            (r(x).u32 >> 24) & 0xff,
                        };
                        r(d).u32 = table[(imm >>  0) & 0xf] <<  0
                                 | table[(imm >>  4) & 0xf] <<  8
                                 | table[(imm >>  8) & 0xf] << 16
                                 | table[(imm >> 12) & 0xf] << 24;
                    } break;

                    CASE(Op::to_f32): r(d).f32 = skvx::cast<float>(r(x).i32); break;
                    CASE(Op::to_i32): r(d).i32 = skvx::cast<int>  (r(x).f32); break;
                    #undef CASE
                }
            }
        }
    }

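    // For reference, the calling convention eval() expects looks roughly like the
    // sketch below (illustrative only: the pointer names are made up, and as an
    // SkOpts-style kernel this is normally reached through Skia's runtime CPU
    // dispatch rather than called directly).
    //
    //     void*  args[]    = { dst_ptr, src_ptr, nullptr };  // nullptr-terminated; see step_args()
    //     size_t strides[] = { 4, 4 };                        // bytes each pointer advances per item
    //     SK_OPTS_NS::eval(insts, ninsts, nregs, loop, n, args, strides, /*nargs=*/2);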
} // namespace SK_OPTS_NS
#endif//SkVM_opts_DEFINED