proactive JITting
Move JITting from lazy in eval() to proactive in Program::Program().
There's no need to delay to eval() now that strides are known up front.
There's _still_ one more reason we need to keep the interpreter around
even where we can JIT: can_jit() may still return false for a given
program (too many regs, too many args).
Change-Id: I0a176b97bcd9e8d0fcf2a9fa4b7f64103fd51e75
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/227419
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp
index 90797dd..f459c5e 100644
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@@ -5,6 +5,7 @@
* found in the LICENSE file.
*/
+#include "include/private/SkSpinlock.h"
#include "include/private/SkTFitsIn.h"
#include "include/private/SkThreadID.h"
#include "include/private/SkVx.h"
@@ -17,39 +18,6 @@
namespace skvm {
- Program::~Program() = default;
-
- Program::Program(Program&& other) {
- fInstructions = std::move(other.fInstructions);
- fRegs = other.fRegs;
- fLoop = other.fLoop;
- fStrides = std::move(other.fStrides);
- // Don't bother trying to move other.fJIT*. We can just regenerate it.
- }
-
- Program& Program::operator=(Program&& other) {
- fInstructions = std::move(other.fInstructions);
- fRegs = other.fRegs;
- fLoop = other.fLoop;
- fStrides = std::move(other.fStrides);
- // Don't bother trying to move other.fJIT*. We can just regenerate it,
- // but we do need to invalidate anything we have cached ourselves.
- fJITLock.acquire();
- fJIT = JIT();
- fJITLock.release();
- return *this;
- }
-
- Program::Program(std::vector<Instruction> instructions,
- int regs,
- int loop,
- std::vector<int> strides)
- : fInstructions(std::move(instructions))
- , fRegs(regs)
- , fLoop(loop)
- , fStrides(std::move(strides)) {}
-
-
Program Builder::done() const {
// Track per-instruction code hoisting, lifetime, and register assignment.
struct Analysis {
@@ -1303,338 +1271,361 @@
a.ret(A::x30);
}
#endif
-
- Program::JIT::~JIT() {
- if (buf) {
- munmap(buf,size);
- }
- }
-#else
- Program::JIT::~JIT() { SkASSERT(buf == nullptr); }
#endif // defined(SKVM_JIT)
void Program::eval(int n, void* args[]) const {
- void (*entry)() = nullptr;
- int nargs = (int)fStrides.size();
+ const int nargs = (int)fStrides.size();
- #if defined(SKVM_JIT)
- // If we can't grab this lock, another thread is probably assembling the program.
- // We can just fall through to the interpreter.
- if (fJITLock.tryAcquire()) {
- if (fJIT.entry) {
- // Use cached program.
- entry = fJIT.entry;
- } else if (can_jit(fRegs, nargs)) {
- // First assemble without any buffer to see how much memory we need to mmap.
- size_t code;
- Assembler a{nullptr};
- jit(a, &code, fInstructions, fRegs, fLoop, fStrides.data(), nargs);
-
- // mprotect() can only change at a page level granularity, so round a.size() up.
- size_t page = sysconf(_SC_PAGESIZE), // Probably 4096.
- size = ((a.size() + page - 1) / page) * page;
-
- void* buf =
- mmap(nullptr, size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1,0);
-
- a = Assembler{buf};
- jit(a, &code, fInstructions, fRegs, fLoop, fStrides.data(), nargs);
-
- mprotect(buf,size, PROT_READ|PROT_EXEC);
- #if defined(__aarch64__)
- msync(buf, size, MS_SYNC|MS_INVALIDATE);
- #endif
-
- entry = (decltype(entry))( (const uint8_t*)buf + code );
- fJIT.buf = buf;
- fJIT.size = size;
- fJIT.entry = entry;
-
-
- #if 0 || defined(SKVM_PERF_DUMPS) // Debug dumps for perf.
- #if defined(__aarch64__)
- // cat | llvm-mc -arch aarch64 -disassemble
- auto cur = (const uint8_t*)buf;
- for (int i = 0; i < (int)a.size(); i++) {
- if (i % 4 == 0) {
- SkDebugf("\n");
- if (i == (int)code) {
- SkDebugf("code:\n");
- }
- }
- SkDebugf("0x%02x ", *cur++);
- }
- SkDebugf("\n");
- #endif
-
- // We're doing some really stateful things below so one thread at a time please...
- static SkSpinlock dump_lock;
- SkAutoSpinlock lock(dump_lock);
-
- auto fnv1a = [](const void* vbuf, size_t n) {
- uint32_t hash = 2166136261;
- for (auto buf = (const uint8_t*)vbuf; n --> 0; buf++) {
- hash ^= *buf;
- hash *= 16777619;
- }
- return hash;
- };
-
-
- uint32_t hash = fnv1a(fJIT.buf, fJIT.size);
- char name[64];
- sprintf(name, "skvm-jit-%u", hash);
-
- // Create a jit-<pid>.dump file that we can `perf inject -j` into a
- // perf.data captured with `perf record -k 1`, letting us see each
- // JIT'd Program as if a function named skvm-jit-<hash>. E.g.
- //
- // ninja -C out nanobench
- // perf record -k 1 out/nanobench -m SkVM_4096_I32\$
- // perf inject -j -i perf.data -o perf.data.jit
- // perf report -i perf.data.jit
- //
- // Running `perf inject -j` will also dump an .so for each JIT'd
- // program, named jitted-<pid>-<hash>.so.
- //
- // https://lwn.net/Articles/638566/
- // https://v8.dev/docs/linux-perf
- // https://cs.chromium.org/chromium/src/v8/src/diagnostics/perf-jit.cc
- // https://lore.kernel.org/patchwork/patch/622240/
-
-
- auto timestamp_ns = []() -> uint64_t {
- // It's important to use CLOCK_MONOTONIC here so that perf can
- // correlate our timestamps with those captured by `perf record
- // -k 1`. That's also what `-k 1` does, by the way, tell perf
- // record to use CLOCK_MONOTONIC.
- struct timespec ts;
- clock_gettime(CLOCK_MONOTONIC, &ts);
- return ts.tv_sec * (uint64_t)1e9 + ts.tv_nsec;
- };
-
- // We'll open the jit-<pid>.dump file and write a small header once,
- // and just leave it open forever because we're lazy.
- static FILE* jitdump = [&]{
- // Must map as w+ for the mmap() call below to work.
- char path[64];
- sprintf(path, "jit-%d.dump", getpid());
- FILE* f = fopen(path, "w+");
-
- // Calling mmap() on the file adds a "hey they mmap()'d this" record to
- // the perf.data file that will point `perf inject -j` at this log file.
- // Kind of a strange way to tell `perf inject` where the file is...
- void* marker = mmap(nullptr,
- sysconf(_SC_PAGESIZE),
- PROT_READ|PROT_EXEC,
- MAP_PRIVATE,
- fileno(f),
- /*offset=*/0);
- SkASSERT_RELEASE(marker != MAP_FAILED);
- // Like never calling fclose(f), we'll also just always leave marker mmap()'d.
-
- #if defined(__x86_64__)
- const uint32_t elf_mach = 62;
- #elif defined(__aarch64__)
- const uint32_t elf_mach = 183;
- #else
- const uint32_t elf_mach = 0; // TODO
- #endif
-
- struct Header {
- uint32_t magic, version, header_size, elf_mach, reserved, pid;
- uint64_t timestamp_us, flags;
- } header = {
- 0x4A695444, 1, sizeof(Header), elf_mach, 0, (uint32_t)getpid(),
- timestamp_ns() / 1000, 0,
- };
- fwrite(&header, sizeof(header), 1, f);
-
- return f;
- }();
-
- struct CodeLoad {
- uint32_t event_type, event_size;
- uint64_t timestamp_ns;
-
- uint32_t pid, tid;
- uint64_t vma/*???*/, code_addr, code_size, id;
- } load = {
- 0/*code load*/, (uint32_t)(sizeof(CodeLoad) + strlen(name) + 1 + fJIT.size),
- timestamp_ns(),
-
- (uint32_t)getpid(), (uint32_t)SkGetThreadID(),
- (uint64_t)fJIT.buf, (uint64_t)fJIT.buf, fJIT.size, hash,
- };
-
- // Write the header, the JIT'd function name, and the JIT'd code itself.
- fwrite(&load, sizeof(load), 1, jitdump);
- fwrite(name, 1, strlen(name), jitdump);
- fwrite("\0", 1, 1, jitdump);
- fwrite(fJIT.buf, 1, fJIT.size, jitdump);
- #endif
- }
- fJITLock.release(); // pairs with tryAcquire() in the if().
- }
- #endif // defined(SKVM_JIT)
-
- if (entry) {
+ if (fJITEntry) {
switch (nargs) {
- case 0: ((void(*)(int ))entry)(n ); break;
- case 1: ((void(*)(int, void* ))entry)(n, args[0] ); break;
- case 2: ((void(*)(int, void*, void*))entry)(n, args[0], args[1]); break;
+ case 0: return ((void(*)(int ))fJITEntry)(n );
+ case 1: return ((void(*)(int, void* ))fJITEntry)(n, args[0] );
+ case 2: return ((void(*)(int, void*, void*))fJITEntry)(n, args[0], args[1]);
default: SkUNREACHABLE; // TODO
}
- } else {
- // We'll operate in SIMT style, knocking off K-size chunks from n while possible.
- constexpr int K = 16;
- using I32 = skvx::Vec<K, int>;
- using F32 = skvx::Vec<K, float>;
- using U32 = skvx::Vec<K, uint32_t>;
- using U8 = skvx::Vec<K, uint8_t>;
+ }
- using I16x2 = skvx::Vec<2*K, int16_t>;
- using U16x2 = skvx::Vec<2*K, uint16_t>;
+ // We'll operate in SIMT style, knocking off K-size chunks from n while possible.
+ constexpr int K = 16;
+ using I32 = skvx::Vec<K, int>;
+ using F32 = skvx::Vec<K, float>;
+ using U32 = skvx::Vec<K, uint32_t>;
+ using U8 = skvx::Vec<K, uint8_t>;
- union Slot {
- I32 i32;
- U32 u32;
- F32 f32;
- };
+ using I16x2 = skvx::Vec<2*K, int16_t>;
+ using U16x2 = skvx::Vec<2*K, uint16_t>;
- Slot few_regs[16];
- std::unique_ptr<char[]> many_regs;
+ union Slot {
+ I32 i32;
+ U32 u32;
+ F32 f32;
+ };
- Slot* regs = few_regs;
+ Slot few_regs[16];
+ std::unique_ptr<char[]> many_regs;
- if (fRegs > (int)SK_ARRAY_COUNT(few_regs)) {
- // Annoyingly we can't trust that malloc() or new will work with Slot because
- // the skvx::Vec types may have alignment greater than what they provide.
- // We'll overallocate one extra register so we can align manually.
- many_regs.reset(new char[ sizeof(Slot) * (fRegs + 1) ]);
+ Slot* regs = few_regs;
- uintptr_t addr = (uintptr_t)many_regs.get();
- addr += alignof(Slot) -
- (addr & (alignof(Slot) - 1));
- SkASSERT((addr & (alignof(Slot) - 1)) == 0);
- regs = (Slot*)addr;
+ if (fRegs > (int)SK_ARRAY_COUNT(few_regs)) {
+ // Annoyingly we can't trust that malloc() or new will work with Slot because
+ // the skvx::Vec types may have alignment greater than what they provide.
+ // We'll overallocate one extra register so we can align manually.
+ many_regs.reset(new char[ sizeof(Slot) * (fRegs + 1) ]);
+
+ uintptr_t addr = (uintptr_t)many_regs.get();
+ addr += alignof(Slot) -
+ (addr & (alignof(Slot) - 1));
+ SkASSERT((addr & (alignof(Slot) - 1)) == 0);
+ regs = (Slot*)addr;
+ }
+
+
+ auto r = [&](Reg id) -> Slot& {
+ SkASSERT(0 <= id && id < fRegs);
+ return regs[id];
+ };
+ auto arg = [&](int ix) {
+ SkASSERT(0 <= ix && ix < nargs);
+ return args[ix];
+ };
+
+ // Step each argument pointer ahead by its stride a number of times.
+ auto step_args = [&](int times) {
+ // Looping by marching pointers until *arg == nullptr helps the
+ // compiler to keep this loop scalar. Otherwise it'd create a
+ // rather large and useless autovectorized version.
+ void** arg = args;
+ const int* stride = fStrides.data();
+ for (; *arg; arg++, stride++) {
+ *arg = (void*)( (char*)*arg + times * *stride );
}
+ SkASSERT(arg == args + nargs);
+ };
+ int start = 0,
+ stride;
+ for ( ; n > 0; start = fLoop, n -= stride, step_args(stride)) {
+ stride = n >= K ? K : 1;
- auto r = [&](Reg id) -> Slot& {
- SkASSERT(0 <= id && id < fRegs);
- return regs[id];
- };
- auto arg = [&](int ix) {
- SkASSERT(0 <= ix && ix < nargs);
- return args[ix];
- };
+ for (int i = start; i < (int)fInstructions.size(); i++) {
+ Instruction inst = fInstructions[i];
- // Step each argument pointer ahead by its stride a number of times.
- auto step_args = [&](int times) {
- // Looping by marching pointers until *arg == nullptr helps the
- // compiler to keep this loop scalar. Otherwise it'd create a
- // rather large and useless autovectorized version.
- void** arg = args;
- const int* stride = fStrides.data();
- for (; *arg; arg++, stride++) {
- *arg = (void*)( (char*)*arg + times * *stride );
- }
- SkASSERT(arg == args + nargs);
- };
+ // d = op(x,y,z/imm)
+ Reg d = inst.d,
+ x = inst.x,
+ y = inst.y,
+ z = inst.z;
+ int imm = inst.imm;
- int start = 0,
- stride;
- for ( ; n > 0; start = fLoop, n -= stride, step_args(stride)) {
- stride = n >= K ? K : 1;
+ // Ops that interact with memory need to know whether we're stride=1 or K,
+ // but all non-memory ops can run the same code no matter the stride.
+ switch (2*(int)inst.op + (stride == K ? 1 : 0)) {
- for (int i = start; i < (int)fInstructions.size(); i++) {
- Instruction inst = fInstructions[i];
+ #define STRIDE_1(op) case 2*(int)op
+ #define STRIDE_K(op) case 2*(int)op + 1
+ STRIDE_1(Op::store8 ): memcpy(arg(imm), &r(x).i32, 1); break;
+ STRIDE_1(Op::store32): memcpy(arg(imm), &r(x).i32, 4); break;
- // d = op(x,y,z/imm)
- Reg d = inst.d,
- x = inst.x,
- y = inst.y,
- z = inst.z;
- int imm = inst.imm;
+ STRIDE_K(Op::store8 ): skvx::cast<uint8_t>(r(x).i32).store(arg(imm)); break;
+ STRIDE_K(Op::store32): (r(x).i32).store(arg(imm)); break;
- // Ops that interact with memory need to know whether we're stride=1 or K,
- // but all non-memory ops can run the same code no matter the stride.
- switch (2*(int)inst.op + (stride == K ? 1 : 0)) {
+ STRIDE_1(Op::load8 ): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 1); break;
+ STRIDE_1(Op::load32): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 4); break;
- #define STRIDE_1(op) case 2*(int)op
- #define STRIDE_K(op) case 2*(int)op + 1
- STRIDE_1(Op::store8 ): memcpy(arg(imm), &r(x).i32, 1); break;
- STRIDE_1(Op::store32): memcpy(arg(imm), &r(x).i32, 4); break;
+ STRIDE_K(Op::load8 ): r(d).i32= skvx::cast<int>(U8 ::Load(arg(imm))); break;
+ STRIDE_K(Op::load32): r(d).i32= I32::Load(arg(imm)) ; break;
+ #undef STRIDE_1
+ #undef STRIDE_K
- STRIDE_K(Op::store8 ): skvx::cast<uint8_t>(r(x).i32).store(arg(imm)); break;
- STRIDE_K(Op::store32): (r(x).i32).store(arg(imm)); break;
+ // Ops that don't interact with memory should never care about the stride.
+ #define CASE(op) case 2*(int)op: /*fallthrough*/ case 2*(int)op+1
+ CASE(Op::splat): r(d).i32 = imm; break;
- STRIDE_1(Op::load8 ): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 1); break;
- STRIDE_1(Op::load32): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 4); break;
+ CASE(Op::add_f32): r(d).f32 = r(x).f32 + r(y).f32; break;
+ CASE(Op::sub_f32): r(d).f32 = r(x).f32 - r(y).f32; break;
+ CASE(Op::mul_f32): r(d).f32 = r(x).f32 * r(y).f32; break;
+ CASE(Op::div_f32): r(d).f32 = r(x).f32 / r(y).f32; break;
- STRIDE_K(Op::load8 ): r(d).i32= skvx::cast<int>(U8 ::Load(arg(imm))); break;
- STRIDE_K(Op::load32): r(d).i32= I32::Load(arg(imm)) ; break;
- #undef STRIDE_1
- #undef STRIDE_K
+ CASE(Op::mad_f32): r(d).f32 = r(x).f32 * r(y).f32 + r(z).f32; break;
- // Ops that don't interact with memory should never care about the stride.
- #define CASE(op) case 2*(int)op: /*fallthrough*/ case 2*(int)op+1
- CASE(Op::splat): r(d).i32 = imm; break;
+ CASE(Op::add_i32): r(d).i32 = r(x).i32 + r(y).i32; break;
+ CASE(Op::sub_i32): r(d).i32 = r(x).i32 - r(y).i32; break;
+ CASE(Op::mul_i32): r(d).i32 = r(x).i32 * r(y).i32; break;
- CASE(Op::add_f32): r(d).f32 = r(x).f32 + r(y).f32; break;
- CASE(Op::sub_f32): r(d).f32 = r(x).f32 - r(y).f32; break;
- CASE(Op::mul_f32): r(d).f32 = r(x).f32 * r(y).f32; break;
- CASE(Op::div_f32): r(d).f32 = r(x).f32 / r(y).f32; break;
+ CASE(Op::sub_i16x2):
+ r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<I16x2>(r(x).i32) -
+ skvx::bit_pun<I16x2>(r(y).i32) ); break;
+ CASE(Op::mul_i16x2):
+ r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<I16x2>(r(x).i32) *
+ skvx::bit_pun<I16x2>(r(y).i32) ); break;
+ CASE(Op::shr_i16x2):
+ r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<U16x2>(r(x).i32) >> imm);
+ break;
- CASE(Op::mad_f32): r(d).f32 = r(x).f32 * r(y).f32 + r(z).f32; break;
+ CASE(Op::bit_and): r(d).i32 = r(x).i32 & r(y).i32; break;
+ CASE(Op::bit_or ): r(d).i32 = r(x).i32 | r(y).i32; break;
+ CASE(Op::bit_xor): r(d).i32 = r(x).i32 ^ r(y).i32; break;
+ CASE(Op::bit_clear): r(d).i32 = r(x).i32 & ~r(y).i32; break;
- CASE(Op::add_i32): r(d).i32 = r(x).i32 + r(y).i32; break;
- CASE(Op::sub_i32): r(d).i32 = r(x).i32 - r(y).i32; break;
- CASE(Op::mul_i32): r(d).i32 = r(x).i32 * r(y).i32; break;
+ CASE(Op::shl): r(d).i32 = r(x).i32 << imm; break;
+ CASE(Op::sra): r(d).i32 = r(x).i32 >> imm; break;
+ CASE(Op::shr): r(d).u32 = r(x).u32 >> imm; break;
- CASE(Op::sub_i16x2):
- r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<I16x2>(r(x).i32) -
- skvx::bit_pun<I16x2>(r(y).i32) ); break;
- CASE(Op::mul_i16x2):
- r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<I16x2>(r(x).i32) *
- skvx::bit_pun<I16x2>(r(y).i32) ); break;
- CASE(Op::shr_i16x2):
- r(d).i32 = skvx::bit_pun<I32>(skvx::bit_pun<U16x2>(r(x).i32) >> imm);
- break;
+ CASE(Op::extract): r(d).u32 = (r(x).u32 >> imm) & r(y).u32; break;
+ CASE(Op::pack): r(d).u32 = r(x).u32 | (r(y).u32 << imm); break;
- CASE(Op::bit_and): r(d).i32 = r(x).i32 & r(y).i32; break;
- CASE(Op::bit_or ): r(d).i32 = r(x).i32 | r(y).i32; break;
- CASE(Op::bit_xor): r(d).i32 = r(x).i32 ^ r(y).i32; break;
- CASE(Op::bit_clear): r(d).i32 = r(x).i32 & ~r(y).i32; break;
+ CASE(Op::bytes): {
+ const U32 table[] = {
+ 0,
+ (r(x).u32 ) & 0xff,
+ (r(x).u32 >> 8) & 0xff,
+ (r(x).u32 >> 16) & 0xff,
+ (r(x).u32 >> 24) & 0xff,
+ };
+ r(d).u32 = table[(imm >> 0) & 0xf] << 0
+ | table[(imm >> 4) & 0xf] << 8
+ | table[(imm >> 8) & 0xf] << 16
+ | table[(imm >> 12) & 0xf] << 24;
+ } break;
- CASE(Op::shl): r(d).i32 = r(x).i32 << imm; break;
- CASE(Op::sra): r(d).i32 = r(x).i32 >> imm; break;
- CASE(Op::shr): r(d).u32 = r(x).u32 >> imm; break;
-
- CASE(Op::extract): r(d).u32 = (r(x).u32 >> imm) & r(y).u32; break;
- CASE(Op::pack): r(d).u32 = r(x).u32 | (r(y).u32 << imm); break;
-
- CASE(Op::bytes): {
- const U32 table[] = {
- 0,
- (r(x).u32 ) & 0xff,
- (r(x).u32 >> 8) & 0xff,
- (r(x).u32 >> 16) & 0xff,
- (r(x).u32 >> 24) & 0xff,
- };
- r(d).u32 = table[(imm >> 0) & 0xf] << 0
- | table[(imm >> 4) & 0xf] << 8
- | table[(imm >> 8) & 0xf] << 16
- | table[(imm >> 12) & 0xf] << 24;
- } break;
-
- CASE(Op::to_f32): r(d).f32 = skvx::cast<float>(r(x).i32); break;
- CASE(Op::to_i32): r(d).i32 = skvx::cast<int> (r(x).f32); break;
- #undef CASE
- }
+ CASE(Op::to_f32): r(d).f32 = skvx::cast<float>(r(x).i32); break;
+ CASE(Op::to_i32): r(d).i32 = skvx::cast<int> (r(x).f32); break;
+ #undef CASE
}
}
}
}
+
+ Program::~Program() {  // Releases the JIT'd code buffer, if this Program holds one.
+ #if defined(SKVM_JIT)
+ if (fJITBuf) {  // NOTE(review): the constructor's mmap() is unchecked; a failure would presumably leave non-null MAP_FAILED here - confirm.
+ munmap(fJITBuf, fJITSize);
+ }
+ #else
+ SkASSERT(fJITBuf == nullptr);  // The constructor only assigns fJITBuf under SKVM_JIT.
+ #endif
+ }
+
+ Program::Program(Program&& other) {  // Move-construct: take other's program and its JIT'd code.
+ fInstructions = std::move(other.fInstructions);
+ fRegs = other.fRegs;
+ fLoop = other.fLoop;
+ fStrides = std::move(other.fStrides);
+
+ std::swap(fJITBuf , other.fJITBuf);  // Our fJIT* members start at their in-class defaults (nullptr, 0, nullptr in SkVM.h),
+ std::swap(fJITSize , other.fJITSize);  // so after these swaps other holds no buffer
+ std::swap(fJITEntry, other.fJITEntry);  // and its destructor has nothing to munmap().
+ }
+
+ Program& Program::operator=(Program&& other) {  // Move-assign: take other's program and its JIT'd code.
+ fInstructions = std::move(other.fInstructions);
+ fRegs = other.fRegs;
+ fLoop = other.fLoop;
+ fStrides = std::move(other.fStrides);
+
+ std::swap(fJITBuf , other.fJITBuf);  // Swap rather than overwrite: other ends up holding our previous
+ std::swap(fJITSize , other.fJITSize);  // buffer (if any), which other's destructor will munmap().
+ std::swap(fJITEntry, other.fJITEntry);
+
+ return *this;
+ }
+
+ Program::Program(std::vector<Instruction> instructions,  // Stores the program and, under SKVM_JIT, assembles it up front.
+ int regs,
+ int loop,
+ std::vector<int> strides)
+ : fInstructions(std::move(instructions))
+ , fRegs(regs)
+ , fLoop(loop)
+ , fStrides(std::move(strides)) {
+ #if defined(SKVM_JIT)
+ const int nargs = (int)fStrides.size();
+ if (can_jit(fRegs, nargs)) {  // Otherwise fJITEntry stays null and eval() runs the interpreter.
+ // First assemble without any buffer to see how much memory we need to mmap.
+ size_t code;
+ Assembler a{nullptr};
+ jit(a, &code, fInstructions, fRegs, fLoop, fStrides.data(), nargs);  // Sizing pass: a.size() is read below.
+
+ // mprotect() can only change at a page level granularity, so round a.size() up.
+ size_t page = sysconf(_SC_PAGESIZE); // Probably 4096.
+ fJITSize = ((a.size() + page - 1) / page) * page;
+
+ fJITBuf = mmap(nullptr,fJITSize, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1,0);  // NOTE(review): not checked against MAP_FAILED.
+
+ a = Assembler{fJITBuf};
+ jit(a, &code, fInstructions, fRegs, fLoop, fStrides.data(), nargs);  // Second pass: assemble into fJITBuf.
+
+ mprotect(fJITBuf, fJITSize, PROT_READ|PROT_EXEC);  // Drop write access and make the buffer executable.
+ #if defined(__aarch64__)
+ msync(fJITBuf, fJITSize, MS_SYNC|MS_INVALIDATE);  // presumably makes the freshly written code visible for execution on ARM - confirm
+ #endif
+
+ fJITEntry = (decltype(fJITEntry))( (const uint8_t*)fJITBuf + code );  // Entry point sits `code` bytes into the buffer.
+
+ #if 0 || defined(SKVM_PERF_DUMPS) // Debug dumps for perf.
+ #if defined(__aarch64__)
+ // cat | llvm-mc -arch aarch64 -disassemble
+ auto cur = (const uint8_t*)fJITBuf;
+ for (int i = 0; i < (int)a.size(); i++) {
+ if (i % 4 == 0) {
+ SkDebugf("\n");
+ if (i == (int)code) {
+ SkDebugf("code:\n");
+ }
+ }
+ SkDebugf("0x%02x ", *cur++);
+ }
+ SkDebugf("\n");
+ #endif
+
+ // We're doing some really stateful things below so one thread at a time please...
+ static SkSpinlock dump_lock;
+ SkAutoSpinlock lock(dump_lock);
+
+ auto fnv1a = [](const void* vbuf, size_t n) {
+ uint32_t hash = 2166136261;
+ for (auto buf = (const uint8_t*)vbuf; n --> 0; buf++) {
+ hash ^= *buf;
+ hash *= 16777619;
+ }
+ return hash;
+ };
+
+
+ uint32_t hash = fnv1a(fJITBuf, fJITSize);  // Hashes the whole page-rounded buffer, not just a.size() bytes.
+ char name[64];
+ sprintf(name, "skvm-jit-%u", hash);
+
+ // Create a jit-<pid>.dump file that we can `perf inject -j` into a
+ // perf.data captured with `perf record -k 1`, letting us see each
+ // JIT'd Program as if a function named skvm-jit-<hash>. E.g.
+ //
+ // ninja -C out nanobench
+ // perf record -k 1 out/nanobench -m SkVM_4096_I32\$
+ // perf inject -j -i perf.data -o perf.data.jit
+ // perf report -i perf.data.jit
+ //
+ // Running `perf inject -j` will also dump an .so for each JIT'd
+ // program, named jitted-<pid>-<hash>.so.
+ //
+ // https://lwn.net/Articles/638566/
+ // https://v8.dev/docs/linux-perf
+ // https://cs.chromium.org/chromium/src/v8/src/diagnostics/perf-jit.cc
+ // https://lore.kernel.org/patchwork/patch/622240/
+
+
+ auto timestamp_ns = []() -> uint64_t {
+ // It's important to use CLOCK_MONOTONIC here so that perf can
+ // correlate our timestamps with those captured by `perf record
+ // -k 1`. That's also what `-k 1` does, by the way, tell perf
+ // record to use CLOCK_MONOTONIC.
+ struct timespec ts;
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ return ts.tv_sec * (uint64_t)1e9 + ts.tv_nsec;
+ };
+
+ // We'll open the jit-<pid>.dump file and write a small header once,
+ // and just leave it open forever because we're lazy.
+ static FILE* jitdump = [&]{
+ // Must map as w+ for the mmap() call below to work.
+ char path[64];
+ sprintf(path, "jit-%d.dump", getpid());
+ FILE* f = fopen(path, "w+");  // NOTE(review): result is passed to fileno() below without a null check.
+
+ // Calling mmap() on the file adds a "hey they mmap()'d this" record to
+ // the perf.data file that will point `perf inject -j` at this log file.
+ // Kind of a strange way to tell `perf inject` where the file is...
+ void* marker = mmap(nullptr,
+ sysconf(_SC_PAGESIZE),
+ PROT_READ|PROT_EXEC,
+ MAP_PRIVATE,
+ fileno(f),
+ /*offset=*/0);
+ SkASSERT_RELEASE(marker != MAP_FAILED);
+ // Like never calling fclose(f), we'll also just always leave marker mmap()'d.
+
+ #if defined(__x86_64__)
+ const uint32_t elf_mach = 62;
+ #elif defined(__aarch64__)
+ const uint32_t elf_mach = 183;
+ #else
+ const uint32_t elf_mach = 0; // TODO
+ #endif
+
+ struct Header {
+ uint32_t magic, version, header_size, elf_mach, reserved, pid;
+ uint64_t timestamp_us, flags;
+ } header = {
+ 0x4A695444, 1, sizeof(Header), elf_mach, 0, (uint32_t)getpid(),
+ timestamp_ns() / 1000, 0,
+ };
+ fwrite(&header, sizeof(header), 1, f);
+
+ return f;
+ }();
+
+ struct CodeLoad {
+ uint32_t event_type, event_size;
+ uint64_t timestamp_ns;
+
+ uint32_t pid, tid;
+ uint64_t vma/*???*/, code_addr, code_size, id;
+ } load = {
+ 0/*code load*/, (uint32_t)(sizeof(CodeLoad) + strlen(name) + 1 + fJITSize),  // Record + NUL-terminated name + code bytes.
+ timestamp_ns(),
+
+ (uint32_t)getpid(), (uint32_t)SkGetThreadID(),
+ (uint64_t)fJITBuf, (uint64_t)fJITBuf, fJITSize, hash,
+ };
+
+ // Write the header, the JIT'd function name, and the JIT'd code itself.
+ fwrite(&load, sizeof(load), 1, jitdump);
+ fwrite(name, 1, strlen(name), jitdump);
+ fwrite("\0", 1, 1, jitdump);
+ fwrite(fJITBuf, 1, fJITSize, jitdump);
+ #endif
+ }
+ #endif // defined(SKVM_JIT)
+ }
+
}
diff --git a/src/core/SkVM.h b/src/core/SkVM.h
index a845299..3f56166 100644
--- a/src/core/SkVM.h
+++ b/src/core/SkVM.h
@@ -10,7 +10,6 @@
#include "include/core/SkTypes.h"
#include "include/private/SkTHash.h"
-#include "include/private/SkSpinlock.h"
#include <vector>
namespace skvm {
@@ -270,22 +269,16 @@
int loop() const { return fLoop; }
private:
- struct JIT {
- ~JIT();
-
- void* buf = nullptr; // Raw mmap'd buffer.
- size_t size = 0; // Size of buf in bytes.
- void (*entry)() = nullptr; // Entry point, offset into buf.
- };
-
void eval(int n, void* args[]) const;
std::vector<Instruction> fInstructions;
int fRegs;
int fLoop;
std::vector<int> fStrides;
- mutable SkSpinlock fJITLock;
- mutable JIT fJIT;
+
+ void* fJITBuf = nullptr; // Raw mmap'd buffer.
+ size_t fJITSize = 0; // Size of buf in bytes.
+ void (*fJITEntry)() = nullptr; // Entry point, offset into buf.
};
using Val = int;