| /* |
| * Copyright 2019 Google LLC |
| * |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE file. |
| */ |
| |
| #include "include/core/SkString.h" |
| #include "include/private/SkSpinlock.h" |
| #include "include/private/SkThreadID.h" |
| #include "include/private/SkVx.h" |
| #include "src/core/SkOpts.h" |
| #include "src/core/SkVM.h" |
| #include <string.h> |
| #if defined(SKVM_JIT) |
| #define XBYAK_NO_OP_NAMES |
| #include "xbyak/xbyak.h" |
| #endif |
| |
| namespace skvm { |
| |
| Program::~Program() = default; |
| Program::Program(Program&&) = default; |
| Program& Program::operator=(Program&&) = default; |
| |
| Program::Program(std::vector<Instruction> instructions, int regs, int loop) |
| : fInstructions(std::move(instructions)) |
| , fRegs(regs) |
| , fLoop(loop) |
| {} |
| |
| Program Builder::done() { |
| // Basic liveness analysis (and free dead code elimination). |
| for (ID id = fProgram.size(); id --> 0; ) { |
| Instruction& inst = fProgram[id]; |
| |
| // All side-effect-only instructions (stores) are live. |
| if (inst.op <= Op::store32) { |
| inst.life = id; |
| } |
| // The arguments of a live instruction must live until that instruction. |
| if (inst.life != NA) { |
| // Notice how we're walking backward, storing the latest instruction in life. |
| if (inst.x != NA && fProgram[inst.x].life == NA) { fProgram[inst.x].life = id; } |
| if (inst.y != NA && fProgram[inst.y].life == NA) { fProgram[inst.y].life = id; } |
| if (inst.z != NA && fProgram[inst.z].life == NA) { fProgram[inst.z].life = id; } |
| } |
| } |
| |
| // Look to see if there are any instructions that can be hoisted outside the program's loop. |
| for (ID id = 0; id < (ID)fProgram.size(); id++) { |
| Instruction& inst = fProgram[id]; |
| |
| // Loads and stores cannot be hoisted out of the loop. |
| if (inst.op <= Op::load32) { |
| inst.hoist = false; |
| } |
| |
| // If any of an instruction's arguments can't be hoisted, it can't be hoisted itself. |
| if (inst.hoist) { |
| if (inst.x != NA) { inst.hoist &= fProgram[inst.x].hoist; } |
| if (inst.y != NA) { inst.hoist &= fProgram[inst.y].hoist; } |
| if (inst.z != NA) { inst.hoist &= fProgram[inst.z].hoist; } |
| } |
| } |
| |
| // We'll need to map each live value to a register. |
| std::unordered_map<ID, ID> val_to_reg; |
| |
| // Count the registers we've used so far. |
| ID next_reg = 0; |
| |
| // Our first pass of register assignment assigns hoisted values to eternal registers. |
| for (ID val = 0; val < (ID)fProgram.size(); val++) { |
| Instruction& inst = fProgram[val]; |
| if (inst.life == NA || !inst.hoist) { |
| continue; |
| } |
| |
| // Hoisted values are needed forever, so they each get their own register. |
| val_to_reg[val] = next_reg++; |
| } |
| |
| // Now we'll assign registers to values that can't be hoisted out of the loop. These |
| // values have finite liftimes, so we track pre-owned registers that have become available |
| // and a schedule of which registers become available as we reach a given instruction. |
| std::vector<ID> avail; |
| std::unordered_map<ID, std::vector<ID>> deaths; |
| |
| for (ID val = 0; val < (ID)fProgram.size(); val++) { |
| Instruction& inst = fProgram[val]; |
| if (inst.life == NA || inst.hoist) { |
| continue; |
| } |
| |
| // All the values that are no longer needed after this instruction |
| // can make their registers available to this and future values. |
| const std::vector<ID>& dying = deaths[val]; |
| avail.insert(avail.end(), |
| dying.begin(), dying.end()); |
| |
| // Allocate a register if we have to, but prefer to reuse one that's available. |
| ID reg; |
| if (avail.empty()) { |
| reg = next_reg++; |
| } else { |
| reg = avail.back(); |
| avail.pop_back(); |
| } |
| |
| // Schedule this value's own death. When we reach the instruction at inst.life, |
| // this value is no longer needed and its register becomes available for reuse. |
| deaths[inst.life].push_back(reg); |
| |
| val_to_reg[val] = reg; |
| } |
| |
| // Add a dummy mapping for the N/A sentinel value to "register N/A", |
| // so that the lookups don't have to know which arguments are used by which Ops. |
| auto lookup_register = [&](ID val) { |
| return val == NA ? NA |
| : val_to_reg[val]; |
| }; |
| |
| // Finally translate Builder::Instructions to Program::Instructions by mapping values to |
| // registers. This will be two passes again, first outside the loop, then inside. |
| |
| // The loop begins at the loop'th Instruction. |
| int loop = 0; |
| std::vector<Program::Instruction> program; |
| |
| auto push_instruction = [&](ID id, const Builder::Instruction& inst) { |
| Program::Instruction pinst{ |
| inst.op, |
| lookup_register(id), |
| lookup_register(inst.x), |
| {lookup_register(inst.y)}, |
| {lookup_register(inst.z)}, |
| }; |
| if (inst.y == NA) { pinst.y.imm = inst.immy; } |
| if (inst.z == NA) { pinst.z.imm = inst.immz; } |
| program.push_back(pinst); |
| }; |
| |
| for (ID id = 0; id < (ID)fProgram.size(); id++) { |
| Instruction& inst = fProgram[id]; |
| if (inst.life == NA || !inst.hoist) { |
| continue; |
| } |
| |
| push_instruction(id, inst); |
| loop++; |
| } |
| for (ID id = 0; id < (ID)fProgram.size(); id++) { |
| Instruction& inst = fProgram[id]; |
| if (inst.life == NA || inst.hoist) { |
| continue; |
| } |
| |
| push_instruction(id, inst); |
| } |
| |
| return { std::move(program), /*register count = */next_reg, loop }; |
| } |
| |
| // Most instructions produce a value and return it by ID, |
| // the value-producing instruction's own index in the program vector. |
| |
| ID Builder::push(Op op, ID x, ID y, ID z, int immy, int immz) { |
| Instruction inst{op, /*hoist=*/true, /*life=*/NA, x, y, z, immy, immz}; |
| |
| // Basic common subexpression elimination: |
| // if we've already seen this exact Instruction, use it instead of creating a new one. |
| auto lookup = fIndex.find(inst); |
| if (lookup != fIndex.end()) { |
| return lookup->second; |
| } |
| |
| ID id = static_cast<ID>(fProgram.size()); |
| fProgram.push_back(inst); |
| fIndex[inst] = id; |
| return id; |
| } |
| |
| bool Builder::isZero(ID id) const { |
| return fProgram[id].op == Op::splat |
| && fProgram[id].immy == 0; |
| } |
| |
| Arg Builder::arg(int ix) { return {ix}; } |
| |
| void Builder::store8 (Arg ptr, I32 val) { (void)this->push(Op::store8 , val.id,NA,NA, ptr.ix); } |
| void Builder::store32(Arg ptr, I32 val) { (void)this->push(Op::store32, val.id,NA,NA, ptr.ix); } |
| |
| I32 Builder::load8 (Arg ptr) { return {this->push(Op::load8 , NA,NA,NA, ptr.ix) }; } |
| I32 Builder::load32(Arg ptr) { return {this->push(Op::load32, NA,NA,NA, ptr.ix) }; } |
| |
| // The two splat() functions are just syntax sugar over splatting a 4-byte bit pattern. |
| I32 Builder::splat(int n) { return {this->push(Op::splat, NA,NA,NA, n) }; } |
| F32 Builder::splat(float f) { |
| int bits; |
| memcpy(&bits, &f, 4); |
| return {this->push(Op::splat, NA,NA,NA, bits)}; |
| } |
| |
| F32 Builder::add(F32 x, F32 y ) { return {this->push(Op::add_f32, x.id, y.id)}; } |
| F32 Builder::sub(F32 x, F32 y ) { return {this->push(Op::sub_f32, x.id, y.id)}; } |
| F32 Builder::mul(F32 x, F32 y ) { return {this->push(Op::mul_f32, x.id, y.id)}; } |
| F32 Builder::div(F32 x, F32 y ) { return {this->push(Op::div_f32, x.id, y.id)}; } |
| F32 Builder::mad(F32 x, F32 y, F32 z) { |
| if (this->isZero(z.id)) { |
| return this->mul(x,y); |
| } |
| return {this->push(Op::mad_f32, x.id, y.id, z.id)}; |
| } |
| |
| I32 Builder::add(I32 x, I32 y) { return {this->push(Op::add_i32, x.id, y.id)}; } |
| I32 Builder::sub(I32 x, I32 y) { return {this->push(Op::sub_i32, x.id, y.id)}; } |
| I32 Builder::mul(I32 x, I32 y) { return {this->push(Op::mul_i32, x.id, y.id)}; } |
| |
| I32 Builder::bit_and(I32 x, I32 y) { return {this->push(Op::bit_and, x.id, y.id)}; } |
| I32 Builder::bit_or (I32 x, I32 y) { return {this->push(Op::bit_or , x.id, y.id)}; } |
| I32 Builder::bit_xor(I32 x, I32 y) { return {this->push(Op::bit_xor, x.id, y.id)}; } |
| |
| I32 Builder::shl(I32 x, int bits) { return {this->push(Op::shl, x.id,NA,NA, bits)}; } |
| I32 Builder::shr(I32 x, int bits) { return {this->push(Op::shr, x.id,NA,NA, bits)}; } |
| I32 Builder::sra(I32 x, int bits) { return {this->push(Op::sra, x.id,NA,NA, bits)}; } |
| |
| I32 Builder::mul_unorm8(I32 x, I32 y) { return {this->push(Op::mul_unorm8, x.id, y.id)}; } |
| I32 Builder::mad_unorm8(I32 x, I32 y, I32 z) { |
| return {this->push(Op::mad_unorm8, x.id, y.id, z.id)}; |
| } |
| |
| I32 Builder::extract(I32 x, int bits, I32 z) { |
| return {this->push(Op::extract, x.id,NA,z.id, bits,0)}; |
| } |
| |
| I32 Builder::pack(I32 x, I32 y, int bits) { |
| return {this->push(Op::pack, x.id,y.id,NA, 0,bits)}; |
| } |
| |
| F32 Builder::to_f32(I32 x) { return {this->push(Op::to_f32, x.id)}; } |
| I32 Builder::to_i32(F32 x) { return {this->push(Op::to_i32, x.id)}; } |
| |
| // ~~~~ Program::dump() and co. ~~~~ // |
| |
| struct V { ID id; }; |
| struct R { ID id; }; |
| struct Shift { int bits; }; |
| struct Splat { int bits; }; |
| |
| static void write(SkWStream* o, const char* s) { |
| o->writeText(s); |
| } |
| |
| static void write(SkWStream* o, Arg a) { |
| write(o, "arg("); |
| o->writeDecAsText(a.ix); |
| write(o, ")"); |
| } |
| static void write(SkWStream* o, V v) { |
| write(o, "v"); |
| o->writeDecAsText(v.id); |
| } |
| static void write(SkWStream* o, R r) { |
| write(o, "r"); |
| o->writeDecAsText(r.id); |
| } |
| static void write(SkWStream* o, Shift s) { |
| o->writeDecAsText(s.bits); |
| } |
| static void write(SkWStream* o, Splat s) { |
| float f; |
| memcpy(&f, &s.bits, 4); |
| o->writeHexAsText(s.bits); |
| write(o, " ("); |
| o->writeScalarAsText(f); |
| write(o, ")"); |
| } |
| |
| template <typename T, typename... Ts> |
| static void write(SkWStream* o, T first, Ts... rest) { |
| write(o, first); |
| write(o, " "); |
| write(o, rest...); |
| } |
| |
| void Builder::dump(SkWStream* o) const { |
| o->writeDecAsText(fProgram.size()); |
| o->writeText(" values:\n"); |
| for (ID id = 0; id < (ID)fProgram.size(); id++) { |
| const Instruction& inst = fProgram[id]; |
| Op op = inst.op; |
| ID x = inst.x, |
| y = inst.y, |
| z = inst.z; |
| int immy = inst.immy, |
| immz = inst.immz; |
| write(o, inst.life == NA ? "☠" : |
| inst.hoist ? "⤴ " : " "); |
| switch (op) { |
| case Op::store8: write(o, "store8" , Arg{immy}, V{x}); break; |
| case Op::store32: write(o, "store32", Arg{immy}, V{x}); break; |
| |
| case Op::load8: write(o, V{id}, "= load8" , Arg{immy}); break; |
| case Op::load32: write(o, V{id}, "= load32", Arg{immy}); break; |
| |
| case Op::splat: write(o, V{id}, "= splat", Splat{immy}); break; |
| |
| case Op::add_f32: write(o, V{id}, "= add_f32", V{x}, V{y} ); break; |
| case Op::sub_f32: write(o, V{id}, "= sub_f32", V{x}, V{y} ); break; |
| case Op::mul_f32: write(o, V{id}, "= mul_f32", V{x}, V{y} ); break; |
| case Op::div_f32: write(o, V{id}, "= div_f32", V{x}, V{y} ); break; |
| case Op::mad_f32: write(o, V{id}, "= mad_f32", V{x}, V{y}, V{z}); break; |
| |
| case Op::add_i32: write(o, V{id}, "= add_i32", V{x}, V{y}); break; |
| case Op::sub_i32: write(o, V{id}, "= sub_i32", V{x}, V{y}); break; |
| case Op::mul_i32: write(o, V{id}, "= mul_i32", V{x}, V{y}); break; |
| |
| case Op::bit_and: write(o, V{id}, "= bit_and", V{x}, V{y}); break; |
| case Op::bit_or : write(o, V{id}, "= bit_or" , V{x}, V{y}); break; |
| case Op::bit_xor: write(o, V{id}, "= bit_xor", V{x}, V{y}); break; |
| |
| case Op::shl: write(o, V{id}, "= shl", V{x}, Shift{immy}); break; |
| case Op::shr: write(o, V{id}, "= shr", V{x}, Shift{immy}); break; |
| case Op::sra: write(o, V{id}, "= sra", V{x}, Shift{immy}); break; |
| |
| case Op::mul_unorm8: write(o, V{id}, "= mul_unorm8", V{x}, V{y} ); break; |
| case Op::mad_unorm8: write(o, V{id}, "= mad_unorm8", V{x}, V{y}, V{z}); break; |
| |
| case Op::extract: write(o, V{id}, "= extract", V{x}, Shift{immy}, V{z}); break; |
| case Op::pack: write(o, V{id}, "= pack", V{x}, V{y}, Shift{immz}); break; |
| |
| case Op::to_f32: write(o, V{id}, "= to_f32", V{x}); break; |
| case Op::to_i32: write(o, V{id}, "= to_i32", V{x}); break; |
| } |
| |
| write(o, "\n"); |
| } |
| } |
| |
| void Program::dump(SkWStream* o) const { |
| o->writeDecAsText(fRegs); |
| o->writeText(" registers, "); |
| o->writeDecAsText(fInstructions.size()); |
| o->writeText(" instructions:\n"); |
| for (int i = 0; i < (int)fInstructions.size(); i++) { |
| if (i == fLoop) { |
| write(o, "loop:\n"); |
| } |
| const Instruction& inst = fInstructions[i]; |
| Op op = inst.op; |
| ID d = inst.d, |
| x = inst.x; |
| auto y = inst.y, |
| z = inst.z; |
| switch (op) { |
| case Op::store8: write(o, "store8" , Arg{y.imm}, R{x}); break; |
| case Op::store32: write(o, "store32", Arg{y.imm}, R{x}); break; |
| |
| case Op::load8: write(o, R{d}, "= load8" , Arg{y.imm}); break; |
| case Op::load32: write(o, R{d}, "= load32", Arg{y.imm}); break; |
| |
| case Op::splat: write(o, R{d}, "= splat", Splat{y.imm}); break; |
| |
| case Op::add_f32: write(o, R{d}, "= add_f32", R{x}, R{y.id} ); break; |
| case Op::sub_f32: write(o, R{d}, "= sub_f32", R{x}, R{y.id} ); break; |
| case Op::mul_f32: write(o, R{d}, "= mul_f32", R{x}, R{y.id} ); break; |
| case Op::div_f32: write(o, R{d}, "= div_f32", R{x}, R{y.id} ); break; |
| case Op::mad_f32: write(o, R{d}, "= mad_f32", R{x}, R{y.id}, R{z.id}); break; |
| |
| case Op::add_i32: write(o, R{d}, "= add_i32", R{x}, R{y.id}); break; |
| case Op::sub_i32: write(o, R{d}, "= sub_i32", R{x}, R{y.id}); break; |
| case Op::mul_i32: write(o, R{d}, "= mul_i32", R{x}, R{y.id}); break; |
| |
| case Op::bit_and: write(o, R{d}, "= bit_and", R{x}, R{y.id}); break; |
| case Op::bit_or : write(o, R{d}, "= bit_or" , R{x}, R{y.id}); break; |
| case Op::bit_xor: write(o, R{d}, "= bit_xor", R{x}, R{y.id}); break; |
| |
| case Op::shl: write(o, R{d}, "= shl", R{x}, Shift{y.imm}); break; |
| case Op::shr: write(o, R{d}, "= shr", R{x}, Shift{y.imm}); break; |
| case Op::sra: write(o, R{d}, "= sra", R{x}, Shift{y.imm}); break; |
| |
| case Op::mul_unorm8: write(o, R{d}, "= mul_unorm8", R{x}, R{y.id} ); break; |
| case Op::mad_unorm8: write(o, R{d}, "= mad_unorm8", R{x}, R{y.id}, R{z.id}); break; |
| |
| case Op::extract: write(o, R{d}, "= extract", R{x}, Shift{y.imm}, R{z.id}); break; |
| case Op::pack: write(o, R{d}, "= pack", R{x}, R{y.id}, Shift{z.imm}); break; |
| |
| case Op::to_f32: write(o, R{d}, "= to_f32", R{x}); break; |
| case Op::to_i32: write(o, R{d}, "= to_i32", R{x}); break; |
| } |
| write(o, "\n"); |
| } |
| } |
| |
| // ~~~~ Program::eval() and co. ~~~~ // |
| |
| #if defined(SKVM_JIT) |
| struct Program::JIT : Xbyak::CodeGenerator { |
| size_t head_ends = 0, |
| body_ends = 0, |
| tail_ends = 0; |
| |
| JIT(const std::vector<Program::Instruction>& instructions, int regs, int loop, |
| size_t strides[], int nargs) |
| { |
| // 8 float values in a ymm register. |
| constexpr int K = 8; |
| |
| #if defined(SK_BUILD_FOR_WIN) |
| // TODO Windows ABI? |
| #else |
| // These registers are used to pass the first 6 arguments, |
| // so if we stick to these we need not push, pop, spill, or move anything around. |
| Xbyak::Reg N = rdi, |
| arg[] = { rsi, rdx, rcx, r8, r9 }; |
| |
| // All 16 ymm registers are available as scratch. |
| Xbyak::Ymm r[] = { |
| ymm0, ymm1, ymm2 , ymm3 , ymm4 , ymm5 , ymm6 , ymm7 , |
| ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, |
| }, tmp = ymm15; |
| Xbyak::Xmm tmplo = xmm15; |
| #endif |
| |
| // Label / 4-byte values we need to write after ret. |
| std::vector<std::pair<Xbyak::Label, int>> splats; |
| |
| for (int i = 0; i < (int)instructions.size(); i++) { |
| if (i == loop) { |
| L("loop"); |
| this->head_ends = this->getSize(); |
| } |
| const Instruction& inst = instructions[i]; |
| Op op = inst.op; |
| |
| ID d = inst.d, |
| x = inst.x; |
| auto y = inst.y, |
| z = inst.z; |
| switch (op) { |
| case Op::store8: |
| vpackusdw(tmp, r[x], r[x]); // pack 32-bit -> 16-bit |
| vpermq (tmp, tmp, 0xd8); // u64 tmp[0,1,2,3] = tmp[0,2,1,3] |
| vpackuswb(tmp, tmp, tmp); // pack 16-bit -> 8-bit |
| vmovq(ptr[arg[y.imm]], tmplo); // store low 8 bytes |
| break; |
| |
| case Op::store32: vmovups(ptr[arg[y.imm]], r[x]); break; |
| |
| case Op::load8: vpmovzxbd(r[d], ptr[arg[y.imm]]); break; |
| case Op::load32: vmovups (r[d], ptr[arg[y.imm]]); break; |
| |
| case Op::splat: splats.emplace_back(Xbyak::Label(), y.imm); |
| vbroadcastss(r[d], ptr[rip + splats.back().first]); |
| break; |
| |
| case Op::add_f32: vaddps(r[d], r[x], r[y.id]); break; |
| case Op::sub_f32: vsubps(r[d], r[x], r[y.id]); break; |
| case Op::mul_f32: vmulps(r[d], r[x], r[y.id]); break; |
| case Op::div_f32: vdivps(r[d], r[x], r[y.id]); break; |
| case Op::mad_f32: |
| if (d == x ) { vfmadd132ps(r[x ], r[z.id], r[y.id]); } else |
| if (d == y.id) { vfmadd213ps(r[y.id], r[x ], r[z.id]); } else |
| if (d == z.id) { vfmadd231ps(r[z.id], r[x ], r[y.id]); } else |
| { vmulps(r[d], r[x], r[y.id]); |
| vaddps(r[d], r[d], r[z.id]); } |
| break; |
| |
| case Op::add_i32: vpaddd (r[d], r[x], r[y.id]); break; |
| case Op::sub_i32: vpsubd (r[d], r[x], r[y.id]); break; |
| case Op::mul_i32: vpmulld(r[d], r[x], r[y.id]); break; |
| |
| case Op::bit_and: vandps(r[d], r[x], r[y.id]); break; |
| case Op::bit_or : vorps (r[d], r[x], r[y.id]); break; |
| case Op::bit_xor: vxorps(r[d], r[x], r[y.id]); break; |
| |
| case Op::shl: vpslld(r[d], r[x], y.imm); break; |
| case Op::shr: vpsrld(r[d], r[x], y.imm); break; |
| case Op::sra: vpsrad(r[d], r[x], y.imm); break; |
| |
| case Op::mul_unorm8: vpmulld(r[d], r[x], r[y.id]); |
| vpaddd(r[d], r[d], r[x]); |
| vpsrad(r[d], r[d], 8); |
| break; |
| |
| case Op::mad_unorm8: vpmulld(r[d], r[x], r[y.id]); |
| vpaddd(r[d], r[d], r[x]); |
| vpsrad(r[d], r[d], 8); |
| vpaddd(r[d], r[d], r[z.id]); |
| break; |
| |
| case Op::extract: if (y.imm) { vpsrld(r[d], r[x], y.imm); } |
| vandps(r[d], r[d], r[z.id]); |
| break; |
| |
| case Op::pack: vpslld(r[d], r[y.id], z.imm); |
| vorps (r[d], r[d ], r[x]); |
| break; |
| |
| case Op::to_f32: vcvtdq2ps (r[d], r[x]); break; |
| case Op::to_i32: vcvttps2dq(r[d], r[x]); break; |
| } |
| } |
| |
| this->body_ends = this->getSize(); |
| sub(N, K); |
| for (int i = 0; i < nargs; i++) { |
| add(arg[i], K*(int)strides[i]); |
| } |
| cmp(N, K-1); |
| jg("loop"); |
| |
| this->tail_ends = this->getSize(); |
| vzeroupper(); |
| ret(); |
| |
| for (auto splat : splats) { |
| align(4); |
| L(splat.first); |
| dd(splat.second); |
| } |
| } |
| }; |
| #endif |
| |
| |
| void Program::eval(int n, void* args[], size_t strides[], int nargs) const { |
| #if defined(SKVM_JIT) |
| if (!fJIT) { |
| fJIT.reset(new JIT{fInstructions, fRegs, fLoop, strides, nargs}); |
| |
| #if 1 |
| // We're doing some really stateful things below, |
| // so one thread at a time please... |
| static SkSpinlock dump_lock; |
| SkAutoSpinlock lock(dump_lock); |
| |
| uint32_t hash = SkOpts::hash(fJIT->getCode(), fJIT->getSize()); |
| |
| SkString name = SkStringPrintf("skvm-jit-%u", hash); |
| |
| // Create a jit-<pid>.dump file that we can `perf inject -j` into a |
| // perf.data captured with `perf record -k 1`, letting us see each |
| // JIT'd Program as if a function named skvm-jit-<hash>. E.g. |
| // |
| // ninja -C out nanobench |
| // perf record -k 1 out/nanobench -m SkVM_4096_I32\$ |
| // perf inject -j -i perf.data -o perf.data.jit |
| // perf report -i perf.data.jit |
| // |
| // Running `perf inject -j` will also dump an .so for each JIT'd |
| // program, named jitted-<pid>-<hash>.so. |
| |
| auto timestamp_ns = []() -> uint64_t { |
| // It's important to use CLOCK_MONOTONIC here so that perf can |
| // correlate our timestamps with those captured by `perf record |
| // -k 1`. That's also what `-k 1` does, by the way, tell perf |
| // record to use CLOCK_MONOTONIC. |
| struct timespec ts; |
| clock_gettime(CLOCK_MONOTONIC, &ts); |
| return ts.tv_sec * (uint64_t)1e9 + ts.tv_nsec; |
| }; |
| |
| // We'll open the jit-<pid>.dump file and write a small header once, |
| // and just leave it open forever because we're lazy. |
| static FILE* jitdump = [&]{ |
| // Must map as w+ for the mmap() call below to work. |
| FILE* f = fopen(SkStringPrintf("jit-%d.dump", getpid()).c_str(), "w+"); |
| |
| // Calling mmap() on the file adds a "hey they mmap()'d this" record to |
| // the perf.data file that will point `perf inject -j` at this log file. |
| // Kind of a strange way to tell `perf inject` where the file is... |
| void* marker = mmap(nullptr, |
| sysconf(_SC_PAGESIZE), |
| PROT_READ|PROT_EXEC, |
| MAP_PRIVATE, |
| fileno(f), |
| /*offset=*/0); |
| SkASSERT_RELEASE(marker != MAP_FAILED); |
| // Like never calling fclose(f), we'll also just always leave marker mmap()'d. |
| |
| struct Header { |
| uint32_t magic, version, header_size, elf_mach, reserved, pid; |
| uint64_t timestamp_us, flags; |
| } header = { |
| 0x4A695444, 1, sizeof(Header), 62/*x86-64*/, 0, (uint32_t)getpid(), |
| timestamp_ns() / 1000, 0, |
| }; |
| fwrite(&header, sizeof(header), 1, f); |
| |
| return f; |
| }(); |
| |
| struct CodeLoad { |
| uint32_t event_type, event_size; |
| uint64_t timestamp_ns; |
| |
| uint32_t pid, tid; |
| uint64_t vma/*???*/, code_addr, code_size, id; |
| } load = { |
| 0/*code load*/, (uint32_t)(sizeof(CodeLoad) + name.size() + 1 + fJIT->getSize()), |
| timestamp_ns(), |
| |
| (uint32_t)getpid(), (uint32_t)SkGetThreadID(), |
| (uint64_t)fJIT->getCode(), (uint64_t)fJIT->getCode(), fJIT->getSize(), hash, |
| }; |
| |
| // Write the header, the JIT'd function name, and the JIT'd code itself. |
| fwrite(&load, sizeof(load), 1, jitdump); |
| fwrite(name.c_str(), 1, name.size(), jitdump); |
| fwrite("\0", 1, 1, jitdump); |
| fwrite(fJIT->getCode(), 1, fJIT->getSize(), jitdump); |
| #endif |
| } |
| |
| if (n >= 8) { |
| bool ran = true; |
| switch (nargs) { |
| case 0: fJIT->getCode<void(*)(int )>()(n ); break; |
| case 1: fJIT->getCode<void(*)(int, void* )>()(n, args[0] ); break; |
| case 2: fJIT->getCode<void(*)(int, void*, void*)>()(n, args[0], args[1]); break; |
| default: ran = false; break; |
| } |
| if (ran) { |
| n &= 7; |
| } |
| } |
| #endif |
| if (n) { |
| SkOpts::eval(fInstructions.data(), (int)fInstructions.size(), fRegs, fLoop, |
| n, args, strides, nargs); |
| } |
| } |
| } |
| |
| // TODO: argument strides (more generally types) should come earlier, the pointers themselves later. |
| // TODO: share 255 splats |