instructions for JIT tail support on ARM
This adds a bunch of instructions we'll need to handle the N < 4 tail
within the JIT code on ARM.
- ldrb/strb are 1-byte loads and stores
- sub subtracts without setting flags
- cmp just sets flags (actually just subs with an xzr destination)
- add b and b.lt, just like b.ne
- cbz and cbnz... we only need cbz but I accidentally did cbnz first
Once I add support for forward jumps, we'll be able to use these
instructions to restructure the loop to:
entry:
    hoisted setup
loop:
    if N < 4, jump tail     (cmp N,#4; b.lt tail)
    ... handle 4 values ...
    jump loop               (b loop)
tail:
    if N == 0, jump end     (cbz N, end)
    ... handle 1 value ...
    jump tail               (b tail)
end:
    ret
Change-Id: I62d2d190f670f758197a25d99dfde13362189993
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/226828
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp
index af6affe..e4d178a 100644
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@@ -625,6 +625,12 @@
| (n & 5_mask) << 5
| (d & 5_mask) << 0);
}
+ void Assembler::sub(X d, X n, int imm12) {
+ this->word( 0b1'1'0'10001'00 << 22
+ | (imm12 & 12_mask) << 10
+ | (n & 5_mask) << 5
+ | (d & 5_mask) << 0);
+ }
void Assembler::subs(X d, X n, int imm12) {
this->word( 0b1'1'1'10001'00 << 22
| (imm12 & 12_mask) << 10
@@ -632,19 +638,33 @@
| (d & 5_mask) << 0);
}
- void Assembler::bne(Label l) {
+ void Assembler::b(Condition cond, Label l) {
// Jump in insts from before this one.
const int imm19 = (l.offset - here().offset) / 4;
- this->word( 0b0101010'0 << 24
+ this->word( 0b0101010'0 << 24
+ | (imm19 & 19_mask) << 5
+ | ((int)cond & 4_mask) << 0);
+ }
+ void Assembler::cbz(X t, Label l) {
+ const int imm19 = (l.offset - here().offset) / 4;
+ this->word( 0b1'011010'0 << 24
| (imm19 & 19_mask) << 5
- | 0b0'0001 << 0);
+ | (t & 5_mask) << 0);
+ }
+ void Assembler::cbnz(X t, Label l) {
+ const int imm19 = (l.offset - here().offset) / 4;
+ this->word( 0b1'011010'1 << 24
+ | (imm19 & 19_mask) << 5
+ | (t & 5_mask) << 0);
}
void Assembler::ldrq(V dst, X src) { this->op(0b00'111'1'01'11'000000000000, src, dst); }
void Assembler::ldrs(V dst, X src) { this->op(0b10'111'1'01'01'000000000000, src, dst); }
+ void Assembler::ldrb(V dst, X src) { this->op(0b00'111'1'01'01'000000000000, src, dst); }
void Assembler::strq(V src, X dst) { this->op(0b00'111'1'01'10'000000000000, dst, src); }
void Assembler::strs(V src, X dst) { this->op(0b10'111'1'01'00'000000000000, dst, src); }
+ void Assembler::strb(V src, X dst) { this->op(0b00'111'1'01'00'000000000000, dst, src); }
void Assembler::ldrq(V dst, Label l) {
const int imm19 = (l.offset - here().offset) / 4;
diff --git a/src/core/SkVM.h b/src/core/SkVM.h
index 883a186..055f1fa 100644
--- a/src/core/SkVM.h
+++ b/src/core/SkVM.h
@@ -128,14 +128,34 @@
void ret (X);
void add (X d, X n, int imm12);
- void subs(X d, X n, int imm12);
- void bne (Label);
+ void sub (X d, X n, int imm12);
+ void subs(X d, X n, int imm12); // subtract setting condition flags
+
+ // There's another encoding for unconditional branches that can jump further,
+ // but this one encoded as b.al is simple to implement and should be fine.
+ void b (Label l) { this->b(Condition::al, l); }
+ void bne(Label l) { this->b(Condition::ne, l); }
+ void blt(Label l) { this->b(Condition::lt, l); }
+
+ // "cmp ..." is just an assembler mnemonic for "subs xzr, ..."!
+ void cmp(X n, int imm12) { this->subs(xzr, n, imm12); }
+
+ // Compare and branch if zero/non-zero, as if
+ // cmp(t,0)
+ // beq/bne(l)
+ // but without setting condition flags.
+ void cbz (X t, Label l);
+ void cbnz(X t, Label l);
void ldrq(V dst, Label); // 128-bit PC-relative load
+
void ldrq(V dst, X src); // 128-bit dst = *src
- void ldrs(V dst, X src); // 32-bit dst[0] = *src
+ void ldrs(V dst, X src); // 32-bit dst = *src
+ void ldrb(V dst, X src); // 8-bit dst = *src
+
void strq(V src, X dst); // 128-bit *dst = src
- void strs(V src, X dst); // 32-bit *dst = src[0]
+ void strs(V src, X dst); // 32-bit *dst = src
+ void strb(V src, X dst); // 8-bit *dst = src
private:
// dst = op(dst, imm)
@@ -170,6 +190,10 @@
void op(uint32_t op22, V n, V d) { this->op(op22,0,n,d); }
void op(uint32_t op22, X x, V v) { this->op(op22,0,(V)x,v); }
+ // Order matters... value is 4-bit encoding for condition code.
+ enum class Condition { eq,ne,cs,cc,mi,pl,vs,vc,hi,ls,ge,lt,gt,le,al };
+ void b(Condition, Label);
+
uint8_t* fCode;
size_t fSize;
};
diff --git a/tests/SkVMTest.cpp b/tests/SkVMTest.cpp
index 07505dd..4d6ded6 100644
--- a/tests/SkVMTest.cpp
+++ b/tests/SkVMTest.cpp
@@ -641,12 +641,22 @@
a.add(A::x2, A::x2, 4);
a.add(A::x3, A::x2, 32);
+ a.sub(A::x2, A::x2, 4);
+ a.sub(A::x3, A::x2, 32);
+
a.subs(A::x2, A::x2, 4);
a.subs(A::x3, A::x2, 32);
+ a.subs(A::xzr, A::x2, 4); // These are actually the same instruction!
+ a.cmp(A::x2, 4);
+
A::Label l = a.here();
a.bne(l);
a.bne(l);
+ a.blt(l);
+ a.b(l);
+ a.cbnz(A::x2, l);
+ a.cbz(A::x2, l);
},{
0xc0,0x03,0x5f,0xd6,
0xa0,0x01,0x5f,0xd6,
@@ -654,19 +664,29 @@
0x42,0x10,0x00,0x91,
0x43,0x80,0x00,0x91,
+ 0x42,0x10,0x00,0xd1,
+ 0x43,0x80,0x00,0xd1,
+
0x42,0x10,0x00,0xf1,
0x43,0x80,0x00,0xf1,
- 0x01,0x00,0x00,0x54,
- 0xe1,0xff,0xff,0x54,
+ 0x5f,0x10,0x00,0xf1,
+ 0x5f,0x10,0x00,0xf1,
+
+ 0x01,0x00,0x00,0x54, // b.ne #0
+ 0xe1,0xff,0xff,0x54, // b.ne #-4
+ 0xcb,0xff,0xff,0x54, // b.lt #-8
+ 0xae,0xff,0xff,0x54, // b.al #-12
+ 0x82,0xff,0xff,0xb5, // cbnz x2, #-16
+ 0x62,0xff,0xff,0xb4, // cbz x2, #-20
});
test_asm(r, [&](A& a) {
a.ldrq(A::v0, A::x8);
a.strq(A::v0, A::x8);
},{
- 0x00, 0x01, 0xc0, 0x3d,
- 0x00, 0x01, 0x80, 0x3d,
+ 0x00,0x01,0xc0,0x3d,
+ 0x00,0x01,0x80,0x3d,
});
test_asm(r, [&](A& a) {
@@ -686,4 +706,12 @@
0x00,0xa4,0x08,0x2f,
0x00,0xa4,0x10,0x2f,
});
+
+ test_asm(r, [&](A& a) {
+ a.ldrb(A::v0, A::x8);
+ a.strb(A::v0, A::x8);
+ },{
+ 0x00,0x01,0x40,0x3d,
+ 0x00,0x01,0x00,0x3d,
+ });
}