instructions for JIT tail support on ARM

This adds a bunch of instructions we'll need to handle the N < 4 tail
within the JIT code on ARM.

   - ldrb/strb are 1-byte loads and stores
   - sub subtracts without setting flags
   - cmp just sets flags (actually just subs with an xzr destination)
   - b and b.lt are branches encoded just like b.ne (b is emitted as b.al)
   - cbz and cbnz... we only need cbz but I accidentally did cbnz first

Once I add support for forward jumps, we'll be able to use these
instructions to restructure the loop as follows:

    entry:
        hoisted setup
    loop:
        if N < 4, jump tail      (cmp N,#4; b.lt tail)
        ... handle 4 values ...
        jump loop                (b loop)
    tail:
        if N == 0, jump end      (cbz N, end)
        ... handle 1 value ...
        jump tail                (b tail)
    end:
        ret

Change-Id: I62d2d190f670f758197a25d99dfde13362189993
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/226828
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp
index af6affe..e4d178a 100644
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@@ -625,6 +625,12 @@
                   | (n     &  5_mask) <<  5
                   | (d     &  5_mask) <<  0);
     }
+    void Assembler::sub(X d, X n, int imm12) {
+        this->word( 0b1'1'0'10001'00  << 22
+                  | (imm12 & 12_mask) << 10
+                  | (n     &  5_mask) <<  5
+                  | (d     &  5_mask) <<  0);
+    }
     void Assembler::subs(X d, X n, int imm12) {
         this->word( 0b1'1'1'10001'00  << 22
                   | (imm12 & 12_mask) << 10
@@ -632,19 +638,33 @@
                   | (d     &  5_mask) <<  0);
     }
 
-    void Assembler::bne(Label l) {
+    void Assembler::b(Condition cond, Label l) {
         // Jump in insts from before this one.
         const int imm19 = (l.offset - here().offset) / 4;
-        this->word( 0b0101010'0       << 24
+        this->word( 0b0101010'0           << 24
+                  | (imm19     & 19_mask) <<  5
+                  | ((int)cond &  4_mask) <<  0);
+    }
+    void Assembler::cbz(X t, Label l) {
+        const int imm19 = (l.offset - here().offset) / 4;
+        this->word( 0b1'011010'0      << 24
                   | (imm19 & 19_mask) <<  5
-                  | 0b0'0001          <<  0);
+                  | (t     &  5_mask) <<  0);
+    }
+    void Assembler::cbnz(X t, Label l) {
+        const int imm19 = (l.offset - here().offset) / 4;
+        this->word( 0b1'011010'1      << 24
+                  | (imm19 & 19_mask) <<  5
+                  | (t     &  5_mask) <<  0);
     }
 
     void Assembler::ldrq(V dst, X src) { this->op(0b00'111'1'01'11'000000000000, src, dst); }
     void Assembler::ldrs(V dst, X src) { this->op(0b10'111'1'01'01'000000000000, src, dst); }
+    void Assembler::ldrb(V dst, X src) { this->op(0b00'111'1'01'01'000000000000, src, dst); }
 
     void Assembler::strq(V src, X dst) { this->op(0b00'111'1'01'10'000000000000, dst, src); }
     void Assembler::strs(V src, X dst) { this->op(0b10'111'1'01'00'000000000000, dst, src); }
+    void Assembler::strb(V src, X dst) { this->op(0b00'111'1'01'00'000000000000, dst, src); }
 
     void Assembler::ldrq(V dst, Label l) {
         const int imm19 = (l.offset - here().offset) / 4;
diff --git a/src/core/SkVM.h b/src/core/SkVM.h
index 883a186..055f1fa 100644
--- a/src/core/SkVM.h
+++ b/src/core/SkVM.h
@@ -128,14 +128,34 @@
 
         void ret (X);
         void add (X d, X n, int imm12);
-        void subs(X d, X n, int imm12);
-        void bne (Label);
+        void sub (X d, X n, int imm12);
+        void subs(X d, X n, int imm12);  // subtract setting condition flags
+
+        // There's another encoding for unconditional branches that can jump further,
+        // but this one encoded as b.al is simple to implement and should be fine.
+        void b  (Label l) { this->b(Condition::al, l); }
+        void bne(Label l) { this->b(Condition::ne, l); }
+        void blt(Label l) { this->b(Condition::lt, l); }
+
+        // "cmp ..." is just an assembler mnemonic for "subs xzr, ..."!
+        void cmp(X n, int imm12) { this->subs(xzr, n, imm12); }
+
+        // Compare and branch if zero/non-zero, as if
+        //      cmp(t,0)
+        //      beq/bne(l)
+        // but without setting condition flags.
+        void cbz (X t, Label l);
+        void cbnz(X t, Label l);
 
         void ldrq(V dst, Label);  // 128-bit PC-relative load
+
         void ldrq(V dst, X src);  // 128-bit dst = *src
-        void ldrs(V dst, X src);  //  32-bit dst[0] = *src
+        void ldrs(V dst, X src);  //  32-bit dst = *src
+        void ldrb(V dst, X src);  //   8-bit dst = *src
+
         void strq(V src, X dst);  // 128-bit *dst = src
-        void strs(V src, X dst);  //  32-bit *dst = src[0]
+        void strs(V src, X dst);  //  32-bit *dst = src
+        void strb(V src, X dst);  //   8-bit *dst = src
 
     private:
         // dst = op(dst, imm)
@@ -170,6 +190,10 @@
         void op(uint32_t op22, V n, V d) { this->op(op22,0,n,d); }
         void op(uint32_t op22, X x, V v) { this->op(op22,0,(V)x,v); }
 
+        // Order matters... value is 4-bit encoding for condition code.
+        enum class Condition { eq,ne,cs,cc,mi,pl,vs,vc,hi,ls,ge,lt,gt,le,al };
+        void b(Condition, Label);
+
         uint8_t* fCode;
         size_t   fSize;
     };
diff --git a/tests/SkVMTest.cpp b/tests/SkVMTest.cpp
index 07505dd..4d6ded6 100644
--- a/tests/SkVMTest.cpp
+++ b/tests/SkVMTest.cpp
@@ -641,12 +641,22 @@
         a.add(A::x2, A::x2,  4);
         a.add(A::x3, A::x2, 32);
 
+        a.sub(A::x2, A::x2, 4);
+        a.sub(A::x3, A::x2, 32);
+
         a.subs(A::x2, A::x2,  4);
         a.subs(A::x3, A::x2, 32);
 
+        a.subs(A::xzr, A::x2, 4);  // These are actually the same instruction!
+        a.cmp(A::x2, 4);
+
         A::Label l = a.here();
         a.bne(l);
         a.bne(l);
+        a.blt(l);
+        a.b(l);
+        a.cbnz(A::x2, l);
+        a.cbz(A::x2, l);
     },{
         0xc0,0x03,0x5f,0xd6,
         0xa0,0x01,0x5f,0xd6,
@@ -654,19 +664,29 @@
         0x42,0x10,0x00,0x91,
         0x43,0x80,0x00,0x91,
 
+        0x42,0x10,0x00,0xd1,
+        0x43,0x80,0x00,0xd1,
+
         0x42,0x10,0x00,0xf1,
         0x43,0x80,0x00,0xf1,
 
-        0x01,0x00,0x00,0x54,
-        0xe1,0xff,0xff,0x54,
+        0x5f,0x10,0x00,0xf1,
+        0x5f,0x10,0x00,0xf1,
+
+        0x01,0x00,0x00,0x54,   // b.ne #0
+        0xe1,0xff,0xff,0x54,   // b.ne #-4
+        0xcb,0xff,0xff,0x54,   // b.lt #-8
+        0xae,0xff,0xff,0x54,   // b.al #-12
+        0x82,0xff,0xff,0xb5,   // cbnz x2, #-16
+        0x62,0xff,0xff,0xb4,   // cbz x2, #-20
     });
 
     test_asm(r, [&](A& a) {
         a.ldrq(A::v0, A::x8);
         a.strq(A::v0, A::x8);
     },{
-        0x00, 0x01, 0xc0, 0x3d,
-        0x00, 0x01, 0x80, 0x3d,
+        0x00,0x01,0xc0,0x3d,
+        0x00,0x01,0x80,0x3d,
     });
 
     test_asm(r, [&](A& a) {
@@ -686,4 +706,12 @@
         0x00,0xa4,0x08,0x2f,
         0x00,0xa4,0x10,0x2f,
     });
+
+    test_asm(r, [&](A& a) {
+        a.ldrb(A::v0, A::x8);
+        a.strb(A::v0, A::x8);
+    },{
+        0x00,0x01,0x40,0x3d,
+        0x00,0x01,0x00,0x3d,
+    });
 }