bidirectional Labels

Support forward references in Label.

In addition to tracking the current Label offset (used for
backward references essentially just the same as before this CL)
we also store a list of instructions that refer to each Label.
When a Label moves, each instruction gets a new displacement.

To make this a little easier, remove the 8-bit jump form on x86...
this way all x86 displacements are 32-bit and and all ARM 19-bit.

For now only cbz() supports this, just to start somewhere.
More to do but it's worth an early design review.

Change-Id: I23d2bcd7742965ab694ae4828f53409cb9fc807f
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/226937
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp
index e4d178a..fd6e5ac 100644
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@@ -378,14 +378,14 @@
         return vex;
     }
 
-    Assembler::Assembler(void* buf) : fCode((uint8_t*)buf), fSize(0) {}
+    Assembler::Assembler(void* buf) : fCode((uint8_t*)buf), fCurr(fCode), fSize(0) {}
 
     size_t Assembler::size() const { return fSize; }
 
     void Assembler::byte(const void* p, int n) {
-        if (fCode) {
-            memcpy(fCode, p, n);
-            fCode += n;
+        if (fCurr) {
+            memcpy(fCurr, p, n);
+            fCurr += n;
         }
         fSize += n;
     }
@@ -487,7 +487,33 @@
     void Assembler::vcvttps2dq(Ymm dst, Ymm x) { this->op(0xf3,0x0f,0x5b, dst,x); }
 
     Assembler::Label Assembler::here() {
-        return { (int)this->size() };
+        return { (int)this->size(), Label::None, {} };
+    }
+
+    // ARM 19-bit instruction count, from the beginning of this instruction.
+    int Assembler::disp19(const Label& l) {
+        return (l.offset - here().offset) / 4;
+    }
+
+    // x86 32-bit byte count, from the end of this instruction.
+    int Assembler::disp32(const Label& l) {
+        return l.offset - (here().offset + 4);
+    }
+
+    // Each as above, registering this as a location to update if l changes.
+    int Assembler::disp19(Label* l) {
+        SkASSERT(l->kind == Label::None ||
+                 l->kind == Label::ARMDisp19);
+        l->kind = Label::ARMDisp19;
+        l->references.push_back(here().offset);
+        return this->disp19(*l);
+    }
+    int Assembler::disp32(Label* l) {
+        SkASSERT(l->kind == Label::None ||
+                 l->kind == Label::X86Disp32);
+        l->kind = Label::X86Disp32;
+        l->references.push_back(here().offset);
+        return this->disp32(*l);
     }
 
     void Assembler::op(int prefix, int map, int opcode, Ymm dst, Ymm x, Label l) {
@@ -499,10 +525,7 @@
         this->byte(v.bytes, v.len);
         this->byte(opcode);
         this->byte(mod_rm(Mod::Indirect, dst&7, rip&7));
-
-        // IP relative addresses are relative to IP _after_ this instruction.
-        int imm = l.offset - (here().offset + 4);
-        this->byte(&imm, 4);
+        this->word(this->disp32(l));
     }
 
     void Assembler::vbroadcastss(Ymm dst, Label l) { this->op(0x66,0x380f,0x18, dst,l); }
@@ -513,17 +536,9 @@
         // jne can be either 2 bytes (short) or 6 bytes (near):
         //    75     one-byte-disp
         //    0F 85 four-byte-disp
-        // As usual, all displacements relative to the end of this instruction.
-        int shrt = l.offset - (here().offset + 2),
-            near = l.offset - (here().offset + 6);
-
-        if (SkTFitsIn<int8_t>(shrt)) {
-            this->byte(0x75);
-            this->byte(&shrt, 1);
-        } else {
-            this->byte(0x0f, 0x85);
-            this->byte(&near, 4);
-        }
+        // We always use the near displacement to make updating labels simpler.
+        this->byte(0x0f, 0x85);
+        this->word(this->disp32(l));
     }
 
     void Assembler::load_store(int prefix, int map, int opcode, Ymm ymm, GP64 ptr) {
@@ -640,19 +655,19 @@
 
     void Assembler::b(Condition cond, Label l) {
         // Jump in insts from before this one.
-        const int imm19 = (l.offset - here().offset) / 4;
+        const int imm19 = this->disp19(l);
         this->word( 0b0101010'0           << 24
                   | (imm19     & 19_mask) <<  5
                   | ((int)cond &  4_mask) <<  0);
     }
-    void Assembler::cbz(X t, Label l) {
-        const int imm19 = (l.offset - here().offset) / 4;
+    void Assembler::cbz(X t, Label* l) {
+        const int imm19 = this->disp19(l);
         this->word( 0b1'011010'0      << 24
                   | (imm19 & 19_mask) <<  5
                   | (t     &  5_mask) <<  0);
     }
     void Assembler::cbnz(X t, Label l) {
-        const int imm19 = (l.offset - here().offset) / 4;
+        const int imm19 = this->disp19(l);
         this->word( 0b1'011010'1      << 24
                   | (imm19 & 19_mask) <<  5
                   | (t     &  5_mask) <<  0);
@@ -667,12 +682,52 @@
     void Assembler::strb(V src, X dst) { this->op(0b00'111'1'01'00'000000000000, dst, src); }
 
     void Assembler::ldrq(V dst, Label l) {
-        const int imm19 = (l.offset - here().offset) / 4;
+        const int imm19 = this->disp19(l);
         this->word( 0b10'011'1'00     << 24
                   | (imm19 & 19_mask) << 5
                   | (dst   &  5_mask) << 0);
     }
 
+    void Assembler::label(Label* l) {
+        if (fCode) {
+            // The instructions all currently point to l->offset.
+            // We'll want to add a delta to point them to here().
+            int delta = here().offset - l->offset;
+            l->offset = here().offset;
+
+            if (l->kind == Label::ARMDisp19) {
+                for (int ref : l->references) {
+                    // ref points to a 32-bit instruction with 19-bit displacement in instructions.
+                    uint32_t inst;
+                    memcpy(&inst, fCode + ref, 4);
+
+                    // [ 8 bits to preserve] [ 19 bit signed displacement ] [ 5 bits to preserve ]
+                    int disp = (int)(inst << 8) >> 13;
+
+                    disp += delta/4;  // delta is in bytes, we want instructions.
+
+                    // Put it all back together, preserving the high 8 bits and low 5.
+                    inst = ((disp << 5) &  (19_mask << 5))
+                         | ((inst     ) & ~(19_mask << 5));
+
+                    memcpy(fCode + ref, &inst, 4);
+                }
+            }
+
+            if (l->kind == Label::X86Disp32) {
+                for (int ref : l->references) {
+                    // ref points to a 32-bit displacement in bytes.
+                    int disp;
+                    memcpy(&disp, fCode + ref, 4);
+
+                    disp += delta;
+
+                    memcpy(fCode + ref, &disp, 4);
+                }
+            }
+        }
+    }
+
 #if defined(SKVM_JIT)
     static bool can_jit(int regs, int nargs) {
     #if defined(__x86_64__)
diff --git a/src/core/SkVM.h b/src/core/SkVM.h
index 055f1fa..a0b79f0 100644
--- a/src/core/SkVM.h
+++ b/src/core/SkVM.h
@@ -83,8 +83,14 @@
         using DstEqOpX = void(Ymm dst, Ymm x);
         DstEqOpX vcvtdq2ps, vcvttps2dq;
 
-        struct Label { int offset; };
+        struct Label {
+            int                                 offset = 0;
+            enum { None, ARMDisp19, X86Disp32 } kind = None;
+            std::vector<int>                    references;
+        };
+
         Label here();
+        void label(Label*);
 
         void jne(Label);
 
@@ -144,7 +150,7 @@
         //      cmp(t,0)
         //      beq/bne(l)
         // but without setting condition flags.
-        void cbz (X t, Label l);
+        void cbz (X t, Label* l);
         void cbnz(X t, Label l);
 
         void ldrq(V dst, Label);  // 128-bit PC-relative load
@@ -194,7 +200,14 @@
         enum class Condition { eq,ne,cs,cc,mi,pl,vs,vc,hi,ls,ge,lt,gt,le,al };
         void b(Condition, Label);
 
+        int disp19(const Label&);
+        int disp32(const Label&);
+
+        int disp19(Label*);
+        int disp32(Label*);
+
         uint8_t* fCode;
+        uint8_t* fCurr;
         size_t   fSize;
     };
 
diff --git a/tests/SkVMTest.cpp b/tests/SkVMTest.cpp
index 4d6ded6..e73589e 100644
--- a/tests/SkVMTest.cpp
+++ b/tests/SkVMTest.cpp
@@ -494,33 +494,10 @@
     test_asm(r, [&](A& a) {
         A::Label l = a.here();
         a.jne(l);
-        for (int i = 0; i < 124; i++) {
-            a.nop();
-        }
-        a.jne(l);
         a.jne(l);
     },{
-        0x75, 0xfe,  // short jump -2 bytes
-
-        0x90, 0x90, 0x90, 0x90, 0x90,  0x90, 0x90, 0x90, 0x90, 0x90,
-        0x90, 0x90, 0x90, 0x90, 0x90,  0x90, 0x90, 0x90, 0x90, 0x90,
-        0x90, 0x90, 0x90, 0x90, 0x90,  0x90, 0x90, 0x90, 0x90, 0x90,
-        0x90, 0x90, 0x90, 0x90, 0x90,  0x90, 0x90, 0x90, 0x90, 0x90,
-
-        0x90, 0x90, 0x90, 0x90, 0x90,  0x90, 0x90, 0x90, 0x90, 0x90,
-        0x90, 0x90, 0x90, 0x90, 0x90,  0x90, 0x90, 0x90, 0x90, 0x90,
-        0x90, 0x90, 0x90, 0x90, 0x90,  0x90, 0x90, 0x90, 0x90, 0x90,
-        0x90, 0x90, 0x90, 0x90, 0x90,  0x90, 0x90, 0x90, 0x90, 0x90,
-
-        0x90, 0x90, 0x90, 0x90, 0x90,  0x90, 0x90, 0x90, 0x90, 0x90,
-        0x90, 0x90, 0x90, 0x90, 0x90,  0x90, 0x90, 0x90, 0x90, 0x90,
-        0x90, 0x90, 0x90, 0x90, 0x90,  0x90, 0x90, 0x90, 0x90, 0x90,
-        0x90, 0x90, 0x90, 0x90, 0x90,  0x90, 0x90, 0x90, 0x90, 0x90,
-
-        0x90, 0x90, 0x90, 0x90,
-
-        0x75, 0x80,                        // short jump -128 bytes
-        0x0f, 0x85, 0x7a,0xff,0xff,0xff,   // near jump back -134 bytes
+        0x0f,0x85, 0xfa,0xff,0xff,0xff,   // near jump -6 bytes
+        0x0f,0x85, 0xf4,0xff,0xff,0xff,   // near jump -12 bytes
     });
 
     test_asm(r, [&](A& a) {
@@ -656,7 +633,7 @@
         a.blt(l);
         a.b(l);
         a.cbnz(A::x2, l);
-        a.cbz(A::x2, l);
+        a.cbz(A::x2, &l);
     },{
         0xc0,0x03,0x5f,0xd6,
         0xa0,0x01,0x5f,0xd6,
@@ -681,6 +658,43 @@
         0x62,0xff,0xff,0xb4,   // cbz x2, #-20
     });
 
+    // Can we cbz() to a not-yet-defined label?
+    test_asm(r, [&](A& a) {
+        A::Label l;
+        a.cbz(A::x2, &l);
+        a.add(A::x3, A::x2, 32);
+        a.label(&l);
+        a.ret(A::x30);
+    },{
+        0x42,0x00,0x00,0xb4,  // cbz x2, #8
+        0x43,0x80,0x00,0x91,  // add x3, x2, #32
+        0xc0,0x03,0x5f,0xd6,  // ret
+    });
+
+    // If we start a label as a backward label,
+    // can we redefine it to be a future label?
+    // (Not sure this is useful... just want to test it works.)
+    test_asm(r, [&](A& a) {
+        A::Label l1 = a.here();
+        a.add(A::x3, A::x2, 32);
+        a.cbz(A::x2, &l1);          // This will jump backward... nothing sneaky.
+
+        A::Label l2 = a.here();     // Start off the same...
+        a.add(A::x3, A::x2, 32);
+        a.cbz(A::x2, &l2);          // Looks like this will go backward...
+        a.add(A::x2, A::x2, 4);
+        a.add(A::x3, A::x2, 32);
+        a.label(&l2);               // But no... actually forward!  What a switcheroo!
+    },{
+        0x43,0x80,0x00,0x91,  // add x3, x2, #32
+        0xe2,0xff,0xff,0xb4,  // cbz x2, #-4
+
+        0x43,0x80,0x00,0x91,  // add x3, x2, #32
+        0x62,0x00,0x00,0xb4,  // cbz x2, #12
+        0x42,0x10,0x00,0x91,  // add x2, x2, #4
+        0x43,0x80,0x00,0x91,  // add x3, x2, #32
+    });
+
     test_asm(r, [&](A& a) {
         a.ldrq(A::v0, A::x8);
         a.strq(A::v0, A::x8);