add sli.4s, use it in pack sometimes

We have pack(x,y,imm) = x | (y<<imm) assuming (x & (y<<imm)) == 0.

If we can destroy x, sli (shift-left-insert) lets us implement that
as x |= y << imm.  This happens quite often, so you'll see sequences
of pack that used to look like this

	shl	v4.4s, v2.4s, #8
	orr	v1.16b, v4.16b, v1.16b
	shl	v2.4s, v0.4s, #8
	orr	v0.16b, v2.16b, v3.16b
	shl	v2.4s, v0.4s, #16
	orr	v0.16b, v2.16b, v1.16b

now look like this

	sli	v1.4s, v2.4s, #8
	sli	v3.4s, v0.4s, #8
	sli	v1.4s, v3.4s, #16

We can do this thanks to the new simultaneous register assignment
and instruction selection I added.  We used to never hit this case.

Change-Id: I75fa3defc1afd38779b3993887ca302a0885c5b1
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/228611
Reviewed-by: Mike Klein <mtklein@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp
index 7f859ba..fc3672a 100644
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@@ -549,6 +549,9 @@
                   | (d    &  5_mask) <<  0);
     }
 
+    void Assembler::sli4s(V d, V n, int imm) {
+        this->op(0b0'1'1'011110'0100'000'01010'1,    ( imm&31), n, d);
+    }
     void Assembler::shl4s(V d, V n, int imm) {
         this->op(0b0'1'0'011110'0100'000'01010'1,    ( imm&31), n, d);
     }
@@ -1277,10 +1280,11 @@
                                   else     { a->and16b(dst(), r[x], r[y]); }
                                              break;
 
-                // TODO: use vsli when avail & (1<<r[x])
-                case Op::pack: a->shl4s (tmp(), r[y], imm);
-                               a->orr16b(dst(), tmp(), r[x]);
-                               break;
+                case Op::pack:
+                    if (avail & (1<<r[x])) { set_dst(r[x]); a->sli4s ( r[x],  r[y],  imm); }
+                    else                   {                a->shl4s (tmp(),  r[y],  imm);
+                                                            a->orr16b(dst(), tmp(), r[x]); }
+                                                            break;
 
                 case Op::to_f32: a->scvtf4s (dst(), r[x]); break;
                 case Op::to_i32: a->fcvtzs4s(dst(), r[x]); break;
diff --git a/src/core/SkVM.h b/src/core/SkVM.h
index a1b8d68..358c96c 100644
--- a/src/core/SkVM.h
+++ b/src/core/SkVM.h
@@ -129,7 +129,8 @@
 
         // d = op(n,imm)
         using DOpNImm = void(V d, V n, int imm);
-        DOpNImm shl4s, sshr4s, ushr4s,
+        DOpNImm sli4s,
+                shl4s, sshr4s, ushr4s,
                                ushr8h;
 
         // d = op(n)
@@ -300,7 +301,7 @@
         I32 sra(I32 x, int bits);
 
         I32 extract(I32 x, int bits, I32 y);   // (x >> bits) & y
-        I32 pack   (I32 x, I32 y, int bits);   // x | (y << bits)
+        I32 pack   (I32 x, I32 y, int bits);   // x | (y << bits), assuming (x & (y << bits)) == 0
 
         // Shuffle the bytes in x according to each nibble of control, as if
         //
diff --git a/tests/SkVMTest.cpp b/tests/SkVMTest.cpp
index 4b46633..de383d6 100644
--- a/tests/SkVMTest.cpp
+++ b/tests/SkVMTest.cpp
@@ -690,6 +690,20 @@
     });
 
     test_asm(r, [&](A& a) {
+        a.sli4s(A::v4, A::v3,  0);
+        a.sli4s(A::v4, A::v3,  1);
+        a.sli4s(A::v4, A::v3,  8);
+        a.sli4s(A::v4, A::v3, 16);
+        a.sli4s(A::v4, A::v3, 31);
+    },{
+        0x64,0x54,0x20,0x6f,
+        0x64,0x54,0x21,0x6f,
+        0x64,0x54,0x28,0x6f,
+        0x64,0x54,0x30,0x6f,
+        0x64,0x54,0x3f,0x6f,
+    });
+
+    test_asm(r, [&](A& a) {
         a.scvtf4s (A::v4, A::v3);
         a.fcvtzs4s(A::v4, A::v3);
     },{