add sli.4s, use it in pack sometimes
We have pack(x,y,imm) = x | (y<<imm), assuming (x & (y<<imm)) == 0.
If we can destroy x, sli (shift-left-insert) lets us implement that
in place as x |= y << imm. (sli keeps only the low imm bits of x, so
this also relies on x having no bits set at or above bit imm.) This
happens quite often, so you'll see sequences of pack that used to
look like this:
shl v4.4s, v2.4s, #8
orr v1.16b, v4.16b, v1.16b
shl v2.4s, v0.4s, #8
orr v0.16b, v2.16b, v3.16b
shl v2.4s, v0.4s, #16
orr v0.16b, v2.16b, v1.16b
now look like this:
sli v1.4s, v2.4s, #8
sli v3.4s, v0.4s, #8
sli v1.4s, v3.4s, #16
We can do this thanks to the new simultaneous register assignment
and instruction selection I added. Previously we never hit this case
(the register holding x being available to overwrite).
Change-Id: I75fa3defc1afd38779b3993887ca302a0885c5b1
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/228611
Reviewed-by: Mike Klein <mtklein@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp
index 7f859ba..fc3672a 100644
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@@ -549,6 +549,9 @@
| (d & 5_mask) << 0);
}
+ void Assembler::sli4s(V d, V n, int imm) {
+ this->op(0b0'1'1'011110'0100'000'01010'1, ( imm&31), n, d);
+ }
void Assembler::shl4s(V d, V n, int imm) {
this->op(0b0'1'0'011110'0100'000'01010'1, ( imm&31), n, d);
}
@@ -1277,10 +1280,11 @@
else { a->and16b(dst(), r[x], r[y]); }
break;
- // TODO: use vsli when avail & (1<<r[x])
- case Op::pack: a->shl4s (tmp(), r[y], imm);
- a->orr16b(dst(), tmp(), r[x]);
- break;
+ case Op::pack:
+ if (avail & (1<<r[x])) { set_dst(r[x]); a->sli4s ( r[x], r[y], imm); }
+ else { a->shl4s (tmp(), r[y], imm);
+ a->orr16b(dst(), tmp(), r[x]); }
+ break;
case Op::to_f32: a->scvtf4s (dst(), r[x]); break;
case Op::to_i32: a->fcvtzs4s(dst(), r[x]); break;
diff --git a/src/core/SkVM.h b/src/core/SkVM.h
index a1b8d68..358c96c 100644
--- a/src/core/SkVM.h
+++ b/src/core/SkVM.h
@@ -129,7 +129,8 @@
// d = op(n,imm)
using DOpNImm = void(V d, V n, int imm);
- DOpNImm shl4s, sshr4s, ushr4s,
+ DOpNImm sli4s,
+ shl4s, sshr4s, ushr4s,
ushr8h;
// d = op(n)
@@ -300,7 +301,7 @@
I32 sra(I32 x, int bits);
I32 extract(I32 x, int bits, I32 y); // (x >> bits) & y
- I32 pack (I32 x, I32 y, int bits); // x | (y << bits)
+ I32 pack (I32 x, I32 y, int bits); // x | (y << bits), assuming (x & (y << bits)) == 0
// Shuffle the bytes in x according to each nibble of control, as if
//
diff --git a/tests/SkVMTest.cpp b/tests/SkVMTest.cpp
index 4b46633..de383d6 100644
--- a/tests/SkVMTest.cpp
+++ b/tests/SkVMTest.cpp
@@ -690,6 +690,20 @@
});
test_asm(r, [&](A& a) {
+ a.sli4s(A::v4, A::v3, 0);
+ a.sli4s(A::v4, A::v3, 1);
+ a.sli4s(A::v4, A::v3, 8);
+ a.sli4s(A::v4, A::v3, 16);
+ a.sli4s(A::v4, A::v3, 31);
+ },{
+ 0x64,0x54,0x20,0x6f,
+ 0x64,0x54,0x21,0x6f,
+ 0x64,0x54,0x28,0x6f,
+ 0x64,0x54,0x30,0x6f,
+ 0x64,0x54,0x3f,0x6f,
+ });
+
+ test_asm(r, [&](A& a) {
a.scvtf4s (A::v4, A::v3);
a.fcvtzs4s(A::v4, A::v3);
},{