Reland "Reland "add op array32 for indirect uniform access""

The previous reland forgot to load the pointer to the array in the arm64 JIT case; this version adds that load.

This is a reland of fe2506f3cac8edee2fb83b2ba681d8be0d3a20ae

Bug=skia:11822

Original change's description:
> Reland "add op array32 for indirect uniform access"
>
> This is a reland of ac2d053ccfe80775b8144c069bf1f8660a5e8f9a
>
> The original CL was reverted because of a bug in the hash
> function.
>
> Bug=skia:11822
>
> Original change's description:
> > add op array32 for indirect uniform access
> >
> > Change-Id: I6249594a2348c7b24e4f057cce2f4e8a6a2c4409
> > Reviewed-on: https://skia-review.googlesource.com/c/skia/+/431676
> > Reviewed-by: Brian Osman <brianosman@google.com>
> > Commit-Queue: Herb Derby <herb@google.com>
>
> Change-Id: I94604f5589c72d342c39cad44540d810ed7f31a1
> Reviewed-on: https://skia-review.googlesource.com/c/skia/+/432797
> Reviewed-by: Brian Osman <brianosman@google.com>
> Commit-Queue: Herb Derby <herb@google.com>

Change-Id: I185fc9554dcb6aa7ce367814ce2c69603074c434
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/433356
Reviewed-by: Brian Osman <brianosman@google.com>
Commit-Queue: Herb Derby <herb@google.com>
diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp
index bc49f16..cf9514b 100644
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@@ -233,7 +233,8 @@
              z = inst.z,
              w = inst.w;
         int immA = inst.immA,
-            immB = inst.immB;
+            immB = inst.immB,
+            immC = inst.immC;
         switch (op) {
             case Op::assert_true: write(o, op, V{x}, V{y}); break;
 
@@ -256,6 +257,7 @@
             case Op::gather32: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;
 
             case Op::uniform32: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;
+            case Op::array32:   write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, Hex{immC}); break;
 
             case Op::splat: write(o, V{id}, "=", op, Splat{immA}); break;
 
@@ -346,7 +348,8 @@
                   z = inst.z,
                   w = inst.w;
             int immA = inst.immA,
-                immB = inst.immB;
+                immB = inst.immB,
+                immC = inst.immC;
             switch (op) {
                 case Op::assert_true: write(o, op, R{x}, R{y}); break;
 
@@ -369,6 +372,7 @@
                 case Op::gather32: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;
 
                 case Op::uniform32: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;
+                case Op::array32:   write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, Hex{immC}); break;
 
                 case Op::splat:     write(o, R{d}, "=", op, Splat{immA}); break;
 
@@ -467,7 +471,8 @@
         std::vector<OptimizedInstruction> optimized(program.size());
         for (Val id = 0; id < (Val)program.size(); id++) {
             Instruction inst = program[id];
-            optimized[id] = {inst.op, inst.x,inst.y,inst.z,inst.w, inst.immA,inst.immB,
+            optimized[id] = {inst.op, inst.x,inst.y,inst.z,inst.w,
+                             inst.immA,inst.immB,inst.immC,
                              /*death=*/id, /*can_hoist=*/true};
         }
 
@@ -540,7 +545,8 @@
             && a.z    == b.z
             && a.w    == b.w
             && a.immA == b.immA
-            && a.immB == b.immB;
+            && a.immB == b.immB
+            && a.immC == b.immC;
     }
 
     uint32_t InstructionHash::operator()(const Instruction& inst, uint32_t seed) const {
@@ -618,6 +624,10 @@
         return {this, push(Op::uniform32, NA,NA,NA,NA, ptr.ix, offset)};
     }
 
+    I32 Builder::array32  (Ptr ptr, int offset, int index) {
+        return {this, push(Op::array32, NA,NA,NA,NA, ptr.ix, offset, index)};  // immA,immB,immC
+    }
+
     I32 Builder::splat(int n) { return {this, push(Op::splat, NA,NA,NA,NA, n) }; }
 
     // Be careful peepholing float math!  Transformations you might expect to
@@ -2363,7 +2373,6 @@
                     // Put it all back together, preserving the high 8 bits and low 5.
                     inst = ((disp << 5) &  (19_mask << 5))
                          | ((inst     ) & ~(19_mask << 5));
-
                     memcpy(fCode + ref, &inst, 4);
                 }
             }
@@ -2481,7 +2490,7 @@
         std::vector<llvm::Value*> vals(instructions.size());
 
         auto emit = [&](size_t i, bool scalar, IRBuilder* b) {
-            auto [op, x,y,z,w, immA,immB, death,can_hoist] = instructions[i];
+            auto [op, x,y,z,w, immA,immB,immC, death,can_hoist] = instructions[i];
 
             llvm::Type *i1    = llvm::Type::getInt1Ty (*ctx),
                        *i8    = llvm::Type::getInt8Ty (*ctx),
@@ -2985,6 +2994,7 @@
                 lookup_register(inst.w),
                 inst.immA,
                 inst.immB,
+                inst.immC,
             };
             fImpl->instructions.push_back(pinst);
         };
@@ -3209,7 +3219,8 @@
                       z = inst.z,
                       w = inst.w;
             const int immA = inst.immA,
-                      immB = inst.immB;
+                      immB = inst.immB,
+                      immC = inst.immC;
 
             // alloc_tmp() returns the first of N adjacent temporary registers,
             // each freed manually with free_tmp() or noted as our result with mark_tmp_as_dst().
@@ -3613,6 +3624,10 @@
                 case Op::uniform32: a->vbroadcastss(dst(), A::Mem{arg[immA], immB});
                                     break;
 
+                case Op::array32: a->mov(GP0, A::Mem{arg[immA], immB});
+                                  a->vbroadcastss(dst(), A::Mem{GP0, immC});
+                                  break;
+
                 case Op::index: a->vmovd((A::Xmm)dst(), N);
                                 a->vbroadcastss(dst(), dst());
                                 a->vpsubd(dst(), dst(), &iota);
@@ -3885,6 +3900,12 @@
                                     a->ld1r4s(dst(), GP0);
                                     break;
 
+                case Op::array32: a->add(GP0, arg[immA], immB);
+                                  a->ldrd(GP0, GP0);
+                                  a->add(GP0, GP0, immC);
+                                  a->ld1r4s(dst(), GP0);
+                                  break;
+
                 case Op::gather8: {
                     // As usual, the gather base pointer is immB bytes off of uniform immA.
                     a->add (GP0, arg[immA], immB);  // GP0 = &(gather base pointer)
diff --git a/src/core/SkVM.h b/src/core/SkVM.h
index e594a0c..ebe3546 100644
--- a/src/core/SkVM.h
+++ b/src/core/SkVM.h
@@ -439,6 +439,7 @@
         M(index)                                                     \
         M(gather8)  M(gather16)  M(gather32)                         \
                                  M(uniform32)                        \
+                                 M(array32)                          \
         M(splat)                                                     \
         M(add_f32) M(add_i32)                                        \
         M(sub_f32) M(sub_i32)                                        \
@@ -554,9 +555,9 @@
 
     SK_BEGIN_REQUIRE_DENSE
     struct Instruction {
-        Op  op;         // v* = op(x,y,z,w,immA,immB), where * == index of this Instruction.
-        Val x,y,z,w;    // Enough arguments for Op::store128.
-        int immA,immB;  // Immediate bit pattern, shift count, pointer index, byte offset, etc.
+        Op  op;              // v* = op(x,y,z,w,immA,immB,immC), * == index of this Instruction.
+        Val x,y,z,w;         // Enough arguments for Op::store128.
+        int immA,immB,immC;  // Immediate bit pattern, shift count, pointer index, byte offset, etc.
     };
     SK_END_REQUIRE_DENSE
 
@@ -568,7 +569,7 @@
     struct OptimizedInstruction {
         Op op;
         Val x,y,z,w;
-        int immA,immB;
+        int immA,immB,immC;
 
         Val  death;
         bool can_hoist;
@@ -632,6 +633,9 @@
         I32 uniform32(Ptr ptr, int offset);
         F32 uniformF (Ptr ptr, int offset) { return pun_to_F32(uniform32(ptr,offset)); }
 
+        // Load i32/f32 at byte `index` in the array whose pointer is at byte `offset` of ptr.
+        I32 array32  (Ptr ptr, int offset, int index);
+
         // Push and load this color as a uniform.
         Color uniformColor(SkColor4f, Uniforms*);
 
@@ -936,8 +940,9 @@
         }
 
     private:
-        Val push(Op op, Val x=NA, Val y=NA, Val z=NA, Val w=NA, int immA=0, int immB=0) {
-            return this->push(Instruction{op, x,y,z,w, immA,immB});
+        Val push(  // Packs its arguments into an Instruction and forwards to push(Instruction).
+                Op op, Val x=NA, Val y=NA, Val z=NA, Val w=NA, int immA=0, int immB=0, int immC=0) {
+            return this->push(Instruction{op, x,y,z,w, immA,immB,immC});
         }
 
         template <typename T>
@@ -963,7 +968,7 @@
     struct InterpreterInstruction {
         Op  op;
         Reg d,x,y,z,w;
-        int immA,immB;
+        int immA,immB,immC;  // As of this change, immC is read only by Op::array32.
     };
 
     class Program {
diff --git a/src/opts/SkVM_opts.h b/src/opts/SkVM_opts.h
index 3f9132a..cb92fdb 100644
--- a/src/opts/SkVM_opts.h
+++ b/src/opts/SkVM_opts.h
@@ -93,7 +93,8 @@
                       z = inst.z,
                       w = inst.w;
                 int immA = inst.immA,
-                    immB = inst.immB;
+                    immB = inst.immB,
+                    immC = inst.immC;
 
                 // Ops that interact with memory need to know whether we're stride=1 or K,
                 // but all non-memory ops can run the same code no matter the stride.
@@ -216,6 +217,12 @@
                         r[d].i32 = *(const int*)( (const char*)args[immA] + immB );
                         break;
 
+                    CASE(Op::array32):
+                        const int* ptr;
+                        memcpy(&ptr, (const uint8_t*)args[immA] + immB, sizeof(ptr));
+                        r[d].i32 = ptr[immC/sizeof(int)];
+                        break;
+
                     CASE(Op::splat): r[d].i32 = immA; break;
 
                     CASE(Op::add_f32): r[d].f32 = r[x].f32 + r[y].f32; break;
diff --git a/tests/SkVMTest.cpp b/tests/SkVMTest.cpp
index 3d7ef42..0432fc4 100644
--- a/tests/SkVMTest.cpp
+++ b/tests/SkVMTest.cpp
@@ -769,6 +769,46 @@
     });
 }
 
+DEF_TEST(SKVM_array32, r) {
+    skvm::Builder b;
+    {
+        skvm::Ptr buf0     = b.varying<int32_t>(),
+                  buf1     = b.varying<int32_t>(),
+                  uniforms = b.uniform();
+
+        skvm::I32 x = b.array32(uniforms, 0, 0);  // Array pointer at uniforms+0; element at byte 0.
+        b.store32(buf0, x);
+        skvm::I32 y = b.array32(uniforms, 0, 4);  // Same array pointer; element at byte 4.
+        b.store32(buf1, y);
+    }
+
+    test_jit_and_interpreter(b, [&](const skvm::Program& program) {
+        const int K = 20;
+        int i[2] = {3, 7};
+        struct {
+            int* g;
+        } uniforms{i};  // The uniform block holds a pointer to the array, not its contents.
+        int32_t buf0[K];
+        int32_t buf1[K];
+
+        program.eval(K, buf0, buf1, &uniforms);
+        for (auto v : buf0) {
+            REPORTER_ASSERT(r, v == 3);
+        }
+        for (auto v : buf1) {
+            REPORTER_ASSERT(r, v == 7);
+        }
+        i[0] = 4;  // Change the array behind the pointer; the next eval must observe it.
+        program.eval(K, buf0, buf1, &uniforms);
+        for (auto v : buf0) {
+            REPORTER_ASSERT(r, v == 4);
+        }
+        for (auto v : buf1) {
+            REPORTER_ASSERT(r, v == 7);
+        }
+    });
+}
+
 DEF_TEST(SkVM_sqrt, r) {
     skvm::Builder b;
     auto buf = b.varying<int>();