little steps forward

   - implement splat and store32, which lets us handle the conceptually
     simplest program, a 32-bit memset.

   - dump bitcode to /tmp/<debug_name>.bc on success

Kind of starting to look good!

$ ninja -C out dm && out/dm -m SkVM_Pointless
$ opt --O1 /tmp/skvm-jit-211960346.bc | llvm-dis

    ; Function Attrs: nofree norecurse nounwind writeonly
    define void @skvm-jit-211960346(i64, i8* nocapture) local_unnamed_addr #0 {
    enter:
      %2 = icmp ugt i64 %0, 7
      br i1 %2, label %loopK, label %test1.preheader

    test1.preheader:                                  ; preds = %loopK, %enter
      %.07.lcssa = phi i64 [ %0, %enter ], [ %5, %loopK ]
      %.0.lcssa = phi i8* [ %1, %enter ], [ %6, %loopK ]
      %3 = icmp eq i64 %.07.lcssa, 0
      br i1 %3, label %leave, label %loop1

    loopK:                                            ; preds = %enter, %loopK
      %.012 = phi i8* [ %6, %loopK ], [ %1, %enter ]
      %.0711 = phi i64 [ %5, %loopK ], [ %0, %enter ]
      %4 = bitcast i8* %.012 to <8 x i32>*
      store <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>, <8 x i32>* %4, align 1
      %5 = add i64 %.0711, -8
      %6 = getelementptr i8, i8* %.012, i64 32
      %7 = icmp ugt i64 %5, 7
      br i1 %7, label %loopK, label %test1.preheader

    loop1:                                            ; preds = %test1.preheader, %loop1
      %.110 = phi i8* [ %10, %loop1 ], [ %.0.lcssa, %test1.preheader ]
      %.189 = phi i64 [ %9, %loop1 ], [ %.07.lcssa, %test1.preheader ]
      %8 = bitcast i8* %.110 to i32*
      store i32 42, i32* %8, align 1
      %9 = add i64 %.189, -1
      %10 = getelementptr i8, i8* %.110, i64 4
      %11 = icmp eq i64 %9, 0
      br i1 %11, label %leave, label %loop1

    leave:                                            ; preds = %loop1, %test1.preheader
      ret void
    }

    attributes #0 = { nofree norecurse nounwind writeonly }

Change-Id: I00953c1113739a9ee094cb6cb3c99f1b7f8de9bf
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/273509
Reviewed-by: Herb Derby <herb@google.com>
diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp
index 8d24258..a8b3829 100644
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@@ -17,6 +17,7 @@
 #include "src/core/SkVM.h"
 
 #if defined(SKVM_LLVM)
+    #include <llvm/Bitcode/BitcodeWriter.h>
     #include <llvm/IR/IRBuilder.h>
     #include <llvm/IR/Verifier.h>
 #endif
@@ -1895,13 +1896,19 @@
     // Smallest program:
     // b.store32(b.varying<int>(), b.splat(42));
     static bool try_llvm(const std::vector<OptimizedInstruction>& instructions,
-                         const std::vector<int>& strides) {
+                         const std::vector<int>& strides,
+                         const char* debug_name) {
         llvm::LLVMContext ctx;
         llvm::Module mod("", ctx);
         // All the scary bare pointers from here on are owned by ctx or mod, I think.
 
-        llvm::IntegerType* i64 = llvm::Type::getInt64Ty(ctx);
-        llvm::Type* ptr = llvm::Type::getInt8Ty(ctx)->getPointerTo();
+        const int K = 8;   // Primary vector width.
+        llvm::Type        *ptr = llvm::Type::getInt8Ty(ctx)->getPointerTo();
+      //llvm::Type        *f32 = llvm::Type::getFloatTy(ctx);
+        llvm::IntegerType *i32 = llvm::Type::getInt32Ty(ctx),
+                          *i64 = llvm::Type::getInt64Ty(ctx);
+      //llvm::VectorType  *I32 = llvm::VectorType::get(i32, K),
+      //                  *F32 = llvm::VectorType::get(f32, K);
 
         std::vector<llvm::Type*> arg_types = { i64 };
         for (size_t i = 0; i < strides.size(); i++) {
@@ -1911,7 +1918,7 @@
         llvm::FunctionType* fn_type = llvm::FunctionType::get(llvm::Type::getVoidTy(ctx),
                                                               arg_types, /*vararg?=*/false);
         llvm::Function* fn
-            = llvm::Function::Create(fn_type, llvm::GlobalValue::ExternalLinkage, "", mod);
+            = llvm::Function::Create(fn_type, llvm::GlobalValue::ExternalLinkage, debug_name, mod);
 
         llvm::BasicBlock *enter = llvm::BasicBlock::Create(ctx, "enter", fn),
                          *testK = llvm::BasicBlock::Create(ctx, "testK", fn),
@@ -1922,17 +1929,39 @@
 
         using IRBuilder = llvm::IRBuilder<>;
 
+        llvm::Value* n;
+        std::vector<llvm::Value*> args;
+        std::vector<llvm::Value*> vals(instructions.size());
+
         auto emit = [&](size_t i, bool scalar, IRBuilder* b) {
-            const OptimizedInstruction& insn = instructions[i];
-            switch (insn.op) {
-                default: return false;
+            auto [op, x,y,z, immy,immz, death,can_hoist,used_in_loop] = instructions[i];
+            switch (op) {
+                default:
+                    SkDebugf("can't llvm %s (%d)\n", name(op), op);
+                    return false;
+
+                case Op::store32: {
+                    llvm::Value* v = vals[x];
+                    if (scalar) {
+                        v = b->CreateExtractElement(v, (uint64_t)0);
+                    }
+                    llvm::Value* ptr = b->CreateBitCast(b->CreateLoad(args[immy]),
+                                                        v->getType()->getPointerTo());
+                    vals[i] = b->CreateAlignedStore(v, ptr, 1);
+                } break;
+
+                // Ops below this line shouldn't need to consider `scalar`... they're Just Math.
+
+                case Op::splat:
+                    vals[i] = llvm::ConstantVector::getSplat(K, llvm::ConstantInt::get(i32, immy));
+                    break;
+
             }
             return true;
         };
 
-        // enter:  set up stack homes for N and each pointer arg
-        llvm::Value* n;
-        std::vector<llvm::Value*> args;
+        // enter:  set up stack homes `n` and `args` for loop counter and uniform/varying pointers.
+        // TODO: manual PHI nodes for these instead of relying on load/store and mem2reg
         {
             IRBuilder b(enter);
 
@@ -1949,14 +1978,13 @@
         }
 
         // testK:  if (N >= K) goto loopK; else goto test1;
-        const int K = 8;
         llvm::ConstantInt* i64_K = llvm::ConstantInt::get(i64, K);
         {
             IRBuilder b(testK);
             b.CreateCondBr(b.CreateICmpUGE(b.CreateLoad(n), i64_K), loopK, test1);
         }
 
-        // loopK:  ... insns on K x T vectors; N -= K, args += K*stride; goto testK;
+        // loopK:  ... insts on K x T vectors; N -= K, args += K*stride; goto testK;
         {
             IRBuilder b(loopK);
             for (size_t i = 0; i < instructions.size(); i++) {
@@ -1979,7 +2007,7 @@
             b.CreateCondBr(b.CreateICmpUGE(b.CreateLoad(n), i64_1), loop1, leave);
         }
 
-        // loop1:  ... insns on scalars; N -= 1, args += stride; goto test1;
+        // loop1:  ... insts on scalars; N -= 1, args += stride; goto test1;
         {
             IRBuilder b(loop1);
             for (size_t i = 0; i < instructions.size(); i++) {
@@ -2002,6 +2030,14 @@
         }
 
         SkASSERT(false == llvm::verifyModule(mod));
+
+        SkString path = SkStringPrintf("/tmp/%s.bc", debug_name);
+        std::error_code err;
+        llvm::raw_fd_ostream os(path.c_str(), err);
+        if (err) {
+            return false;
+        }
+        llvm::WriteBitcodeToFile(mod, os);
         return true;
     }
 #endif
@@ -2056,13 +2092,6 @@
     Program::Program(const std::vector<OptimizedInstruction>& interpreter,
                      const std::vector<int>& strides) : fStrides(strides) {
         this->setupInterpreter(interpreter);
-    #if defined(SKVM_LLVM)
-        if (try_llvm(interpreter, fStrides)) {
-            SkDebugf("hey, neat!  that might work\n");
-        } else {
-            SkDebugf("bummer\n");
-        }
-    #endif
     }
 
     Program::Program(const std::vector<OptimizedInstruction>& interpreter,
@@ -2072,6 +2101,14 @@
     #if 1 && defined(SKVM_JIT)
         this->setupJIT(jit, debug_name);
     #endif
+
+    #if defined(SKVM_LLVM)
+        if (try_llvm(interpreter, fStrides, debug_name)) {
+            SkDebugf("hey, neat!  that might work\n");
+        } else {
+            SkDebugf("bummer\n");
+        }
+    #endif
     }
 
     // Translate OptimizedInstructions to Program::Instructions used by the interpreter.