start on windows jit support

  - Replacement for the in-memory JIT's mmap, mprotect, and munmap
    (VirtualAlloc, VirtualProtect, VirtualFree).

  - dylib feature disabled (for now, but could be done with
    LoadLibrary/GetProcAddress; see the sketch after this list)

  - Mostly shares the impl with __x86_64__, but with enter
    and exit updated for the MS ABI.
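
A rough sketch of what that Windows dylib path could look like, mapping
dlopen/dlsym/dlclose onto LoadLibrary/GetProcAddress/FreeLibrary.  The
names and structure here are illustrative only, not the real
implementation:

    #include <windows.h>

    // Hypothetical Windows counterparts to the POSIX dylib calls.
    static void* open_dylib(const char* path) {
        return (void*)LoadLibraryA(path);                    // ~dlopen
    }
    static void* find_symbol(void* dylib, const char* name) {
        return (void*)GetProcAddress((HMODULE)dylib, name);  // ~dlsym
    }
    static void close_dylib(void* dylib) {
        FreeLibrary((HMODULE)dylib);                         // ~dlclose
    }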

I "rediscovered" along the way that r12 has the same low three
bits as rsp, and that means the assembler needs to handle it
specially in at least one place it's not today.  No big deal;
we can easily avoid using r12.  GP registers are all statically
allocated.  Left a warning and a TODO.
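
For the curious: in x86-64 ModRM encoding the r/m field holds only the
low three bits of a register number (REX.B supplies the fourth), and the
value 0b100 is overloaded to mean "a SIB byte follows".  Since r12's low
bits match rsp's, any memory operand based on r12 needs the same SIB
escape that rsp gets.  A minimal illustration, not Skia's assembler, with
REX prefix handling omitted:

    #include <cstdint>
    #include <vector>

    // Emit ModRM (+ optional SIB) for a [base + disp8] operand, mod=01.
    static void emit_modrm_disp8(std::vector<uint8_t>* buf,
                                 int reg, int base, int8_t disp8) {
        buf->push_back(0x40 | ((reg & 7) << 3) | (base & 7));  // mod=01
        if ((base & 7) == 0b100) {
            // rsp and r12 both land here: r/m == 100 means "SIB follows",
            // so emit one with index=100 ("no index") and this base.
            buf->push_back((0b100 << 3) | (base & 7));         // 0x24
        }
        buf->push_back((uint8_t)disp8);
    }

e.g. "mov rax, [rsp+8]" and "mov rax, [r12+8]" differ only in REX.B
(48 8B 44 24 08 vs 49 8B 44 24 08); both need the 0x24 SIB byte.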

Still need to investigate 17 GMs that are triggering asserts.

Change-Id: I0f543b0efab968e805e89dcf1f068eac1cafea38
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/298530
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
diff --git a/gn/skia.gni b/gn/skia.gni
index 1e4f81a..635b1e5 100644
--- a/gn/skia.gni
+++ b/gn/skia.gni
@@ -26,8 +26,9 @@
   skia_enable_skrive = true
   skia_enable_sksl_interpreter = is_skia_dev_build
   skia_enable_skvm_jit =
-      is_skia_dev_build && ((target_cpu == "x64" && (is_linux || is_mac)) ||
-                            (target_cpu == "arm64" && is_android))
+      is_skia_dev_build &&
+      ((target_cpu == "x64" && (is_linux || is_mac || is_win)) ||
+       (target_cpu == "arm64" && is_android))
   skia_enable_tools = is_skia_dev_build
   skia_enable_gpu_debug_layers = is_skia_dev_build && is_debug
   skia_generate_workarounds = false
diff --git a/src/core/SkVM.cpp b/src/core/SkVM.cpp
index 66fb40e..8d34278 100644
--- a/src/core/SkVM.cpp
+++ b/src/core/SkVM.cpp
@@ -38,10 +38,50 @@
 bool gSkVMJITViaDylib{false};
 
 #if defined(SKVM_JIT)
-    #include <dlfcn.h>      // dlopen, dlsym
-    #include <sys/mman.h>   // mmap, mprotect
+    #if defined(SK_BUILD_FOR_WIN)
+        #include "src/core/SkLeanWindows.h"
+        #include <memoryapi.h>
+
+        static void* alloc_jit_buffer(size_t* len) {
+            return VirtualAlloc(NULL, *len, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
+        }
+        static void unmap_jit_buffer(void* ptr, size_t len) {
+            VirtualFree(ptr, 0, MEM_RELEASE);
+        }
+        static void remap_as_executable(void* ptr, size_t len) {
+            DWORD old;
+            VirtualProtect(ptr, len, PAGE_EXECUTE_READ, &old);
+            SkASSERT(old == PAGE_READWRITE);
+        }
+        static void close_dylib(void* dylib) {
+            SkASSERT(false);  // TODO?  For now just assert we never make one.
+        }
+    #else
+        #include <dlfcn.h>
+        #include <sys/mman.h>
+
+        static void* alloc_jit_buffer(size_t* len) {
+            // While mprotect and VirtualAlloc both work at page granularity,
+            // mprotect doesn't round up for you, and instead requires *len is at page granularity.
+            const size_t page = sysconf(_SC_PAGESIZE);
+            *len = ((*len + page - 1) / page) * page;
+            return mmap(nullptr,*len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1,0);
+        }
+        static void unmap_jit_buffer(void* ptr, size_t len) {
+            munmap(ptr, len);
+        }
+        static void remap_as_executable(void* ptr, size_t len) {
+            mprotect(ptr, len, PROT_READ|PROT_EXEC);
+            __builtin___clear_cache((char*)ptr,
+                                    (char*)ptr + len);
+        }
+        static void close_dylib(void* dylib) {
+            dlclose(dylib);
+        }
+    #endif
 #endif
 
+
 namespace skvm {
 
     struct Program::Impl {
@@ -2596,9 +2636,9 @@
         fImpl->llvm_ctx.reset(nullptr);
     #elif defined(SKVM_JIT)
         if (fImpl->dylib) {
-            dlclose(fImpl->dylib);
+            close_dylib(fImpl->dylib);
         } else if (auto jit_entry = fImpl->jit_entry.load()) {
-            munmap(jit_entry, fImpl->jit_size);
+            unmap_jit_buffer(jit_entry, fImpl->jit_size);
         }
     #else
         SkASSERT(!this->hasJIT());
@@ -2767,15 +2807,67 @@
                                                   : stack_slot.size();
 
 
-    #if defined(__x86_64__)
+    #if defined(__x86_64__) || defined(_M_X64)
         if (!SkCpu::Supports(SkCpu::HSW)) {
             return false;
         }
         const int K = 8;
-        const A::GP64 N   = A::rdi,
-                      GP0 = A::rax,
-                      GP1 = A::r11,
-                      arg[]    = { A::rsi, A::rdx, A::rcx, A::r8, A::r9 };
+        #if defined(_M_X64)  // Important to check this first; clang-cl defines both.
+            const A::GP64 N = A::rcx,
+                        GP0 = A::rax,
+                        GP1 = A::r11,
+                        arg[]    = { A::rdx, A::r8, A::r9, A::r10, A::rdi };
+            auto enter = [&]{
+                // Fun extra setup to work within the MS ABI:
+                // 0) rcx,rdx,r8,r9 are all already holding their correct values,
+                //    and rax,r10,r11 can be used freely.
+                // 1) Load r10 from rsp+40 if there's a fourth arg.
+                if (fImpl->strides.size() >= 4) {
+                    a->mov(A::r10, A::Mem{A::rsp, 40});
+                }
+                // 2) Load rdi from rsp+48 if there's a fifth arg,
+                //    first preserving its original callee-saved value at rsp+8,
+                //    which is an ABI reserved shadow area usually for spilling rcx.
+                if (fImpl->strides.size() >= 5) {
+                    a->mov(A::Mem{A::rsp, 8}, A::rdi);
+                    a->mov(A::rdi, A::Mem{A::rsp, 48});
+                }
+                // 3) Save ymm6-ymm15 (really just need to save xmm6-xmm15, but this works).
+                a->sub(A::rsp, 10*K*4);
+                for (int i = 0; i < 10; i++) {
+                    a->vmovups(A::Mem{A::rsp, i*K*4}, (A::Ymm)(i+6));
+                }
+
+                // Now our normal "make space for values".
+                if (nstack_slots) { a->sub(A::rsp, nstack_slots*K*4); }
+            };
+            auto exit  = [&]{
+                if (nstack_slots) { a->add(A::rsp, nstack_slots*K*4); }
+                // Undo MS ABI setup in reverse.
+                // 3) restore ymm6-ymm15
+                for (int i = 0; i < 10; i++) {
+                    a->vmovups((A::Ymm)(i+6), A::Mem{A::rsp, i*K*4});
+                }
+                a->add(A::rsp, 10*K*4);
+                // 2) restore rdi if we used it
+                if (fImpl->strides.size() >= 5) {
+                    a->mov(A::rdi, A::Mem{A::rsp, 8});
+                }
+                // 1) no need to restore caller-saved r10
+                a->vzeroupper();
+                a->ret();
+            };
+        #elif defined(__x86_64__)
+            const A::GP64 N = A::rdi,
+                        GP0 = A::rax,
+                        GP1 = A::r11,
+                        arg[]    = { A::rsi, A::rdx, A::rcx, A::r8, A::r9 };
+
+            auto enter = [&]{ if (nstack_slots) { a->sub(A::rsp, nstack_slots*K*4); } };
+            auto exit  = [&]{ if (nstack_slots) { a->add(A::rsp, nstack_slots*K*4); }
+                              a->vzeroupper();
+                              a->ret(); };
+        #endif
 
         // All 16 ymm registers are available to use.
         using Reg = A::Ymm;
@@ -2807,7 +2899,11 @@
                    GP0   = A::x8,
                    arg[] = { A::x1, A::x2, A::x3, A::x4, A::x5, A::x6, A::x7 };
 
-        // We can use v0-v7 and v16-v31 freely; we'd need to preserve v8-v15.
+        auto enter = [&]{ if (nstack_slots) { a->sub(A::sp, A::sp, nstack_slots*K*4); } };
+        auto exit  = [&]{ if (nstack_slots) { a->add(A::sp, A::sp, nstack_slots*K*4); }
+                          a->ret(A::x30); };
+
+        // We can use v0-v7 and v16-v31 freely; we'd need to preserve v8-v15 in enter/exit.
         using Reg = A::V;
         std::array<Val,32> regs = {
              NA, NA, NA, NA,  NA, NA, NA, NA,
@@ -2888,7 +2984,7 @@
                 return r;
             };
 
-        #if defined(__x86_64__)  // Nothing special... just happens to not be used on ARM right now.
+        #if defined(__x86_64__) || defined(_M_X64)  // Nothing special... just unused on ARM.
             auto free_tmp = [&](Reg r) {
                 SkASSERT(regs[r] == TMP);
                 regs[r] = NA;
@@ -2972,7 +3068,7 @@
                 return r(id);
             };
 
-        #if defined(__x86_64__)
+        #if defined(__x86_64__) || defined(_M_X64)
             // On x86 we can work with many values directly from the stack or program constant pool.
             auto any = [&](Val v) -> A::Operand {
                 SkASSERT(v >= 0);
@@ -3000,7 +3096,7 @@
                     (void)constants[immy];
                     break;
 
-            #if defined(__x86_64__)
+            #if defined(__x86_64__) || defined(_M_X64)
                 case Op::assert_true: {
                     a->vptest (r(x), &constants[0xffffffff]);
                     A::Label all_true;
@@ -3382,27 +3478,18 @@
             return true;
         };
 
-        #if defined(__x86_64__)
+        #if defined(__x86_64__) || defined(_M_X64)
             auto jump_if_less = [&](A::Label* l) { a->jl (l); };
             auto jump         = [&](A::Label* l) { a->jmp(l); };
 
             auto add = [&](A::GP64 gp, int imm) { a->add(gp, imm); };
             auto sub = [&](A::GP64 gp, int imm) { a->sub(gp, imm); };
-
-            auto enter = [&]{ if (nstack_slots) { a->sub(A::rsp, nstack_slots*K*4); } };
-            auto exit  = [&]{ if (nstack_slots) { a->add(A::rsp, nstack_slots*K*4); }
-                              a->vzeroupper();
-                              a->ret(); };
         #elif defined(__aarch64__)
             auto jump_if_less = [&](A::Label* l) { a->blt(l); };
             auto jump         = [&](A::Label* l) { a->b  (l); };
 
             auto add = [&](A::X gp, int imm) { a->add(gp, gp, imm); };
             auto sub = [&](A::X gp, int imm) { a->sub(gp, gp, imm); };
-
-            auto enter = [&]{ if (nstack_slots) { a->sub(A::sp, A::sp, nstack_slots*K*4); } };
-            auto exit  = [&]{ if (nstack_slots) { a->add(A::sp, A::sp, nstack_slots*K*4); }
-                              a->ret(A::x30); };
         #endif
 
         A::Label body,
@@ -3510,14 +3597,8 @@
             return;
         }
 
-        // Allocate space that we can remap as executable.
-        const size_t page = sysconf(_SC_PAGESIZE);
-
-        // mprotect works at page granularity.
-        fImpl->jit_size = ((a.size() + page - 1) / page) * page;
-
-        void* jit_entry
-             = mmap(nullptr,fImpl->jit_size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1,0);
+        fImpl->jit_size = a.size();
+        void* jit_entry = alloc_jit_buffer(&fImpl->jit_size);
         fImpl->jit_entry.store(jit_entry);
 
         // Assemble the program for real.
@@ -3526,10 +3607,9 @@
         SkASSERT(a.size() <= fImpl->jit_size);
 
         // Remap as executable, and flush caches on platforms that need that.
-        mprotect(jit_entry, fImpl->jit_size, PROT_READ|PROT_EXEC);
-        __builtin___clear_cache((char*)jit_entry,
-                                (char*)jit_entry + fImpl->jit_size);
+        remap_as_executable(jit_entry, fImpl->jit_size);
 
+    #if !defined(SK_BUILD_FOR_WIN)
         // For profiling and debugging, it's helpful to have this code loaded
         // dynamically rather than just jumping info fImpl->jit_entry.
         if (gSkVMJITViaDylib) {
@@ -3556,6 +3636,7 @@
             }
             fImpl->jit_entry.store(sym);
         }
+    #endif
     }
 #endif
 
diff --git a/src/core/SkVM.h b/src/core/SkVM.h
index db5b273..46848b9 100644
--- a/src/core/SkVM.h
+++ b/src/core/SkVM.h
@@ -47,9 +47,10 @@
 
         // Order matters... GP64, Xmm, Ymm values match 4-bit register encoding for each.
         enum GP64 {
-            rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi,
-            r8 , r9 , r10, r11, r12, r13, r14, r15,
+            rax, rcx, rdx, rbx, rsp,               rbp, rsi, rdi,
+            r8 , r9 , r10, r11, r12BROKENDONOTUSE, r13, r14, r15,
         };
+        // TODO: need to fix up assembler before r12 is safe to use
         enum Xmm {
             xmm0, xmm1, xmm2 , xmm3 , xmm4 , xmm5 , xmm6 , xmm7 ,
             xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,