Win/x64: Fix improper callee save of xmm8-xmm11

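The x86-64 SIMD acceleration for Huffman encoding saved xmm8-xmm11 at
the wrong stack offsets on Windows.  It lowered rsp first and then
stored the registers at non-positive offsets from the new rsp, so three
of the four save slots lay below the stack pointer, where the push that
follows (and any later stack use) could overwrite them.  The registers
are now stored inside the area that the sub reserves and are reloaded
from the matching offsets.  The bug caused TJBench to always report
1 Mpixel/sec for the compression performance, and it likely would have
caused other application issues as well.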
diff --git a/simd/jchuff-sse2-64.asm b/simd/jchuff-sse2-64.asm
index d22efc3..84eaeeb 100644
--- a/simd/jchuff-sse2-64.asm
+++ b/simd/jchuff-sse2-64.asm
@@ -196,11 +196,11 @@
         lea     rsp, [t2]
         collect_args
 %ifdef WIN64
-        sub     rsp, 4*SIZEOF_XMMWORD
-        movaps  XMMWORD [rsp-3*SIZEOF_XMMWORD], xmm8
+        movaps  XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8
         movaps  XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9
-        movaps  XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm10
-        movaps  XMMWORD [rsp-0*SIZEOF_XMMWORD], xmm11
+        movaps  XMMWORD [rsp-3*SIZEOF_XMMWORD], xmm10
+        movaps  XMMWORD [rsp-4*SIZEOF_XMMWORD], xmm11
+        sub     rsp, 4*SIZEOF_XMMWORD
 %endif
         push rbx
 
@@ -344,10 +344,10 @@
 
         pop rbx
 %ifdef WIN64
-        movaps  xmm8, XMMWORD [rsp-3*SIZEOF_XMMWORD]
-        movaps  xmm9, XMMWORD [rsp-2*SIZEOF_XMMWORD]
-        movaps  xmm10, XMMWORD [rsp-1*SIZEOF_XMMWORD]
-        movaps  xmm11, XMMWORD [rsp-0*SIZEOF_XMMWORD]
+        movaps  xmm11, XMMWORD [rsp+0*SIZEOF_XMMWORD]
+        movaps  xmm10, XMMWORD [rsp+1*SIZEOF_XMMWORD]
+        movaps  xmm9, XMMWORD [rsp+2*SIZEOF_XMMWORD]
+        movaps  xmm8, XMMWORD [rsp+3*SIZEOF_XMMWORD]
         add     rsp, 4*SIZEOF_XMMWORD
 %endif
         uncollect_args