Win/x64: Fix improper callee save of xmm8-xmm11
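The x86-64 SIMD acceleration code for Huffman encoding used incorrect
stack math to save xmm8-xmm11 on Windows: it lowered rsp by
4*SIZEOF_XMMWORD first and then stored the registers at negative
offsets from the new rsp, so three of the four saves landed below the
stack pointer, where the pushes that follow could overwrite them.
This caused TJBench to always report 1 Mpixel/sec for the compression
performance, and it likely would have caused other application issues
as well.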
diff --git a/simd/jchuff-sse2-64.asm b/simd/jchuff-sse2-64.asm
index d22efc3..84eaeeb 100644
--- a/simd/jchuff-sse2-64.asm
+++ b/simd/jchuff-sse2-64.asm
@@ -196,11 +196,11 @@
lea rsp, [t2]
collect_args
%ifdef WIN64
- sub rsp, 4*SIZEOF_XMMWORD
- movaps XMMWORD [rsp-3*SIZEOF_XMMWORD], xmm8
+ movaps XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8
movaps XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9
- movaps XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm10
- movaps XMMWORD [rsp-0*SIZEOF_XMMWORD], xmm11
+ movaps XMMWORD [rsp-3*SIZEOF_XMMWORD], xmm10
+ movaps XMMWORD [rsp-4*SIZEOF_XMMWORD], xmm11
+ sub rsp, 4*SIZEOF_XMMWORD
%endif
push rbx
@@ -344,10 +344,10 @@
pop rbx
%ifdef WIN64
- movaps xmm8, XMMWORD [rsp-3*SIZEOF_XMMWORD]
- movaps xmm9, XMMWORD [rsp-2*SIZEOF_XMMWORD]
- movaps xmm10, XMMWORD [rsp-1*SIZEOF_XMMWORD]
- movaps xmm11, XMMWORD [rsp-0*SIZEOF_XMMWORD]
+ movaps xmm11, XMMWORD [rsp+0*SIZEOF_XMMWORD]
+ movaps xmm10, XMMWORD [rsp+1*SIZEOF_XMMWORD]
+ movaps xmm9, XMMWORD [rsp+2*SIZEOF_XMMWORD]
+ movaps xmm8, XMMWORD [rsp+3*SIZEOF_XMMWORD]
add rsp, 4*SIZEOF_XMMWORD
%endif
uncollect_args