Fixed a regression caused by a bug in the 32-bit strict memory access code in jdmrgss2.asm (code that was contributed by Chromium to stop Valgrind from complaining whenever the output buffer size was not evenly divisible by 16 bytes).  On Linux/x86, this regression caused incorrect pixels on the right-hand side of images whose rows were not 16-byte aligned, whenever fancy upsampling was used and the image was decompressed to a 32-bit (RGBX, etc.) buffer.

git-svn-id: svn+ssh:// 632fc199-4ca6-4c93-a231-07263d6284db
diff --git a/simd/jdmrgss2-64.asm b/simd/jdmrgss2-64.asm
index 36e2582..ba3de35 100644
--- a/simd/jdmrgss2-64.asm
+++ b/simd/jdmrgss2-64.asm
@@ -12,7 +12,7 @@
 ; This file should be assembled with NASM (Netwide Assembler),
 ; can *not* be assembled with Microsoft's MASM or any compatible
 ; assembler (including Borland's Turbo Assembler).
-; NASM is available from for
+; NASM is available from or
 ; [TAB8]
diff --git a/simd/jdmrgss2.asm b/simd/jdmrgss2.asm
index 6a0dbd9..a00e539 100644
--- a/simd/jdmrgss2.asm
+++ b/simd/jdmrgss2.asm
@@ -478,9 +478,9 @@
 	cmp	ecx, byte SIZEOF_XMMWORD/8
 	jb	short .column_st7
 	movq	MMWORD [edi], xmmA
-	add	edi, byte SIZEOF_XMMWORD/2
+	add	edi, byte SIZEOF_XMMWORD/8*4
 	sub	ecx, byte SIZEOF_XMMWORD/8
-	psrldq	xmmA, 64
+	psrldq	xmmA, SIZEOF_XMMWORD/8*4
 	; Store one pixel (4 bytes) of xmmA to the output when it has enough
 	; space.