x86 SIMD: Make capitalization of NASM types consistent

byte, word, dword, qword, oword, and yword are all assembler keywords,
so write them in lowercase so that they are not mistaken for macros or
constants, which are written in uppercase.
diff --git a/simd/i386/jccolext-avx2.asm b/simd/i386/jccolext-avx2.asm
index 11674d5..c46d684 100644
--- a/simd/i386/jccolext-avx2.asm
+++ b/simd/i386/jccolext-avx2.asm
@@ -108,12 +108,12 @@
     test        cl, SIZEOF_BYTE
     jz          short .column_ld2
     sub         ecx, byte SIZEOF_BYTE
-    movzx       eax, BYTE [esi+ecx]
+    movzx       eax, byte [esi+ecx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         ecx, byte SIZEOF_WORD
-    movzx       edx, WORD [esi+ecx]
+    movzx       edx, word [esi+ecx]
     shl         eax, WORD_BIT
     or          eax, edx
 .column_ld4:
diff --git a/simd/i386/jccolext-mmx.asm b/simd/i386/jccolext-mmx.asm
index c18dbc4..6357a42 100644
--- a/simd/i386/jccolext-mmx.asm
+++ b/simd/i386/jccolext-mmx.asm
@@ -109,13 +109,13 @@
     jz          short .column_ld2
     sub         ecx, byte SIZEOF_BYTE
     xor         eax, eax
-    mov         al, BYTE [esi+ecx]
+    mov         al, byte [esi+ecx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         ecx, byte SIZEOF_WORD
     xor         edx, edx
-    mov         dx, WORD [esi+ecx]
+    mov         dx, word [esi+ecx]
     shl         eax, WORD_BIT
     or          eax, edx
 .column_ld4:
@@ -125,7 +125,7 @@
     test        cl, SIZEOF_DWORD
     jz          short .column_ld8
     sub         ecx, byte SIZEOF_DWORD
-    movd        mmG, DWORD [esi+ecx]
+    movd        mmG, dword [esi+ecx]
     psllq       mmA, DWORD_BIT
     por         mmA, mmG
 .column_ld8:
@@ -195,7 +195,7 @@
     test        cl, SIZEOF_MMWORD/8
     jz          short .column_ld2
     sub         ecx, byte SIZEOF_MMWORD/8
-    movd        mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
+    movd        mmA, dword [esi+ecx*RGB_PIXELSIZE]
 .column_ld2:
     test        cl, SIZEOF_MMWORD/4
     jz          short .column_ld4
diff --git a/simd/i386/jccolext-sse2.asm b/simd/i386/jccolext-sse2.asm
index 06b02d5..c6c8085 100644
--- a/simd/i386/jccolext-sse2.asm
+++ b/simd/i386/jccolext-sse2.asm
@@ -107,12 +107,12 @@
     test        cl, SIZEOF_BYTE
     jz          short .column_ld2
     sub         ecx, byte SIZEOF_BYTE
-    movzx       eax, BYTE [esi+ecx]
+    movzx       eax, byte [esi+ecx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         ecx, byte SIZEOF_WORD
-    movzx       edx, WORD [esi+ecx]
+    movzx       edx, word [esi+ecx]
     shl         eax, WORD_BIT
     or          eax, edx
 .column_ld4:
diff --git a/simd/i386/jcgryext-avx2.asm b/simd/i386/jcgryext-avx2.asm
index 5601a6a..3fa7973 100644
--- a/simd/i386/jcgryext-avx2.asm
+++ b/simd/i386/jcgryext-avx2.asm
@@ -100,12 +100,12 @@
     test        cl, SIZEOF_BYTE
     jz          short .column_ld2
     sub         ecx, byte SIZEOF_BYTE
-    movzx       eax, BYTE [esi+ecx]
+    movzx       eax, byte [esi+ecx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         ecx, byte SIZEOF_WORD
-    movzx       edx, WORD [esi+ecx]
+    movzx       edx, word [esi+ecx]
     shl         eax, WORD_BIT
     or          eax, edx
 .column_ld4:
diff --git a/simd/i386/jcgryext-mmx.asm b/simd/i386/jcgryext-mmx.asm
index 727894f..8af42e5 100644
--- a/simd/i386/jcgryext-mmx.asm
+++ b/simd/i386/jcgryext-mmx.asm
@@ -101,13 +101,13 @@
     jz          short .column_ld2
     sub         ecx, byte SIZEOF_BYTE
     xor         eax, eax
-    mov         al, BYTE [esi+ecx]
+    mov         al, byte [esi+ecx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         ecx, byte SIZEOF_WORD
     xor         edx, edx
-    mov         dx, WORD [esi+ecx]
+    mov         dx, word [esi+ecx]
     shl         eax, WORD_BIT
     or          eax, edx
 .column_ld4:
@@ -117,7 +117,7 @@
     test        cl, SIZEOF_DWORD
     jz          short .column_ld8
     sub         ecx, byte SIZEOF_DWORD
-    movd        mmG, DWORD [esi+ecx]
+    movd        mmG, dword [esi+ecx]
     psllq       mmA, DWORD_BIT
     por         mmA, mmG
 .column_ld8:
@@ -187,7 +187,7 @@
     test        cl, SIZEOF_MMWORD/8
     jz          short .column_ld2
     sub         ecx, byte SIZEOF_MMWORD/8
-    movd        mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
+    movd        mmA, dword [esi+ecx*RGB_PIXELSIZE]
 .column_ld2:
     test        cl, SIZEOF_MMWORD/4
     jz          short .column_ld4
diff --git a/simd/i386/jcgryext-sse2.asm b/simd/i386/jcgryext-sse2.asm
index 5b4559f..c9d6ff1 100644
--- a/simd/i386/jcgryext-sse2.asm
+++ b/simd/i386/jcgryext-sse2.asm
@@ -99,12 +99,12 @@
     test        cl, SIZEOF_BYTE
     jz          short .column_ld2
     sub         ecx, byte SIZEOF_BYTE
-    movzx       eax, BYTE [esi+ecx]
+    movzx       eax, byte [esi+ecx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         ecx, byte SIZEOF_WORD
-    movzx       edx, WORD [esi+ecx]
+    movzx       edx, word [esi+ecx]
     shl         eax, WORD_BIT
     or          eax, edx
 .column_ld4:
diff --git a/simd/i386/jchuff-sse2.asm b/simd/i386/jchuff-sse2.asm
index 2b0b469..79f0ca5 100644
--- a/simd/i386/jchuff-sse2.asm
+++ b/simd/i386/jchuff-sse2.asm
@@ -195,8 +195,8 @@
     push        ebp
 
     mov         esi, POINTER [eax+8]       ; (working_state *state)
-    mov         put_buffer, DWORD [esi+8]  ; put_buffer = state->cur.put_buffer;
-    mov         put_bits, DWORD [esi+12]   ; put_bits = state->cur.put_bits;
+    mov         put_buffer, dword [esi+8]  ; put_buffer = state->cur.put_buffer;
+    mov         put_bits, dword [esi+12]   ; put_bits = state->cur.put_bits;
     push        esi                        ; esi is now scratch
 
     get_GOT     edx                        ; get GOT address
@@ -212,7 +212,7 @@
     ; Encode the DC coefficient difference per section F.1.2.1
     mov         esi, POINTER [esp+block]  ; block
     movsx       ecx, word [esi]           ; temp = temp2 = block[0] - last_dc_val;
-    sub         ecx, DWORD [eax+20]
+    sub         ecx, dword [eax+20]
     mov         esi, ecx
 
     ; This is a well-known technique for obtaining the absolute value
@@ -227,12 +227,12 @@
     ; For a negative input, want temp2 = bitwise complement of abs(input)
     ; This code assumes we are on a two's complement machine
     add         esi, edx                ; temp2 += temp3;
-    mov         DWORD [esp+temp], esi   ; backup temp2 in temp
+    mov         dword [esp+temp], esi   ; backup temp2 in temp
 
     ; Find the number of bits needed for the magnitude of the coefficient
     movpic      ebp, POINTER [esp+gotptr]                        ; load GOT address (ebp)
     movzx       edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)]  ; nbits = JPEG_NBITS(temp);
-    mov         DWORD [esp+temp2], edx                           ; backup nbits in temp2
+    mov         dword [esp+temp2], edx                           ; backup nbits in temp2
 
     ; Emit the Huffman-coded symbol for the number of bits
     mov         ebp, POINTER [eax+24]         ; After this point, arguments are not accessible anymore
@@ -240,13 +240,13 @@
     movzx       ecx, byte [ebp + edx + 1024]  ; size = dctbl->ehufsi[nbits];
     EMIT_BITS   eax                           ; EMIT_BITS(code, size)
 
-    mov         ecx, DWORD [esp+temp2]        ; restore nbits
+    mov         ecx, dword [esp+temp2]        ; restore nbits
 
     ; Mask off any extra bits in code
     mov         eax, 1
     shl         eax, cl
     dec         eax
-    and         eax, DWORD [esp+temp]   ; temp2 &= (((JLONG)1)<<nbits) - 1;
+    and         eax, dword [esp+temp]   ; temp2 &= (((JLONG)1)<<nbits) - 1;
 
     ; Emit that number of bits of the value, if positive,
     ; or the complement of its magnitude, if negative.
@@ -289,22 +289,22 @@
     jz          near .ELOOP
     lea         esi, [esi+ecx*2]        ; k += r;
     shr         edx, cl                 ; index >>= r;
-    mov         DWORD [esp+temp3], edx
+    mov         dword [esp+temp3], edx
 .BRLOOP:
     cmp         ecx, 16                       ; while (r > 15) {
     jl          near .ERLOOP
     sub         ecx, 16                       ; r -= 16;
-    mov         DWORD [esp+temp], ecx
+    mov         dword [esp+temp], ecx
     mov         eax, INT [ebp + 240 * 4]      ; code_0xf0 = actbl->ehufco[0xf0];
     movzx       ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
     EMIT_BITS   eax                           ; EMIT_BITS(code_0xf0, size_0xf0)
-    mov         ecx, DWORD [esp+temp]
+    mov         ecx, dword [esp+temp]
     jmp         .BRLOOP
 .ERLOOP:
     movsx       eax, word [esi]                                  ; temp = t1[k];
     movpic      edx, POINTER [esp+gotptr]                        ; load GOT address (edx)
     movzx       eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)]  ; nbits = JPEG_NBITS(temp);
-    mov         DWORD [esp+temp2], eax
+    mov         dword [esp+temp2], eax
     ; Emit Huffman symbol for run length / number of bits
     shl         ecx, 4                        ; temp3 = (r << 4) + nbits;
     add         ecx, eax
@@ -314,13 +314,13 @@
 
     movsx       edx, word [esi+DCTSIZE2*2]    ; temp2 = t2[k];
     ; Mask off any extra bits in code
-    mov         ecx, DWORD [esp+temp2]
+    mov         ecx, dword [esp+temp2]
     mov         eax, 1
     shl         eax, cl
     dec         eax
     and         eax, edx                ; temp2 &= (((JLONG)1)<<nbits) - 1;
     EMIT_BITS   eax                     ; PUT_BITS(temp2, nbits)
-    mov         edx, DWORD [esp+temp3]
+    mov         edx, dword [esp+temp3]
     add         esi, 2                  ; ++k;
     shr         edx, 1                  ; index >>= 1;
 
@@ -350,29 +350,29 @@
     shr         edx, cl                 ; index >>= r;
     add         ecx, eax
     lea         esi, [esi+ecx*2]        ; k += r;
-    mov         DWORD [esp+temp3], edx
+    mov         dword [esp+temp3], edx
     jmp         .BRLOOP2
 .BLOOP2:
     bsf         ecx, edx                ; r = __builtin_ctzl(index);
     jz          near .ELOOP2
     lea         esi, [esi+ecx*2]        ; k += r;
     shr         edx, cl                 ; index >>= r;
-    mov         DWORD [esp+temp3], edx
+    mov         dword [esp+temp3], edx
 .BRLOOP2:
     cmp         ecx, 16                       ; while (r > 15) {
     jl          near .ERLOOP2
     sub         ecx, 16                       ; r -= 16;
-    mov         DWORD [esp+temp], ecx
+    mov         dword [esp+temp], ecx
     mov         eax, INT [ebp + 240 * 4]      ; code_0xf0 = actbl->ehufco[0xf0];
     movzx       ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
     EMIT_BITS   eax                           ; EMIT_BITS(code_0xf0, size_0xf0)
-    mov         ecx, DWORD [esp+temp]
+    mov         ecx, dword [esp+temp]
     jmp         .BRLOOP2
 .ERLOOP2:
     movsx       eax, word [esi]         ; temp = t1[k];
     bsr         eax, eax                ; nbits = 32 - __builtin_clz(temp);
     inc         eax
-    mov         DWORD [esp+temp2], eax
+    mov         dword [esp+temp2], eax
     ; Emit Huffman symbol for run length / number of bits
     shl         ecx, 4                        ; temp3 = (r << 4) + nbits;
     add         ecx, eax
@@ -382,13 +382,13 @@
 
     movsx       edx, word [esi+DCTSIZE2*2]    ; temp2 = t2[k];
     ; Mask off any extra bits in code
-    mov         ecx, DWORD [esp+temp2]
+    mov         ecx, dword [esp+temp2]
     mov         eax, 1
     shl         eax, cl
     dec         eax
     and         eax, edx                ; temp2 &= (((JLONG)1)<<nbits) - 1;
     EMIT_BITS   eax                     ; PUT_BITS(temp2, nbits)
-    mov         edx, DWORD [esp+temp3]
+    mov         edx, dword [esp+temp3]
     add         esi, 2                  ; ++k;
     shr         edx, 1                  ; index >>= 1;
 
@@ -405,8 +405,8 @@
     mov         eax, [esp+buffer]
     pop         esi
     ; Save put_buffer & put_bits
-    mov         DWORD [esi+8], put_buffer  ; state->cur.put_buffer = put_buffer;
-    mov         DWORD [esi+12], put_bits   ; state->cur.put_bits = put_bits;
+    mov         dword [esi+8], put_buffer  ; state->cur.put_buffer = put_buffer;
+    mov         dword [esi+12], put_bits   ; state->cur.put_bits = put_bits;
 
     pop         ebp
     pop         edi
diff --git a/simd/i386/jdcolext-avx2.asm b/simd/i386/jdcolext-avx2.asm
index 66e2683..015be04 100644
--- a/simd/i386/jdcolext-avx2.asm
+++ b/simd/i386/jdcolext-avx2.asm
@@ -346,7 +346,7 @@
     vmovd       eax, xmmA
     cmp         ecx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [edi], ax
+    mov         word [edi], ax
     add         edi, byte SIZEOF_WORD
     sub         ecx, byte SIZEOF_WORD
     shr         eax, 16
@@ -355,7 +355,7 @@
     ; space.
     test        ecx, ecx
     jz          short .nextrow
-    mov         BYTE [edi], al
+    mov         byte [edi], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
diff --git a/simd/i386/jdcolext-mmx.asm b/simd/i386/jdcolext-mmx.asm
index 29b6dd4..5813cfc 100644
--- a/simd/i386/jdcolext-mmx.asm
+++ b/simd/i386/jdcolext-mmx.asm
@@ -278,7 +278,7 @@
     movd        eax, mmA
     cmp         ecx, byte SIZEOF_DWORD
     jb          short .column_st2
-    mov         DWORD [edi+0*SIZEOF_DWORD], eax
+    mov         dword [edi+0*SIZEOF_DWORD], eax
     psrlq       mmA, DWORD_BIT
     movd        eax, mmA
     sub         ecx, byte SIZEOF_DWORD
@@ -286,14 +286,14 @@
 .column_st2:
     cmp         ecx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [edi+0*SIZEOF_WORD], ax
+    mov         word [edi+0*SIZEOF_WORD], ax
     shr         eax, WORD_BIT
     sub         ecx, byte SIZEOF_WORD
     add         edi, byte SIZEOF_WORD
 .column_st1:
     cmp         ecx, byte SIZEOF_BYTE
     jb          short .nextrow
-    mov         BYTE [edi+0*SIZEOF_BYTE], al
+    mov         byte [edi+0*SIZEOF_BYTE], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
@@ -365,7 +365,7 @@
 .column_st4:
     cmp         ecx, byte SIZEOF_MMWORD/8
     jb          short .nextrow
-    movd        DWORD [edi+0*SIZEOF_DWORD], mmA
+    movd        dword [edi+0*SIZEOF_DWORD], mmA
 
 %endif  ; RGB_PIXELSIZE ; ---------------
 
diff --git a/simd/i386/jdcolext-sse2.asm b/simd/i386/jdcolext-sse2.asm
index 73b37de..d5572b3 100644
--- a/simd/i386/jdcolext-sse2.asm
+++ b/simd/i386/jdcolext-sse2.asm
@@ -318,7 +318,7 @@
     movd        eax, xmmA
     cmp         ecx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [edi], ax
+    mov         word [edi], ax
     add         edi, byte SIZEOF_WORD
     sub         ecx, byte SIZEOF_WORD
     shr         eax, 16
@@ -327,7 +327,7 @@
     ; space.
     test        ecx, ecx
     jz          short .nextrow
-    mov         BYTE [edi], al
+    mov         byte [edi], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
diff --git a/simd/i386/jdmrgext-avx2.asm b/simd/i386/jdmrgext-avx2.asm
index 3512c50..e35f728 100644
--- a/simd/i386/jdmrgext-avx2.asm
+++ b/simd/i386/jdmrgext-avx2.asm
@@ -352,7 +352,7 @@
     vmovd       eax, xmmA
     cmp         ecx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [edi], ax
+    mov         word [edi], ax
     add         edi, byte SIZEOF_WORD
     sub         ecx, byte SIZEOF_WORD
     shr         eax, 16
@@ -361,7 +361,7 @@
     ; space.
     test        ecx, ecx
     jz          short .endcolumn
-    mov         BYTE [edi], al
+    mov         byte [edi], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
diff --git a/simd/i386/jdmrgext-mmx.asm b/simd/i386/jdmrgext-mmx.asm
index dab5c32..eb3e36b 100644
--- a/simd/i386/jdmrgext-mmx.asm
+++ b/simd/i386/jdmrgext-mmx.asm
@@ -281,7 +281,7 @@
     movd        eax, mmA
     cmp         ecx, byte SIZEOF_DWORD
     jb          short .column_st2
-    mov         DWORD [edi+0*SIZEOF_DWORD], eax
+    mov         dword [edi+0*SIZEOF_DWORD], eax
     psrlq       mmA, DWORD_BIT
     movd        eax, mmA
     sub         ecx, byte SIZEOF_DWORD
@@ -289,14 +289,14 @@
 .column_st2:
     cmp         ecx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [edi+0*SIZEOF_WORD], ax
+    mov         word [edi+0*SIZEOF_WORD], ax
     shr         eax, WORD_BIT
     sub         ecx, byte SIZEOF_WORD
     add         edi, byte SIZEOF_WORD
 .column_st1:
     cmp         ecx, byte SIZEOF_BYTE
     jb          short .endcolumn
-    mov         BYTE [edi+0*SIZEOF_BYTE], al
+    mov         byte [edi+0*SIZEOF_BYTE], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
@@ -371,7 +371,7 @@
 .column_st4:
     cmp         ecx, byte SIZEOF_MMWORD/8
     jb          short .endcolumn
-    movd        DWORD [edi+0*SIZEOF_DWORD], mmA
+    movd        dword [edi+0*SIZEOF_DWORD], mmA
 
 %endif  ; RGB_PIXELSIZE ; ---------------
 
diff --git a/simd/i386/jdmrgext-sse2.asm b/simd/i386/jdmrgext-sse2.asm
index 91295ff..c113dc4 100644
--- a/simd/i386/jdmrgext-sse2.asm
+++ b/simd/i386/jdmrgext-sse2.asm
@@ -323,7 +323,7 @@
     movd        eax, xmmA
     cmp         ecx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [edi], ax
+    mov         word [edi], ax
     add         edi, byte SIZEOF_WORD
     sub         ecx, byte SIZEOF_WORD
     shr         eax, 16
@@ -332,7 +332,7 @@
     ; space.
     test        ecx, ecx
     jz          short .endcolumn
-    mov         BYTE [edi], al
+    mov         byte [edi], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
diff --git a/simd/i386/jidctflt-3dn.asm b/simd/i386/jidctflt-3dn.asm
index 396f36a..8795191 100644
--- a/simd/i386/jidctflt-3dn.asm
+++ b/simd/i386/jidctflt-3dn.asm
@@ -90,23 +90,23 @@
     alignx      16, 7
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         short .columnDCT
 
     pushpic     ebx                     ; save GOT address
-    mov         ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
-    mov         eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
-    or          ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
-    or          ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    mov         ebx, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    or          ebx, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    or          ebx, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
     or          eax, ebx
     poppic      ebx                     ; restore GOT address
     jnz         short .columnDCT
 
     ; -- AC terms all zero
 
-    movd        mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movd        mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
 
     punpcklwd   mm0, mm0
     psrad       mm0, (DWORD_BIT-WORD_BIT)
@@ -133,10 +133,10 @@
 
     ; -- Even part
 
-    movd        mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
-    movd        mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-    movd        mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
-    movd        mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    movd        mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movd        mm1, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movd        mm2, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movd        mm3, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
 
     punpcklwd   mm0, mm0
     punpcklwd   mm1, mm1
@@ -180,10 +180,10 @@
 
     ; -- Odd part
 
-    movd        mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    movd        mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
-    movd        mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
-    movd        mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    movd        mm2, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movd        mm3, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movd        mm5, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movd        mm1, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
 
     punpcklwd   mm2, mm2
     punpcklwd   mm3, mm3
diff --git a/simd/i386/jidctflt-sse.asm b/simd/i386/jidctflt-sse.asm
index 6236fd5..b27ecfd 100644
--- a/simd/i386/jidctflt-sse.asm
+++ b/simd/i386/jidctflt-sse.asm
@@ -100,8 +100,8 @@
     alignx      16, 7
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctflt-sse2.asm b/simd/i386/jidctflt-sse2.asm
index 0b38937..c646eae 100644
--- a/simd/i386/jidctflt-sse2.asm
+++ b/simd/i386/jidctflt-sse2.asm
@@ -100,8 +100,8 @@
     alignx      16, 7
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctfst-mmx.asm b/simd/i386/jidctfst-mmx.asm
index c69bffa..24622d4 100644
--- a/simd/i386/jidctfst-mmx.asm
+++ b/simd/i386/jidctfst-mmx.asm
@@ -121,8 +121,8 @@
     alignx      16, 7
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         short .columnDCT
 
     movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctfst-sse2.asm b/simd/i386/jidctfst-sse2.asm
index 03bac9d..19704ff 100644
--- a/simd/i386/jidctfst-sse2.asm
+++ b/simd/i386/jidctfst-sse2.asm
@@ -116,8 +116,8 @@
     mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
 
 %ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctint-avx2.asm b/simd/i386/jidctint-avx2.asm
index 1e94541..c371985 100644
--- a/simd/i386/jidctint-avx2.asm
+++ b/simd/i386/jidctint-avx2.asm
@@ -318,8 +318,8 @@
     mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
 
 %ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctint-mmx.asm b/simd/i386/jidctint-mmx.asm
index 281e5b3..4f07f56 100644
--- a/simd/i386/jidctint-mmx.asm
+++ b/simd/i386/jidctint-mmx.asm
@@ -134,8 +134,8 @@
     alignx      16, 7
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         short .columnDCT
 
     movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctint-sse2.asm b/simd/i386/jidctint-sse2.asm
index acb1140..e442fdd 100644
--- a/simd/i386/jidctint-sse2.asm
+++ b/simd/i386/jidctint-sse2.asm
@@ -129,8 +129,8 @@
     mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
 
 %ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctred-mmx.asm b/simd/i386/jidctred-mmx.asm
index ad3fcb1..e2307e1 100644
--- a/simd/i386/jidctred-mmx.asm
+++ b/simd/i386/jidctred-mmx.asm
@@ -142,8 +142,8 @@
     alignx      16, 7
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         short .columnDCT
 
     movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
@@ -462,16 +462,16 @@
 
     mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
     mov         esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-    movd        DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
-    movd        DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
+    movd        dword [edx+eax*SIZEOF_JSAMPLE], mm1
+    movd        dword [esi+eax*SIZEOF_JSAMPLE], mm0
 
     psrlq       mm1, 4*BYTE_BIT
     psrlq       mm0, 4*BYTE_BIT
 
     mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
     mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-    movd        DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
-    movd        DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
+    movd        dword [edx+eax*SIZEOF_JSAMPLE], mm1
+    movd        dword [esi+eax*SIZEOF_JSAMPLE], mm0
 
     emms                                ; empty MMX state
 
@@ -686,8 +686,8 @@
 
     mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
     mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-    mov         WORD [edx+eax*SIZEOF_JSAMPLE], bx
-    mov         WORD [esi+eax*SIZEOF_JSAMPLE], cx
+    mov         word [edx+eax*SIZEOF_JSAMPLE], bx
+    mov         word [esi+eax*SIZEOF_JSAMPLE], cx
 
     emms                                ; empty MMX state
 
diff --git a/simd/i386/jidctred-sse2.asm b/simd/i386/jidctred-sse2.asm
index c00d0e4..6e56494 100644
--- a/simd/i386/jidctred-sse2.asm
+++ b/simd/i386/jidctred-sse2.asm
@@ -137,8 +137,8 @@
     mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
 
 %ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
-    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
     jnz         short .columnDCT
 
     movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
@@ -576,8 +576,8 @@
 
     mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
     mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-    mov         WORD [edx+eax*SIZEOF_JSAMPLE], bx
-    mov         WORD [esi+eax*SIZEOF_JSAMPLE], cx
+    mov         word [edx+eax*SIZEOF_JSAMPLE], bx
+    mov         word [esi+eax*SIZEOF_JSAMPLE], cx
 
     pop         edi
     pop         esi
diff --git a/simd/nasm/jsimdext.inc b/simd/nasm/jsimdext.inc
index 383b709..11fe8ef 100644
--- a/simd/nasm/jsimdext.inc
+++ b/simd/nasm/jsimdext.inc
@@ -165,19 +165,19 @@
 %define XMM_DWORD
 %define XMM_MMWORD
 
-%define SIZEOF_BYTE   1                 ; sizeof(BYTE)
-%define SIZEOF_WORD   2                 ; sizeof(WORD)
-%define SIZEOF_DWORD  4                 ; sizeof(DWORD)
-%define SIZEOF_QWORD  8                 ; sizeof(QWORD)
-%define SIZEOF_OWORD  16                ; sizeof(OWORD)
-%define SIZEOF_YWORD  32                ; sizeof(YWORD)
+%define SIZEOF_BYTE   1                 ; sizeof(byte)
+%define SIZEOF_WORD   2                 ; sizeof(word)
+%define SIZEOF_DWORD  4                 ; sizeof(dword)
+%define SIZEOF_QWORD  8                 ; sizeof(qword)
+%define SIZEOF_OWORD  16                ; sizeof(oword)
+%define SIZEOF_YWORD  32                ; sizeof(yword)
 
 %define BYTE_BIT      8                 ; CHAR_BIT in C
-%define WORD_BIT      16                ; sizeof(WORD)*BYTE_BIT
-%define DWORD_BIT     32                ; sizeof(DWORD)*BYTE_BIT
-%define QWORD_BIT     64                ; sizeof(QWORD)*BYTE_BIT
-%define OWORD_BIT     128               ; sizeof(OWORD)*BYTE_BIT
-%define YWORD_BIT     256               ; sizeof(YWORD)*BYTE_BIT
+%define WORD_BIT      16                ; sizeof(word)*BYTE_BIT
+%define DWORD_BIT     32                ; sizeof(dword)*BYTE_BIT
+%define QWORD_BIT     64                ; sizeof(qword)*BYTE_BIT
+%define OWORD_BIT     128               ; sizeof(oword)*BYTE_BIT
+%define YWORD_BIT     256               ; sizeof(yword)*BYTE_BIT
 
 ; --------------------------------------------------------------------------
 ;  External Symbol Name
diff --git a/simd/x86_64/jccolext-avx2.asm b/simd/x86_64/jccolext-avx2.asm
index 61f3b9e..10d2834 100644
--- a/simd/x86_64/jccolext-avx2.asm
+++ b/simd/x86_64/jccolext-avx2.asm
@@ -94,12 +94,12 @@
     test        cl, SIZEOF_BYTE
     jz          short .column_ld2
     sub         rcx, byte SIZEOF_BYTE
-    movzx       rax, BYTE [rsi+rcx]
+    movzx       rax, byte [rsi+rcx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         rcx, byte SIZEOF_WORD
-    movzx       rdx, WORD [rsi+rcx]
+    movzx       rdx, word [rsi+rcx]
     shl         rax, WORD_BIT
     or          rax, rdx
 .column_ld4:
diff --git a/simd/x86_64/jccolext-sse2.asm b/simd/x86_64/jccolext-sse2.asm
index 9df4f7f..2c914d3 100644
--- a/simd/x86_64/jccolext-sse2.asm
+++ b/simd/x86_64/jccolext-sse2.asm
@@ -93,12 +93,12 @@
     test        cl, SIZEOF_BYTE
     jz          short .column_ld2
     sub         rcx, byte SIZEOF_BYTE
-    movzx       rax, BYTE [rsi+rcx]
+    movzx       rax, byte [rsi+rcx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         rcx, byte SIZEOF_WORD
-    movzx       rdx, WORD [rsi+rcx]
+    movzx       rdx, word [rsi+rcx]
     shl         rax, WORD_BIT
     or          rax, rdx
 .column_ld4:
diff --git a/simd/x86_64/jcgryext-avx2.asm b/simd/x86_64/jcgryext-avx2.asm
index 20c48c1..175b60d 100644
--- a/simd/x86_64/jcgryext-avx2.asm
+++ b/simd/x86_64/jcgryext-avx2.asm
@@ -86,12 +86,12 @@
     test        cl, SIZEOF_BYTE
     jz          short .column_ld2
     sub         rcx, byte SIZEOF_BYTE
-    movzx       rax, BYTE [rsi+rcx]
+    movzx       rax, byte [rsi+rcx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         rcx, byte SIZEOF_WORD
-    movzx       rdx, WORD [rsi+rcx]
+    movzx       rdx, word [rsi+rcx]
     shl         rax, WORD_BIT
     or          rax, rdx
 .column_ld4:
diff --git a/simd/x86_64/jcgryext-sse2.asm b/simd/x86_64/jcgryext-sse2.asm
index 70e6891..873be80 100644
--- a/simd/x86_64/jcgryext-sse2.asm
+++ b/simd/x86_64/jcgryext-sse2.asm
@@ -85,12 +85,12 @@
     test        cl, SIZEOF_BYTE
     jz          short .column_ld2
     sub         rcx, byte SIZEOF_BYTE
-    movzx       rax, BYTE [rsi+rcx]
+    movzx       rax, byte [rsi+rcx]
 .column_ld2:
     test        cl, SIZEOF_WORD
     jz          short .column_ld4
     sub         rcx, byte SIZEOF_WORD
-    movzx       rdx, WORD [rsi+rcx]
+    movzx       rdx, word [rsi+rcx]
     shl         rax, WORD_BIT
     or          rax, rdx
 .column_ld4:
diff --git a/simd/x86_64/jchuff-sse2.asm b/simd/x86_64/jchuff-sse2.asm
index d49be5b..aa78fd5 100644
--- a/simd/x86_64/jchuff-sse2.asm
+++ b/simd/x86_64/jchuff-sse2.asm
@@ -198,7 +198,7 @@
     mov         buffer, r11                  ; r11 is now sratch
 
     mov         put_buffer, MMWORD [r10+16]  ; put_buffer = state->cur.put_buffer;
-    mov         put_bits,    DWORD [r10+24]  ; put_bits = state->cur.put_bits;
+    mov         put_bits,    dword [r10+24]  ; put_bits = state->cur.put_bits;
     push        r10                          ; r10 is now scratch
 
     ; Encode the DC coefficient difference per section F.1.2.1
@@ -331,7 +331,7 @@
     pop         r10
     ; Save put_buffer & put_bits
     mov         MMWORD [r10+16], put_buffer  ; state->cur.put_buffer = put_buffer;
-    mov         DWORD  [r10+24], put_bits    ; state->cur.put_bits = put_bits;
+    mov         dword  [r10+24], put_bits    ; state->cur.put_bits = put_bits;
 
     pop         rbx
     uncollect_args 6
diff --git a/simd/x86_64/jdcolext-avx2.asm b/simd/x86_64/jdcolext-avx2.asm
index 4b43baa..677b8ed 100644
--- a/simd/x86_64/jdcolext-avx2.asm
+++ b/simd/x86_64/jdcolext-avx2.asm
@@ -332,7 +332,7 @@
     vmovd       eax, xmmA
     cmp         rcx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [rdi], ax
+    mov         word [rdi], ax
     add         rdi, byte SIZEOF_WORD
     sub         rcx, byte SIZEOF_WORD
     shr         rax, 16
@@ -341,7 +341,7 @@
     ; space.
     test        rcx, rcx
     jz          short .nextrow
-    mov         BYTE [rdi], al
+    mov         byte [rdi], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
diff --git a/simd/x86_64/jdcolext-sse2.asm b/simd/x86_64/jdcolext-sse2.asm
index e5bd0ee..071aa62 100644
--- a/simd/x86_64/jdcolext-sse2.asm
+++ b/simd/x86_64/jdcolext-sse2.asm
@@ -304,7 +304,7 @@
     movd        eax, xmmA
     cmp         rcx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [rdi], ax
+    mov         word [rdi], ax
     add         rdi, byte SIZEOF_WORD
     sub         rcx, byte SIZEOF_WORD
     shr         rax, 16
@@ -313,7 +313,7 @@
     ; space.
     test        rcx, rcx
     jz          short .nextrow
-    mov         BYTE [rdi], al
+    mov         byte [rdi], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
diff --git a/simd/x86_64/jdmrgext-avx2.asm b/simd/x86_64/jdmrgext-avx2.asm
index 666d2ca..bb733c5 100644
--- a/simd/x86_64/jdmrgext-avx2.asm
+++ b/simd/x86_64/jdmrgext-avx2.asm
@@ -337,7 +337,7 @@
     vmovd       eax, xmmA
     cmp         rcx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [rdi], ax
+    mov         word [rdi], ax
     add         rdi, byte SIZEOF_WORD
     sub         rcx, byte SIZEOF_WORD
     shr         rax, 16
@@ -346,7 +346,7 @@
     ; space.
     test        rcx, rcx
     jz          short .endcolumn
-    mov         BYTE [rdi], al
+    mov         byte [rdi], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
diff --git a/simd/x86_64/jdmrgext-sse2.asm b/simd/x86_64/jdmrgext-sse2.asm
index 4fa69af..b176a4c 100644
--- a/simd/x86_64/jdmrgext-sse2.asm
+++ b/simd/x86_64/jdmrgext-sse2.asm
@@ -308,7 +308,7 @@
     movd        eax, xmmA
     cmp         rcx, byte SIZEOF_WORD
     jb          short .column_st1
-    mov         WORD [rdi], ax
+    mov         word [rdi], ax
     add         rdi, byte SIZEOF_WORD
     sub         rcx, byte SIZEOF_WORD
     shr         rax, 16
@@ -317,7 +317,7 @@
     ; space.
     test        rcx, rcx
     jz          short .endcolumn
-    mov         BYTE [rdi], al
+    mov         byte [rdi], al
 
 %else  ; RGB_PIXELSIZE == 4 ; -----------
 
diff --git a/simd/x86_64/jidctflt-sse2.asm b/simd/x86_64/jidctflt-sse2.asm
index 95aff82..ab95e1a 100644
--- a/simd/x86_64/jidctflt-sse2.asm
+++ b/simd/x86_64/jidctflt-sse2.asm
@@ -93,8 +93,8 @@
     mov         rcx, DCTSIZE/4          ; ctr
 .columnloop:
 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
-    mov         eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
diff --git a/simd/x86_64/jidctfst-sse2.asm b/simd/x86_64/jidctfst-sse2.asm
index 03ca13f..a66a681 100644
--- a/simd/x86_64/jidctfst-sse2.asm
+++ b/simd/x86_64/jidctfst-sse2.asm
@@ -109,8 +109,8 @@
     mov         rsi, r11                ; inptr
 
 %ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
-    mov         eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
diff --git a/simd/x86_64/jidctint-avx2.asm b/simd/x86_64/jidctint-avx2.asm
index b36cb31..50270f4 100644
--- a/simd/x86_64/jidctint-avx2.asm
+++ b/simd/x86_64/jidctint-avx2.asm
@@ -290,8 +290,8 @@
     ; ---- Pass 1: process columns.
 
 %ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
-    mov         eax, DWORD [DWBLOCK(1,0,r11,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,r11,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,r11,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,r11,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,r11,SIZEOF_JCOEF)]
diff --git a/simd/x86_64/jidctint-sse2.asm b/simd/x86_64/jidctint-sse2.asm
index 0f52a6e..034530c 100644
--- a/simd/x86_64/jidctint-sse2.asm
+++ b/simd/x86_64/jidctint-sse2.asm
@@ -122,8 +122,8 @@
     mov         rsi, r11                ; inptr
 
 %ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
-    mov         eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
     jnz         near .columnDCT
 
     movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
diff --git a/simd/x86_64/jidctred-sse2.asm b/simd/x86_64/jidctred-sse2.asm
index 133a59a..7fbfcc5 100644
--- a/simd/x86_64/jidctred-sse2.asm
+++ b/simd/x86_64/jidctred-sse2.asm
@@ -130,8 +130,8 @@
     mov         rsi, r11                ; inptr
 
 %ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
-    mov         eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-    or          eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
     jnz         short .columnDCT
 
     movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
@@ -560,8 +560,8 @@
 
     mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
     mov         rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-    mov         WORD [rdx+rax*SIZEOF_JSAMPLE], bx
-    mov         WORD [rsi+rax*SIZEOF_JSAMPLE], cx
+    mov         word [rdx+rax*SIZEOF_JSAMPLE], bx
+    mov         word [rsi+rax*SIZEOF_JSAMPLE], cx
 
     pop         rbx
     uncollect_args 4