; jidctflt.asm - external/github.com/libjpeg-turbo/libjpeg-turbo - Git at Google

 ;
 ; jidctflt.asm - floating-point IDCT (non-SIMD)
 ;
 ; x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
 ;
 ; This file should be assembled with NASM (Netwide Assembler),
 ; can *not* be assembled with Microsoft's MASM or any compatible
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
 ;
 ; This file contains a floating-point implementation of the inverse DCT
 ; (Discrete Cosine Transform). The following code is based directly on
 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.
 ;
 ; Last Modified : October 17, 2004
 ;
 ; [TAB8]

 %include "jsimdext.inc"
 %include "jdct.inc"

 %ifdef DCT_FLOAT_SUPPORTED

 ; This module is specialized to the case DCTSIZE = 8.
 ;
 %if DCTSIZE != 8
 %error "Sorry, this code only copes with 8x8 DCTs."
 %endif

 ; --------------------------------------------------------------------------
 	SECTION	SEG_CONST

 ; ROTATOR_TYPE is the operand-size keyword the IDCT code puts in front of
 ; every memory reference to the constants below (they are stored with dd).
 %define ROTATOR_TYPE	FP32	; float

 	alignz	16
 	global	EXTN(jconst_idct_float)

 ; Rotation factors used by both IDCT passes.  The code reaches each entry
 ; through GOTOFF(ebx,F_x_xxx); the exported label marks the table start.
 EXTN(jconst_idct_float):

 F_1_414:	dd	1.414213562373095048801689	; 2 * cos(PI/4)
 F_1_847:	dd	1.847759065022573512256366	; 2 * cos(PI/8)
 F_1_082:	dd	1.082392200292393968799446	; 2 * (cos(PI/8) - cos(3*PI/8))
 F_2_613:	dd	2.613125929752753055713286	; 2 * (cos(PI/8) + cos(3*PI/8))

 	alignz	16

 ; --------------------------------------------------------------------------
 	SECTION	SEG_TEXT
 	BITS	32
 ;
 ; Perform dequantization and inverse DCT on one block of coefficients.
 ;
 ; GLOBAL(void)
 ; jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 ;                  JCOEFPTR coef_block,
 ;                  JSAMPARRAY output_buf, JDIMENSION output_col)
 ;
 ; The routine works in two passes over the 8x8 block:
 ;   Pass 1 reads each column of coef_block, multiplies the coefficients by
 ;          the multipliers found through compptr's dct_table and writes the
 ;          column transform into the FAST_FLOAT workspace on the stack.
 ;   Pass 2 transforms each workspace row, adds rndint_magic so the stored
 ;          FP32 bit pattern can be masked into a table index, and writes
 ;          the looked-up samples to output_buf[row] + output_col.
 ;
 ; All five arguments are read from the stack relative to ebp.
 ; ebx, esi and edi are saved and restored; eax, ecx and edx are clobbered.
 ; The x87 register stack is used up to a depth of eight entries and is
 ; empty again when the routine returns.
 ;

 ; Stack arguments; (b) is the frame base register, ebp at every use below.
 %define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
 %define compptr(b)	(b)+12		; jpeg_component_info * compptr
 %define coef_block(b)	(b)+16		; JCOEFPTR coef_block
 %define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
 %define output_col(b)	(b)+24		; JDIMENSION output_col

 ; Local variables, laid out downwards from ebp.  Each %define is built on
 ; the previous one, so the four slots are contiguous.
 %define tmp		ebp-SIZEOF_FP64	; double tmp
 					; (spill slot for one even-part value)
 %define workspace	tmp-DCTSIZE2*SIZEOF_FAST_FLOAT
 					; FAST_FLOAT workspace[DCTSIZE2]
 %define rndint_magic	workspace-SIZEOF_FP32
 					; float rndint_magic = 100663296.0F
 %define gotptr		rndint_magic-SIZEOF_POINTER	; void * gotptr

 	align	16
 	global	EXTN(jpeg_idct_float)

 ; Entry point and prologue.  esp is first moved to the bottom of the
 ; workspace array; the two pushes that follow then land on the
 ; rndint_magic and gotptr slots defined by the %defines for this frame.
 ; NOTE(review): pushpic / get_GOT / movpic come from jsimdext.inc and are
 ; presumably only active in position-independent builds - confirm there.
 EXTN(jpeg_idct_float):
 	push	ebp
 	mov	ebp,esp
 	lea	esp, [workspace]	; reserve tmp + workspace in one step
 	push	FP32 0x4CC00000		; (float)(0x00C00000 << 3)
 					; = 100663296.0F, stored at [rndint_magic]
 	pushpic	eax			; make a room for GOT address
 	push	ebx			; ebx, esi, edi are restored before ret
;	push	ecx		; need not be preserved
;	push	edx		; need not be preserved
 	push	esi
 	push	edi

 	get_GOT	ebx			; get GOT address
 	movpic	POINTER [gotptr], ebx	; save GOT address
 					; (ebx is reused as scratch in both passes)

 	; ---- Pass 1: process columns from input, store into work array.
 	;
 	; Register roles during this pass:
 	;   edx = quantptr (multiplier table from compptr's dct_table)
 	;   esi = inptr    (current column of coef_block)
 	;   edi = wsptr    (current column of workspace)
 	;   ecx = ctr      (columns still to process, counts down from DCTSIZE)

 	mov	edx, POINTER [compptr(ebp)]
 	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
 	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
 	lea	edi, [workspace]			; FAST_FLOAT * wsptr
 	mov	ecx, DCTSIZE				; ctr
 	alignx	16,7
 .columnloop:
 	; Shortcut test: OR together the coefficients in rows 1..7 of this
 	; column.  Any set bit means at least one AC term is nonzero and the
 	; full column IDCT is required.
 	mov	ax, JCOEF [COL(1,esi,SIZEOF_JCOEF)]
 	or	ax, JCOEF [COL(2,esi,SIZEOF_JCOEF)]
 	jnz	short .columnDCT

 	mov	bx, JCOEF [COL(3,esi,SIZEOF_JCOEF)]
 	mov	ax, JCOEF [COL(4,esi,SIZEOF_JCOEF)]
 	or	bx, JCOEF [COL(5,esi,SIZEOF_JCOEF)]
 	or	ax, JCOEF [COL(6,esi,SIZEOF_JCOEF)]
 	or	bx, JCOEF [COL(7,esi,SIZEOF_JCOEF)]
 	or	ax,bx
 	jnz	short .columnDCT

 	; -- AC terms all zero
 	; Only the DC term contributes: dequantize it once and replicate the
 	; product into all eight rows of this workspace column.

 	fild	JCOEF [COL(0,esi,SIZEOF_JCOEF)]
 	fmul	FLOAT_MULT_TYPE [COL(0,edx,SIZEOF_FLOAT_MULT_TYPE)]

 	fst	FAST_FLOAT [COL(0,edi,SIZEOF_FAST_FLOAT)]
 	fst	FAST_FLOAT [COL(1,edi,SIZEOF_FAST_FLOAT)]
 	fst	FAST_FLOAT [COL(2,edi,SIZEOF_FAST_FLOAT)]
 	fst	FAST_FLOAT [COL(3,edi,SIZEOF_FAST_FLOAT)]
 	fst	FAST_FLOAT [COL(4,edi,SIZEOF_FAST_FLOAT)]
 	fst	FAST_FLOAT [COL(5,edi,SIZEOF_FAST_FLOAT)]
 	fst	FAST_FLOAT [COL(6,edi,SIZEOF_FAST_FLOAT)]
 	fstp	FAST_FLOAT [COL(7,edi,SIZEOF_FAST_FLOAT)]	; last store pops
 	jmp	near .nextcolumn
 	alignx	16,7

 ; Full 8-point IDCT of one coefficient column (pass 1).
 ; In the comments below inK is coefficient K of the column multiplied by
 ; its entry in the multiplier table.  Intermediate names (tmpN, zN) are
 ; the ones used in jidctflt.c, on which the file header says this code is
 ; based; tmp10..tmp12 are reused with new meanings in the odd part.
 .columnDCT:
 	movpic	ebx, POINTER [gotptr]	; load GOT address
 					; (bx was used as scratch in the zero test)

 	; -- Even part
 	; Load coefficients 2,6,4,0 and dequantize each in place; the fxch
 	; instructions rotate the next operand to st0 between multiplies.
 	; Afterwards: st0=in6, st1=in0, st2=in2, st3=in4.

 	fild	JCOEF [COL(2,esi,SIZEOF_JCOEF)]
 	fild	JCOEF [COL(6,esi,SIZEOF_JCOEF)]
 	fild	JCOEF [COL(4,esi,SIZEOF_JCOEF)]
 	fild	JCOEF [COL(0,esi,SIZEOF_JCOEF)]

 	fxch	st0,st3

 	fmul	FLOAT_MULT_TYPE [COL(2,edx,SIZEOF_FLOAT_MULT_TYPE)]
 	fxch	st0,st2
 	fmul	FLOAT_MULT_TYPE [COL(6,edx,SIZEOF_FLOAT_MULT_TYPE)]
 	fxch	st0,st1
 	fmul	FLOAT_MULT_TYPE [COL(4,edx,SIZEOF_FLOAT_MULT_TYPE)]
 	fxch	st0,st3
 	fmul	FLOAT_MULT_TYPE [COL(0,edx,SIZEOF_FLOAT_MULT_TYPE)]
 	fxch	st0,st1

 	; tmp13 = in2 + in6, st0 = in2 - in6
 	fld	st2	; st2 = st2 + st0, st0 = st2 - st0
 	fsub	st0,st1
 	fxch	st0,st1
 	faddp	st3,st0

 	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_414)]

 	; tmp10 = in0 + in4, tmp11 = in0 - in4
 	fld	st3	; st1 = st1 + st3, st3 = st1 - st3
 	fsubr	st0,st2
 	fxch	st0,st4
 	faddp	st2,st0

 	fsub	st0,st2		; tmp12 = (in2 - in6) * 1.414 - tmp13

 	; tmp0 = tmp10 + tmp13, tmp3 = tmp10 - tmp13
 	fld	st1	; st2 = st1 + st2, st1 = st1 - st2
 	fsub	st0,st3
 	fxch	st0,st2
 	faddp	st3,st0
 	; tmp1 = tmp11 + tmp12, tmp2 = tmp11 - tmp12
 	fld	st3	; st0 = st3 + st0, st3 = st3 - st0
 	fsub	st0,st1
 	fxch	st0,st4
 	faddp	st1,st0
 	; Even part done: st0=tmp1, st1=tmp3, st2=tmp0, st3=tmp2.

 	; -- Odd part
 	; Four more loads bring the x87 stack to eight entries.  While the
 	; odd coefficients are being dequantized, tmp3 is rotated to st0 and
 	; spilled to [tmp]; it is reloaded in the final output stage.

 	fild	JCOEF [COL(1,esi,SIZEOF_JCOEF)]
 	fild	JCOEF [COL(7,esi,SIZEOF_JCOEF)]
 	fild	JCOEF [COL(3,esi,SIZEOF_JCOEF)]
 	fild	JCOEF [COL(5,esi,SIZEOF_JCOEF)]

 	fxch	st0,st3

 	fmul	FLOAT_MULT_TYPE [COL(1,edx,SIZEOF_FLOAT_MULT_TYPE)]
 	fxch	st0,st2
 	fmul	FLOAT_MULT_TYPE [COL(7,edx,SIZEOF_FLOAT_MULT_TYPE)]
 	fxch	st0,st1
 	fmul	FLOAT_MULT_TYPE [COL(3,edx,SIZEOF_FLOAT_MULT_TYPE)]
 	fxch	st0,st6
 	fxch	st3,st0
 	fmul	FLOAT_MULT_TYPE [COL(5,edx,SIZEOF_FLOAT_MULT_TYPE)]
 	fxch	st0,st5
 	fstp	FP64 [tmp]	; spill tmp3

 	; z11 = in1 + in7, z12 = in1 - in7
 	fld	st1	; st1 = st1 + st0, st0 = st1 - st0
 	fsub	st0,st1
 	fxch	st0,st1
 	faddp	st2,st0
 	; z13 = in5 + in3, z10 = in5 - in3
 	fld	st5	; st4 = st4 + st5, st5 = st4 - st5
 	fsubr	st0,st5
 	fxch	st0,st6
 	faddp	st5,st0

 	; tmp7 = z11 + z13, st4 = z11 - z13
 	fld	st1	; st1 = st1 + st4, st4 = st1 - st4
 	fsub	st0,st5
 	fxch	st0,st5
 	faddp	st2,st0

 	; Rotations:
 	;   tmp11 = (z11 - z13) * 1.414
 	;   z5    = (z10 + z12) * 1.847
 	;   tmp12 = z5 - z10 * 2.613
 	;   tmp10 = z12 * 1.082 - z5
 	fld	st5
 	fadd	st0,st1
 	fxch	st0,st5
 	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_414)]
 	fxch	st0,st5
 	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_847)]
 	fxch	st0,st6
 	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_2_613)]
 	fxch	st0,st1
 	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_082)]
 	fxch	st0,st6
 	fsubr	st1,st0
 	fsubp	st6,st0

 	; -- Final output stage
 	;   tmp6 = tmp12 - tmp7, tmp5 = tmp11 - tmp6, tmp4 = tmp10 + tmp5
 	;   row 0 / row 7 = tmp0 +/- tmp7
 	;   row 1 / row 6 = tmp1 +/- tmp6
 	;   row 2 / row 5 = tmp2 +/- tmp5
 	;   row 4 / row 3 = tmp3 +/- tmp4

 	fsub	st0,st1
 	fld	st2	; st1 = st2 + st1, st2 = st2 - st1
 	fsub	st0,st2
 	fxch	st0,st3
 	faddp	st2,st0
 	fsub	st4,st0
 	fld	st3	; st0 = st3 + st0, st3 = st3 - st0
 	fsub	st0,st1
 	fxch	st0,st4
 	faddp	st1,st0

 	fxch	st0,st2

 	fstp	FAST_FLOAT [COL(7,edi,SIZEOF_FAST_FLOAT)]
 	fstp	FAST_FLOAT [COL(0,edi,SIZEOF_FAST_FLOAT)]
 	fstp	FAST_FLOAT [COL(1,edi,SIZEOF_FAST_FLOAT)]
 	fstp	FAST_FLOAT [COL(6,edi,SIZEOF_FAST_FLOAT)]

 	fadd	st1,st0		; tmp4 = tmp10 + tmp5
 	fld	FP64 [tmp]	; reload the spilled tmp3
 	fld	st1	; st3 = st3 + st1, st1 = st3 - st1
 	fsubr	st0,st4
 	fxch	st0,st2
 	faddp	st4,st0
 	fld	st0	; st0 = st0 + st2, st2 = st0 - st2
 	fsub	st0,st3
 	fxch	st0,st3
 	faddp	st1,st0

 	fxch	st0,st3

 	; The four stores below pop the last values; the x87 stack is empty
 	; when control falls through to .nextcolumn.
 	fstp	FAST_FLOAT [COL(2,edi,SIZEOF_FAST_FLOAT)]
 	fstp	FAST_FLOAT [COL(5,edi,SIZEOF_FAST_FLOAT)]
 	fstp	FAST_FLOAT [COL(3,edi,SIZEOF_FAST_FLOAT)]
 	fstp	FAST_FLOAT [COL(4,edi,SIZEOF_FAST_FLOAT)]

 ; End of one pass-1 iteration: step the input, multiplier and workspace
 ; pointers one element to the right and loop until ctr (ecx) reaches zero.
 .nextcolumn:
 	add	esi, byte SIZEOF_JCOEF	; advance pointers to next column
 	add	edx, byte SIZEOF_FLOAT_MULT_TYPE
 	add	edi, byte SIZEOF_FAST_FLOAT
 	dec	ecx
 	jnz	near .columnloop

 	; ---- Pass 2: process rows from work array, store into output array.
 	;
 	; Register roles during this pass:
 	;   edx = range_limit (cinfo's sample_range_limit table, advanced by
 	;                      CENTERJSAMPLE entries)
 	;   esi = wsptr       (current workspace row)
 	;   edi = cursor into output_buf (array of row pointers)
 	;   ecx = ctr         (rows still to process, counts down from DCTSIZE)

 	mov	edx, POINTER [cinfo(ebp)]
 	mov	edx, POINTER [jdstruct_sample_range_limit(edx)]
 	; Subtracting the negated offset adds CENTERJSAMPLE*SIZEOF_JSAMPLE.
 	; NOTE(review): presumably written as sub so the immediate fits the
 	; signed "byte" form (assuming the offset is 128) - confirm.
 	sub	edx, byte -CENTERJSAMPLE*SIZEOF_JSAMPLE	; JSAMPLE * range_limit

 	lea	esi, [workspace]			; FAST_FLOAT * wsptr
 	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
 	mov	ecx, DCTSIZE				; ctr
 	alignx	16,7
 ; Top of one pass-2 iteration.  The output_buf cursor is saved on the
 ; stack (popped again at .nextrow) and edi becomes the sample pointer
 ; output_buf[row] + output_col for this row.
 .rowloop:
 	push	edi
 	mov	edi, JSAMPROW [edi]			; (JSAMPLE *)
 	add	edi, JDIMENSION [output_col(ebp)]	; edi=outptr

 %ifndef NO_ZERO_ROW_TEST_FLOAT
 	; Shortcut test on the raw bit patterns of workspace elements 1..7:
 	; they are loaded and OR-ed as integers, and "add eax,eax" drops the
 	; top (sign) bit so that a negative zero still counts as zero.
 	mov	eax, FAST_FLOAT [ROW(1,esi,SIZEOF_FAST_FLOAT)]
 	add	eax,eax			; shl eax,1 (shift out the sign bit)
 	jnz	short .rowDCT

 	mov	eax, FAST_FLOAT [ROW(2,esi,SIZEOF_FAST_FLOAT)]
 	mov	ebx, FAST_FLOAT [ROW(3,esi,SIZEOF_FAST_FLOAT)]
 	or	eax, FAST_FLOAT [ROW(4,esi,SIZEOF_FAST_FLOAT)]
 	or	ebx, FAST_FLOAT [ROW(5,esi,SIZEOF_FAST_FLOAT)]
 	or	eax, FAST_FLOAT [ROW(6,esi,SIZEOF_FAST_FLOAT)]
 	or	ebx, FAST_FLOAT [ROW(7,esi,SIZEOF_FAST_FLOAT)]
 	or	eax,ebx
 	add	eax,eax			; shl eax,1 (shift out the sign bit)
 	jnz	short .rowDCT

 	; -- AC terms all zero
 	; Only element 0 contributes, so one sample value fills the row.

 	push	eax			; 4-byte scratch slot (eax is 0 here)

 	; Float-to-index conversion: rndint_magic is 1.5 * 2^26, a magnitude
 	; at which adjacent FP32 values are 8 apart.  Storing the sum as FP32
 	; therefore leaves the value divided by 8, rounded by the FPU, in the
 	; low mantissa bits; RANGE_MASK then isolates the table index.
 	; NOTE(review): RANGE_MASK is defined outside this file - confirm it
 	; masks off the exponent and the 1.5 offset bit.
 	fld	FAST_FLOAT [ROW(0,esi,SIZEOF_FAST_FLOAT)]
 	fadd	FP32 [rndint_magic]
 	fstp	FP32 [esp]

 	pop	eax			; eax = FP32 bit pattern of the sum
 	and	eax,RANGE_MASK
 	mov	al, JSAMPLE [edx+eax*SIZEOF_JSAMPLE]	; range-limit lookup
 	mov	JSAMPLE [edi+0*SIZEOF_JSAMPLE], al
 	mov	JSAMPLE [edi+1*SIZEOF_JSAMPLE], al
 	mov	JSAMPLE [edi+2*SIZEOF_JSAMPLE], al
 	mov	JSAMPLE [edi+3*SIZEOF_JSAMPLE], al
 	mov	JSAMPLE [edi+4*SIZEOF_JSAMPLE], al
 	mov	JSAMPLE [edi+5*SIZEOF_JSAMPLE], al
 	mov	JSAMPLE [edi+6*SIZEOF_JSAMPLE], al
 	mov	JSAMPLE [edi+7*SIZEOF_JSAMPLE], al
 	jmp	near .nextrow
 	alignx	16,7
 %endif
 ; Full 8-point IDCT of one workspace row (pass 2).
 ; In the comments below inK is element K of the current workspace row (no
 ; dequantization is needed here).  Intermediate names (tmpN, zN) are the
 ; ones used in jidctflt.c, on which the file header says this code is
 ; based; tmp10..tmp12 are reused with new meanings in the odd part.
 .rowDCT:
 	movpic	ebx, POINTER [gotptr]	; load GOT address
 					; (ebx is scratch in the zero test and below)

 	; -- Even part
 	; After the loads: st0=in6, st1=in0, st2=in2, st3=in4.

 	fld	FAST_FLOAT [ROW(4,esi,SIZEOF_FAST_FLOAT)]
 	fld	FAST_FLOAT [ROW(2,esi,SIZEOF_FAST_FLOAT)]
 	fld	FAST_FLOAT [ROW(0,esi,SIZEOF_FAST_FLOAT)]
 	fld	FAST_FLOAT [ROW(6,esi,SIZEOF_FAST_FLOAT)]

 	; tmp13 = in2 + in6, st0 = in2 - in6
 	fld	st2	; st2 = st2 + st0, st0 = st2 - st0
 	fsub	st0,st1
 	fxch	st0,st1
 	faddp	st3,st0

 	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_414)]

 	; tmp10 = in0 + in4, tmp11 = in0 - in4
 	fld	st3	; st1 = st1 + st3, st3 = st1 - st3
 	fsubr	st0,st2
 	fxch	st0,st4
 	faddp	st2,st0

 	fsub	st0,st2		; tmp12 = (in2 - in6) * 1.414 - tmp13

 	; tmp0 = tmp10 + tmp13, tmp3 = tmp10 - tmp13
 	fld	st1	; st2 = st1 + st2, st1 = st1 - st2
 	fsub	st0,st3
 	fxch	st0,st2
 	faddp	st3,st0
 	; tmp1 = tmp11 + tmp12, tmp2 = tmp11 - tmp12
 	fld	st3	; st0 = st3 + st0, st3 = st3 - st0
 	fsub	st0,st1
 	fxch	st0,st4
 	faddp	st1,st0
 	; Even part done: st0=tmp1, st1=tmp3, st2=tmp0, st3=tmp2.

 	; -- Odd part
 	; The loads fill the x87 stack to eight entries; tmp3 is then rotated
 	; to st0 and spilled to [tmp], to be reloaded in the output stage.

 	fld	FAST_FLOAT [ROW(3,esi,SIZEOF_FAST_FLOAT)]
 	fxch	st0,st3
 	fld	FAST_FLOAT [ROW(1,esi,SIZEOF_FAST_FLOAT)]
 	fld	FAST_FLOAT [ROW(7,esi,SIZEOF_FAST_FLOAT)]
 	fld	FAST_FLOAT [ROW(5,esi,SIZEOF_FAST_FLOAT)]
 	fxch	st0,st5
 	fstp	FP64 [tmp]	; spill tmp3

 	; z11 = in1 + in7, z12 = in1 - in7
 	fld	st1	; st1 = st1 + st0, st0 = st1 - st0
 	fsub	st0,st1
 	fxch	st0,st1
 	faddp	st2,st0
 	; z13 = in5 + in3, z10 = in5 - in3
 	fld	st5	; st4 = st4 + st5, st5 = st4 - st5
 	fsubr	st0,st5
 	fxch	st0,st6
 	faddp	st5,st0

 	; tmp7 = z11 + z13, st4 = z11 - z13
 	fld	st1	; st1 = st1 + st4, st4 = st1 - st4
 	fsub	st0,st5
 	fxch	st0,st5
 	faddp	st2,st0

 	; Rotations:
 	;   tmp11 = (z11 - z13) * 1.414
 	;   z5    = (z10 + z12) * 1.847
 	;   tmp12 = z5 - z10 * 2.613
 	;   tmp10 = z12 * 1.082 - z5
 	fld	st5
 	fadd	st0,st1
 	fxch	st0,st5
 	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_414)]
 	fxch	st0,st5
 	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_847)]
 	fxch	st0,st6
 	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_2_613)]
 	fxch	st0,st1
 	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_082)]
 	fxch	st0,st6
 	fsubr	st1,st0
 	fsubp	st6,st0

 	; -- Final output stage
 	;   tmp6 = tmp12 - tmp7, tmp5 = tmp11 - tmp6, tmp4 = tmp10 + tmp5
 	;   out0 / out7 = tmp0 +/- tmp7
 	;   out1 / out6 = tmp1 +/- tmp6
 	;   out2 / out5 = tmp2 +/- tmp5
 	;   out4 / out3 = tmp3 +/- tmp4
 	; Each result has rndint_magic added and is stored as FP32 into an
 	; eight-slot scratch area on the stack, slot k holding output k.

 	sub	esp, byte DCTSIZE*SIZEOF_FP32	; eight FP32 scratch slots

 	fsub	st0,st1
 	fld	st2	; st1 = st2 + st1, st2 = st2 - st1
 	fsub	st0,st2
 	fxch	st0,st3
 	faddp	st2,st0
 	fsub	st4,st0
 	fld	st3	; st0 = st3 + st0, st3 = st3 - st0
 	fsub	st0,st1
 	fxch	st0,st4
 	faddp	st1,st0

 	; rndint_magic is 1.5 * 2^26, where adjacent FP32 values are 8 apart,
 	; so each FP32 store leaves the result divided by 8 (rounded by the
 	; FPU) in the low mantissa bits.
 	fld	FP32 [rndint_magic]

 	fadd	st4,st0
 	fadd	st1,st0
 	fadd	st2,st0
 	fadd	st3,st0

 	fxch	st0,st4

 	fstp	FP32 [esp+6*SIZEOF_FP32]
 	fstp	FP32 [esp+1*SIZEOF_FP32]
 	fstp	FP32 [esp+0*SIZEOF_FP32]
 	fstp	FP32 [esp+7*SIZEOF_FP32]

 	fxch	st0,st1

 	fadd	st2,st0		; tmp4 = tmp10 + tmp5
 	fld	FP64 [tmp]	; reload the spilled tmp3
 	fld	st1	; st4 = st4 + st1, st1 = st4 - st1
 	fsubr	st0,st5
 	fxch	st0,st2
 	faddp	st5,st0
 	fld	st0	; st0 = st0 + st3, st3 = st0 - st3
 	fsub	st0,st4
 	fxch	st0,st4
 	faddp	st1,st0

 	fxch	st0,st2		; bring rndint_magic back to st0

 	fadd	st1,st0
 	fadd	st2,st0
 	fadd	st3,st0
 	faddp	st4,st0		; last use of the magic value, so pop it

 	fstp	FP32 [esp+5*SIZEOF_FP32]
 	fstp	FP32 [esp+4*SIZEOF_FP32]
 	fstp	FP32 [esp+3*SIZEOF_FP32]
 	fstp	FP32 [esp+2*SIZEOF_FP32]

 	; Pop the eight scratch slots two at a time (which also releases the
 	; scratch area), mask each bit pattern down to a table index, look
 	; the sample up in range_limit (edx) and store it at outptr[i].
 	; NOTE(review): RANGE_MASK is defined outside this file - confirm it
 	; masks off the exponent and the 1.5 offset bit.
 %assign i 0	; i=0;
 %rep 4	; -- repeat 4 times ---
 	pop	eax
 	pop	ebx
 	and	eax,RANGE_MASK
 	and	ebx,RANGE_MASK
 	mov	al, JSAMPLE [edx+eax*SIZEOF_JSAMPLE]
 	mov	bl, JSAMPLE [edx+ebx*SIZEOF_JSAMPLE]
 	mov	JSAMPLE [edi+(i+0)*SIZEOF_JSAMPLE], al
 	mov	JSAMPLE [edi+(i+1)*SIZEOF_JSAMPLE], bl
 %assign i i+2	; i+=2;
 %endrep	; -- repeat end ---

 ; End of one pass-2 iteration, followed by the function epilogue.
 .nextrow:
 	pop	edi			; output_buf cursor pushed at .rowloop
 	add	esi, byte DCTSIZE*SIZEOF_FAST_FLOAT	; next workspace row
 	add	edi, byte SIZEOF_JSAMPROW	; advance pointer to next row
 	dec	ecx
 	jnz	near .rowloop

 	; Epilogue: restore the registers pushed in the prologue (reverse
 	; order), then drop all locals by resetting esp from ebp.
 	pop	edi
 	pop	esi
;	pop	edx		; need not be preserved
;	pop	ecx		; need not be preserved
 	pop	ebx
 	mov	esp,ebp
 	pop	ebp
 	ret

 %endif ; DCT_FLOAT_SUPPORTED
	;
	; jidctflt.asm - floating-point IDCT (non-SIMD)
	;
	; x86 SIMD extension for IJG JPEG library
	; Copyright (C) 1999-2006, MIYASAKA Masaru.
	; For conditions of distribution and use, see copyright notice in jsimdext.inc
	;
	; This file should be assembled with NASM (Netwide Assembler),
	; can not be assembled with Microsoft's MASM or any compatible
	; assembler (including Borland's Turbo Assembler).
	; NASM is available from http://nasm.sourceforge.net/ or
	; http://sourceforge.net/project/showfiles.php?group_id=6208
	;
	; This file contains a floating-point implementation of the inverse DCT
	; (Discrete Cosine Transform). The following code is based directly on
	; the IJG's original jidctflt.c; see the jidctflt.c for more details.
	;
	; Last Modified : October 17, 2004
	;
	; [TAB8]

	%include "jsimdext.inc"
	%include "jdct.inc"

	%ifdef DCT_FLOAT_SUPPORTED

	; This module is specialized to the case DCTSIZE = 8.
	;
	%if DCTSIZE != 8
	%error "Sorry, this code only copes with 8x8 DCTs."
	%endif

	; --------------------------------------------------------------------------
	SECTION SEG_CONST

	; ROTATOR_TYPE is the operand-size keyword the IDCT code puts in front
	; of every memory reference to the constants below (stored with dd).
	%define ROTATOR_TYPE FP32 ; float

	alignz 16
	global EXTN(jconst_idct_float)

	; Rotation factors used by both IDCT passes; the code reaches each
	; entry through GOTOFF(ebx,F_x_xxx).
	EXTN(jconst_idct_float):

	F_1_414 dd 1.414213562373095048801689 ; 2*cos(PI*1/4)
	F_1_847 dd 1.847759065022573512256366 ; 2*cos(PI*1/8)
	F_1_082 dd 1.082392200292393968799446 ; 2*(cos(PI*1/8)-cos(PI*3/8))
	F_2_613 dd 2.613125929752753055713286 ; 2*(cos(PI*1/8)+cos(PI*3/8))

	alignz 16

	; --------------------------------------------------------------------------
	SECTION SEG_TEXT
	BITS 32
	;
	; Perform dequantization and inverse DCT on one block of coefficients.
	;
	; GLOBAL(void)
	; jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
	;                  JCOEFPTR coef_block,
	;                  JSAMPARRAY output_buf, JDIMENSION output_col)
	;
	; The routine works in two passes over the 8x8 block:
	;   Pass 1 reads each column of coef_block, multiplies the coefficients
	;          by the multipliers found through compptr's dct_table and
	;          writes the column transform into the FAST_FLOAT workspace on
	;          the stack.
	;   Pass 2 transforms each workspace row, adds rndint_magic so the
	;          stored FP32 bit pattern can be masked into a table index, and
	;          writes the looked-up samples to output_buf[row] + output_col.
	;
	; All five arguments are read from the stack relative to ebp.
	; ebx, esi and edi are saved and restored; eax, ecx and edx are clobbered.
	; The x87 register stack is used up to a depth of eight entries and is
	; empty again when the routine returns.
	;

	; Stack arguments; (b) is the frame base register, ebp at every use below.
	%define cinfo(b) (b)+8 ; j_decompress_ptr cinfo
	%define compptr(b) (b)+12 ; jpeg_component_info * compptr
	%define coef_block(b) (b)+16 ; JCOEFPTR coef_block
	%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf
	%define output_col(b) (b)+24 ; JDIMENSION output_col

	; Local variables, laid out downwards from ebp.  Each %define is built
	; on the previous one, so the four slots are contiguous.
	%define tmp ebp-SIZEOF_FP64 ; double tmp
	; (spill slot for one even-part value)
	%define workspace tmp-DCTSIZE2*SIZEOF_FAST_FLOAT
	; FAST_FLOAT workspace[DCTSIZE2]
	%define rndint_magic workspace-SIZEOF_FP32
	; float rndint_magic = 100663296.0F
	%define gotptr rndint_magic-SIZEOF_POINTER ; void * gotptr

	align 16
	global EXTN(jpeg_idct_float)

	; Entry point and prologue.  esp is first moved to the bottom of the
	; workspace array; the two pushes that follow then land on the
	; rndint_magic and gotptr slots defined by the %defines for this frame.
	; NOTE(review): pushpic / get_GOT / movpic come from jsimdext.inc and are
	; presumably only active in position-independent builds - confirm there.
	EXTN(jpeg_idct_float):
	push ebp
	mov ebp,esp
	lea esp, [workspace] ; reserve tmp + workspace in one step
	push FP32 0x4CC00000 ; (float)(0x00C00000 << 3) = 100663296.0F, at [rndint_magic]
	pushpic eax ; make a room for GOT address
	push ebx ; ebx, esi, edi are restored before ret
	; push ecx ; need not be preserved
	; push edx ; need not be preserved
	push esi
	push edi

	get_GOT ebx ; get GOT address
	movpic POINTER [gotptr], ebx ; save GOT address (ebx is reused as scratch later)

	; ---- Pass 1: process columns from input, store into work array.
	;
	; Register roles during this pass:
	;   edx = quantptr (multiplier table from compptr's dct_table)
	;   esi = inptr    (current column of coef_block)
	;   edi = wsptr    (current column of workspace)
	;   ecx = ctr      (columns still to process, counts down from DCTSIZE)

	mov edx, POINTER [compptr(ebp)]
	mov edx, POINTER [jcompinfo_dct_table(edx)] ; quantptr
	mov esi, JCOEFPTR [coef_block(ebp)] ; inptr
	lea edi, [workspace] ; FAST_FLOAT * wsptr
	mov ecx, DCTSIZE ; ctr
	alignx 16,7
	.columnloop:
	; Shortcut test: OR together the coefficients in rows 1..7 of this
	; column.  Any set bit means at least one AC term is nonzero and the
	; full column IDCT is required.
	mov ax, JCOEF [COL(1,esi,SIZEOF_JCOEF)]
	or ax, JCOEF [COL(2,esi,SIZEOF_JCOEF)]
	jnz short .columnDCT

	mov bx, JCOEF [COL(3,esi,SIZEOF_JCOEF)]
	mov ax, JCOEF [COL(4,esi,SIZEOF_JCOEF)]
	or bx, JCOEF [COL(5,esi,SIZEOF_JCOEF)]
	or ax, JCOEF [COL(6,esi,SIZEOF_JCOEF)]
	or bx, JCOEF [COL(7,esi,SIZEOF_JCOEF)]
	or ax,bx
	jnz short .columnDCT

	; -- AC terms all zero
	; Only the DC term contributes: dequantize it once and replicate the
	; product into all eight rows of this workspace column.

	fild JCOEF [COL(0,esi,SIZEOF_JCOEF)]
	fmul FLOAT_MULT_TYPE [COL(0,edx,SIZEOF_FLOAT_MULT_TYPE)]

	fst FAST_FLOAT [COL(0,edi,SIZEOF_FAST_FLOAT)]
	fst FAST_FLOAT [COL(1,edi,SIZEOF_FAST_FLOAT)]
	fst FAST_FLOAT [COL(2,edi,SIZEOF_FAST_FLOAT)]
	fst FAST_FLOAT [COL(3,edi,SIZEOF_FAST_FLOAT)]
	fst FAST_FLOAT [COL(4,edi,SIZEOF_FAST_FLOAT)]
	fst FAST_FLOAT [COL(5,edi,SIZEOF_FAST_FLOAT)]
	fst FAST_FLOAT [COL(6,edi,SIZEOF_FAST_FLOAT)]
	fstp FAST_FLOAT [COL(7,edi,SIZEOF_FAST_FLOAT)] ; last store pops
	jmp near .nextcolumn
	alignx 16,7

	; Full 8-point IDCT of one coefficient column (pass 1).
	; In the comments below inK is coefficient K of the column multiplied by
	; its entry in the multiplier table.  Intermediate names (tmpN, zN) are
	; the ones used in jidctflt.c, on which the file header says this code is
	; based; tmp10..tmp12 are reused with new meanings in the odd part.
	.columnDCT:
	movpic ebx, POINTER [gotptr] ; load GOT address (bx was scratch in the zero test)

	; -- Even part
	; Load coefficients 2,6,4,0 and dequantize each in place; the fxch
	; instructions rotate the next operand to st0 between multiplies.
	; Afterwards: st0=in6, st1=in0, st2=in2, st3=in4.

	fild JCOEF [COL(2,esi,SIZEOF_JCOEF)]
	fild JCOEF [COL(6,esi,SIZEOF_JCOEF)]
	fild JCOEF [COL(4,esi,SIZEOF_JCOEF)]
	fild JCOEF [COL(0,esi,SIZEOF_JCOEF)]

	fxch st0,st3

	fmul FLOAT_MULT_TYPE [COL(2,edx,SIZEOF_FLOAT_MULT_TYPE)]
	fxch st0,st2
	fmul FLOAT_MULT_TYPE [COL(6,edx,SIZEOF_FLOAT_MULT_TYPE)]
	fxch st0,st1
	fmul FLOAT_MULT_TYPE [COL(4,edx,SIZEOF_FLOAT_MULT_TYPE)]
	fxch st0,st3
	fmul FLOAT_MULT_TYPE [COL(0,edx,SIZEOF_FLOAT_MULT_TYPE)]
	fxch st0,st1

	; tmp13 = in2 + in6, st0 = in2 - in6
	fld st2 ; st2 = st2 + st0, st0 = st2 - st0
	fsub st0,st1
	fxch st0,st1
	faddp st3,st0

	fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_414)]

	; tmp10 = in0 + in4, tmp11 = in0 - in4
	fld st3 ; st1 = st1 + st3, st3 = st1 - st3
	fsubr st0,st2
	fxch st0,st4
	faddp st2,st0

	fsub st0,st2 ; tmp12 = (in2 - in6) * 1.414 - tmp13

	; tmp0 = tmp10 + tmp13, tmp3 = tmp10 - tmp13
	fld st1 ; st2 = st1 + st2, st1 = st1 - st2
	fsub st0,st3
	fxch st0,st2
	faddp st3,st0
	; tmp1 = tmp11 + tmp12, tmp2 = tmp11 - tmp12
	fld st3 ; st0 = st3 + st0, st3 = st3 - st0
	fsub st0,st1
	fxch st0,st4
	faddp st1,st0
	; Even part done: st0=tmp1, st1=tmp3, st2=tmp0, st3=tmp2.

	; -- Odd part
	; Four more loads bring the x87 stack to eight entries.  While the
	; odd coefficients are being dequantized, tmp3 is rotated to st0 and
	; spilled to [tmp]; it is reloaded in the final output stage.

	fild JCOEF [COL(1,esi,SIZEOF_JCOEF)]
	fild JCOEF [COL(7,esi,SIZEOF_JCOEF)]
	fild JCOEF [COL(3,esi,SIZEOF_JCOEF)]
	fild JCOEF [COL(5,esi,SIZEOF_JCOEF)]

	fxch st0,st3

	fmul FLOAT_MULT_TYPE [COL(1,edx,SIZEOF_FLOAT_MULT_TYPE)]
	fxch st0,st2
	fmul FLOAT_MULT_TYPE [COL(7,edx,SIZEOF_FLOAT_MULT_TYPE)]
	fxch st0,st1
	fmul FLOAT_MULT_TYPE [COL(3,edx,SIZEOF_FLOAT_MULT_TYPE)]
	fxch st0,st6
	fxch st3,st0
	fmul FLOAT_MULT_TYPE [COL(5,edx,SIZEOF_FLOAT_MULT_TYPE)]
	fxch st0,st5
	fstp FP64 [tmp] ; spill tmp3

	; z11 = in1 + in7, z12 = in1 - in7
	fld st1 ; st1 = st1 + st0, st0 = st1 - st0
	fsub st0,st1
	fxch st0,st1
	faddp st2,st0
	; z13 = in5 + in3, z10 = in5 - in3
	fld st5 ; st4 = st4 + st5, st5 = st4 - st5
	fsubr st0,st5
	fxch st0,st6
	faddp st5,st0

	; tmp7 = z11 + z13, st4 = z11 - z13
	fld st1 ; st1 = st1 + st4, st4 = st1 - st4
	fsub st0,st5
	fxch st0,st5
	faddp st2,st0

	; Rotations:
	;   tmp11 = (z11 - z13) * 1.414
	;   z5    = (z10 + z12) * 1.847
	;   tmp12 = z5 - z10 * 2.613
	;   tmp10 = z12 * 1.082 - z5
	fld st5
	fadd st0,st1
	fxch st0,st5
	fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_414)]
	fxch st0,st5
	fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_847)]
	fxch st0,st6
	fmul ROTATOR_TYPE [GOTOFF(ebx,F_2_613)]
	fxch st0,st1
	fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_082)]
	fxch st0,st6
	fsubr st1,st0
	fsubp st6,st0

	; -- Final output stage
	;   tmp6 = tmp12 - tmp7, tmp5 = tmp11 - tmp6, tmp4 = tmp10 + tmp5
	;   row 0 / row 7 = tmp0 +/- tmp7
	;   row 1 / row 6 = tmp1 +/- tmp6
	;   row 2 / row 5 = tmp2 +/- tmp5
	;   row 4 / row 3 = tmp3 +/- tmp4

	fsub st0,st1
	fld st2 ; st1 = st2 + st1, st2 = st2 - st1
	fsub st0,st2
	fxch st0,st3
	faddp st2,st0
	fsub st4,st0
	fld st3 ; st0 = st3 + st0, st3 = st3 - st0
	fsub st0,st1
	fxch st0,st4
	faddp st1,st0

	fxch st0,st2

	fstp FAST_FLOAT [COL(7,edi,SIZEOF_FAST_FLOAT)]
	fstp FAST_FLOAT [COL(0,edi,SIZEOF_FAST_FLOAT)]
	fstp FAST_FLOAT [COL(1,edi,SIZEOF_FAST_FLOAT)]
	fstp FAST_FLOAT [COL(6,edi,SIZEOF_FAST_FLOAT)]

	fadd st1,st0 ; tmp4 = tmp10 + tmp5
	fld FP64 [tmp] ; reload the spilled tmp3
	fld st1 ; st3 = st3 + st1, st1 = st3 - st1
	fsubr st0,st4
	fxch st0,st2
	faddp st4,st0
	fld st0 ; st0 = st0 + st2, st2 = st0 - st2
	fsub st0,st3
	fxch st0,st3
	faddp st1,st0

	fxch st0,st3

	; The four stores below pop the last values; the x87 stack is empty
	; when control falls through to .nextcolumn.
	fstp FAST_FLOAT [COL(2,edi,SIZEOF_FAST_FLOAT)]
	fstp FAST_FLOAT [COL(5,edi,SIZEOF_FAST_FLOAT)]
	fstp FAST_FLOAT [COL(3,edi,SIZEOF_FAST_FLOAT)]
	fstp FAST_FLOAT [COL(4,edi,SIZEOF_FAST_FLOAT)]

	; End of one pass-1 iteration: step the input, multiplier and workspace
	; pointers one element to the right and loop until ctr (ecx) reaches zero.
	.nextcolumn:
	add esi, byte SIZEOF_JCOEF ; advance pointers to next column
	add edx, byte SIZEOF_FLOAT_MULT_TYPE
	add edi, byte SIZEOF_FAST_FLOAT
	dec ecx
	jnz near .columnloop

	; ---- Pass 2: process rows from work array, store into output array.
	;
	; Register roles during this pass:
	;   edx = range_limit (cinfo's sample_range_limit table, advanced by
	;                      CENTERJSAMPLE entries)
	;   esi = wsptr       (current workspace row)
	;   edi = cursor into output_buf (array of row pointers)
	;   ecx = ctr         (rows still to process, counts down from DCTSIZE)

	mov edx, POINTER [cinfo(ebp)]
	mov edx, POINTER [jdstruct_sample_range_limit(edx)]
	; Subtracting the negated offset adds CENTERJSAMPLE*SIZEOF_JSAMPLE.
	; The '*' between the two symbols had been lost, which turned the
	; operand into a single undefined identifier.
	; NOTE(review): presumably written as sub so the immediate fits the
	; signed "byte" form (assuming the offset is 128) - confirm.
	sub edx, byte -CENTERJSAMPLE*SIZEOF_JSAMPLE ; JSAMPLE * range_limit

	lea esi, [workspace] ; FAST_FLOAT * wsptr
	mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *)
	mov ecx, DCTSIZE ; ctr
	alignx 16,7
	; Top of one pass-2 iteration.  The output_buf cursor is saved on the
	; stack (popped again at .nextrow) and edi becomes the sample pointer
	; output_buf[row] + output_col for this row.
	.rowloop:
	push edi
	mov edi, JSAMPROW [edi] ; (JSAMPLE *)
	add edi, JDIMENSION [output_col(ebp)] ; edi=outptr

	%ifndef NO_ZERO_ROW_TEST_FLOAT
	; Shortcut test on the raw bit patterns of workspace elements 1..7:
	; they are loaded and OR-ed as integers, and "add eax,eax" drops the
	; top (sign) bit so that a negative zero still counts as zero.
	mov eax, FAST_FLOAT [ROW(1,esi,SIZEOF_FAST_FLOAT)]
	add eax,eax ; shl eax,1 (shift out the sign bit)
	jnz short .rowDCT

	mov eax, FAST_FLOAT [ROW(2,esi,SIZEOF_FAST_FLOAT)]
	mov ebx, FAST_FLOAT [ROW(3,esi,SIZEOF_FAST_FLOAT)]
	or eax, FAST_FLOAT [ROW(4,esi,SIZEOF_FAST_FLOAT)]
	or ebx, FAST_FLOAT [ROW(5,esi,SIZEOF_FAST_FLOAT)]
	or eax, FAST_FLOAT [ROW(6,esi,SIZEOF_FAST_FLOAT)]
	or ebx, FAST_FLOAT [ROW(7,esi,SIZEOF_FAST_FLOAT)]
	or eax,ebx
	add eax,eax ; shl eax,1 (shift out the sign bit)
	jnz short .rowDCT

	; -- AC terms all zero
	; Only element 0 contributes, so one sample value fills the row.

	push eax ; 4-byte scratch slot (eax is 0 here)

	; Float-to-index conversion: rndint_magic is 1.5 * 2^26, a magnitude
	; at which adjacent FP32 values are 8 apart.  Storing the sum as FP32
	; therefore leaves the value divided by 8, rounded by the FPU, in the
	; low mantissa bits; RANGE_MASK then isolates the table index.
	; NOTE(review): RANGE_MASK is defined outside this file - confirm it
	; masks off the exponent and the 1.5 offset bit.
	fld FAST_FLOAT [ROW(0,esi,SIZEOF_FAST_FLOAT)]
	fadd FP32 [rndint_magic]
	fstp FP32 [esp]

	pop eax ; eax = FP32 bit pattern of the sum
	and eax,RANGE_MASK
	mov al, JSAMPLE [edx+eax*SIZEOF_JSAMPLE] ; range-limit lookup
	mov JSAMPLE [edi+0*SIZEOF_JSAMPLE], al
	mov JSAMPLE [edi+1*SIZEOF_JSAMPLE], al
	mov JSAMPLE [edi+2*SIZEOF_JSAMPLE], al
	mov JSAMPLE [edi+3*SIZEOF_JSAMPLE], al
	mov JSAMPLE [edi+4*SIZEOF_JSAMPLE], al
	mov JSAMPLE [edi+5*SIZEOF_JSAMPLE], al
	mov JSAMPLE [edi+6*SIZEOF_JSAMPLE], al
	mov JSAMPLE [edi+7*SIZEOF_JSAMPLE], al
	jmp near .nextrow
	alignx 16,7
	%endif
	; Full 8-point IDCT of one workspace row (pass 2).
	; In the comments below inK is element K of the current workspace row (no
	; dequantization is needed here).  Intermediate names (tmpN, zN) are the
	; ones used in jidctflt.c, on which the file header says this code is
	; based; tmp10..tmp12 are reused with new meanings in the odd part.
	.rowDCT:
	movpic ebx, POINTER [gotptr] ; load GOT address (ebx is scratch elsewhere in the loop)

	; -- Even part
	; After the loads: st0=in6, st1=in0, st2=in2, st3=in4.

	fld FAST_FLOAT [ROW(4,esi,SIZEOF_FAST_FLOAT)]
	fld FAST_FLOAT [ROW(2,esi,SIZEOF_FAST_FLOAT)]
	fld FAST_FLOAT [ROW(0,esi,SIZEOF_FAST_FLOAT)]
	fld FAST_FLOAT [ROW(6,esi,SIZEOF_FAST_FLOAT)]

	; tmp13 = in2 + in6, st0 = in2 - in6
	fld st2 ; st2 = st2 + st0, st0 = st2 - st0
	fsub st0,st1
	fxch st0,st1
	faddp st3,st0

	fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_414)]

	; tmp10 = in0 + in4, tmp11 = in0 - in4
	fld st3 ; st1 = st1 + st3, st3 = st1 - st3
	fsubr st0,st2
	fxch st0,st4
	faddp st2,st0

	fsub st0,st2 ; tmp12 = (in2 - in6) * 1.414 - tmp13

	; tmp0 = tmp10 + tmp13, tmp3 = tmp10 - tmp13
	fld st1 ; st2 = st1 + st2, st1 = st1 - st2
	fsub st0,st3
	fxch st0,st2
	faddp st3,st0
	; tmp1 = tmp11 + tmp12, tmp2 = tmp11 - tmp12
	fld st3 ; st0 = st3 + st0, st3 = st3 - st0
	fsub st0,st1
	fxch st0,st4
	faddp st1,st0
	; Even part done: st0=tmp1, st1=tmp3, st2=tmp0, st3=tmp2.

	; -- Odd part
	; The loads fill the x87 stack to eight entries; tmp3 is then rotated
	; to st0 and spilled to [tmp], to be reloaded in the output stage.

	fld FAST_FLOAT [ROW(3,esi,SIZEOF_FAST_FLOAT)]
	fxch st0,st3
	fld FAST_FLOAT [ROW(1,esi,SIZEOF_FAST_FLOAT)]
	fld FAST_FLOAT [ROW(7,esi,SIZEOF_FAST_FLOAT)]
	fld FAST_FLOAT [ROW(5,esi,SIZEOF_FAST_FLOAT)]
	fxch st0,st5
	fstp FP64 [tmp] ; spill tmp3

	; z11 = in1 + in7, z12 = in1 - in7
	fld st1 ; st1 = st1 + st0, st0 = st1 - st0
	fsub st0,st1
	fxch st0,st1
	faddp st2,st0
	; z13 = in5 + in3, z10 = in5 - in3
	fld st5 ; st4 = st4 + st5, st5 = st4 - st5
	fsubr st0,st5
	fxch st0,st6
	faddp st5,st0

	; tmp7 = z11 + z13, st4 = z11 - z13
	fld st1 ; st1 = st1 + st4, st4 = st1 - st4
	fsub st0,st5
	fxch st0,st5
	faddp st2,st0

	; Rotations:
	;   tmp11 = (z11 - z13) * 1.414
	;   z5    = (z10 + z12) * 1.847
	;   tmp12 = z5 - z10 * 2.613
	;   tmp10 = z12 * 1.082 - z5
	fld st5
	fadd st0,st1
	fxch st0,st5
	fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_414)]
	fxch st0,st5
	fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_847)]
	fxch st0,st6
	fmul ROTATOR_TYPE [GOTOFF(ebx,F_2_613)]
	fxch st0,st1
	fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_082)]
	fxch st0,st6
	fsubr st1,st0
	fsubp st6,st0

	; -- Final output stage
	;   tmp6 = tmp12 - tmp7, tmp5 = tmp11 - tmp6, tmp4 = tmp10 + tmp5
	;   out0 / out7 = tmp0 +/- tmp7
	;   out1 / out6 = tmp1 +/- tmp6
	;   out2 / out5 = tmp2 +/- tmp5
	;   out4 / out3 = tmp3 +/- tmp4
	; Each result has rndint_magic added and is stored as FP32 into an
	; eight-slot scratch area on the stack, slot k holding output k.

	sub esp, byte DCTSIZE*SIZEOF_FP32 ; eight FP32 scratch slots

	fsub st0,st1
	fld st2 ; st1 = st2 + st1, st2 = st2 - st1
	fsub st0,st2
	fxch st0,st3
	faddp st2,st0
	fsub st4,st0
	fld st3 ; st0 = st3 + st0, st3 = st3 - st0
	fsub st0,st1
	fxch st0,st4
	faddp st1,st0

	; rndint_magic is 1.5 * 2^26, where adjacent FP32 values are 8 apart,
	; so each FP32 store leaves the result divided by 8 (rounded by the
	; FPU) in the low mantissa bits.
	fld FP32 [rndint_magic]

	fadd st4,st0
	fadd st1,st0
	fadd st2,st0
	fadd st3,st0

	fxch st0,st4

	fstp FP32 [esp+6*SIZEOF_FP32]
	fstp FP32 [esp+1*SIZEOF_FP32]
	fstp FP32 [esp+0*SIZEOF_FP32]
	fstp FP32 [esp+7*SIZEOF_FP32]

	fxch st0,st1

	fadd st2,st0 ; tmp4 = tmp10 + tmp5
	fld FP64 [tmp] ; reload the spilled tmp3
	fld st1 ; st4 = st4 + st1, st1 = st4 - st1
	fsubr st0,st5
	fxch st0,st2
	faddp st5,st0
	fld st0 ; st0 = st0 + st3, st3 = st0 - st3
	fsub st0,st4
	fxch st0,st4
	faddp st1,st0

	fxch st0,st2 ; bring rndint_magic back to st0

	fadd st1,st0
	fadd st2,st0
	fadd st3,st0
	faddp st4,st0 ; last use of the magic value, so pop it

	fstp FP32 [esp+5*SIZEOF_FP32]
	fstp FP32 [esp+4*SIZEOF_FP32]
	fstp FP32 [esp+3*SIZEOF_FP32]
	fstp FP32 [esp+2*SIZEOF_FP32]

	; Pop the eight scratch slots two at a time (which also releases the
	; scratch area), mask each bit pattern down to a table index, look
	; the sample up in range_limit (edx) and store it at outptr[i].
	; NOTE(review): RANGE_MASK is defined outside this file - confirm it
	; masks off the exponent and the 1.5 offset bit.
	%assign i 0 ; i=0;
	%rep 4 ; -- repeat 4 times ---
	pop eax
	pop ebx
	and eax,RANGE_MASK
	and ebx,RANGE_MASK
	mov al, JSAMPLE [edx+eax*SIZEOF_JSAMPLE]
	mov bl, JSAMPLE [edx+ebx*SIZEOF_JSAMPLE]
	mov JSAMPLE [edi+(i+0)*SIZEOF_JSAMPLE], al
	mov JSAMPLE [edi+(i+1)*SIZEOF_JSAMPLE], bl
	%assign i i+2 ; i+=2;
	%endrep ; -- repeat end ---

	; End of one pass-2 iteration, followed by the function epilogue.
	.nextrow:
	pop edi ; output_buf cursor pushed at .rowloop
	add esi, byte DCTSIZE*SIZEOF_FAST_FLOAT ; next workspace row
	add edi, byte SIZEOF_JSAMPROW ; advance pointer to next row
	dec ecx
	jnz near .rowloop

	; Epilogue: restore the registers pushed in the prologue (reverse
	; order), then drop all locals by resetting esp from ebp.
	pop edi
	pop esi
	; pop edx ; need not be preserved
	; pop ecx ; need not be preserved
	pop ebx
	mov esp,ebp
	pop ebp
	ret

	%endif ; DCT_FLOAT_SUPPORTED