; jfdctfst.asm - external/github.com/libjpeg-turbo/libjpeg-turbo - Git at Google

 ;
 ; jfdctfst.asm - fast integer FDCT (non-SIMD)
 ;
 ; x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
 ;
 ; This file should be assembled with NASM (Netwide Assembler),
 ; can *not* be assembled with Microsoft's MASM or any compatible
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
 ;
 ; This file contains a fast, not so accurate integer implementation of
 ; the forward DCT (Discrete Cosine Transform). The following code is based
 ; directly on the IJG's original jfdctfst.c; see the jfdctfst.c for
 ; more details.
 ;
 ; Last Modified : October 17, 2004
 ;
 ; [TAB8]

 %include "jsimdext.inc"
 %include "jdct.inc"

 %ifdef DCT_IFAST_SUPPORTED

 ; This module is specialized to the case DCTSIZE = 8.
 ; The butterflies in jpeg_fdct_ifast name their eight inputs (indices 0..7)
 ; explicitly, so any other block size is rejected at assembly time.
 ;
 %if DCTSIZE != 8
 %error "Sorry, this code only copes with 8x8 DCTs."
 %endif

 ; --------------------------------------------------------------------------

 ; descale reg32, nbits
 ;
 ; Shifts %1 right arithmetically by %2 bits.  When USE_ACCURATE_ROUNDING is
 ; defined, 1<<(%2-1) is added beforehand so that the shift rounds to nearest.
 ; Without it the addition is omitted: a little faster, but the result is
 ; incorrectly rounded half the time.
 ;
 ; The rounding term is emitted as a sign-extended imm8 only while it is at
 ; most 1<<6; from %2 = 8 upward it no longer fits and an imm32 is used.
 ; Modifies %1 and the flags.
 ;
 %macro	descale 2
 %ifdef USE_ACCURATE_ROUNDING
 %if (%2)>7
 	add	%1, (1<<((%2)-1))	; add reg32,imm32
 %else
 	add	%1, byte (1<<((%2)-1))	; add reg32,imm8
 %endif
 %endif
 	sar	%1,%2
 %endmacro

 ; --------------------------------------------------------------------------

 %define CONST_BITS	8

 ; Fixed-point multipliers, FIX(x) = x * 2^CONST_BITS rounded to nearest.
 %if CONST_BITS != 8
 ; NASM cannot do compile-time arithmetic on floating-point constants, so
 ; each multiplier is written pre-scaled by 2^30 and reduced with integers.
 %define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
 F_0_382	equ	DESCALE( 410903207,30-CONST_BITS)	; FIX(0.382683433)
 F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
 F_0_707	equ	DESCALE( 759250124,30-CONST_BITS)	; FIX(0.707106781)
 F_1_306	equ	DESCALE(1402911301,30-CONST_BITS)	; FIX(1.306562965)
 %else
 ; CONST_BITS == 8: the values are spelled out directly.
 F_0_382	equ	 98		; FIX(0.382683433)
 F_0_541	equ	139		; FIX(0.541196100)
 F_0_707	equ	181		; FIX(0.707106781)
 F_1_306	equ	334		; FIX(1.306562965)
 %endif

 ; --------------------------------------------------------------------------
 	SECTION	SEG_TEXT
 	BITS	32
 ;
 ; Perform the forward DCT on one block of samples.
 ;
 ; GLOBAL(void)
 ; jpeg_fdct_ifast (DCTELEM * data)
 ;
 ; In:    data is read from the stack at [ebp+8] once the frame is set up.
 ; Out:   no return value is set up; the results are stored back through the
 ;        same pointer the inputs were loaded from.
 ; Saved: ebp, ebx, esi and edi are pushed on entry and popped before ret.
 ; Clobb: eax, ecx, edx and the flags.
 ;
 ; Two passes of DCTSIZE iterations each are made.  Pass 1 addresses its
 ; eight inputs through ROW() and steps the pointer by DCTSIZE elements;
 ; pass 2 addresses them through COL() and steps the pointer by one element.
 ; NOTE(review): ROW() and COL() come from the included headers; presumably
 ; they select element n of the row / column that starts at the given base
 ; -- confirm against the include files.
 ;
 ; Stack layout inside an iteration while all four temporaries are pushed,
 ; from [esp] upward: tmp4, tmp5, tmp6, tmp7, dataptr, ctr.
 ;

 %define data(b)	(b)+8		; DCTELEM * data

 	align	16
 	global	EXTN(jpeg_fdct_ifast)

 EXTN(jpeg_fdct_ifast):
 	push	ebp
 	mov	ebp,esp
 	push	ebx
 ;	push	ecx		; need not be preserved
 ;	push	edx		; need not be preserved
 	push	esi
 	push	edi

 	; ---- Pass 1: process rows.

 	mov	ecx, DCTSIZE			; ctr = number of rows
 	mov	edx, POINTER [data(ebp)]	; (DCTELEM *)
 	alignx	16,7
 .rowloop:
 	push	ecx		; ctr
 	push	edx		; dataptr

 	; Pair the inputs (0,7) (1,6) (2,5) (3,4): each sum stays in a
 	; register for the even part, each difference is pushed for the
 	; odd part.  Loads are sign-extended to 32 bits.

 	movsx	eax, DCTELEM [ROW(0,edx,SIZEOF_DCTELEM)]
 	movsx	edi, DCTELEM [ROW(7,edx,SIZEOF_DCTELEM)]
 	lea	esi,[eax+edi]	; esi=tmp0
 	sub	eax,edi		; eax=tmp7
 	push	eax

 	movsx	ebx, DCTELEM [ROW(1,edx,SIZEOF_DCTELEM)]
 	movsx	ecx, DCTELEM [ROW(6,edx,SIZEOF_DCTELEM)]
 	lea	edi,[ebx+ecx]	; edi=tmp1
 	sub	ebx,ecx		; ebx=tmp6
 	push	ebx

 	movsx	eax, DCTELEM [ROW(2,edx,SIZEOF_DCTELEM)]
 	movsx	ecx, DCTELEM [ROW(5,edx,SIZEOF_DCTELEM)]
 	lea	ebx,[eax+ecx]	; ebx=tmp2
 	sub	eax,ecx		; eax=tmp5
 	push	eax

 	movsx	ecx, DCTELEM [ROW(3,edx,SIZEOF_DCTELEM)]
 	movsx	eax, DCTELEM [ROW(4,edx,SIZEOF_DCTELEM)]
 	lea	edx,[ecx+eax]	; edx=tmp3 (dataptr in edx is overwritten here)
 	sub	ecx,eax		; ecx=tmp4
 	push	ecx

 	; -- Even part

 	lea	eax,[esi+edx]	; eax=tmp10
 	lea	ecx,[edi+ebx]	; ecx=tmp11
 	sub	esi,edx		; esi=tmp13
 	sub	edi,ebx		; edi=tmp12

 	; Reload dataptr: it lies just above the four pushed temporaries.
 	mov	edx, POINTER [esp+16]	; dataptr

 	add	edi,esi		; edi=tmp12+tmp13
 	imul	edi,(F_0_707)	; edi=z1
 	descale	edi,CONST_BITS

 	; Stores write the low 16 bits of each 32-bit result.
 	lea	ebx,[eax+ecx]	; ebx=data0
 	sub	eax,ecx		; eax=data4
 	mov	DCTELEM [ROW(0,edx,SIZEOF_DCTELEM)], bx
 	mov	DCTELEM [ROW(4,edx,SIZEOF_DCTELEM)], ax

 	lea	ecx,[esi+edi]	; ecx=data2
 	sub	esi,edi		; esi=data6
 	mov	DCTELEM [ROW(2,edx,SIZEOF_DCTELEM)], cx
 	mov	DCTELEM [ROW(6,edx,SIZEOF_DCTELEM)], si

 	; -- Odd part

 	; The differences come back in reverse push order.
 	pop	eax	; eax=tmp4
 	pop	edx	; edx=tmp5
 	pop	ebx	; ebx=tmp6
 	pop	edi	; edi=tmp7

 	add	eax,edx		; eax=tmp10
 	add	edx,ebx		; edx=tmp11
 	add	ebx,edi		; ebx=tmp12, edi=tmp7

 	imul	edx,(F_0_707)	; edx=z3
 	descale	edx,CONST_BITS
 	lea	esi,[edi+edx]	; esi=z11
 	sub	edi,edx		; edi=z13

 	mov	ecx,eax		; ecx=tmp10
 	sub	eax,ebx		; eax=tmp10-tmp12
 	imul	eax,(F_0_382)	; eax=z5
 	imul	ecx,(F_0_541)	; ecx=MULTIPLY(tmp10,FIX_0_541196100)
 	imul	ebx,(F_1_306)	; ebx=MULTIPLY(tmp12,FIX_1_306562965)
 	descale	eax,CONST_BITS
 	descale	ecx,CONST_BITS
 	descale	ebx,CONST_BITS
 	add	ecx,eax		; ecx=z2
 	add	ebx,eax		; ebx=z4

 	pop	edx		; dataptr

 	lea	eax,[edi+ecx]	; eax=data5
 	sub	edi,ecx		; edi=data3
 	mov	DCTELEM [ROW(5,edx,SIZEOF_DCTELEM)], ax
 	mov	DCTELEM [ROW(3,edx,SIZEOF_DCTELEM)], di

 	lea	ecx,[esi+ebx]	; ecx=data1
 	sub	esi,ebx		; esi=data7
 	mov	DCTELEM [ROW(1,edx,SIZEOF_DCTELEM)], cx
 	mov	DCTELEM [ROW(7,edx,SIZEOF_DCTELEM)], si

 	pop	ecx		; ctr

 	add	edx, byte DCTSIZE*SIZEOF_DCTELEM	; advance pointer to next row
 	dec	ecx			; one row fewer to go
 	jnz	near .rowloop

 	; ---- Pass 2: process columns.
 	; Same butterfly as pass 1, applied to the pass-1 results through COL().

 	mov	ecx, DCTSIZE			; ctr = number of columns
 	mov	edx, POINTER [data(ebp)]	; (DCTELEM *)
 	alignx	16,7
 .columnloop:
 	push	ecx		; ctr
 	push	edx		; dataptr

 	movsx	eax, DCTELEM [COL(0,edx,SIZEOF_DCTELEM)]
 	movsx	edi, DCTELEM [COL(7,edx,SIZEOF_DCTELEM)]
 	lea	esi,[eax+edi]	; esi=tmp0
 	sub	eax,edi		; eax=tmp7
 	push	eax

 	movsx	ebx, DCTELEM [COL(1,edx,SIZEOF_DCTELEM)]
 	movsx	ecx, DCTELEM [COL(6,edx,SIZEOF_DCTELEM)]
 	lea	edi,[ebx+ecx]	; edi=tmp1
 	sub	ebx,ecx		; ebx=tmp6
 	push	ebx

 	movsx	eax, DCTELEM [COL(2,edx,SIZEOF_DCTELEM)]
 	movsx	ecx, DCTELEM [COL(5,edx,SIZEOF_DCTELEM)]
 	lea	ebx,[eax+ecx]	; ebx=tmp2
 	sub	eax,ecx		; eax=tmp5
 	push	eax

 	movsx	ecx, DCTELEM [COL(3,edx,SIZEOF_DCTELEM)]
 	movsx	eax, DCTELEM [COL(4,edx,SIZEOF_DCTELEM)]
 	lea	edx,[ecx+eax]	; edx=tmp3 (dataptr in edx is overwritten here)
 	sub	ecx,eax		; ecx=tmp4
 	push	ecx

 	; -- Even part

 	lea	eax,[esi+edx]	; eax=tmp10
 	lea	ecx,[edi+ebx]	; ecx=tmp11
 	sub	esi,edx		; esi=tmp13
 	sub	edi,ebx		; edi=tmp12

 	; Reload dataptr: it lies just above the four pushed temporaries.
 	mov	edx, POINTER [esp+16]	; dataptr

 	add	edi,esi		; edi=tmp12+tmp13
 	imul	edi,(F_0_707)	; edi=z1
 	descale	edi,CONST_BITS

 	lea	ebx,[eax+ecx]	; ebx=data0
 	sub	eax,ecx		; eax=data4
 	mov	DCTELEM [COL(0,edx,SIZEOF_DCTELEM)], bx
 	mov	DCTELEM [COL(4,edx,SIZEOF_DCTELEM)], ax

 	lea	ecx,[esi+edi]	; ecx=data2
 	sub	esi,edi		; esi=data6
 	mov	DCTELEM [COL(2,edx,SIZEOF_DCTELEM)], cx
 	mov	DCTELEM [COL(6,edx,SIZEOF_DCTELEM)], si

 	; -- Odd part

 	pop	eax	; eax=tmp4
 	pop	edx	; edx=tmp5
 	pop	ebx	; ebx=tmp6
 	pop	edi	; edi=tmp7

 	add	eax,edx		; eax=tmp10
 	add	edx,ebx		; edx=tmp11
 	add	ebx,edi		; ebx=tmp12, edi=tmp7

 	imul	edx,(F_0_707)	; edx=z3
 	descale	edx,CONST_BITS
 	lea	esi,[edi+edx]	; esi=z11
 	sub	edi,edx		; edi=z13

 	mov	ecx,eax		; ecx=tmp10
 	sub	eax,ebx		; eax=tmp10-tmp12
 	imul	eax,(F_0_382)	; eax=z5
 	imul	ecx,(F_0_541)	; ecx=MULTIPLY(tmp10,FIX_0_541196100)
 	imul	ebx,(F_1_306)	; ebx=MULTIPLY(tmp12,FIX_1_306562965)
 	descale	eax,CONST_BITS
 	descale	ecx,CONST_BITS
 	descale	ebx,CONST_BITS
 	add	ecx,eax		; ecx=z2
 	add	ebx,eax		; ebx=z4

 	pop	edx		; dataptr

 	lea	eax,[edi+ecx]	; eax=data5
 	sub	edi,ecx		; edi=data3
 	mov	DCTELEM [COL(5,edx,SIZEOF_DCTELEM)], ax
 	mov	DCTELEM [COL(3,edx,SIZEOF_DCTELEM)], di

 	lea	ecx,[esi+ebx]	; ecx=data1
 	sub	esi,ebx		; esi=data7
 	mov	DCTELEM [COL(1,edx,SIZEOF_DCTELEM)], cx
 	mov	DCTELEM [COL(7,edx,SIZEOF_DCTELEM)], si

 	pop	ecx		; ctr

 	add	edx, byte SIZEOF_DCTELEM    ; advance pointer to next column
 	dec	ecx			; one column fewer to go
 	jnz	near .columnloop

 	; Restore the saved registers in reverse order of the entry pushes.
 	pop	edi
 	pop	esi
 ;	pop	edx		; need not be preserved
 ;	pop	ecx		; need not be preserved
 	pop	ebx
 	pop	ebp
 	ret

 %endif ; DCT_IFAST_SUPPORTED
	;
	; jfdctfst.asm - fast integer FDCT (non-SIMD)
	;
	; x86 SIMD extension for IJG JPEG library
	; Copyright (C) 1999-2006, MIYASAKA Masaru.
	; For conditions of distribution and use, see copyright notice in jsimdext.inc
	;
	; This file should be assembled with NASM (Netwide Assembler),
	; can not be assembled with Microsoft's MASM or any compatible
	; assembler (including Borland's Turbo Assembler).
	; NASM is available from http://nasm.sourceforge.net/ or
	; http://sourceforge.net/project/showfiles.php?group_id=6208
	;
	; This file contains a fast, not so accurate integer implementation of
	; the forward DCT (Discrete Cosine Transform). The following code is based
	; directly on the IJG's original jfdctfst.c; see the jfdctfst.c for
	; more details.
	;
	; Last Modified : October 17, 2004
	;
	; [TAB8]

	%include "jsimdext.inc"
	%include "jdct.inc"

	%ifdef DCT_IFAST_SUPPORTED

	; This module is specialized to the case DCTSIZE = 8.
	; The butterflies in jpeg_fdct_ifast name their eight inputs (indices 0..7)
	; explicitly, so any other block size is rejected at assembly time.
	;
	%if DCTSIZE != 8
	%error "Sorry, this code only copes with 8x8 DCTs."
	%endif

	; --------------------------------------------------------------------------

	; descale reg32, nbits
	;
	; Shifts %1 right arithmetically by %2 bits. When USE_ACCURATE_ROUNDING is
	; defined, 1<<(%2-1) is added beforehand so that the shift rounds to nearest.
	; Without it the addition is omitted: a little faster, but the result is
	; incorrectly rounded half the time.
	;
	; The rounding term is emitted as a sign-extended imm8 only while it is at
	; most 1<<6; from %2 = 8 upward it no longer fits and an imm32 is used.
	; Modifies %1 and the flags.
	;
	%macro descale 2
	%ifdef USE_ACCURATE_ROUNDING
	%if (%2)>7
	add %1, (1<<((%2)-1)) ; add reg32,imm32
	%else
	add %1, byte (1<<((%2)-1)) ; add reg32,imm8
	%endif
	%endif
	sar %1,%2
	%endmacro

	; --------------------------------------------------------------------------

	%define CONST_BITS 8

	; Fixed-point multipliers, FIX(x) = x * 2^CONST_BITS rounded to nearest.
	%if CONST_BITS != 8
	; NASM cannot do compile-time arithmetic on floating-point constants, so
	; each multiplier is written pre-scaled by 2^30 and reduced with integers.
	%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
	F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433)
	F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
	F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781)
	F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965)
	%else
	; CONST_BITS == 8: the values are spelled out directly.
	F_0_382 equ 98 ; FIX(0.382683433)
	F_0_541 equ 139 ; FIX(0.541196100)
	F_0_707 equ 181 ; FIX(0.707106781)
	F_1_306 equ 334 ; FIX(1.306562965)
	%endif

	; --------------------------------------------------------------------------
	SECTION SEG_TEXT
	BITS 32
	;
	; Perform the forward DCT on one block of samples.
	;
	; GLOBAL(void)
	; jpeg_fdct_ifast (DCTELEM * data)
	;
	; In: data is read from the stack at [ebp+8] once the frame is set up.
	; Out: no return value is set up; the results are stored back through the
	; same pointer the inputs were loaded from.
	; Saved: ebp, ebx, esi and edi are pushed on entry and popped before ret.
	; Clobb: eax, ecx, edx and the flags.
	;
	; Two passes of DCTSIZE iterations each are made. Pass 1 addresses its
	; eight inputs through ROW() and steps the pointer by DCTSIZE elements;
	; pass 2 addresses them through COL() and steps the pointer by one element.
	; NOTE(review): ROW() and COL() come from the included headers; presumably
	; they select element n of the row / column that starts at the given base
	; -- confirm against the include files.
	;
	; Stack layout inside an iteration while all four temporaries are pushed,
	; from [esp] upward: tmp4, tmp5, tmp6, tmp7, dataptr, ctr.
	;

	%define data(b) (b)+8 ; DCTELEM * data

	align 16
	global EXTN(jpeg_fdct_ifast)

	EXTN(jpeg_fdct_ifast):
	push ebp
	mov ebp,esp
	push ebx
	; push ecx ; need not be preserved
	; push edx ; need not be preserved
	push esi
	push edi

	; ---- Pass 1: process rows.

	mov ecx, DCTSIZE ; ctr = number of rows
	mov edx, POINTER [data(ebp)] ; (DCTELEM *)
	alignx 16,7
	.rowloop:
	push ecx ; ctr
	push edx ; dataptr

	; Pair the inputs (0,7) (1,6) (2,5) (3,4): each sum stays in a
	; register for the even part, each difference is pushed for the
	; odd part. Loads are sign-extended to 32 bits.

	movsx eax, DCTELEM [ROW(0,edx,SIZEOF_DCTELEM)]
	movsx edi, DCTELEM [ROW(7,edx,SIZEOF_DCTELEM)]
	lea esi,[eax+edi] ; esi=tmp0
	sub eax,edi ; eax=tmp7
	push eax

	movsx ebx, DCTELEM [ROW(1,edx,SIZEOF_DCTELEM)]
	movsx ecx, DCTELEM [ROW(6,edx,SIZEOF_DCTELEM)]
	lea edi,[ebx+ecx] ; edi=tmp1
	sub ebx,ecx ; ebx=tmp6
	push ebx

	movsx eax, DCTELEM [ROW(2,edx,SIZEOF_DCTELEM)]
	movsx ecx, DCTELEM [ROW(5,edx,SIZEOF_DCTELEM)]
	lea ebx,[eax+ecx] ; ebx=tmp2
	sub eax,ecx ; eax=tmp5
	push eax

	movsx ecx, DCTELEM [ROW(3,edx,SIZEOF_DCTELEM)]
	movsx eax, DCTELEM [ROW(4,edx,SIZEOF_DCTELEM)]
	lea edx,[ecx+eax] ; edx=tmp3 (dataptr in edx is overwritten here)
	sub ecx,eax ; ecx=tmp4
	push ecx

	; -- Even part

	lea eax,[esi+edx] ; eax=tmp10
	lea ecx,[edi+ebx] ; ecx=tmp11
	sub esi,edx ; esi=tmp13
	sub edi,ebx ; edi=tmp12

	; Reload dataptr: it lies just above the four pushed temporaries.
	mov edx, POINTER [esp+16] ; dataptr

	add edi,esi ; edi=tmp12+tmp13
	imul edi,(F_0_707) ; edi=z1
	descale edi,CONST_BITS

	; Stores write the low 16 bits of each 32-bit result.
	lea ebx,[eax+ecx] ; ebx=data0
	sub eax,ecx ; eax=data4
	mov DCTELEM [ROW(0,edx,SIZEOF_DCTELEM)], bx
	mov DCTELEM [ROW(4,edx,SIZEOF_DCTELEM)], ax

	lea ecx,[esi+edi] ; ecx=data2
	sub esi,edi ; esi=data6
	mov DCTELEM [ROW(2,edx,SIZEOF_DCTELEM)], cx
	mov DCTELEM [ROW(6,edx,SIZEOF_DCTELEM)], si

	; -- Odd part

	; The differences come back in reverse push order.
	pop eax ; eax=tmp4
	pop edx ; edx=tmp5
	pop ebx ; ebx=tmp6
	pop edi ; edi=tmp7

	add eax,edx ; eax=tmp10
	add edx,ebx ; edx=tmp11
	add ebx,edi ; ebx=tmp12, edi=tmp7

	imul edx,(F_0_707) ; edx=z3
	descale edx,CONST_BITS
	lea esi,[edi+edx] ; esi=z11
	sub edi,edx ; edi=z13

	mov ecx,eax ; ecx=tmp10
	sub eax,ebx ; eax=tmp10-tmp12
	imul eax,(F_0_382) ; eax=z5
	imul ecx,(F_0_541) ; ecx=MULTIPLY(tmp10,FIX_0_541196100)
	imul ebx,(F_1_306) ; ebx=MULTIPLY(tmp12,FIX_1_306562965)
	descale eax,CONST_BITS
	descale ecx,CONST_BITS
	descale ebx,CONST_BITS
	add ecx,eax ; ecx=z2
	add ebx,eax ; ebx=z4

	pop edx ; dataptr

	lea eax,[edi+ecx] ; eax=data5
	sub edi,ecx ; edi=data3
	mov DCTELEM [ROW(5,edx,SIZEOF_DCTELEM)], ax
	mov DCTELEM [ROW(3,edx,SIZEOF_DCTELEM)], di

	lea ecx,[esi+ebx] ; ecx=data1
	sub esi,ebx ; esi=data7
	mov DCTELEM [ROW(1,edx,SIZEOF_DCTELEM)], cx
	mov DCTELEM [ROW(7,edx,SIZEOF_DCTELEM)], si

	pop ecx ; ctr

	add edx, byte DCTSIZE*SIZEOF_DCTELEM ; advance pointer to next row
	dec ecx ; one row fewer to go
	jnz near .rowloop

	; ---- Pass 2: process columns.
	; Same butterfly as pass 1, applied to the pass-1 results through COL().

	mov ecx, DCTSIZE ; ctr = number of columns
	mov edx, POINTER [data(ebp)] ; (DCTELEM *)
	alignx 16,7
	.columnloop:
	push ecx ; ctr
	push edx ; dataptr

	movsx eax, DCTELEM [COL(0,edx,SIZEOF_DCTELEM)]
	movsx edi, DCTELEM [COL(7,edx,SIZEOF_DCTELEM)]
	lea esi,[eax+edi] ; esi=tmp0
	sub eax,edi ; eax=tmp7
	push eax

	movsx ebx, DCTELEM [COL(1,edx,SIZEOF_DCTELEM)]
	movsx ecx, DCTELEM [COL(6,edx,SIZEOF_DCTELEM)]
	lea edi,[ebx+ecx] ; edi=tmp1
	sub ebx,ecx ; ebx=tmp6
	push ebx

	movsx eax, DCTELEM [COL(2,edx,SIZEOF_DCTELEM)]
	movsx ecx, DCTELEM [COL(5,edx,SIZEOF_DCTELEM)]
	lea ebx,[eax+ecx] ; ebx=tmp2
	sub eax,ecx ; eax=tmp5
	push eax

	movsx ecx, DCTELEM [COL(3,edx,SIZEOF_DCTELEM)]
	movsx eax, DCTELEM [COL(4,edx,SIZEOF_DCTELEM)]
	lea edx,[ecx+eax] ; edx=tmp3 (dataptr in edx is overwritten here)
	sub ecx,eax ; ecx=tmp4
	push ecx

	; -- Even part

	lea eax,[esi+edx] ; eax=tmp10
	lea ecx,[edi+ebx] ; ecx=tmp11
	sub esi,edx ; esi=tmp13
	sub edi,ebx ; edi=tmp12

	; Reload dataptr: it lies just above the four pushed temporaries.
	mov edx, POINTER [esp+16] ; dataptr

	add edi,esi ; edi=tmp12+tmp13
	imul edi,(F_0_707) ; edi=z1
	descale edi,CONST_BITS

	lea ebx,[eax+ecx] ; ebx=data0
	sub eax,ecx ; eax=data4
	mov DCTELEM [COL(0,edx,SIZEOF_DCTELEM)], bx
	mov DCTELEM [COL(4,edx,SIZEOF_DCTELEM)], ax

	lea ecx,[esi+edi] ; ecx=data2
	sub esi,edi ; esi=data6
	mov DCTELEM [COL(2,edx,SIZEOF_DCTELEM)], cx
	mov DCTELEM [COL(6,edx,SIZEOF_DCTELEM)], si

	; -- Odd part

	pop eax ; eax=tmp4
	pop edx ; edx=tmp5
	pop ebx ; ebx=tmp6
	pop edi ; edi=tmp7

	add eax,edx ; eax=tmp10
	add edx,ebx ; edx=tmp11
	add ebx,edi ; ebx=tmp12, edi=tmp7

	imul edx,(F_0_707) ; edx=z3
	descale edx,CONST_BITS
	lea esi,[edi+edx] ; esi=z11
	sub edi,edx ; edi=z13

	mov ecx,eax ; ecx=tmp10
	sub eax,ebx ; eax=tmp10-tmp12
	imul eax,(F_0_382) ; eax=z5
	imul ecx,(F_0_541) ; ecx=MULTIPLY(tmp10,FIX_0_541196100)
	imul ebx,(F_1_306) ; ebx=MULTIPLY(tmp12,FIX_1_306562965)
	descale eax,CONST_BITS
	descale ecx,CONST_BITS
	descale ebx,CONST_BITS
	add ecx,eax ; ecx=z2
	add ebx,eax ; ebx=z4

	pop edx ; dataptr

	lea eax,[edi+ecx] ; eax=data5
	sub edi,ecx ; edi=data3
	mov DCTELEM [COL(5,edx,SIZEOF_DCTELEM)], ax
	mov DCTELEM [COL(3,edx,SIZEOF_DCTELEM)], di

	lea ecx,[esi+ebx] ; ecx=data1
	sub esi,ebx ; esi=data7
	mov DCTELEM [COL(1,edx,SIZEOF_DCTELEM)], cx
	mov DCTELEM [COL(7,edx,SIZEOF_DCTELEM)], si

	pop ecx ; ctr

	add edx, byte SIZEOF_DCTELEM ; advance pointer to next column
	dec ecx ; one column fewer to go
	jnz near .columnloop

	; Restore the saved registers in reverse order of the entry pushes.
	pop edi
	pop esi
	; pop edx ; need not be preserved
	; pop ecx ; need not be preserved
	pop ebx
	pop ebp
	ret

	%endif ; DCT_IFAST_SUPPORTED