| ; |
| ; jidctflt.asm - floating-point IDCT (non-SIMD) |
| ; |
| ; x86 SIMD extension for IJG JPEG library |
| ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
| ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
| ; |
| ; This file should be assembled with NASM (Netwide Assembler), |
| ; can *not* be assembled with Microsoft's MASM or any compatible |
| ; assembler (including Borland's Turbo Assembler). |
| ; NASM is available from http://nasm.sourceforge.net/ or |
| ; http://sourceforge.net/project/showfiles.php?group_id=6208 |
| ; |
| ; This file contains a floating-point implementation of the inverse DCT |
| ; (Discrete Cosine Transform). The following code is based directly on |
| ; the IJG's original jidctflt.c; see the jidctflt.c for more details. |
| ; |
| ; Last Modified : October 17, 2004 |
| ; |
| ; [TAB8] |
| |
| %include "jsimdext.inc" |
| %include "jdct.inc" |
| |
| %ifdef DCT_FLOAT_SUPPORTED |
| |
| ; This module is specialized to the case DCTSIZE = 8. |
| ; |
| %if DCTSIZE != 8 |
| %error "Sorry, this code only copes with 8x8 DCTs." |
| %endif |
| |
| ; -------------------------------------------------------------------------- |
| SECTION SEG_CONST |
| |
| %define ROTATOR_TYPE FP32 ; float |
| |
| alignz 16 |
| global EXTN(jconst_idct_float) |
| |
| EXTN(jconst_idct_float): |
| |
| F_1_414 dd 1.414213562373095048801689 ; 2*cos(PI*1/4) |
| F_1_847 dd 1.847759065022573512256366 ; 2*cos(PI*1/8) |
| F_1_082 dd 1.082392200292393968799446 ; 2*(cos(PI*1/8)-cos(PI*3/8)) |
| F_2_613 dd 2.613125929752753055713286 ; 2*(cos(PI*1/8)+cos(PI*3/8)) |
| |
| alignz 16 |
| |
| ; -------------------------------------------------------------------------- |
| SECTION SEG_TEXT |
| BITS 32 |
| ; |
| ; Perform dequantization and inverse DCT on one block of coefficients. |
| ; |
| ; GLOBAL(void) |
| ; jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr, |
| ; JCOEFPTR coef_block, |
| ; JSAMPARRAY output_buf, JDIMENSION output_col) |
| ; |
| |
| %define cinfo(b) (b)+8 ; j_decompress_ptr cinfo |
| %define compptr(b) (b)+12 ; jpeg_component_info * compptr |
| %define coef_block(b) (b)+16 ; JCOEFPTR coef_block |
| %define output_buf(b) (b)+20 ; JSAMPARRAY output_buf |
| %define output_col(b) (b)+24 ; JDIMENSION output_col |
| |
| %define tmp ebp-SIZEOF_FP64 ; double tmp |
| %define workspace tmp-DCTSIZE2*SIZEOF_FAST_FLOAT |
| ; FAST_FLOAT workspace[DCTSIZE2] |
| %define rndint_magic workspace-SIZEOF_FP32 |
| ; float rndint_magic = 100663296.0F |
| %define gotptr rndint_magic-SIZEOF_POINTER ; void * gotptr |
| |
| align 16 |
| global EXTN(jpeg_idct_float) |
| |
| EXTN(jpeg_idct_float): |
| push ebp |
| mov ebp,esp |
| lea esp, [workspace] |
| push FP32 0x4CC00000 ; (float)(0x00C00000 << 3) |
| pushpic eax ; make a room for GOT address |
| push ebx |
| ; push ecx ; need not be preserved |
| ; push edx ; need not be preserved |
| push esi |
| push edi |
| |
| get_GOT ebx ; get GOT address |
| movpic POINTER [gotptr], ebx ; save GOT address |
| |
| ; ---- Pass 1: process columns from input, store into work array. |
| |
| mov edx, POINTER [compptr(ebp)] |
| mov edx, POINTER [jcompinfo_dct_table(edx)] ; quantptr |
| mov esi, JCOEFPTR [coef_block(ebp)] ; inptr |
| lea edi, [workspace] ; FAST_FLOAT * wsptr |
| mov ecx, DCTSIZE ; ctr |
| alignx 16,7 |
| .columnloop: |
| mov ax, JCOEF [COL(1,esi,SIZEOF_JCOEF)] |
| or ax, JCOEF [COL(2,esi,SIZEOF_JCOEF)] |
| jnz short .columnDCT |
| |
| mov bx, JCOEF [COL(3,esi,SIZEOF_JCOEF)] |
| mov ax, JCOEF [COL(4,esi,SIZEOF_JCOEF)] |
| or bx, JCOEF [COL(5,esi,SIZEOF_JCOEF)] |
| or ax, JCOEF [COL(6,esi,SIZEOF_JCOEF)] |
| or bx, JCOEF [COL(7,esi,SIZEOF_JCOEF)] |
| or ax,bx |
| jnz short .columnDCT |
| |
| ; -- AC terms all zero |
| |
| fild JCOEF [COL(0,esi,SIZEOF_JCOEF)] |
| fmul FLOAT_MULT_TYPE [COL(0,edx,SIZEOF_FLOAT_MULT_TYPE)] |
| |
| fst FAST_FLOAT [COL(0,edi,SIZEOF_FAST_FLOAT)] |
| fst FAST_FLOAT [COL(1,edi,SIZEOF_FAST_FLOAT)] |
| fst FAST_FLOAT [COL(2,edi,SIZEOF_FAST_FLOAT)] |
| fst FAST_FLOAT [COL(3,edi,SIZEOF_FAST_FLOAT)] |
| fst FAST_FLOAT [COL(4,edi,SIZEOF_FAST_FLOAT)] |
| fst FAST_FLOAT [COL(5,edi,SIZEOF_FAST_FLOAT)] |
| fst FAST_FLOAT [COL(6,edi,SIZEOF_FAST_FLOAT)] |
| fstp FAST_FLOAT [COL(7,edi,SIZEOF_FAST_FLOAT)] |
| jmp near .nextcolumn |
| alignx 16,7 |
| |
| .columnDCT: |
| movpic ebx, POINTER [gotptr] ; load GOT address |
| |
| ; -- Even part |
| |
| fild JCOEF [COL(2,esi,SIZEOF_JCOEF)] |
| fild JCOEF [COL(6,esi,SIZEOF_JCOEF)] |
| fild JCOEF [COL(4,esi,SIZEOF_JCOEF)] |
| fild JCOEF [COL(0,esi,SIZEOF_JCOEF)] |
| |
| fxch st0,st3 |
| |
| fmul FLOAT_MULT_TYPE [COL(2,edx,SIZEOF_FLOAT_MULT_TYPE)] |
| fxch st0,st2 |
| fmul FLOAT_MULT_TYPE [COL(6,edx,SIZEOF_FLOAT_MULT_TYPE)] |
| fxch st0,st1 |
| fmul FLOAT_MULT_TYPE [COL(4,edx,SIZEOF_FLOAT_MULT_TYPE)] |
| fxch st0,st3 |
| fmul FLOAT_MULT_TYPE [COL(0,edx,SIZEOF_FLOAT_MULT_TYPE)] |
| fxch st0,st1 |
| |
| fld st2 ; st2 = st2 + st0, st0 = st2 - st0 |
| fsub st0,st1 |
| fxch st0,st1 |
| faddp st3,st0 |
| |
| fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_414)] |
| |
| fld st3 ; st1 = st1 + st3, st3 = st1 - st3 |
| fsubr st0,st2 |
| fxch st0,st4 |
| faddp st2,st0 |
| |
| fsub st0,st2 |
| |
| fld st1 ; st2 = st1 + st2, st1 = st1 - st2 |
| fsub st0,st3 |
| fxch st0,st2 |
| faddp st3,st0 |
| fld st3 ; st0 = st3 + st0, st3 = st3 - st0 |
| fsub st0,st1 |
| fxch st0,st4 |
| faddp st1,st0 |
| |
| ; -- Odd part |
| |
| fild JCOEF [COL(1,esi,SIZEOF_JCOEF)] |
| fild JCOEF [COL(7,esi,SIZEOF_JCOEF)] |
| fild JCOEF [COL(3,esi,SIZEOF_JCOEF)] |
| fild JCOEF [COL(5,esi,SIZEOF_JCOEF)] |
| |
| fxch st0,st3 |
| |
| fmul FLOAT_MULT_TYPE [COL(1,edx,SIZEOF_FLOAT_MULT_TYPE)] |
| fxch st0,st2 |
| fmul FLOAT_MULT_TYPE [COL(7,edx,SIZEOF_FLOAT_MULT_TYPE)] |
| fxch st0,st1 |
| fmul FLOAT_MULT_TYPE [COL(3,edx,SIZEOF_FLOAT_MULT_TYPE)] |
| fxch st0,st6 |
| fxch st3,st0 |
| fmul FLOAT_MULT_TYPE [COL(5,edx,SIZEOF_FLOAT_MULT_TYPE)] |
| fxch st0,st5 |
| fstp FP64 [tmp] |
| |
| fld st1 ; st1 = st1 + st0, st0 = st1 - st0 |
| fsub st0,st1 |
| fxch st0,st1 |
| faddp st2,st0 |
| fld st5 ; st4 = st4 + st5, st5 = st4 - st5 |
| fsubr st0,st5 |
| fxch st0,st6 |
| faddp st5,st0 |
| |
| fld st1 ; st1 = st1 + st4, st4 = st1 - st4 |
| fsub st0,st5 |
| fxch st0,st5 |
| faddp st2,st0 |
| |
| fld st5 |
| fadd st0,st1 |
| fxch st0,st5 |
| fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_414)] |
| fxch st0,st5 |
| fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_847)] |
| fxch st0,st6 |
| fmul ROTATOR_TYPE [GOTOFF(ebx,F_2_613)] |
| fxch st0,st1 |
| fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_082)] |
| fxch st0,st6 |
| fsubr st1,st0 |
| fsubp st6,st0 |
| |
| ; -- Final output stage |
| |
| fsub st0,st1 |
| fld st2 ; st1 = st2 + st1, st2 = st2 - st1 |
| fsub st0,st2 |
| fxch st0,st3 |
| faddp st2,st0 |
| fsub st4,st0 |
| fld st3 ; st0 = st3 + st0, st3 = st3 - st0 |
| fsub st0,st1 |
| fxch st0,st4 |
| faddp st1,st0 |
| |
| fxch st0,st2 |
| |
| fstp FAST_FLOAT [COL(7,edi,SIZEOF_FAST_FLOAT)] |
| fstp FAST_FLOAT [COL(0,edi,SIZEOF_FAST_FLOAT)] |
| fstp FAST_FLOAT [COL(1,edi,SIZEOF_FAST_FLOAT)] |
| fstp FAST_FLOAT [COL(6,edi,SIZEOF_FAST_FLOAT)] |
| |
| fadd st1,st0 |
| fld FP64 [tmp] |
| fld st1 ; st3 = st3 + st1, st1 = st3 - st1 |
| fsubr st0,st4 |
| fxch st0,st2 |
| faddp st4,st0 |
| fld st0 ; st0 = st0 + st2, st2 = st0 - st2 |
| fsub st0,st3 |
| fxch st0,st3 |
| faddp st1,st0 |
| |
| fxch st0,st3 |
| |
| fstp FAST_FLOAT [COL(2,edi,SIZEOF_FAST_FLOAT)] |
| fstp FAST_FLOAT [COL(5,edi,SIZEOF_FAST_FLOAT)] |
| fstp FAST_FLOAT [COL(3,edi,SIZEOF_FAST_FLOAT)] |
| fstp FAST_FLOAT [COL(4,edi,SIZEOF_FAST_FLOAT)] |
| |
| .nextcolumn: |
| add esi, byte SIZEOF_JCOEF ; advance pointers to next column |
| add edx, byte SIZEOF_FLOAT_MULT_TYPE |
| add edi, byte SIZEOF_FAST_FLOAT |
| dec ecx |
| jnz near .columnloop |
| |
| ; ---- Pass 2: process rows from work array, store into output array. |
| |
| mov edx, POINTER [cinfo(ebp)] |
| mov edx, POINTER [jdstruct_sample_range_limit(edx)] |
| sub edx, byte -CENTERJSAMPLE*SIZEOF_JSAMPLE ; JSAMPLE * range_limit |
| |
| lea esi, [workspace] ; FAST_FLOAT * wsptr |
| mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *) |
| mov ecx, DCTSIZE ; ctr |
| alignx 16,7 |
| .rowloop: |
| push edi |
| mov edi, JSAMPROW [edi] ; (JSAMPLE *) |
| add edi, JDIMENSION [output_col(ebp)] ; edi=outptr |
| |
| %ifndef NO_ZERO_ROW_TEST_FLOAT |
| mov eax, FAST_FLOAT [ROW(1,esi,SIZEOF_FAST_FLOAT)] |
| add eax,eax ; shl eax,1 (shift out the sign bit) |
| jnz short .rowDCT |
| |
| mov eax, FAST_FLOAT [ROW(2,esi,SIZEOF_FAST_FLOAT)] |
| mov ebx, FAST_FLOAT [ROW(3,esi,SIZEOF_FAST_FLOAT)] |
| or eax, FAST_FLOAT [ROW(4,esi,SIZEOF_FAST_FLOAT)] |
| or ebx, FAST_FLOAT [ROW(5,esi,SIZEOF_FAST_FLOAT)] |
| or eax, FAST_FLOAT [ROW(6,esi,SIZEOF_FAST_FLOAT)] |
| or ebx, FAST_FLOAT [ROW(7,esi,SIZEOF_FAST_FLOAT)] |
| or eax,ebx |
| add eax,eax ; shl eax,1 (shift out the sign bit) |
| jnz short .rowDCT |
| |
| ; -- AC terms all zero |
| |
| push eax |
| |
| fld FAST_FLOAT [ROW(0,esi,SIZEOF_FAST_FLOAT)] |
| fadd FP32 [rndint_magic] |
| fstp FP32 [esp] |
| |
| pop eax |
| and eax,RANGE_MASK |
| mov al, JSAMPLE [edx+eax*SIZEOF_JSAMPLE] |
| mov JSAMPLE [edi+0*SIZEOF_JSAMPLE], al |
| mov JSAMPLE [edi+1*SIZEOF_JSAMPLE], al |
| mov JSAMPLE [edi+2*SIZEOF_JSAMPLE], al |
| mov JSAMPLE [edi+3*SIZEOF_JSAMPLE], al |
| mov JSAMPLE [edi+4*SIZEOF_JSAMPLE], al |
| mov JSAMPLE [edi+5*SIZEOF_JSAMPLE], al |
| mov JSAMPLE [edi+6*SIZEOF_JSAMPLE], al |
| mov JSAMPLE [edi+7*SIZEOF_JSAMPLE], al |
| jmp near .nextrow |
| alignx 16,7 |
| %endif |
| .rowDCT: |
| movpic ebx, POINTER [gotptr] ; load GOT address |
| |
| ; -- Even part |
| |
| fld FAST_FLOAT [ROW(4,esi,SIZEOF_FAST_FLOAT)] |
| fld FAST_FLOAT [ROW(2,esi,SIZEOF_FAST_FLOAT)] |
| fld FAST_FLOAT [ROW(0,esi,SIZEOF_FAST_FLOAT)] |
| fld FAST_FLOAT [ROW(6,esi,SIZEOF_FAST_FLOAT)] |
| |
| fld st2 ; st2 = st2 + st0, st0 = st2 - st0 |
| fsub st0,st1 |
| fxch st0,st1 |
| faddp st3,st0 |
| |
| fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_414)] |
| |
| fld st3 ; st1 = st1 + st3, st3 = st1 - st3 |
| fsubr st0,st2 |
| fxch st0,st4 |
| faddp st2,st0 |
| |
| fsub st0,st2 |
| |
| fld st1 ; st2 = st1 + st2, st1 = st1 - st2 |
| fsub st0,st3 |
| fxch st0,st2 |
| faddp st3,st0 |
| fld st3 ; st0 = st3 + st0, st3 = st3 - st0 |
| fsub st0,st1 |
| fxch st0,st4 |
| faddp st1,st0 |
| |
| ; -- Odd part |
| |
| fld FAST_FLOAT [ROW(3,esi,SIZEOF_FAST_FLOAT)] |
| fxch st0,st3 |
| fld FAST_FLOAT [ROW(1,esi,SIZEOF_FAST_FLOAT)] |
| fld FAST_FLOAT [ROW(7,esi,SIZEOF_FAST_FLOAT)] |
| fld FAST_FLOAT [ROW(5,esi,SIZEOF_FAST_FLOAT)] |
| fxch st0,st5 |
| fstp FP64 [tmp] |
| |
| fld st1 ; st1 = st1 + st0, st0 = st1 - st0 |
| fsub st0,st1 |
| fxch st0,st1 |
| faddp st2,st0 |
| fld st5 ; st4 = st4 + st5, st5 = st4 - st5 |
| fsubr st0,st5 |
| fxch st0,st6 |
| faddp st5,st0 |
| |
| fld st1 ; st1 = st1 + st4, st4 = st1 - st4 |
| fsub st0,st5 |
| fxch st0,st5 |
| faddp st2,st0 |
| |
| fld st5 |
| fadd st0,st1 |
| fxch st0,st5 |
| fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_414)] |
| fxch st0,st5 |
| fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_847)] |
| fxch st0,st6 |
| fmul ROTATOR_TYPE [GOTOFF(ebx,F_2_613)] |
| fxch st0,st1 |
| fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_082)] |
| fxch st0,st6 |
| fsubr st1,st0 |
| fsubp st6,st0 |
| |
| ; -- Final output stage |
| |
| sub esp, byte DCTSIZE*SIZEOF_FP32 |
| |
| fsub st0,st1 |
| fld st2 ; st1 = st2 + st1, st2 = st2 - st1 |
| fsub st0,st2 |
| fxch st0,st3 |
| faddp st2,st0 |
| fsub st4,st0 |
| fld st3 ; st0 = st3 + st0, st3 = st3 - st0 |
| fsub st0,st1 |
| fxch st0,st4 |
| faddp st1,st0 |
| |
| fld FP32 [rndint_magic] |
| |
| fadd st4,st0 |
| fadd st1,st0 |
| fadd st2,st0 |
| fadd st3,st0 |
| |
| fxch st0,st4 |
| |
| fstp FP32 [esp+6*SIZEOF_FP32] |
| fstp FP32 [esp+1*SIZEOF_FP32] |
| fstp FP32 [esp+0*SIZEOF_FP32] |
| fstp FP32 [esp+7*SIZEOF_FP32] |
| |
| fxch st0,st1 |
| |
| fadd st2,st0 |
| fld FP64 [tmp] |
| fld st1 ; st4 = st4 + st1, st1 = st4 - st1 |
| fsubr st0,st5 |
| fxch st0,st2 |
| faddp st5,st0 |
| fld st0 ; st0 = st0 + st3, st3 = st0 - st3 |
| fsub st0,st4 |
| fxch st0,st4 |
| faddp st1,st0 |
| |
| fxch st0,st2 |
| |
| fadd st1,st0 |
| fadd st2,st0 |
| fadd st3,st0 |
| faddp st4,st0 |
| |
| fstp FP32 [esp+5*SIZEOF_FP32] |
| fstp FP32 [esp+4*SIZEOF_FP32] |
| fstp FP32 [esp+3*SIZEOF_FP32] |
| fstp FP32 [esp+2*SIZEOF_FP32] |
| |
| %assign i 0 ; i=0; |
| %rep 4 ; -- repeat 4 times --- |
| pop eax |
| pop ebx |
| and eax,RANGE_MASK |
| and ebx,RANGE_MASK |
| mov al, JSAMPLE [edx+eax*SIZEOF_JSAMPLE] |
| mov bl, JSAMPLE [edx+ebx*SIZEOF_JSAMPLE] |
| mov JSAMPLE [edi+(i+0)*SIZEOF_JSAMPLE], al |
| mov JSAMPLE [edi+(i+1)*SIZEOF_JSAMPLE], bl |
| %assign i i+2 ; i+=2; |
| %endrep ; -- repeat end --- |
| |
| .nextrow: |
| pop edi |
| add esi, byte DCTSIZE*SIZEOF_FAST_FLOAT |
| add edi, byte SIZEOF_JSAMPROW ; advance pointer to next row |
| dec ecx |
| jnz near .rowloop |
| |
| pop edi |
| pop esi |
| ; pop edx ; need not be preserved |
| ; pop ecx ; need not be preserved |
| pop ebx |
| mov esp,ebp |
| pop ebp |
| ret |
| |
| %endif ; DCT_FLOAT_SUPPORTED |