;
; jsimdext.inc - common declarations
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2010 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library - version 1.02
;
; Copyright (C) 1999-2006, MIYASAKA Masaru.
;
; This software is provided 'as-is', without any express or implied
; warranty.  In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
;    claim that you wrote the original software. If you use this software
;    in a product, an acknowledgment in the product documentation would be
;    appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
;    misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
;
; [TAB8]

; ==========================================================================
;  System-dependent configurations

; Exactly one of the branches below is selected by a -D<FORMAT> symbol given
; on the assembler command line.  Every branch defines SEG_TEXT and SEG_CONST
; (the section names/attributes for code and read-only data).  A branch may
; also define:
;   EXTN(name)  - how a C symbol name is spelled in the object file; branches
;                 that leave it undefined get the leading-underscore default
;                 from the "External Symbol Name" section further down.
;   GOT_SYMBOL  - base symbol used by the PIC macros; branches that leave it
;                 undefined have PIC forcibly disabled further down.
;
%ifdef WIN32	; ----(nasm -fwin32 -DWIN32 ...)--------
; * Microsoft Visual C++
; * MinGW (Minimalist GNU for Windows)
; * CygWin
; * LCC-Win32

; -- segment definition --
;
%define SEG_TEXT    .text  align=16 public use32 class=CODE
%define SEG_CONST   .rdata align=16 public use32 class=CONST

%elifdef WIN64	; ----(nasm -fwin64 -DWIN64 ...)--------
; * Microsoft Visual C++

; -- segment definition --
;
%define SEG_TEXT    .text  align=16 public use64 class=CODE
%define SEG_CONST   .rdata align=16 public use64 class=CONST
%define EXTN(name)  name			; foo() -> foo

%elifdef OBJ32	; ----(nasm -fobj -DOBJ32 ...)----------
; * Borland C++ (Win32)

; -- segment definition --
;
%define SEG_TEXT    .text  align=16 public use32 class=CODE
%define SEG_CONST   .data  align=16 public use32 class=DATA

%elifdef ELF	; ----(nasm -felf[64] -DELF ...)------------
; * Linux
; * *BSD family Unix using elf format
; * Unix System V, including Solaris x86, UnixWare and SCO Unix

; mark stack as non-executable
section .note.GNU-stack noalloc noexec nowrite progbits

; -- segment definition --
;
%ifdef __x86_64__
%define SEG_TEXT    .text   progbits align=16
%define SEG_CONST   .rodata progbits align=16
%else
%define SEG_TEXT    .text   progbits alloc exec   nowrite align=16
%define SEG_CONST   .rodata progbits alloc noexec nowrite align=16
%endif

; NOTE(review): STRICT_MEMORY_ACCESS is defined only for ELF builds and is
; not referenced anywhere in this file; presumably the SIMD sources that
; include this file test it -- confirm that ELF-only is intended.
%define STRICT_MEMORY_ACCESS 1

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_	; ELF supports PIC
%define EXTN(name)  name			; foo() -> foo

%elifdef AOUT	; ----(nasm -faoutb/aout -DAOUT ...)----
; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)

; -- segment definition --
;
%define SEG_TEXT    .text
%define SEG_CONST   .data

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_	; BSD-style a.out supports PIC

%elifdef MACHO	; ----(nasm -fmacho -DMACHO ...)--------
; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)

; -- segment definition --
;
%define SEG_TEXT    .text  ;align=16	; nasm doesn't accept align=16. why?
%define SEG_CONST   .rodata align=16

; The generation of position-independent code (PIC) is the default on Darwin.
; PIC is therefore switched on unconditionally here, and GOT_SYMBOL is set to
; the marker value that selects the Mach-O flavour of get_GOT/GOTOFF below.
;
%define PIC
%define GOT_SYMBOL  _MACHO_PIC_		; Mach-O style code-relative addressing

%else		; ----(Other case)----------------------

; No recognised object format was requested: fall back to plain section
; names, the default (underscore-prefixed) EXTN, and no PIC support.

; -- segment definition --
;
%define SEG_TEXT    .text
%define SEG_CONST   .data

%endif	; ----------------------------------------------

; ==========================================================================

; --------------------------------------------------------------------------
;  Common types
;
; Sizes of the primitive storage units, in bytes.
%define SIZEOF_BYTE		1		; sizeof(BYTE)
%define SIZEOF_WORD		2		; sizeof(WORD)
%define SIZEOF_DWORD		4		; sizeof(DWORD)
%define SIZEOF_QWORD		8		; sizeof(QWORD)
%define SIZEOF_OWORD		16		; sizeof(OWORD)

; Widths of the primitive storage units, in bits.
%define BYTE_BIT		8		; CHAR_BIT in C
%define WORD_BIT		16		; sizeof(WORD)*BYTE_BIT
%define DWORD_BIT		32		; sizeof(DWORD)*BYTE_BIT
%define QWORD_BIT		64		; sizeof(QWORD)*BYTE_BIT
%define OWORD_BIT		128		; sizeof(OWORD)*BYTE_BIT

; General pointer type: 64 bits wide when assembling for x86-64,
; 32 bits wide otherwise.
%ifdef __x86_64__
%define POINTER			qword
%define SIZEOF_POINTER		SIZEOF_QWORD	; sizeof(POINTER)
%define POINTER_BIT		QWORD_BIT	; sizeof(POINTER)*BYTE_BIT
%else
%define POINTER			dword
%define SIZEOF_POINTER		SIZEOF_DWORD	; sizeof(POINTER)
%define POINTER_BIT		DWORD_BIT	; sizeof(POINTER)*BYTE_BIT
%endif

; Signed integer type.
%define INT			dword
%define SIZEOF_INT		SIZEOF_DWORD	; sizeof(INT)
%define INT_BIT			DWORD_BIT	; sizeof(INT)*BYTE_BIT

; IEEE754 single-precision float.
%define FP32			dword
%define SIZEOF_FP32		SIZEOF_DWORD	; sizeof(FP32)
%define FP32_BIT		DWORD_BIT	; sizeof(FP32)*BYTE_BIT

; 64-bit operand held in an MMX register.
%define MMWORD			qword
%define SIZEOF_MMWORD		SIZEOF_QWORD	; sizeof(MMWORD)
%define MMWORD_BIT		QWORD_BIT	; sizeof(MMWORD)*BYTE_BIT

; 128-bit operand held in an SSE register.
; XMMWORD deliberately expands to nothing: NASM does not handle operand-size
; keywords on SSE instructions properly, so no size override is emitted.
%define XMMWORD
%define SIZEOF_XMMWORD		SIZEOF_OWORD	; sizeof(XMMWORD)
%define XMMWORD_BIT		OWORD_BIT	; sizeof(XMMWORD)*BYTE_BIT

; Same workaround for dword / MMWORD memory operands that are loaded into
; an xmm# register: both size keywords expand to nothing.
%define XMM_DWORD
%define XMM_MMWORD

; --------------------------------------------------------------------------
;  External Symbol Name
;
; EXTN(name) yields the object-file spelling of the C symbol `name`.
; The system-dependent section above defines it as the bare name for the
; WIN64 and ELF targets; every other target reaches this fallback, which
; pastes a leading underscore onto the name with the %+ operator.
%ifndef EXTN
%define EXTN(name)   _ %+ name		; foo() -> _foo
%endif

; --------------------------------------------------------------------------
;  Macros for position-independent code (PIC) support
;
; This section provides the same five names in every configuration:
;   GOTOFF(got,sym)  - address expression for constant `sym`, given the
;                      register `got` that get_GOT loaded
;   get_GOT reg      - load the PIC base address into `reg`
;   pushpic / poppic / movpic - push / pop / mov that are emitted only in
;                      PIC builds and expand to nothing otherwise
;
; PIC needs a base symbol; object formats that did not define GOT_SYMBOL
; above cannot support it, so a user-supplied -DPIC is silently dropped.
%ifndef GOT_SYMBOL
%undef PIC
%endif

%ifdef PIC ; -------------------------------------------

%ifidn GOT_SYMBOL,_MACHO_PIC_ ; --------------------

; At present, nasm doesn't seem to support PIC generation for Mach-O.
; The PIC support code below is a little tricky.
;
; Instead of a real GOT, the start of the constant section (const_base) is
; used as the base address, and constants are addressed relative to it.

	SECTION	SEG_CONST
const_base:

%define GOTOFF(got,sym) (got) + (sym) - const_base

; get_GOT reg  (Mach-O flavour)
;   Loads the run-time address of const_base into `reg`.
;   Clobbers ecx (even when `reg` is ebx) and the flags; ebp is saved and
;   restored around the sequence.
;
;   The call/ret pair fetches the return address into ecx, and the `add`
;   advances it to the address of the local label %%ref.  The distance
;   (const_base - %%ref) is obtained by letting the assembler encode a
;   `jmp near const_base` whose rel32 field is exactly that distance; the two
;   `db` bytes placed in front of it turn the combined byte string into a
;   single `lea` instruction (see the inline comments), so the jmp is never
;   executed as a jump.  ebp is zeroed first because it appears as the index
;   register in the resulting lea.
%imacro get_GOT	1
	; NOTE: this macro destroys ecx register.
	call	%%geteip
	add	ecx, byte (%%ref - $)	; ecx = address of %%ref
	jmp	short %%adjust
%%geteip:
	mov	ecx, POINTER [esp]	; ecx = return address of the call
	ret
%%adjust:
	push	ebp
	xor	ebp,ebp		; ebp = 0
%ifidni %1,ebx	; (%1 == ebx)
	; db 0x8D,0x9C + jmp near const_base =
	;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
	db	0x8D,0x9C		; 8D,9C
	jmp	near const_base		; E9,(const_base-%%ref)
%%ref:
%else  ; (%1 != ebx)
	; db 0x8D,0x8C + jmp near const_base =
	;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
	db	0x8D,0x8C		; 8D,8C
	jmp	near const_base		; E9,(const_base-%%ref)
%%ref:	mov	%1, ecx
%endif ; (%1 == ebx)
	pop	ebp
%endmacro

%else	; GOT_SYMBOL != _MACHO_PIC_ ----------------

%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff

; get_GOT reg  (ELF / a.out flavour)
;   Loads the address of the global offset table into `reg`.
;   The call/ret pair copies the return address (the address of the `add`
;   instruction) into `reg`; the `add` then applies the assembler-computed
;   `..gotpc` displacement from that point to GOT_SYMBOL.
;   Clobbers the flags.
%imacro get_GOT	1
	extern	GOT_SYMBOL
	call	%%geteip
	add	%1, GOT_SYMBOL + $$ - $ wrt ..gotpc
	jmp	short %%done
%%geteip:
	mov	%1, POINTER [esp]	; reg = return address of the call
	ret
%%done:
%endmacro

%endif	; GOT_SYMBOL == _MACHO_PIC_ ----------------

; In PIC builds these are plain push / pop / mov.
%imacro pushpic	1.nolist
	push	%1
%endmacro
%imacro poppic	1.nolist
	pop	%1
%endmacro
%imacro movpic	2.nolist
	mov	%1,%2
%endmacro

%else	; !PIC -----------------------------------------

; Without PIC a symbol is addressed directly and the `got` argument is
; ignored; all four helper macros expand to nothing.
%define GOTOFF(got,sym) (sym)

%imacro get_GOT	1.nolist
%endmacro
%imacro pushpic	1.nolist
%endmacro
%imacro poppic	1.nolist
%endmacro
%imacro movpic	2.nolist
%endmacro

%endif	;  PIC -----------------------------------------

; --------------------------------------------------------------------------
;  Align the next instruction on {2,4,8,16,..}-byte boundary.
;  ".balign n,,m" in GNU as
;
; MSKLE(x,y): "mask if less-or-equal".  Both operands are reduced to 16 bits.
;   The result is a non-zero mask with its low bits all set when x <= y, and
;   0 when x > y, so ANDing it with a small count either keeps the count or
;   cancels it.  (Relies on the assembler's unsigned right shift.)
;
; FILLB(b,n): number of bytes needed at address `b` to reach the next n-byte
;   boundary, measured from the start of the current section ($$).
;   `n` must be a power of two.
;
%define MSKLE(x,y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
%define FILLB(b,n)  (($$-(b)) & ((n)-1))

; alignx boundary [, max_fill]
;   %1 = alignment boundary in bytes (power of two)
;   %2 = largest amount of padding that may be inserted (default 0xFFFF);
;        when the padding required at the macro's start exceeds %2, every
;        `times` count below evaluates to 0 and nothing is emitted.
;
;   The padding is built from as few instructions as possible:
;     * 16 bytes or more remaining: filled entirely with single-byte nops;
;     * otherwise the 7-, 6-, 4-, 3-, 2- and 1-byte fillers below are tried
;       in that order.  Each `times` count re-evaluates the remaining
;       distance at `$`, i.e. after the fillers already emitted.
;
;   NOTE(review): the multi-byte fillers are 32-bit encodings that write
;   ebx/ebp.  In 64-bit mode a 32-bit register write clears the upper half
;   of the full register, so these would not be no-ops for rbx/rbp there --
;   confirm that alignx padding is never executed in 64-bit code paths.
;
%imacro alignx 1-2.nolist 0xFFFF
%%bs:	times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
	       db 0x90                               ; nop
	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
	       db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]
	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
	       db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
	       db 0x8D,0xAD,0x00,0x00,0x00,0x00      ; lea ebp,[ebp+0x00000000]
	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
	       db 0x8D,0x6C,0x25,0x00                ; lea ebp,[ebp+0x00]
	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
	       db 0x8D,0x6D,0x00                     ; lea ebp,[ebp+0x00]
	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
	       db 0x8B,0xED                          ; mov ebp,ebp
	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
	       db 0x90                               ; nop
%endmacro

; Align the next data on {2,4,8,16,..}-byte boundary.
;
; alignz boundary
;   %1 = alignment boundary in bytes.
;   Thin wrapper around the assembler's `align` that pads with zero bytes,
;   which makes it suitable for data rather than code.
%imacro alignz 1.nolist
	align %1, db 0		; filling zeros
%endmacro

%ifdef __x86_64__

%ifdef WIN64

; collect_args  (Microsoft x64 ABI)
;   Gathers up to six integer/pointer arguments into r10-r15 so that the
;   function body can use the same registers as the System V version of this
;   macro:
;       r10 = arg1 (rcx)    r11 = arg2 (rdx)
;       r12 = arg3 (r8)     r13 = arg4 (r9)
;       r14 = arg5 [rax+48] r15 = arg6 [rax+56]
;   It also saves the registers that are callee-saved on Win64 and that the
;   function body is expected to use: r12-r15, rsi, rdi, xmm6 and xmm7.
;
;   NOTE(review): arguments 5 and 6 are read relative to rax, which is not
;   set here.  The offsets are consistent with rax holding the value of rsp
;   taken right after the function's `push rbp` (saved rbp, return address,
;   32 bytes of shadow space, then the stack arguments) -- the expansion
;   site must establish that before invoking this macro; verify in callers.
;
;   xmm6/xmm7 are saved with movups so that all 128 bits are preserved, as
;   the ABI requires (the previous movlpd only covered the low 64 bits, so
;   the upper halves were returned corrupted to the caller).  movups is used
;   rather than movaps so the macro does not depend on rsp being 16-byte
;   aligned at the expansion site.
;
;   Stack usage: 6 * 8 + 2 * SIZEOF_XMMWORD bytes.
%imacro collect_args 0
	push	r12
	push	r13
	push	r14
	push	r15
	mov	r10, rcx
	mov	r11, rdx
	mov	r12, r8
	mov	r13, r9
	mov	r14, [rax+48]
	mov	r15, [rax+56]
	push	rsi
	push	rdi
	sub	rsp, SIZEOF_XMMWORD
	movups	XMMWORD [rsp], xmm6	; save all 128 bits
	sub	rsp, SIZEOF_XMMWORD
	movups	XMMWORD [rsp], xmm7	; save all 128 bits
%endmacro

; uncollect_args  (Microsoft x64 ABI)
;   Exact inverse of collect_args: restores xmm7, xmm6 (full 128 bits),
;   rdi, rsi and r15-r12, in that order.  rsp must hold the same value it had
;   at the end of collect_args.  r10 and r11 are not restored; they are
;   volatile under this ABI.
%imacro uncollect_args 0
	movups	xmm7, XMMWORD [rsp]	; restore all 128 bits
	add	rsp, SIZEOF_XMMWORD
	movups	xmm6, XMMWORD [rsp]	; restore all 128 bits
	add	rsp, SIZEOF_XMMWORD
	pop	rdi
	pop	rsi
	pop	r15
	pop	r14
	pop	r13
	pop	r12
%endmacro

%else

; collect_args  (System V AMD64 ABI)
;   Saves r10-r15 on the stack, then copies the six integer/pointer argument
;   registers into them so the function body sees the same layout as on
;   Win64:
;       r10 = arg1 (rdi)    r11 = arg2 (rsi)
;       r12 = arg3 (rdx)    r13 = arg4 (rcx)
;       r14 = arg5 (r8)     r15 = arg6 (r9)
;   Stack usage: 6 * 8 bytes.
%imacro collect_args 0
	push	r10
	push	r11
	push	r12
	push	r13
	push	r14
	push	r15
	mov	r10, rdi		; arg1
	mov	r11, rsi		; arg2
	mov	r12, rdx		; arg3
	mov	r13, rcx		; arg4
	mov	r14, r8			; arg5
	mov	r15, r9			; arg6
%endmacro

; uncollect_args  (System V AMD64 ABI)
;   Pops r15 down to r10, undoing the pushes made by collect_args.  rsp must
;   hold the same value it had at the end of collect_args.
%imacro uncollect_args 0
	pop	r15
	pop	r14
	pop	r13
	pop	r12
	pop	r11
	pop	r10
%endmacro

%endif

%endif

; --------------------------------------------------------------------------
;  Defines picked up from the C headers
;
%include "jsimdcfg.inc"

; --------------------------------------------------------------------------
