;
; jsimdext.inc - common declarations
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library - version 1.02
;
; Copyright (C) 1999-2006, MIYASAKA Masaru.
;
; This software is provided 'as-is', without any express or implied
; warranty.  In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
;    claim that you wrote the original software. If you use this software
;    in a product, an acknowledgment in the product documentation would be
;    appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
;    misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
;
; [TAB8]

; ==========================================================================
;  System-dependent configurations
;
;  One branch of the chain below is selected by a symbol passed on the nasm
;  command line (-DWIN32, -DOBJ32, -DELF, -DAOUT or -DMACHO); the final %else
;  is the fallback when none of them is defined.
;
;  Every branch defines:
;    SEG_TEXT   - section name + attributes used for code
;    SEG_CONST  - section name + attributes used for constant data
;  Some branches additionally define:
;    GOT_SYMBOL - symbol used by the PIC macros (ELF, a.out, Mach-O)
;    EXTN(name) - external symbol decoration (ELF only; no underscore)
;    PIC        - forced on for Mach-O

%ifdef WIN32    ; ----(nasm -fwin32 -DWIN32 ...)--------
; * Microsoft Visual C++
; * MinGW (Minimalist GNU for Windows)
; * CygWin
; * LCC-Win32

; -- segment definition --
;
%define SEG_TEXT    .text align=16 public use32 class=CODE
%define SEG_CONST   .rdata align=16 public use32 class=CONST

%elifdef OBJ32  ; ----(nasm -fobj -DOBJ32 ...)----------
; * Borland C++ (Win32)

; -- segment definition --
;
%define SEG_TEXT    .text align=16 public use32 class=CODE
%define SEG_CONST   .data align=16 public use32 class=DATA

%elifdef ELF    ; ----(nasm -felf[64] -DELF ...)------------
; * Linux
; * *BSD family Unix using elf format
; * Unix System V, including Solaris x86, UnixWare and SCO Unix

; mark stack as non-executable
section .note.GNU-stack noalloc noexec nowrite progbits

; -- segment definition --
;
%ifdef __x86_64__
%define SEG_TEXT    .text progbits align=16
%define SEG_CONST   .rodata progbits align=16
%else
%define SEG_TEXT    .text progbits alloc exec nowrite align=16
%define SEG_CONST   .rodata progbits alloc noexec nowrite align=16
%endif

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_   ; ELF supports PIC
%define EXTN(name)  name                    ; foo() -> foo

%elifdef AOUT   ; ----(nasm -faoutb/aout -DAOUT ...)----
; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)

; -- segment definition --
;
%define SEG_TEXT    .text
%define SEG_CONST   .data

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_  ; BSD-style a.out supports PIC

%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)

; -- segment definition --
;
%define SEG_TEXT    .text ;align=16 ; nasm doesn't accept align=16. why?
%define SEG_CONST   .rodata align=16

; The generation of position-independent code (PIC) is the default on Darwin.
;
%define PIC
%define GOT_SYMBOL  _MACHO_PIC_             ; Mach-O style code-relative addressing

%else           ; ----(Other case)----------------------

; -- segment definition --
;
%define SEG_TEXT    .text
%define SEG_CONST   .data

%endif  ; ----------------------------------------------

; ==========================================================================

; --------------------------------------------------------------------------
;  Common types
;
;  Each logical type gets three single-line macros:
;    <TYPE>          - NASM operand-size keyword (may be blank, see XMMWORD)
;    SIZEOF_<TYPE>   - size in bytes
;    <TYPE>_BIT      - size in bits
;
;  The SIZEOF_xWORD / xWORD_BIT primitives are defined at the end of this
;  section, after their first textual use. That is fine: %define bodies are
;  expanded when the macro is used, not when it is defined.
;
%ifdef __x86_64__
%define POINTER         qword           ; general pointer type
%define SIZEOF_POINTER  SIZEOF_QWORD    ; sizeof(POINTER)
%define POINTER_BIT     QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%else
%define POINTER         dword           ; general pointer type
%define SIZEOF_POINTER  SIZEOF_DWORD    ; sizeof(POINTER)
%define POINTER_BIT     DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%endif

%define INT             dword           ; signed integer type
%define SIZEOF_INT      SIZEOF_DWORD    ; sizeof(INT)
%define INT_BIT         DWORD_BIT       ; sizeof(INT)*BYTE_BIT

%define FP32            dword           ; IEEE754 single
%define SIZEOF_FP32     SIZEOF_DWORD    ; sizeof(FP32)
%define FP32_BIT        DWORD_BIT       ; sizeof(FP32)*BYTE_BIT

%define MMWORD          qword           ; int64  (MMX register)
%define SIZEOF_MMWORD   SIZEOF_QWORD    ; sizeof(MMWORD)
%define MMWORD_BIT      QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT

; NASM is buggy and doesn't properly handle operand sizes for SSE
; instructions, so for now we have to define XMMWORD as blank.
%define XMMWORD                         ; int128  (SSE register)
%define SIZEOF_XMMWORD  SIZEOF_OWORD    ; sizeof(XMMWORD)
%define XMMWORD_BIT     OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT

; Similar hacks for when we load a dword or MMWORD into an xmm# register
%define XMM_DWORD
%define XMM_MMWORD

%define SIZEOF_BYTE     1               ; sizeof(BYTE)
%define SIZEOF_WORD     2               ; sizeof(WORD)
%define SIZEOF_DWORD    4               ; sizeof(DWORD)
%define SIZEOF_QWORD    8               ; sizeof(QWORD)
%define SIZEOF_OWORD    16              ; sizeof(OWORD)

%define BYTE_BIT        8               ; CHAR_BIT in C
%define WORD_BIT        16              ; sizeof(WORD)*BYTE_BIT
%define DWORD_BIT       32              ; sizeof(DWORD)*BYTE_BIT
%define QWORD_BIT       64              ; sizeof(QWORD)*BYTE_BIT
%define OWORD_BIT       128             ; sizeof(OWORD)*BYTE_BIT

; --------------------------------------------------------------------------
;  External Symbol Name
;
;  EXTN(name) produces the object-file spelling of a C-visible symbol.
;  If the system-dependent section has not already defined EXTN, fall back
;  to prefixing the name with a single underscore.
;
%ifndef EXTN
%define EXTN(name)  _ %+ name           ; foo() -> _foo
%endif

; --------------------------------------------------------------------------
;  Macros for position-independent code (PIC) support
;
;  Interface provided in every configuration:
;    GOTOFF(got,sym) - address expression for `sym` given base register `got`
;    get_GOT reg     - load the PIC base into `reg`
;    pushpic reg     - push `reg`       (expands to nothing when !PIC)
;    poppic  reg     - pop  `reg`       (expands to nothing when !PIC)
;    movpic  dst,src - mov  dst,src     (expands to nothing when !PIC)
;
;  PIC is only honoured when the selected object format defined GOT_SYMBOL;
;  otherwise a -DPIC given on the command line is cancelled here.
;
%ifndef GOT_SYMBOL
%undef  PIC
%endif

%ifdef PIC      ; -------------------------------------------

%ifidn GOT_SYMBOL,_MACHO_PIC_   ; --------------------

; At present, nasm doesn't seem to support PIC generation for Mach-O.
; The PIC support code below is a little tricky.

        SECTION SEG_CONST
const_base:

; On Mach-O the "GOT" register holds the run-time address of const_base, so
; a symbol is addressed by its offset from const_base.
%define GOTOFF(got,sym) (got) + (sym) - const_base

; get_GOT reg  (Mach-O flavour)
;   Out:      reg = run-time address of const_base
;   Clobbers: ecx (always), flags
;   ebp is saved and restored around the computation.
%imacro get_GOT 1
        ; NOTE: this macro destroys ecx register.
        call    %%geteip                ; ecx = address of the following `add`
        add     ecx, byte (%%ref - $)   ; ecx = run-time address of %%ref
        jmp     short %%adjust
%%geteip:
        mov     ecx, POINTER [esp]      ; fetch the return address
        ret
%%adjust:
        push    ebp
        xor     ebp,ebp                 ; ebp = 0
%ifidni %1,ebx  ; (%1 == ebx)
        ; db 0x8D,0x9C + jmp near const_base =
        ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
        db      0x8D,0x9C               ; 8D,9C
        jmp     near const_base         ; E9,(const_base-%%ref)
%%ref:
%else  ; (%1 != ebx)
        ; db 0x8D,0x8C + jmp near const_base =
        ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
        db      0x8D,0x8C               ; 8D,8C
        jmp     near const_base         ; E9,(const_base-%%ref)
%%ref:  mov     %1, ecx
%endif ; (%1 == ebx)
        pop     ebp
%endmacro

%else   ; GOT_SYMBOL != _MACHO_PIC_ ----------------

%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff

; get_GOT reg  (GOT-based flavour)
;   Out:      reg = address of GOT_SYMBOL
;   Clobbers: flags
%imacro get_GOT 1
        extern  GOT_SYMBOL
        call    %%geteip                ; %1 = address of the following `add`
        add     %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
        jmp     short %%done
%%geteip:
        mov     %1, POINTER [esp]       ; fetch the return address
        ret
%%done:
%endmacro

%endif  ; GOT_SYMBOL == _MACHO_PIC_ ----------------

; Helpers that emit an instruction only in PIC builds.
%imacro pushpic 1.nolist
        push    %1
%endmacro
%imacro poppic  1.nolist
        pop     %1
%endmacro
%imacro movpic  2.nolist
        mov     %1,%2
%endmacro

%else   ; !PIC -----------------------------------------

; Non-PIC build: symbols are addressed directly and the helpers emit nothing.
%define GOTOFF(got,sym) (sym)

%imacro get_GOT 1.nolist
%endmacro
%imacro pushpic 1.nolist
%endmacro
%imacro poppic  1.nolist
%endmacro
%imacro movpic  2.nolist
%endmacro

%endif  ;  PIC -----------------------------------------

; --------------------------------------------------------------------------
;  Align the next instruction on {2,4,8,16,..}-byte boundary.
;  ".balign n,,m" in GNU as
;
;  alignx n [, m]
;    %1 (n) - alignment boundary
;    %2 (m) - optional, defaults to 0xFFFF; by the ".balign n,,m" analogy
;             above this is the largest amount of padding to emit
;
;  Helper expressions:
;    FILLB(b,n)  - number of bytes from position b up to the next multiple
;                  of n, measured from the start of the section ($$)
;    MSKLE(x,y)  - NOTE(review): appears to evaluate to a non-zero bit mask
;                  when x <= y (as 16-bit values) and to 0 otherwise, so it
;                  can gate a `times` count -- confirm against NASM's
;                  shift/complement semantics
;
;  The padding is built from the byte sequences listed below, each repeated
;  by a `times` count computed from the remaining gap; the per-line comments
;  give the instruction each sequence decodes to.
;
%define MSKLE(x,y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
%define FILLB(b,n)  (($$-(b)) & ((n)-1))

%imacro alignx 1-2.nolist 0xFFFF
%%bs:   times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
               db 0x90                                      ; nop
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
               db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00        ; lea ebx,[ebx+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
               db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00        ; lea ebp,[ebp+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
               db 0x8D,0xAD,0x00,0x00,0x00,0x00             ; lea ebp,[ebp+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
               db 0x8D,0x6C,0x25,0x00                       ; lea ebp,[ebp+0x00]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
               db 0x8D,0x6D,0x00                            ; lea ebp,[ebp+0x00]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
               db 0x8B,0xED                                 ; mov ebp,ebp
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
               db 0x90                                      ; nop
%endmacro

; Align the next data on {2,4,8,16,..}-byte boundary.
;
;  alignz n
;    %1 (n) - alignment boundary; the gap is filled with zero bytes
;
%imacro alignz 1.nolist
        align   %1, db 0                ; filling zeros
%endmacro

%ifdef __x86_64__

; collect_args
;   Saves r10-r15 on the stack (6 pushes = 48 bytes) and then copies the
;   six registers rdi, rsi, rdx, rcx, r8, r9 -- the System V AMD64 integer
;   argument registers, in argument order -- into r10..r15:
;     r10 = rdi, r11 = rsi, r12 = rdx, r13 = rcx, r14 = r8, r15 = r9
;   Must be paired with uncollect_args, with rsp back at the same value.
%imacro collect_args 0
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        mov     r10, rdi
        mov     r11, rsi
        mov     r12, rdx
        mov     r13, rcx
        mov     r14, r8
        mov     r15, r9
%endmacro

; uncollect_args
;   Restores r15..r10 in the reverse order of the pushes in collect_args.
%imacro uncollect_args 0
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
%endmacro

%endif

; --------------------------------------------------------------------------
;  Defines picked up from the C headers
;
%include "jsimdcfg.inc"

; --------------------------------------------------------------------------