;
; jsimdext.inc - common declarations
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library - version 1.02
;
; Copyright (C) 1999-2006, MIYASAKA Masaru.
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
;
; [TAB8]

; ==========================================================================
;  System-dependent configurations
;
;  One object-format symbol (WIN32, WIN64, OBJ32, ELF, AOUT or MACHO) is
;  expected on the assembler command line.  The matching branch defines
;  SEG_TEXT and SEG_CONST (section name plus attributes for code and for
;  constants) and, depending on the format, GOT_SYMBOL, EXTN() and PIC.
;  With none of these symbols defined, plain .text/.data sections are used.

%ifdef WIN32    ; ----(nasm -fwin32 -DWIN32 ...)--------
; * Microsoft Visual C++
; * MinGW (Minimalist GNU for Windows)
; * CygWin
; * LCC-Win32

; -- segment definition --
;
%define SEG_TEXT    .text  align=16 public use32 class=CODE
%define SEG_CONST   .rdata align=16 public use32 class=CONST

%elifdef WIN64  ; ----(nasm -fwin64 -DWIN64 ...)--------
; * Microsoft Visual C++

; -- segment definition --
;
%define SEG_TEXT    .text  align=16 public use64 class=CODE
%define SEG_CONST   .rdata align=16 public use64 class=CONST
; Only MSVC builds get undecorated names here; without -DMSVC, EXTN is left
; undefined by this branch.
%ifdef MSVC
%define EXTN(name)  name                ; foo() -> foo
%endif

%elifdef OBJ32  ; ----(nasm -fobj -DOBJ32 ...)----------
; * Borland C++ (Win32)

; -- segment definition --
;
%define SEG_TEXT    .text  align=16 public use32 class=CODE
%define SEG_CONST   .data  align=16 public use32 class=DATA

%elifdef ELF    ; ----(nasm -felf[64] -DELF ...)------------
; * Linux
; * *BSD family Unix using elf format
; * Unix System V, including Solaris x86, UnixWare and SCO Unix

; mark stack as non-executable
section .note.GNU-stack noalloc noexec nowrite progbits

; -- segment definition --
;
%ifdef __x86_64__
%define SEG_TEXT    .text   progbits align=16
%define SEG_CONST   .rodata progbits align=16
%else
%define SEG_TEXT    .text   progbits alloc exec   nowrite align=16
%define SEG_CONST   .rodata progbits alloc noexec nowrite align=16
%endif

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_       ; ELF supports PIC
%define EXTN(name)  name                        ; foo() -> foo

%elifdef AOUT   ; ----(nasm -faoutb/aout -DAOUT ...)----
; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)

; -- segment definition --
;
%define SEG_TEXT    .text
%define SEG_CONST   .data

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_      ; BSD-style a.out supports PIC

%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)

; -- segment definition --
;
%define SEG_TEXT    .text  ;align=16    ; nasm doesn't accept align=16. why?
%define SEG_CONST   .rodata align=16

; The generation of position-independent code (PIC) is the default on Darwin.
;
%define PIC
%define GOT_SYMBOL  _MACHO_PIC_         ; Mach-O style code-relative addressing

%else           ; ----(Other case)----------------------

; -- segment definition --
;
%define SEG_TEXT    .text
%define SEG_CONST   .data

%endif          ; ----------------------------------------------

; ==========================================================================

; --------------------------------------------------------------------------
;  Common types
;
;  For each type T three names are provided:
;    T          - NASM operand-size keyword (or blank, see XMMWORD)
;    SIZEOF_T   - size in bytes
;    T_BIT      - size in bits
;
;  All of these are plain %define macros, which NASM expands at the point of
;  use, so the order of the definitions below does not matter.

; -- primitive sizes --
;
%define SIZEOF_BYTE     1               ; sizeof(BYTE)
%define SIZEOF_WORD     2               ; sizeof(WORD)
%define SIZEOF_DWORD    4               ; sizeof(DWORD)
%define SIZEOF_QWORD    8               ; sizeof(QWORD)
%define SIZEOF_OWORD    16              ; sizeof(OWORD)

%define BYTE_BIT        8               ; CHAR_BIT in C
%define WORD_BIT        16              ; sizeof(WORD)*BYTE_BIT
%define DWORD_BIT       32              ; sizeof(DWORD)*BYTE_BIT
%define QWORD_BIT       64              ; sizeof(QWORD)*BYTE_BIT
%define OWORD_BIT       128             ; sizeof(OWORD)*BYTE_BIT

; -- pointer: 64-bit when __x86_64__ is defined, 32-bit otherwise --
;
%ifdef __x86_64__
%define POINTER         qword           ; general pointer type
%define SIZEOF_POINTER  SIZEOF_QWORD    ; sizeof(POINTER)
%define POINTER_BIT     QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%else
%define POINTER         dword           ; general pointer type
%define SIZEOF_POINTER  SIZEOF_DWORD    ; sizeof(POINTER)
%define POINTER_BIT     DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%endif

%define INT             dword           ; signed integer type
%define SIZEOF_INT      SIZEOF_DWORD    ; sizeof(INT)
%define INT_BIT         DWORD_BIT       ; sizeof(INT)*BYTE_BIT

%define FP32            dword           ; IEEE754 single
%define SIZEOF_FP32     SIZEOF_DWORD    ; sizeof(FP32)
%define FP32_BIT        DWORD_BIT       ; sizeof(FP32)*BYTE_BIT

%define MMWORD          qword           ; int64  (MMX register)
%define SIZEOF_MMWORD   SIZEOF_QWORD    ; sizeof(MMWORD)
%define MMWORD_BIT      QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT

; NASM is buggy and doesn't properly handle operand sizes for SSE
; instructions, so for now we have to define XMMWORD as blank.
%define XMMWORD                         ; int128  (SSE register)
%define SIZEOF_XMMWORD  SIZEOF_OWORD    ; sizeof(XMMWORD)
%define XMMWORD_BIT     OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT

; Similar hacks for when we load a dword or MMWORD into an xmm# register
; (both expand to nothing, so no operand-size keyword is emitted).
%define XMM_DWORD
%define XMM_MMWORD

; --------------------------------------------------------------------------
;  External Symbol Name
;
;  EXTN(name) produces the object-file spelling of the C symbol `name`.
;  This is the fallback used when the system-dependent configuration did not
;  already define EXTN: it pastes a leading underscore onto the name.
;
%ifndef EXTN
%define EXTN(name)   _ %+ name          ; foo() -> _foo
%endif

; --------------------------------------------------------------------------
;  Macros for position-independent code (PIC) support
;
;  GOTOFF(got,sym)   address expression for `sym` relative to the base
;                    register `got`
;  get_GOT reg       load the PIC base into `reg`
;  pushpic reg       push `reg`        \
;  poppic  reg       pop  `reg`         > emitted only when PIC is active
;  movpic  dst,src   mov  `dst`,`src`  /
;
;  When PIC is not active, GOTOFF(got,sym) is just (sym) and the four
;  macros expand to nothing.
;
; PIC needs a GOT_SYMBOL from the system-dependent configuration; without
; one the PIC request is dropped.
%ifndef GOT_SYMBOL
%undef PIC
%endif

%ifdef PIC ; -------------------------------------------

%ifidn GOT_SYMBOL,_MACHO_PIC_ ; --------------------

; At present, nasm doesn't seem to support PIC generation for Mach-O.
; The PIC support code below is a little tricky.

; const_base marks the position in SEG_CONST at which this file is included;
; GOTOFF() addresses are expressed relative to it.
        SECTION SEG_CONST
const_base:

%define GOTOFF(got,sym) (got) + (sym) - const_base

; get_GOT reg - load the run-time address of const_base into `reg`.
;
%imacro get_GOT 1
        ; NOTE: this macro destroys ecx register.
        call    %%geteip                ; ecx = address of the next instruction
        add     ecx, byte (%%ref - $)   ; ecx = run-time address of %%ref
        jmp     short %%adjust
%%geteip:
        mov     ecx, POINTER [esp]      ; fetch the return address
        ret
%%adjust:
        push    ebp
        xor     ebp,ebp                 ; ebp = 0
%ifidni %1,ebx  ; (%1 == ebx)
        ; db 0x8D,0x9C + jmp near const_base =
        ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
        ; The "jmp" is never executed as a jump: its E9 opcode byte and its
        ; 32-bit relative displacement become the SIB byte and disp32 of
        ; the hand-encoded lea.
        db      0x8D,0x9C               ; 8D,9C
        jmp     near const_base         ; E9,(const_base-%%ref)
%%ref:
%else  ; (%1 != ebx)
        ; db 0x8D,0x8C + jmp near const_base =
        ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
        db      0x8D,0x8C               ; 8D,8C
        jmp     near const_base         ; E9,(const_base-%%ref)
%%ref:  mov     %1, ecx
%endif ; (%1 == ebx)
        pop     ebp
%endmacro

%else   ; GOT_SYMBOL != _MACHO_PIC_ ----------------

%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff

; get_GOT reg - load the address of the global offset table into `reg`.
; The call/ret pair leaves the address of the `add` instruction in `reg`;
; the `add` then applies the ..gotpc-relative offset of GOT_SYMBOL.
; NOTE(review): the return address is read through [esp], so this sequence
; looks intended for 32-bit code only - confirm that 64-bit sources never
; expand get_GOT with PIC defined.
;
%imacro get_GOT 1
        extern  GOT_SYMBOL
        call    %%geteip
        add     %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
        jmp     short %%done
%%geteip:
        mov     %1, POINTER [esp]       ; fetch the return address
        ret
%%done:
%endmacro

%endif  ; GOT_SYMBOL == _MACHO_PIC_ ----------------

%imacro pushpic 1.nolist
        push    %1
%endmacro
%imacro poppic  1.nolist
        pop     %1
%endmacro
%imacro movpic  2.nolist
        mov     %1,%2
%endmacro

%else   ; !PIC -----------------------------------------

%define GOTOFF(got,sym) (sym)

; Without PIC the helper macros deliberately expand to nothing.
%imacro get_GOT 1.nolist
%endmacro
%imacro pushpic 1.nolist
%endmacro
%imacro poppic  1.nolist
%endmacro
%imacro movpic  2.nolist
%endmacro

%endif  ; PIC -----------------------------------------

; --------------------------------------------------------------------------
;  Align the next instruction on {2,4,8,16,..}-byte boundary.
;  ".balign n,,m" in GNU as
;
;  alignx n [,m]
;    n  alignment in bytes (a power of two)
;    m  maximum number of fill bytes; if more than m bytes would be needed,
;       nothing is emitted.  Defaults to 0xFFFF.
;
;  Helper expressions:
;    MSKLE(x,y)  a mask with its low-order bits all set when x <= y, and 0
;                when x > y (x and y are taken modulo 0x10000).  ANDing a
;                repeat count with it enables or disables a "times" line.
;    FILLB(b,n)  number of bytes from position b up to the next n-byte
;                boundary, measured from the start of the section ($$).
;
;  A fill of 16 bytes or more is made of single-byte NOPs.  A shorter fill
;  is built from at most one each of the 7-, 7-, 6-, 4-, 3-, 2- and 1-byte
;  fillers below, tried in that order; FILLB($,...) is re-evaluated on every
;  line, so each line sees only what is still missing.
;
;  NOTE(review): the multi-byte fillers are 32-bit encodings that write ebx
;  or ebp.  In 64-bit mode a write to a 32-bit register clears the upper
;  half of the 64-bit register, so they may not be true no-ops there -
;  confirm before executing through alignx padding in 64-bit code.
;
%define MSKLE(x,y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
%define FILLB(b,n)  (($$-(b)) & ((n)-1))

%imacro alignx 1-2.nolist 0xFFFF
%%bs:   times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
               db 0x90                                  ; nop
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
               db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00    ; lea ebx,[ebx+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
               db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00    ; lea ebp,[ebp+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
               db 0x8D,0xAD,0x00,0x00,0x00,0x00         ; lea ebp,[ebp+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
               db 0x8D,0x6C,0x25,0x00                   ; lea ebp,[ebp+0x00]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
               db 0x8D,0x6D,0x00                        ; lea ebp,[ebp+0x00]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
               db 0x8B,0xED                             ; mov ebp,ebp
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
               db 0x90                                  ; nop
%endmacro

; Align the next data on {2,4,8,16,..}-byte boundary.
;
;  alignz n
;    n  alignment in bytes; the gap is padded with zero bytes.
;
%imacro alignz 1.nolist
        align %1, db 0          ; filling zeros
%endmacro

%ifdef __x86_64__

; --------------------------------------------------------------------------
;  64-bit argument marshalling
;
;  collect_args    push r10-r15, then copy the first six integer/pointer
;                  arguments into r10 (arg 1) .. r15 (arg 6), so that code
;                  following it can use the same registers under either
;                  calling convention.
;  uncollect_args  pop r15-r10 again.  rsp must have the value it had
;                  immediately after collect_args.
;
;  r10 and r11 are volatile in both calling conventions; they are still
;  pushed so that the six pushes here pair with the six pops in
;  uncollect_args.

%ifdef WIN64

; Microsoft x64 convention: arguments 1-4 arrive in rcx, rdx, r8, r9 and
; the rest on the stack.
; NOTE(review): arguments 5 and 6 are read relative to rax, not rsp.  The
; offsets 48/56 equal 8 (one saved register) + 8 (return address) + 32
; (shadow space), so presumably the caller sets rax = rsp right after a
; single push on entry - confirm at the call sites.
; NOTE(review): rsi, rdi and xmm6-xmm15 are callee-saved under this
; convention and are not saved here; routines that modify them must
; preserve them by other means.
%imacro collect_args 0
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        mov     r10, rcx                ; arg 1
        mov     r11, rdx                ; arg 2
        mov     r12, r8                 ; arg 3
        mov     r13, r9                 ; arg 4
        mov     r14, [rax+48]           ; arg 5 (stack)
        mov     r15, [rax+56]           ; arg 6 (stack)
%endmacro

%else

; System V AMD64 convention: arguments 1-6 arrive in rdi, rsi, rdx, rcx,
; r8, r9.
%imacro collect_args 0
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        mov     r10, rdi                ; arg 1
        mov     r11, rsi                ; arg 2
        mov     r12, rdx                ; arg 3
        mov     r13, rcx                ; arg 4
        mov     r14, r8                 ; arg 5
        mov     r15, r9                 ; arg 6
%endmacro

%endif

; Restore the registers saved by collect_args, in reverse push order.
%imacro uncollect_args 0
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
%endmacro

%endif

; --------------------------------------------------------------------------
;  Defines picked up from the C headers
;
%include "jsimdcfg.inc"

; --------------------------------------------------------------------------