/*
Copyright (c) 2010, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
30
/* NOTE(review): cache.h is presumably where SHARED_CACHE_SIZE and
   DATA_CACHE_SIZE (tested with #ifdef further down) come from -- confirm.  */
#include "cache.h"

/* L(name) expands to .Lname, GAS's prefix for non-exported local labels.
   Every internal branch target in this file is written through it.  */
#ifndef L
# define L(label)	.L##label
#endif

/* ALIGN(n) pads to a 2^n-byte boundary.  */
#ifndef ALIGN
# define ALIGN(n)	.p2align n
#endif

/* Thin wrappers over the assembler's CFI (unwind info) directives.  Each is
   guarded so an includer may supply its own definition first.  */
#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)		.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

/* ENTRY(name): declare NAME as a global function symbol, align it to
   16 bytes, emit its label and open its CFI region.  */
#ifndef ENTRY
# define ENTRY(name)			\
	.type name,  @function;		\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

/* END(name): close the CFI region and record the symbol size.  */
#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif

/* Unwind bookkeeping for a 4-byte push of REG: the CFA moves by 4 and REG
   is recorded as saved at offset 0 from the new stack pointer.  */
#define CFI_PUSH(REG)			\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

/* Unwind bookkeeping for the matching 4-byte pop of REG.  */
#define CFI_POP(REG)			\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

/* Push/pop a register and keep the unwind info in step with the stack.  */
#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)
86
/* Offsets of the three stack arguments from %esp once ENTRANCE has run.
   PARMS (defined below, per build flavour) is the offset of the first
   argument; each following argument sits 4 bytes higher.  */
#define DEST		PARMS
#define CHR		DEST+4
#define LEN		CHR+4
/* Reload the destination argument into EAX; every return path runs this
   immediately before RETURN / RETURN_END.  */
#define SETRTNVAL	movl DEST(%esp), %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -080091
#if (defined SHARED || defined __PIC__)
/* Position-independent build: EBX is used to compute addresses, so it is
   saved on entry and restored on every return.  The extra push is why
   PARMS is 8 rather than 4.  */
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
/* RETURN is for returns that have more code after them: once the ret has
   been emitted, CFI_PUSH re-establishes the "EBX is pushed" unwind state
   for the instructions that textually follow.  */
# define RETURN		RETURN_END; CFI_PUSH (%ebx)
# define PARMS		8		/* Preserve EBX.  */
/* Jump-table entries are stored relative to the table base B.  */
# define JMPTBL(I, B)	I - B

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  ECX is the table index (a byte
   count); it is also added to EDX, so the target sees EDX advanced by
   ECX.  Clobbers EBX and the flags.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    /* We first load PC into EBX.  */				\
    call	__x86.get_pc_thunk.bx;				\
    /* Get the address of the jump table.  */			\
    add		$(TABLE - .), %ebx;				\
    /* Get the entry and convert the relative offset to the	\
       absolute address.  */					\
    add		(%ebx,%ecx,4), %ebx;				\
    add		%ecx, %edx;					\
    /* We loaded the jump table and adjusted EDX.  Go.  */	\
    jmp		*%ebx

/* PC thunk: returns the caller's return address (the address of the
   instruction after the call) in EBX.  */
	.section	.gnu.linkonce.t.__x86.get_pc_thunk.bx,"ax",@progbits
	.globl	__x86.get_pc_thunk.bx
	.hidden	__x86.get_pc_thunk.bx
	ALIGN (4)
	.type	__x86.get_pc_thunk.bx,@function
__x86.get_pc_thunk.bx:
	movl	(%esp), %ebx
	ret
#else
/* Non-PIC build: nothing is saved on entry, so the first argument sits
   just above the return address, and jump tables hold absolute
   addresses.  */
# define ENTRANCE
# define RETURN_END	ret
# define RETURN		RETURN_END
# define PARMS		4
# define JMPTBL(I, B)	I

/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  ECX is the table index (a byte count); it is also
   added to EDX, so the target sees EDX advanced by ECX.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    add		%ecx, %edx;					\
    jmp		*TABLE(,%ecx,4)
#endif
134
/* Symbol name of the routine; an includer may pre-define MEMSET to build
   the same code under a different name.  */
#ifndef MEMSET
# define MEMSET		memset
#endif
138
/* MEMSET: fill LEN bytes starting at DEST with the low byte of CHR and
   return DEST in EAX.  Arguments are read from the stack at
   DEST/CHR/LEN(%esp).

   Register roles after this prologue:
     EAX = fill byte replicated into all four bytes
     ECX = byte count
     EDX = destination pointer
   Counts below 32 are dispatched straight through table_less_32bytes;
   larger counts take the SSE2 path.  */
	.section .text.sse2,"ax",@progbits
	ALIGN (4)
ENTRY (MEMSET)
	ENTRANCE

	movl	LEN(%esp), %ecx
	movzbl	CHR(%esp), %eax
	movb	%al, %ah
	/* Fill the whole EAX with pattern.  */
	movl	%eax, %edx
	shl	$16, %eax
	or	%edx, %eax
	movl	DEST(%esp), %edx
	cmp	$32, %ecx
	jae	L(32bytesormore)

	/* ECX < 32: jump to the store ladder for exactly ECX bytes.  */
L(write_less32bytes):
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
157
158
/* Jump table for counts 0..31: entry N is the address (or, in PIC builds,
   the table-relative offset) of L(write_Nbytes).  Indexed by ECX in
   BRANCH_TO_JMPTBL_ENTRY.  */
	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_less_32bytes):
	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
	.popsection
195
/* Store ladders for counts 0..31, one ladder per value of N mod 4.
   On entry EDX points one past the last byte to write, because
   BRANCH_TO_JMPTBL_ENTRY added the count to the start pointer, so every
   store uses a negative offset.  Entering at L(write_Nbytes) writes
   N bytes: 4 at a time while falling through the labels, then the
   final N mod 4 bytes.  EAX holds the 4-byte fill pattern.  */

	/* N mod 4 == 0.  */
	ALIGN (4)
L(write_28bytes):
	movl	%eax, -28(%edx)
L(write_24bytes):
	movl	%eax, -24(%edx)
L(write_20bytes):
	movl	%eax, -20(%edx)
L(write_16bytes):
	movl	%eax, -16(%edx)
L(write_12bytes):
	movl	%eax, -12(%edx)
L(write_8bytes):
	movl	%eax, -8(%edx)
L(write_4bytes):
	movl	%eax, -4(%edx)
L(write_0bytes):
	SETRTNVAL
	RETURN

	/* N mod 4 == 1: trailing byte store.  */
	ALIGN (4)
L(write_29bytes):
	movl	%eax, -29(%edx)
L(write_25bytes):
	movl	%eax, -25(%edx)
L(write_21bytes):
	movl	%eax, -21(%edx)
L(write_17bytes):
	movl	%eax, -17(%edx)
L(write_13bytes):
	movl	%eax, -13(%edx)
L(write_9bytes):
	movl	%eax, -9(%edx)
L(write_5bytes):
	movl	%eax, -5(%edx)
L(write_1bytes):
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	/* N mod 4 == 2: trailing word store.  */
	ALIGN (4)
L(write_30bytes):
	movl	%eax, -30(%edx)
L(write_26bytes):
	movl	%eax, -26(%edx)
L(write_22bytes):
	movl	%eax, -22(%edx)
L(write_18bytes):
	movl	%eax, -18(%edx)
L(write_14bytes):
	movl	%eax, -14(%edx)
L(write_10bytes):
	movl	%eax, -10(%edx)
L(write_6bytes):
	movl	%eax, -6(%edx)
L(write_2bytes):
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	/* N mod 4 == 3: trailing word + byte stores.  */
	ALIGN (4)
L(write_31bytes):
	movl	%eax, -31(%edx)
L(write_27bytes):
	movl	%eax, -27(%edx)
L(write_23bytes):
	movl	%eax, -23(%edx)
L(write_19bytes):
	movl	%eax, -19(%edx)
L(write_15bytes):
	movl	%eax, -15(%edx)
L(write_11bytes):
	movl	%eax, -11(%edx)
L(write_7bytes):
	movl	%eax, -7(%edx)
L(write_3bytes):
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN
275
276 ALIGN (4)
277/* ECX > 32 and EDX is 4 byte aligned. */
278L(32bytesormore):
279 /* Fill xmm0 with the pattern. */
Bruce Beare8ff1a272010-03-04 11:03:37 -0800280 movd %eax, %xmm0
Bruce Beare8ff1a272010-03-04 11:03:37 -0800281 pshufd $0, %xmm0, %xmm0
Bruce Beare8ff1a272010-03-04 11:03:37 -0800282 testl $0xf, %edx
283 jz L(aligned_16)
284/* ECX > 32 and EDX is not 16 byte aligned. */
285L(not_aligned_16):
286 movdqu %xmm0, (%edx)
287 movl %edx, %eax
288 and $-16, %edx
289 add $16, %edx
290 sub %edx, %eax
291 add %eax, %ecx
292 movd %xmm0, %eax
293
294 ALIGN (4)
295L(aligned_16):
296 cmp $128, %ecx
297 jae L(128bytesormore)
298
299L(aligned_16_less128bytes):
300 BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
301
302 ALIGN (4)
303L(128bytesormore):
304#ifdef SHARED_CACHE_SIZE
305 PUSH (%ebx)
306 mov $SHARED_CACHE_SIZE, %ebx
307#else
Nick Kralevich0aa82892011-11-11 15:47:24 -0800308# if (defined SHARED || defined __PIC__)
Varvara Rainchik5a922842014-04-24 15:41:20 +0400309 call __x86.get_pc_thunk.bx
Bruce Beare8ff1a272010-03-04 11:03:37 -0800310 add $_GLOBAL_OFFSET_TABLE_, %ebx
311 mov __x86_shared_cache_size@GOTOFF(%ebx), %ebx
312# else
313 PUSH (%ebx)
314 mov __x86_shared_cache_size, %ebx
315# endif
316#endif
317 cmp %ebx, %ecx
318 jae L(128bytesormore_nt_start)
319
320
321#ifdef DATA_CACHE_SIZE
322 POP (%ebx)
Bruce Beare124a5422010-10-11 12:24:41 -0700323# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800324 cmp $DATA_CACHE_SIZE, %ecx
325#else
Nick Kralevich0aa82892011-11-11 15:47:24 -0800326# if (defined SHARED || defined __PIC__)
Bruce Beare124a5422010-10-11 12:24:41 -0700327# define RESTORE_EBX_STATE
Varvara Rainchik5a922842014-04-24 15:41:20 +0400328 call __x86.get_pc_thunk.bx
Bruce Beare8ff1a272010-03-04 11:03:37 -0800329 add $_GLOBAL_OFFSET_TABLE_, %ebx
330 cmp __x86_data_cache_size@GOTOFF(%ebx), %ecx
331# else
332 POP (%ebx)
Bruce Beare124a5422010-10-11 12:24:41 -0700333# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800334 cmp __x86_data_cache_size, %ecx
335# endif
336#endif
337
338 jae L(128bytes_L2_normal)
339 subl $128, %ecx
340L(128bytesormore_normal):
341 sub $128, %ecx
342 movdqa %xmm0, (%edx)
343 movdqa %xmm0, 0x10(%edx)
344 movdqa %xmm0, 0x20(%edx)
345 movdqa %xmm0, 0x30(%edx)
346 movdqa %xmm0, 0x40(%edx)
347 movdqa %xmm0, 0x50(%edx)
348 movdqa %xmm0, 0x60(%edx)
349 movdqa %xmm0, 0x70(%edx)
350 lea 128(%edx), %edx
351 jb L(128bytesless_normal)
352
353
354 sub $128, %ecx
355 movdqa %xmm0, (%edx)
356 movdqa %xmm0, 0x10(%edx)
357 movdqa %xmm0, 0x20(%edx)
358 movdqa %xmm0, 0x30(%edx)
359 movdqa %xmm0, 0x40(%edx)
360 movdqa %xmm0, 0x50(%edx)
361 movdqa %xmm0, 0x60(%edx)
362 movdqa %xmm0, 0x70(%edx)
363 lea 128(%edx), %edx
364 jae L(128bytesormore_normal)
365
366L(128bytesless_normal):
Bruce Beare124a5422010-10-11 12:24:41 -0700367 add $128, %ecx
Bruce Beare8ff1a272010-03-04 11:03:37 -0800368 BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
369
370 ALIGN (4)
371L(128bytes_L2_normal):
372 prefetcht0 0x380(%edx)
373 prefetcht0 0x3c0(%edx)
374 sub $128, %ecx
375 movdqa %xmm0, (%edx)
376 movaps %xmm0, 0x10(%edx)
377 movaps %xmm0, 0x20(%edx)
378 movaps %xmm0, 0x30(%edx)
379 movaps %xmm0, 0x40(%edx)
380 movaps %xmm0, 0x50(%edx)
381 movaps %xmm0, 0x60(%edx)
382 movaps %xmm0, 0x70(%edx)
383 add $128, %edx
384 cmp $128, %ecx
385 jae L(128bytes_L2_normal)
386
387L(128bytesless_L2_normal):
388 BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
389
Bruce Beare124a5422010-10-11 12:24:41 -0700390 RESTORE_EBX_STATE
Bruce Beare8ff1a272010-03-04 11:03:37 -0800391L(128bytesormore_nt_start):
392 sub %ebx, %ecx
Bruce Beare124a5422010-10-11 12:24:41 -0700393 mov %ebx, %eax
394 and $0x7f, %eax
395 add %eax, %ecx
396 movd %xmm0, %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -0800397 ALIGN (4)
398L(128bytesormore_shared_cache_loop):
399 prefetcht0 0x3c0(%edx)
400 prefetcht0 0x380(%edx)
401 sub $0x80, %ebx
402 movdqa %xmm0, (%edx)
403 movdqa %xmm0, 0x10(%edx)
404 movdqa %xmm0, 0x20(%edx)
405 movdqa %xmm0, 0x30(%edx)
406 movdqa %xmm0, 0x40(%edx)
407 movdqa %xmm0, 0x50(%edx)
408 movdqa %xmm0, 0x60(%edx)
409 movdqa %xmm0, 0x70(%edx)
410 add $0x80, %edx
411 cmp $0x80, %ebx
412 jae L(128bytesormore_shared_cache_loop)
413 cmp $0x80, %ecx
414 jb L(shared_cache_loop_end)
415 ALIGN (4)
416L(128bytesormore_nt):
417 sub $0x80, %ecx
418 movntdq %xmm0, (%edx)
419 movntdq %xmm0, 0x10(%edx)
420 movntdq %xmm0, 0x20(%edx)
421 movntdq %xmm0, 0x30(%edx)
422 movntdq %xmm0, 0x40(%edx)
423 movntdq %xmm0, 0x50(%edx)
424 movntdq %xmm0, 0x60(%edx)
425 movntdq %xmm0, 0x70(%edx)
426 add $0x80, %edx
427 cmp $0x80, %ecx
428 jae L(128bytesormore_nt)
429 sfence
430L(shared_cache_loop_end):
Nick Kralevich0aa82892011-11-11 15:47:24 -0800431#if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800432 POP (%ebx)
433#endif
434 BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
435
436
/* Jump table for tail counts 0..127 when the start of the tail is 16-byte
   aligned: entry N is the address (or, in PIC builds, the table-relative
   offset) of L(aligned_16_Nbytes).  Indexed by ECX in
   BRANCH_TO_JMPTBL_ENTRY.  */
	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_16_128bytes):
	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
	.popsection
569
/* Aligned store ladders for tail counts N with N mod 16 in 0..3.
   On entry EDX = (16-byte aligned tail start) + N, because
   BRANCH_TO_JMPTBL_ENTRY added the count to the pointer.  EDX - N is
   therefore aligned, and so is every movdqa target below.  Entering at
   L(aligned_16_Nbytes) writes N bytes: 16 at a time while falling through
   the labels, then the last N mod 16 bytes with narrower stores from
   EAX, which holds the 4-byte fill pattern.  */

	/* N mod 16 == 0.  */
	ALIGN (4)
L(aligned_16_112bytes):
	movdqa	%xmm0, -112(%edx)
L(aligned_16_96bytes):
	movdqa	%xmm0, -96(%edx)
L(aligned_16_80bytes):
	movdqa	%xmm0, -80(%edx)
L(aligned_16_64bytes):
	movdqa	%xmm0, -64(%edx)
L(aligned_16_48bytes):
	movdqa	%xmm0, -48(%edx)
L(aligned_16_32bytes):
	movdqa	%xmm0, -32(%edx)
L(aligned_16_16bytes):
	movdqa	%xmm0, -16(%edx)
L(aligned_16_0bytes):
	SETRTNVAL
	RETURN

	/* N mod 16 == 1.  */
	ALIGN (4)
L(aligned_16_113bytes):
	movdqa	%xmm0, -113(%edx)
L(aligned_16_97bytes):
	movdqa	%xmm0, -97(%edx)
L(aligned_16_81bytes):
	movdqa	%xmm0, -81(%edx)
L(aligned_16_65bytes):
	movdqa	%xmm0, -65(%edx)
L(aligned_16_49bytes):
	movdqa	%xmm0, -49(%edx)
L(aligned_16_33bytes):
	movdqa	%xmm0, -33(%edx)
L(aligned_16_17bytes):
	movdqa	%xmm0, -17(%edx)
L(aligned_16_1bytes):
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 2.  */
	ALIGN (4)
L(aligned_16_114bytes):
	movdqa	%xmm0, -114(%edx)
L(aligned_16_98bytes):
	movdqa	%xmm0, -98(%edx)
L(aligned_16_82bytes):
	movdqa	%xmm0, -82(%edx)
L(aligned_16_66bytes):
	movdqa	%xmm0, -66(%edx)
L(aligned_16_50bytes):
	movdqa	%xmm0, -50(%edx)
L(aligned_16_34bytes):
	movdqa	%xmm0, -34(%edx)
L(aligned_16_18bytes):
	movdqa	%xmm0, -18(%edx)
L(aligned_16_2bytes):
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 3.  */
	ALIGN (4)
L(aligned_16_115bytes):
	movdqa	%xmm0, -115(%edx)
L(aligned_16_99bytes):
	movdqa	%xmm0, -99(%edx)
L(aligned_16_83bytes):
	movdqa	%xmm0, -83(%edx)
L(aligned_16_67bytes):
	movdqa	%xmm0, -67(%edx)
L(aligned_16_51bytes):
	movdqa	%xmm0, -51(%edx)
L(aligned_16_35bytes):
	movdqa	%xmm0, -35(%edx)
L(aligned_16_19bytes):
	movdqa	%xmm0, -19(%edx)
L(aligned_16_3bytes):
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN
649
/* Aligned store ladders for tail counts N with N mod 16 in 4..7.
   On entry EDX = (16-byte aligned tail start) + N, so each movdqa target
   below is 16-byte aligned.  After the 16-byte stores, the last
   N mod 16 bytes are written from EAX, the 4-byte fill pattern.  */

	/* N mod 16 == 4.  */
	ALIGN (4)
L(aligned_16_116bytes):
	movdqa	%xmm0, -116(%edx)
L(aligned_16_100bytes):
	movdqa	%xmm0, -100(%edx)
L(aligned_16_84bytes):
	movdqa	%xmm0, -84(%edx)
L(aligned_16_68bytes):
	movdqa	%xmm0, -68(%edx)
L(aligned_16_52bytes):
	movdqa	%xmm0, -52(%edx)
L(aligned_16_36bytes):
	movdqa	%xmm0, -36(%edx)
L(aligned_16_20bytes):
	movdqa	%xmm0, -20(%edx)
L(aligned_16_4bytes):
	movl	%eax, -4(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 5.  */
	ALIGN (4)
L(aligned_16_117bytes):
	movdqa	%xmm0, -117(%edx)
L(aligned_16_101bytes):
	movdqa	%xmm0, -101(%edx)
L(aligned_16_85bytes):
	movdqa	%xmm0, -85(%edx)
L(aligned_16_69bytes):
	movdqa	%xmm0, -69(%edx)
L(aligned_16_53bytes):
	movdqa	%xmm0, -53(%edx)
L(aligned_16_37bytes):
	movdqa	%xmm0, -37(%edx)
L(aligned_16_21bytes):
	movdqa	%xmm0, -21(%edx)
L(aligned_16_5bytes):
	movl	%eax, -5(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 6.  */
	ALIGN (4)
L(aligned_16_118bytes):
	movdqa	%xmm0, -118(%edx)
L(aligned_16_102bytes):
	movdqa	%xmm0, -102(%edx)
L(aligned_16_86bytes):
	movdqa	%xmm0, -86(%edx)
L(aligned_16_70bytes):
	movdqa	%xmm0, -70(%edx)
L(aligned_16_54bytes):
	movdqa	%xmm0, -54(%edx)
L(aligned_16_38bytes):
	movdqa	%xmm0, -38(%edx)
L(aligned_16_22bytes):
	movdqa	%xmm0, -22(%edx)
L(aligned_16_6bytes):
	movl	%eax, -6(%edx)
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 7.  */
	ALIGN (4)
L(aligned_16_119bytes):
	movdqa	%xmm0, -119(%edx)
L(aligned_16_103bytes):
	movdqa	%xmm0, -103(%edx)
L(aligned_16_87bytes):
	movdqa	%xmm0, -87(%edx)
L(aligned_16_71bytes):
	movdqa	%xmm0, -71(%edx)
L(aligned_16_55bytes):
	movdqa	%xmm0, -55(%edx)
L(aligned_16_39bytes):
	movdqa	%xmm0, -39(%edx)
L(aligned_16_23bytes):
	movdqa	%xmm0, -23(%edx)
L(aligned_16_7bytes):
	movl	%eax, -7(%edx)
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN
733
/* Aligned store ladders for tail counts N with N mod 16 in 8..11.
   On entry EDX = (16-byte aligned tail start) + N, so each movdqa target
   below is 16-byte aligned.  After the 16-byte stores, the last
   N mod 16 bytes are written with an 8-byte movq from XMM0 plus narrower
   stores from EAX, the 4-byte fill pattern.  */

	/* N mod 16 == 8.  */
	ALIGN (4)
L(aligned_16_120bytes):
	movdqa	%xmm0, -120(%edx)
L(aligned_16_104bytes):
	movdqa	%xmm0, -104(%edx)
L(aligned_16_88bytes):
	movdqa	%xmm0, -88(%edx)
L(aligned_16_72bytes):
	movdqa	%xmm0, -72(%edx)
L(aligned_16_56bytes):
	movdqa	%xmm0, -56(%edx)
L(aligned_16_40bytes):
	movdqa	%xmm0, -40(%edx)
L(aligned_16_24bytes):
	movdqa	%xmm0, -24(%edx)
L(aligned_16_8bytes):
	movq	%xmm0, -8(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 9.  */
	ALIGN (4)
L(aligned_16_121bytes):
	movdqa	%xmm0, -121(%edx)
L(aligned_16_105bytes):
	movdqa	%xmm0, -105(%edx)
L(aligned_16_89bytes):
	movdqa	%xmm0, -89(%edx)
L(aligned_16_73bytes):
	movdqa	%xmm0, -73(%edx)
L(aligned_16_57bytes):
	movdqa	%xmm0, -57(%edx)
L(aligned_16_41bytes):
	movdqa	%xmm0, -41(%edx)
L(aligned_16_25bytes):
	movdqa	%xmm0, -25(%edx)
L(aligned_16_9bytes):
	movq	%xmm0, -9(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 10.  */
	ALIGN (4)
L(aligned_16_122bytes):
	movdqa	%xmm0, -122(%edx)
L(aligned_16_106bytes):
	movdqa	%xmm0, -106(%edx)
L(aligned_16_90bytes):
	movdqa	%xmm0, -90(%edx)
L(aligned_16_74bytes):
	movdqa	%xmm0, -74(%edx)
L(aligned_16_58bytes):
	movdqa	%xmm0, -58(%edx)
L(aligned_16_42bytes):
	movdqa	%xmm0, -42(%edx)
L(aligned_16_26bytes):
	movdqa	%xmm0, -26(%edx)
L(aligned_16_10bytes):
	movq	%xmm0, -10(%edx)
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 11.  */
	ALIGN (4)
L(aligned_16_123bytes):
	movdqa	%xmm0, -123(%edx)
L(aligned_16_107bytes):
	movdqa	%xmm0, -107(%edx)
L(aligned_16_91bytes):
	movdqa	%xmm0, -91(%edx)
L(aligned_16_75bytes):
	movdqa	%xmm0, -75(%edx)
L(aligned_16_59bytes):
	movdqa	%xmm0, -59(%edx)
L(aligned_16_43bytes):
	movdqa	%xmm0, -43(%edx)
L(aligned_16_27bytes):
	movdqa	%xmm0, -27(%edx)
L(aligned_16_11bytes):
	movq	%xmm0, -11(%edx)
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN
817
/* Aligned store ladders for tail counts N with N mod 16 in 12..15,
   followed by the end of the function.
   On entry EDX = (16-byte aligned tail start) + N, so each movdqa target
   below is 16-byte aligned.  After the 16-byte stores, the last
   N mod 16 bytes are written with an 8-byte movq from XMM0 plus narrower
   stores from EAX, the 4-byte fill pattern.  */

	/* N mod 16 == 12.  */
	ALIGN (4)
L(aligned_16_124bytes):
	movdqa	%xmm0, -124(%edx)
L(aligned_16_108bytes):
	movdqa	%xmm0, -108(%edx)
L(aligned_16_92bytes):
	movdqa	%xmm0, -92(%edx)
L(aligned_16_76bytes):
	movdqa	%xmm0, -76(%edx)
L(aligned_16_60bytes):
	movdqa	%xmm0, -60(%edx)
L(aligned_16_44bytes):
	movdqa	%xmm0, -44(%edx)
L(aligned_16_28bytes):
	movdqa	%xmm0, -28(%edx)
L(aligned_16_12bytes):
	movq	%xmm0, -12(%edx)
	movl	%eax, -4(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 13.  */
	ALIGN (4)
L(aligned_16_125bytes):
	movdqa	%xmm0, -125(%edx)
L(aligned_16_109bytes):
	movdqa	%xmm0, -109(%edx)
L(aligned_16_93bytes):
	movdqa	%xmm0, -93(%edx)
L(aligned_16_77bytes):
	movdqa	%xmm0, -77(%edx)
L(aligned_16_61bytes):
	movdqa	%xmm0, -61(%edx)
L(aligned_16_45bytes):
	movdqa	%xmm0, -45(%edx)
L(aligned_16_29bytes):
	movdqa	%xmm0, -29(%edx)
L(aligned_16_13bytes):
	movq	%xmm0, -13(%edx)
	movl	%eax, -5(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 14.  */
	ALIGN (4)
L(aligned_16_126bytes):
	movdqa	%xmm0, -126(%edx)
L(aligned_16_110bytes):
	movdqa	%xmm0, -110(%edx)
L(aligned_16_94bytes):
	movdqa	%xmm0, -94(%edx)
L(aligned_16_78bytes):
	movdqa	%xmm0, -78(%edx)
L(aligned_16_62bytes):
	movdqa	%xmm0, -62(%edx)
L(aligned_16_46bytes):
	movdqa	%xmm0, -46(%edx)
L(aligned_16_30bytes):
	movdqa	%xmm0, -30(%edx)
L(aligned_16_14bytes):
	movq	%xmm0, -14(%edx)
	movl	%eax, -6(%edx)
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 15.  */
	ALIGN (4)
L(aligned_16_127bytes):
	movdqa	%xmm0, -127(%edx)
L(aligned_16_111bytes):
	movdqa	%xmm0, -111(%edx)
L(aligned_16_95bytes):
	movdqa	%xmm0, -95(%edx)
L(aligned_16_79bytes):
	movdqa	%xmm0, -79(%edx)
L(aligned_16_63bytes):
	movdqa	%xmm0, -63(%edx)
L(aligned_16_47bytes):
	movdqa	%xmm0, -47(%edx)
L(aligned_16_31bytes):
	movdqa	%xmm0, -31(%edx)
L(aligned_16_15bytes):
	movq	%xmm0, -15(%edx)
	movl	%eax, -7(%edx)
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	/* Last return in the function: no code follows, so the unwind
	   state is not re-established (RETURN_END rather than RETURN).  */
	RETURN_END

END (MEMSET)