blob: 4b2fb8e8f55f4da03dce621509e0cc0db07ebe49 [file] [log] [blame]
/*
Copyright (c) 2010, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
30
Liubov Dmitrieva0a490662012-01-17 12:55:46 +040031#include "cache.h"
Liubov Dmitrieva0a490662012-01-17 12:55:46 +040032
/* Name of the symbol this file defines.  A build may pre-define MEMCPY to
   emit the same code under a different name; otherwise it is `memcpy'.  */
#ifndef MEMCPY
# define MEMCPY		memcpy
#endif
36
/* L(label): spell a label with the `.L' prefix so that it stays local to
   this object file instead of being exported as a symbol.  */
#ifndef L
# define L(label)	.L##label
#endif
40
/* Thin wrappers around the assembler's .cfi_* unwind directives.  Each one
   is defined only when the including environment has not already supplied
   its own version.  */
#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

/* Record that REG is saved at offset OFF.  */
#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

/* Record that REG holds its entry value again.  */
#ifndef cfi_restore
# define cfi_restore(reg)		.cfi_restore reg
#endif

/* Adjust the recorded CFA offset by OFF bytes.  */
#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif
60
/* ENTRY(name): open the definition of the global function NAME.
   Marks NAME as a function symbol, makes it global, aligns the entry point
   with `.p2align 4', places the label and opens a CFI region.  Pair every
   ENTRY with a matching END(name).  */
#ifndef ENTRY
# define ENTRY(name)		\
	.type name, @function;	\
	.globl name;		\
	.p2align 4;		\
name:				\
	cfi_startproc
#endif
69
/* END(name): close the definition opened by ENTRY(name).
   Ends the CFI region and sets the symbol size to the distance from the
   NAME label to the current location.  */
#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif
75
/* Offsets of the three stack arguments, used as DEST(%esp), SRC(%esp) and
   LEN(%esp) right after ENTRANCE.  PARMS is chosen further down, depending
   on whether %ebx had to be saved.

   The expansions are deliberately left unparenthesized: they are pasted
   directly in front of `(%esp)' to form the displacement, and callers
   write e.g. DEST+4(%esp) once one more register has been pushed.  */
#define DEST		PARMS
#define SRC		DEST+4
#define LEN		SRC+4
Bruce Beare8ff1a272010-03-04 11:03:37 -080079
/* CFI bookkeeping for a 4-byte push of REG: the CFA offset grows by 4 and
   REG is recorded as saved at offset 0.  Also used on its own, without a
   real push, to re-establish the unwind state after a mid-function exit.  */
#define CFI_PUSH(REG)			\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

/* CFI bookkeeping for a 4-byte pop of REG: the CFA offset shrinks by 4 and
   REG is marked as restored.  */
#define CFI_POP(REG)			\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

/* Push / pop REG and keep the unwind information in step.  */
#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)
90
/* Build-mode configuration.

   PARMS        offset of the first stack argument from %esp after ENTRANCE.
   ENTRANCE     function prologue.
   RETURN_END   epilogue for the last return in the function.
   RETURN       epilogue for a return that is followed by more code of the
                same function.
   JMPTBL(I, B) one jump-table entry for target I in the table based at B.
   BRANCH_TO_JMPTBL_ENTRY
                indirect jump through a jump table.  */
#if (defined SHARED || defined __PIC__)
/* Position-independent build: %ebx is used as a scratch register for
   PC-relative addressing, so it is saved on entry and restored on exit.  */
# define PARMS		8		/* Preserve EBX.  */
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
/* POP changes the recorded CFI state; CFI_PUSH puts it back for the code
   that follows the `ret'.  */
# define RETURN		RETURN_END; CFI_PUSH (%ebx)
/* Table entries are stored relative to the table base B.  */
# define JMPTBL(I, B)	I - B

/* Call the PC thunk for register X (e.g. SETUP_PIC_REG(bx) calls
   __x86.get_pc_thunk.bx).  */
# define SETUP_PIC_REG(x)	call	__x86.get_pc_thunk.x

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  INDEX is a register that contains
   the index into the jump table.  SCALE is the scale of INDEX.
   Clobbers EBX.  */

# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
    /* We first load PC into EBX.  */				\
    SETUP_PIC_REG(bx);						\
    /* Get the address of the jump table.  */			\
    addl	$(TABLE - .), %ebx;				\
    /* Get the entry and convert the relative offset to the	\
       absolute address.  */					\
    addl	(%ebx, INDEX, SCALE), %ebx;			\
    /* We loaded the jump table.  Go.  */			\
    jmp		*%ebx
#else

/* Non-PIC build: no register has to be preserved and the jump tables hold
   the target labels themselves.  */
# define PARMS		4
# define ENTRANCE
# define RETURN_END	ret
# define RETURN		RETURN_END
# define JMPTBL(I, B)	I

/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  INDEX is a register that contains the index into
   the jump table.  SCALE is the scale of INDEX.  */

# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
    jmp		*TABLE(, INDEX, SCALE)
#endif
129
130 .section .text.ssse3,"ax",@progbits
131ENTRY (MEMCPY)
132 ENTRANCE
133 movl LEN(%esp), %ecx
134 movl SRC(%esp), %eax
135 movl DEST(%esp), %edx
136
137#ifdef USE_AS_MEMMOVE
138 cmp %eax, %edx
139 jb L(copy_forward)
140 je L(fwd_write_0bytes)
141 cmp $32, %ecx
142 jae L(memmove_bwd)
143 jmp L(bk_write_less32bytes_2)
Jack Renc47703a2012-02-14 12:01:52 +0400144
145 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800146L(memmove_bwd):
147 add %ecx, %eax
148 cmp %eax, %edx
149 movl SRC(%esp), %eax
150 jb L(copy_backward)
151
152L(copy_forward):
153#endif
154 cmp $48, %ecx
155 jae L(48bytesormore)
156
157L(fwd_write_less32bytes):
158#ifndef USE_AS_MEMMOVE
159 cmp %dl, %al
160 jb L(bk_write)
161#endif
162 add %ecx, %edx
163 add %ecx, %eax
164 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
165#ifndef USE_AS_MEMMOVE
Jack Renc47703a2012-02-14 12:01:52 +0400166 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800167L(bk_write):
168 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
169#endif
170
Jack Renc47703a2012-02-14 12:01:52 +0400171 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800172L(48bytesormore):
Jack Renc47703a2012-02-14 12:01:52 +0400173#ifndef USE_AS_MEMMOVE
174 movlpd (%eax), %xmm0
175 movlpd 8(%eax), %xmm1
176 movlpd %xmm0, (%edx)
177 movlpd %xmm1, 8(%edx)
178#else
Bruce Beare8ff1a272010-03-04 11:03:37 -0800179 movdqu (%eax), %xmm0
Jack Renc47703a2012-02-14 12:01:52 +0400180#endif
Bruce Beare8ff1a272010-03-04 11:03:37 -0800181 PUSH (%edi)
182 movl %edx, %edi
183 and $-16, %edx
Bruce Beare8ff1a272010-03-04 11:03:37 -0800184 add $16, %edx
Bruce Beare8ff1a272010-03-04 11:03:37 -0800185 sub %edx, %edi
186 add %edi, %ecx
187 sub %edi, %eax
188
189#ifdef SHARED_CACHE_SIZE_HALF
190 cmp $SHARED_CACHE_SIZE_HALF, %ecx
191#else
Nick Kralevich5982e332011-11-11 15:47:24 -0800192# if (defined SHARED || defined __PIC__)
Jack Renc47703a2012-02-14 12:01:52 +0400193 SETUP_PIC_REG(bx)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800194 add $_GLOBAL_OFFSET_TABLE_, %ebx
195 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
196# else
197 cmp __x86_shared_cache_size_half, %ecx
198# endif
199#endif
200
201 mov %eax, %edi
202 jae L(large_page)
203 and $0xf, %edi
204 jz L(shl_0)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800205 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
206
Jack Renc47703a2012-02-14 12:01:52 +0400207 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800208L(shl_0):
Jack Renc47703a2012-02-14 12:01:52 +0400209#ifdef USE_AS_MEMMOVE
210 movl DEST+4(%esp), %edi
211 movdqu %xmm0, (%edi)
212#endif
Bruce Beare8ff1a272010-03-04 11:03:37 -0800213 xor %edi, %edi
Bruce Beare8ff1a272010-03-04 11:03:37 -0800214 cmp $127, %ecx
215 ja L(shl_0_gobble)
216 lea -32(%ecx), %ecx
Jack Renc47703a2012-02-14 12:01:52 +0400217
218 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800219L(shl_0_loop):
220 movdqa (%eax, %edi), %xmm0
221 movdqa 16(%eax, %edi), %xmm1
222 sub $32, %ecx
223 movdqa %xmm0, (%edx, %edi)
224 movdqa %xmm1, 16(%edx, %edi)
225 lea 32(%edi), %edi
226 jb L(shl_0_end)
227
228 movdqa (%eax, %edi), %xmm0
229 movdqa 16(%eax, %edi), %xmm1
230 sub $32, %ecx
231 movdqa %xmm0, (%edx, %edi)
232 movdqa %xmm1, 16(%edx, %edi)
233 lea 32(%edi), %edi
234 jb L(shl_0_end)
235
236 movdqa (%eax, %edi), %xmm0
237 movdqa 16(%eax, %edi), %xmm1
238 sub $32, %ecx
239 movdqa %xmm0, (%edx, %edi)
240 movdqa %xmm1, 16(%edx, %edi)
241 lea 32(%edi), %edi
242 jb L(shl_0_end)
243
244 movdqa (%eax, %edi), %xmm0
245 movdqa 16(%eax, %edi), %xmm1
246 sub $32, %ecx
247 movdqa %xmm0, (%edx, %edi)
248 movdqa %xmm1, 16(%edx, %edi)
249 lea 32(%edi), %edi
Jack Renc47703a2012-02-14 12:01:52 +0400250
Bruce Beare8ff1a272010-03-04 11:03:37 -0800251L(shl_0_end):
252 lea 32(%ecx), %ecx
253 add %ecx, %edi
254 add %edi, %edx
255 add %edi, %eax
256 POP (%edi)
Jack Renc47703a2012-02-14 12:01:52 +0400257 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800258
Bruce Beare124a5422010-10-11 12:24:41 -0700259 CFI_PUSH (%edi)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800260
Jack Renc47703a2012-02-14 12:01:52 +0400261 .p2align 4
262L(shl_0_gobble):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800263#ifdef DATA_CACHE_SIZE_HALF
264 cmp $DATA_CACHE_SIZE_HALF, %ecx
265#else
Nick Kralevich5982e332011-11-11 15:47:24 -0800266# if (defined SHARED || defined __PIC__)
Jack Renc47703a2012-02-14 12:01:52 +0400267 SETUP_PIC_REG(bx)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800268 add $_GLOBAL_OFFSET_TABLE_, %ebx
269 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
270# else
271 cmp __x86_data_cache_size_half, %ecx
272# endif
273#endif
Jack Renc47703a2012-02-14 12:01:52 +0400274 POP (%edi)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800275 lea -128(%ecx), %ecx
276 jae L(shl_0_gobble_mem_loop)
Jack Renc47703a2012-02-14 12:01:52 +0400277
278 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800279L(shl_0_gobble_cache_loop):
280 movdqa (%eax), %xmm0
281 movdqa 0x10(%eax), %xmm1
282 movdqa 0x20(%eax), %xmm2
283 movdqa 0x30(%eax), %xmm3
284 movdqa 0x40(%eax), %xmm4
285 movdqa 0x50(%eax), %xmm5
286 movdqa 0x60(%eax), %xmm6
287 movdqa 0x70(%eax), %xmm7
288 lea 0x80(%eax), %eax
289 sub $128, %ecx
290 movdqa %xmm0, (%edx)
291 movdqa %xmm1, 0x10(%edx)
292 movdqa %xmm2, 0x20(%edx)
293 movdqa %xmm3, 0x30(%edx)
294 movdqa %xmm4, 0x40(%edx)
295 movdqa %xmm5, 0x50(%edx)
296 movdqa %xmm6, 0x60(%edx)
297 movdqa %xmm7, 0x70(%edx)
298 lea 0x80(%edx), %edx
299
300 jae L(shl_0_gobble_cache_loop)
301 cmp $-0x40, %ecx
302 lea 0x80(%ecx), %ecx
303 jl L(shl_0_cache_less_64bytes)
304
305 movdqa (%eax), %xmm0
306 sub $0x40, %ecx
307 movdqa 0x10(%eax), %xmm1
Bruce Beare8ff1a272010-03-04 11:03:37 -0800308 movdqa %xmm0, (%edx)
309 movdqa %xmm1, 0x10(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800310 movdqa 0x20(%eax), %xmm0
311 movdqa 0x30(%eax), %xmm1
312 add $0x40, %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -0800313 movdqa %xmm0, 0x20(%edx)
314 movdqa %xmm1, 0x30(%edx)
315 add $0x40, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400316
Bruce Beare8ff1a272010-03-04 11:03:37 -0800317L(shl_0_cache_less_64bytes):
318 cmp $0x20, %ecx
319 jb L(shl_0_cache_less_32bytes)
320 movdqa (%eax), %xmm0
321 sub $0x20, %ecx
322 movdqa 0x10(%eax), %xmm1
323 add $0x20, %eax
324 movdqa %xmm0, (%edx)
325 movdqa %xmm1, 0x10(%edx)
326 add $0x20, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400327
Bruce Beare8ff1a272010-03-04 11:03:37 -0800328L(shl_0_cache_less_32bytes):
329 cmp $0x10, %ecx
330 jb L(shl_0_cache_less_16bytes)
331 sub $0x10, %ecx
332 movdqa (%eax), %xmm0
333 add $0x10, %eax
334 movdqa %xmm0, (%edx)
335 add $0x10, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400336
Bruce Beare8ff1a272010-03-04 11:03:37 -0800337L(shl_0_cache_less_16bytes):
338 add %ecx, %edx
339 add %ecx, %eax
340 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
341
Jack Renc47703a2012-02-14 12:01:52 +0400342 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800343L(shl_0_gobble_mem_loop):
344 prefetcht0 0x1c0(%eax)
345 prefetcht0 0x280(%eax)
346 prefetcht0 0x1c0(%edx)
347
348 movdqa (%eax), %xmm0
349 movdqa 0x10(%eax), %xmm1
350 movdqa 0x20(%eax), %xmm2
351 movdqa 0x30(%eax), %xmm3
352 movdqa 0x40(%eax), %xmm4
353 movdqa 0x50(%eax), %xmm5
354 movdqa 0x60(%eax), %xmm6
355 movdqa 0x70(%eax), %xmm7
356 lea 0x80(%eax), %eax
357 sub $0x80, %ecx
358 movdqa %xmm0, (%edx)
359 movdqa %xmm1, 0x10(%edx)
360 movdqa %xmm2, 0x20(%edx)
361 movdqa %xmm3, 0x30(%edx)
362 movdqa %xmm4, 0x40(%edx)
363 movdqa %xmm5, 0x50(%edx)
364 movdqa %xmm6, 0x60(%edx)
365 movdqa %xmm7, 0x70(%edx)
366 lea 0x80(%edx), %edx
367
368 jae L(shl_0_gobble_mem_loop)
369 cmp $-0x40, %ecx
370 lea 0x80(%ecx), %ecx
371 jl L(shl_0_mem_less_64bytes)
372
373 movdqa (%eax), %xmm0
374 sub $0x40, %ecx
375 movdqa 0x10(%eax), %xmm1
376
377 movdqa %xmm0, (%edx)
378 movdqa %xmm1, 0x10(%edx)
379
380 movdqa 0x20(%eax), %xmm0
381 movdqa 0x30(%eax), %xmm1
382 add $0x40, %eax
383
384 movdqa %xmm0, 0x20(%edx)
385 movdqa %xmm1, 0x30(%edx)
386 add $0x40, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400387
Bruce Beare8ff1a272010-03-04 11:03:37 -0800388L(shl_0_mem_less_64bytes):
389 cmp $0x20, %ecx
390 jb L(shl_0_mem_less_32bytes)
391 movdqa (%eax), %xmm0
392 sub $0x20, %ecx
393 movdqa 0x10(%eax), %xmm1
394 add $0x20, %eax
395 movdqa %xmm0, (%edx)
396 movdqa %xmm1, 0x10(%edx)
397 add $0x20, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400398
Bruce Beare8ff1a272010-03-04 11:03:37 -0800399L(shl_0_mem_less_32bytes):
400 cmp $0x10, %ecx
401 jb L(shl_0_mem_less_16bytes)
402 sub $0x10, %ecx
403 movdqa (%eax), %xmm0
404 add $0x10, %eax
405 movdqa %xmm0, (%edx)
406 add $0x10, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400407
Bruce Beare8ff1a272010-03-04 11:03:37 -0800408L(shl_0_mem_less_16bytes):
409 add %ecx, %edx
410 add %ecx, %eax
Jack Renc47703a2012-02-14 12:01:52 +0400411 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800412
Jack Renc47703a2012-02-14 12:01:52 +0400413 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800414L(shl_1):
Jack Renc47703a2012-02-14 12:01:52 +0400415#ifndef USE_AS_MEMMOVE
416 movaps -1(%eax), %xmm1
417#else
418 movl DEST+4(%esp), %edi
419 movaps -1(%eax), %xmm1
420 movdqu %xmm0, (%edi)
421#endif
422#ifdef DATA_CACHE_SIZE_HALF
423 cmp $DATA_CACHE_SIZE_HALF, %ecx
424#else
425# if (defined SHARED || defined __PIC__)
426 SETUP_PIC_REG(bx)
427 add $_GLOBAL_OFFSET_TABLE_, %ebx
428 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
429# else
430 cmp __x86_data_cache_size_half, %ecx
431# endif
432#endif
433 jb L(sh_1_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800434
Jack Renc47703a2012-02-14 12:01:52 +0400435 lea -64(%ecx), %ecx
436
437 .p2align 4
438L(Shl1LoopStart):
439 prefetcht0 0x1c0(%eax)
440 prefetcht0 0x1c0(%edx)
441 movaps 15(%eax), %xmm2
442 movaps 31(%eax), %xmm3
443 movaps 47(%eax), %xmm4
444 movaps 63(%eax), %xmm5
445 movaps %xmm5, %xmm7
446 palignr $1, %xmm4, %xmm5
447 palignr $1, %xmm3, %xmm4
448 movaps %xmm5, 48(%edx)
449 palignr $1, %xmm2, %xmm3
450 lea 64(%eax), %eax
451 palignr $1, %xmm1, %xmm2
452 movaps %xmm4, 32(%edx)
453 movaps %xmm3, 16(%edx)
454 movaps %xmm7, %xmm1
455 movaps %xmm2, (%edx)
456 lea 64(%edx), %edx
457 sub $64, %ecx
458 ja L(Shl1LoopStart)
459
460L(Shl1LoopLeave):
461 add $32, %ecx
462 jle L(shl_end_0)
463
464 movaps 15(%eax), %xmm2
465 movaps 31(%eax), %xmm3
466 palignr $1, %xmm2, %xmm3
467 palignr $1, %xmm1, %xmm2
468 movaps %xmm2, (%edx)
469 movaps %xmm3, 16(%edx)
470 lea 32(%edx, %ecx), %edx
471 lea 32(%eax, %ecx), %eax
472 POP (%edi)
473 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
474
475 CFI_PUSH (%edi)
476
477 .p2align 4
478L(sh_1_no_prefetch):
479 lea -32(%ecx), %ecx
480 lea -1(%eax), %eax
481 xor %edi, %edi
482
483 .p2align 4
484L(sh_1_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800485 movdqa 16(%eax, %edi), %xmm2
486 sub $32, %ecx
487 movdqa 32(%eax, %edi), %xmm3
488 movdqa %xmm3, %xmm4
489 palignr $1, %xmm2, %xmm3
490 palignr $1, %xmm1, %xmm2
491 lea 32(%edi), %edi
492 movdqa %xmm2, -32(%edx, %edi)
493 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +0400494 jb L(sh_1_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800495
496 movdqa 16(%eax, %edi), %xmm2
497 sub $32, %ecx
498 movdqa 32(%eax, %edi), %xmm3
499 movdqa %xmm3, %xmm1
500 palignr $1, %xmm2, %xmm3
501 palignr $1, %xmm4, %xmm2
502 lea 32(%edi), %edi
503 movdqa %xmm2, -32(%edx, %edi)
504 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +0400505 jae L(sh_1_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800506
Jack Renc47703a2012-02-14 12:01:52 +0400507L(sh_1_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800508 lea 32(%ecx), %ecx
509 add %ecx, %edi
510 add %edi, %edx
511 lea 1(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400512 POP (%edi)
513 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800514
Jack Renc47703a2012-02-14 12:01:52 +0400515 CFI_PUSH (%edi)
516
517 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800518L(shl_2):
Jack Renc47703a2012-02-14 12:01:52 +0400519#ifndef USE_AS_MEMMOVE
520 movaps -2(%eax), %xmm1
521#else
522 movl DEST+4(%esp), %edi
523 movaps -2(%eax), %xmm1
524 movdqu %xmm0, (%edi)
525#endif
526#ifdef DATA_CACHE_SIZE_HALF
527 cmp $DATA_CACHE_SIZE_HALF, %ecx
528#else
529# if (defined SHARED || defined __PIC__)
530 SETUP_PIC_REG(bx)
531 add $_GLOBAL_OFFSET_TABLE_, %ebx
532 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
533# else
534 cmp __x86_data_cache_size_half, %ecx
535# endif
536#endif
537 jb L(sh_2_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800538
Jack Renc47703a2012-02-14 12:01:52 +0400539 lea -64(%ecx), %ecx
540
541 .p2align 4
542L(Shl2LoopStart):
543 prefetcht0 0x1c0(%eax)
544 prefetcht0 0x1c0(%edx)
545 movaps 14(%eax), %xmm2
546 movaps 30(%eax), %xmm3
547 movaps 46(%eax), %xmm4
548 movaps 62(%eax), %xmm5
549 movaps %xmm5, %xmm7
550 palignr $2, %xmm4, %xmm5
551 palignr $2, %xmm3, %xmm4
552 movaps %xmm5, 48(%edx)
553 palignr $2, %xmm2, %xmm3
554 lea 64(%eax), %eax
555 palignr $2, %xmm1, %xmm2
556 movaps %xmm4, 32(%edx)
557 movaps %xmm3, 16(%edx)
558 movaps %xmm7, %xmm1
559 movaps %xmm2, (%edx)
560 lea 64(%edx), %edx
561 sub $64, %ecx
562 ja L(Shl2LoopStart)
563
564L(Shl2LoopLeave):
565 add $32, %ecx
566 jle L(shl_end_0)
567
568 movaps 14(%eax), %xmm2
569 movaps 30(%eax), %xmm3
570 palignr $2, %xmm2, %xmm3
571 palignr $2, %xmm1, %xmm2
572 movaps %xmm2, (%edx)
573 movaps %xmm3, 16(%edx)
574 lea 32(%edx, %ecx), %edx
575 lea 32(%eax, %ecx), %eax
576 POP (%edi)
577 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
578
579 CFI_PUSH (%edi)
580
581 .p2align 4
582L(sh_2_no_prefetch):
583 lea -32(%ecx), %ecx
584 lea -2(%eax), %eax
585 xor %edi, %edi
586
587 .p2align 4
588L(sh_2_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800589 movdqa 16(%eax, %edi), %xmm2
590 sub $32, %ecx
591 movdqa 32(%eax, %edi), %xmm3
592 movdqa %xmm3, %xmm4
593 palignr $2, %xmm2, %xmm3
594 palignr $2, %xmm1, %xmm2
595 lea 32(%edi), %edi
596 movdqa %xmm2, -32(%edx, %edi)
597 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +0400598 jb L(sh_2_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800599
600 movdqa 16(%eax, %edi), %xmm2
601 sub $32, %ecx
602 movdqa 32(%eax, %edi), %xmm3
603 movdqa %xmm3, %xmm1
604 palignr $2, %xmm2, %xmm3
605 palignr $2, %xmm4, %xmm2
606 lea 32(%edi), %edi
607 movdqa %xmm2, -32(%edx, %edi)
608 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +0400609 jae L(sh_2_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800610
Jack Renc47703a2012-02-14 12:01:52 +0400611L(sh_2_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800612 lea 32(%ecx), %ecx
613 add %ecx, %edi
614 add %edi, %edx
615 lea 2(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400616 POP (%edi)
617 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800618
Jack Renc47703a2012-02-14 12:01:52 +0400619 CFI_PUSH (%edi)
620
621 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800622L(shl_3):
Jack Renc47703a2012-02-14 12:01:52 +0400623#ifndef USE_AS_MEMMOVE
624 movaps -3(%eax), %xmm1
625#else
626 movl DEST+4(%esp), %edi
627 movaps -3(%eax), %xmm1
628 movdqu %xmm0, (%edi)
629#endif
630#ifdef DATA_CACHE_SIZE_HALF
631 cmp $DATA_CACHE_SIZE_HALF, %ecx
632#else
633# if (defined SHARED || defined __PIC__)
634 SETUP_PIC_REG(bx)
635 add $_GLOBAL_OFFSET_TABLE_, %ebx
636 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
637# else
638 cmp __x86_data_cache_size_half, %ecx
639# endif
640#endif
641 jb L(sh_3_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800642
Jack Renc47703a2012-02-14 12:01:52 +0400643 lea -64(%ecx), %ecx
644
645 .p2align 4
646L(Shl3LoopStart):
647 prefetcht0 0x1c0(%eax)
648 prefetcht0 0x1c0(%edx)
649 movaps 13(%eax), %xmm2
650 movaps 29(%eax), %xmm3
651 movaps 45(%eax), %xmm4
652 movaps 61(%eax), %xmm5
653 movaps %xmm5, %xmm7
654 palignr $3, %xmm4, %xmm5
655 palignr $3, %xmm3, %xmm4
656 movaps %xmm5, 48(%edx)
657 palignr $3, %xmm2, %xmm3
658 lea 64(%eax), %eax
659 palignr $3, %xmm1, %xmm2
660 movaps %xmm4, 32(%edx)
661 movaps %xmm3, 16(%edx)
662 movaps %xmm7, %xmm1
663 movaps %xmm2, (%edx)
664 lea 64(%edx), %edx
665 sub $64, %ecx
666 ja L(Shl3LoopStart)
667
668L(Shl3LoopLeave):
669 add $32, %ecx
670 jle L(shl_end_0)
671
672 movaps 13(%eax), %xmm2
673 movaps 29(%eax), %xmm3
674 palignr $3, %xmm2, %xmm3
675 palignr $3, %xmm1, %xmm2
676 movaps %xmm2, (%edx)
677 movaps %xmm3, 16(%edx)
678 lea 32(%edx, %ecx), %edx
679 lea 32(%eax, %ecx), %eax
680 POP (%edi)
681 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
682
683 CFI_PUSH (%edi)
684
685 .p2align 4
686L(sh_3_no_prefetch):
687 lea -32(%ecx), %ecx
688 lea -3(%eax), %eax
689 xor %edi, %edi
690
691 .p2align 4
692L(sh_3_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800693 movdqa 16(%eax, %edi), %xmm2
694 sub $32, %ecx
695 movdqa 32(%eax, %edi), %xmm3
696 movdqa %xmm3, %xmm4
697 palignr $3, %xmm2, %xmm3
698 palignr $3, %xmm1, %xmm2
699 lea 32(%edi), %edi
700 movdqa %xmm2, -32(%edx, %edi)
701 movdqa %xmm3, -16(%edx, %edi)
702
Jack Renc47703a2012-02-14 12:01:52 +0400703 jb L(sh_3_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800704
705 movdqa 16(%eax, %edi), %xmm2
706 sub $32, %ecx
707 movdqa 32(%eax, %edi), %xmm3
708 movdqa %xmm3, %xmm1
709 palignr $3, %xmm2, %xmm3
710 palignr $3, %xmm4, %xmm2
711 lea 32(%edi), %edi
712 movdqa %xmm2, -32(%edx, %edi)
713 movdqa %xmm3, -16(%edx, %edi)
714
Jack Renc47703a2012-02-14 12:01:52 +0400715 jae L(sh_3_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800716
Jack Renc47703a2012-02-14 12:01:52 +0400717L(sh_3_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800718 lea 32(%ecx), %ecx
719 add %ecx, %edi
720 add %edi, %edx
721 lea 3(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400722 POP (%edi)
723 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800724
Jack Renc47703a2012-02-14 12:01:52 +0400725 CFI_PUSH (%edi)
726
727 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800728L(shl_4):
Jack Renc47703a2012-02-14 12:01:52 +0400729#ifndef USE_AS_MEMMOVE
730 movaps -4(%eax), %xmm1
731#else
732 movl DEST+4(%esp), %edi
733 movaps -4(%eax), %xmm1
734 movdqu %xmm0, (%edi)
735#endif
736#ifdef DATA_CACHE_SIZE_HALF
737 cmp $DATA_CACHE_SIZE_HALF, %ecx
738#else
739# if (defined SHARED || defined __PIC__)
740 SETUP_PIC_REG(bx)
741 add $_GLOBAL_OFFSET_TABLE_, %ebx
742 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
743# else
744 cmp __x86_data_cache_size_half, %ecx
745# endif
746#endif
747 jb L(sh_4_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800748
Jack Renc47703a2012-02-14 12:01:52 +0400749 lea -64(%ecx), %ecx
750
751 .p2align 4
752L(Shl4LoopStart):
753 prefetcht0 0x1c0(%eax)
754 prefetcht0 0x1c0(%edx)
755 movaps 12(%eax), %xmm2
756 movaps 28(%eax), %xmm3
757 movaps 44(%eax), %xmm4
758 movaps 60(%eax), %xmm5
759 movaps %xmm5, %xmm7
760 palignr $4, %xmm4, %xmm5
761 palignr $4, %xmm3, %xmm4
762 movaps %xmm5, 48(%edx)
763 palignr $4, %xmm2, %xmm3
764 lea 64(%eax), %eax
765 palignr $4, %xmm1, %xmm2
766 movaps %xmm4, 32(%edx)
767 movaps %xmm3, 16(%edx)
768 movaps %xmm7, %xmm1
769 movaps %xmm2, (%edx)
770 lea 64(%edx), %edx
771 sub $64, %ecx
772 ja L(Shl4LoopStart)
773
774L(Shl4LoopLeave):
775 add $32, %ecx
776 jle L(shl_end_0)
777
778 movaps 12(%eax), %xmm2
779 movaps 28(%eax), %xmm3
780 palignr $4, %xmm2, %xmm3
781 palignr $4, %xmm1, %xmm2
782 movaps %xmm2, (%edx)
783 movaps %xmm3, 16(%edx)
784 lea 32(%edx, %ecx), %edx
785 lea 32(%eax, %ecx), %eax
786 POP (%edi)
787 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
788
789 CFI_PUSH (%edi)
790
791 .p2align 4
792L(sh_4_no_prefetch):
793 lea -32(%ecx), %ecx
794 lea -4(%eax), %eax
795 xor %edi, %edi
796
797 .p2align 4
798L(sh_4_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800799 movdqa 16(%eax, %edi), %xmm2
800 sub $32, %ecx
801 movdqa 32(%eax, %edi), %xmm3
802 movdqa %xmm3, %xmm4
803 palignr $4, %xmm2, %xmm3
804 palignr $4, %xmm1, %xmm2
805 lea 32(%edi), %edi
806 movdqa %xmm2, -32(%edx, %edi)
807 movdqa %xmm3, -16(%edx, %edi)
808
Jack Renc47703a2012-02-14 12:01:52 +0400809 jb L(sh_4_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800810
811 movdqa 16(%eax, %edi), %xmm2
812 sub $32, %ecx
813 movdqa 32(%eax, %edi), %xmm3
814 movdqa %xmm3, %xmm1
815 palignr $4, %xmm2, %xmm3
816 palignr $4, %xmm4, %xmm2
817 lea 32(%edi), %edi
818 movdqa %xmm2, -32(%edx, %edi)
819 movdqa %xmm3, -16(%edx, %edi)
820
Jack Renc47703a2012-02-14 12:01:52 +0400821 jae L(sh_4_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800822
Jack Renc47703a2012-02-14 12:01:52 +0400823L(sh_4_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800824 lea 32(%ecx), %ecx
825 add %ecx, %edi
826 add %edi, %edx
827 lea 4(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400828 POP (%edi)
829 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800830
Jack Renc47703a2012-02-14 12:01:52 +0400831 CFI_PUSH (%edi)
832
833 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800834L(shl_5):
Jack Renc47703a2012-02-14 12:01:52 +0400835#ifndef USE_AS_MEMMOVE
836 movaps -5(%eax), %xmm1
837#else
838 movl DEST+4(%esp), %edi
839 movaps -5(%eax), %xmm1
840 movdqu %xmm0, (%edi)
841#endif
842#ifdef DATA_CACHE_SIZE_HALF
843 cmp $DATA_CACHE_SIZE_HALF, %ecx
844#else
845# if (defined SHARED || defined __PIC__)
846 SETUP_PIC_REG(bx)
847 add $_GLOBAL_OFFSET_TABLE_, %ebx
848 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
849# else
850 cmp __x86_data_cache_size_half, %ecx
851# endif
852#endif
853 jb L(sh_5_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800854
Jack Renc47703a2012-02-14 12:01:52 +0400855 lea -64(%ecx), %ecx
856
857 .p2align 4
858L(Shl5LoopStart):
859 prefetcht0 0x1c0(%eax)
860 prefetcht0 0x1c0(%edx)
861 movaps 11(%eax), %xmm2
862 movaps 27(%eax), %xmm3
863 movaps 43(%eax), %xmm4
864 movaps 59(%eax), %xmm5
865 movaps %xmm5, %xmm7
866 palignr $5, %xmm4, %xmm5
867 palignr $5, %xmm3, %xmm4
868 movaps %xmm5, 48(%edx)
869 palignr $5, %xmm2, %xmm3
870 lea 64(%eax), %eax
871 palignr $5, %xmm1, %xmm2
872 movaps %xmm4, 32(%edx)
873 movaps %xmm3, 16(%edx)
874 movaps %xmm7, %xmm1
875 movaps %xmm2, (%edx)
876 lea 64(%edx), %edx
877 sub $64, %ecx
878 ja L(Shl5LoopStart)
879
880L(Shl5LoopLeave):
881 add $32, %ecx
882 jle L(shl_end_0)
883
884 movaps 11(%eax), %xmm2
885 movaps 27(%eax), %xmm3
886 palignr $5, %xmm2, %xmm3
887 palignr $5, %xmm1, %xmm2
888 movaps %xmm2, (%edx)
889 movaps %xmm3, 16(%edx)
890 lea 32(%edx, %ecx), %edx
891 lea 32(%eax, %ecx), %eax
892 POP (%edi)
893 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
894
895 CFI_PUSH (%edi)
896
897 .p2align 4
898L(sh_5_no_prefetch):
899 lea -32(%ecx), %ecx
900 lea -5(%eax), %eax
901 xor %edi, %edi
902
903 .p2align 4
904L(sh_5_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800905 movdqa 16(%eax, %edi), %xmm2
906 sub $32, %ecx
907 movdqa 32(%eax, %edi), %xmm3
908 movdqa %xmm3, %xmm4
909 palignr $5, %xmm2, %xmm3
910 palignr $5, %xmm1, %xmm2
911 lea 32(%edi), %edi
912 movdqa %xmm2, -32(%edx, %edi)
913 movdqa %xmm3, -16(%edx, %edi)
914
Jack Renc47703a2012-02-14 12:01:52 +0400915 jb L(sh_5_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800916
917 movdqa 16(%eax, %edi), %xmm2
918 sub $32, %ecx
919 movdqa 32(%eax, %edi), %xmm3
920 movdqa %xmm3, %xmm1
921 palignr $5, %xmm2, %xmm3
922 palignr $5, %xmm4, %xmm2
923 lea 32(%edi), %edi
924 movdqa %xmm2, -32(%edx, %edi)
925 movdqa %xmm3, -16(%edx, %edi)
926
Jack Renc47703a2012-02-14 12:01:52 +0400927 jae L(sh_5_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800928
Jack Renc47703a2012-02-14 12:01:52 +0400929L(sh_5_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800930 lea 32(%ecx), %ecx
931 add %ecx, %edi
932 add %edi, %edx
933 lea 5(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400934 POP (%edi)
935 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800936
Jack Renc47703a2012-02-14 12:01:52 +0400937 CFI_PUSH (%edi)
938
939 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800940L(shl_6):
Jack Renc47703a2012-02-14 12:01:52 +0400941#ifndef USE_AS_MEMMOVE
942 movaps -6(%eax), %xmm1
943#else
944 movl DEST+4(%esp), %edi
945 movaps -6(%eax), %xmm1
946 movdqu %xmm0, (%edi)
947#endif
948#ifdef DATA_CACHE_SIZE_HALF
949 cmp $DATA_CACHE_SIZE_HALF, %ecx
950#else
951# if (defined SHARED || defined __PIC__)
952 SETUP_PIC_REG(bx)
953 add $_GLOBAL_OFFSET_TABLE_, %ebx
954 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
955# else
956 cmp __x86_data_cache_size_half, %ecx
957# endif
958#endif
959 jb L(sh_6_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800960
Jack Renc47703a2012-02-14 12:01:52 +0400961 lea -64(%ecx), %ecx
962
963 .p2align 4
964L(Shl6LoopStart):
965 prefetcht0 0x1c0(%eax)
966 prefetcht0 0x1c0(%edx)
967 movaps 10(%eax), %xmm2
968 movaps 26(%eax), %xmm3
969 movaps 42(%eax), %xmm4
970 movaps 58(%eax), %xmm5
971 movaps %xmm5, %xmm7
972 palignr $6, %xmm4, %xmm5
973 palignr $6, %xmm3, %xmm4
974 movaps %xmm5, 48(%edx)
975 palignr $6, %xmm2, %xmm3
976 lea 64(%eax), %eax
977 palignr $6, %xmm1, %xmm2
978 movaps %xmm4, 32(%edx)
979 movaps %xmm3, 16(%edx)
980 movaps %xmm7, %xmm1
981 movaps %xmm2, (%edx)
982 lea 64(%edx), %edx
983 sub $64, %ecx
984 ja L(Shl6LoopStart)
985
986L(Shl6LoopLeave):
987 add $32, %ecx
988 jle L(shl_end_0)
989
990 movaps 10(%eax), %xmm2
991 movaps 26(%eax), %xmm3
992 palignr $6, %xmm2, %xmm3
993 palignr $6, %xmm1, %xmm2
994 movaps %xmm2, (%edx)
995 movaps %xmm3, 16(%edx)
996 lea 32(%edx, %ecx), %edx
997 lea 32(%eax, %ecx), %eax
998 POP (%edi)
999 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1000
1001 CFI_PUSH (%edi)
1002
1003 .p2align 4
/*
 * No-prefetch variant of the shift-by-6 path: bias %ecx by -32, rewind %eax
 * by 6 so the movdqa loads below are aligned, and use %edi (starting at 0)
 * as a running byte offset applied to both cursors.
 */
1004L(sh_6_no_prefetch):
1005 lea -32(%ecx), %ecx
1006 lea -6(%eax), %eax
1007 xor %edi, %edi
1008
1009 .p2align 4
/*
 * Unrolled twice, 32 bytes per half. The carried chunk alternates between
 * %xmm1 (consumed in the first half) and %xmm4 (consumed in the second).
 * Each branch tests the carry produced by the `sub $32` at the top of its
 * half; the lea/movdqa/palignr instructions in between leave flags alone.
 */
1010L(sh_6_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001011 movdqa 16(%eax, %edi), %xmm2
1012 sub $32, %ecx
1013 movdqa 32(%eax, %edi), %xmm3
1014 movdqa %xmm3, %xmm4
1015 palignr $6, %xmm2, %xmm3
1016 palignr $6, %xmm1, %xmm2
1017 lea 32(%edi), %edi
1018 movdqa %xmm2, -32(%edx, %edi)
1019 movdqa %xmm3, -16(%edx, %edi)
1020
Jack Renc47703a2012-02-14 12:01:52 +04001021 jb L(sh_6_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001022
1023 movdqa 16(%eax, %edi), %xmm2
1024 sub $32, %ecx
1025 movdqa 32(%eax, %edi), %xmm3
1026 movdqa %xmm3, %xmm1
1027 palignr $6, %xmm2, %xmm3
1028 palignr $6, %xmm4, %xmm2
1029 lea 32(%edi), %edi
1030 movdqa %xmm2, -32(%edx, %edi)
1031 movdqa %xmm3, -16(%edx, %edi)
1032
Jack Renc47703a2012-02-14 12:01:52 +04001033 jae L(sh_6_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001034
/*
 * Undo the -32 bias so %ecx is the tail count, fold the running offset and
 * the tail into both cursors (re-adding the 6 bytes to %eax), restore %edi
 * and dispatch on the tail count.
 * NOTE(review): POP / CFI_PUSH / BRANCH_TO_JMPTBL_ENTRY are macros defined
 * outside this chunk -- confirm their exact effect there.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001035L(sh_6_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001036 lea 32(%ecx), %ecx
1037 add %ecx, %edi
1038 add %edi, %edx
1039 lea 6(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001040 POP (%edi)
1041 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001042
Jack Renc47703a2012-02-14 12:01:52 +04001043 CFI_PUSH (%edi)
1044
1045 .p2align 4
/*
 * L(shl_7): forward copy path for a source cursor %eax that is 7 bytes past
 * a 16-byte boundary (the aligned movaps load from -7(%eax) relies on it).
 * %xmm1 is primed with that preceding aligned chunk, %edx is the destination
 * cursor (written with aligned stores) and %ecx the byte count left.
 * With USE_AS_MEMMOVE, %xmm0 is first stored unaligned to the address read
 * from DEST+4(%esp).
 * NOTE(review): %xmm0 and the pushed %edi are set up before this chunk, and
 * POP / CFI_PUSH / BRANCH_TO_JMPTBL_ENTRY are macros defined elsewhere; the
 * CFI_PUSH after each table branch presumably restores the unwind state for
 * code that is reached by jump -- confirm.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001046L(shl_7):
Jack Renc47703a2012-02-14 12:01:52 +04001047#ifndef USE_AS_MEMMOVE
1048 movaps -7(%eax), %xmm1
1049#else
1050 movl DEST+4(%esp), %edi
1051 movaps -7(%eax), %xmm1
1052 movdqu %xmm0, (%edi)
1053#endif
/* Counts below half the data-cache size (unsigned compare) take the loop without prefetch. */
1054#ifdef DATA_CACHE_SIZE_HALF
1055 cmp $DATA_CACHE_SIZE_HALF, %ecx
1056#else
1057# if (defined SHARED || defined __PIC__)
1058 SETUP_PIC_REG(bx)
1059 add $_GLOBAL_OFFSET_TABLE_, %ebx
1060 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1061# else
1062 cmp __x86_data_cache_size_half, %ecx
1063# endif
1064#endif
1065 jb L(sh_7_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001066
/* Bias %ecx by -64 so `sub $64` + `ja` loops only while more than 64 bytes remain after the current block. */
Jack Renc47703a2012-02-14 12:01:52 +04001067 lea -64(%ecx), %ecx
1068
1069 .p2align 4
/*
 * 64 bytes per iteration: prefetch 0x1c0 bytes ahead of both cursors, load
 * four aligned chunks, and let palignr $7 splice each chunk with its
 * predecessor (%xmm1 carries the previous one) into the 16 bytes stored to
 * the destination. %xmm7 keeps the unshifted last chunk for the next pass.
 */
1070L(Shl7LoopStart):
1071 prefetcht0 0x1c0(%eax)
1072 prefetcht0 0x1c0(%edx)
1073 movaps 9(%eax), %xmm2
1074 movaps 25(%eax), %xmm3
1075 movaps 41(%eax), %xmm4
1076 movaps 57(%eax), %xmm5
1077 movaps %xmm5, %xmm7
1078 palignr $7, %xmm4, %xmm5
1079 palignr $7, %xmm3, %xmm4
1080 movaps %xmm5, 48(%edx)
1081 palignr $7, %xmm2, %xmm3
1082 lea 64(%eax), %eax
1083 palignr $7, %xmm1, %xmm2
1084 movaps %xmm4, 32(%edx)
1085 movaps %xmm3, 16(%edx)
1086 movaps %xmm7, %xmm1
1087 movaps %xmm2, (%edx)
1088 lea 64(%edx), %edx
1089 sub $64, %ecx
1090 ja L(Shl7LoopStart)
1091
/*
 * %ecx + 32 is (bytes left - 32). If that is <= 0, finish in L(shl_end_0).
 * Otherwise copy one more 32-byte block, move both cursors to the end of
 * the data and dispatch on the tail count left in %ecx.
 */
1092L(Shl7LoopLeave):
1093 add $32, %ecx
1094 jle L(shl_end_0)
1095
1096 movaps 9(%eax), %xmm2
1097 movaps 25(%eax), %xmm3
1098 palignr $7, %xmm2, %xmm3
1099 palignr $7, %xmm1, %xmm2
1100 movaps %xmm2, (%edx)
1101 movaps %xmm3, 16(%edx)
1102 lea 32(%edx, %ecx), %edx
1103 lea 32(%eax, %ecx), %eax
1104 POP (%edi)
1105 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1106
1107 CFI_PUSH (%edi)
1108
1109 .p2align 4
/*
 * No-prefetch variant: bias %ecx by -32, rewind %eax by 7 so the movdqa
 * loads are aligned, and use %edi (starting at 0) as a running byte offset
 * applied to both cursors.
 */
1110L(sh_7_no_prefetch):
1111 lea -32(%ecx), %ecx
1112 lea -7(%eax), %eax
1113 xor %edi, %edi
1114
1115 .p2align 4
/*
 * Unrolled twice, 32 bytes per half. The carried chunk alternates between
 * %xmm1 (consumed in the first half) and %xmm4 (consumed in the second).
 * Each branch tests the carry produced by the `sub $32` at the top of its
 * half; the lea/movdqa/palignr instructions in between leave flags alone.
 */
1116L(sh_7_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001117 movdqa 16(%eax, %edi), %xmm2
1118 sub $32, %ecx
1119 movdqa 32(%eax, %edi), %xmm3
1120 movdqa %xmm3, %xmm4
1121 palignr $7, %xmm2, %xmm3
1122 palignr $7, %xmm1, %xmm2
1123 lea 32(%edi), %edi
1124 movdqa %xmm2, -32(%edx, %edi)
1125 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001126 jb L(sh_7_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001127
1128 movdqa 16(%eax, %edi), %xmm2
1129 sub $32, %ecx
1130 movdqa 32(%eax, %edi), %xmm3
1131 movdqa %xmm3, %xmm1
1132 palignr $7, %xmm2, %xmm3
1133 palignr $7, %xmm4, %xmm2
1134 lea 32(%edi), %edi
1135 movdqa %xmm2, -32(%edx, %edi)
1136 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001137 jae L(sh_7_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001138
/*
 * Undo the -32 bias so %ecx is the tail count, fold the running offset and
 * the tail into both cursors (re-adding the 7 bytes to %eax), restore %edi
 * and dispatch on the tail count.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001139L(sh_7_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001140 lea 32(%ecx), %ecx
1141 add %ecx, %edi
1142 add %edi, %edx
1143 lea 7(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001144 POP (%edi)
1145 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001146
Jack Renc47703a2012-02-14 12:01:52 +04001147 CFI_PUSH (%edi)
1148
1149 .p2align 4
/*
 * L(shl_8): forward copy path for a source cursor %eax that is 8 bytes past
 * a 16-byte boundary (the aligned movaps load from -8(%eax) relies on it).
 * %xmm1 is primed with that preceding aligned chunk, %edx is the destination
 * cursor (written with aligned stores) and %ecx the byte count left.
 * With USE_AS_MEMMOVE, %xmm0 is first stored unaligned to the address read
 * from DEST+4(%esp).
 * NOTE(review): %xmm0 and the pushed %edi are set up before this chunk, and
 * POP / CFI_PUSH / BRANCH_TO_JMPTBL_ENTRY are macros defined elsewhere; the
 * CFI_PUSH after each table branch presumably restores the unwind state for
 * code that is reached by jump -- confirm.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001150L(shl_8):
Jack Renc47703a2012-02-14 12:01:52 +04001151#ifndef USE_AS_MEMMOVE
1152 movaps -8(%eax), %xmm1
1153#else
1154 movl DEST+4(%esp), %edi
1155 movaps -8(%eax), %xmm1
1156 movdqu %xmm0, (%edi)
1157#endif
/* Counts below half the data-cache size (unsigned compare) take the loop without prefetch. */
1158#ifdef DATA_CACHE_SIZE_HALF
1159 cmp $DATA_CACHE_SIZE_HALF, %ecx
1160#else
1161# if (defined SHARED || defined __PIC__)
1162 SETUP_PIC_REG(bx)
1163 add $_GLOBAL_OFFSET_TABLE_, %ebx
1164 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1165# else
1166 cmp __x86_data_cache_size_half, %ecx
1167# endif
1168#endif
1169 jb L(sh_8_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001170
/* Bias %ecx by -64 so `sub $64` + `ja` loops only while more than 64 bytes remain after the current block. */
Jack Renc47703a2012-02-14 12:01:52 +04001171 lea -64(%ecx), %ecx
1172
1173 .p2align 4
/*
 * 64 bytes per iteration: prefetch 0x1c0 bytes ahead of both cursors, load
 * four aligned chunks, and let palignr $8 splice each chunk with its
 * predecessor (%xmm1 carries the previous one) into the 16 bytes stored to
 * the destination. %xmm7 keeps the unshifted last chunk for the next pass.
 */
1174L(Shl8LoopStart):
1175 prefetcht0 0x1c0(%eax)
1176 prefetcht0 0x1c0(%edx)
1177 movaps 8(%eax), %xmm2
1178 movaps 24(%eax), %xmm3
1179 movaps 40(%eax), %xmm4
1180 movaps 56(%eax), %xmm5
1181 movaps %xmm5, %xmm7
1182 palignr $8, %xmm4, %xmm5
1183 palignr $8, %xmm3, %xmm4
1184 movaps %xmm5, 48(%edx)
1185 palignr $8, %xmm2, %xmm3
1186 lea 64(%eax), %eax
1187 palignr $8, %xmm1, %xmm2
1188 movaps %xmm4, 32(%edx)
1189 movaps %xmm3, 16(%edx)
1190 movaps %xmm7, %xmm1
1191 movaps %xmm2, (%edx)
1192 lea 64(%edx), %edx
1193 sub $64, %ecx
1194 ja L(Shl8LoopStart)
1195
/*
 * %ecx + 32 is (bytes left - 32). If that is <= 0, finish in L(shl_end_0).
 * Otherwise copy one more 32-byte block, move both cursors to the end of
 * the data and dispatch on the tail count left in %ecx.
 */
1196L(LoopLeave8):
1197 add $32, %ecx
1198 jle L(shl_end_0)
1199
1200 movaps 8(%eax), %xmm2
1201 movaps 24(%eax), %xmm3
1202 palignr $8, %xmm2, %xmm3
1203 palignr $8, %xmm1, %xmm2
1204 movaps %xmm2, (%edx)
1205 movaps %xmm3, 16(%edx)
1206 lea 32(%edx, %ecx), %edx
1207 lea 32(%eax, %ecx), %eax
1208 POP (%edi)
1209 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1210
1211 CFI_PUSH (%edi)
1212
1213 .p2align 4
/*
 * No-prefetch variant: bias %ecx by -32, rewind %eax by 8 so the movdqa
 * loads are aligned, and use %edi (starting at 0) as a running byte offset
 * applied to both cursors.
 */
1214L(sh_8_no_prefetch):
1215 lea -32(%ecx), %ecx
1216 lea -8(%eax), %eax
1217 xor %edi, %edi
1218
1219 .p2align 4
/*
 * Unrolled twice, 32 bytes per half. The carried chunk alternates between
 * %xmm1 (consumed in the first half) and %xmm4 (consumed in the second).
 * Each branch tests the carry produced by the `sub $32` at the top of its
 * half; the lea/movdqa/palignr instructions in between leave flags alone.
 */
1220L(sh_8_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001221 movdqa 16(%eax, %edi), %xmm2
1222 sub $32, %ecx
1223 movdqa 32(%eax, %edi), %xmm3
1224 movdqa %xmm3, %xmm4
1225 palignr $8, %xmm2, %xmm3
1226 palignr $8, %xmm1, %xmm2
1227 lea 32(%edi), %edi
1228 movdqa %xmm2, -32(%edx, %edi)
1229 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001230 jb L(sh_8_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001231
1232 movdqa 16(%eax, %edi), %xmm2
1233 sub $32, %ecx
1234 movdqa 32(%eax, %edi), %xmm3
1235 movdqa %xmm3, %xmm1
1236 palignr $8, %xmm2, %xmm3
1237 palignr $8, %xmm4, %xmm2
1238 lea 32(%edi), %edi
1239 movdqa %xmm2, -32(%edx, %edi)
1240 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001241 jae L(sh_8_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001242
/*
 * Undo the -32 bias so %ecx is the tail count, fold the running offset and
 * the tail into both cursors (re-adding the 8 bytes to %eax), restore %edi
 * and dispatch on the tail count.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001243L(sh_8_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001244 lea 32(%ecx), %ecx
1245 add %ecx, %edi
1246 add %edi, %edx
1247 lea 8(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001248 POP (%edi)
1249 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001250
Jack Renc47703a2012-02-14 12:01:52 +04001251 CFI_PUSH (%edi)
1252
1253 .p2align 4
/*
 * L(shl_9): forward copy path for a source cursor %eax that is 9 bytes past
 * a 16-byte boundary (the aligned movaps load from -9(%eax) relies on it).
 * %xmm1 is primed with that preceding aligned chunk, %edx is the destination
 * cursor (written with aligned stores) and %ecx the byte count left.
 * With USE_AS_MEMMOVE, %xmm0 is first stored unaligned to the address read
 * from DEST+4(%esp).
 * NOTE(review): %xmm0 and the pushed %edi are set up before this chunk, and
 * POP / CFI_PUSH / BRANCH_TO_JMPTBL_ENTRY are macros defined elsewhere; the
 * CFI_PUSH after each table branch presumably restores the unwind state for
 * code that is reached by jump -- confirm.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001254L(shl_9):
Jack Renc47703a2012-02-14 12:01:52 +04001255#ifndef USE_AS_MEMMOVE
1256 movaps -9(%eax), %xmm1
1257#else
1258 movl DEST+4(%esp), %edi
1259 movaps -9(%eax), %xmm1
1260 movdqu %xmm0, (%edi)
1261#endif
/* Counts below half the data-cache size (unsigned compare) take the loop without prefetch. */
1262#ifdef DATA_CACHE_SIZE_HALF
1263 cmp $DATA_CACHE_SIZE_HALF, %ecx
1264#else
1265# if (defined SHARED || defined __PIC__)
1266 SETUP_PIC_REG(bx)
1267 add $_GLOBAL_OFFSET_TABLE_, %ebx
1268 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1269# else
1270 cmp __x86_data_cache_size_half, %ecx
1271# endif
1272#endif
1273 jb L(sh_9_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001274
/* Bias %ecx by -64 so `sub $64` + `ja` loops only while more than 64 bytes remain after the current block. */
Jack Renc47703a2012-02-14 12:01:52 +04001275 lea -64(%ecx), %ecx
1276
1277 .p2align 4
/*
 * 64 bytes per iteration: prefetch 0x1c0 bytes ahead of both cursors, load
 * four aligned chunks, and let palignr $9 splice each chunk with its
 * predecessor (%xmm1 carries the previous one) into the 16 bytes stored to
 * the destination. %xmm7 keeps the unshifted last chunk for the next pass.
 */
1278L(Shl9LoopStart):
1279 prefetcht0 0x1c0(%eax)
1280 prefetcht0 0x1c0(%edx)
1281 movaps 7(%eax), %xmm2
1282 movaps 23(%eax), %xmm3
1283 movaps 39(%eax), %xmm4
1284 movaps 55(%eax), %xmm5
1285 movaps %xmm5, %xmm7
1286 palignr $9, %xmm4, %xmm5
1287 palignr $9, %xmm3, %xmm4
1288 movaps %xmm5, 48(%edx)
1289 palignr $9, %xmm2, %xmm3
1290 lea 64(%eax), %eax
1291 palignr $9, %xmm1, %xmm2
1292 movaps %xmm4, 32(%edx)
1293 movaps %xmm3, 16(%edx)
1294 movaps %xmm7, %xmm1
1295 movaps %xmm2, (%edx)
1296 lea 64(%edx), %edx
1297 sub $64, %ecx
1298 ja L(Shl9LoopStart)
1299
/*
 * %ecx + 32 is (bytes left - 32). If that is <= 0, finish in L(shl_end_0).
 * Otherwise copy one more 32-byte block, move both cursors to the end of
 * the data and dispatch on the tail count left in %ecx.
 */
1300L(Shl9LoopLeave):
1301 add $32, %ecx
1302 jle L(shl_end_0)
1303
1304 movaps 7(%eax), %xmm2
1305 movaps 23(%eax), %xmm3
1306 palignr $9, %xmm2, %xmm3
1307 palignr $9, %xmm1, %xmm2
1308
1309 movaps %xmm2, (%edx)
1310 movaps %xmm3, 16(%edx)
1311 lea 32(%edx, %ecx), %edx
1312 lea 32(%eax, %ecx), %eax
1313 POP (%edi)
1314 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1315
1316 CFI_PUSH (%edi)
1317
1318 .p2align 4
/*
 * No-prefetch variant: bias %ecx by -32, rewind %eax by 9 so the movdqa
 * loads are aligned, and use %edi (starting at 0) as a running byte offset
 * applied to both cursors.
 */
1319L(sh_9_no_prefetch):
1320 lea -32(%ecx), %ecx
1321 lea -9(%eax), %eax
1322 xor %edi, %edi
1323
1324 .p2align 4
/*
 * Unrolled twice, 32 bytes per half. The carried chunk alternates between
 * %xmm1 (consumed in the first half) and %xmm4 (consumed in the second).
 * Each branch tests the carry produced by the `sub $32` at the top of its
 * half; the lea/movdqa/palignr instructions in between leave flags alone.
 */
1325L(sh_9_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001326 movdqa 16(%eax, %edi), %xmm2
1327 sub $32, %ecx
1328 movdqa 32(%eax, %edi), %xmm3
1329 movdqa %xmm3, %xmm4
1330 palignr $9, %xmm2, %xmm3
1331 palignr $9, %xmm1, %xmm2
1332 lea 32(%edi), %edi
1333 movdqa %xmm2, -32(%edx, %edi)
1334 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001335 jb L(sh_9_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001336
1337 movdqa 16(%eax, %edi), %xmm2
1338 sub $32, %ecx
1339 movdqa 32(%eax, %edi), %xmm3
1340 movdqa %xmm3, %xmm1
1341 palignr $9, %xmm2, %xmm3
1342 palignr $9, %xmm4, %xmm2
1343 lea 32(%edi), %edi
1344 movdqa %xmm2, -32(%edx, %edi)
1345 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001346 jae L(sh_9_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001347
/*
 * Undo the -32 bias so %ecx is the tail count, fold the running offset and
 * the tail into both cursors (re-adding the 9 bytes to %eax), restore %edi
 * and dispatch on the tail count.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001348L(sh_9_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001349 lea 32(%ecx), %ecx
1350 add %ecx, %edi
1351 add %edi, %edx
1352 lea 9(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001353 POP (%edi)
1354 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001355
Jack Renc47703a2012-02-14 12:01:52 +04001356 CFI_PUSH (%edi)
1357
1358 .p2align 4
/*
 * L(shl_10): forward copy path for a source cursor %eax that is 10 bytes
 * past a 16-byte boundary (the aligned movaps load from -10(%eax) relies on
 * it). %xmm1 is primed with that preceding aligned chunk, %edx is the
 * destination cursor (written with aligned stores) and %ecx the byte count
 * left. With USE_AS_MEMMOVE, %xmm0 is first stored unaligned to the address
 * read from DEST+4(%esp).
 * NOTE(review): %xmm0 and the pushed %edi are set up before this chunk, and
 * POP / CFI_PUSH / BRANCH_TO_JMPTBL_ENTRY are macros defined elsewhere; the
 * CFI_PUSH after each table branch presumably restores the unwind state for
 * code that is reached by jump -- confirm.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001359L(shl_10):
Jack Renc47703a2012-02-14 12:01:52 +04001360#ifndef USE_AS_MEMMOVE
1361 movaps -10(%eax), %xmm1
1362#else
1363 movl DEST+4(%esp), %edi
1364 movaps -10(%eax), %xmm1
1365 movdqu %xmm0, (%edi)
1366#endif
/* Counts below half the data-cache size (unsigned compare) take the loop without prefetch. */
1367#ifdef DATA_CACHE_SIZE_HALF
1368 cmp $DATA_CACHE_SIZE_HALF, %ecx
1369#else
1370# if (defined SHARED || defined __PIC__)
1371 SETUP_PIC_REG(bx)
1372 add $_GLOBAL_OFFSET_TABLE_, %ebx
1373 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1374# else
1375 cmp __x86_data_cache_size_half, %ecx
1376# endif
1377#endif
1378 jb L(sh_10_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001379
/* Bias %ecx by -64 so `sub $64` + `ja` loops only while more than 64 bytes remain after the current block. */
Jack Renc47703a2012-02-14 12:01:52 +04001380 lea -64(%ecx), %ecx
1381
1382 .p2align 4
/*
 * 64 bytes per iteration: prefetch 0x1c0 bytes ahead of both cursors, load
 * four aligned chunks, and let palignr $10 splice each chunk with its
 * predecessor (%xmm1 carries the previous one) into the 16 bytes stored to
 * the destination. %xmm7 keeps the unshifted last chunk for the next pass.
 */
1383L(Shl10LoopStart):
1384 prefetcht0 0x1c0(%eax)
1385 prefetcht0 0x1c0(%edx)
1386 movaps 6(%eax), %xmm2
1387 movaps 22(%eax), %xmm3
1388 movaps 38(%eax), %xmm4
1389 movaps 54(%eax), %xmm5
1390 movaps %xmm5, %xmm7
1391 palignr $10, %xmm4, %xmm5
1392 palignr $10, %xmm3, %xmm4
1393 movaps %xmm5, 48(%edx)
1394 palignr $10, %xmm2, %xmm3
1395 lea 64(%eax), %eax
1396 palignr $10, %xmm1, %xmm2
1397 movaps %xmm4, 32(%edx)
1398 movaps %xmm3, 16(%edx)
1399 movaps %xmm7, %xmm1
1400 movaps %xmm2, (%edx)
1401 lea 64(%edx), %edx
1402 sub $64, %ecx
1403 ja L(Shl10LoopStart)
1404
/*
 * %ecx + 32 is (bytes left - 32). If that is <= 0, finish in L(shl_end_0).
 * Otherwise copy one more 32-byte block, move both cursors to the end of
 * the data and dispatch on the tail count left in %ecx.
 */
1405L(Shl10LoopLeave):
1406 add $32, %ecx
1407 jle L(shl_end_0)
1408
1409 movaps 6(%eax), %xmm2
1410 movaps 22(%eax), %xmm3
1411 palignr $10, %xmm2, %xmm3
1412 palignr $10, %xmm1, %xmm2
1413
1414 movaps %xmm2, (%edx)
1415 movaps %xmm3, 16(%edx)
1416 lea 32(%edx, %ecx), %edx
1417 lea 32(%eax, %ecx), %eax
1418 POP (%edi)
1419 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1420
1421 CFI_PUSH (%edi)
1422
1423 .p2align 4
/*
 * No-prefetch variant: bias %ecx by -32, rewind %eax by 10 so the movdqa
 * loads are aligned, and use %edi (starting at 0) as a running byte offset
 * applied to both cursors.
 */
1424L(sh_10_no_prefetch):
1425 lea -32(%ecx), %ecx
1426 lea -10(%eax), %eax
1427 xor %edi, %edi
1428
1429 .p2align 4
/*
 * Unrolled twice, 32 bytes per half. The carried chunk alternates between
 * %xmm1 (consumed in the first half) and %xmm4 (consumed in the second).
 * Each branch tests the carry produced by the `sub $32` at the top of its
 * half; the lea/movdqa/palignr instructions in between leave flags alone.
 */
1430L(sh_10_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001431 movdqa 16(%eax, %edi), %xmm2
1432 sub $32, %ecx
1433 movdqa 32(%eax, %edi), %xmm3
1434 movdqa %xmm3, %xmm4
1435 palignr $10, %xmm2, %xmm3
1436 palignr $10, %xmm1, %xmm2
1437 lea 32(%edi), %edi
1438 movdqa %xmm2, -32(%edx, %edi)
1439 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001440 jb L(sh_10_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001441
1442 movdqa 16(%eax, %edi), %xmm2
1443 sub $32, %ecx
1444 movdqa 32(%eax, %edi), %xmm3
1445 movdqa %xmm3, %xmm1
1446 palignr $10, %xmm2, %xmm3
1447 palignr $10, %xmm4, %xmm2
1448 lea 32(%edi), %edi
1449 movdqa %xmm2, -32(%edx, %edi)
1450 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001451 jae L(sh_10_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001452
/*
 * Undo the -32 bias so %ecx is the tail count, fold the running offset and
 * the tail into both cursors (re-adding the 10 bytes to %eax), restore %edi
 * and dispatch on the tail count.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001453L(sh_10_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001454 lea 32(%ecx), %ecx
1455 add %ecx, %edi
1456 add %edi, %edx
1457 lea 10(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001458 POP (%edi)
1459 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001460
Jack Renc47703a2012-02-14 12:01:52 +04001461 CFI_PUSH (%edi)
1462
1463 .p2align 4
/*
 * L(shl_11): forward copy path for a source cursor %eax that is 11 bytes
 * past a 16-byte boundary (the aligned movaps load from -11(%eax) relies on
 * it). %xmm1 is primed with that preceding aligned chunk, %edx is the
 * destination cursor (written with aligned stores) and %ecx the byte count
 * left. With USE_AS_MEMMOVE, %xmm0 is first stored unaligned to the address
 * read from DEST+4(%esp).
 * NOTE(review): %xmm0 and the pushed %edi are set up before this chunk, and
 * POP / CFI_PUSH / BRANCH_TO_JMPTBL_ENTRY are macros defined elsewhere; the
 * CFI_PUSH after each table branch presumably restores the unwind state for
 * code that is reached by jump -- confirm.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001464L(shl_11):
Jack Renc47703a2012-02-14 12:01:52 +04001465#ifndef USE_AS_MEMMOVE
1466 movaps -11(%eax), %xmm1
1467#else
1468 movl DEST+4(%esp), %edi
1469 movaps -11(%eax), %xmm1
1470 movdqu %xmm0, (%edi)
1471#endif
/* Counts below half the data-cache size (unsigned compare) take the loop without prefetch. */
1472#ifdef DATA_CACHE_SIZE_HALF
1473 cmp $DATA_CACHE_SIZE_HALF, %ecx
1474#else
1475# if (defined SHARED || defined __PIC__)
1476 SETUP_PIC_REG(bx)
1477 add $_GLOBAL_OFFSET_TABLE_, %ebx
1478 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1479# else
1480 cmp __x86_data_cache_size_half, %ecx
1481# endif
1482#endif
1483 jb L(sh_11_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001484
/* Bias %ecx by -64 so `sub $64` + `ja` loops only while more than 64 bytes remain after the current block. */
Jack Renc47703a2012-02-14 12:01:52 +04001485 lea -64(%ecx), %ecx
1486
1487 .p2align 4
/*
 * 64 bytes per iteration: prefetch 0x1c0 bytes ahead of both cursors, load
 * four aligned chunks, and let palignr $11 splice each chunk with its
 * predecessor (%xmm1 carries the previous one) into the 16 bytes stored to
 * the destination. %xmm7 keeps the unshifted last chunk for the next pass.
 */
1488L(Shl11LoopStart):
1489 prefetcht0 0x1c0(%eax)
1490 prefetcht0 0x1c0(%edx)
1491 movaps 5(%eax), %xmm2
1492 movaps 21(%eax), %xmm3
1493 movaps 37(%eax), %xmm4
1494 movaps 53(%eax), %xmm5
1495 movaps %xmm5, %xmm7
1496 palignr $11, %xmm4, %xmm5
1497 palignr $11, %xmm3, %xmm4
1498 movaps %xmm5, 48(%edx)
1499 palignr $11, %xmm2, %xmm3
1500 lea 64(%eax), %eax
1501 palignr $11, %xmm1, %xmm2
1502 movaps %xmm4, 32(%edx)
1503 movaps %xmm3, 16(%edx)
1504 movaps %xmm7, %xmm1
1505 movaps %xmm2, (%edx)
1506 lea 64(%edx), %edx
1507 sub $64, %ecx
1508 ja L(Shl11LoopStart)
1509
/*
 * %ecx + 32 is (bytes left - 32). If that is <= 0, finish in L(shl_end_0).
 * Otherwise copy one more 32-byte block, move both cursors to the end of
 * the data and dispatch on the tail count left in %ecx.
 */
1510L(Shl11LoopLeave):
1511 add $32, %ecx
1512 jle L(shl_end_0)
1513
1514 movaps 5(%eax), %xmm2
1515 movaps 21(%eax), %xmm3
1516 palignr $11, %xmm2, %xmm3
1517 palignr $11, %xmm1, %xmm2
1518
1519 movaps %xmm2, (%edx)
1520 movaps %xmm3, 16(%edx)
1521 lea 32(%edx, %ecx), %edx
1522 lea 32(%eax, %ecx), %eax
1523 POP (%edi)
1524 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1525
1526 CFI_PUSH (%edi)
1527
1528 .p2align 4
/*
 * No-prefetch variant: bias %ecx by -32, rewind %eax by 11 so the movdqa
 * loads are aligned, and use %edi (starting at 0) as a running byte offset
 * applied to both cursors.
 */
1529L(sh_11_no_prefetch):
1530 lea -32(%ecx), %ecx
1531 lea -11(%eax), %eax
1532 xor %edi, %edi
1533
1534 .p2align 4
/*
 * Unrolled twice, 32 bytes per half. The carried chunk alternates between
 * %xmm1 (consumed in the first half) and %xmm4 (consumed in the second).
 * Each branch tests the carry produced by the `sub $32` at the top of its
 * half; the lea/movdqa/palignr instructions in between leave flags alone.
 */
1535L(sh_11_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001536 movdqa 16(%eax, %edi), %xmm2
1537 sub $32, %ecx
1538 movdqa 32(%eax, %edi), %xmm3
1539 movdqa %xmm3, %xmm4
1540 palignr $11, %xmm2, %xmm3
1541 palignr $11, %xmm1, %xmm2
1542 lea 32(%edi), %edi
1543 movdqa %xmm2, -32(%edx, %edi)
1544 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001545 jb L(sh_11_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001546
1547 movdqa 16(%eax, %edi), %xmm2
1548 sub $32, %ecx
1549 movdqa 32(%eax, %edi), %xmm3
1550 movdqa %xmm3, %xmm1
1551 palignr $11, %xmm2, %xmm3
1552 palignr $11, %xmm4, %xmm2
1553 lea 32(%edi), %edi
1554 movdqa %xmm2, -32(%edx, %edi)
1555 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001556 jae L(sh_11_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001557
/*
 * Undo the -32 bias so %ecx is the tail count, fold the running offset and
 * the tail into both cursors (re-adding the 11 bytes to %eax), restore %edi
 * and dispatch on the tail count.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001558L(sh_11_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001559 lea 32(%ecx), %ecx
1560 add %ecx, %edi
1561 add %edi, %edx
1562 lea 11(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001563 POP (%edi)
1564 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001565
Jack Renc47703a2012-02-14 12:01:52 +04001566 CFI_PUSH (%edi)
1567
1568 .p2align 4
/*
 * L(shl_12): forward copy path for a source cursor %eax that is 12 bytes
 * past a 16-byte boundary (the aligned movaps load from -12(%eax) relies on
 * it). %xmm1 is primed with that preceding aligned chunk, %edx is the
 * destination cursor (written with aligned stores) and %ecx the byte count
 * left. With USE_AS_MEMMOVE, %xmm0 is first stored unaligned to the address
 * read from DEST+4(%esp).
 * NOTE(review): %xmm0 and the pushed %edi are set up before this chunk, and
 * POP / CFI_PUSH / BRANCH_TO_JMPTBL_ENTRY are macros defined elsewhere; the
 * CFI_PUSH after each table branch presumably restores the unwind state for
 * code that is reached by jump -- confirm.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001569L(shl_12):
Jack Renc47703a2012-02-14 12:01:52 +04001570#ifndef USE_AS_MEMMOVE
1571 movaps -12(%eax), %xmm1
1572#else
1573 movl DEST+4(%esp), %edi
1574 movaps -12(%eax), %xmm1
1575 movdqu %xmm0, (%edi)
1576#endif
/* Counts below half the data-cache size (unsigned compare) take the loop without prefetch. */
1577#ifdef DATA_CACHE_SIZE_HALF
1578 cmp $DATA_CACHE_SIZE_HALF, %ecx
1579#else
1580# if (defined SHARED || defined __PIC__)
1581 SETUP_PIC_REG(bx)
1582 add $_GLOBAL_OFFSET_TABLE_, %ebx
1583 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1584# else
1585 cmp __x86_data_cache_size_half, %ecx
1586# endif
1587#endif
1588 jb L(sh_12_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001589
/* Bias %ecx by -64 so `sub $64` + `ja` loops only while more than 64 bytes remain after the current block. */
Jack Renc47703a2012-02-14 12:01:52 +04001590 lea -64(%ecx), %ecx
1591
1592 .p2align 4
/*
 * 64 bytes per iteration: prefetch 0x1c0 bytes ahead of both cursors, load
 * four aligned chunks, and let palignr $12 splice each chunk with its
 * predecessor (%xmm1 carries the previous one) into the 16 bytes stored to
 * the destination. %xmm7 keeps the unshifted last chunk for the next pass.
 */
1593L(Shl12LoopStart):
1594 prefetcht0 0x1c0(%eax)
1595 prefetcht0 0x1c0(%edx)
1596 movaps 4(%eax), %xmm2
1597 movaps 20(%eax), %xmm3
1598 movaps 36(%eax), %xmm4
1599 movaps 52(%eax), %xmm5
1600 movaps %xmm5, %xmm7
1601 palignr $12, %xmm4, %xmm5
1602 palignr $12, %xmm3, %xmm4
1603 movaps %xmm5, 48(%edx)
1604 palignr $12, %xmm2, %xmm3
1605 lea 64(%eax), %eax
1606 palignr $12, %xmm1, %xmm2
1607 movaps %xmm4, 32(%edx)
1608 movaps %xmm3, 16(%edx)
1609 movaps %xmm7, %xmm1
1610 movaps %xmm2, (%edx)
1611 lea 64(%edx), %edx
1612 sub $64, %ecx
1613 ja L(Shl12LoopStart)
1614
/*
 * %ecx + 32 is (bytes left - 32). If that is <= 0, finish in L(shl_end_0).
 * Otherwise copy one more 32-byte block, move both cursors to the end of
 * the data and dispatch on the tail count left in %ecx.
 */
1615L(Shl12LoopLeave):
1616 add $32, %ecx
1617 jle L(shl_end_0)
1618
1619 movaps 4(%eax), %xmm2
1620 movaps 20(%eax), %xmm3
1621 palignr $12, %xmm2, %xmm3
1622 palignr $12, %xmm1, %xmm2
1623
1624 movaps %xmm2, (%edx)
1625 movaps %xmm3, 16(%edx)
1626 lea 32(%edx, %ecx), %edx
1627 lea 32(%eax, %ecx), %eax
1628 POP (%edi)
1629 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1630
1631 CFI_PUSH (%edi)
1632
1633 .p2align 4
/*
 * No-prefetch variant: bias %ecx by -32, rewind %eax by 12 so the movdqa
 * loads are aligned, and use %edi (starting at 0) as a running byte offset
 * applied to both cursors.
 */
1634L(sh_12_no_prefetch):
1635 lea -32(%ecx), %ecx
1636 lea -12(%eax), %eax
1637 xor %edi, %edi
1638
1639 .p2align 4
/*
 * Unrolled twice, 32 bytes per half. The carried chunk alternates between
 * %xmm1 (consumed in the first half) and %xmm4 (consumed in the second).
 * Each branch tests the carry produced by the `sub $32` at the top of its
 * half; the lea/movdqa/palignr instructions in between leave flags alone.
 */
1640L(sh_12_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001641 movdqa 16(%eax, %edi), %xmm2
1642 sub $32, %ecx
1643 movdqa 32(%eax, %edi), %xmm3
1644 movdqa %xmm3, %xmm4
1645 palignr $12, %xmm2, %xmm3
1646 palignr $12, %xmm1, %xmm2
1647 lea 32(%edi), %edi
1648 movdqa %xmm2, -32(%edx, %edi)
1649 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001650 jb L(sh_12_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001651
1652 movdqa 16(%eax, %edi), %xmm2
1653 sub $32, %ecx
1654 movdqa 32(%eax, %edi), %xmm3
1655 movdqa %xmm3, %xmm1
1656 palignr $12, %xmm2, %xmm3
1657 palignr $12, %xmm4, %xmm2
1658 lea 32(%edi), %edi
1659 movdqa %xmm2, -32(%edx, %edi)
1660 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001661 jae L(sh_12_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001662
/*
 * Undo the -32 bias so %ecx is the tail count, fold the running offset and
 * the tail into both cursors (re-adding the 12 bytes to %eax), restore %edi
 * and dispatch on the tail count.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001663L(sh_12_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001664 lea 32(%ecx), %ecx
1665 add %ecx, %edi
1666 add %edi, %edx
1667 lea 12(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001668 POP (%edi)
1669 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001670
Jack Renc47703a2012-02-14 12:01:52 +04001671 CFI_PUSH (%edi)
1672
1673 .p2align 4
/*
 * L(shl_13): forward copy path for a source cursor %eax that is 13 bytes
 * past a 16-byte boundary (the aligned movaps load from -13(%eax) relies on
 * it). %xmm1 is primed with that preceding aligned chunk, %edx is the
 * destination cursor (written with aligned stores) and %ecx the byte count
 * left. With USE_AS_MEMMOVE, %xmm0 is first stored unaligned to the address
 * read from DEST+4(%esp).
 * NOTE(review): %xmm0 and the pushed %edi are set up before this chunk, and
 * POP / CFI_PUSH / BRANCH_TO_JMPTBL_ENTRY are macros defined elsewhere; the
 * CFI_PUSH after each table branch presumably restores the unwind state for
 * code that is reached by jump -- confirm.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001674L(shl_13):
Jack Renc47703a2012-02-14 12:01:52 +04001675#ifndef USE_AS_MEMMOVE
1676 movaps -13(%eax), %xmm1
1677#else
1678 movl DEST+4(%esp), %edi
1679 movaps -13(%eax), %xmm1
1680 movdqu %xmm0, (%edi)
1681#endif
/* Counts below half the data-cache size (unsigned compare) take the loop without prefetch. */
1682#ifdef DATA_CACHE_SIZE_HALF
1683 cmp $DATA_CACHE_SIZE_HALF, %ecx
1684#else
1685# if (defined SHARED || defined __PIC__)
1686 SETUP_PIC_REG(bx)
1687 add $_GLOBAL_OFFSET_TABLE_, %ebx
1688 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1689# else
1690 cmp __x86_data_cache_size_half, %ecx
1691# endif
1692#endif
1693 jb L(sh_13_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001694
/* Bias %ecx by -64 so `sub $64` + `ja` loops only while more than 64 bytes remain after the current block. */
Jack Renc47703a2012-02-14 12:01:52 +04001695 lea -64(%ecx), %ecx
1696
1697 .p2align 4
/*
 * 64 bytes per iteration: prefetch 0x1c0 bytes ahead of both cursors, load
 * four aligned chunks, and let palignr $13 splice each chunk with its
 * predecessor (%xmm1 carries the previous one) into the 16 bytes stored to
 * the destination. %xmm7 keeps the unshifted last chunk for the next pass.
 */
1698L(Shl13LoopStart):
1699 prefetcht0 0x1c0(%eax)
1700 prefetcht0 0x1c0(%edx)
1701 movaps 3(%eax), %xmm2
1702 movaps 19(%eax), %xmm3
1703 movaps 35(%eax), %xmm4
1704 movaps 51(%eax), %xmm5
1705 movaps %xmm5, %xmm7
1706 palignr $13, %xmm4, %xmm5
1707 palignr $13, %xmm3, %xmm4
1708 movaps %xmm5, 48(%edx)
1709 palignr $13, %xmm2, %xmm3
1710 lea 64(%eax), %eax
1711 palignr $13, %xmm1, %xmm2
1712 movaps %xmm4, 32(%edx)
1713 movaps %xmm3, 16(%edx)
1714 movaps %xmm7, %xmm1
1715 movaps %xmm2, (%edx)
1716 lea 64(%edx), %edx
1717 sub $64, %ecx
1718 ja L(Shl13LoopStart)
1719
/*
 * %ecx + 32 is (bytes left - 32). If that is <= 0, finish in L(shl_end_0).
 * Otherwise copy one more 32-byte block, move both cursors to the end of
 * the data and dispatch on the tail count left in %ecx.
 */
1720L(Shl13LoopLeave):
1721 add $32, %ecx
1722 jle L(shl_end_0)
1723
1724 movaps 3(%eax), %xmm2
1725 movaps 19(%eax), %xmm3
1726 palignr $13, %xmm2, %xmm3
1727 palignr $13, %xmm1, %xmm2
1728
1729 movaps %xmm2, (%edx)
1730 movaps %xmm3, 16(%edx)
1731 lea 32(%edx, %ecx), %edx
1732 lea 32(%eax, %ecx), %eax
1733 POP (%edi)
1734 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1735
1736 CFI_PUSH (%edi)
1737
1738 .p2align 4
/*
 * No-prefetch variant: bias %ecx by -32, rewind %eax by 13 so the movdqa
 * loads are aligned, and use %edi (starting at 0) as a running byte offset
 * applied to both cursors.
 */
1739L(sh_13_no_prefetch):
1740 lea -32(%ecx), %ecx
1741 lea -13(%eax), %eax
1742 xor %edi, %edi
1743
1744 .p2align 4
/*
 * Unrolled twice, 32 bytes per half. The carried chunk alternates between
 * %xmm1 (consumed in the first half) and %xmm4 (consumed in the second).
 * Each branch tests the carry produced by the `sub $32` at the top of its
 * half; the lea/movdqa/palignr instructions in between leave flags alone.
 */
1745L(sh_13_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001746 movdqa 16(%eax, %edi), %xmm2
1747 sub $32, %ecx
1748 movdqa 32(%eax, %edi), %xmm3
1749 movdqa %xmm3, %xmm4
1750 palignr $13, %xmm2, %xmm3
1751 palignr $13, %xmm1, %xmm2
1752 lea 32(%edi), %edi
1753 movdqa %xmm2, -32(%edx, %edi)
1754 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001755 jb L(sh_13_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001756
1757 movdqa 16(%eax, %edi), %xmm2
1758 sub $32, %ecx
1759 movdqa 32(%eax, %edi), %xmm3
1760 movdqa %xmm3, %xmm1
1761 palignr $13, %xmm2, %xmm3
1762 palignr $13, %xmm4, %xmm2
1763 lea 32(%edi), %edi
1764 movdqa %xmm2, -32(%edx, %edi)
1765 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001766 jae L(sh_13_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001767
/*
 * Undo the -32 bias so %ecx is the tail count, fold the running offset and
 * the tail into both cursors (re-adding the 13 bytes to %eax), restore %edi
 * and dispatch on the tail count.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001768L(sh_13_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001769 lea 32(%ecx), %ecx
1770 add %ecx, %edi
1771 add %edi, %edx
1772 lea 13(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001773 POP (%edi)
1774 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001775
Jack Renc47703a2012-02-14 12:01:52 +04001776 CFI_PUSH (%edi)
1777
1778 .p2align 4
/*
 * L(shl_14): forward copy path for a source cursor %eax that is 14 bytes
 * past a 16-byte boundary (the aligned movaps load from -14(%eax) relies on
 * it). %xmm1 is primed with that preceding aligned chunk, %edx is the
 * destination cursor (written with aligned stores) and %ecx the byte count
 * left. With USE_AS_MEMMOVE, %xmm0 is first stored unaligned to the address
 * read from DEST+4(%esp).
 * NOTE(review): %xmm0 and the pushed %edi are set up before this chunk, and
 * POP / CFI_PUSH / BRANCH_TO_JMPTBL_ENTRY are macros defined elsewhere; the
 * CFI_PUSH after each table branch presumably restores the unwind state for
 * code that is reached by jump -- confirm.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001779L(shl_14):
Jack Renc47703a2012-02-14 12:01:52 +04001780#ifndef USE_AS_MEMMOVE
1781 movaps -14(%eax), %xmm1
1782#else
1783 movl DEST+4(%esp), %edi
1784 movaps -14(%eax), %xmm1
1785 movdqu %xmm0, (%edi)
1786#endif
/* Counts below half the data-cache size (unsigned compare) take the loop without prefetch. */
1787#ifdef DATA_CACHE_SIZE_HALF
1788 cmp $DATA_CACHE_SIZE_HALF, %ecx
1789#else
1790# if (defined SHARED || defined __PIC__)
1791 SETUP_PIC_REG(bx)
1792 add $_GLOBAL_OFFSET_TABLE_, %ebx
1793 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1794# else
1795 cmp __x86_data_cache_size_half, %ecx
1796# endif
1797#endif
1798 jb L(sh_14_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001799
/* Bias %ecx by -64 so `sub $64` + `ja` loops only while more than 64 bytes remain after the current block. */
Jack Renc47703a2012-02-14 12:01:52 +04001800 lea -64(%ecx), %ecx
1801
1802 .p2align 4
/*
 * 64 bytes per iteration: prefetch 0x1c0 bytes ahead of both cursors, load
 * four aligned chunks, and let palignr $14 splice each chunk with its
 * predecessor (%xmm1 carries the previous one) into the 16 bytes stored to
 * the destination. %xmm7 keeps the unshifted last chunk for the next pass.
 */
1803L(Shl14LoopStart):
1804 prefetcht0 0x1c0(%eax)
1805 prefetcht0 0x1c0(%edx)
1806 movaps 2(%eax), %xmm2
1807 movaps 18(%eax), %xmm3
1808 movaps 34(%eax), %xmm4
1809 movaps 50(%eax), %xmm5
1810 movaps %xmm5, %xmm7
1811 palignr $14, %xmm4, %xmm5
1812 palignr $14, %xmm3, %xmm4
1813 movaps %xmm5, 48(%edx)
1814 palignr $14, %xmm2, %xmm3
1815 lea 64(%eax), %eax
1816 palignr $14, %xmm1, %xmm2
1817 movaps %xmm4, 32(%edx)
1818 movaps %xmm3, 16(%edx)
1819 movaps %xmm7, %xmm1
1820 movaps %xmm2, (%edx)
1821 lea 64(%edx), %edx
1822 sub $64, %ecx
1823 ja L(Shl14LoopStart)
1824
/*
 * %ecx + 32 is (bytes left - 32). If that is <= 0, finish in L(shl_end_0).
 * Otherwise copy one more 32-byte block, move both cursors to the end of
 * the data and dispatch on the tail count left in %ecx.
 */
1825L(Shl14LoopLeave):
1826 add $32, %ecx
1827 jle L(shl_end_0)
1828
1829 movaps 2(%eax), %xmm2
1830 movaps 18(%eax), %xmm3
1831 palignr $14, %xmm2, %xmm3
1832 palignr $14, %xmm1, %xmm2
1833
1834 movaps %xmm2, (%edx)
1835 movaps %xmm3, 16(%edx)
1836 lea 32(%edx, %ecx), %edx
1837 lea 32(%eax, %ecx), %eax
1838 POP (%edi)
1839 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1840
1841 CFI_PUSH (%edi)
1842
1843 .p2align 4
/*
 * No-prefetch variant: bias %ecx by -32, rewind %eax by 14 so the movdqa
 * loads are aligned, and use %edi (starting at 0) as a running byte offset
 * applied to both cursors.
 */
1844L(sh_14_no_prefetch):
1845 lea -32(%ecx), %ecx
1846 lea -14(%eax), %eax
1847 xor %edi, %edi
1848
1849 .p2align 4
/*
 * Unrolled twice, 32 bytes per half. The carried chunk alternates between
 * %xmm1 (consumed in the first half) and %xmm4 (consumed in the second).
 * Each branch tests the carry produced by the `sub $32` at the top of its
 * half; the lea/movdqa/palignr instructions in between leave flags alone.
 */
1850L(sh_14_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001851 movdqa 16(%eax, %edi), %xmm2
1852 sub $32, %ecx
1853 movdqa 32(%eax, %edi), %xmm3
1854 movdqa %xmm3, %xmm4
1855 palignr $14, %xmm2, %xmm3
1856 palignr $14, %xmm1, %xmm2
1857 lea 32(%edi), %edi
1858 movdqa %xmm2, -32(%edx, %edi)
1859 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001860 jb L(sh_14_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001861
1862 movdqa 16(%eax, %edi), %xmm2
1863 sub $32, %ecx
1864 movdqa 32(%eax, %edi), %xmm3
1865 movdqa %xmm3, %xmm1
1866 palignr $14, %xmm2, %xmm3
1867 palignr $14, %xmm4, %xmm2
1868 lea 32(%edi), %edi
1869 movdqa %xmm2, -32(%edx, %edi)
1870 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001871 jae L(sh_14_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001872
/*
 * Undo the -32 bias so %ecx is the tail count, fold the running offset and
 * the tail into both cursors (re-adding the 14 bytes to %eax), restore %edi
 * and dispatch on the tail count.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001873L(sh_14_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001874 lea 32(%ecx), %ecx
1875 add %ecx, %edi
1876 add %edi, %edx
1877 lea 14(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001878 POP (%edi)
1879 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001880
Jack Renc47703a2012-02-14 12:01:52 +04001881 CFI_PUSH (%edi)
1882
1883 .p2align 4
/*
 * L(shl_15): forward copy path for a source cursor %eax that is 15 bytes
 * past a 16-byte boundary (the aligned movaps load from -15(%eax) relies on
 * it). %xmm1 is primed with that preceding aligned chunk, %edx is the
 * destination cursor (written with aligned stores) and %ecx the byte count
 * left. With USE_AS_MEMMOVE, %xmm0 is first stored unaligned to the address
 * read from DEST+4(%esp).
 * NOTE(review): %xmm0 and the pushed %edi are set up before this chunk, and
 * POP / CFI_PUSH / BRANCH_TO_JMPTBL_ENTRY are macros defined elsewhere; the
 * CFI_PUSH after each table branch presumably restores the unwind state for
 * code that is reached by jump -- confirm.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001884L(shl_15):
Jack Renc47703a2012-02-14 12:01:52 +04001885#ifndef USE_AS_MEMMOVE
1886 movaps -15(%eax), %xmm1
1887#else
1888 movl DEST+4(%esp), %edi
1889 movaps -15(%eax), %xmm1
1890 movdqu %xmm0, (%edi)
1891#endif
/* Counts below half the data-cache size (unsigned compare) take the loop without prefetch. */
1892#ifdef DATA_CACHE_SIZE_HALF
1893 cmp $DATA_CACHE_SIZE_HALF, %ecx
1894#else
1895# if (defined SHARED || defined __PIC__)
1896 SETUP_PIC_REG(bx)
1897 add $_GLOBAL_OFFSET_TABLE_, %ebx
1898 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1899# else
1900 cmp __x86_data_cache_size_half, %ecx
1901# endif
1902#endif
1903 jb L(sh_15_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001904
/* Bias %ecx by -64 so `sub $64` + `ja` loops only while more than 64 bytes remain after the current block. */
Jack Renc47703a2012-02-14 12:01:52 +04001905 lea -64(%ecx), %ecx
1906
1907 .p2align 4
/*
 * 64 bytes per iteration: prefetch 0x1c0 bytes ahead of both cursors, load
 * four aligned chunks, and let palignr $15 splice each chunk with its
 * predecessor (%xmm1 carries the previous one) into the 16 bytes stored to
 * the destination. %xmm7 keeps the unshifted last chunk for the next pass.
 */
1908L(Shl15LoopStart):
1909 prefetcht0 0x1c0(%eax)
1910 prefetcht0 0x1c0(%edx)
1911 movaps 1(%eax), %xmm2
1912 movaps 17(%eax), %xmm3
1913 movaps 33(%eax), %xmm4
1914 movaps 49(%eax), %xmm5
1915 movaps %xmm5, %xmm7
1916 palignr $15, %xmm4, %xmm5
1917 palignr $15, %xmm3, %xmm4
1918 movaps %xmm5, 48(%edx)
1919 palignr $15, %xmm2, %xmm3
1920 lea 64(%eax), %eax
1921 palignr $15, %xmm1, %xmm2
1922 movaps %xmm4, 32(%edx)
1923 movaps %xmm3, 16(%edx)
1924 movaps %xmm7, %xmm1
1925 movaps %xmm2, (%edx)
1926 lea 64(%edx), %edx
1927 sub $64, %ecx
1928 ja L(Shl15LoopStart)
1929
/*
 * %ecx + 32 is (bytes left - 32). If that is <= 0, finish in L(shl_end_0).
 * Otherwise copy one more 32-byte block, move both cursors to the end of
 * the data and dispatch on the tail count left in %ecx.
 */
1930L(Shl15LoopLeave):
1931 add $32, %ecx
1932 jle L(shl_end_0)
1933
1934 movaps 1(%eax), %xmm2
1935 movaps 17(%eax), %xmm3
1936 palignr $15, %xmm2, %xmm3
1937 palignr $15, %xmm1, %xmm2
1938
1939 movaps %xmm2, (%edx)
1940 movaps %xmm3, 16(%edx)
1941 lea 32(%edx, %ecx), %edx
1942 lea 32(%eax, %ecx), %eax
1943 POP (%edi)
1944 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1945
1946 CFI_PUSH (%edi)
1947
1948 .p2align 4
/*
 * No-prefetch variant: bias %ecx by -32, rewind %eax by 15 so the movdqa
 * loads are aligned, and use %edi (starting at 0) as a running byte offset
 * applied to both cursors.
 */
1949L(sh_15_no_prefetch):
1950 lea -32(%ecx), %ecx
1951 lea -15(%eax), %eax
1952 xor %edi, %edi
1953
1954 .p2align 4
/*
 * Unrolled twice, 32 bytes per half. The carried chunk alternates between
 * %xmm1 (consumed in the first half) and %xmm4 (consumed in the second).
 * Each branch tests the carry produced by the `sub $32` at the top of its
 * half; the lea/movdqa/palignr instructions in between leave flags alone.
 */
1955L(sh_15_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001956 movdqa 16(%eax, %edi), %xmm2
1957 sub $32, %ecx
1958 movdqa 32(%eax, %edi), %xmm3
1959 movdqa %xmm3, %xmm4
1960 palignr $15, %xmm2, %xmm3
1961 palignr $15, %xmm1, %xmm2
1962 lea 32(%edi), %edi
1963 movdqa %xmm2, -32(%edx, %edi)
1964 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001965 jb L(sh_15_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001966
1967 movdqa 16(%eax, %edi), %xmm2
1968 sub $32, %ecx
1969 movdqa 32(%eax, %edi), %xmm3
1970 movdqa %xmm3, %xmm1
1971 palignr $15, %xmm2, %xmm3
1972 palignr $15, %xmm4, %xmm2
1973 lea 32(%edi), %edi
1974 movdqa %xmm2, -32(%edx, %edi)
1975 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001976 jae L(sh_15_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001977
/*
 * Undo the -32 bias so %ecx is the tail count, fold the running offset and
 * the tail into both cursors (re-adding the 15 bytes to %eax), restore %edi
 * and dispatch on the tail count.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001978L(sh_15_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001979 lea 32(%ecx), %ecx
1980 add %ecx, %edi
1981 add %edi, %edx
1982 lea 15(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001983 POP (%edi)
1984 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001985
Jack Renc47703a2012-02-14 12:01:52 +04001986 CFI_PUSH (%edi)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001987
Jack Renc47703a2012-02-14 12:01:52 +04001988 .p2align 4
/*
 * Shared exit of the prefetching shift loops, taken (via jle) when no
 * further 32-byte block is copied: add 32 back to %ecx, advance both cursors
 * by the resulting count, restore %edi and dispatch on that count.
 * NOTE(review): POP and BRANCH_TO_JMPTBL_ENTRY are macros defined outside
 * this chunk -- confirm their exact effect there.
 */
1989L(shl_end_0):
1990 lea 32(%ecx), %ecx
1991 lea (%edx, %ecx), %edx
1992 lea (%eax, %ecx), %eax
1993 POP (%edi)
1994 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1995
1996 .p2align 4
/*
 * Tail handlers for 44/36/28/20/12/4 remaining bytes. All offsets are
 * negative, i.e. %eax and %edx are used as end-of-data pointers for source
 * and destination. The labels fall through: each one copies 8 bytes via
 * %xmm0 and the last one copies the final 4 bytes via %ecx.
 * Return value: %edx for USE_AS_MEMPCPY, otherwise the value reloaded from
 * DEST(%esp).
 * NOTE(review): presumably entered through L(table_48bytes_fwd), which is
 * not part of this chunk -- confirm.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001997L(fwd_write_44bytes):
Jack Renc47703a2012-02-14 12:01:52 +04001998 movq -44(%eax), %xmm0
1999 movq %xmm0, -44(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002000L(fwd_write_36bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002001 movq -36(%eax), %xmm0
2002 movq %xmm0, -36(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002003L(fwd_write_28bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002004 movq -28(%eax), %xmm0
2005 movq %xmm0, -28(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002006L(fwd_write_20bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002007 movq -20(%eax), %xmm0
2008 movq %xmm0, -20(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002009L(fwd_write_12bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002010 movq -12(%eax), %xmm0
2011 movq %xmm0, -12(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002012L(fwd_write_4bytes):
2013 movl -4(%eax), %ecx
2014 movl %ecx, -4(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002015#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002016 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002017#else
Jack Renc47703a2012-02-14 12:01:52 +04002018 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002019#endif
2020 RETURN
2021
2022 .p2align 4
/*
 * Tail handlers for 40/32/24/16/8/0 remaining bytes. %eax and %edx are used
 * as end-of-data pointers (all offsets are negative); the labels fall
 * through, each copying 8 bytes via %xmm0, down to L(fwd_write_0bytes),
 * which copies nothing and only produces the return value: %edx for
 * USE_AS_MEMPCPY, otherwise the value reloaded from DEST(%esp).
 */
2023L(fwd_write_40bytes):
2024 movq -40(%eax), %xmm0
2025 movq %xmm0, -40(%edx)
2026L(fwd_write_32bytes):
2027 movq -32(%eax), %xmm0
2028 movq %xmm0, -32(%edx)
2029L(fwd_write_24bytes):
2030 movq -24(%eax), %xmm0
2031 movq %xmm0, -24(%edx)
2032L(fwd_write_16bytes):
2033 movq -16(%eax), %xmm0
2034 movq %xmm0, -16(%edx)
2035L(fwd_write_8bytes):
2036 movq -8(%eax), %xmm0
2037 movq %xmm0, -8(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002038L(fwd_write_0bytes):
Elliott Hughesbed110a2016-03-03 10:41:42 -08002039#ifdef USE_AS_MEMPCPY
Bruce Beare8ff1a272010-03-04 11:03:37 -08002040 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002041#else
Bruce Beare8ff1a272010-03-04 11:03:37 -08002042 movl DEST(%esp), %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08002043#endif
2044 RETURN
2045
Jack Renc47703a2012-02-14 12:01:52 +04002046 .p2align 4
/*
 * Tail handler for 5 remaining bytes: two overlapping 4-byte moves cover
 * offsets -5..-2 and -4..-1 from the end pointers. Both loads are done
 * before either store; the second load reuses %eax as a data register, which
 * is fine because %eax is reloaded with the return value right after
 * (%edx for USE_AS_MEMPCPY, otherwise DEST(%esp)).
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08002047L(fwd_write_5bytes):
2048 movl -5(%eax), %ecx
2049 movl -4(%eax), %eax
2050 movl %ecx, -5(%edx)
2051 movl %eax, -4(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002052#ifdef USE_AS_MEMPCPY
Bruce Beare8ff1a272010-03-04 11:03:37 -08002053 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002054#else
Bruce Beare8ff1a272010-03-04 11:03:37 -08002055 movl DEST(%esp), %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08002056#endif
2057 RETURN
2058
/* Forward tail copy for 45, 37, 29, 21 or 13 bytes.
   Each label copies the 8 bytes located N bytes below %eax to N bytes
   below %edx through %xmm0 and falls through to the next smaller size.
   The 13-byte entry finishes with an 8-byte, a 4-byte (offset -5) and a
   1-byte (offset -1) move.
   Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the value
   loaded from DEST(%esp).  */
Jack Renc47703a2012-02-14 12:01:52 +04002059 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002060L(fwd_write_45bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002061 movq -45(%eax), %xmm0
2062 movq %xmm0, -45(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002063L(fwd_write_37bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002064 movq -37(%eax), %xmm0
2065 movq %xmm0, -37(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002066L(fwd_write_29bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002067 movq -29(%eax), %xmm0
2068 movq %xmm0, -29(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002069L(fwd_write_21bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002070 movq -21(%eax), %xmm0
2071 movq %xmm0, -21(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002072L(fwd_write_13bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002073 movq -13(%eax), %xmm0
2074 movq %xmm0, -13(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002075 movl -5(%eax), %ecx
2076 movl %ecx, -5(%edx)
Jack Renc47703a2012-02-14 12:01:52 +04002077 movzbl -1(%eax), %ecx
2078 movb %cl, -1(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002079#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002080 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002081#else
Jack Renc47703a2012-02-14 12:01:52 +04002082 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002083#endif
2084 RETURN
2085
/* Forward tail copy for 41, 33, 25, 17, 9 or 1 bytes.
   Each label copies the 8 bytes located N bytes below %eax to N bytes
   below %edx through %xmm0 and falls through; the 1-byte entry copies the
   final byte at offset -1.
   Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the value
   loaded from DEST(%esp).  */
2086 .p2align 4
2087L(fwd_write_41bytes):
2088 movq -41(%eax), %xmm0
2089 movq %xmm0, -41(%edx)
2090L(fwd_write_33bytes):
2091 movq -33(%eax), %xmm0
2092 movq %xmm0, -33(%edx)
2093L(fwd_write_25bytes):
2094 movq -25(%eax), %xmm0
2095 movq %xmm0, -25(%edx)
2096L(fwd_write_17bytes):
2097 movq -17(%eax), %xmm0
2098 movq %xmm0, -17(%edx)
2099L(fwd_write_9bytes):
2100 movq -9(%eax), %xmm0
2101 movq %xmm0, -9(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002102L(fwd_write_1bytes):
2103 movzbl -1(%eax), %ecx
2104 movb %cl, -1(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002105#ifdef USE_AS_MEMPCPY
Bruce Beare8ff1a272010-03-04 11:03:37 -08002106 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002107#else
Bruce Beare8ff1a272010-03-04 11:03:37 -08002108 movl DEST(%esp), %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08002109#endif
2110 RETURN
2111
/* Forward tail copy for 46, 38, 30, 22, 14 or 6 bytes.
   Each label copies the 8 bytes located N bytes below %eax to N bytes
   below %edx through %xmm0 and falls through; the 6-byte entry finishes
   with a 4-byte move at offset -6 and a 2-byte move at offset -2.
   Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the value
   loaded from DEST(%esp).  */
Jack Renc47703a2012-02-14 12:01:52 +04002112 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002113L(fwd_write_46bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002114 movq -46(%eax), %xmm0
2115 movq %xmm0, -46(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002116L(fwd_write_38bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002117 movq -38(%eax), %xmm0
2118 movq %xmm0, -38(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002119L(fwd_write_30bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002120 movq -30(%eax), %xmm0
2121 movq %xmm0, -30(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002122L(fwd_write_22bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002123 movq -22(%eax), %xmm0
2124 movq %xmm0, -22(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002125L(fwd_write_14bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002126 movq -14(%eax), %xmm0
2127 movq %xmm0, -14(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002128L(fwd_write_6bytes):
2129 movl -6(%eax), %ecx
2130 movl %ecx, -6(%edx)
Jack Renc47703a2012-02-14 12:01:52 +04002131 movzwl -2(%eax), %ecx
2132 movw %cx, -2(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002133#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002134 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002135#else
Jack Renc47703a2012-02-14 12:01:52 +04002136 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002137#endif
2138 RETURN
2139
/* Forward tail copy for 42, 34, 26, 18, 10 or 2 bytes.
   Each label copies the 8 bytes located N bytes below %eax to N bytes
   below %edx through %xmm0 and falls through; the 2-byte entry copies the
   final 16-bit word at offset -2.
   Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the value
   loaded from DEST(%esp).  */
2140 .p2align 4
2141L(fwd_write_42bytes):
2142 movq -42(%eax), %xmm0
2143 movq %xmm0, -42(%edx)
2144L(fwd_write_34bytes):
2145 movq -34(%eax), %xmm0
2146 movq %xmm0, -34(%edx)
2147L(fwd_write_26bytes):
2148 movq -26(%eax), %xmm0
2149 movq %xmm0, -26(%edx)
2150L(fwd_write_18bytes):
2151 movq -18(%eax), %xmm0
2152 movq %xmm0, -18(%edx)
2153L(fwd_write_10bytes):
2154 movq -10(%eax), %xmm0
2155 movq %xmm0, -10(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002156L(fwd_write_2bytes):
2157 movzwl -2(%eax), %ecx
2158 movw %cx, -2(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002159#ifdef USE_AS_MEMPCPY
Bruce Beare8ff1a272010-03-04 11:03:37 -08002160 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002161#else
Bruce Beare8ff1a272010-03-04 11:03:37 -08002162 movl DEST(%esp), %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08002163#endif
2164 RETURN
2165
/* Forward tail copy for 47, 39, 31, 23, 15 or 7 bytes.
   Each label copies the 8 bytes located N bytes below %eax to N bytes
   below %edx through %xmm0 and falls through; the 7-byte entry finishes
   with a 4-byte (offset -7), a 2-byte (offset -3) and a 1-byte (offset -1)
   move.  %eax is reused as scratch for the last byte, after every other
   load from the source has been issued.
   Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the value
   loaded from DEST(%esp).  */
Jack Renc47703a2012-02-14 12:01:52 +04002166 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002167L(fwd_write_47bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002168 movq -47(%eax), %xmm0
2169 movq %xmm0, -47(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002170L(fwd_write_39bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002171 movq -39(%eax), %xmm0
2172 movq %xmm0, -39(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002173L(fwd_write_31bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002174 movq -31(%eax), %xmm0
2175 movq %xmm0, -31(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002176L(fwd_write_23bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002177 movq -23(%eax), %xmm0
2178 movq %xmm0, -23(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002179L(fwd_write_15bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002180 movq -15(%eax), %xmm0
2181 movq %xmm0, -15(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002182L(fwd_write_7bytes):
2183 movl -7(%eax), %ecx
2184 movl %ecx, -7(%edx)
Jack Renc47703a2012-02-14 12:01:52 +04002185 movzwl -3(%eax), %ecx
2186 movzbl -1(%eax), %eax
2187 movw %cx, -3(%edx)
2188 movb %al, -1(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002189#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002190 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002191#else
Jack Renc47703a2012-02-14 12:01:52 +04002192 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002193#endif
2194 RETURN
2195
/* Forward tail copy for 43, 35, 27, 19, 11 or 3 bytes.
   Each label copies the 8 bytes located N bytes below %eax to N bytes
   below %edx through %xmm0 and falls through; the 3-byte entry finishes
   with a 2-byte move at offset -3 and a 1-byte move at offset -1.  %eax is
   reused as scratch for the last byte, after the other source load.
   Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the value
   loaded from DEST(%esp).  */
2196 .p2align 4
2197L(fwd_write_43bytes):
2198 movq -43(%eax), %xmm0
2199 movq %xmm0, -43(%edx)
2200L(fwd_write_35bytes):
2201 movq -35(%eax), %xmm0
2202 movq %xmm0, -35(%edx)
2203L(fwd_write_27bytes):
2204 movq -27(%eax), %xmm0
2205 movq %xmm0, -27(%edx)
2206L(fwd_write_19bytes):
2207 movq -19(%eax), %xmm0
2208 movq %xmm0, -19(%edx)
2209L(fwd_write_11bytes):
2210 movq -11(%eax), %xmm0
2211 movq %xmm0, -11(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002212L(fwd_write_3bytes):
2213 movzwl -3(%eax), %ecx
2214 movzbl -1(%eax), %eax
2215 movw %cx, -3(%edx)
2216 movb %al, -1(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002217#ifdef USE_AS_MEMPCPY
Bruce Beare8ff1a272010-03-04 11:03:37 -08002218 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002219#else
Bruce Beare8ff1a272010-03-04 11:03:37 -08002220 movl DEST(%esp), %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08002221#endif
Jack Renc47703a2012-02-14 12:01:52 +04002222 RETURN
2223
/* Aligned forward tail copy for 40, 24, 8 or 0 bytes.
   The 40- and 24-byte entries each move 16 bytes with movdqa and fall
   through; the 8-byte entry moves the last 8 bytes with movq.
   movdqa faults on a memory operand that is not 16-byte aligned, so the
   16-byte steps assume (%eax - N) and (%edx - N) are 16-byte aligned --
   presumably ensured by the dispatching code; TODO confirm.
   Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the value
   loaded from DEST(%esp).  */
2224 .p2align 4
2225L(fwd_write_40bytes_align):
2226 movdqa -40(%eax), %xmm0
2227 movdqa %xmm0, -40(%edx)
2228L(fwd_write_24bytes_align):
2229 movdqa -24(%eax), %xmm0
2230 movdqa %xmm0, -24(%edx)
2231L(fwd_write_8bytes_align):
2232 movq -8(%eax), %xmm0
2233 movq %xmm0, -8(%edx)
2234L(fwd_write_0bytes_align):
Elliott Hughesbed110a2016-03-03 10:41:42 -08002235#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002236 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002237#else
Jack Renc47703a2012-02-14 12:01:52 +04002238 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002239#endif
2240 RETURN
2241
/* Aligned forward tail copy for 32 or 16 bytes: one 16-byte movdqa
   load/store pair per entry, the 32-byte entry falling through to the
   16-byte one.  movdqa faults on a memory operand that is not 16-byte
   aligned; alignment is presumably ensured by the dispatching code --
   TODO confirm.
   Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the value
   loaded from DEST(%esp).  */
2242 .p2align 4
2243L(fwd_write_32bytes_align):
2244 movdqa -32(%eax), %xmm0
2245 movdqa %xmm0, -32(%edx)
2246L(fwd_write_16bytes_align):
2247 movdqa -16(%eax), %xmm0
2248 movdqa %xmm0, -16(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002249#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002250 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002251#else
Jack Renc47703a2012-02-14 12:01:52 +04002252 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002253#endif
2254 RETURN
2255
/* Forward tail copy for exactly 5 bytes (entry used by the "_align"
   label family).  Two overlapping 4-byte moves at offsets -5 and -4 cover
   the 5 bytes; both loads precede both stores and %eax is reused as
   scratch for the second word.
   Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the value
   loaded from DEST(%esp).  */
2256 .p2align 4
2257L(fwd_write_5bytes_align):
2258 movl -5(%eax), %ecx
2259 movl -4(%eax), %eax
2260 movl %ecx, -5(%edx)
2261 movl %eax, -4(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002262#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002263 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002264#else
Jack Renc47703a2012-02-14 12:01:52 +04002265 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002266#endif
2267 RETURN
2268
/* Aligned forward tail copy for 45, 29 or 13 bytes.
   The 45- and 29-byte entries each move 16 bytes with movdqa and fall
   through; the 13-byte entry finishes with an 8-byte (offset -13), a
   4-byte (offset -5) and a 1-byte (offset -1) move.
   movdqa faults on a memory operand that is not 16-byte aligned;
   alignment of (%eax - N) and (%edx - N) is presumably ensured by the
   dispatching code -- TODO confirm.
   Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the value
   loaded from DEST(%esp).  */
2269 .p2align 4
2270L(fwd_write_45bytes_align):
2271 movdqa -45(%eax), %xmm0
2272 movdqa %xmm0, -45(%edx)
2273L(fwd_write_29bytes_align):
2274 movdqa -29(%eax), %xmm0
2275 movdqa %xmm0, -29(%edx)
2276L(fwd_write_13bytes_align):
2277 movq -13(%eax), %xmm0
2278 movq %xmm0, -13(%edx)
2279 movl -5(%eax), %ecx
2280 movl %ecx, -5(%edx)
2281 movzbl -1(%eax), %ecx
2282 movb %cl, -1(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002283#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002284 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002285#else
Jack Renc47703a2012-02-14 12:01:52 +04002286 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002287#endif
2288 RETURN
2289
/* Aligned forward tail copy for 37 or 21 bytes.
   Each entry moves 16 bytes with movdqa (37 falls through to 21); the
   remaining 5 bytes are copied as a 4-byte move at offset -5 and a 1-byte
   move at offset -1.
   movdqa faults on a memory operand that is not 16-byte aligned;
   alignment is presumably ensured by the dispatching code -- TODO confirm.
   Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the value
   loaded from DEST(%esp).  */
2290 .p2align 4
2291L(fwd_write_37bytes_align):
2292 movdqa -37(%eax), %xmm0
2293 movdqa %xmm0, -37(%edx)
2294L(fwd_write_21bytes_align):
2295 movdqa -21(%eax), %xmm0
2296 movdqa %xmm0, -21(%edx)
2297 movl -5(%eax), %ecx
2298 movl %ecx, -5(%edx)
2299 movzbl -1(%eax), %ecx
2300 movb %cl, -1(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002301#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002302 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002303#else
Jack Renc47703a2012-02-14 12:01:52 +04002304 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002305#endif
2306 RETURN
2307
/* Aligned forward tail copy for 41, 25, 9 or 1 bytes.
   The 41- and 25-byte entries each move 16 bytes with movdqa and fall
   through; the 9-byte entry moves 8 bytes with movq; the 1-byte entry
   copies the final byte at offset -1.
   movdqa faults on a memory operand that is not 16-byte aligned;
   alignment is presumably ensured by the dispatching code -- TODO confirm.
   Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the value
   loaded from DEST(%esp).  */
2308 .p2align 4
2309L(fwd_write_41bytes_align):
2310 movdqa -41(%eax), %xmm0
2311 movdqa %xmm0, -41(%edx)
2312L(fwd_write_25bytes_align):
2313 movdqa -25(%eax), %xmm0
2314 movdqa %xmm0, -25(%edx)
2315L(fwd_write_9bytes_align):
2316 movq -9(%eax), %xmm0
2317 movq %xmm0, -9(%edx)
2318L(fwd_write_1bytes_align):
2319 movzbl -1(%eax), %ecx
2320 movb %cl, -1(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002321#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002322 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002323#else
Jack Renc47703a2012-02-14 12:01:52 +04002324 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002325#endif
2326 RETURN
2327
/* Aligned forward tail copy for 33 or 17 bytes.
   Each entry moves 16 bytes with movdqa (33 falls through to 17), then
   the final byte at offset -1 is copied.
   movdqa faults on a memory operand that is not 16-byte aligned;
   alignment is presumably ensured by the dispatching code -- TODO confirm.
   Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the value
   loaded from DEST(%esp).  */
2328 .p2align 4
2329L(fwd_write_33bytes_align):
2330 movdqa -33(%eax), %xmm0
2331 movdqa %xmm0, -33(%edx)
2332L(fwd_write_17bytes_align):
2333 movdqa -17(%eax), %xmm0
2334 movdqa %xmm0, -17(%edx)
2335 movzbl -1(%eax), %ecx
2336 movb %cl, -1(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002337#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002338 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002339#else
Jack Renc47703a2012-02-14 12:01:52 +04002340 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002341#endif
2342 RETURN
2343
/* Aligned forward tail copy for 46, 30, 14 or 6 bytes.
   The 46- and 30-byte entries each move 16 bytes with movdqa and fall
   through; the 14-byte entry moves 8 bytes with movq; the 6-byte entry
   finishes with a 4-byte move at offset -6 and a 2-byte move at offset -2.
   movdqa faults on a memory operand that is not 16-byte aligned;
   alignment is presumably ensured by the dispatching code -- TODO confirm.
   Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the value
   loaded from DEST(%esp).  */
2344 .p2align 4
2345L(fwd_write_46bytes_align):
2346 movdqa -46(%eax), %xmm0
2347 movdqa %xmm0, -46(%edx)
2348L(fwd_write_30bytes_align):
2349 movdqa -30(%eax), %xmm0
2350 movdqa %xmm0, -30(%edx)
2351L(fwd_write_14bytes_align):
2352 movq -14(%eax), %xmm0
2353 movq %xmm0, -14(%edx)
2354L(fwd_write_6bytes_align):
2355 movl -6(%eax), %ecx
2356 movl %ecx, -6(%edx)
2357 movzwl -2(%eax), %ecx
2358 movw %cx, -2(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002359#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002360 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002361#else
Jack Renc47703a2012-02-14 12:01:52 +04002362 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002363#endif
2364 RETURN
2365
/* Aligned forward tail copy for 38 or 22 bytes.
   Each entry moves 16 bytes with movdqa (38 falls through to 22); the
   remaining 6 bytes are copied as a 4-byte move at offset -6 and a 2-byte
   move at offset -2.
   movdqa faults on a memory operand that is not 16-byte aligned;
   alignment is presumably ensured by the dispatching code -- TODO confirm.
   Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the value
   loaded from DEST(%esp).  */
2366 .p2align 4
2367L(fwd_write_38bytes_align):
2368 movdqa -38(%eax), %xmm0
2369 movdqa %xmm0, -38(%edx)
2370L(fwd_write_22bytes_align):
2371 movdqa -22(%eax), %xmm0
2372 movdqa %xmm0, -22(%edx)
2373 movl -6(%eax), %ecx
2374 movl %ecx, -6(%edx)
2375 movzwl -2(%eax), %ecx
2376 movw %cx, -2(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002377#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002378 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002379#else
Jack Renc47703a2012-02-14 12:01:52 +04002380 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002381#endif
2382 RETURN
2383
/* Aligned forward tail copy for 42, 26, 10 or 2 bytes.
   The 42- and 26-byte entries each move 16 bytes with movdqa and fall
   through; the 10-byte entry moves 8 bytes with movq; the 2-byte entry
   copies the final 16-bit word at offset -2.
   movdqa faults on a memory operand that is not 16-byte aligned;
   alignment is presumably ensured by the dispatching code -- TODO confirm.
   Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the value
   loaded from DEST(%esp).  */
2384 .p2align 4
2385L(fwd_write_42bytes_align):
2386 movdqa -42(%eax), %xmm0
2387 movdqa %xmm0, -42(%edx)
2388L(fwd_write_26bytes_align):
2389 movdqa -26(%eax), %xmm0
2390 movdqa %xmm0, -26(%edx)
2391L(fwd_write_10bytes_align):
2392 movq -10(%eax), %xmm0
2393 movq %xmm0, -10(%edx)
2394L(fwd_write_2bytes_align):
2395 movzwl -2(%eax), %ecx
2396 movw %cx, -2(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002397#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002398 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002399#else
Jack Renc47703a2012-02-14 12:01:52 +04002400 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002401#endif
2402 RETURN
2403
/* Aligned forward tail copy for 34 or 18 bytes.
   Each entry moves 16 bytes with movdqa (34 falls through to 18), then
   the final 16-bit word at offset -2 is copied.
   movdqa faults on a memory operand that is not 16-byte aligned;
   alignment is presumably ensured by the dispatching code -- TODO confirm.
   Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the value
   loaded from DEST(%esp).  */
2404 .p2align 4
2405L(fwd_write_34bytes_align):
2406 movdqa -34(%eax), %xmm0
2407 movdqa %xmm0, -34(%edx)
2408L(fwd_write_18bytes_align):
2409 movdqa -18(%eax), %xmm0
2410 movdqa %xmm0, -18(%edx)
2411 movzwl -2(%eax), %ecx
2412 movw %cx, -2(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002413#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002414 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002415#else
Jack Renc47703a2012-02-14 12:01:52 +04002416 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002417#endif
2418 RETURN
2419
/* Aligned forward tail copy for 47, 31, 15 or 7 bytes.
   The 47- and 31-byte entries each move 16 bytes with movdqa and fall
   through; the 15-byte entry moves 8 bytes with movq; the 7-byte entry
   finishes with a 4-byte (offset -7), a 2-byte (offset -3) and a 1-byte
   (offset -1) move, reusing %eax as scratch for the last byte.
   movdqa faults on a memory operand that is not 16-byte aligned;
   alignment is presumably ensured by the dispatching code -- TODO confirm.
   Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the value
   loaded from DEST(%esp).  */
2420 .p2align 4
2421L(fwd_write_47bytes_align):
2422 movdqa -47(%eax), %xmm0
2423 movdqa %xmm0, -47(%edx)
2424L(fwd_write_31bytes_align):
2425 movdqa -31(%eax), %xmm0
2426 movdqa %xmm0, -31(%edx)
2427L(fwd_write_15bytes_align):
2428 movq -15(%eax), %xmm0
2429 movq %xmm0, -15(%edx)
2430L(fwd_write_7bytes_align):
2431 movl -7(%eax), %ecx
2432 movl %ecx, -7(%edx)
2433 movzwl -3(%eax), %ecx
2434 movzbl -1(%eax), %eax
2435 movw %cx, -3(%edx)
2436 movb %al, -1(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002437#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002438 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002439#else
Jack Renc47703a2012-02-14 12:01:52 +04002440 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002441#endif
2442 RETURN
2443
/* Aligned forward tail copy for 39 or 23 bytes.
   Each entry moves 16 bytes with movdqa (39 falls through to 23); the
   remaining 7 bytes are copied as a 4-byte (offset -7), a 2-byte
   (offset -3) and a 1-byte (offset -1) move, reusing %eax as scratch for
   the last byte.
   movdqa faults on a memory operand that is not 16-byte aligned;
   alignment is presumably ensured by the dispatching code -- TODO confirm.
   Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the value
   loaded from DEST(%esp).  */
2444 .p2align 4
2445L(fwd_write_39bytes_align):
2446 movdqa -39(%eax), %xmm0
2447 movdqa %xmm0, -39(%edx)
2448L(fwd_write_23bytes_align):
2449 movdqa -23(%eax), %xmm0
2450 movdqa %xmm0, -23(%edx)
2451 movl -7(%eax), %ecx
2452 movl %ecx, -7(%edx)
2453 movzwl -3(%eax), %ecx
2454 movzbl -1(%eax), %eax
2455 movw %cx, -3(%edx)
2456 movb %al, -1(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002457#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002458 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002459#else
Jack Renc47703a2012-02-14 12:01:52 +04002460 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002461#endif
2462 RETURN
2463
/* Aligned forward tail copy for 43, 27, 11 or 3 bytes.
   The 43- and 27-byte entries each move 16 bytes with movdqa and fall
   through; the 11-byte entry moves 8 bytes with movq; the 3-byte entry
   finishes with a 2-byte move at offset -3 and a 1-byte move at offset -1,
   reusing %eax as scratch for the last byte.
   movdqa faults on a memory operand that is not 16-byte aligned;
   alignment is presumably ensured by the dispatching code -- TODO confirm.
   Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the value
   loaded from DEST(%esp).  */
2464 .p2align 4
2465L(fwd_write_43bytes_align):
2466 movdqa -43(%eax), %xmm0
2467 movdqa %xmm0, -43(%edx)
2468L(fwd_write_27bytes_align):
2469 movdqa -27(%eax), %xmm0
2470 movdqa %xmm0, -27(%edx)
2471L(fwd_write_11bytes_align):
2472 movq -11(%eax), %xmm0
2473 movq %xmm0, -11(%edx)
2474L(fwd_write_3bytes_align):
2475 movzwl -3(%eax), %ecx
2476 movzbl -1(%eax), %eax
2477 movw %cx, -3(%edx)
2478 movb %al, -1(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002479#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002480 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002481#else
Jack Renc47703a2012-02-14 12:01:52 +04002482 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002483#endif
2484 RETURN
2485
/* Aligned forward tail copy for 35 or 19 bytes.
   Each entry moves 16 bytes with movdqa (35 falls through to 19); the
   remaining 3 bytes are copied as a 2-byte move at offset -3 and a 1-byte
   move at offset -1, reusing %eax as scratch for the last byte.
   movdqa faults on a memory operand that is not 16-byte aligned;
   alignment is presumably ensured by the dispatching code -- TODO confirm.
   Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the value
   loaded from DEST(%esp).  */
2486 .p2align 4
2487L(fwd_write_35bytes_align):
2488 movdqa -35(%eax), %xmm0
2489 movdqa %xmm0, -35(%edx)
2490L(fwd_write_19bytes_align):
2491 movdqa -19(%eax), %xmm0
2492 movdqa %xmm0, -19(%edx)
2493 movzwl -3(%eax), %ecx
2494 movzbl -1(%eax), %eax
2495 movw %cx, -3(%edx)
2496 movb %al, -1(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002497#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002498 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002499#else
Jack Renc47703a2012-02-14 12:01:52 +04002500 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002501#endif
2502 RETURN
2503
/* Aligned forward tail copy for 44, 28, 12 or 4 bytes.
   The 44- and 28-byte entries each move 16 bytes with movdqa and fall
   through; the 12-byte entry moves 8 bytes with movq; the 4-byte entry
   copies the final 32-bit word at offset -4.
   movdqa faults on a memory operand that is not 16-byte aligned;
   alignment is presumably ensured by the dispatching code -- TODO confirm.
   Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the value
   loaded from DEST(%esp).  */
2504 .p2align 4
2505L(fwd_write_44bytes_align):
2506 movdqa -44(%eax), %xmm0
2507 movdqa %xmm0, -44(%edx)
2508L(fwd_write_28bytes_align):
2509 movdqa -28(%eax), %xmm0
2510 movdqa %xmm0, -28(%edx)
2511L(fwd_write_12bytes_align):
2512 movq -12(%eax), %xmm0
2513 movq %xmm0, -12(%edx)
2514L(fwd_write_4bytes_align):
2515 movl -4(%eax), %ecx
2516 movl %ecx, -4(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002517#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002518 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002519#else
Jack Renc47703a2012-02-14 12:01:52 +04002520 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002521#endif
2522 RETURN
2523
/* Aligned forward tail copy for 36 or 20 bytes.
   Each entry moves 16 bytes with movdqa (36 falls through to 20), then
   the final 32-bit word at offset -4 is copied.
   movdqa faults on a memory operand that is not 16-byte aligned;
   alignment is presumably ensured by the dispatching code -- TODO confirm.
   Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the value
   loaded from DEST(%esp).  This block ends with RETURN_END rather than
   RETURN; both macros are defined outside this part of the file.  */
2524 .p2align 4
2525L(fwd_write_36bytes_align):
2526 movdqa -36(%eax), %xmm0
2527 movdqa %xmm0, -36(%edx)
2528L(fwd_write_20bytes_align):
2529 movdqa -20(%eax), %xmm0
2530 movdqa %xmm0, -20(%edx)
2531 movl -4(%eax), %ecx
2532 movl %ecx, -4(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002533#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002534 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002535#else
Jack Renc47703a2012-02-14 12:01:52 +04002536 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002537#endif
Bruce Beare124a5422010-10-11 12:24:41 -07002538 RETURN_END
Bruce Beare8ff1a272010-03-04 11:03:37 -08002539
/* Copy path that writes the destination with non-temporal stores
   (movntdq).
   Registers as used here: %eax = source pointer, %edx = destination
   pointer, %ecx = byte count (biased, see below).
   Flow: copy one 16-byte chunk, run a 128-byte-per-iteration loop, then
   optional 64-byte and 32-byte steps, then sfence and dispatch the
   remaining bytes through L(table_48bytes_fwd) indexed by %ecx.
   movntdq requires a 16-byte aligned memory operand; this code assumes
   %edx is 16-byte aligned on entry -- TODO confirm against the code that
   branches here.
   CFI_PUSH, POP and BRANCH_TO_JMPTBL_ENTRY are macros defined outside this
   part of the file.  */
/* NOTE(review): looks like %edi is already saved on the stack when this
   label is reached (unwind annotation here, matching POP below) --
   confirm.  */
Jack Renc47703a2012-02-14 12:01:52 +04002540 CFI_PUSH (%edi)
2541
2542 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002543L(large_page):
2544 movdqu (%eax), %xmm1
Jack Renc47703a2012-02-14 12:01:52 +04002545#ifdef USE_AS_MEMMOVE
/* Stores %xmm0 at the address loaded from DEST+4(%esp).  %xmm0 is set
   before this label is reached (not visible here); the +4 presumably
   accounts for the saved %edi -- TODO confirm.  */
2546 movl DEST+4(%esp), %edi
2547 movdqu %xmm0, (%edi)
2548#endif
Bruce Beare8ff1a272010-03-04 11:03:37 -08002549 lea 16(%eax), %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08002550 movntdq %xmm1, (%edx)
2551 lea 16(%edx), %edx
/* 0x90 = 0x10 (the chunk just copied) + 0x80 (bias): inside the loop
   %ecx holds "bytes remaining minus 0x80".  */
Bruce Beare8ff1a272010-03-04 11:03:37 -08002552 lea -0x90(%ecx), %ecx
2553 POP (%edi)
Jack Renc47703a2012-02-14 12:01:52 +04002554
2555 .p2align 4
/* Main loop: load 128 bytes into %xmm0-%xmm7, then store them with
   movntdq.  */
Bruce Beare8ff1a272010-03-04 11:03:37 -08002556L(large_page_loop):
2557 movdqu (%eax), %xmm0
2558 movdqu 0x10(%eax), %xmm1
2559 movdqu 0x20(%eax), %xmm2
2560 movdqu 0x30(%eax), %xmm3
2561 movdqu 0x40(%eax), %xmm4
2562 movdqu 0x50(%eax), %xmm5
2563 movdqu 0x60(%eax), %xmm6
2564 movdqu 0x70(%eax), %xmm7
2565 lea 0x80(%eax), %eax
2566
/* The flags set by this sub are consumed by the jae below; the
   intervening movntdq and lea instructions do not modify flags.  */
2567 sub $0x80, %ecx
2568 movntdq %xmm0, (%edx)
2569 movntdq %xmm1, 0x10(%edx)
2570 movntdq %xmm2, 0x20(%edx)
2571 movntdq %xmm3, 0x30(%edx)
2572 movntdq %xmm4, 0x40(%edx)
2573 movntdq %xmm5, 0x50(%edx)
2574 movntdq %xmm6, 0x60(%edx)
2575 movntdq %xmm7, 0x70(%edx)
2576 lea 0x80(%edx), %edx
2577 jae L(large_page_loop)
/* Loop exit: %ecx is negative.  The cmp tests whether fewer than 0x40
   bytes remain; the lea removes the 0x80 bias without touching the flags,
   so the jl still acts on the cmp result.  */
2578 cmp $-0x40, %ecx
2579 lea 0x80(%ecx), %ecx
2580 jl L(large_page_less_64bytes)
2581
/* At least 64 bytes remain: copy one 64-byte chunk.  */
2582 movdqu (%eax), %xmm0
2583 movdqu 0x10(%eax), %xmm1
2584 movdqu 0x20(%eax), %xmm2
2585 movdqu 0x30(%eax), %xmm3
2586 lea 0x40(%eax), %eax
2587
2588 movntdq %xmm0, (%edx)
2589 movntdq %xmm1, 0x10(%edx)
2590 movntdq %xmm2, 0x20(%edx)
2591 movntdq %xmm3, 0x30(%edx)
2592 lea 0x40(%edx), %edx
2593 sub $0x40, %ecx
2594L(large_page_less_64bytes):
2595 cmp $32, %ecx
2596 jb L(large_page_less_32bytes)
/* At least 32 bytes remain: copy one 32-byte chunk.  */
2597 movdqu (%eax), %xmm0
2598 movdqu 0x10(%eax), %xmm1
2599 lea 0x20(%eax), %eax
2600 movntdq %xmm0, (%edx)
2601 movntdq %xmm1, 0x10(%edx)
2602 lea 0x20(%edx), %edx
2603 sub $0x20, %ecx
2604L(large_page_less_32bytes):
/* Advance both pointers by the remaining count so they point just past
   the end; the fwd_write_* handlers address the tail with negative
   offsets.  sfence orders the non-temporal stores above before the
   ordinary stores done by the tail handler.  */
2605 add %ecx, %edx
2606 add %ecx, %eax
2607 sfence
2608 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
2609
/* Backward tail copy for 44, 36, 28, 20, 12, 4 or 0 bytes.
   %eax / %edx point at the start of the remaining source / destination
   bytes.  Each label copies the highest 8 bytes of the remaining N (offset
   N - 8) through %xmm0 and falls through to the next smaller size; the
   4-byte entry copies the 32-bit word at offset 0.
   Result in %eax: the value loaded from DEST(%esp), plus the value loaded
   from LEN(%esp) when USE_AS_MEMPCPY is defined.  RETURN is a macro
   defined outside this part of the file.  */
Jack Renc47703a2012-02-14 12:01:52 +04002610 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002611L(bk_write_44bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002612 movq 36(%eax), %xmm0
2613 movq %xmm0, 36(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002614L(bk_write_36bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002615 movq 28(%eax), %xmm0
2616 movq %xmm0, 28(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002617L(bk_write_28bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002618 movq 20(%eax), %xmm0
2619 movq %xmm0, 20(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002620L(bk_write_20bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002621 movq 12(%eax), %xmm0
2622 movq %xmm0, 12(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002623L(bk_write_12bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002624 movq 4(%eax), %xmm0
2625 movq %xmm0, 4(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002626L(bk_write_4bytes):
2627 movl (%eax), %ecx
2628 movl %ecx, (%edx)
2629L(bk_write_0bytes):
Bruce Beare8ff1a272010-03-04 11:03:37 -08002630 movl DEST(%esp), %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002631#ifdef USE_AS_MEMPCPY
Bruce Beare8ff1a272010-03-04 11:03:37 -08002632 movl LEN(%esp), %ecx
2633 add %ecx, %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08002634#endif
2635 RETURN
2636
/* Backward tail copy for 40, 32, 24, 16 or 8 bytes.
   %eax / %edx point at the start of the remaining source / destination
   bytes.  Each label copies the highest 8 bytes of the remaining N (offset
   N - 8) through %xmm0 and falls through to the next smaller size.
   Result in %eax: the value loaded from DEST(%esp), plus the value loaded
   from LEN(%esp) when USE_AS_MEMPCPY is defined.  */
Jack Renc47703a2012-02-14 12:01:52 +04002637 .p2align 4
2638L(bk_write_40bytes):
2639 movq 32(%eax), %xmm0
2640 movq %xmm0, 32(%edx)
2641L(bk_write_32bytes):
2642 movq 24(%eax), %xmm0
2643 movq %xmm0, 24(%edx)
2644L(bk_write_24bytes):
2645 movq 16(%eax), %xmm0
2646 movq %xmm0, 16(%edx)
2647L(bk_write_16bytes):
2648 movq 8(%eax), %xmm0
2649 movq %xmm0, 8(%edx)
2650L(bk_write_8bytes):
2651 movq (%eax), %xmm0
2652 movq %xmm0, (%edx)
Jack Renc47703a2012-02-14 12:01:52 +04002653 movl DEST(%esp), %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002654#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002655 movl LEN(%esp), %ecx
2656 add %ecx, %eax
Jack Renc47703a2012-02-14 12:01:52 +04002657#endif
2658 RETURN
2659
/* Backward tail copy for 45, 37, 29, 21, 13, 5 or 1 bytes.
   %eax / %edx point at the start of the remaining source / destination
   bytes.  Each label copies the highest 8 bytes of the remaining N (offset
   N - 8) through %xmm0 and falls through; the 5-byte entry copies the
   32-bit word at offset 1 and the 1-byte entry copies the byte at
   offset 0.
   Result in %eax: the value loaded from DEST(%esp), plus the value loaded
   from LEN(%esp) when USE_AS_MEMPCPY is defined.  */
2660 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002661L(bk_write_45bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002662 movq 37(%eax), %xmm0
2663 movq %xmm0, 37(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002664L(bk_write_37bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002665 movq 29(%eax), %xmm0
2666 movq %xmm0, 29(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002667L(bk_write_29bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002668 movq 21(%eax), %xmm0
2669 movq %xmm0, 21(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002670L(bk_write_21bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002671 movq 13(%eax), %xmm0
2672 movq %xmm0, 13(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002673L(bk_write_13bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002674 movq 5(%eax), %xmm0
2675 movq %xmm0, 5(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002676L(bk_write_5bytes):
2677 movl 1(%eax), %ecx
2678 movl %ecx, 1(%edx)
2679L(bk_write_1bytes):
2680 movzbl (%eax), %ecx
2681 movb %cl, (%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002682 movl DEST(%esp), %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002683#ifdef USE_AS_MEMPCPY
Bruce Beare8ff1a272010-03-04 11:03:37 -08002684 movl LEN(%esp), %ecx
2685 add %ecx, %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08002686#endif
2687 RETURN
2688
/* Backward tail copy for 41, 33, 25, 17 or 9 bytes.
   %eax / %edx point at the start of the remaining source / destination
   bytes.  Each label copies the highest 8 bytes of the remaining N (offset
   N - 8) through %xmm0 and falls through; after the 9-byte entry's 8-byte
   move at offset 1, the byte at offset 0 is copied.
   Result in %eax: the value loaded from DEST(%esp), plus the value loaded
   from LEN(%esp) when USE_AS_MEMPCPY is defined.  */
Jack Renc47703a2012-02-14 12:01:52 +04002689 .p2align 4
2690L(bk_write_41bytes):
2691 movq 33(%eax), %xmm0
2692 movq %xmm0, 33(%edx)
2693L(bk_write_33bytes):
2694 movq 25(%eax), %xmm0
2695 movq %xmm0, 25(%edx)
2696L(bk_write_25bytes):
2697 movq 17(%eax), %xmm0
2698 movq %xmm0, 17(%edx)
2699L(bk_write_17bytes):
2700 movq 9(%eax), %xmm0
2701 movq %xmm0, 9(%edx)
2702L(bk_write_9bytes):
2703 movq 1(%eax), %xmm0
2704 movq %xmm0, 1(%edx)
2705 movzbl (%eax), %ecx
2706 movb %cl, (%edx)
Jack Renc47703a2012-02-14 12:01:52 +04002707 movl DEST(%esp), %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002708#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002709 movl LEN(%esp), %ecx
2710 add %ecx, %eax
Jack Renc47703a2012-02-14 12:01:52 +04002711#endif
2712 RETURN
2713
/* Backward tail copy for 46, 38, 30, 22, 14 or 6 bytes.
   %eax / %edx point at the start of the remaining source / destination
   bytes.  Each label copies the highest 8 bytes of the remaining N (offset
   N - 8) through %xmm0 and falls through; the 6-byte entry copies the
   32-bit word at offset 2 and then the 16-bit word at offset 0.
   Result in %eax: the value loaded from DEST(%esp), plus the value loaded
   from LEN(%esp) when USE_AS_MEMPCPY is defined.  */
2714 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002715L(bk_write_46bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002716 movq 38(%eax), %xmm0
2717 movq %xmm0, 38(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002718L(bk_write_38bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002719 movq 30(%eax), %xmm0
2720 movq %xmm0, 30(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002721L(bk_write_30bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002722 movq 22(%eax), %xmm0
2723 movq %xmm0, 22(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002724L(bk_write_22bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002725 movq 14(%eax), %xmm0
2726 movq %xmm0, 14(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002727L(bk_write_14bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002728 movq 6(%eax), %xmm0
2729 movq %xmm0, 6(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002730L(bk_write_6bytes):
2731 movl 2(%eax), %ecx
2732 movl %ecx, 2(%edx)
Jack Renc47703a2012-02-14 12:01:52 +04002733 movzwl (%eax), %ecx
2734 movw %cx, (%edx)
Jack Renc47703a2012-02-14 12:01:52 +04002735 movl DEST(%esp), %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002736#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002737 movl LEN(%esp), %ecx
2738 add %ecx, %eax
Jack Renc47703a2012-02-14 12:01:52 +04002739#endif
2740 RETURN
2741
/* Backward tail copy for 42, 34, 26, 18, 10 or 2 bytes.
   %eax / %edx point at the start of the remaining source / destination
   bytes.  Each label copies the highest 8 bytes of the remaining N (offset
   N - 8) through %xmm0 and falls through; the 2-byte entry copies the
   16-bit word at offset 0.
   Result in %eax: the value loaded from DEST(%esp), plus the value loaded
   from LEN(%esp) when USE_AS_MEMPCPY is defined.  */
2742 .p2align 4
2743L(bk_write_42bytes):
2744 movq 34(%eax), %xmm0
2745 movq %xmm0, 34(%edx)
2746L(bk_write_34bytes):
2747 movq 26(%eax), %xmm0
2748 movq %xmm0, 26(%edx)
2749L(bk_write_26bytes):
2750 movq 18(%eax), %xmm0
2751 movq %xmm0, 18(%edx)
2752L(bk_write_18bytes):
2753 movq 10(%eax), %xmm0
2754 movq %xmm0, 10(%edx)
2755L(bk_write_10bytes):
2756 movq 2(%eax), %xmm0
2757 movq %xmm0, 2(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002758L(bk_write_2bytes):
2759 movzwl (%eax), %ecx
2760 movw %cx, (%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002761 movl DEST(%esp), %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002762#ifdef USE_AS_MEMPCPY
Bruce Beare8ff1a272010-03-04 11:03:37 -08002763 movl LEN(%esp), %ecx
2764 add %ecx, %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08002765#endif
2766 RETURN
2767
/* Backward tail copy for 47, 39, 31, 23, 15 or 7 bytes.
   %eax / %edx point at the start of the remaining source / destination
   bytes.  Each label copies the highest 8 bytes of the remaining N (offset
   N - 8) through %xmm0 and falls through; the 7-byte entry copies the
   32-bit word at offset 3, the 16-bit word at offset 1 and the byte at
   offset 0.  %eax is overwritten by the final byte load, which is the last
   read from the source.
   Result in %eax: the value loaded from DEST(%esp), plus the value loaded
   from LEN(%esp) when USE_AS_MEMPCPY is defined.  */
Jack Renc47703a2012-02-14 12:01:52 +04002768 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002769L(bk_write_47bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002770 movq 39(%eax), %xmm0
2771 movq %xmm0, 39(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002772L(bk_write_39bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002773 movq 31(%eax), %xmm0
2774 movq %xmm0, 31(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002775L(bk_write_31bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002776 movq 23(%eax), %xmm0
2777 movq %xmm0, 23(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002778L(bk_write_23bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002779 movq 15(%eax), %xmm0
2780 movq %xmm0, 15(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002781L(bk_write_15bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002782 movq 7(%eax), %xmm0
2783 movq %xmm0, 7(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002784L(bk_write_7bytes):
2785 movl 3(%eax), %ecx
2786 movl %ecx, 3(%edx)
Jack Renc47703a2012-02-14 12:01:52 +04002787 movzwl 1(%eax), %ecx
2788 movw %cx, 1(%edx)
2789 movzbl (%eax), %eax
2790 movb %al, (%edx)
Jack Renc47703a2012-02-14 12:01:52 +04002791 movl DEST(%esp), %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002792#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002793 movl LEN(%esp), %ecx
2794 add %ecx, %eax
Jack Renc47703a2012-02-14 12:01:52 +04002795#endif
2796 RETURN
2797
/* Backward tail copy for 43, 35, 27, 19, 11 or 3 bytes.
   %eax / %edx point at the start of the remaining source / destination
   bytes.  Each label copies the highest 8 bytes of the remaining N (offset
   N - 8) through %xmm0 and falls through; the 3-byte entry copies the
   16-bit word at offset 1 and the byte at offset 0.  %eax is overwritten
   by the final byte load, which is the last read from the source.
   Result in %eax: the value loaded from DEST(%esp), plus the value loaded
   from LEN(%esp) when USE_AS_MEMPCPY is defined.  This block ends with
   RETURN_END, a macro defined outside this part of the file.  */
2798 .p2align 4
2799L(bk_write_43bytes):
2800 movq 35(%eax), %xmm0
2801 movq %xmm0, 35(%edx)
2802L(bk_write_35bytes):
2803 movq 27(%eax), %xmm0
2804 movq %xmm0, 27(%edx)
2805L(bk_write_27bytes):
2806 movq 19(%eax), %xmm0
2807 movq %xmm0, 19(%edx)
2808L(bk_write_19bytes):
2809 movq 11(%eax), %xmm0
2810 movq %xmm0, 11(%edx)
2811L(bk_write_11bytes):
2812 movq 3(%eax), %xmm0
2813 movq %xmm0, 3(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002814L(bk_write_3bytes):
2815 movzwl 1(%eax), %ecx
2816 movw %cx, 1(%edx)
2817 movzbl (%eax), %eax
2818 movb %al, (%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002819 movl DEST(%esp), %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002820#ifdef USE_AS_MEMPCPY
Bruce Beare8ff1a272010-03-04 11:03:37 -08002821 movl LEN(%esp), %ecx
2822 add %ecx, %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08002823#endif
2824 RETURN_END
2825
2826
/* Jump table for the forward tail handlers, placed in .rodata.ssse3.
   48 entries of 4 bytes each (.int), 4-byte aligned; entry N refers to
   L(fwd_write_Nbytes) for N = 0..47.  The L(large_page) path dispatches
   through it with BRANCH_TO_JMPTBL_ENTRY using %ecx as the index and a
   scale of 4.
   JMPTBL is a macro defined outside this part of the file; presumably it
   encodes each target relative to the table base given as its second
   argument -- TODO confirm.  */
2827 .pushsection .rodata.ssse3,"a",@progbits
Jack Renc47703a2012-02-14 12:01:52 +04002828 .p2align 2
Bruce Beare8ff1a272010-03-04 11:03:37 -08002829L(table_48bytes_fwd):
2830 .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
2831 .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
2832 .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
2833 .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
2834 .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
2835 .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
2836 .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
2837 .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
2838 .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
2839 .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
2840 .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
2841 .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
2842 .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
2843 .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
2844 .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
2845 .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
2846 .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
2847 .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
2848 .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
2849 .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
2850 .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
2851 .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
2852 .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
2853 .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
2854 .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
2855 .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
2856 .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
2857 .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
2858 .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
2859 .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
2860 .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
2861 .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
2862 .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
2863 .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
2864 .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
2865 .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
2866 .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
2867 .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
2868 .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
2869 .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
2870 .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
2871 .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
2872 .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
2873 .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
2874 .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
2875 .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
2876 .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
2877 .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
2878
/* Jump table with 48 entries: entry N refers to
   L(fwd_write_Nbytes_align) for N = 0..47.  Every entry is one 4-byte
   .int, so entry N lives at byte offset 4 * N from the table label, and
   .p2align 2 keeps the table itself 4-byte aligned.  JMPTBL is defined
   earlier in this file; each entry passes the table's own label as the
   second argument.  */
	.p2align 2
L(table_48bytes_fwd_align):
	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))

/* Jump table with 16 entries: entry N refers to L(shl_N) for
   N = 0..15.  Every entry is one 4-byte .int, so entry N lives at byte
   offset 4 * N from the table label, and .p2align 2 keeps the table
   4-byte aligned.  JMPTBL is defined earlier in this file; each entry
   passes the table's own label as the second argument.  */
	.p2align 2
L(shl_table):
	.int	JMPTBL (L(shl_0), L(shl_table))
	.int	JMPTBL (L(shl_1), L(shl_table))
	.int	JMPTBL (L(shl_2), L(shl_table))
	.int	JMPTBL (L(shl_3), L(shl_table))
	.int	JMPTBL (L(shl_4), L(shl_table))
	.int	JMPTBL (L(shl_5), L(shl_table))
	.int	JMPTBL (L(shl_6), L(shl_table))
	.int	JMPTBL (L(shl_7), L(shl_table))
	.int	JMPTBL (L(shl_8), L(shl_table))
	.int	JMPTBL (L(shl_9), L(shl_table))
	.int	JMPTBL (L(shl_10), L(shl_table))
	.int	JMPTBL (L(shl_11), L(shl_table))
	.int	JMPTBL (L(shl_12), L(shl_table))
	.int	JMPTBL (L(shl_13), L(shl_table))
	.int	JMPTBL (L(shl_14), L(shl_table))
	.int	JMPTBL (L(shl_15), L(shl_table))

/* Jump table with 48 entries: entry N refers to L(bk_write_Nbytes)
   for N = 0..47.  The backward-copy tail at L(bk_write_less32bytes_2)
   dispatches through this table using %ecx (the remaining byte count)
   as the index with a scale of 4, matching the 4-byte .int entries.
   .p2align 2 keeps the table 4-byte aligned.  JMPTBL is defined earlier
   in this file; each entry passes the table's own label as the second
   argument.  */
	.p2align 2
L(table_48_bytes_bwd):
	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))

	/* Leave the jump-table section; pairs with an earlier .pushsection.  */
	.popsection

#ifdef USE_AS_MEMMOVE
/* Backward copy path (highest address first).  Assembled only when
   USE_AS_MEMMOVE is defined.
   NOTE(review): presumably entered when the regions overlap with the
   destination above the source; the branch that gets here is earlier
   in the file -- confirm.

   Register use, as established by the code below:
     On entry:  EAX = source start, EDX = destination start,
                ECX = byte count.
     Inside:    EDI = one past the last not-yet-copied source byte,
                EDX = one past the last not-yet-copied destination byte,
                ECX = bytes still to copy, EAX/XMM0-XMM3 = scratch.
   The caller's EDI is saved with PUSH on entry and restored with POP
   immediately before the final jump-table dispatch.  */
	.p2align 4
L(copy_backward):
	PUSH (%edi)
	movl	%eax, %edi
	/* Advance both pointers to one past the end of their buffers.  */
	lea	(%ecx,%edx,1),%edx
	lea	(%ecx,%edi,1),%edi
	/* Is the destination end 4-byte aligned?  */
	testl	$0x3, %edx
	jnz	L(bk_align)

L(bk_aligned_4):
	cmp	$64, %ecx
	jae	L(bk_write_more64bytes)

L(bk_write_64bytesless):
	cmp	$32, %ecx
	jb	L(bk_write_less32bytes)

L(bk_write_more32bytes):
	/* Copy one 32-byte chunk as four 8-byte moves, highest first.
	   Each load is stored before the next load is issued.  */
	sub	$32, %ecx
	movq	-8(%edi), %xmm0
	movq	%xmm0, -8(%edx)
	movq	-16(%edi), %xmm0
	movq	%xmm0, -16(%edx)
	movq	-24(%edi), %xmm0
	movq	%xmm0, -24(%edx)
	movq	-32(%edi), %xmm0
	movq	%xmm0, -32(%edx)
	sub	$32, %edx
	sub	$32, %edi

L(bk_write_less32bytes):
	/* Rewind both pointers to the start of the remaining ECX bytes:
	   EAX = source, EDX = destination.  Then restore the caller's EDI
	   and dispatch on the remaining count.  */
	movl	%edi, %eax
	sub	%ecx, %edx
	sub	%ecx, %eax
	POP (%edi)
L(bk_write_less32bytes_2):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)

	/* NOTE(review): the dispatch above presumably does not fall
	   through.  The code below is reached by jumps taken while EDI is
	   still pushed, so the unwind state changed by POP is declared
	   again here (CFI_PUSH is defined earlier in the file).  */
	CFI_PUSH (%edi)

	.p2align 4
L(bk_align):
	/* Destination end is not 4-byte aligned.  With 8 bytes or fewer
	   left, skip alignment and go straight to the tail dispatch.  */
	cmp	$8, %ecx
	jbe	L(bk_write_less32bytes)
	testl	$1, %edx
	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0
	   then (EDX & 2) must be != 0.  */
	jz	L(bk_got2)
	/* Copy one byte so that EDX becomes 2-byte aligned.  */
	sub	$1, %edi
	sub	$1, %ecx
	sub	$1, %edx
	movzbl	(%edi), %eax
	movb	%al, (%edx)

	testl	$2, %edx
	jz	L(bk_aligned_4)

L(bk_got2):
	/* Copy two bytes so that EDX becomes 4-byte aligned.  */
	sub	$2, %edi
	sub	$2, %ecx
	sub	$2, %edx
	movzwl	(%edi), %eax
	movw	%ax, (%edx)
	jmp	L(bk_aligned_4)

	.p2align 4
L(bk_write_more64bytes):
	/* Check whether the destination end is 16-byte aligned.  */
	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

/* EDX is aligned to 4 bytes, but not to 16 bytes.  Copy 4 bytes at a
   time, at most three times, until EDX is 16-byte aligned.  */
L(bk_ssse3_align):
	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

L(bk_ssse3_cpy_pre):
	/* The alignment steps above may have dropped ECX below 64.  */
	cmp	$64, %ecx
	jb	L(bk_write_more32bytes)

	.p2align 4
L(bk_ssse3_cpy):
	/* Main loop: 64 bytes per iteration, highest 16-byte chunk first.
	   Loads are unaligned (movdqu); stores are aligned (movdqa) because
	   EDX is 16-byte aligned here.  */
	sub	$64, %edi
	sub	$64, %ecx
	sub	$64, %edx
	movdqu	0x30(%edi), %xmm3
	movdqa	%xmm3, 0x30(%edx)
	movdqu	0x20(%edi), %xmm2
	movdqa	%xmm2, 0x20(%edx)
	movdqu	0x10(%edi), %xmm1
	movdqa	%xmm1, 0x10(%edx)
	movdqu	(%edi), %xmm0
	movdqa	%xmm0, (%edx)
	cmp	$64, %ecx
	jae	L(bk_ssse3_cpy)
	/* Fewer than 64 bytes remain: finish through the short paths.  */
	jmp	L(bk_write_64bytesless)

#endif

3124END (MEMCPY)