/*
Copyright (c) 2010, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* FOR_ATOM is defined before "cache.h" is included so that the header can
   see it.  NOTE(review): presumably cache.h selects its cache-size
   constants from this macro -- confirm against cache.h.  */
#define FOR_ATOM
#include "cache.h"

/* Symbol name of the routine emitted by this file.  A build may predefine
   MEMCPY to emit the same code under a different name; the default is
   memcpy_atom.  */
#ifndef MEMCPY
# define MEMCPY	memcpy_atom
#endif

/* L(label) spells a file-local label: the name is pasted onto a ".L"
   prefix, which the assembler treats as a local (non-exported) label.  */
#ifndef L
# define L(label)	.L##label
#endif

/* Thin wrappers around the assembler's call-frame-information directives.
   Each one is only defined here if the build has not already provided its
   own definition.  */

/* Open the unwind description of a function.  */
#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

/* Close the unwind description of a function.  */
#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

/* Record that register REG was saved at offset OFF from the stack pointer.  */
#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

/* Record that register REG holds its entry value again.  */
#ifndef cfi_restore
# define cfi_restore(reg)		.cfi_restore reg
#endif

/* Record that the stack pointer moved by OFF bytes (positive after a push,
   negative after a pop).  */
#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

/* ENTRY(name) opens a function: it marks NAME as a function symbol, makes
   it global, aligns it to 2^4 = 16 bytes, emits the label itself and
   starts the function's unwind description.  Pair it with END(name).  */
#ifndef ENTRY
# define ENTRY(name)		\
	.type name, @function;	\
	.globl name;		\
	.p2align 4;		\
name:				\
	cfi_startproc
#endif

/* END(name) closes a function opened with ENTRY(name): it ends the unwind
   description and records the symbol size as the distance from NAME to the
   current location.  */
#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

/* Displacements from %esp of the three stack arguments, used as
   DEST(%esp), SRC(%esp) and LEN(%esp).  PARMS, defined further down for
   the PIC and non-PIC builds, is the displacement of the first argument
   right after ENTRANCE; the arguments follow one another 4 bytes apart.
   Code that has pushed additional registers must add 4 per extra push
   (e.g. DEST+4(%esp) once %edi has been pushed).  */
#define DEST		PARMS
#define SRC		DEST+4
#define LEN		SRC+4

/* Unwind bookkeeping for a 4-byte push of REG: the stack pointer moved by
   4 bytes and REG is now saved at offset 0 from it.  Emits no
   instruction.  */
#define CFI_PUSH(REG)			\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

/* Unwind bookkeeping for the matching pop: the stack pointer moved back by
   4 bytes and REG holds its entry value again.  Emits no instruction.  */
#define CFI_POP(REG)			\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

/* Push / pop REG and keep the unwind information in step.  */
#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

/* Build-mode configuration.  Both branches define the same set of macros:

     PARMS        displacement from %esp of the first stack argument right
                  after ENTRANCE
     ENTRANCE     function prologue
     RETURN_END   return sequence for the last return in the function
     RETURN       return sequence for any earlier return
     JMPTBL       how one jump-table entry is written
     BRANCH_TO_JMPTBL_ENTRY
                  indexed jump through a jump table  */
#if (defined SHARED || defined __PIC__)
/* Position-independent build: %ebx is used to address the jump tables, so
   it is pushed on entry and popped on every return.  The first argument
   then sits 8 bytes above %esp: return address plus the saved %ebx.  */
# define PARMS		8		/* Preserve EBX.  */
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
/* After the ret, CFI_PUSH emits no instruction; it only re-records that
   %ebx is on the stack, so the unwind information is right again for the
   code that follows this return.  */
# define RETURN		RETURN_END; CFI_PUSH (%ebx)
/* A table entry is the offset of label I from the table base B.  */
# define JMPTBL(I, B)	I - B

/* Call the PC thunk for register x, e.g. SETUP_PIC_REG(bx) calls
   __x86.get_pc_thunk.bx.  */
# define SETUP_PIC_REG(x)	call	__x86.get_pc_thunk.x

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  INDEX is a register that contains
   the index into the jump table.  SCALE is the scale of INDEX.  */

# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
    /* We first load PC into EBX.  */				\
    SETUP_PIC_REG(bx);						\
    /* Get the address of the jump table.  */			\
    addl	$(TABLE - .), %ebx;				\
    /* Get the entry and convert the relative offset to the	\
       absolute address.  */					\
    addl	(%ebx, INDEX, SCALE), %ebx;			\
    /* We loaded the jump table.  Go.  */			\
    jmp		*%ebx
#else

/* Non-PIC build: no register has to be preserved, so the first argument
   sits right above the return address and a return is a bare ret.  */
# define PARMS		4
# define ENTRANCE
# define RETURN_END	ret
# define RETURN		RETURN_END
/* A table entry is the absolute address of label I; B is unused.  */
# define JMPTBL(I, B)	I

/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  INDEX is a register that contains the index into the
   jump table.  SCALE is the scale of INDEX.  */

# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
    jmp		*TABLE(, INDEX, SCALE)
#endif

131 .section .text.ssse3,"ax",@progbits
132ENTRY (MEMCPY)
133 ENTRANCE
134 movl LEN(%esp), %ecx
135 movl SRC(%esp), %eax
136 movl DEST(%esp), %edx
137
138#ifdef USE_AS_MEMMOVE
139 cmp %eax, %edx
140 jb L(copy_forward)
141 je L(fwd_write_0bytes)
142 cmp $32, %ecx
143 jae L(memmove_bwd)
144 jmp L(bk_write_less32bytes_2)
Jack Renc47703a2012-02-14 12:01:52 +0400145
146 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800147L(memmove_bwd):
148 add %ecx, %eax
149 cmp %eax, %edx
150 movl SRC(%esp), %eax
151 jb L(copy_backward)
152
153L(copy_forward):
154#endif
155 cmp $48, %ecx
156 jae L(48bytesormore)
157
158L(fwd_write_less32bytes):
159#ifndef USE_AS_MEMMOVE
160 cmp %dl, %al
161 jb L(bk_write)
162#endif
163 add %ecx, %edx
164 add %ecx, %eax
165 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
166#ifndef USE_AS_MEMMOVE
Jack Renc47703a2012-02-14 12:01:52 +0400167 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800168L(bk_write):
169 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
170#endif
171
Jack Renc47703a2012-02-14 12:01:52 +0400172 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800173L(48bytesormore):
Jack Renc47703a2012-02-14 12:01:52 +0400174#ifndef USE_AS_MEMMOVE
175 movlpd (%eax), %xmm0
176 movlpd 8(%eax), %xmm1
177 movlpd %xmm0, (%edx)
178 movlpd %xmm1, 8(%edx)
179#else
Bruce Beare8ff1a272010-03-04 11:03:37 -0800180 movdqu (%eax), %xmm0
Jack Renc47703a2012-02-14 12:01:52 +0400181#endif
Bruce Beare8ff1a272010-03-04 11:03:37 -0800182 PUSH (%edi)
183 movl %edx, %edi
184 and $-16, %edx
Bruce Beare8ff1a272010-03-04 11:03:37 -0800185 add $16, %edx
Bruce Beare8ff1a272010-03-04 11:03:37 -0800186 sub %edx, %edi
187 add %edi, %ecx
188 sub %edi, %eax
189
190#ifdef SHARED_CACHE_SIZE_HALF
191 cmp $SHARED_CACHE_SIZE_HALF, %ecx
192#else
Nick Kralevich5982e332011-11-11 15:47:24 -0800193# if (defined SHARED || defined __PIC__)
Jack Renc47703a2012-02-14 12:01:52 +0400194 SETUP_PIC_REG(bx)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800195 add $_GLOBAL_OFFSET_TABLE_, %ebx
196 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
197# else
198 cmp __x86_shared_cache_size_half, %ecx
199# endif
200#endif
201
202 mov %eax, %edi
203 jae L(large_page)
204 and $0xf, %edi
205 jz L(shl_0)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800206 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
207
Jack Renc47703a2012-02-14 12:01:52 +0400208 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800209L(shl_0):
Jack Renc47703a2012-02-14 12:01:52 +0400210#ifdef USE_AS_MEMMOVE
211 movl DEST+4(%esp), %edi
212 movdqu %xmm0, (%edi)
213#endif
Bruce Beare8ff1a272010-03-04 11:03:37 -0800214 xor %edi, %edi
Bruce Beare8ff1a272010-03-04 11:03:37 -0800215 cmp $127, %ecx
216 ja L(shl_0_gobble)
217 lea -32(%ecx), %ecx
Jack Renc47703a2012-02-14 12:01:52 +0400218
219 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800220L(shl_0_loop):
221 movdqa (%eax, %edi), %xmm0
222 movdqa 16(%eax, %edi), %xmm1
223 sub $32, %ecx
224 movdqa %xmm0, (%edx, %edi)
225 movdqa %xmm1, 16(%edx, %edi)
226 lea 32(%edi), %edi
227 jb L(shl_0_end)
228
229 movdqa (%eax, %edi), %xmm0
230 movdqa 16(%eax, %edi), %xmm1
231 sub $32, %ecx
232 movdqa %xmm0, (%edx, %edi)
233 movdqa %xmm1, 16(%edx, %edi)
234 lea 32(%edi), %edi
235 jb L(shl_0_end)
236
237 movdqa (%eax, %edi), %xmm0
238 movdqa 16(%eax, %edi), %xmm1
239 sub $32, %ecx
240 movdqa %xmm0, (%edx, %edi)
241 movdqa %xmm1, 16(%edx, %edi)
242 lea 32(%edi), %edi
243 jb L(shl_0_end)
244
245 movdqa (%eax, %edi), %xmm0
246 movdqa 16(%eax, %edi), %xmm1
247 sub $32, %ecx
248 movdqa %xmm0, (%edx, %edi)
249 movdqa %xmm1, 16(%edx, %edi)
250 lea 32(%edi), %edi
Jack Renc47703a2012-02-14 12:01:52 +0400251
Bruce Beare8ff1a272010-03-04 11:03:37 -0800252L(shl_0_end):
253 lea 32(%ecx), %ecx
254 add %ecx, %edi
255 add %edi, %edx
256 add %edi, %eax
257 POP (%edi)
Jack Renc47703a2012-02-14 12:01:52 +0400258 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800259
Bruce Beare124a5422010-10-11 12:24:41 -0700260 CFI_PUSH (%edi)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800261
Jack Renc47703a2012-02-14 12:01:52 +0400262 .p2align 4
263L(shl_0_gobble):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800264#ifdef DATA_CACHE_SIZE_HALF
265 cmp $DATA_CACHE_SIZE_HALF, %ecx
266#else
Nick Kralevich5982e332011-11-11 15:47:24 -0800267# if (defined SHARED || defined __PIC__)
Jack Renc47703a2012-02-14 12:01:52 +0400268 SETUP_PIC_REG(bx)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800269 add $_GLOBAL_OFFSET_TABLE_, %ebx
270 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
271# else
272 cmp __x86_data_cache_size_half, %ecx
273# endif
274#endif
Jack Renc47703a2012-02-14 12:01:52 +0400275 POP (%edi)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800276 lea -128(%ecx), %ecx
277 jae L(shl_0_gobble_mem_loop)
Jack Renc47703a2012-02-14 12:01:52 +0400278
279 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800280L(shl_0_gobble_cache_loop):
281 movdqa (%eax), %xmm0
282 movdqa 0x10(%eax), %xmm1
283 movdqa 0x20(%eax), %xmm2
284 movdqa 0x30(%eax), %xmm3
285 movdqa 0x40(%eax), %xmm4
286 movdqa 0x50(%eax), %xmm5
287 movdqa 0x60(%eax), %xmm6
288 movdqa 0x70(%eax), %xmm7
289 lea 0x80(%eax), %eax
290 sub $128, %ecx
291 movdqa %xmm0, (%edx)
292 movdqa %xmm1, 0x10(%edx)
293 movdqa %xmm2, 0x20(%edx)
294 movdqa %xmm3, 0x30(%edx)
295 movdqa %xmm4, 0x40(%edx)
296 movdqa %xmm5, 0x50(%edx)
297 movdqa %xmm6, 0x60(%edx)
298 movdqa %xmm7, 0x70(%edx)
299 lea 0x80(%edx), %edx
300
301 jae L(shl_0_gobble_cache_loop)
302 cmp $-0x40, %ecx
303 lea 0x80(%ecx), %ecx
304 jl L(shl_0_cache_less_64bytes)
305
306 movdqa (%eax), %xmm0
307 sub $0x40, %ecx
308 movdqa 0x10(%eax), %xmm1
Bruce Beare8ff1a272010-03-04 11:03:37 -0800309 movdqa %xmm0, (%edx)
310 movdqa %xmm1, 0x10(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800311 movdqa 0x20(%eax), %xmm0
312 movdqa 0x30(%eax), %xmm1
313 add $0x40, %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -0800314 movdqa %xmm0, 0x20(%edx)
315 movdqa %xmm1, 0x30(%edx)
316 add $0x40, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400317
Bruce Beare8ff1a272010-03-04 11:03:37 -0800318L(shl_0_cache_less_64bytes):
319 cmp $0x20, %ecx
320 jb L(shl_0_cache_less_32bytes)
321 movdqa (%eax), %xmm0
322 sub $0x20, %ecx
323 movdqa 0x10(%eax), %xmm1
324 add $0x20, %eax
325 movdqa %xmm0, (%edx)
326 movdqa %xmm1, 0x10(%edx)
327 add $0x20, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400328
Bruce Beare8ff1a272010-03-04 11:03:37 -0800329L(shl_0_cache_less_32bytes):
330 cmp $0x10, %ecx
331 jb L(shl_0_cache_less_16bytes)
332 sub $0x10, %ecx
333 movdqa (%eax), %xmm0
334 add $0x10, %eax
335 movdqa %xmm0, (%edx)
336 add $0x10, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400337
Bruce Beare8ff1a272010-03-04 11:03:37 -0800338L(shl_0_cache_less_16bytes):
339 add %ecx, %edx
340 add %ecx, %eax
341 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
342
Jack Renc47703a2012-02-14 12:01:52 +0400343 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800344L(shl_0_gobble_mem_loop):
345 prefetcht0 0x1c0(%eax)
346 prefetcht0 0x280(%eax)
347 prefetcht0 0x1c0(%edx)
348
349 movdqa (%eax), %xmm0
350 movdqa 0x10(%eax), %xmm1
351 movdqa 0x20(%eax), %xmm2
352 movdqa 0x30(%eax), %xmm3
353 movdqa 0x40(%eax), %xmm4
354 movdqa 0x50(%eax), %xmm5
355 movdqa 0x60(%eax), %xmm6
356 movdqa 0x70(%eax), %xmm7
357 lea 0x80(%eax), %eax
358 sub $0x80, %ecx
359 movdqa %xmm0, (%edx)
360 movdqa %xmm1, 0x10(%edx)
361 movdqa %xmm2, 0x20(%edx)
362 movdqa %xmm3, 0x30(%edx)
363 movdqa %xmm4, 0x40(%edx)
364 movdqa %xmm5, 0x50(%edx)
365 movdqa %xmm6, 0x60(%edx)
366 movdqa %xmm7, 0x70(%edx)
367 lea 0x80(%edx), %edx
368
369 jae L(shl_0_gobble_mem_loop)
370 cmp $-0x40, %ecx
371 lea 0x80(%ecx), %ecx
372 jl L(shl_0_mem_less_64bytes)
373
374 movdqa (%eax), %xmm0
375 sub $0x40, %ecx
376 movdqa 0x10(%eax), %xmm1
377
378 movdqa %xmm0, (%edx)
379 movdqa %xmm1, 0x10(%edx)
380
381 movdqa 0x20(%eax), %xmm0
382 movdqa 0x30(%eax), %xmm1
383 add $0x40, %eax
384
385 movdqa %xmm0, 0x20(%edx)
386 movdqa %xmm1, 0x30(%edx)
387 add $0x40, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400388
Bruce Beare8ff1a272010-03-04 11:03:37 -0800389L(shl_0_mem_less_64bytes):
390 cmp $0x20, %ecx
391 jb L(shl_0_mem_less_32bytes)
392 movdqa (%eax), %xmm0
393 sub $0x20, %ecx
394 movdqa 0x10(%eax), %xmm1
395 add $0x20, %eax
396 movdqa %xmm0, (%edx)
397 movdqa %xmm1, 0x10(%edx)
398 add $0x20, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400399
Bruce Beare8ff1a272010-03-04 11:03:37 -0800400L(shl_0_mem_less_32bytes):
401 cmp $0x10, %ecx
402 jb L(shl_0_mem_less_16bytes)
403 sub $0x10, %ecx
404 movdqa (%eax), %xmm0
405 add $0x10, %eax
406 movdqa %xmm0, (%edx)
407 add $0x10, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400408
Bruce Beare8ff1a272010-03-04 11:03:37 -0800409L(shl_0_mem_less_16bytes):
410 add %ecx, %edx
411 add %ecx, %eax
Jack Renc47703a2012-02-14 12:01:52 +0400412 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800413
Jack Renc47703a2012-02-14 12:01:52 +0400414 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800415L(shl_1):
Jack Renc47703a2012-02-14 12:01:52 +0400416#ifndef USE_AS_MEMMOVE
417 movaps -1(%eax), %xmm1
418#else
419 movl DEST+4(%esp), %edi
420 movaps -1(%eax), %xmm1
421 movdqu %xmm0, (%edi)
422#endif
423#ifdef DATA_CACHE_SIZE_HALF
424 cmp $DATA_CACHE_SIZE_HALF, %ecx
425#else
426# if (defined SHARED || defined __PIC__)
427 SETUP_PIC_REG(bx)
428 add $_GLOBAL_OFFSET_TABLE_, %ebx
429 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
430# else
431 cmp __x86_data_cache_size_half, %ecx
432# endif
433#endif
434 jb L(sh_1_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800435
Jack Renc47703a2012-02-14 12:01:52 +0400436 lea -64(%ecx), %ecx
437
438 .p2align 4
439L(Shl1LoopStart):
440 prefetcht0 0x1c0(%eax)
441 prefetcht0 0x1c0(%edx)
442 movaps 15(%eax), %xmm2
443 movaps 31(%eax), %xmm3
444 movaps 47(%eax), %xmm4
445 movaps 63(%eax), %xmm5
446 movaps %xmm5, %xmm7
447 palignr $1, %xmm4, %xmm5
448 palignr $1, %xmm3, %xmm4
449 movaps %xmm5, 48(%edx)
450 palignr $1, %xmm2, %xmm3
451 lea 64(%eax), %eax
452 palignr $1, %xmm1, %xmm2
453 movaps %xmm4, 32(%edx)
454 movaps %xmm3, 16(%edx)
455 movaps %xmm7, %xmm1
456 movaps %xmm2, (%edx)
457 lea 64(%edx), %edx
458 sub $64, %ecx
459 ja L(Shl1LoopStart)
460
461L(Shl1LoopLeave):
462 add $32, %ecx
463 jle L(shl_end_0)
464
465 movaps 15(%eax), %xmm2
466 movaps 31(%eax), %xmm3
467 palignr $1, %xmm2, %xmm3
468 palignr $1, %xmm1, %xmm2
469 movaps %xmm2, (%edx)
470 movaps %xmm3, 16(%edx)
471 lea 32(%edx, %ecx), %edx
472 lea 32(%eax, %ecx), %eax
473 POP (%edi)
474 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
475
476 CFI_PUSH (%edi)
477
478 .p2align 4
479L(sh_1_no_prefetch):
480 lea -32(%ecx), %ecx
481 lea -1(%eax), %eax
482 xor %edi, %edi
483
484 .p2align 4
485L(sh_1_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800486 movdqa 16(%eax, %edi), %xmm2
487 sub $32, %ecx
488 movdqa 32(%eax, %edi), %xmm3
489 movdqa %xmm3, %xmm4
490 palignr $1, %xmm2, %xmm3
491 palignr $1, %xmm1, %xmm2
492 lea 32(%edi), %edi
493 movdqa %xmm2, -32(%edx, %edi)
494 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +0400495 jb L(sh_1_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800496
497 movdqa 16(%eax, %edi), %xmm2
498 sub $32, %ecx
499 movdqa 32(%eax, %edi), %xmm3
500 movdqa %xmm3, %xmm1
501 palignr $1, %xmm2, %xmm3
502 palignr $1, %xmm4, %xmm2
503 lea 32(%edi), %edi
504 movdqa %xmm2, -32(%edx, %edi)
505 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +0400506 jae L(sh_1_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800507
Jack Renc47703a2012-02-14 12:01:52 +0400508L(sh_1_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800509 lea 32(%ecx), %ecx
510 add %ecx, %edi
511 add %edi, %edx
512 lea 1(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400513 POP (%edi)
514 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800515
Jack Renc47703a2012-02-14 12:01:52 +0400516 CFI_PUSH (%edi)
517
518 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800519L(shl_2):
Jack Renc47703a2012-02-14 12:01:52 +0400520#ifndef USE_AS_MEMMOVE
521 movaps -2(%eax), %xmm1
522#else
523 movl DEST+4(%esp), %edi
524 movaps -2(%eax), %xmm1
525 movdqu %xmm0, (%edi)
526#endif
527#ifdef DATA_CACHE_SIZE_HALF
528 cmp $DATA_CACHE_SIZE_HALF, %ecx
529#else
530# if (defined SHARED || defined __PIC__)
531 SETUP_PIC_REG(bx)
532 add $_GLOBAL_OFFSET_TABLE_, %ebx
533 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
534# else
535 cmp __x86_data_cache_size_half, %ecx
536# endif
537#endif
538 jb L(sh_2_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800539
Jack Renc47703a2012-02-14 12:01:52 +0400540 lea -64(%ecx), %ecx
541
542 .p2align 4
543L(Shl2LoopStart):
544 prefetcht0 0x1c0(%eax)
545 prefetcht0 0x1c0(%edx)
546 movaps 14(%eax), %xmm2
547 movaps 30(%eax), %xmm3
548 movaps 46(%eax), %xmm4
549 movaps 62(%eax), %xmm5
550 movaps %xmm5, %xmm7
551 palignr $2, %xmm4, %xmm5
552 palignr $2, %xmm3, %xmm4
553 movaps %xmm5, 48(%edx)
554 palignr $2, %xmm2, %xmm3
555 lea 64(%eax), %eax
556 palignr $2, %xmm1, %xmm2
557 movaps %xmm4, 32(%edx)
558 movaps %xmm3, 16(%edx)
559 movaps %xmm7, %xmm1
560 movaps %xmm2, (%edx)
561 lea 64(%edx), %edx
562 sub $64, %ecx
563 ja L(Shl2LoopStart)
564
565L(Shl2LoopLeave):
566 add $32, %ecx
567 jle L(shl_end_0)
568
569 movaps 14(%eax), %xmm2
570 movaps 30(%eax), %xmm3
571 palignr $2, %xmm2, %xmm3
572 palignr $2, %xmm1, %xmm2
573 movaps %xmm2, (%edx)
574 movaps %xmm3, 16(%edx)
575 lea 32(%edx, %ecx), %edx
576 lea 32(%eax, %ecx), %eax
577 POP (%edi)
578 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
579
580 CFI_PUSH (%edi)
581
582 .p2align 4
583L(sh_2_no_prefetch):
584 lea -32(%ecx), %ecx
585 lea -2(%eax), %eax
586 xor %edi, %edi
587
588 .p2align 4
589L(sh_2_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800590 movdqa 16(%eax, %edi), %xmm2
591 sub $32, %ecx
592 movdqa 32(%eax, %edi), %xmm3
593 movdqa %xmm3, %xmm4
594 palignr $2, %xmm2, %xmm3
595 palignr $2, %xmm1, %xmm2
596 lea 32(%edi), %edi
597 movdqa %xmm2, -32(%edx, %edi)
598 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +0400599 jb L(sh_2_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800600
601 movdqa 16(%eax, %edi), %xmm2
602 sub $32, %ecx
603 movdqa 32(%eax, %edi), %xmm3
604 movdqa %xmm3, %xmm1
605 palignr $2, %xmm2, %xmm3
606 palignr $2, %xmm4, %xmm2
607 lea 32(%edi), %edi
608 movdqa %xmm2, -32(%edx, %edi)
609 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +0400610 jae L(sh_2_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800611
Jack Renc47703a2012-02-14 12:01:52 +0400612L(sh_2_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800613 lea 32(%ecx), %ecx
614 add %ecx, %edi
615 add %edi, %edx
616 lea 2(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400617 POP (%edi)
618 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800619
Jack Renc47703a2012-02-14 12:01:52 +0400620 CFI_PUSH (%edi)
621
622 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800623L(shl_3):
Jack Renc47703a2012-02-14 12:01:52 +0400624#ifndef USE_AS_MEMMOVE
625 movaps -3(%eax), %xmm1
626#else
627 movl DEST+4(%esp), %edi
628 movaps -3(%eax), %xmm1
629 movdqu %xmm0, (%edi)
630#endif
631#ifdef DATA_CACHE_SIZE_HALF
632 cmp $DATA_CACHE_SIZE_HALF, %ecx
633#else
634# if (defined SHARED || defined __PIC__)
635 SETUP_PIC_REG(bx)
636 add $_GLOBAL_OFFSET_TABLE_, %ebx
637 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
638# else
639 cmp __x86_data_cache_size_half, %ecx
640# endif
641#endif
642 jb L(sh_3_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800643
Jack Renc47703a2012-02-14 12:01:52 +0400644 lea -64(%ecx), %ecx
645
646 .p2align 4
647L(Shl3LoopStart):
648 prefetcht0 0x1c0(%eax)
649 prefetcht0 0x1c0(%edx)
650 movaps 13(%eax), %xmm2
651 movaps 29(%eax), %xmm3
652 movaps 45(%eax), %xmm4
653 movaps 61(%eax), %xmm5
654 movaps %xmm5, %xmm7
655 palignr $3, %xmm4, %xmm5
656 palignr $3, %xmm3, %xmm4
657 movaps %xmm5, 48(%edx)
658 palignr $3, %xmm2, %xmm3
659 lea 64(%eax), %eax
660 palignr $3, %xmm1, %xmm2
661 movaps %xmm4, 32(%edx)
662 movaps %xmm3, 16(%edx)
663 movaps %xmm7, %xmm1
664 movaps %xmm2, (%edx)
665 lea 64(%edx), %edx
666 sub $64, %ecx
667 ja L(Shl3LoopStart)
668
669L(Shl3LoopLeave):
670 add $32, %ecx
671 jle L(shl_end_0)
672
673 movaps 13(%eax), %xmm2
674 movaps 29(%eax), %xmm3
675 palignr $3, %xmm2, %xmm3
676 palignr $3, %xmm1, %xmm2
677 movaps %xmm2, (%edx)
678 movaps %xmm3, 16(%edx)
679 lea 32(%edx, %ecx), %edx
680 lea 32(%eax, %ecx), %eax
681 POP (%edi)
682 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
683
684 CFI_PUSH (%edi)
685
686 .p2align 4
687L(sh_3_no_prefetch):
688 lea -32(%ecx), %ecx
689 lea -3(%eax), %eax
690 xor %edi, %edi
691
692 .p2align 4
693L(sh_3_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800694 movdqa 16(%eax, %edi), %xmm2
695 sub $32, %ecx
696 movdqa 32(%eax, %edi), %xmm3
697 movdqa %xmm3, %xmm4
698 palignr $3, %xmm2, %xmm3
699 palignr $3, %xmm1, %xmm2
700 lea 32(%edi), %edi
701 movdqa %xmm2, -32(%edx, %edi)
702 movdqa %xmm3, -16(%edx, %edi)
703
Jack Renc47703a2012-02-14 12:01:52 +0400704 jb L(sh_3_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800705
706 movdqa 16(%eax, %edi), %xmm2
707 sub $32, %ecx
708 movdqa 32(%eax, %edi), %xmm3
709 movdqa %xmm3, %xmm1
710 palignr $3, %xmm2, %xmm3
711 palignr $3, %xmm4, %xmm2
712 lea 32(%edi), %edi
713 movdqa %xmm2, -32(%edx, %edi)
714 movdqa %xmm3, -16(%edx, %edi)
715
Jack Renc47703a2012-02-14 12:01:52 +0400716 jae L(sh_3_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800717
Jack Renc47703a2012-02-14 12:01:52 +0400718L(sh_3_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800719 lea 32(%ecx), %ecx
720 add %ecx, %edi
721 add %edi, %edx
722 lea 3(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400723 POP (%edi)
724 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800725
Jack Renc47703a2012-02-14 12:01:52 +0400726 CFI_PUSH (%edi)
727
728 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800729L(shl_4):
Jack Renc47703a2012-02-14 12:01:52 +0400730#ifndef USE_AS_MEMMOVE
731 movaps -4(%eax), %xmm1
732#else
733 movl DEST+4(%esp), %edi
734 movaps -4(%eax), %xmm1
735 movdqu %xmm0, (%edi)
736#endif
737#ifdef DATA_CACHE_SIZE_HALF
738 cmp $DATA_CACHE_SIZE_HALF, %ecx
739#else
740# if (defined SHARED || defined __PIC__)
741 SETUP_PIC_REG(bx)
742 add $_GLOBAL_OFFSET_TABLE_, %ebx
743 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
744# else
745 cmp __x86_data_cache_size_half, %ecx
746# endif
747#endif
748 jb L(sh_4_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800749
Jack Renc47703a2012-02-14 12:01:52 +0400750 lea -64(%ecx), %ecx
751
752 .p2align 4
753L(Shl4LoopStart):
754 prefetcht0 0x1c0(%eax)
755 prefetcht0 0x1c0(%edx)
756 movaps 12(%eax), %xmm2
757 movaps 28(%eax), %xmm3
758 movaps 44(%eax), %xmm4
759 movaps 60(%eax), %xmm5
760 movaps %xmm5, %xmm7
761 palignr $4, %xmm4, %xmm5
762 palignr $4, %xmm3, %xmm4
763 movaps %xmm5, 48(%edx)
764 palignr $4, %xmm2, %xmm3
765 lea 64(%eax), %eax
766 palignr $4, %xmm1, %xmm2
767 movaps %xmm4, 32(%edx)
768 movaps %xmm3, 16(%edx)
769 movaps %xmm7, %xmm1
770 movaps %xmm2, (%edx)
771 lea 64(%edx), %edx
772 sub $64, %ecx
773 ja L(Shl4LoopStart)
774
775L(Shl4LoopLeave):
776 add $32, %ecx
777 jle L(shl_end_0)
778
779 movaps 12(%eax), %xmm2
780 movaps 28(%eax), %xmm3
781 palignr $4, %xmm2, %xmm3
782 palignr $4, %xmm1, %xmm2
783 movaps %xmm2, (%edx)
784 movaps %xmm3, 16(%edx)
785 lea 32(%edx, %ecx), %edx
786 lea 32(%eax, %ecx), %eax
787 POP (%edi)
788 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
789
790 CFI_PUSH (%edi)
791
792 .p2align 4
793L(sh_4_no_prefetch):
794 lea -32(%ecx), %ecx
795 lea -4(%eax), %eax
796 xor %edi, %edi
797
798 .p2align 4
799L(sh_4_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800800 movdqa 16(%eax, %edi), %xmm2
801 sub $32, %ecx
802 movdqa 32(%eax, %edi), %xmm3
803 movdqa %xmm3, %xmm4
804 palignr $4, %xmm2, %xmm3
805 palignr $4, %xmm1, %xmm2
806 lea 32(%edi), %edi
807 movdqa %xmm2, -32(%edx, %edi)
808 movdqa %xmm3, -16(%edx, %edi)
809
Jack Renc47703a2012-02-14 12:01:52 +0400810 jb L(sh_4_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800811
812 movdqa 16(%eax, %edi), %xmm2
813 sub $32, %ecx
814 movdqa 32(%eax, %edi), %xmm3
815 movdqa %xmm3, %xmm1
816 palignr $4, %xmm2, %xmm3
817 palignr $4, %xmm4, %xmm2
818 lea 32(%edi), %edi
819 movdqa %xmm2, -32(%edx, %edi)
820 movdqa %xmm3, -16(%edx, %edi)
821
Jack Renc47703a2012-02-14 12:01:52 +0400822 jae L(sh_4_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800823
Jack Renc47703a2012-02-14 12:01:52 +0400824L(sh_4_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800825 lea 32(%ecx), %ecx
826 add %ecx, %edi
827 add %edi, %edx
828 lea 4(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400829 POP (%edi)
830 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800831
Jack Renc47703a2012-02-14 12:01:52 +0400832 CFI_PUSH (%edi)
833
834 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800835L(shl_5):
Jack Renc47703a2012-02-14 12:01:52 +0400836#ifndef USE_AS_MEMMOVE
837 movaps -5(%eax), %xmm1
838#else
839 movl DEST+4(%esp), %edi
840 movaps -5(%eax), %xmm1
841 movdqu %xmm0, (%edi)
842#endif
843#ifdef DATA_CACHE_SIZE_HALF
844 cmp $DATA_CACHE_SIZE_HALF, %ecx
845#else
846# if (defined SHARED || defined __PIC__)
847 SETUP_PIC_REG(bx)
848 add $_GLOBAL_OFFSET_TABLE_, %ebx
849 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
850# else
851 cmp __x86_data_cache_size_half, %ecx
852# endif
853#endif
854 jb L(sh_5_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800855
Jack Renc47703a2012-02-14 12:01:52 +0400856 lea -64(%ecx), %ecx
857
858 .p2align 4
859L(Shl5LoopStart):
860 prefetcht0 0x1c0(%eax)
861 prefetcht0 0x1c0(%edx)
862 movaps 11(%eax), %xmm2
863 movaps 27(%eax), %xmm3
864 movaps 43(%eax), %xmm4
865 movaps 59(%eax), %xmm5
866 movaps %xmm5, %xmm7
867 palignr $5, %xmm4, %xmm5
868 palignr $5, %xmm3, %xmm4
869 movaps %xmm5, 48(%edx)
870 palignr $5, %xmm2, %xmm3
871 lea 64(%eax), %eax
872 palignr $5, %xmm1, %xmm2
873 movaps %xmm4, 32(%edx)
874 movaps %xmm3, 16(%edx)
875 movaps %xmm7, %xmm1
876 movaps %xmm2, (%edx)
877 lea 64(%edx), %edx
878 sub $64, %ecx
879 ja L(Shl5LoopStart)
880
881L(Shl5LoopLeave):
882 add $32, %ecx
883 jle L(shl_end_0)
884
885 movaps 11(%eax), %xmm2
886 movaps 27(%eax), %xmm3
887 palignr $5, %xmm2, %xmm3
888 palignr $5, %xmm1, %xmm2
889 movaps %xmm2, (%edx)
890 movaps %xmm3, 16(%edx)
891 lea 32(%edx, %ecx), %edx
892 lea 32(%eax, %ecx), %eax
893 POP (%edi)
894 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
895
896 CFI_PUSH (%edi)
897
898 .p2align 4
899L(sh_5_no_prefetch):
900 lea -32(%ecx), %ecx
901 lea -5(%eax), %eax
902 xor %edi, %edi
903
904 .p2align 4
905L(sh_5_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800906 movdqa 16(%eax, %edi), %xmm2
907 sub $32, %ecx
908 movdqa 32(%eax, %edi), %xmm3
909 movdqa %xmm3, %xmm4
910 palignr $5, %xmm2, %xmm3
911 palignr $5, %xmm1, %xmm2
912 lea 32(%edi), %edi
913 movdqa %xmm2, -32(%edx, %edi)
914 movdqa %xmm3, -16(%edx, %edi)
915
Jack Renc47703a2012-02-14 12:01:52 +0400916 jb L(sh_5_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800917
918 movdqa 16(%eax, %edi), %xmm2
919 sub $32, %ecx
920 movdqa 32(%eax, %edi), %xmm3
921 movdqa %xmm3, %xmm1
922 palignr $5, %xmm2, %xmm3
923 palignr $5, %xmm4, %xmm2
924 lea 32(%edi), %edi
925 movdqa %xmm2, -32(%edx, %edi)
926 movdqa %xmm3, -16(%edx, %edi)
927
Jack Renc47703a2012-02-14 12:01:52 +0400928 jae L(sh_5_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800929
Jack Renc47703a2012-02-14 12:01:52 +0400930L(sh_5_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800931 lea 32(%ecx), %ecx
932 add %ecx, %edi
933 add %edi, %edx
934 lea 5(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400935 POP (%edi)
936 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800937
Jack Renc47703a2012-02-14 12:01:52 +0400938 CFI_PUSH (%edi)
939
940 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800941L(shl_6):
Jack Renc47703a2012-02-14 12:01:52 +0400942#ifndef USE_AS_MEMMOVE
943 movaps -6(%eax), %xmm1
944#else
945 movl DEST+4(%esp), %edi
946 movaps -6(%eax), %xmm1
947 movdqu %xmm0, (%edi)
948#endif
949#ifdef DATA_CACHE_SIZE_HALF
950 cmp $DATA_CACHE_SIZE_HALF, %ecx
951#else
952# if (defined SHARED || defined __PIC__)
953 SETUP_PIC_REG(bx)
954 add $_GLOBAL_OFFSET_TABLE_, %ebx
955 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
956# else
957 cmp __x86_data_cache_size_half, %ecx
958# endif
959#endif
960 jb L(sh_6_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800961
Jack Renc47703a2012-02-14 12:01:52 +0400962 lea -64(%ecx), %ecx
963
964 .p2align 4
965L(Shl6LoopStart):
966 prefetcht0 0x1c0(%eax)
967 prefetcht0 0x1c0(%edx)
968 movaps 10(%eax), %xmm2
969 movaps 26(%eax), %xmm3
970 movaps 42(%eax), %xmm4
971 movaps 58(%eax), %xmm5
972 movaps %xmm5, %xmm7
973 palignr $6, %xmm4, %xmm5
974 palignr $6, %xmm3, %xmm4
975 movaps %xmm5, 48(%edx)
976 palignr $6, %xmm2, %xmm3
977 lea 64(%eax), %eax
978 palignr $6, %xmm1, %xmm2
979 movaps %xmm4, 32(%edx)
980 movaps %xmm3, 16(%edx)
981 movaps %xmm7, %xmm1
982 movaps %xmm2, (%edx)
983 lea 64(%edx), %edx
984 sub $64, %ecx
985 ja L(Shl6LoopStart)
986
987L(Shl6LoopLeave):
988 add $32, %ecx
989 jle L(shl_end_0)
990
991 movaps 10(%eax), %xmm2
992 movaps 26(%eax), %xmm3
993 palignr $6, %xmm2, %xmm3
994 palignr $6, %xmm1, %xmm2
995 movaps %xmm2, (%edx)
996 movaps %xmm3, 16(%edx)
997 lea 32(%edx, %ecx), %edx
998 lea 32(%eax, %ecx), %eax
999 POP (%edi)
1000 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1001
1002 CFI_PUSH (%edi)
1003
1004 .p2align 4
1005L(sh_6_no_prefetch):
1006 lea -32(%ecx), %ecx
1007 lea -6(%eax), %eax
1008 xor %edi, %edi
1009
1010 .p2align 4
1011L(sh_6_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001012 movdqa 16(%eax, %edi), %xmm2
1013 sub $32, %ecx
1014 movdqa 32(%eax, %edi), %xmm3
1015 movdqa %xmm3, %xmm4
1016 palignr $6, %xmm2, %xmm3
1017 palignr $6, %xmm1, %xmm2
1018 lea 32(%edi), %edi
1019 movdqa %xmm2, -32(%edx, %edi)
1020 movdqa %xmm3, -16(%edx, %edi)
1021
Jack Renc47703a2012-02-14 12:01:52 +04001022 jb L(sh_6_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001023
1024 movdqa 16(%eax, %edi), %xmm2
1025 sub $32, %ecx
1026 movdqa 32(%eax, %edi), %xmm3
1027 movdqa %xmm3, %xmm1
1028 palignr $6, %xmm2, %xmm3
1029 palignr $6, %xmm4, %xmm2
1030 lea 32(%edi), %edi
1031 movdqa %xmm2, -32(%edx, %edi)
1032 movdqa %xmm3, -16(%edx, %edi)
1033
Jack Renc47703a2012-02-14 12:01:52 +04001034 jae L(sh_6_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001035
Jack Renc47703a2012-02-14 12:01:52 +04001036L(sh_6_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001037 lea 32(%ecx), %ecx
1038 add %ecx, %edi
1039 add %edi, %edx
1040 lea 6(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001041 POP (%edi)
1042 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001043
Jack Renc47703a2012-02-14 12:01:52 +04001044 CFI_PUSH (%edi)
1045
1046 .p2align 4
/*
 * L(shl_7): forward copy that realigns the source with `palignr $7`.
 * The aligned loads at %eax-7 / %eax+9 and the aligned stores at %edx imply
 * that on entry the source is 7 bytes past a 16-byte boundary and the
 * destination is 16-byte aligned.
 * Registers as used below: %eax = source cursor, %edx = destination cursor,
 * %ecx = byte count still to copy, %xmm1 = previously loaded aligned source
 * block (supplies the low bytes of the next output block).
 * NOTE(review): the dispatch into this label and the setup of %xmm0 / the
 * pushed %edi are outside this excerpt -- confirm the entry conditions there.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001047L(shl_7):
/* Prime %xmm1 with the aligned block that contains the first source byte.
   The memmove build additionally reloads DEST into %edi and writes %xmm0
   there with an unaligned store. */
Jack Renc47703a2012-02-14 12:01:52 +04001048#ifndef USE_AS_MEMMOVE
1049 movaps -7(%eax), %xmm1
1050#else
1051 movl DEST+4(%esp), %edi
1052 movaps -7(%eax), %xmm1
1053 movdqu %xmm0, (%edi)
1054#endif
/* Choose the loop: a count below half the data-cache size (unsigned compare
   against the compile-time constant or the runtime variable) skips the
   prefetching loop. */
1055#ifdef DATA_CACHE_SIZE_HALF
1056 cmp $DATA_CACHE_SIZE_HALF, %ecx
1057#else
1058# if (defined SHARED || defined __PIC__)
1059 SETUP_PIC_REG(bx)
1060 add $_GLOBAL_OFFSET_TABLE_, %ebx
1061 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1062# else
1063 cmp __x86_data_cache_size_half, %ecx
1064# endif
1065#endif
1066 jb L(sh_7_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001067
/* Pre-bias the count by one 64-byte pass so the loop can test it with
   sub/ja. */
Jack Renc47703a2012-02-14 12:01:52 +04001068 lea -64(%ecx), %ecx
1069
1070 .p2align 4
/* Main loop: 64 bytes per pass, prefetching 0x1c0 bytes ahead of both
   cursors. Four aligned blocks are loaded; each palignr merges a block with
   its predecessor and shifts right by 7 bytes. %xmm7 keeps an unshifted
   copy of the last block so it can become %xmm1 for the next pass. */
1071L(Shl7LoopStart):
1072 prefetcht0 0x1c0(%eax)
1073 prefetcht0 0x1c0(%edx)
1074 movaps 9(%eax), %xmm2
1075 movaps 25(%eax), %xmm3
1076 movaps 41(%eax), %xmm4
1077 movaps 57(%eax), %xmm5
1078 movaps %xmm5, %xmm7
1079 palignr $7, %xmm4, %xmm5
1080 palignr $7, %xmm3, %xmm4
1081 movaps %xmm5, 48(%edx)
1082 palignr $7, %xmm2, %xmm3
1083 lea 64(%eax), %eax
1084 palignr $7, %xmm1, %xmm2
1085 movaps %xmm4, 32(%edx)
1086 movaps %xmm3, 16(%edx)
1087 movaps %xmm7, %xmm1
1088 movaps %xmm2, (%edx)
1089 lea 64(%edx), %edx
1090 sub $64, %ecx
1091 ja L(Shl7LoopStart)
1092
/* Give back half of the 64-byte bias. A result <= 0 (signed) leaves through
   L(shl_end_0); otherwise one more 32-byte group is copied here. */
1093L(Shl7LoopLeave):
1094 add $32, %ecx
1095 jle L(shl_end_0)
1096
1097 movaps 9(%eax), %xmm2
1098 movaps 25(%eax), %xmm3
1099 palignr $7, %xmm2, %xmm3
1100 palignr $7, %xmm1, %xmm2
1101 movaps %xmm2, (%edx)
1102 movaps %xmm3, 16(%edx)
/* Advance both cursors by 32 + %ecx, then POP %edi and dispatch on %ecx
   through L(table_48bytes_fwd). The POP and BRANCH_TO_JMPTBL_ENTRY macros
   are defined outside this excerpt. */
1103 lea 32(%edx, %ecx), %edx
1104 lea 32(%eax, %ecx), %eax
1105 POP (%edi)
1106 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1107
/* NOTE(review): CFI_PUSH presumably emits unwind information only,
   re-declaring %edi as pushed for the code below, which is entered by a
   jump taken before the POP above -- confirm against the macro. */
1108 CFI_PUSH (%edi)
1109
1110 .p2align 4
/* No-prefetch variant: pre-bias the count by 32, move %eax back by 7 onto
   the aligned block already held in %xmm1, and use %edi as a running byte
   offset applied to both buffers. */
1111L(sh_7_no_prefetch):
1112 lea -32(%ecx), %ecx
1113 lea -7(%eax), %eax
1114 xor %edi, %edi
1115
1116 .p2align 4
/* Unrolled twice, 32 bytes per half. The carried block alternates between
   %xmm1 (first half) and %xmm4 (second half). Each branch consumes the
   carry flag left by `sub $32, %ecx`; the lea, SSE moves and palignr in
   between do not modify flags. */
1117L(sh_7_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001118 movdqa 16(%eax, %edi), %xmm2
1119 sub $32, %ecx
1120 movdqa 32(%eax, %edi), %xmm3
1121 movdqa %xmm3, %xmm4
1122 palignr $7, %xmm2, %xmm3
1123 palignr $7, %xmm1, %xmm2
1124 lea 32(%edi), %edi
1125 movdqa %xmm2, -32(%edx, %edi)
1126 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001127 jb L(sh_7_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001128
1129 movdqa 16(%eax, %edi), %xmm2
1130 sub $32, %ecx
1131 movdqa 32(%eax, %edi), %xmm3
1132 movdqa %xmm3, %xmm1
1133 palignr $7, %xmm2, %xmm3
1134 palignr $7, %xmm4, %xmm2
1135 lea 32(%edi), %edi
1136 movdqa %xmm2, -32(%edx, %edi)
1137 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001138 jae L(sh_7_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001139
/* Remove the 32-byte bias, fold the leftover count into the offset, apply
   it to both cursors (re-adding the 7-byte source skew), then POP %edi and
   dispatch the tail on %ecx. */
Jack Renc47703a2012-02-14 12:01:52 +04001140L(sh_7_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001141 lea 32(%ecx), %ecx
1142 add %ecx, %edi
1143 add %edi, %edx
1144 lea 7(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001145 POP (%edi)
1146 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001147
/* NOTE(review): presumably restores the "%edi pushed" unwind state for the
   code that follows -- confirm against the CFI_PUSH macro. */
Jack Renc47703a2012-02-14 12:01:52 +04001148 CFI_PUSH (%edi)
1149
1150 .p2align 4
/*
 * L(shl_8): forward copy that realigns the source with `palignr $8`.
 * The aligned loads at %eax-8 / %eax+8 and the aligned stores at %edx imply
 * that on entry the source is 8 bytes past a 16-byte boundary and the
 * destination is 16-byte aligned.
 * Registers as used below: %eax = source cursor, %edx = destination cursor,
 * %ecx = byte count still to copy, %xmm1 = previously loaded aligned source
 * block (supplies the low bytes of the next output block).
 * NOTE(review): the dispatch into this label and the setup of %xmm0 / the
 * pushed %edi are outside this excerpt -- confirm the entry conditions there.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001151L(shl_8):
/* Prime %xmm1 with the aligned block that contains the first source byte.
   The memmove build additionally reloads DEST into %edi and writes %xmm0
   there with an unaligned store. */
Jack Renc47703a2012-02-14 12:01:52 +04001152#ifndef USE_AS_MEMMOVE
1153 movaps -8(%eax), %xmm1
1154#else
1155 movl DEST+4(%esp), %edi
1156 movaps -8(%eax), %xmm1
1157 movdqu %xmm0, (%edi)
1158#endif
/* Choose the loop: a count below half the data-cache size (unsigned compare
   against the compile-time constant or the runtime variable) skips the
   prefetching loop. */
1159#ifdef DATA_CACHE_SIZE_HALF
1160 cmp $DATA_CACHE_SIZE_HALF, %ecx
1161#else
1162# if (defined SHARED || defined __PIC__)
1163 SETUP_PIC_REG(bx)
1164 add $_GLOBAL_OFFSET_TABLE_, %ebx
1165 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1166# else
1167 cmp __x86_data_cache_size_half, %ecx
1168# endif
1169#endif
1170 jb L(sh_8_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001171
/* Pre-bias the count by one 64-byte pass so the loop can test it with
   sub/ja. */
Jack Renc47703a2012-02-14 12:01:52 +04001172 lea -64(%ecx), %ecx
1173
1174 .p2align 4
/* Main loop: 64 bytes per pass, prefetching 0x1c0 bytes ahead of both
   cursors. Four aligned blocks are loaded; each palignr merges a block with
   its predecessor and shifts right by 8 bytes. %xmm7 keeps an unshifted
   copy of the last block so it can become %xmm1 for the next pass. */
1175L(Shl8LoopStart):
1176 prefetcht0 0x1c0(%eax)
1177 prefetcht0 0x1c0(%edx)
1178 movaps 8(%eax), %xmm2
1179 movaps 24(%eax), %xmm3
1180 movaps 40(%eax), %xmm4
1181 movaps 56(%eax), %xmm5
1182 movaps %xmm5, %xmm7
1183 palignr $8, %xmm4, %xmm5
1184 palignr $8, %xmm3, %xmm4
1185 movaps %xmm5, 48(%edx)
1186 palignr $8, %xmm2, %xmm3
1187 lea 64(%eax), %eax
1188 palignr $8, %xmm1, %xmm2
1189 movaps %xmm4, 32(%edx)
1190 movaps %xmm3, 16(%edx)
1191 movaps %xmm7, %xmm1
1192 movaps %xmm2, (%edx)
1193 lea 64(%edx), %edx
1194 sub $64, %ecx
1195 ja L(Shl8LoopStart)
1196
/* Give back half of the 64-byte bias. A result <= 0 (signed) leaves through
   L(shl_end_0); otherwise one more 32-byte group is copied here.
   NOTE(review): this label is spelled LoopLeave8 while the sibling variants
   use ShlNLoopLeave; left unchanged because references outside this excerpt
   cannot be ruled out. */
1197L(LoopLeave8):
1198 add $32, %ecx
1199 jle L(shl_end_0)
1200
1201 movaps 8(%eax), %xmm2
1202 movaps 24(%eax), %xmm3
1203 palignr $8, %xmm2, %xmm3
1204 palignr $8, %xmm1, %xmm2
1205 movaps %xmm2, (%edx)
1206 movaps %xmm3, 16(%edx)
/* Advance both cursors by 32 + %ecx, then POP %edi and dispatch on %ecx
   through L(table_48bytes_fwd). The POP and BRANCH_TO_JMPTBL_ENTRY macros
   are defined outside this excerpt. */
1207 lea 32(%edx, %ecx), %edx
1208 lea 32(%eax, %ecx), %eax
1209 POP (%edi)
1210 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1211
/* NOTE(review): CFI_PUSH presumably emits unwind information only,
   re-declaring %edi as pushed for the code below, which is entered by a
   jump taken before the POP above -- confirm against the macro. */
1212 CFI_PUSH (%edi)
1213
1214 .p2align 4
/* No-prefetch variant: pre-bias the count by 32, move %eax back by 8 onto
   the aligned block already held in %xmm1, and use %edi as a running byte
   offset applied to both buffers. */
1215L(sh_8_no_prefetch):
1216 lea -32(%ecx), %ecx
1217 lea -8(%eax), %eax
1218 xor %edi, %edi
1219
1220 .p2align 4
/* Unrolled twice, 32 bytes per half. The carried block alternates between
   %xmm1 (first half) and %xmm4 (second half). Each branch consumes the
   carry flag left by `sub $32, %ecx`; the lea, SSE moves and palignr in
   between do not modify flags. */
1221L(sh_8_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001222 movdqa 16(%eax, %edi), %xmm2
1223 sub $32, %ecx
1224 movdqa 32(%eax, %edi), %xmm3
1225 movdqa %xmm3, %xmm4
1226 palignr $8, %xmm2, %xmm3
1227 palignr $8, %xmm1, %xmm2
1228 lea 32(%edi), %edi
1229 movdqa %xmm2, -32(%edx, %edi)
1230 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001231 jb L(sh_8_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001232
1233 movdqa 16(%eax, %edi), %xmm2
1234 sub $32, %ecx
1235 movdqa 32(%eax, %edi), %xmm3
1236 movdqa %xmm3, %xmm1
1237 palignr $8, %xmm2, %xmm3
1238 palignr $8, %xmm4, %xmm2
1239 lea 32(%edi), %edi
1240 movdqa %xmm2, -32(%edx, %edi)
1241 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001242 jae L(sh_8_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001243
/* Remove the 32-byte bias, fold the leftover count into the offset, apply
   it to both cursors (re-adding the 8-byte source skew), then POP %edi and
   dispatch the tail on %ecx. */
Jack Renc47703a2012-02-14 12:01:52 +04001244L(sh_8_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001245 lea 32(%ecx), %ecx
1246 add %ecx, %edi
1247 add %edi, %edx
1248 lea 8(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001249 POP (%edi)
1250 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001251
/* NOTE(review): presumably restores the "%edi pushed" unwind state for the
   code that follows -- confirm against the CFI_PUSH macro. */
Jack Renc47703a2012-02-14 12:01:52 +04001252 CFI_PUSH (%edi)
1253
1254 .p2align 4
/*
 * L(shl_9): forward copy that realigns the source with `palignr $9`.
 * The aligned loads at %eax-9 / %eax+7 and the aligned stores at %edx imply
 * that on entry the source is 9 bytes past a 16-byte boundary and the
 * destination is 16-byte aligned.
 * Registers as used below: %eax = source cursor, %edx = destination cursor,
 * %ecx = byte count still to copy, %xmm1 = previously loaded aligned source
 * block (supplies the low bytes of the next output block).
 * NOTE(review): the dispatch into this label and the setup of %xmm0 / the
 * pushed %edi are outside this excerpt -- confirm the entry conditions there.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001255L(shl_9):
/* Prime %xmm1 with the aligned block that contains the first source byte.
   The memmove build additionally reloads DEST into %edi and writes %xmm0
   there with an unaligned store. */
Jack Renc47703a2012-02-14 12:01:52 +04001256#ifndef USE_AS_MEMMOVE
1257 movaps -9(%eax), %xmm1
1258#else
1259 movl DEST+4(%esp), %edi
1260 movaps -9(%eax), %xmm1
1261 movdqu %xmm0, (%edi)
1262#endif
/* Choose the loop: a count below half the data-cache size (unsigned compare
   against the compile-time constant or the runtime variable) skips the
   prefetching loop. */
1263#ifdef DATA_CACHE_SIZE_HALF
1264 cmp $DATA_CACHE_SIZE_HALF, %ecx
1265#else
1266# if (defined SHARED || defined __PIC__)
1267 SETUP_PIC_REG(bx)
1268 add $_GLOBAL_OFFSET_TABLE_, %ebx
1269 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1270# else
1271 cmp __x86_data_cache_size_half, %ecx
1272# endif
1273#endif
1274 jb L(sh_9_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001275
/* Pre-bias the count by one 64-byte pass so the loop can test it with
   sub/ja. */
Jack Renc47703a2012-02-14 12:01:52 +04001276 lea -64(%ecx), %ecx
1277
1278 .p2align 4
/* Main loop: 64 bytes per pass, prefetching 0x1c0 bytes ahead of both
   cursors. Four aligned blocks are loaded; each palignr merges a block with
   its predecessor and shifts right by 9 bytes. %xmm7 keeps an unshifted
   copy of the last block so it can become %xmm1 for the next pass. */
1279L(Shl9LoopStart):
1280 prefetcht0 0x1c0(%eax)
1281 prefetcht0 0x1c0(%edx)
1282 movaps 7(%eax), %xmm2
1283 movaps 23(%eax), %xmm3
1284 movaps 39(%eax), %xmm4
1285 movaps 55(%eax), %xmm5
1286 movaps %xmm5, %xmm7
1287 palignr $9, %xmm4, %xmm5
1288 palignr $9, %xmm3, %xmm4
1289 movaps %xmm5, 48(%edx)
1290 palignr $9, %xmm2, %xmm3
1291 lea 64(%eax), %eax
1292 palignr $9, %xmm1, %xmm2
1293 movaps %xmm4, 32(%edx)
1294 movaps %xmm3, 16(%edx)
1295 movaps %xmm7, %xmm1
1296 movaps %xmm2, (%edx)
1297 lea 64(%edx), %edx
1298 sub $64, %ecx
1299 ja L(Shl9LoopStart)
1300
/* Give back half of the 64-byte bias. A result <= 0 (signed) leaves through
   L(shl_end_0); otherwise one more 32-byte group is copied here. */
1301L(Shl9LoopLeave):
1302 add $32, %ecx
1303 jle L(shl_end_0)
1304
1305 movaps 7(%eax), %xmm2
1306 movaps 23(%eax), %xmm3
1307 palignr $9, %xmm2, %xmm3
1308 palignr $9, %xmm1, %xmm2
1309
1310 movaps %xmm2, (%edx)
1311 movaps %xmm3, 16(%edx)
/* Advance both cursors by 32 + %ecx, then POP %edi and dispatch on %ecx
   through L(table_48bytes_fwd). The POP and BRANCH_TO_JMPTBL_ENTRY macros
   are defined outside this excerpt. */
1312 lea 32(%edx, %ecx), %edx
1313 lea 32(%eax, %ecx), %eax
1314 POP (%edi)
1315 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1316
/* NOTE(review): CFI_PUSH presumably emits unwind information only,
   re-declaring %edi as pushed for the code below, which is entered by a
   jump taken before the POP above -- confirm against the macro. */
1317 CFI_PUSH (%edi)
1318
1319 .p2align 4
/* No-prefetch variant: pre-bias the count by 32, move %eax back by 9 onto
   the aligned block already held in %xmm1, and use %edi as a running byte
   offset applied to both buffers. */
1320L(sh_9_no_prefetch):
1321 lea -32(%ecx), %ecx
1322 lea -9(%eax), %eax
1323 xor %edi, %edi
1324
1325 .p2align 4
/* Unrolled twice, 32 bytes per half. The carried block alternates between
   %xmm1 (first half) and %xmm4 (second half). Each branch consumes the
   carry flag left by `sub $32, %ecx`; the lea, SSE moves and palignr in
   between do not modify flags. */
1326L(sh_9_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001327 movdqa 16(%eax, %edi), %xmm2
1328 sub $32, %ecx
1329 movdqa 32(%eax, %edi), %xmm3
1330 movdqa %xmm3, %xmm4
1331 palignr $9, %xmm2, %xmm3
1332 palignr $9, %xmm1, %xmm2
1333 lea 32(%edi), %edi
1334 movdqa %xmm2, -32(%edx, %edi)
1335 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001336 jb L(sh_9_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001337
1338 movdqa 16(%eax, %edi), %xmm2
1339 sub $32, %ecx
1340 movdqa 32(%eax, %edi), %xmm3
1341 movdqa %xmm3, %xmm1
1342 palignr $9, %xmm2, %xmm3
1343 palignr $9, %xmm4, %xmm2
1344 lea 32(%edi), %edi
1345 movdqa %xmm2, -32(%edx, %edi)
1346 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001347 jae L(sh_9_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001348
/* Remove the 32-byte bias, fold the leftover count into the offset, apply
   it to both cursors (re-adding the 9-byte source skew), then POP %edi and
   dispatch the tail on %ecx. */
Jack Renc47703a2012-02-14 12:01:52 +04001349L(sh_9_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001350 lea 32(%ecx), %ecx
1351 add %ecx, %edi
1352 add %edi, %edx
1353 lea 9(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001354 POP (%edi)
1355 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001356
/* NOTE(review): presumably restores the "%edi pushed" unwind state for the
   code that follows -- confirm against the CFI_PUSH macro. */
Jack Renc47703a2012-02-14 12:01:52 +04001357 CFI_PUSH (%edi)
1358
1359 .p2align 4
/*
 * L(shl_10): forward copy that realigns the source with `palignr $10`.
 * The aligned loads at %eax-10 / %eax+6 and the aligned stores at %edx imply
 * that on entry the source is 10 bytes past a 16-byte boundary and the
 * destination is 16-byte aligned.
 * Registers as used below: %eax = source cursor, %edx = destination cursor,
 * %ecx = byte count still to copy, %xmm1 = previously loaded aligned source
 * block (supplies the low bytes of the next output block).
 * NOTE(review): the dispatch into this label and the setup of %xmm0 / the
 * pushed %edi are outside this excerpt -- confirm the entry conditions there.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001360L(shl_10):
/* Prime %xmm1 with the aligned block that contains the first source byte.
   The memmove build additionally reloads DEST into %edi and writes %xmm0
   there with an unaligned store. */
Jack Renc47703a2012-02-14 12:01:52 +04001361#ifndef USE_AS_MEMMOVE
1362 movaps -10(%eax), %xmm1
1363#else
1364 movl DEST+4(%esp), %edi
1365 movaps -10(%eax), %xmm1
1366 movdqu %xmm0, (%edi)
1367#endif
/* Choose the loop: a count below half the data-cache size (unsigned compare
   against the compile-time constant or the runtime variable) skips the
   prefetching loop. */
1368#ifdef DATA_CACHE_SIZE_HALF
1369 cmp $DATA_CACHE_SIZE_HALF, %ecx
1370#else
1371# if (defined SHARED || defined __PIC__)
1372 SETUP_PIC_REG(bx)
1373 add $_GLOBAL_OFFSET_TABLE_, %ebx
1374 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1375# else
1376 cmp __x86_data_cache_size_half, %ecx
1377# endif
1378#endif
1379 jb L(sh_10_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001380
/* Pre-bias the count by one 64-byte pass so the loop can test it with
   sub/ja. */
Jack Renc47703a2012-02-14 12:01:52 +04001381 lea -64(%ecx), %ecx
1382
1383 .p2align 4
/* Main loop: 64 bytes per pass, prefetching 0x1c0 bytes ahead of both
   cursors. Four aligned blocks are loaded; each palignr merges a block with
   its predecessor and shifts right by 10 bytes. %xmm7 keeps an unshifted
   copy of the last block so it can become %xmm1 for the next pass. */
1384L(Shl10LoopStart):
1385 prefetcht0 0x1c0(%eax)
1386 prefetcht0 0x1c0(%edx)
1387 movaps 6(%eax), %xmm2
1388 movaps 22(%eax), %xmm3
1389 movaps 38(%eax), %xmm4
1390 movaps 54(%eax), %xmm5
1391 movaps %xmm5, %xmm7
1392 palignr $10, %xmm4, %xmm5
1393 palignr $10, %xmm3, %xmm4
1394 movaps %xmm5, 48(%edx)
1395 palignr $10, %xmm2, %xmm3
1396 lea 64(%eax), %eax
1397 palignr $10, %xmm1, %xmm2
1398 movaps %xmm4, 32(%edx)
1399 movaps %xmm3, 16(%edx)
1400 movaps %xmm7, %xmm1
1401 movaps %xmm2, (%edx)
1402 lea 64(%edx), %edx
1403 sub $64, %ecx
1404 ja L(Shl10LoopStart)
1405
/* Give back half of the 64-byte bias. A result <= 0 (signed) leaves through
   L(shl_end_0); otherwise one more 32-byte group is copied here. */
1406L(Shl10LoopLeave):
1407 add $32, %ecx
1408 jle L(shl_end_0)
1409
1410 movaps 6(%eax), %xmm2
1411 movaps 22(%eax), %xmm3
1412 palignr $10, %xmm2, %xmm3
1413 palignr $10, %xmm1, %xmm2
1414
1415 movaps %xmm2, (%edx)
1416 movaps %xmm3, 16(%edx)
/* Advance both cursors by 32 + %ecx, then POP %edi and dispatch on %ecx
   through L(table_48bytes_fwd). The POP and BRANCH_TO_JMPTBL_ENTRY macros
   are defined outside this excerpt. */
1417 lea 32(%edx, %ecx), %edx
1418 lea 32(%eax, %ecx), %eax
1419 POP (%edi)
1420 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1421
/* NOTE(review): CFI_PUSH presumably emits unwind information only,
   re-declaring %edi as pushed for the code below, which is entered by a
   jump taken before the POP above -- confirm against the macro. */
1422 CFI_PUSH (%edi)
1423
1424 .p2align 4
/* No-prefetch variant: pre-bias the count by 32, move %eax back by 10 onto
   the aligned block already held in %xmm1, and use %edi as a running byte
   offset applied to both buffers. */
1425L(sh_10_no_prefetch):
1426 lea -32(%ecx), %ecx
1427 lea -10(%eax), %eax
1428 xor %edi, %edi
1429
1430 .p2align 4
/* Unrolled twice, 32 bytes per half. The carried block alternates between
   %xmm1 (first half) and %xmm4 (second half). Each branch consumes the
   carry flag left by `sub $32, %ecx`; the lea, SSE moves and palignr in
   between do not modify flags. */
1431L(sh_10_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001432 movdqa 16(%eax, %edi), %xmm2
1433 sub $32, %ecx
1434 movdqa 32(%eax, %edi), %xmm3
1435 movdqa %xmm3, %xmm4
1436 palignr $10, %xmm2, %xmm3
1437 palignr $10, %xmm1, %xmm2
1438 lea 32(%edi), %edi
1439 movdqa %xmm2, -32(%edx, %edi)
1440 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001441 jb L(sh_10_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001442
1443 movdqa 16(%eax, %edi), %xmm2
1444 sub $32, %ecx
1445 movdqa 32(%eax, %edi), %xmm3
1446 movdqa %xmm3, %xmm1
1447 palignr $10, %xmm2, %xmm3
1448 palignr $10, %xmm4, %xmm2
1449 lea 32(%edi), %edi
1450 movdqa %xmm2, -32(%edx, %edi)
1451 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001452 jae L(sh_10_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001453
/* Remove the 32-byte bias, fold the leftover count into the offset, apply
   it to both cursors (re-adding the 10-byte source skew), then POP %edi and
   dispatch the tail on %ecx. */
Jack Renc47703a2012-02-14 12:01:52 +04001454L(sh_10_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001455 lea 32(%ecx), %ecx
1456 add %ecx, %edi
1457 add %edi, %edx
1458 lea 10(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001459 POP (%edi)
1460 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001461
/* NOTE(review): presumably restores the "%edi pushed" unwind state for the
   code that follows -- confirm against the CFI_PUSH macro. */
Jack Renc47703a2012-02-14 12:01:52 +04001462 CFI_PUSH (%edi)
1463
1464 .p2align 4
/*
 * L(shl_11): forward copy that realigns the source with `palignr $11`.
 * The aligned loads at %eax-11 / %eax+5 and the aligned stores at %edx imply
 * that on entry the source is 11 bytes past a 16-byte boundary and the
 * destination is 16-byte aligned.
 * Registers as used below: %eax = source cursor, %edx = destination cursor,
 * %ecx = byte count still to copy, %xmm1 = previously loaded aligned source
 * block (supplies the low bytes of the next output block).
 * NOTE(review): the dispatch into this label and the setup of %xmm0 / the
 * pushed %edi are outside this excerpt -- confirm the entry conditions there.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001465L(shl_11):
/* Prime %xmm1 with the aligned block that contains the first source byte.
   The memmove build additionally reloads DEST into %edi and writes %xmm0
   there with an unaligned store. */
Jack Renc47703a2012-02-14 12:01:52 +04001466#ifndef USE_AS_MEMMOVE
1467 movaps -11(%eax), %xmm1
1468#else
1469 movl DEST+4(%esp), %edi
1470 movaps -11(%eax), %xmm1
1471 movdqu %xmm0, (%edi)
1472#endif
/* Choose the loop: a count below half the data-cache size (unsigned compare
   against the compile-time constant or the runtime variable) skips the
   prefetching loop. */
1473#ifdef DATA_CACHE_SIZE_HALF
1474 cmp $DATA_CACHE_SIZE_HALF, %ecx
1475#else
1476# if (defined SHARED || defined __PIC__)
1477 SETUP_PIC_REG(bx)
1478 add $_GLOBAL_OFFSET_TABLE_, %ebx
1479 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1480# else
1481 cmp __x86_data_cache_size_half, %ecx
1482# endif
1483#endif
1484 jb L(sh_11_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001485
/* Pre-bias the count by one 64-byte pass so the loop can test it with
   sub/ja. */
Jack Renc47703a2012-02-14 12:01:52 +04001486 lea -64(%ecx), %ecx
1487
1488 .p2align 4
/* Main loop: 64 bytes per pass, prefetching 0x1c0 bytes ahead of both
   cursors. Four aligned blocks are loaded; each palignr merges a block with
   its predecessor and shifts right by 11 bytes. %xmm7 keeps an unshifted
   copy of the last block so it can become %xmm1 for the next pass. */
1489L(Shl11LoopStart):
1490 prefetcht0 0x1c0(%eax)
1491 prefetcht0 0x1c0(%edx)
1492 movaps 5(%eax), %xmm2
1493 movaps 21(%eax), %xmm3
1494 movaps 37(%eax), %xmm4
1495 movaps 53(%eax), %xmm5
1496 movaps %xmm5, %xmm7
1497 palignr $11, %xmm4, %xmm5
1498 palignr $11, %xmm3, %xmm4
1499 movaps %xmm5, 48(%edx)
1500 palignr $11, %xmm2, %xmm3
1501 lea 64(%eax), %eax
1502 palignr $11, %xmm1, %xmm2
1503 movaps %xmm4, 32(%edx)
1504 movaps %xmm3, 16(%edx)
1505 movaps %xmm7, %xmm1
1506 movaps %xmm2, (%edx)
1507 lea 64(%edx), %edx
1508 sub $64, %ecx
1509 ja L(Shl11LoopStart)
1510
/* Give back half of the 64-byte bias. A result <= 0 (signed) leaves through
   L(shl_end_0); otherwise one more 32-byte group is copied here. */
1511L(Shl11LoopLeave):
1512 add $32, %ecx
1513 jle L(shl_end_0)
1514
1515 movaps 5(%eax), %xmm2
1516 movaps 21(%eax), %xmm3
1517 palignr $11, %xmm2, %xmm3
1518 palignr $11, %xmm1, %xmm2
1519
1520 movaps %xmm2, (%edx)
1521 movaps %xmm3, 16(%edx)
/* Advance both cursors by 32 + %ecx, then POP %edi and dispatch on %ecx
   through L(table_48bytes_fwd). The POP and BRANCH_TO_JMPTBL_ENTRY macros
   are defined outside this excerpt. */
1522 lea 32(%edx, %ecx), %edx
1523 lea 32(%eax, %ecx), %eax
1524 POP (%edi)
1525 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1526
/* NOTE(review): CFI_PUSH presumably emits unwind information only,
   re-declaring %edi as pushed for the code below, which is entered by a
   jump taken before the POP above -- confirm against the macro. */
1527 CFI_PUSH (%edi)
1528
1529 .p2align 4
/* No-prefetch variant: pre-bias the count by 32, move %eax back by 11 onto
   the aligned block already held in %xmm1, and use %edi as a running byte
   offset applied to both buffers. */
1530L(sh_11_no_prefetch):
1531 lea -32(%ecx), %ecx
1532 lea -11(%eax), %eax
1533 xor %edi, %edi
1534
1535 .p2align 4
/* Unrolled twice, 32 bytes per half. The carried block alternates between
   %xmm1 (first half) and %xmm4 (second half). Each branch consumes the
   carry flag left by `sub $32, %ecx`; the lea, SSE moves and palignr in
   between do not modify flags. */
1536L(sh_11_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001537 movdqa 16(%eax, %edi), %xmm2
1538 sub $32, %ecx
1539 movdqa 32(%eax, %edi), %xmm3
1540 movdqa %xmm3, %xmm4
1541 palignr $11, %xmm2, %xmm3
1542 palignr $11, %xmm1, %xmm2
1543 lea 32(%edi), %edi
1544 movdqa %xmm2, -32(%edx, %edi)
1545 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001546 jb L(sh_11_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001547
1548 movdqa 16(%eax, %edi), %xmm2
1549 sub $32, %ecx
1550 movdqa 32(%eax, %edi), %xmm3
1551 movdqa %xmm3, %xmm1
1552 palignr $11, %xmm2, %xmm3
1553 palignr $11, %xmm4, %xmm2
1554 lea 32(%edi), %edi
1555 movdqa %xmm2, -32(%edx, %edi)
1556 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001557 jae L(sh_11_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001558
/* Remove the 32-byte bias, fold the leftover count into the offset, apply
   it to both cursors (re-adding the 11-byte source skew), then POP %edi and
   dispatch the tail on %ecx. */
Jack Renc47703a2012-02-14 12:01:52 +04001559L(sh_11_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001560 lea 32(%ecx), %ecx
1561 add %ecx, %edi
1562 add %edi, %edx
1563 lea 11(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001564 POP (%edi)
1565 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001566
/* NOTE(review): presumably restores the "%edi pushed" unwind state for the
   code that follows -- confirm against the CFI_PUSH macro. */
Jack Renc47703a2012-02-14 12:01:52 +04001567 CFI_PUSH (%edi)
1568
1569 .p2align 4
/*
 * L(shl_12): forward copy that realigns the source with `palignr $12`.
 * The aligned loads at %eax-12 / %eax+4 and the aligned stores at %edx imply
 * that on entry the source is 12 bytes past a 16-byte boundary and the
 * destination is 16-byte aligned.
 * Registers as used below: %eax = source cursor, %edx = destination cursor,
 * %ecx = byte count still to copy, %xmm1 = previously loaded aligned source
 * block (supplies the low bytes of the next output block).
 * NOTE(review): the dispatch into this label and the setup of %xmm0 / the
 * pushed %edi are outside this excerpt -- confirm the entry conditions there.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001570L(shl_12):
/* Prime %xmm1 with the aligned block that contains the first source byte.
   The memmove build additionally reloads DEST into %edi and writes %xmm0
   there with an unaligned store. */
Jack Renc47703a2012-02-14 12:01:52 +04001571#ifndef USE_AS_MEMMOVE
1572 movaps -12(%eax), %xmm1
1573#else
1574 movl DEST+4(%esp), %edi
1575 movaps -12(%eax), %xmm1
1576 movdqu %xmm0, (%edi)
1577#endif
/* Choose the loop: a count below half the data-cache size (unsigned compare
   against the compile-time constant or the runtime variable) skips the
   prefetching loop. */
1578#ifdef DATA_CACHE_SIZE_HALF
1579 cmp $DATA_CACHE_SIZE_HALF, %ecx
1580#else
1581# if (defined SHARED || defined __PIC__)
1582 SETUP_PIC_REG(bx)
1583 add $_GLOBAL_OFFSET_TABLE_, %ebx
1584 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1585# else
1586 cmp __x86_data_cache_size_half, %ecx
1587# endif
1588#endif
1589 jb L(sh_12_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001590
/* Pre-bias the count by one 64-byte pass so the loop can test it with
   sub/ja. */
Jack Renc47703a2012-02-14 12:01:52 +04001591 lea -64(%ecx), %ecx
1592
1593 .p2align 4
/* Main loop: 64 bytes per pass, prefetching 0x1c0 bytes ahead of both
   cursors. Four aligned blocks are loaded; each palignr merges a block with
   its predecessor and shifts right by 12 bytes. %xmm7 keeps an unshifted
   copy of the last block so it can become %xmm1 for the next pass. */
1594L(Shl12LoopStart):
1595 prefetcht0 0x1c0(%eax)
1596 prefetcht0 0x1c0(%edx)
1597 movaps 4(%eax), %xmm2
1598 movaps 20(%eax), %xmm3
1599 movaps 36(%eax), %xmm4
1600 movaps 52(%eax), %xmm5
1601 movaps %xmm5, %xmm7
1602 palignr $12, %xmm4, %xmm5
1603 palignr $12, %xmm3, %xmm4
1604 movaps %xmm5, 48(%edx)
1605 palignr $12, %xmm2, %xmm3
1606 lea 64(%eax), %eax
1607 palignr $12, %xmm1, %xmm2
1608 movaps %xmm4, 32(%edx)
1609 movaps %xmm3, 16(%edx)
1610 movaps %xmm7, %xmm1
1611 movaps %xmm2, (%edx)
1612 lea 64(%edx), %edx
1613 sub $64, %ecx
1614 ja L(Shl12LoopStart)
1615
/* Give back half of the 64-byte bias. A result <= 0 (signed) leaves through
   L(shl_end_0); otherwise one more 32-byte group is copied here. */
1616L(Shl12LoopLeave):
1617 add $32, %ecx
1618 jle L(shl_end_0)
1619
1620 movaps 4(%eax), %xmm2
1621 movaps 20(%eax), %xmm3
1622 palignr $12, %xmm2, %xmm3
1623 palignr $12, %xmm1, %xmm2
1624
1625 movaps %xmm2, (%edx)
1626 movaps %xmm3, 16(%edx)
/* Advance both cursors by 32 + %ecx, then POP %edi and dispatch on %ecx
   through L(table_48bytes_fwd). The POP and BRANCH_TO_JMPTBL_ENTRY macros
   are defined outside this excerpt. */
1627 lea 32(%edx, %ecx), %edx
1628 lea 32(%eax, %ecx), %eax
1629 POP (%edi)
1630 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1631
/* NOTE(review): CFI_PUSH presumably emits unwind information only,
   re-declaring %edi as pushed for the code below, which is entered by a
   jump taken before the POP above -- confirm against the macro. */
1632 CFI_PUSH (%edi)
1633
1634 .p2align 4
/* No-prefetch variant: pre-bias the count by 32, move %eax back by 12 onto
   the aligned block already held in %xmm1, and use %edi as a running byte
   offset applied to both buffers. */
1635L(sh_12_no_prefetch):
1636 lea -32(%ecx), %ecx
1637 lea -12(%eax), %eax
1638 xor %edi, %edi
1639
1640 .p2align 4
/* Unrolled twice, 32 bytes per half. The carried block alternates between
   %xmm1 (first half) and %xmm4 (second half). Each branch consumes the
   carry flag left by `sub $32, %ecx`; the lea, SSE moves and palignr in
   between do not modify flags. */
1641L(sh_12_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001642 movdqa 16(%eax, %edi), %xmm2
1643 sub $32, %ecx
1644 movdqa 32(%eax, %edi), %xmm3
1645 movdqa %xmm3, %xmm4
1646 palignr $12, %xmm2, %xmm3
1647 palignr $12, %xmm1, %xmm2
1648 lea 32(%edi), %edi
1649 movdqa %xmm2, -32(%edx, %edi)
1650 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001651 jb L(sh_12_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001652
1653 movdqa 16(%eax, %edi), %xmm2
1654 sub $32, %ecx
1655 movdqa 32(%eax, %edi), %xmm3
1656 movdqa %xmm3, %xmm1
1657 palignr $12, %xmm2, %xmm3
1658 palignr $12, %xmm4, %xmm2
1659 lea 32(%edi), %edi
1660 movdqa %xmm2, -32(%edx, %edi)
1661 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001662 jae L(sh_12_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001663
/* Remove the 32-byte bias, fold the leftover count into the offset, apply
   it to both cursors (re-adding the 12-byte source skew), then POP %edi and
   dispatch the tail on %ecx. */
Jack Renc47703a2012-02-14 12:01:52 +04001664L(sh_12_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001665 lea 32(%ecx), %ecx
1666 add %ecx, %edi
1667 add %edi, %edx
1668 lea 12(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001669 POP (%edi)
1670 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001671
/* NOTE(review): presumably restores the "%edi pushed" unwind state for the
   code that follows -- confirm against the CFI_PUSH macro. */
Jack Renc47703a2012-02-14 12:01:52 +04001672 CFI_PUSH (%edi)
1673
1674 .p2align 4
/*
 * L(shl_13): forward copy that realigns the source with `palignr $13`.
 * The aligned loads at %eax-13 / %eax+3 and the aligned stores at %edx imply
 * that on entry the source is 13 bytes past a 16-byte boundary and the
 * destination is 16-byte aligned.
 * Registers as used below: %eax = source cursor, %edx = destination cursor,
 * %ecx = byte count still to copy, %xmm1 = previously loaded aligned source
 * block (supplies the low bytes of the next output block).
 * NOTE(review): the dispatch into this label and the setup of %xmm0 / the
 * pushed %edi are outside this excerpt -- confirm the entry conditions there.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001675L(shl_13):
/* Prime %xmm1 with the aligned block that contains the first source byte.
   The memmove build additionally reloads DEST into %edi and writes %xmm0
   there with an unaligned store. */
Jack Renc47703a2012-02-14 12:01:52 +04001676#ifndef USE_AS_MEMMOVE
1677 movaps -13(%eax), %xmm1
1678#else
1679 movl DEST+4(%esp), %edi
1680 movaps -13(%eax), %xmm1
1681 movdqu %xmm0, (%edi)
1682#endif
/* Choose the loop: a count below half the data-cache size (unsigned compare
   against the compile-time constant or the runtime variable) skips the
   prefetching loop. */
1683#ifdef DATA_CACHE_SIZE_HALF
1684 cmp $DATA_CACHE_SIZE_HALF, %ecx
1685#else
1686# if (defined SHARED || defined __PIC__)
1687 SETUP_PIC_REG(bx)
1688 add $_GLOBAL_OFFSET_TABLE_, %ebx
1689 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1690# else
1691 cmp __x86_data_cache_size_half, %ecx
1692# endif
1693#endif
1694 jb L(sh_13_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001695
/* Pre-bias the count by one 64-byte pass so the loop can test it with
   sub/ja. */
Jack Renc47703a2012-02-14 12:01:52 +04001696 lea -64(%ecx), %ecx
1697
1698 .p2align 4
/* Main loop: 64 bytes per pass, prefetching 0x1c0 bytes ahead of both
   cursors. Four aligned blocks are loaded; each palignr merges a block with
   its predecessor and shifts right by 13 bytes. %xmm7 keeps an unshifted
   copy of the last block so it can become %xmm1 for the next pass. */
1699L(Shl13LoopStart):
1700 prefetcht0 0x1c0(%eax)
1701 prefetcht0 0x1c0(%edx)
1702 movaps 3(%eax), %xmm2
1703 movaps 19(%eax), %xmm3
1704 movaps 35(%eax), %xmm4
1705 movaps 51(%eax), %xmm5
1706 movaps %xmm5, %xmm7
1707 palignr $13, %xmm4, %xmm5
1708 palignr $13, %xmm3, %xmm4
1709 movaps %xmm5, 48(%edx)
1710 palignr $13, %xmm2, %xmm3
1711 lea 64(%eax), %eax
1712 palignr $13, %xmm1, %xmm2
1713 movaps %xmm4, 32(%edx)
1714 movaps %xmm3, 16(%edx)
1715 movaps %xmm7, %xmm1
1716 movaps %xmm2, (%edx)
1717 lea 64(%edx), %edx
1718 sub $64, %ecx
1719 ja L(Shl13LoopStart)
1720
/* Give back half of the 64-byte bias. A result <= 0 (signed) leaves through
   L(shl_end_0); otherwise one more 32-byte group is copied here. */
1721L(Shl13LoopLeave):
1722 add $32, %ecx
1723 jle L(shl_end_0)
1724
1725 movaps 3(%eax), %xmm2
1726 movaps 19(%eax), %xmm3
1727 palignr $13, %xmm2, %xmm3
1728 palignr $13, %xmm1, %xmm2
1729
1730 movaps %xmm2, (%edx)
1731 movaps %xmm3, 16(%edx)
/* Advance both cursors by 32 + %ecx, then POP %edi and dispatch on %ecx
   through L(table_48bytes_fwd). The POP and BRANCH_TO_JMPTBL_ENTRY macros
   are defined outside this excerpt. */
1732 lea 32(%edx, %ecx), %edx
1733 lea 32(%eax, %ecx), %eax
1734 POP (%edi)
1735 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1736
/* NOTE(review): CFI_PUSH presumably emits unwind information only,
   re-declaring %edi as pushed for the code below, which is entered by a
   jump taken before the POP above -- confirm against the macro. */
1737 CFI_PUSH (%edi)
1738
1739 .p2align 4
/* No-prefetch variant: pre-bias the count by 32, move %eax back by 13 onto
   the aligned block already held in %xmm1, and use %edi as a running byte
   offset applied to both buffers. */
1740L(sh_13_no_prefetch):
1741 lea -32(%ecx), %ecx
1742 lea -13(%eax), %eax
1743 xor %edi, %edi
1744
1745 .p2align 4
/* Unrolled twice, 32 bytes per half. The carried block alternates between
   %xmm1 (first half) and %xmm4 (second half). Each branch consumes the
   carry flag left by `sub $32, %ecx`; the lea, SSE moves and palignr in
   between do not modify flags. */
1746L(sh_13_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001747 movdqa 16(%eax, %edi), %xmm2
1748 sub $32, %ecx
1749 movdqa 32(%eax, %edi), %xmm3
1750 movdqa %xmm3, %xmm4
1751 palignr $13, %xmm2, %xmm3
1752 palignr $13, %xmm1, %xmm2
1753 lea 32(%edi), %edi
1754 movdqa %xmm2, -32(%edx, %edi)
1755 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001756 jb L(sh_13_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001757
1758 movdqa 16(%eax, %edi), %xmm2
1759 sub $32, %ecx
1760 movdqa 32(%eax, %edi), %xmm3
1761 movdqa %xmm3, %xmm1
1762 palignr $13, %xmm2, %xmm3
1763 palignr $13, %xmm4, %xmm2
1764 lea 32(%edi), %edi
1765 movdqa %xmm2, -32(%edx, %edi)
1766 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001767 jae L(sh_13_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001768
/* Remove the 32-byte bias, fold the leftover count into the offset, apply
   it to both cursors (re-adding the 13-byte source skew), then POP %edi and
   dispatch the tail on %ecx. */
Jack Renc47703a2012-02-14 12:01:52 +04001769L(sh_13_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001770 lea 32(%ecx), %ecx
1771 add %ecx, %edi
1772 add %edi, %edx
1773 lea 13(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001774 POP (%edi)
1775 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001776
/* NOTE(review): presumably restores the "%edi pushed" unwind state for the
   code that follows -- confirm against the CFI_PUSH macro. */
Jack Renc47703a2012-02-14 12:01:52 +04001777 CFI_PUSH (%edi)
1778
1779 .p2align 4
/*
 * L(shl_14): forward copy that realigns the source with `palignr $14`.
 * The aligned loads at %eax-14 / %eax+2 and the aligned stores at %edx imply
 * that on entry the source is 14 bytes past a 16-byte boundary and the
 * destination is 16-byte aligned.
 * Registers as used below: %eax = source cursor, %edx = destination cursor,
 * %ecx = byte count still to copy, %xmm1 = previously loaded aligned source
 * block (supplies the low bytes of the next output block).
 * NOTE(review): the dispatch into this label and the setup of %xmm0 / the
 * pushed %edi are outside this excerpt -- confirm the entry conditions there.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001780L(shl_14):
/* Prime %xmm1 with the aligned block that contains the first source byte.
   The memmove build additionally reloads DEST into %edi and writes %xmm0
   there with an unaligned store. */
Jack Renc47703a2012-02-14 12:01:52 +04001781#ifndef USE_AS_MEMMOVE
1782 movaps -14(%eax), %xmm1
1783#else
1784 movl DEST+4(%esp), %edi
1785 movaps -14(%eax), %xmm1
1786 movdqu %xmm0, (%edi)
1787#endif
/* Choose the loop: a count below half the data-cache size (unsigned compare
   against the compile-time constant or the runtime variable) skips the
   prefetching loop. */
1788#ifdef DATA_CACHE_SIZE_HALF
1789 cmp $DATA_CACHE_SIZE_HALF, %ecx
1790#else
1791# if (defined SHARED || defined __PIC__)
1792 SETUP_PIC_REG(bx)
1793 add $_GLOBAL_OFFSET_TABLE_, %ebx
1794 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1795# else
1796 cmp __x86_data_cache_size_half, %ecx
1797# endif
1798#endif
1799 jb L(sh_14_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001800
/* Pre-bias the count by one 64-byte pass so the loop can test it with
   sub/ja. */
Jack Renc47703a2012-02-14 12:01:52 +04001801 lea -64(%ecx), %ecx
1802
1803 .p2align 4
/* Main loop: 64 bytes per pass, prefetching 0x1c0 bytes ahead of both
   cursors. Four aligned blocks are loaded; each palignr merges a block with
   its predecessor and shifts right by 14 bytes. %xmm7 keeps an unshifted
   copy of the last block so it can become %xmm1 for the next pass. */
1804L(Shl14LoopStart):
1805 prefetcht0 0x1c0(%eax)
1806 prefetcht0 0x1c0(%edx)
1807 movaps 2(%eax), %xmm2
1808 movaps 18(%eax), %xmm3
1809 movaps 34(%eax), %xmm4
1810 movaps 50(%eax), %xmm5
1811 movaps %xmm5, %xmm7
1812 palignr $14, %xmm4, %xmm5
1813 palignr $14, %xmm3, %xmm4
1814 movaps %xmm5, 48(%edx)
1815 palignr $14, %xmm2, %xmm3
1816 lea 64(%eax), %eax
1817 palignr $14, %xmm1, %xmm2
1818 movaps %xmm4, 32(%edx)
1819 movaps %xmm3, 16(%edx)
1820 movaps %xmm7, %xmm1
1821 movaps %xmm2, (%edx)
1822 lea 64(%edx), %edx
1823 sub $64, %ecx
1824 ja L(Shl14LoopStart)
1825
/* Give back half of the 64-byte bias. A result <= 0 (signed) leaves through
   L(shl_end_0); otherwise one more 32-byte group is copied here. */
1826L(Shl14LoopLeave):
1827 add $32, %ecx
1828 jle L(shl_end_0)
1829
1830 movaps 2(%eax), %xmm2
1831 movaps 18(%eax), %xmm3
1832 palignr $14, %xmm2, %xmm3
1833 palignr $14, %xmm1, %xmm2
1834
1835 movaps %xmm2, (%edx)
1836 movaps %xmm3, 16(%edx)
/* Advance both cursors by 32 + %ecx, then POP %edi and dispatch on %ecx
   through L(table_48bytes_fwd). The POP and BRANCH_TO_JMPTBL_ENTRY macros
   are defined outside this excerpt. */
1837 lea 32(%edx, %ecx), %edx
1838 lea 32(%eax, %ecx), %eax
1839 POP (%edi)
1840 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1841
/* NOTE(review): CFI_PUSH presumably emits unwind information only,
   re-declaring %edi as pushed for the code below, which is entered by a
   jump taken before the POP above -- confirm against the macro. */
1842 CFI_PUSH (%edi)
1843
1844 .p2align 4
/* No-prefetch variant: pre-bias the count by 32, move %eax back by 14 onto
   the aligned block already held in %xmm1, and use %edi as a running byte
   offset applied to both buffers. */
1845L(sh_14_no_prefetch):
1846 lea -32(%ecx), %ecx
1847 lea -14(%eax), %eax
1848 xor %edi, %edi
1849
1850 .p2align 4
/* Unrolled twice, 32 bytes per half. The carried block alternates between
   %xmm1 (first half) and %xmm4 (second half). Each branch consumes the
   carry flag left by `sub $32, %ecx`; the lea, SSE moves and palignr in
   between do not modify flags. */
1851L(sh_14_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001852 movdqa 16(%eax, %edi), %xmm2
1853 sub $32, %ecx
1854 movdqa 32(%eax, %edi), %xmm3
1855 movdqa %xmm3, %xmm4
1856 palignr $14, %xmm2, %xmm3
1857 palignr $14, %xmm1, %xmm2
1858 lea 32(%edi), %edi
1859 movdqa %xmm2, -32(%edx, %edi)
1860 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001861 jb L(sh_14_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001862
1863 movdqa 16(%eax, %edi), %xmm2
1864 sub $32, %ecx
1865 movdqa 32(%eax, %edi), %xmm3
1866 movdqa %xmm3, %xmm1
1867 palignr $14, %xmm2, %xmm3
1868 palignr $14, %xmm4, %xmm2
1869 lea 32(%edi), %edi
1870 movdqa %xmm2, -32(%edx, %edi)
1871 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001872 jae L(sh_14_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001873
/* Remove the 32-byte bias, fold the leftover count into the offset, apply
   it to both cursors (re-adding the 14-byte source skew), then POP %edi and
   dispatch the tail on %ecx. */
Jack Renc47703a2012-02-14 12:01:52 +04001874L(sh_14_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001875 lea 32(%ecx), %ecx
1876 add %ecx, %edi
1877 add %edi, %edx
1878 lea 14(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001879 POP (%edi)
1880 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001881
/* NOTE(review): presumably restores the "%edi pushed" unwind state for the
   code that follows -- confirm against the CFI_PUSH macro. */
Jack Renc47703a2012-02-14 12:01:52 +04001882 CFI_PUSH (%edi)
1883
1884 .p2align 4
/*
 * L(shl_15): forward copy that realigns the source with `palignr $15`.
 * The aligned loads at %eax-15 / %eax+1 and the aligned stores at %edx imply
 * that on entry the source is 15 bytes past a 16-byte boundary and the
 * destination is 16-byte aligned.
 * Registers as used below: %eax = source cursor, %edx = destination cursor,
 * %ecx = byte count still to copy, %xmm1 = previously loaded aligned source
 * block (supplies the low bytes of the next output block).
 * NOTE(review): the dispatch into this label and the setup of %xmm0 / the
 * pushed %edi are outside this excerpt -- confirm the entry conditions there.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001885L(shl_15):
/* Prime %xmm1 with the aligned block that contains the first source byte.
   The memmove build additionally reloads DEST into %edi and writes %xmm0
   there with an unaligned store. */
Jack Renc47703a2012-02-14 12:01:52 +04001886#ifndef USE_AS_MEMMOVE
1887 movaps -15(%eax), %xmm1
1888#else
1889 movl DEST+4(%esp), %edi
1890 movaps -15(%eax), %xmm1
1891 movdqu %xmm0, (%edi)
1892#endif
/* Choose the loop: a count below half the data-cache size (unsigned compare
   against the compile-time constant or the runtime variable) skips the
   prefetching loop. */
1893#ifdef DATA_CACHE_SIZE_HALF
1894 cmp $DATA_CACHE_SIZE_HALF, %ecx
1895#else
1896# if (defined SHARED || defined __PIC__)
1897 SETUP_PIC_REG(bx)
1898 add $_GLOBAL_OFFSET_TABLE_, %ebx
1899 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1900# else
1901 cmp __x86_data_cache_size_half, %ecx
1902# endif
1903#endif
1904 jb L(sh_15_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001905
/* Pre-bias the count by one 64-byte pass so the loop can test it with
   sub/ja. */
Jack Renc47703a2012-02-14 12:01:52 +04001906 lea -64(%ecx), %ecx
1907
1908 .p2align 4
/* Main loop: 64 bytes per pass, prefetching 0x1c0 bytes ahead of both
   cursors. Four aligned blocks are loaded; each palignr merges a block with
   its predecessor and shifts right by 15 bytes. %xmm7 keeps an unshifted
   copy of the last block so it can become %xmm1 for the next pass. */
1909L(Shl15LoopStart):
1910 prefetcht0 0x1c0(%eax)
1911 prefetcht0 0x1c0(%edx)
1912 movaps 1(%eax), %xmm2
1913 movaps 17(%eax), %xmm3
1914 movaps 33(%eax), %xmm4
1915 movaps 49(%eax), %xmm5
1916 movaps %xmm5, %xmm7
1917 palignr $15, %xmm4, %xmm5
1918 palignr $15, %xmm3, %xmm4
1919 movaps %xmm5, 48(%edx)
1920 palignr $15, %xmm2, %xmm3
1921 lea 64(%eax), %eax
1922 palignr $15, %xmm1, %xmm2
1923 movaps %xmm4, 32(%edx)
1924 movaps %xmm3, 16(%edx)
1925 movaps %xmm7, %xmm1
1926 movaps %xmm2, (%edx)
1927 lea 64(%edx), %edx
1928 sub $64, %ecx
1929 ja L(Shl15LoopStart)
1930
/* Give back half of the 64-byte bias. A result <= 0 (signed) leaves through
   L(shl_end_0); otherwise one more 32-byte group is copied here. */
1931L(Shl15LoopLeave):
1932 add $32, %ecx
1933 jle L(shl_end_0)
1934
1935 movaps 1(%eax), %xmm2
1936 movaps 17(%eax), %xmm3
1937 palignr $15, %xmm2, %xmm3
1938 palignr $15, %xmm1, %xmm2
1939
1940 movaps %xmm2, (%edx)
1941 movaps %xmm3, 16(%edx)
/* Advance both cursors by 32 + %ecx, then POP %edi and dispatch on %ecx
   through L(table_48bytes_fwd). The POP and BRANCH_TO_JMPTBL_ENTRY macros
   are defined outside this excerpt. */
1942 lea 32(%edx, %ecx), %edx
1943 lea 32(%eax, %ecx), %eax
1944 POP (%edi)
1945 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1946
/* NOTE(review): CFI_PUSH presumably emits unwind information only,
   re-declaring %edi as pushed for the code below, which is entered by a
   jump taken before the POP above -- confirm against the macro. */
1947 CFI_PUSH (%edi)
1948
1949 .p2align 4
/* No-prefetch variant: pre-bias the count by 32, move %eax back by 15 onto
   the aligned block already held in %xmm1, and use %edi as a running byte
   offset applied to both buffers. */
1950L(sh_15_no_prefetch):
1951 lea -32(%ecx), %ecx
1952 lea -15(%eax), %eax
1953 xor %edi, %edi
1954
1955 .p2align 4
/* Unrolled twice, 32 bytes per half. The carried block alternates between
   %xmm1 (first half) and %xmm4 (second half). Each branch consumes the
   carry flag left by `sub $32, %ecx`; the lea, SSE moves and palignr in
   between do not modify flags. */
1956L(sh_15_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001957 movdqa 16(%eax, %edi), %xmm2
1958 sub $32, %ecx
1959 movdqa 32(%eax, %edi), %xmm3
1960 movdqa %xmm3, %xmm4
1961 palignr $15, %xmm2, %xmm3
1962 palignr $15, %xmm1, %xmm2
1963 lea 32(%edi), %edi
1964 movdqa %xmm2, -32(%edx, %edi)
1965 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001966 jb L(sh_15_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001967
1968 movdqa 16(%eax, %edi), %xmm2
1969 sub $32, %ecx
1970 movdqa 32(%eax, %edi), %xmm3
1971 movdqa %xmm3, %xmm1
1972 palignr $15, %xmm2, %xmm3
1973 palignr $15, %xmm4, %xmm2
1974 lea 32(%edi), %edi
1975 movdqa %xmm2, -32(%edx, %edi)
1976 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001977 jae L(sh_15_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001978
/* Remove the 32-byte bias, fold the leftover count into the offset, apply
   it to both cursors (re-adding the 15-byte source skew), then POP %edi and
   dispatch the tail on %ecx. */
Jack Renc47703a2012-02-14 12:01:52 +04001979L(sh_15_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001980 lea 32(%ecx), %ecx
1981 add %ecx, %edi
1982 add %edi, %edx
1983 lea 15(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001984 POP (%edi)
1985 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001986
/* NOTE(review): presumably restores the "%edi pushed" unwind state for the
   code that follows -- confirm against the CFI_PUSH macro. */
Jack Renc47703a2012-02-14 12:01:52 +04001987 CFI_PUSH (%edi)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001988
Jack Renc47703a2012-02-14 12:01:52 +04001989 .p2align 4
/*
 * L(shl_end_0): shared exit taken by the `jle` in each L(Shl*LoopLeave) /
 * L(LoopLeave8) block when no further 32-byte group is copied. It adds 32
 * back to %ecx, advances both cursors by the resulting count, runs
 * POP (%edi), and dispatches on %ecx through L(table_48bytes_fwd).
 * NOTE(review): the table and the POP / BRANCH_TO_JMPTBL_ENTRY macros are
 * defined outside this excerpt.
 */
1990L(shl_end_0):
1991 lea 32(%ecx), %ecx
1992 lea (%edx, %ecx), %edx
1993 lea (%eax, %ecx), %eax
1994 POP (%edi)
1995 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1996
1997 .p2align 4
/*
 * Tail handlers for 44, 36, 28, 20, 12 and 4 trailing bytes.
 * Every access uses a negative offset from %eax (source) and %edx
 * (destination), i.e. both registers are used as end-of-region pointers.
 * Each label copies 8 bytes through %xmm0 with movq and falls through to the
 * next smaller case; the final 4 bytes go through %ecx.
 * Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the DEST
 * argument reloaded from the stack.
 * NOTE(review): presumably entered only via L(table_48bytes_fwd), which is
 * outside this excerpt; RETURN is a macro defined elsewhere in the file.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001998L(fwd_write_44bytes):
Jack Renc47703a2012-02-14 12:01:52 +04001999 movq -44(%eax), %xmm0
2000 movq %xmm0, -44(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002001L(fwd_write_36bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002002 movq -36(%eax), %xmm0
2003 movq %xmm0, -36(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002004L(fwd_write_28bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002005 movq -28(%eax), %xmm0
2006 movq %xmm0, -28(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002007L(fwd_write_20bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002008 movq -20(%eax), %xmm0
2009 movq %xmm0, -20(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002010L(fwd_write_12bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002011 movq -12(%eax), %xmm0
2012 movq %xmm0, -12(%edx)
/* Last 4 bytes: %ecx is free to use as scratch here. */
Bruce Beare8ff1a272010-03-04 11:03:37 -08002013L(fwd_write_4bytes):
2014 movl -4(%eax), %ecx
2015 movl %ecx, -4(%edx)
/* Set the return value in %eax. */
Elliott Hughesbed110a2016-03-03 10:41:42 -08002016#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002017 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002018#else
Jack Renc47703a2012-02-14 12:01:52 +04002019 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002020#endif
2021 RETURN
2022
2023 .p2align 4
/*
 * Tail handlers for 40, 32, 24, 16, 8 and 0 trailing bytes.
 * Every access uses a negative offset from %eax (source) and %edx
 * (destination), i.e. both registers are used as end-of-region pointers.
 * Each label copies 8 bytes through %xmm0 with movq and falls through to the
 * next smaller case; L(fwd_write_0bytes) copies nothing and only sets the
 * return value.
 * Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the DEST
 * argument reloaded from the stack.
 * NOTE(review): presumably entered only via L(table_48bytes_fwd), which is
 * outside this excerpt; RETURN is a macro defined elsewhere in the file.
 */
2024L(fwd_write_40bytes):
2025 movq -40(%eax), %xmm0
2026 movq %xmm0, -40(%edx)
2027L(fwd_write_32bytes):
2028 movq -32(%eax), %xmm0
2029 movq %xmm0, -32(%edx)
2030L(fwd_write_24bytes):
2031 movq -24(%eax), %xmm0
2032 movq %xmm0, -24(%edx)
2033L(fwd_write_16bytes):
2034 movq -16(%eax), %xmm0
2035 movq %xmm0, -16(%edx)
2036L(fwd_write_8bytes):
2037 movq -8(%eax), %xmm0
2038 movq %xmm0, -8(%edx)
/* Nothing left to copy: set the return value in %eax. */
Bruce Beare8ff1a272010-03-04 11:03:37 -08002039L(fwd_write_0bytes):
Elliott Hughesbed110a2016-03-03 10:41:42 -08002040#ifdef USE_AS_MEMPCPY
Bruce Beare8ff1a272010-03-04 11:03:37 -08002041 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002042#else
Bruce Beare8ff1a272010-03-04 11:03:37 -08002043 movl DEST(%esp), %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08002044#endif
2045 RETURN
2046
Jack Renc47703a2012-02-14 12:01:52 +04002047 .p2align 4
/*
 * Tail handler for 5 trailing bytes: two overlapping 4-byte moves (offsets
 * -5 and -4 from the end pointers) cover all five bytes. Both loads are done
 * before either store, and the second load overwrites %eax, which is no
 * longer needed as the source pointer afterwards.
 * Result in %eax: %edx when USE_AS_MEMPCPY is defined, otherwise the DEST
 * argument reloaded from the stack.
 * NOTE(review): RETURN is a macro defined elsewhere in the file.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08002048L(fwd_write_5bytes):
2049 movl -5(%eax), %ecx
2050 movl -4(%eax), %eax
2051 movl %ecx, -5(%edx)
2052 movl %eax, -4(%edx)
Elliott Hughesbed110a2016-03-03 10:41:42 -08002053#ifdef USE_AS_MEMPCPY
Bruce Beare8ff1a272010-03-04 11:03:37 -08002054 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002055#else
Bruce Beare8ff1a272010-03-04 11:03:37 -08002056 movl DEST(%esp), %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08002057#endif
2058 RETURN
2059
/*
 * Forward tail copy for 45, 37, 29, 21 or 13 remaining bytes.
 * %eax/%edx address the byte just past the source/destination region.
 * Each label copies 8 bytes through %xmm0 and falls through; the final
 * 5 bytes are moved as one 4-byte word (offset -5) plus one byte
 * (offset -1).
 */
Jack Renc47703a2012-02-14 12:01:52 +04002060 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002061L(fwd_write_45bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002062 movq -45(%eax), %xmm0
2063 movq %xmm0, -45(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002064L(fwd_write_37bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002065 movq -37(%eax), %xmm0
2066 movq %xmm0, -37(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002067L(fwd_write_29bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002068 movq -29(%eax), %xmm0
2069 movq %xmm0, -29(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002070L(fwd_write_21bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002071 movq -21(%eax), %xmm0
2072 movq %xmm0, -21(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002073L(fwd_write_13bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002074 movq -13(%eax), %xmm0
2075 movq %xmm0, -13(%edx)
/* Last 5 bytes: 4-byte word at -5, then the single byte at -1. */
Bruce Beare8ff1a272010-03-04 11:03:37 -08002076 movl -5(%eax), %ecx
2077 movl %ecx, -5(%edx)
Jack Renc47703a2012-02-14 12:01:52 +04002078 movzbl -1(%eax), %ecx
2079 movb %cl, -1(%edx)
/* Return value: %edx for mempcpy, otherwise the value at DEST(%esp). */
Elliott Hughesbed110a2016-03-03 10:41:42 -08002080#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002081 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002082#else
Jack Renc47703a2012-02-14 12:01:52 +04002083 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002084#endif
2085 RETURN
2086
/*
 * Forward tail copy for 41, 33, 25, 17, 9 or 1 remaining bytes.
 * %eax/%edx address the byte just past the source/destination region.
 * Each label copies 8 bytes through %xmm0 and falls through; the
 * 1-byte entry moves the final byte (offset -1) through %cl.
 */
2087 .p2align 4
2088L(fwd_write_41bytes):
2089 movq -41(%eax), %xmm0
2090 movq %xmm0, -41(%edx)
2091L(fwd_write_33bytes):
2092 movq -33(%eax), %xmm0
2093 movq %xmm0, -33(%edx)
2094L(fwd_write_25bytes):
2095 movq -25(%eax), %xmm0
2096 movq %xmm0, -25(%edx)
2097L(fwd_write_17bytes):
2098 movq -17(%eax), %xmm0
2099 movq %xmm0, -17(%edx)
2100L(fwd_write_9bytes):
2101 movq -9(%eax), %xmm0
2102 movq %xmm0, -9(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002103L(fwd_write_1bytes):
2104 movzbl -1(%eax), %ecx
2105 movb %cl, -1(%edx)
/* Return value: %edx for mempcpy, otherwise the value at DEST(%esp). */
Elliott Hughesbed110a2016-03-03 10:41:42 -08002106#ifdef USE_AS_MEMPCPY
Bruce Beare8ff1a272010-03-04 11:03:37 -08002107 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002108#else
Bruce Beare8ff1a272010-03-04 11:03:37 -08002109 movl DEST(%esp), %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08002110#endif
2111 RETURN
2112
/*
 * Forward tail copy for 46, 38, 30, 22, 14 or 6 remaining bytes.
 * %eax/%edx address the byte just past the source/destination region.
 * Each label above the 6-byte entry copies 8 bytes through %xmm0 and
 * falls through; the 6-byte entry moves a 4-byte word (offset -6) and
 * a 2-byte halfword (offset -2).
 */
Jack Renc47703a2012-02-14 12:01:52 +04002113 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002114L(fwd_write_46bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002115 movq -46(%eax), %xmm0
2116 movq %xmm0, -46(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002117L(fwd_write_38bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002118 movq -38(%eax), %xmm0
2119 movq %xmm0, -38(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002120L(fwd_write_30bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002121 movq -30(%eax), %xmm0
2122 movq %xmm0, -30(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002123L(fwd_write_22bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002124 movq -22(%eax), %xmm0
2125 movq %xmm0, -22(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002126L(fwd_write_14bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002127 movq -14(%eax), %xmm0
2128 movq %xmm0, -14(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002129L(fwd_write_6bytes):
2130 movl -6(%eax), %ecx
2131 movl %ecx, -6(%edx)
Jack Renc47703a2012-02-14 12:01:52 +04002132 movzwl -2(%eax), %ecx
2133 movw %cx, -2(%edx)
/* Return value: %edx for mempcpy, otherwise the value at DEST(%esp). */
Elliott Hughesbed110a2016-03-03 10:41:42 -08002134#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002135 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002136#else
Jack Renc47703a2012-02-14 12:01:52 +04002137 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002138#endif
2139 RETURN
2140
/*
 * Forward tail copy for 42, 34, 26, 18, 10 or 2 remaining bytes.
 * %eax/%edx address the byte just past the source/destination region.
 * Each label copies 8 bytes through %xmm0 and falls through; the
 * 2-byte entry moves the final halfword (offset -2) through %cx.
 */
2141 .p2align 4
2142L(fwd_write_42bytes):
2143 movq -42(%eax), %xmm0
2144 movq %xmm0, -42(%edx)
2145L(fwd_write_34bytes):
2146 movq -34(%eax), %xmm0
2147 movq %xmm0, -34(%edx)
2148L(fwd_write_26bytes):
2149 movq -26(%eax), %xmm0
2150 movq %xmm0, -26(%edx)
2151L(fwd_write_18bytes):
2152 movq -18(%eax), %xmm0
2153 movq %xmm0, -18(%edx)
2154L(fwd_write_10bytes):
2155 movq -10(%eax), %xmm0
2156 movq %xmm0, -10(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002157L(fwd_write_2bytes):
2158 movzwl -2(%eax), %ecx
2159 movw %cx, -2(%edx)
/* Return value: %edx for mempcpy, otherwise the value at DEST(%esp). */
Elliott Hughesbed110a2016-03-03 10:41:42 -08002160#ifdef USE_AS_MEMPCPY
Bruce Beare8ff1a272010-03-04 11:03:37 -08002161 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002162#else
Bruce Beare8ff1a272010-03-04 11:03:37 -08002163 movl DEST(%esp), %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08002164#endif
2165 RETURN
2166
/*
 * Forward tail copy for 47, 39, 31, 23, 15 or 7 remaining bytes.
 * %eax/%edx address the byte just past the source/destination region.
 * Each label above the 7-byte entry copies 8 bytes through %xmm0 and
 * falls through; the 7-byte entry moves 4 + 2 + 1 bytes
 * (offsets -7, -3 and -1).
 */
Jack Renc47703a2012-02-14 12:01:52 +04002167 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002168L(fwd_write_47bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002169 movq -47(%eax), %xmm0
2170 movq %xmm0, -47(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002171L(fwd_write_39bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002172 movq -39(%eax), %xmm0
2173 movq %xmm0, -39(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002174L(fwd_write_31bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002175 movq -31(%eax), %xmm0
2176 movq %xmm0, -31(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002177L(fwd_write_23bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002178 movq -23(%eax), %xmm0
2179 movq %xmm0, -23(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002180L(fwd_write_15bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002181 movq -15(%eax), %xmm0
2182 movq %xmm0, -15(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002183L(fwd_write_7bytes):
2184 movl -7(%eax), %ecx
2185 movl %ecx, -7(%edx)
/*
 * Both remaining loads are issued before the stores.  The byte load
 * overwrites %eax; it is the last use of %eax as a source address.
 */
Jack Renc47703a2012-02-14 12:01:52 +04002186 movzwl -3(%eax), %ecx
2187 movzbl -1(%eax), %eax
2188 movw %cx, -3(%edx)
2189 movb %al, -1(%edx)
/* Return value: %edx for mempcpy, otherwise the value at DEST(%esp). */
Elliott Hughesbed110a2016-03-03 10:41:42 -08002190#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002191 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002192#else
Jack Renc47703a2012-02-14 12:01:52 +04002193 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002194#endif
2195 RETURN
2196
/*
 * Forward tail copy for 43, 35, 27, 19, 11 or 3 remaining bytes.
 * %eax/%edx address the byte just past the source/destination region.
 * Each label copies 8 bytes through %xmm0 and falls through; the
 * 3-byte entry moves a halfword (offset -3) and a byte (offset -1).
 */
2197 .p2align 4
2198L(fwd_write_43bytes):
2199 movq -43(%eax), %xmm0
2200 movq %xmm0, -43(%edx)
2201L(fwd_write_35bytes):
2202 movq -35(%eax), %xmm0
2203 movq %xmm0, -35(%edx)
2204L(fwd_write_27bytes):
2205 movq -27(%eax), %xmm0
2206 movq %xmm0, -27(%edx)
2207L(fwd_write_19bytes):
2208 movq -19(%eax), %xmm0
2209 movq %xmm0, -19(%edx)
2210L(fwd_write_11bytes):
2211 movq -11(%eax), %xmm0
2212 movq %xmm0, -11(%edx)
/* Both loads come first; the byte load reuses %eax as its target. */
Bruce Beare8ff1a272010-03-04 11:03:37 -08002213L(fwd_write_3bytes):
2214 movzwl -3(%eax), %ecx
2215 movzbl -1(%eax), %eax
2216 movw %cx, -3(%edx)
2217 movb %al, -1(%edx)
/* Return value: %edx for mempcpy, otherwise the value at DEST(%esp). */
Elliott Hughesbed110a2016-03-03 10:41:42 -08002218#ifdef USE_AS_MEMPCPY
Bruce Beare8ff1a272010-03-04 11:03:37 -08002219 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002220#else
Bruce Beare8ff1a272010-03-04 11:03:37 -08002221 movl DEST(%esp), %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08002222#endif
Jack Renc47703a2012-02-14 12:01:52 +04002223 RETURN
2224
/*
 * "_align" forward tail copy for 40, 24, 8 or 0 remaining bytes.
 * Same layout as the unaligned stubs (%eax/%edx point just past the
 * region) but 16-byte pieces are moved with movdqa; the last 8 bytes
 * use movq.
 * NOTE(review): movdqa faults on memory operands that are not 16-byte
 * aligned; the alignment of these displaced addresses must be
 * guaranteed by the dispatching code, which is outside this view.
 */
2225 .p2align 4
2226L(fwd_write_40bytes_align):
2227 movdqa -40(%eax), %xmm0
2228 movdqa %xmm0, -40(%edx)
2229L(fwd_write_24bytes_align):
2230 movdqa -24(%eax), %xmm0
2231 movdqa %xmm0, -24(%edx)
2232L(fwd_write_8bytes_align):
2233 movq -8(%eax), %xmm0
2234 movq %xmm0, -8(%edx)
/* Nothing left to copy: only the return value is set up. */
2235L(fwd_write_0bytes_align):
Elliott Hughesbed110a2016-03-03 10:41:42 -08002236#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002237 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002238#else
Jack Renc47703a2012-02-14 12:01:52 +04002239 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002240#endif
2241 RETURN
2242
/*
 * "_align" forward tail copy for 32 or 16 remaining bytes: one movdqa
 * (16-byte) load/store pair per entry, falling through from 32 to 16.
 * %eax/%edx address the byte just past the source/destination region.
 * NOTE(review): movdqa needs 16-byte-aligned operands; that guarantee
 * comes from code outside this view.
 */
2243 .p2align 4
2244L(fwd_write_32bytes_align):
2245 movdqa -32(%eax), %xmm0
2246 movdqa %xmm0, -32(%edx)
2247L(fwd_write_16bytes_align):
2248 movdqa -16(%eax), %xmm0
2249 movdqa %xmm0, -16(%edx)
/* Return value: %edx for mempcpy, otherwise the value at DEST(%esp). */
Elliott Hughesbed110a2016-03-03 10:41:42 -08002250#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002251 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002252#else
Jack Renc47703a2012-02-14 12:01:52 +04002253 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002254#endif
2255 RETURN
2256
/*
 * "_align" forward tail copy for exactly 5 remaining bytes.
 * Same instruction sequence as L(fwd_write_5bytes): two overlapping
 * 4-byte moves (offsets -5 and -4), both loads before both stores.
 * The second load overwrites %eax, which is reloaded with the return
 * value below.
 */
2257 .p2align 4
2258L(fwd_write_5bytes_align):
2259 movl -5(%eax), %ecx
2260 movl -4(%eax), %eax
2261 movl %ecx, -5(%edx)
2262 movl %eax, -4(%edx)
/* Return value: %edx for mempcpy, otherwise the value at DEST(%esp). */
Elliott Hughesbed110a2016-03-03 10:41:42 -08002263#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002264 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002265#else
Jack Renc47703a2012-02-14 12:01:52 +04002266 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002267#endif
2268 RETURN
2269
/*
 * "_align" forward tail copy for 45, 29 or 13 remaining bytes.
 * 16-byte pieces use movdqa; the 13-byte entry moves 8 + 4 + 1 bytes
 * (offsets -13, -5 and -1).  %eax/%edx address the byte just past the
 * source/destination region.
 * NOTE(review): movdqa needs 16-byte-aligned operands; that guarantee
 * comes from code outside this view.
 */
2270 .p2align 4
2271L(fwd_write_45bytes_align):
2272 movdqa -45(%eax), %xmm0
2273 movdqa %xmm0, -45(%edx)
2274L(fwd_write_29bytes_align):
2275 movdqa -29(%eax), %xmm0
2276 movdqa %xmm0, -29(%edx)
2277L(fwd_write_13bytes_align):
2278 movq -13(%eax), %xmm0
2279 movq %xmm0, -13(%edx)
2280 movl -5(%eax), %ecx
2281 movl %ecx, -5(%edx)
2282 movzbl -1(%eax), %ecx
2283 movb %cl, -1(%edx)
/* Return value: %edx for mempcpy, otherwise the value at DEST(%esp). */
Elliott Hughesbed110a2016-03-03 10:41:42 -08002284#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002285 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002286#else
Jack Renc47703a2012-02-14 12:01:52 +04002287 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002288#endif
2289 RETURN
2290
/*
 * "_align" forward tail copy for 37 or 21 remaining bytes.
 * One movdqa (16-byte) pair per entry, then the last 5 bytes as a
 * 4-byte word (offset -5) and a byte (offset -1).
 * NOTE(review): movdqa needs 16-byte-aligned operands; that guarantee
 * comes from code outside this view.
 */
2291 .p2align 4
2292L(fwd_write_37bytes_align):
2293 movdqa -37(%eax), %xmm0
2294 movdqa %xmm0, -37(%edx)
2295L(fwd_write_21bytes_align):
2296 movdqa -21(%eax), %xmm0
2297 movdqa %xmm0, -21(%edx)
2298 movl -5(%eax), %ecx
2299 movl %ecx, -5(%edx)
2300 movzbl -1(%eax), %ecx
2301 movb %cl, -1(%edx)
/* Return value: %edx for mempcpy, otherwise the value at DEST(%esp). */
Elliott Hughesbed110a2016-03-03 10:41:42 -08002302#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002303 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002304#else
Jack Renc47703a2012-02-14 12:01:52 +04002305 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002306#endif
2307 RETURN
2308
/*
 * "_align" forward tail copy for 41, 25, 9 or 1 remaining bytes.
 * 16-byte pieces use movdqa, the 9-byte entry adds an 8-byte movq and
 * the 1-byte entry moves the final byte (offset -1) through %cl.
 * NOTE(review): movdqa needs 16-byte-aligned operands; that guarantee
 * comes from code outside this view.
 */
2309 .p2align 4
2310L(fwd_write_41bytes_align):
2311 movdqa -41(%eax), %xmm0
2312 movdqa %xmm0, -41(%edx)
2313L(fwd_write_25bytes_align):
2314 movdqa -25(%eax), %xmm0
2315 movdqa %xmm0, -25(%edx)
2316L(fwd_write_9bytes_align):
2317 movq -9(%eax), %xmm0
2318 movq %xmm0, -9(%edx)
2319L(fwd_write_1bytes_align):
2320 movzbl -1(%eax), %ecx
2321 movb %cl, -1(%edx)
/* Return value: %edx for mempcpy, otherwise the value at DEST(%esp). */
Elliott Hughesbed110a2016-03-03 10:41:42 -08002322#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002323 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002324#else
Jack Renc47703a2012-02-14 12:01:52 +04002325 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002326#endif
2327 RETURN
2328
/*
 * "_align" forward tail copy for 33 or 17 remaining bytes.
 * One movdqa (16-byte) pair per entry, then the final byte at -1.
 * NOTE(review): movdqa needs 16-byte-aligned operands; that guarantee
 * comes from code outside this view.
 */
2329 .p2align 4
2330L(fwd_write_33bytes_align):
2331 movdqa -33(%eax), %xmm0
2332 movdqa %xmm0, -33(%edx)
2333L(fwd_write_17bytes_align):
2334 movdqa -17(%eax), %xmm0
2335 movdqa %xmm0, -17(%edx)
2336 movzbl -1(%eax), %ecx
2337 movb %cl, -1(%edx)
/* Return value: %edx for mempcpy, otherwise the value at DEST(%esp). */
Elliott Hughesbed110a2016-03-03 10:41:42 -08002338#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002339 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002340#else
Jack Renc47703a2012-02-14 12:01:52 +04002341 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002342#endif
2343 RETURN
2344
/*
 * "_align" forward tail copy for 46, 30, 14 or 6 remaining bytes.
 * 16-byte pieces use movdqa, the 14-byte entry adds an 8-byte movq and
 * the 6-byte entry moves a 4-byte word (offset -6) plus a halfword
 * (offset -2).
 * NOTE(review): movdqa needs 16-byte-aligned operands; that guarantee
 * comes from code outside this view.
 */
2345 .p2align 4
2346L(fwd_write_46bytes_align):
2347 movdqa -46(%eax), %xmm0
2348 movdqa %xmm0, -46(%edx)
2349L(fwd_write_30bytes_align):
2350 movdqa -30(%eax), %xmm0
2351 movdqa %xmm0, -30(%edx)
2352L(fwd_write_14bytes_align):
2353 movq -14(%eax), %xmm0
2354 movq %xmm0, -14(%edx)
2355L(fwd_write_6bytes_align):
2356 movl -6(%eax), %ecx
2357 movl %ecx, -6(%edx)
2358 movzwl -2(%eax), %ecx
2359 movw %cx, -2(%edx)
/* Return value: %edx for mempcpy, otherwise the value at DEST(%esp). */
Elliott Hughesbed110a2016-03-03 10:41:42 -08002360#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002361 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002362#else
Jack Renc47703a2012-02-14 12:01:52 +04002363 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002364#endif
2365 RETURN
2366
/*
 * "_align" forward tail copy for 38 or 22 remaining bytes.
 * One movdqa (16-byte) pair per entry, then the last 6 bytes as a
 * 4-byte word (offset -6) and a halfword (offset -2).
 * NOTE(review): movdqa needs 16-byte-aligned operands; that guarantee
 * comes from code outside this view.
 */
2367 .p2align 4
2368L(fwd_write_38bytes_align):
2369 movdqa -38(%eax), %xmm0
2370 movdqa %xmm0, -38(%edx)
2371L(fwd_write_22bytes_align):
2372 movdqa -22(%eax), %xmm0
2373 movdqa %xmm0, -22(%edx)
2374 movl -6(%eax), %ecx
2375 movl %ecx, -6(%edx)
2376 movzwl -2(%eax), %ecx
2377 movw %cx, -2(%edx)
/* Return value: %edx for mempcpy, otherwise the value at DEST(%esp). */
Elliott Hughesbed110a2016-03-03 10:41:42 -08002378#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002379 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002380#else
Jack Renc47703a2012-02-14 12:01:52 +04002381 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002382#endif
2383 RETURN
2384
/*
 * "_align" forward tail copy for 42, 26, 10 or 2 remaining bytes.
 * 16-byte pieces use movdqa, the 10-byte entry adds an 8-byte movq and
 * the 2-byte entry moves the final halfword (offset -2) through %cx.
 * NOTE(review): movdqa needs 16-byte-aligned operands; that guarantee
 * comes from code outside this view.
 */
2385 .p2align 4
2386L(fwd_write_42bytes_align):
2387 movdqa -42(%eax), %xmm0
2388 movdqa %xmm0, -42(%edx)
2389L(fwd_write_26bytes_align):
2390 movdqa -26(%eax), %xmm0
2391 movdqa %xmm0, -26(%edx)
2392L(fwd_write_10bytes_align):
2393 movq -10(%eax), %xmm0
2394 movq %xmm0, -10(%edx)
2395L(fwd_write_2bytes_align):
2396 movzwl -2(%eax), %ecx
2397 movw %cx, -2(%edx)
/* Return value: %edx for mempcpy, otherwise the value at DEST(%esp). */
Elliott Hughesbed110a2016-03-03 10:41:42 -08002398#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002399 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002400#else
Jack Renc47703a2012-02-14 12:01:52 +04002401 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002402#endif
2403 RETURN
2404
/*
 * "_align" forward tail copy for 34 or 18 remaining bytes.
 * One movdqa (16-byte) pair per entry, then the final halfword at -2.
 * NOTE(review): movdqa needs 16-byte-aligned operands; that guarantee
 * comes from code outside this view.
 */
2405 .p2align 4
2406L(fwd_write_34bytes_align):
2407 movdqa -34(%eax), %xmm0
2408 movdqa %xmm0, -34(%edx)
2409L(fwd_write_18bytes_align):
2410 movdqa -18(%eax), %xmm0
2411 movdqa %xmm0, -18(%edx)
2412 movzwl -2(%eax), %ecx
2413 movw %cx, -2(%edx)
/* Return value: %edx for mempcpy, otherwise the value at DEST(%esp). */
Elliott Hughesbed110a2016-03-03 10:41:42 -08002414#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002415 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002416#else
Jack Renc47703a2012-02-14 12:01:52 +04002417 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002418#endif
2419 RETURN
2420
/*
 * "_align" forward tail copy for 47, 31, 15 or 7 remaining bytes.
 * 16-byte pieces use movdqa, the 15-byte entry adds an 8-byte movq and
 * the 7-byte entry moves 4 + 2 + 1 bytes (offsets -7, -3 and -1).
 * NOTE(review): movdqa needs 16-byte-aligned operands; that guarantee
 * comes from code outside this view.
 */
2421 .p2align 4
2422L(fwd_write_47bytes_align):
2423 movdqa -47(%eax), %xmm0
2424 movdqa %xmm0, -47(%edx)
2425L(fwd_write_31bytes_align):
2426 movdqa -31(%eax), %xmm0
2427 movdqa %xmm0, -31(%edx)
2428L(fwd_write_15bytes_align):
2429 movq -15(%eax), %xmm0
2430 movq %xmm0, -15(%edx)
2431L(fwd_write_7bytes_align):
2432 movl -7(%eax), %ecx
2433 movl %ecx, -7(%edx)
/* Last two loads precede the stores; the byte load overwrites %eax. */
2434 movzwl -3(%eax), %ecx
2435 movzbl -1(%eax), %eax
2436 movw %cx, -3(%edx)
2437 movb %al, -1(%edx)
/* Return value: %edx for mempcpy, otherwise the value at DEST(%esp). */
Elliott Hughesbed110a2016-03-03 10:41:42 -08002438#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002439 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002440#else
Jack Renc47703a2012-02-14 12:01:52 +04002441 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002442#endif
2443 RETURN
2444
/*
 * "_align" forward tail copy for 39 or 23 remaining bytes.
 * One movdqa (16-byte) pair per entry, then the last 7 bytes as
 * 4 + 2 + 1 bytes (offsets -7, -3 and -1).
 * NOTE(review): movdqa needs 16-byte-aligned operands; that guarantee
 * comes from code outside this view.
 */
2445 .p2align 4
2446L(fwd_write_39bytes_align):
2447 movdqa -39(%eax), %xmm0
2448 movdqa %xmm0, -39(%edx)
2449L(fwd_write_23bytes_align):
2450 movdqa -23(%eax), %xmm0
2451 movdqa %xmm0, -23(%edx)
2452 movl -7(%eax), %ecx
2453 movl %ecx, -7(%edx)
/* Last two loads precede the stores; the byte load overwrites %eax. */
2454 movzwl -3(%eax), %ecx
2455 movzbl -1(%eax), %eax
2456 movw %cx, -3(%edx)
2457 movb %al, -1(%edx)
/* Return value: %edx for mempcpy, otherwise the value at DEST(%esp). */
Elliott Hughesbed110a2016-03-03 10:41:42 -08002458#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002459 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002460#else
Jack Renc47703a2012-02-14 12:01:52 +04002461 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002462#endif
2463 RETURN
2464
/*
 * "_align" forward tail copy for 43, 27, 11 or 3 remaining bytes.
 * 16-byte pieces use movdqa, the 11-byte entry adds an 8-byte movq and
 * the 3-byte entry moves a halfword (offset -3) plus a byte
 * (offset -1).
 * NOTE(review): movdqa needs 16-byte-aligned operands; that guarantee
 * comes from code outside this view.
 */
2465 .p2align 4
2466L(fwd_write_43bytes_align):
2467 movdqa -43(%eax), %xmm0
2468 movdqa %xmm0, -43(%edx)
2469L(fwd_write_27bytes_align):
2470 movdqa -27(%eax), %xmm0
2471 movdqa %xmm0, -27(%edx)
2472L(fwd_write_11bytes_align):
2473 movq -11(%eax), %xmm0
2474 movq %xmm0, -11(%edx)
/* Both loads come first; the byte load reuses %eax as its target. */
2475L(fwd_write_3bytes_align):
2476 movzwl -3(%eax), %ecx
2477 movzbl -1(%eax), %eax
2478 movw %cx, -3(%edx)
2479 movb %al, -1(%edx)
/* Return value: %edx for mempcpy, otherwise the value at DEST(%esp). */
Elliott Hughesbed110a2016-03-03 10:41:42 -08002480#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002481 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002482#else
Jack Renc47703a2012-02-14 12:01:52 +04002483 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002484#endif
2485 RETURN
2486
/*
 * "_align" forward tail copy for 35 or 19 remaining bytes.
 * One movdqa (16-byte) pair per entry, then the last 3 bytes as a
 * halfword (offset -3) and a byte (offset -1).
 * NOTE(review): movdqa needs 16-byte-aligned operands; that guarantee
 * comes from code outside this view.
 */
2487 .p2align 4
2488L(fwd_write_35bytes_align):
2489 movdqa -35(%eax), %xmm0
2490 movdqa %xmm0, -35(%edx)
2491L(fwd_write_19bytes_align):
2492 movdqa -19(%eax), %xmm0
2493 movdqa %xmm0, -19(%edx)
/* Both loads come first; the byte load reuses %eax as its target. */
2494 movzwl -3(%eax), %ecx
2495 movzbl -1(%eax), %eax
2496 movw %cx, -3(%edx)
2497 movb %al, -1(%edx)
/* Return value: %edx for mempcpy, otherwise the value at DEST(%esp). */
Elliott Hughesbed110a2016-03-03 10:41:42 -08002498#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002499 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002500#else
Jack Renc47703a2012-02-14 12:01:52 +04002501 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002502#endif
2503 RETURN
2504
/*
 * "_align" forward tail copy for 44, 28, 12 or 4 remaining bytes.
 * 16-byte pieces use movdqa, the 12-byte entry adds an 8-byte movq and
 * the 4-byte entry moves the final word (offset -4) through %ecx.
 * NOTE(review): movdqa needs 16-byte-aligned operands; that guarantee
 * comes from code outside this view.
 */
2505 .p2align 4
2506L(fwd_write_44bytes_align):
2507 movdqa -44(%eax), %xmm0
2508 movdqa %xmm0, -44(%edx)
2509L(fwd_write_28bytes_align):
2510 movdqa -28(%eax), %xmm0
2511 movdqa %xmm0, -28(%edx)
2512L(fwd_write_12bytes_align):
2513 movq -12(%eax), %xmm0
2514 movq %xmm0, -12(%edx)
2515L(fwd_write_4bytes_align):
2516 movl -4(%eax), %ecx
2517 movl %ecx, -4(%edx)
/* Return value: %edx for mempcpy, otherwise the value at DEST(%esp). */
Elliott Hughesbed110a2016-03-03 10:41:42 -08002518#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002519 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002520#else
Jack Renc47703a2012-02-14 12:01:52 +04002521 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002522#endif
2523 RETURN
2524
/*
 * "_align" forward tail copy for 36 or 20 remaining bytes.
 * One movdqa (16-byte) pair per entry, then the final 4-byte word at
 * offset -4.  This stub ends with RETURN_END instead of RETURN; both
 * macros are defined outside this view.
 * NOTE(review): movdqa needs 16-byte-aligned operands; that guarantee
 * comes from code outside this view.
 */
2525 .p2align 4
2526L(fwd_write_36bytes_align):
2527 movdqa -36(%eax), %xmm0
2528 movdqa %xmm0, -36(%edx)
2529L(fwd_write_20bytes_align):
2530 movdqa -20(%eax), %xmm0
2531 movdqa %xmm0, -20(%edx)
2532 movl -4(%eax), %ecx
2533 movl %ecx, -4(%edx)
/* Return value: %edx for mempcpy, otherwise the value at DEST(%esp). */
Elliott Hughesbed110a2016-03-03 10:41:42 -08002534#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002535 movl %edx, %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002536#else
Jack Renc47703a2012-02-14 12:01:52 +04002537 movl DEST(%esp), %eax
Jack Renc47703a2012-02-14 12:01:52 +04002538#endif
Bruce Beare124a5422010-10-11 12:24:41 -07002539 RETURN_END
Bruce Beare8ff1a272010-03-04 11:03:37 -08002540
/*
 * L(large_page): forward copy using non-temporal stores (movntdq).
 *
 * Registers as used here: %eax = source cursor, %edx = destination
 * cursor, %ecx = remaining byte count.  Loads are unaligned (movdqu).
 * NOTE(review): movntdq requires a 16-byte-aligned destination; the
 * alignment of %edx is presumably established before the jump here --
 * confirm against the caller.  The code also runs the 128-byte loop
 * body once before testing the count, so it assumes at least 0x90
 * bytes remain on entry -- confirm against the caller.
 *
 * CFI_PUSH and POP are macros defined outside this view; the CFI_PUSH
 * below presumably records that %edi is already on the stack when this
 * label is reached.
 */
Jack Renc47703a2012-02-14 12:01:52 +04002541 CFI_PUSH (%edi)
2542
2543 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002544L(large_page):
2545 movdqu (%eax), %xmm1
/*
 * memmove only: store %xmm0 to the address read from DEST+4(%esp).
 * %xmm0 is not loaded in this block; presumably it holds the first
 * 16 source bytes saved by the caller -- verify.
 */
Jack Renc47703a2012-02-14 12:01:52 +04002546#ifdef USE_AS_MEMMOVE
2547 movl DEST+4(%esp), %edi
2548 movdqu %xmm0, (%edi)
2549#endif
Bruce Beare8ff1a272010-03-04 11:03:37 -08002550 lea 16(%eax), %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08002551 movntdq %xmm1, (%edx)
2552 lea 16(%edx), %edx
/* 0x90 = 16 bytes just copied + 0x80 bias for the loop's count test. */
Bruce Beare8ff1a272010-03-04 11:03:37 -08002553 lea -0x90(%ecx), %ecx
2554 POP (%edi)
Jack Renc47703a2012-02-14 12:01:52 +04002555
2556 .p2align 4
/* Main loop: 128 bytes per iteration through %xmm0-%xmm7. */
Bruce Beare8ff1a272010-03-04 11:03:37 -08002557L(large_page_loop):
2558 movdqu (%eax), %xmm0
2559 movdqu 0x10(%eax), %xmm1
2560 movdqu 0x20(%eax), %xmm2
2561 movdqu 0x30(%eax), %xmm3
2562 movdqu 0x40(%eax), %xmm4
2563 movdqu 0x50(%eax), %xmm5
2564 movdqu 0x60(%eax), %xmm6
2565 movdqu 0x70(%eax), %xmm7
2566 lea 0x80(%eax), %eax
2567
/*
 * The carry flag set by this sub is consumed by the jae below; the
 * intervening movntdq and lea instructions do not modify flags.
 */
2568 sub $0x80, %ecx
2569 movntdq %xmm0, (%edx)
2570 movntdq %xmm1, 0x10(%edx)
2571 movntdq %xmm2, 0x20(%edx)
2572 movntdq %xmm3, 0x30(%edx)
2573 movntdq %xmm4, 0x40(%edx)
2574 movntdq %xmm5, 0x50(%edx)
2575 movntdq %xmm6, 0x60(%edx)
2576 movntdq %xmm7, 0x70(%edx)
2577 lea 0x80(%edx), %edx
2578 jae L(large_page_loop)
/*
 * Loop exit: %ecx is in [-0x80, -1].  The cmp decides whether at least
 * 64 bytes remain; the lea removes the 0x80 bias without touching the
 * flags that jl tests.
 */
2579 cmp $-0x40, %ecx
2580 lea 0x80(%ecx), %ecx
2581 jl L(large_page_less_64bytes)
2582
/* 64-byte step. */
2583 movdqu (%eax), %xmm0
2584 movdqu 0x10(%eax), %xmm1
2585 movdqu 0x20(%eax), %xmm2
2586 movdqu 0x30(%eax), %xmm3
2587 lea 0x40(%eax), %eax
2588
2589 movntdq %xmm0, (%edx)
2590 movntdq %xmm1, 0x10(%edx)
2591 movntdq %xmm2, 0x20(%edx)
2592 movntdq %xmm3, 0x30(%edx)
2593 lea 0x40(%edx), %edx
2594 sub $0x40, %ecx
/* Optional 32-byte step. */
2595L(large_page_less_64bytes):
2596 cmp $32, %ecx
2597 jb L(large_page_less_32bytes)
2598 movdqu (%eax), %xmm0
2599 movdqu 0x10(%eax), %xmm1
2600 lea 0x20(%eax), %eax
2601 movntdq %xmm0, (%edx)
2602 movntdq %xmm1, 0x10(%edx)
2603 lea 0x20(%edx), %edx
2604 sub $0x20, %ecx
/*
 * Fewer than 32 bytes left.  Advance both cursors past the end of the
 * region (the fwd_write_* stubs use negative offsets), fence the
 * non-temporal stores, then dispatch on %ecx through the forward
 * jump table.
 */
2605L(large_page_less_32bytes):
2606 add %ecx, %edx
2607 add %ecx, %eax
2608 sfence
2609 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
2610
/*
 * Backward-path tail copy for 44, 36, 28, 20, 12, 4 or 0 remaining
 * bytes.  Here %eax (source) and %edx (destination) address the FIRST
 * byte of the remaining region, so displacements are non-negative and
 * the highest offsets are copied first.  Each label copies 8 bytes
 * through %xmm0 and falls through; the 4-byte entry moves the word at
 * offset 0.
 * Return value in %eax: the value at DEST(%esp), plus the value at
 * LEN(%esp) when USE_AS_MEMPCPY is defined.  DEST, LEN and RETURN are
 * macros defined outside this view.
 */
Jack Renc47703a2012-02-14 12:01:52 +04002611 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002612L(bk_write_44bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002613 movq 36(%eax), %xmm0
2614 movq %xmm0, 36(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002615L(bk_write_36bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002616 movq 28(%eax), %xmm0
2617 movq %xmm0, 28(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002618L(bk_write_28bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002619 movq 20(%eax), %xmm0
2620 movq %xmm0, 20(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002621L(bk_write_20bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002622 movq 12(%eax), %xmm0
2623 movq %xmm0, 12(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002624L(bk_write_12bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002625 movq 4(%eax), %xmm0
2626 movq %xmm0, 4(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002627L(bk_write_4bytes):
2628 movl (%eax), %ecx
2629 movl %ecx, (%edx)
/* Nothing left to copy: only the return value is set up. */
2630L(bk_write_0bytes):
Bruce Beare8ff1a272010-03-04 11:03:37 -08002631 movl DEST(%esp), %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002632#ifdef USE_AS_MEMPCPY
Bruce Beare8ff1a272010-03-04 11:03:37 -08002633 movl LEN(%esp), %ecx
2634 add %ecx, %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08002635#endif
2636 RETURN
2637
/*
 * Backward-path tail copy for 40, 32, 24, 16 or 8 remaining bytes.
 * %eax/%edx address the first byte of the remaining region; each label
 * copies 8 bytes through %xmm0 (highest offset first) and falls
 * through down to offset 0.
 */
Jack Renc47703a2012-02-14 12:01:52 +04002638 .p2align 4
2639L(bk_write_40bytes):
2640 movq 32(%eax), %xmm0
2641 movq %xmm0, 32(%edx)
2642L(bk_write_32bytes):
2643 movq 24(%eax), %xmm0
2644 movq %xmm0, 24(%edx)
2645L(bk_write_24bytes):
2646 movq 16(%eax), %xmm0
2647 movq %xmm0, 16(%edx)
2648L(bk_write_16bytes):
2649 movq 8(%eax), %xmm0
2650 movq %xmm0, 8(%edx)
2651L(bk_write_8bytes):
2652 movq (%eax), %xmm0
2653 movq %xmm0, (%edx)
/* Return value: DEST(%esp), plus LEN(%esp) for mempcpy. */
Jack Renc47703a2012-02-14 12:01:52 +04002654 movl DEST(%esp), %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002655#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002656 movl LEN(%esp), %ecx
2657 add %ecx, %eax
Jack Renc47703a2012-02-14 12:01:52 +04002658#endif
2659 RETURN
2660
/*
 * Backward-path tail copy for 45, 37, 29, 21, 13, 5 or 1 remaining
 * bytes.  %eax/%edx address the first byte of the remaining region.
 * Each label from 45 down to 13 copies 8 bytes through %xmm0 and falls
 * through; the 5-byte entry moves the word at offset 1 and the 1-byte
 * entry moves the byte at offset 0.
 */
2661 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002662L(bk_write_45bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002663 movq 37(%eax), %xmm0
2664 movq %xmm0, 37(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002665L(bk_write_37bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002666 movq 29(%eax), %xmm0
2667 movq %xmm0, 29(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002668L(bk_write_29bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002669 movq 21(%eax), %xmm0
2670 movq %xmm0, 21(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002671L(bk_write_21bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002672 movq 13(%eax), %xmm0
2673 movq %xmm0, 13(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002674L(bk_write_13bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002675 movq 5(%eax), %xmm0
2676 movq %xmm0, 5(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002677L(bk_write_5bytes):
2678 movl 1(%eax), %ecx
2679 movl %ecx, 1(%edx)
2680L(bk_write_1bytes):
2681 movzbl (%eax), %ecx
2682 movb %cl, (%edx)
/* Return value: DEST(%esp), plus LEN(%esp) for mempcpy. */
Bruce Beare8ff1a272010-03-04 11:03:37 -08002683 movl DEST(%esp), %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002684#ifdef USE_AS_MEMPCPY
Bruce Beare8ff1a272010-03-04 11:03:37 -08002685 movl LEN(%esp), %ecx
2686 add %ecx, %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08002687#endif
2688 RETURN
2689
/*
 * Backward-path tail copy for 41, 33, 25, 17 or 9 remaining bytes.
 * %eax/%edx address the first byte of the remaining region.  Each
 * label copies 8 bytes through %xmm0 and falls through; the final byte
 * at offset 0 is moved through %cl.
 */
Jack Renc47703a2012-02-14 12:01:52 +04002690 .p2align 4
2691L(bk_write_41bytes):
2692 movq 33(%eax), %xmm0
2693 movq %xmm0, 33(%edx)
2694L(bk_write_33bytes):
2695 movq 25(%eax), %xmm0
2696 movq %xmm0, 25(%edx)
2697L(bk_write_25bytes):
2698 movq 17(%eax), %xmm0
2699 movq %xmm0, 17(%edx)
2700L(bk_write_17bytes):
2701 movq 9(%eax), %xmm0
2702 movq %xmm0, 9(%edx)
2703L(bk_write_9bytes):
2704 movq 1(%eax), %xmm0
2705 movq %xmm0, 1(%edx)
2706 movzbl (%eax), %ecx
2707 movb %cl, (%edx)
/* Return value: DEST(%esp), plus LEN(%esp) for mempcpy. */
Jack Renc47703a2012-02-14 12:01:52 +04002708 movl DEST(%esp), %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002709#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002710 movl LEN(%esp), %ecx
2711 add %ecx, %eax
Jack Renc47703a2012-02-14 12:01:52 +04002712#endif
2713 RETURN
2714
/*
 * Backward-path tail copy for 46, 38, 30, 22, 14 or 6 remaining bytes.
 * %eax/%edx address the first byte of the remaining region.  Each
 * label above the 6-byte entry copies 8 bytes through %xmm0 and falls
 * through; the 6-byte entry moves the word at offset 2 and the
 * halfword at offset 0.
 */
2715 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002716L(bk_write_46bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002717 movq 38(%eax), %xmm0
2718 movq %xmm0, 38(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002719L(bk_write_38bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002720 movq 30(%eax), %xmm0
2721 movq %xmm0, 30(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002722L(bk_write_30bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002723 movq 22(%eax), %xmm0
2724 movq %xmm0, 22(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002725L(bk_write_22bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002726 movq 14(%eax), %xmm0
2727 movq %xmm0, 14(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002728L(bk_write_14bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002729 movq 6(%eax), %xmm0
2730 movq %xmm0, 6(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002731L(bk_write_6bytes):
2732 movl 2(%eax), %ecx
2733 movl %ecx, 2(%edx)
Jack Renc47703a2012-02-14 12:01:52 +04002734 movzwl (%eax), %ecx
2735 movw %cx, (%edx)
/* Return value: DEST(%esp), plus LEN(%esp) for mempcpy. */
Jack Renc47703a2012-02-14 12:01:52 +04002736 movl DEST(%esp), %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002737#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002738 movl LEN(%esp), %ecx
2739 add %ecx, %eax
Jack Renc47703a2012-02-14 12:01:52 +04002740#endif
2741 RETURN
2742
/*
 * Backward-path tail copy for 42, 34, 26, 18, 10 or 2 remaining bytes.
 * %eax/%edx address the first byte of the remaining region.  Each
 * label copies 8 bytes through %xmm0 and falls through; the 2-byte
 * entry moves the halfword at offset 0 through %cx.
 */
2743 .p2align 4
2744L(bk_write_42bytes):
2745 movq 34(%eax), %xmm0
2746 movq %xmm0, 34(%edx)
2747L(bk_write_34bytes):
2748 movq 26(%eax), %xmm0
2749 movq %xmm0, 26(%edx)
2750L(bk_write_26bytes):
2751 movq 18(%eax), %xmm0
2752 movq %xmm0, 18(%edx)
2753L(bk_write_18bytes):
2754 movq 10(%eax), %xmm0
2755 movq %xmm0, 10(%edx)
2756L(bk_write_10bytes):
2757 movq 2(%eax), %xmm0
2758 movq %xmm0, 2(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002759L(bk_write_2bytes):
2760 movzwl (%eax), %ecx
2761 movw %cx, (%edx)
/* Return value: DEST(%esp), plus LEN(%esp) for mempcpy. */
Bruce Beare8ff1a272010-03-04 11:03:37 -08002762 movl DEST(%esp), %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002763#ifdef USE_AS_MEMPCPY
Bruce Beare8ff1a272010-03-04 11:03:37 -08002764 movl LEN(%esp), %ecx
2765 add %ecx, %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08002766#endif
2767 RETURN
2768
/*
 * Backward-path tail copy for 47, 39, 31, 23, 15 or 7 remaining bytes.
 * %eax/%edx address the first byte of the remaining region.  Each
 * label above the 7-byte entry copies 8 bytes through %xmm0 and falls
 * through; the 7-byte entry moves 4 + 2 + 1 bytes (offsets 3, 1, 0).
 */
Jack Renc47703a2012-02-14 12:01:52 +04002769 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002770L(bk_write_47bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002771 movq 39(%eax), %xmm0
2772 movq %xmm0, 39(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002773L(bk_write_39bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002774 movq 31(%eax), %xmm0
2775 movq %xmm0, 31(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002776L(bk_write_31bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002777 movq 23(%eax), %xmm0
2778 movq %xmm0, 23(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002779L(bk_write_23bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002780 movq 15(%eax), %xmm0
2781 movq %xmm0, 15(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002782L(bk_write_15bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002783 movq 7(%eax), %xmm0
2784 movq %xmm0, 7(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002785L(bk_write_7bytes):
2786 movl 3(%eax), %ecx
2787 movl %ecx, 3(%edx)
Jack Renc47703a2012-02-14 12:01:52 +04002788 movzwl 1(%eax), %ecx
2789 movw %cx, 1(%edx)
/* Final byte is loaded into %eax itself; no source access follows. */
2790 movzbl (%eax), %eax
2791 movb %al, (%edx)
/* Return value: DEST(%esp), plus LEN(%esp) for mempcpy. */
Jack Renc47703a2012-02-14 12:01:52 +04002792 movl DEST(%esp), %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002793#ifdef USE_AS_MEMPCPY
Jack Renc47703a2012-02-14 12:01:52 +04002794 movl LEN(%esp), %ecx
2795 add %ecx, %eax
Jack Renc47703a2012-02-14 12:01:52 +04002796#endif
2797 RETURN
2798
/*
 * Backward-path tail copy for 43, 35, 27, 19, 11 or 3 remaining bytes.
 * %eax/%edx address the first byte of the remaining region.  Each
 * label copies 8 bytes through %xmm0 and falls through; the 3-byte
 * entry moves the halfword at offset 1 and the byte at offset 0.
 * This stub ends with RETURN_END instead of RETURN; both macros are
 * defined outside this view.
 */
2799 .p2align 4
2800L(bk_write_43bytes):
2801 movq 35(%eax), %xmm0
2802 movq %xmm0, 35(%edx)
2803L(bk_write_35bytes):
2804 movq 27(%eax), %xmm0
2805 movq %xmm0, 27(%edx)
2806L(bk_write_27bytes):
2807 movq 19(%eax), %xmm0
2808 movq %xmm0, 19(%edx)
2809L(bk_write_19bytes):
2810 movq 11(%eax), %xmm0
2811 movq %xmm0, 11(%edx)
2812L(bk_write_11bytes):
2813 movq 3(%eax), %xmm0
2814 movq %xmm0, 3(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002815L(bk_write_3bytes):
2816 movzwl 1(%eax), %ecx
2817 movw %cx, 1(%edx)
/* Final byte is loaded into %eax itself; no source access follows. */
2818 movzbl (%eax), %eax
2819 movb %al, (%edx)
/* Return value: DEST(%esp), plus LEN(%esp) for mempcpy. */
Bruce Beare8ff1a272010-03-04 11:03:37 -08002820 movl DEST(%esp), %eax
Elliott Hughesbed110a2016-03-03 10:41:42 -08002821#ifdef USE_AS_MEMPCPY
Bruce Beare8ff1a272010-03-04 11:03:37 -08002822 movl LEN(%esp), %ecx
2823 add %ecx, %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08002824#endif
2825 RETURN_END
2826
2827
/*
 * Jump table for the forward tail stubs, placed in .rodata.ssse3 and
 * 4-byte aligned.  It has 48 entries, one 32-bit .int each: entry N
 * refers to L(fwd_write_Nbytes) for N = 0..47.  L(large_page) indexes
 * it with the remaining byte count and a scale of 4 through
 * BRANCH_TO_JMPTBL_ENTRY.
 * JMPTBL is a macro defined outside this view; it receives the target
 * label and the table base, so each entry is presumably encoded
 * relative to the table -- verify against the macro definition.
 * The matching .popsection is not within this view.
 */
2828 .pushsection .rodata.ssse3,"a",@progbits
Jack Renc47703a2012-02-14 12:01:52 +04002829 .p2align 2
Bruce Beare8ff1a272010-03-04 11:03:37 -08002830L(table_48bytes_fwd):
2831 .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
2832 .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
2833 .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
2834 .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
2835 .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
2836 .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
2837 .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
2838 .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
2839 .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
2840 .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
2841 .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
2842 .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
2843 .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
2844 .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
2845 .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
2846 .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
2847 .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
2848 .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
2849 .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
2850 .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
2851 .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
2852 .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
2853 .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
2854 .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
2855 .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
2856 .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
2857 .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
2858 .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
2859 .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
2860 .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
2861 .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
2862 .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
2863 .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
2864 .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
2865 .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
2866 .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
2867 .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
2868 .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
2869 .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
2870 .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
2871 .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
2872 .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
2873 .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
2874 .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
2875 .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
2876 .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
2877 .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
2878 .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
2879
/* Jump table for the "_align" forward tail handlers.

   The table is aligned to 4 bytes and holds 48 four-byte (.int) entries.
   Entry N (N = 0..47) is built by the JMPTBL macro from the handler label
   L(fwd_write_Nbytes_align) and this table's own label; JMPTBL is defined
   outside this excerpt.

   NOTE(review): presumably indexed by the number of bytes that remain to
   be copied; the dispatch site is not visible in this excerpt -- confirm
   against the caller before relying on it.  */
	.p2align 2
L(table_48bytes_fwd_align):
	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
2930
/* Jump table for the L(shl_0) .. L(shl_15) code paths.

   The table is aligned to 4 bytes and holds 16 four-byte (.int) entries.
   Entry N is built by the JMPTBL macro from L(shl_N) and this table's own
   label; JMPTBL and the L(shl_N) targets are defined outside this excerpt.

   NOTE(review): presumably indexed by a 0..15 byte misalignment value;
   the dispatch site is not visible in this excerpt -- confirm.  */
	.p2align 2
L(shl_table):
	.int	JMPTBL (L(shl_0), L(shl_table))
	.int	JMPTBL (L(shl_1), L(shl_table))
	.int	JMPTBL (L(shl_2), L(shl_table))
	.int	JMPTBL (L(shl_3), L(shl_table))
	.int	JMPTBL (L(shl_4), L(shl_table))
	.int	JMPTBL (L(shl_5), L(shl_table))
	.int	JMPTBL (L(shl_6), L(shl_table))
	.int	JMPTBL (L(shl_7), L(shl_table))
	.int	JMPTBL (L(shl_8), L(shl_table))
	.int	JMPTBL (L(shl_9), L(shl_table))
	.int	JMPTBL (L(shl_10), L(shl_table))
	.int	JMPTBL (L(shl_11), L(shl_table))
	.int	JMPTBL (L(shl_12), L(shl_table))
	.int	JMPTBL (L(shl_13), L(shl_table))
	.int	JMPTBL (L(shl_14), L(shl_table))
	.int	JMPTBL (L(shl_15), L(shl_table))
2949
/* Jump table for the backward tail handlers.

   The table is aligned to 4 bytes and holds 48 four-byte (.int) entries.
   Entry N (N = 0..47) is built by the JMPTBL macro from the handler label
   L(bk_write_Nbytes) and this table's own label; JMPTBL and the handlers
   are defined outside this excerpt.

   The dispatch at L(bk_write_less32bytes_2) selects an entry with ECX as
   the index and 4 as the scale, i.e. one .int slot per index value.  */
	.p2align 2
L(table_48_bytes_bwd):
	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))

	/* Return to the section that was current before the matching
	   .pushsection (which is outside this excerpt).  */
	.popsection
3002
#ifdef USE_AS_MEMMOVE
/* Backward copy path: copies from the highest address down to the lowest.
   Assembled only when USE_AS_MEMMOVE is defined.
   NOTE(review): presumably taken when a forward copy would overwrite
   source bytes that have not been read yet; the branch to
   L(copy_backward) is outside this excerpt -- confirm.

   Register roles, as established by the code below:
     on entry  EAX = start of source, EDX = start of destination,
               ECX = number of bytes to copy;
     in flight EDI = one past the last uncopied source byte (loads),
               EDX = one past the last uncopied destination byte (stores),
               ECX = bytes still to copy,
               EAX = scratch for the 1/2/4-byte alignment copies.
   EDI is saved with PUSH on entry and restored with POP immediately
   before the tail dispatch.  PUSH, POP, CFI_PUSH and
   BRANCH_TO_JMPTBL_ENTRY are macros defined outside this excerpt.  */
	.p2align 4
L(copy_backward):
	PUSH (%edi)
	movl	%eax, %edi
	/* Move both pointers to the end of their buffers.  */
	lea	(%ecx,%edx,1),%edx
	lea	(%ecx,%edi,1),%edi
	/* Destination end not 4-byte aligned: align it first.  */
	testl	$0x3, %edx
	jnz	L(bk_align)

L(bk_aligned_4):
	cmp	$64, %ecx
	jae	L(bk_write_more64bytes)

L(bk_write_64bytesless):
	cmp	$32, %ecx
	jb	L(bk_write_less32bytes)

L(bk_write_more32bytes):
	/* Copy one 32-byte chunk as four 8-byte moves, highest address
	   first, then step both pointers down by 32.  */
	sub	$32, %ecx
	movq	-8(%edi), %xmm0
	movq	%xmm0, -8(%edx)
	movq	-16(%edi), %xmm0
	movq	%xmm0, -16(%edx)
	movq	-24(%edi), %xmm0
	movq	%xmm0, -24(%edx)
	movq	-32(%edi), %xmm0
	movq	%xmm0, -32(%edx)
	sub	$32, %edx
	sub	$32, %edi

L(bk_write_less32bytes):
	/* Rewind both pointers by the remaining count so that EAX (source)
	   and EDX (destination) address the first remaining byte, restore
	   the saved EDI, and dispatch on ECX through the backward table.  */
	movl	%edi, %eax
	sub	%ecx, %edx
	sub	%ecx, %eax
	POP (%edi)
L(bk_write_less32bytes_2):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)

	/* The code below is entered by jumps taken while EDI is still
	   saved on the stack.
	   NOTE(review): CFI_PUSH presumably re-declares that state for the
	   unwinder after the POP above -- confirm against the macro.  */
	CFI_PUSH (%edi)

	.p2align 4
L(bk_align):
	/* With 8 bytes or fewer left, skip alignment and go to the tail.  */
	cmp	$8, %ecx
	jbe	L(bk_write_less32bytes)
	testl	$1, %edx
	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0
	   then (EDX & 2) must be != 0.  */
	jz	L(bk_got2)
	/* Odd destination end: copy a single byte.  */
	sub	$1, %edi
	sub	$1, %ecx
	sub	$1, %edx
	movzbl	(%edi), %eax
	movb	%al, (%edx)

	testl	$2, %edx
	jz	L(bk_aligned_4)

L(bk_got2):
	/* Destination end is 2 mod 4: copy two bytes to reach 4-byte
	   alignment.  */
	sub	$2, %edi
	sub	$2, %ecx
	sub	$2, %edx
	movzwl	(%edi), %eax
	movw	%ax, (%edx)
	jmp	L(bk_aligned_4)

	.p2align 4
L(bk_write_more64bytes):
	/* Check alignment of last byte.  */
	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

/* EDX is aligned to 4 bytes, but not to 16 bytes.  Copy 4 bytes at a
   time, at most three times, until EDX is 16-byte aligned; the check
   after the third copy is omitted because three steps always suffice.  */
L(bk_ssse3_align):
	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

L(bk_ssse3_cpy_pre):
	/* The alignment copies above may have dropped ECX below 64; if so
	   use the 32-byte path instead of the 64-byte loop.  Within this
	   excerpt L(bk_write_more64bytes) is entered with ECX >= 64 and at
	   most 12 bytes are consumed before this point, so at least 32
	   bytes remain.  */
	cmp	$64, %ecx
	jb	L(bk_write_more32bytes)

	.p2align 4
L(bk_ssse3_cpy):
	/* Main loop: 64 bytes per iteration.  Loads use movdqu because the
	   source alignment is unknown; stores use movdqa because EDX is
	   16-byte aligned here.  */
	sub	$64, %edi
	sub	$64, %ecx
	sub	$64, %edx
	movdqu	0x30(%edi), %xmm3
	movdqa	%xmm3, 0x30(%edx)
	movdqu	0x20(%edi), %xmm2
	movdqa	%xmm2, 0x20(%edx)
	movdqu	0x10(%edi), %xmm1
	movdqa	%xmm1, 0x10(%edx)
	movdqu	(%edi), %xmm0
	movdqa	%xmm0, (%edx)
	cmp	$64, %ecx
	jae	L(bk_ssse3_cpy)
	jmp	L(bk_write_64bytesless)

#endif
3124
3125END (MEMCPY)