blob: 2b3b7a53df4141cf67bbb6c9924f9f3529686cf3 [file] [log] [blame]
Bruce Beare8ff1a272010-03-04 11:03:37 -08001/*
2Copyright (c) 2010, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8 * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10
11 * Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14
15 * Neither the name of Intel Corporation nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
Liubov Dmitrieva0a490662012-01-17 12:55:46 +040031#include "cache.h"
Liubov Dmitrieva0a490662012-01-17 12:55:46 +040032
/* Name of the symbol this file defines; defaults to memcpy unless the
   includer has already chosen another name.  It is the argument given
   to ENTRY for the routine in this file. */
Bruce Beare8ff1a272010-03-04 11:03:37 -080033#ifndef MEMCPY
Liubov Dmitrieva0a490662012-01-17 12:55:46 +040034# define MEMCPY memcpy
Bruce Beare8ff1a272010-03-04 11:03:37 -080035#endif
36
/* USE_AS_MEMMOVE is forced on when the includer did not set it, so every
   `#ifdef USE_AS_MEMMOVE' section in this file is always assembled and
   every `#ifndef USE_AS_MEMMOVE' section is always skipped. */
Haibo Huang8a0f0ed2018-05-24 20:39:18 -070037#ifndef USE_AS_MEMMOVE
38# define USE_AS_MEMMOVE
39#endif
40
/* L(label) pastes a `.L' prefix onto LABEL; all internal branch targets
   in this file are written through this macro. */
Bruce Beare8ff1a272010-03-04 11:03:37 -080041#ifndef L
42# define L(label) .L##label
43#endif
44
/* Fallback definitions for the cfi_* helper macros.  Each one expands to
   the assembler directive of the same name (with a leading dot) and is
   only defined here when the includer has not already provided it. */
Bruce Beare8ff1a272010-03-04 11:03:37 -080045#ifndef cfi_startproc
Jack Renc47703a2012-02-14 12:01:52 +040046# define cfi_startproc .cfi_startproc
Bruce Beare8ff1a272010-03-04 11:03:37 -080047#endif
48
49#ifndef cfi_endproc
Jack Renc47703a2012-02-14 12:01:52 +040050# define cfi_endproc .cfi_endproc
Bruce Beare8ff1a272010-03-04 11:03:37 -080051#endif
52
/* cfi_rel_offset (reg, off): forwards both arguments to .cfi_rel_offset. */
53#ifndef cfi_rel_offset
54# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
55#endif
56
/* cfi_restore (reg): forwards the register to .cfi_restore. */
57#ifndef cfi_restore
Jack Renc47703a2012-02-14 12:01:52 +040058# define cfi_restore(reg) .cfi_restore reg
Bruce Beare8ff1a272010-03-04 11:03:37 -080059#endif
60
/* cfi_adjust_cfa_offset (off): forwards the delta to
   .cfi_adjust_cfa_offset. */
61#ifndef cfi_adjust_cfa_offset
62# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
63#endif
64
/* ENTRY (name): open a function.  Marks NAME as a function-type symbol,
   exports it with .globl, aligns it with `.p2align 4', emits the NAME
   label and starts a CFI region with cfi_startproc.  Must be paired
   with END (name). */
65#ifndef ENTRY
Jack Renc47703a2012-02-14 12:01:52 +040066# define ENTRY(name) \
67 .type name, @function; \
68 .globl name; \
69 .p2align 4; \
70name: \
Bruce Beare8ff1a272010-03-04 11:03:37 -080071 cfi_startproc
72#endif
73
/* ALIAS_SYMBOL (alias, original): export ALIAS as a global symbol and
   set it equal to ORIGINAL with .equ. */
Haibo Huang8a0f0ed2018-05-24 20:39:18 -070074#ifndef ALIAS_SYMBOL
75# define ALIAS_SYMBOL(alias, original) \
76 .globl alias; \
77 .equ alias, original
78#endif
79
/* END (name): close a function opened with ENTRY.  Ends the CFI region
   and records the symbol size as the distance from NAME to the current
   location (`.-name'). */
Bruce Beare8ff1a272010-03-04 11:03:37 -080080#ifndef END
Jack Renc47703a2012-02-14 12:01:52 +040081# define END(name) \
82 cfi_endproc; \
Bruce Beare8ff1a272010-03-04 11:03:37 -080083 .size name, .-name
84#endif
85
/* Offsets from %esp of the three arguments; the entry code reads them as
   DEST(%esp), SRC(%esp) and LEN(%esp).  The first sits at PARMS (chosen
   per build mode elsewhere in this file) and each following one is 4
   bytes further.  These are plain textual expansions, so an expression
   such as `DEST+4' simply adds 4 to the expanded offset. */
Elliott Hughesbed110a2016-03-03 10:41:42 -080086#define DEST PARMS
87#define SRC DEST+4
88#define LEN SRC+4
Bruce Beare8ff1a272010-03-04 11:03:37 -080089
/* CFI_PUSH (REG): unwind annotation matching a 4-byte push of REG.
   Adjusts the CFA offset by +4 and issues cfi_rel_offset (REG, 0).
   Emits no machine instruction by itself. */
Jack Renc47703a2012-02-14 12:01:52 +040090#define CFI_PUSH(REG) \
91 cfi_adjust_cfa_offset (4); \
Bruce Beare8ff1a272010-03-04 11:03:37 -080092 cfi_rel_offset (REG, 0)
93
/* CFI_POP (REG): unwind annotation matching a 4-byte pop of REG.
   Adjusts the CFA offset by -4 and issues cfi_restore (REG).
   Emits no machine instruction by itself. */
Jack Renc47703a2012-02-14 12:01:52 +040094#define CFI_POP(REG) \
95 cfi_adjust_cfa_offset (-4); \
Bruce Beare8ff1a272010-03-04 11:03:37 -080096 cfi_restore (REG)
97
/* PUSH / POP: the actual pushl / popl instruction followed by the
   matching CFI annotation above. */
98#define PUSH(REG) pushl REG; CFI_PUSH (REG)
99#define POP(REG) popl REG; CFI_POP (REG)
100
/* Build-mode configuration.  When SHARED or __PIC__ is defined the
   position-independent variants below are used; otherwise the #else
   branch supplies the absolute-address variants.  Both branches define
   the same set of macros: PARMS, ENTRANCE, RETURN_END, RETURN, JMPTBL
   and BRANCH_TO_JMPTBL_ENTRY. */
Nick Kralevich5982e332011-11-11 15:47:24 -0800101#if (defined SHARED || defined __PIC__)
/* PIC variant: ENTRANCE pushes %ebx, so the first argument is at offset
   8 instead of 4.  RETURN_END pops %ebx and returns.  RETURN does the
   same and then re-issues CFI_PUSH (%ebx) -- presumably so the unwind
   state is correct again for the code assembled after a return in the
   middle of the function (NOTE(review): confirm).  JMPTBL (I, B) yields
   the difference I - B. */
Bruce Beare8ff1a272010-03-04 11:03:37 -0800102# define PARMS 8 /* Preserve EBX. */
103# define ENTRANCE PUSH (%ebx);
104# define RETURN_END POP (%ebx); ret
105# define RETURN RETURN_END; CFI_PUSH (%ebx)
106# define JMPTBL(I, B) I - B
Jack Renc47703a2012-02-14 12:01:52 +0400107
/* SETUP_PIC_REG (x): call the __x86.get_pc_thunk.<x> helper.  The jump
   table macro below uses it with `bx' to load the PC into %ebx. */
Varvara Rainchik5a922842014-04-24 15:41:20 +0400108# define SETUP_PIC_REG(x) call __x86.get_pc_thunk.x
Bruce Beare8ff1a272010-03-04 11:03:37 -0800109
110/* Load an entry in a jump table into EBX and branch to it. TABLE is a
Jack Renc47703a2012-02-14 12:01:52 +0400111 jump table with relative offsets. INDEX is a register that contains the
112 index into the jump table. SCALE is the scale of INDEX. This sequence
 overwrites EBX. */
113
Bruce Beare8ff1a272010-03-04 11:03:37 -0800114# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
Jack Renc47703a2012-02-14 12:01:52 +0400115 /* We first load PC into EBX. */ \
116 SETUP_PIC_REG(bx); \
117 /* Get the address of the jump table. */ \
118 addl $(TABLE - .), %ebx; \
119 /* Get the entry and convert the relative offset to the \
120 absolute address. */ \
121 addl (%ebx, INDEX, SCALE), %ebx; \
122 /* We loaded the jump table. Go. */ \
123 jmp *%ebx
Bruce Beare8ff1a272010-03-04 11:03:37 -0800124#else
Jack Renc47703a2012-02-14 12:01:52 +0400125
/* Non-PIC variant: ENTRANCE is empty (nothing is pushed), so the first
   argument is at offset 4, RETURN and RETURN_END are a bare ret, and
   JMPTBL (I, B) yields I unchanged. */
Bruce Beare8ff1a272010-03-04 11:03:37 -0800126# define PARMS 4
127# define ENTRANCE
128# define RETURN_END ret
129# define RETURN RETURN_END
130# define JMPTBL(I, B) I
131
132/* Branch to an entry in a jump table. TABLE is a jump table with
Jack Renc47703a2012-02-14 12:01:52 +0400133 absolute offsets. INDEX is a register that contains the index into the
134 jump table. SCALE is the scale of INDEX. */
135
Bruce Beare8ff1a272010-03-04 11:03:37 -0800136# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
Jack Renc47703a2012-02-14 12:01:52 +0400137 jmp *TABLE(, INDEX, SCALE)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800138#endif
139
140 .section .text.ssse3,"ax",@progbits
141ENTRY (MEMCPY)
142 ENTRANCE
143 movl LEN(%esp), %ecx
144 movl SRC(%esp), %eax
145 movl DEST(%esp), %edx
146
147#ifdef USE_AS_MEMMOVE
148 cmp %eax, %edx
149 jb L(copy_forward)
150 je L(fwd_write_0bytes)
151 cmp $32, %ecx
152 jae L(memmove_bwd)
153 jmp L(bk_write_less32bytes_2)
Jack Renc47703a2012-02-14 12:01:52 +0400154
155 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800156L(memmove_bwd):
157 add %ecx, %eax
158 cmp %eax, %edx
159 movl SRC(%esp), %eax
160 jb L(copy_backward)
161
162L(copy_forward):
163#endif
164 cmp $48, %ecx
165 jae L(48bytesormore)
166
167L(fwd_write_less32bytes):
168#ifndef USE_AS_MEMMOVE
169 cmp %dl, %al
170 jb L(bk_write)
171#endif
172 add %ecx, %edx
173 add %ecx, %eax
174 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
175#ifndef USE_AS_MEMMOVE
Jack Renc47703a2012-02-14 12:01:52 +0400176 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800177L(bk_write):
178 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
179#endif
180
Jack Renc47703a2012-02-14 12:01:52 +0400181 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800182L(48bytesormore):
Jack Renc47703a2012-02-14 12:01:52 +0400183#ifndef USE_AS_MEMMOVE
184 movlpd (%eax), %xmm0
185 movlpd 8(%eax), %xmm1
186 movlpd %xmm0, (%edx)
187 movlpd %xmm1, 8(%edx)
188#else
Bruce Beare8ff1a272010-03-04 11:03:37 -0800189 movdqu (%eax), %xmm0
Jack Renc47703a2012-02-14 12:01:52 +0400190#endif
Bruce Beare8ff1a272010-03-04 11:03:37 -0800191 PUSH (%edi)
192 movl %edx, %edi
193 and $-16, %edx
Bruce Beare8ff1a272010-03-04 11:03:37 -0800194 add $16, %edx
Bruce Beare8ff1a272010-03-04 11:03:37 -0800195 sub %edx, %edi
196 add %edi, %ecx
197 sub %edi, %eax
198
199#ifdef SHARED_CACHE_SIZE_HALF
200 cmp $SHARED_CACHE_SIZE_HALF, %ecx
201#else
Nick Kralevich5982e332011-11-11 15:47:24 -0800202# if (defined SHARED || defined __PIC__)
Jack Renc47703a2012-02-14 12:01:52 +0400203 SETUP_PIC_REG(bx)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800204 add $_GLOBAL_OFFSET_TABLE_, %ebx
205 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
206# else
207 cmp __x86_shared_cache_size_half, %ecx
208# endif
209#endif
210
211 mov %eax, %edi
212 jae L(large_page)
213 and $0xf, %edi
214 jz L(shl_0)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800215 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
216
Jack Renc47703a2012-02-14 12:01:52 +0400217 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800218L(shl_0):
Jack Renc47703a2012-02-14 12:01:52 +0400219#ifdef USE_AS_MEMMOVE
220 movl DEST+4(%esp), %edi
221 movdqu %xmm0, (%edi)
222#endif
Bruce Beare8ff1a272010-03-04 11:03:37 -0800223 xor %edi, %edi
Bruce Beare8ff1a272010-03-04 11:03:37 -0800224 cmp $127, %ecx
225 ja L(shl_0_gobble)
226 lea -32(%ecx), %ecx
Jack Renc47703a2012-02-14 12:01:52 +0400227
228 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800229L(shl_0_loop):
230 movdqa (%eax, %edi), %xmm0
231 movdqa 16(%eax, %edi), %xmm1
232 sub $32, %ecx
233 movdqa %xmm0, (%edx, %edi)
234 movdqa %xmm1, 16(%edx, %edi)
235 lea 32(%edi), %edi
236 jb L(shl_0_end)
237
238 movdqa (%eax, %edi), %xmm0
239 movdqa 16(%eax, %edi), %xmm1
240 sub $32, %ecx
241 movdqa %xmm0, (%edx, %edi)
242 movdqa %xmm1, 16(%edx, %edi)
243 lea 32(%edi), %edi
244 jb L(shl_0_end)
245
246 movdqa (%eax, %edi), %xmm0
247 movdqa 16(%eax, %edi), %xmm1
248 sub $32, %ecx
249 movdqa %xmm0, (%edx, %edi)
250 movdqa %xmm1, 16(%edx, %edi)
251 lea 32(%edi), %edi
252 jb L(shl_0_end)
253
254 movdqa (%eax, %edi), %xmm0
255 movdqa 16(%eax, %edi), %xmm1
256 sub $32, %ecx
257 movdqa %xmm0, (%edx, %edi)
258 movdqa %xmm1, 16(%edx, %edi)
259 lea 32(%edi), %edi
Jack Renc47703a2012-02-14 12:01:52 +0400260
Bruce Beare8ff1a272010-03-04 11:03:37 -0800261L(shl_0_end):
262 lea 32(%ecx), %ecx
263 add %ecx, %edi
264 add %edi, %edx
265 add %edi, %eax
266 POP (%edi)
Jack Renc47703a2012-02-14 12:01:52 +0400267 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800268
Bruce Beare124a5422010-10-11 12:24:41 -0700269 CFI_PUSH (%edi)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800270
Jack Renc47703a2012-02-14 12:01:52 +0400271 .p2align 4
272L(shl_0_gobble):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800273#ifdef DATA_CACHE_SIZE_HALF
274 cmp $DATA_CACHE_SIZE_HALF, %ecx
275#else
Nick Kralevich5982e332011-11-11 15:47:24 -0800276# if (defined SHARED || defined __PIC__)
Jack Renc47703a2012-02-14 12:01:52 +0400277 SETUP_PIC_REG(bx)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800278 add $_GLOBAL_OFFSET_TABLE_, %ebx
279 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
280# else
281 cmp __x86_data_cache_size_half, %ecx
282# endif
283#endif
Jack Renc47703a2012-02-14 12:01:52 +0400284 POP (%edi)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800285 lea -128(%ecx), %ecx
286 jae L(shl_0_gobble_mem_loop)
Jack Renc47703a2012-02-14 12:01:52 +0400287
288 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800289L(shl_0_gobble_cache_loop):
290 movdqa (%eax), %xmm0
291 movdqa 0x10(%eax), %xmm1
292 movdqa 0x20(%eax), %xmm2
293 movdqa 0x30(%eax), %xmm3
294 movdqa 0x40(%eax), %xmm4
295 movdqa 0x50(%eax), %xmm5
296 movdqa 0x60(%eax), %xmm6
297 movdqa 0x70(%eax), %xmm7
298 lea 0x80(%eax), %eax
299 sub $128, %ecx
300 movdqa %xmm0, (%edx)
301 movdqa %xmm1, 0x10(%edx)
302 movdqa %xmm2, 0x20(%edx)
303 movdqa %xmm3, 0x30(%edx)
304 movdqa %xmm4, 0x40(%edx)
305 movdqa %xmm5, 0x50(%edx)
306 movdqa %xmm6, 0x60(%edx)
307 movdqa %xmm7, 0x70(%edx)
308 lea 0x80(%edx), %edx
309
310 jae L(shl_0_gobble_cache_loop)
311 cmp $-0x40, %ecx
312 lea 0x80(%ecx), %ecx
313 jl L(shl_0_cache_less_64bytes)
314
315 movdqa (%eax), %xmm0
316 sub $0x40, %ecx
317 movdqa 0x10(%eax), %xmm1
Bruce Beare8ff1a272010-03-04 11:03:37 -0800318 movdqa %xmm0, (%edx)
319 movdqa %xmm1, 0x10(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800320 movdqa 0x20(%eax), %xmm0
321 movdqa 0x30(%eax), %xmm1
322 add $0x40, %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -0800323 movdqa %xmm0, 0x20(%edx)
324 movdqa %xmm1, 0x30(%edx)
325 add $0x40, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400326
Bruce Beare8ff1a272010-03-04 11:03:37 -0800327L(shl_0_cache_less_64bytes):
328 cmp $0x20, %ecx
329 jb L(shl_0_cache_less_32bytes)
330 movdqa (%eax), %xmm0
331 sub $0x20, %ecx
332 movdqa 0x10(%eax), %xmm1
333 add $0x20, %eax
334 movdqa %xmm0, (%edx)
335 movdqa %xmm1, 0x10(%edx)
336 add $0x20, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400337
Bruce Beare8ff1a272010-03-04 11:03:37 -0800338L(shl_0_cache_less_32bytes):
339 cmp $0x10, %ecx
340 jb L(shl_0_cache_less_16bytes)
341 sub $0x10, %ecx
342 movdqa (%eax), %xmm0
343 add $0x10, %eax
344 movdqa %xmm0, (%edx)
345 add $0x10, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400346
Bruce Beare8ff1a272010-03-04 11:03:37 -0800347L(shl_0_cache_less_16bytes):
348 add %ecx, %edx
349 add %ecx, %eax
350 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
351
Jack Renc47703a2012-02-14 12:01:52 +0400352 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800353L(shl_0_gobble_mem_loop):
354 prefetcht0 0x1c0(%eax)
355 prefetcht0 0x280(%eax)
356 prefetcht0 0x1c0(%edx)
357
358 movdqa (%eax), %xmm0
359 movdqa 0x10(%eax), %xmm1
360 movdqa 0x20(%eax), %xmm2
361 movdqa 0x30(%eax), %xmm3
362 movdqa 0x40(%eax), %xmm4
363 movdqa 0x50(%eax), %xmm5
364 movdqa 0x60(%eax), %xmm6
365 movdqa 0x70(%eax), %xmm7
366 lea 0x80(%eax), %eax
367 sub $0x80, %ecx
368 movdqa %xmm0, (%edx)
369 movdqa %xmm1, 0x10(%edx)
370 movdqa %xmm2, 0x20(%edx)
371 movdqa %xmm3, 0x30(%edx)
372 movdqa %xmm4, 0x40(%edx)
373 movdqa %xmm5, 0x50(%edx)
374 movdqa %xmm6, 0x60(%edx)
375 movdqa %xmm7, 0x70(%edx)
376 lea 0x80(%edx), %edx
377
378 jae L(shl_0_gobble_mem_loop)
379 cmp $-0x40, %ecx
380 lea 0x80(%ecx), %ecx
381 jl L(shl_0_mem_less_64bytes)
382
383 movdqa (%eax), %xmm0
384 sub $0x40, %ecx
385 movdqa 0x10(%eax), %xmm1
386
387 movdqa %xmm0, (%edx)
388 movdqa %xmm1, 0x10(%edx)
389
390 movdqa 0x20(%eax), %xmm0
391 movdqa 0x30(%eax), %xmm1
392 add $0x40, %eax
393
394 movdqa %xmm0, 0x20(%edx)
395 movdqa %xmm1, 0x30(%edx)
396 add $0x40, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400397
Bruce Beare8ff1a272010-03-04 11:03:37 -0800398L(shl_0_mem_less_64bytes):
399 cmp $0x20, %ecx
400 jb L(shl_0_mem_less_32bytes)
401 movdqa (%eax), %xmm0
402 sub $0x20, %ecx
403 movdqa 0x10(%eax), %xmm1
404 add $0x20, %eax
405 movdqa %xmm0, (%edx)
406 movdqa %xmm1, 0x10(%edx)
407 add $0x20, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400408
Bruce Beare8ff1a272010-03-04 11:03:37 -0800409L(shl_0_mem_less_32bytes):
410 cmp $0x10, %ecx
411 jb L(shl_0_mem_less_16bytes)
412 sub $0x10, %ecx
413 movdqa (%eax), %xmm0
414 add $0x10, %eax
415 movdqa %xmm0, (%edx)
416 add $0x10, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400417
Bruce Beare8ff1a272010-03-04 11:03:37 -0800418L(shl_0_mem_less_16bytes):
419 add %ecx, %edx
420 add %ecx, %eax
Jack Renc47703a2012-02-14 12:01:52 +0400421 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800422
Jack Renc47703a2012-02-14 12:01:52 +0400423 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800424L(shl_1):
Jack Renc47703a2012-02-14 12:01:52 +0400425#ifndef USE_AS_MEMMOVE
426 movaps -1(%eax), %xmm1
427#else
428 movl DEST+4(%esp), %edi
429 movaps -1(%eax), %xmm1
430 movdqu %xmm0, (%edi)
431#endif
432#ifdef DATA_CACHE_SIZE_HALF
433 cmp $DATA_CACHE_SIZE_HALF, %ecx
434#else
435# if (defined SHARED || defined __PIC__)
436 SETUP_PIC_REG(bx)
437 add $_GLOBAL_OFFSET_TABLE_, %ebx
438 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
439# else
440 cmp __x86_data_cache_size_half, %ecx
441# endif
442#endif
443 jb L(sh_1_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800444
Jack Renc47703a2012-02-14 12:01:52 +0400445 lea -64(%ecx), %ecx
446
447 .p2align 4
448L(Shl1LoopStart):
449 prefetcht0 0x1c0(%eax)
450 prefetcht0 0x1c0(%edx)
451 movaps 15(%eax), %xmm2
452 movaps 31(%eax), %xmm3
453 movaps 47(%eax), %xmm4
454 movaps 63(%eax), %xmm5
455 movaps %xmm5, %xmm7
456 palignr $1, %xmm4, %xmm5
457 palignr $1, %xmm3, %xmm4
458 movaps %xmm5, 48(%edx)
459 palignr $1, %xmm2, %xmm3
460 lea 64(%eax), %eax
461 palignr $1, %xmm1, %xmm2
462 movaps %xmm4, 32(%edx)
463 movaps %xmm3, 16(%edx)
464 movaps %xmm7, %xmm1
465 movaps %xmm2, (%edx)
466 lea 64(%edx), %edx
467 sub $64, %ecx
468 ja L(Shl1LoopStart)
469
470L(Shl1LoopLeave):
471 add $32, %ecx
472 jle L(shl_end_0)
473
474 movaps 15(%eax), %xmm2
475 movaps 31(%eax), %xmm3
476 palignr $1, %xmm2, %xmm3
477 palignr $1, %xmm1, %xmm2
478 movaps %xmm2, (%edx)
479 movaps %xmm3, 16(%edx)
480 lea 32(%edx, %ecx), %edx
481 lea 32(%eax, %ecx), %eax
482 POP (%edi)
483 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
484
485 CFI_PUSH (%edi)
486
487 .p2align 4
488L(sh_1_no_prefetch):
489 lea -32(%ecx), %ecx
490 lea -1(%eax), %eax
491 xor %edi, %edi
492
493 .p2align 4
494L(sh_1_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800495 movdqa 16(%eax, %edi), %xmm2
496 sub $32, %ecx
497 movdqa 32(%eax, %edi), %xmm3
498 movdqa %xmm3, %xmm4
499 palignr $1, %xmm2, %xmm3
500 palignr $1, %xmm1, %xmm2
501 lea 32(%edi), %edi
502 movdqa %xmm2, -32(%edx, %edi)
503 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +0400504 jb L(sh_1_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800505
506 movdqa 16(%eax, %edi), %xmm2
507 sub $32, %ecx
508 movdqa 32(%eax, %edi), %xmm3
509 movdqa %xmm3, %xmm1
510 palignr $1, %xmm2, %xmm3
511 palignr $1, %xmm4, %xmm2
512 lea 32(%edi), %edi
513 movdqa %xmm2, -32(%edx, %edi)
514 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +0400515 jae L(sh_1_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800516
Jack Renc47703a2012-02-14 12:01:52 +0400517L(sh_1_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800518 lea 32(%ecx), %ecx
519 add %ecx, %edi
520 add %edi, %edx
521 lea 1(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400522 POP (%edi)
523 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800524
Jack Renc47703a2012-02-14 12:01:52 +0400525 CFI_PUSH (%edi)
526
527 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800528L(shl_2):
Jack Renc47703a2012-02-14 12:01:52 +0400529#ifndef USE_AS_MEMMOVE
530 movaps -2(%eax), %xmm1
531#else
532 movl DEST+4(%esp), %edi
533 movaps -2(%eax), %xmm1
534 movdqu %xmm0, (%edi)
535#endif
536#ifdef DATA_CACHE_SIZE_HALF
537 cmp $DATA_CACHE_SIZE_HALF, %ecx
538#else
539# if (defined SHARED || defined __PIC__)
540 SETUP_PIC_REG(bx)
541 add $_GLOBAL_OFFSET_TABLE_, %ebx
542 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
543# else
544 cmp __x86_data_cache_size_half, %ecx
545# endif
546#endif
547 jb L(sh_2_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800548
Jack Renc47703a2012-02-14 12:01:52 +0400549 lea -64(%ecx), %ecx
550
551 .p2align 4
552L(Shl2LoopStart):
553 prefetcht0 0x1c0(%eax)
554 prefetcht0 0x1c0(%edx)
555 movaps 14(%eax), %xmm2
556 movaps 30(%eax), %xmm3
557 movaps 46(%eax), %xmm4
558 movaps 62(%eax), %xmm5
559 movaps %xmm5, %xmm7
560 palignr $2, %xmm4, %xmm5
561 palignr $2, %xmm3, %xmm4
562 movaps %xmm5, 48(%edx)
563 palignr $2, %xmm2, %xmm3
564 lea 64(%eax), %eax
565 palignr $2, %xmm1, %xmm2
566 movaps %xmm4, 32(%edx)
567 movaps %xmm3, 16(%edx)
568 movaps %xmm7, %xmm1
569 movaps %xmm2, (%edx)
570 lea 64(%edx), %edx
571 sub $64, %ecx
572 ja L(Shl2LoopStart)
573
574L(Shl2LoopLeave):
575 add $32, %ecx
576 jle L(shl_end_0)
577
578 movaps 14(%eax), %xmm2
579 movaps 30(%eax), %xmm3
580 palignr $2, %xmm2, %xmm3
581 palignr $2, %xmm1, %xmm2
582 movaps %xmm2, (%edx)
583 movaps %xmm3, 16(%edx)
584 lea 32(%edx, %ecx), %edx
585 lea 32(%eax, %ecx), %eax
586 POP (%edi)
587 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
588
589 CFI_PUSH (%edi)
590
591 .p2align 4
592L(sh_2_no_prefetch):
593 lea -32(%ecx), %ecx
594 lea -2(%eax), %eax
595 xor %edi, %edi
596
597 .p2align 4
598L(sh_2_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800599 movdqa 16(%eax, %edi), %xmm2
600 sub $32, %ecx
601 movdqa 32(%eax, %edi), %xmm3
602 movdqa %xmm3, %xmm4
603 palignr $2, %xmm2, %xmm3
604 palignr $2, %xmm1, %xmm2
605 lea 32(%edi), %edi
606 movdqa %xmm2, -32(%edx, %edi)
607 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +0400608 jb L(sh_2_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800609
610 movdqa 16(%eax, %edi), %xmm2
611 sub $32, %ecx
612 movdqa 32(%eax, %edi), %xmm3
613 movdqa %xmm3, %xmm1
614 palignr $2, %xmm2, %xmm3
615 palignr $2, %xmm4, %xmm2
616 lea 32(%edi), %edi
617 movdqa %xmm2, -32(%edx, %edi)
618 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +0400619 jae L(sh_2_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800620
Jack Renc47703a2012-02-14 12:01:52 +0400621L(sh_2_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800622 lea 32(%ecx), %ecx
623 add %ecx, %edi
624 add %edi, %edx
625 lea 2(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400626 POP (%edi)
627 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800628
Jack Renc47703a2012-02-14 12:01:52 +0400629 CFI_PUSH (%edi)
630
631 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800632L(shl_3):
Jack Renc47703a2012-02-14 12:01:52 +0400633#ifndef USE_AS_MEMMOVE
634 movaps -3(%eax), %xmm1
635#else
636 movl DEST+4(%esp), %edi
637 movaps -3(%eax), %xmm1
638 movdqu %xmm0, (%edi)
639#endif
640#ifdef DATA_CACHE_SIZE_HALF
641 cmp $DATA_CACHE_SIZE_HALF, %ecx
642#else
643# if (defined SHARED || defined __PIC__)
644 SETUP_PIC_REG(bx)
645 add $_GLOBAL_OFFSET_TABLE_, %ebx
646 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
647# else
648 cmp __x86_data_cache_size_half, %ecx
649# endif
650#endif
651 jb L(sh_3_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800652
Jack Renc47703a2012-02-14 12:01:52 +0400653 lea -64(%ecx), %ecx
654
655 .p2align 4
656L(Shl3LoopStart):
657 prefetcht0 0x1c0(%eax)
658 prefetcht0 0x1c0(%edx)
659 movaps 13(%eax), %xmm2
660 movaps 29(%eax), %xmm3
661 movaps 45(%eax), %xmm4
662 movaps 61(%eax), %xmm5
663 movaps %xmm5, %xmm7
664 palignr $3, %xmm4, %xmm5
665 palignr $3, %xmm3, %xmm4
666 movaps %xmm5, 48(%edx)
667 palignr $3, %xmm2, %xmm3
668 lea 64(%eax), %eax
669 palignr $3, %xmm1, %xmm2
670 movaps %xmm4, 32(%edx)
671 movaps %xmm3, 16(%edx)
672 movaps %xmm7, %xmm1
673 movaps %xmm2, (%edx)
674 lea 64(%edx), %edx
675 sub $64, %ecx
676 ja L(Shl3LoopStart)
677
678L(Shl3LoopLeave):
679 add $32, %ecx
680 jle L(shl_end_0)
681
682 movaps 13(%eax), %xmm2
683 movaps 29(%eax), %xmm3
684 palignr $3, %xmm2, %xmm3
685 palignr $3, %xmm1, %xmm2
686 movaps %xmm2, (%edx)
687 movaps %xmm3, 16(%edx)
688 lea 32(%edx, %ecx), %edx
689 lea 32(%eax, %ecx), %eax
690 POP (%edi)
691 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
692
693 CFI_PUSH (%edi)
694
695 .p2align 4
696L(sh_3_no_prefetch):
697 lea -32(%ecx), %ecx
698 lea -3(%eax), %eax
699 xor %edi, %edi
700
701 .p2align 4
702L(sh_3_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800703 movdqa 16(%eax, %edi), %xmm2
704 sub $32, %ecx
705 movdqa 32(%eax, %edi), %xmm3
706 movdqa %xmm3, %xmm4
707 palignr $3, %xmm2, %xmm3
708 palignr $3, %xmm1, %xmm2
709 lea 32(%edi), %edi
710 movdqa %xmm2, -32(%edx, %edi)
711 movdqa %xmm3, -16(%edx, %edi)
712
Jack Renc47703a2012-02-14 12:01:52 +0400713 jb L(sh_3_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800714
715 movdqa 16(%eax, %edi), %xmm2
716 sub $32, %ecx
717 movdqa 32(%eax, %edi), %xmm3
718 movdqa %xmm3, %xmm1
719 palignr $3, %xmm2, %xmm3
720 palignr $3, %xmm4, %xmm2
721 lea 32(%edi), %edi
722 movdqa %xmm2, -32(%edx, %edi)
723 movdqa %xmm3, -16(%edx, %edi)
724
Jack Renc47703a2012-02-14 12:01:52 +0400725 jae L(sh_3_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800726
Jack Renc47703a2012-02-14 12:01:52 +0400727L(sh_3_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800728 lea 32(%ecx), %ecx
729 add %ecx, %edi
730 add %edi, %edx
731 lea 3(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400732 POP (%edi)
733 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800734
Jack Renc47703a2012-02-14 12:01:52 +0400735 CFI_PUSH (%edi)
736
737 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800738L(shl_4):
Jack Renc47703a2012-02-14 12:01:52 +0400739#ifndef USE_AS_MEMMOVE
740 movaps -4(%eax), %xmm1
741#else
742 movl DEST+4(%esp), %edi
743 movaps -4(%eax), %xmm1
744 movdqu %xmm0, (%edi)
745#endif
746#ifdef DATA_CACHE_SIZE_HALF
747 cmp $DATA_CACHE_SIZE_HALF, %ecx
748#else
749# if (defined SHARED || defined __PIC__)
750 SETUP_PIC_REG(bx)
751 add $_GLOBAL_OFFSET_TABLE_, %ebx
752 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
753# else
754 cmp __x86_data_cache_size_half, %ecx
755# endif
756#endif
757 jb L(sh_4_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800758
Jack Renc47703a2012-02-14 12:01:52 +0400759 lea -64(%ecx), %ecx
760
761 .p2align 4
762L(Shl4LoopStart):
763 prefetcht0 0x1c0(%eax)
764 prefetcht0 0x1c0(%edx)
765 movaps 12(%eax), %xmm2
766 movaps 28(%eax), %xmm3
767 movaps 44(%eax), %xmm4
768 movaps 60(%eax), %xmm5
769 movaps %xmm5, %xmm7
770 palignr $4, %xmm4, %xmm5
771 palignr $4, %xmm3, %xmm4
772 movaps %xmm5, 48(%edx)
773 palignr $4, %xmm2, %xmm3
774 lea 64(%eax), %eax
775 palignr $4, %xmm1, %xmm2
776 movaps %xmm4, 32(%edx)
777 movaps %xmm3, 16(%edx)
778 movaps %xmm7, %xmm1
779 movaps %xmm2, (%edx)
780 lea 64(%edx), %edx
781 sub $64, %ecx
782 ja L(Shl4LoopStart)
783
784L(Shl4LoopLeave):
785 add $32, %ecx
786 jle L(shl_end_0)
787
788 movaps 12(%eax), %xmm2
789 movaps 28(%eax), %xmm3
790 palignr $4, %xmm2, %xmm3
791 palignr $4, %xmm1, %xmm2
792 movaps %xmm2, (%edx)
793 movaps %xmm3, 16(%edx)
794 lea 32(%edx, %ecx), %edx
795 lea 32(%eax, %ecx), %eax
796 POP (%edi)
797 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
798
799 CFI_PUSH (%edi)
800
801 .p2align 4
802L(sh_4_no_prefetch):
803 lea -32(%ecx), %ecx
804 lea -4(%eax), %eax
805 xor %edi, %edi
806
807 .p2align 4
808L(sh_4_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800809 movdqa 16(%eax, %edi), %xmm2
810 sub $32, %ecx
811 movdqa 32(%eax, %edi), %xmm3
812 movdqa %xmm3, %xmm4
813 palignr $4, %xmm2, %xmm3
814 palignr $4, %xmm1, %xmm2
815 lea 32(%edi), %edi
816 movdqa %xmm2, -32(%edx, %edi)
817 movdqa %xmm3, -16(%edx, %edi)
818
Jack Renc47703a2012-02-14 12:01:52 +0400819 jb L(sh_4_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800820
821 movdqa 16(%eax, %edi), %xmm2
822 sub $32, %ecx
823 movdqa 32(%eax, %edi), %xmm3
824 movdqa %xmm3, %xmm1
825 palignr $4, %xmm2, %xmm3
826 palignr $4, %xmm4, %xmm2
827 lea 32(%edi), %edi
828 movdqa %xmm2, -32(%edx, %edi)
829 movdqa %xmm3, -16(%edx, %edi)
830
Jack Renc47703a2012-02-14 12:01:52 +0400831 jae L(sh_4_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800832
Jack Renc47703a2012-02-14 12:01:52 +0400833L(sh_4_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800834 lea 32(%ecx), %ecx
835 add %ecx, %edi
836 add %edi, %edx
837 lea 4(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400838 POP (%edi)
839 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800840
Jack Renc47703a2012-02-14 12:01:52 +0400841 CFI_PUSH (%edi)
842
843 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800844L(shl_5):
Jack Renc47703a2012-02-14 12:01:52 +0400845#ifndef USE_AS_MEMMOVE
846 movaps -5(%eax), %xmm1
847#else
848 movl DEST+4(%esp), %edi
849 movaps -5(%eax), %xmm1
850 movdqu %xmm0, (%edi)
851#endif
852#ifdef DATA_CACHE_SIZE_HALF
853 cmp $DATA_CACHE_SIZE_HALF, %ecx
854#else
855# if (defined SHARED || defined __PIC__)
856 SETUP_PIC_REG(bx)
857 add $_GLOBAL_OFFSET_TABLE_, %ebx
858 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
859# else
860 cmp __x86_data_cache_size_half, %ecx
861# endif
862#endif
863 jb L(sh_5_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800864
Jack Renc47703a2012-02-14 12:01:52 +0400865 lea -64(%ecx), %ecx
866
867 .p2align 4
868L(Shl5LoopStart):
869 prefetcht0 0x1c0(%eax)
870 prefetcht0 0x1c0(%edx)
871 movaps 11(%eax), %xmm2
872 movaps 27(%eax), %xmm3
873 movaps 43(%eax), %xmm4
874 movaps 59(%eax), %xmm5
875 movaps %xmm5, %xmm7
876 palignr $5, %xmm4, %xmm5
877 palignr $5, %xmm3, %xmm4
878 movaps %xmm5, 48(%edx)
879 palignr $5, %xmm2, %xmm3
880 lea 64(%eax), %eax
881 palignr $5, %xmm1, %xmm2
882 movaps %xmm4, 32(%edx)
883 movaps %xmm3, 16(%edx)
884 movaps %xmm7, %xmm1
885 movaps %xmm2, (%edx)
886 lea 64(%edx), %edx
887 sub $64, %ecx
888 ja L(Shl5LoopStart)
889
890L(Shl5LoopLeave):
891 add $32, %ecx
892 jle L(shl_end_0)
893
894 movaps 11(%eax), %xmm2
895 movaps 27(%eax), %xmm3
896 palignr $5, %xmm2, %xmm3
897 palignr $5, %xmm1, %xmm2
898 movaps %xmm2, (%edx)
899 movaps %xmm3, 16(%edx)
900 lea 32(%edx, %ecx), %edx
901 lea 32(%eax, %ecx), %eax
902 POP (%edi)
903 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
904
905 CFI_PUSH (%edi)
906
907 .p2align 4
908L(sh_5_no_prefetch):
909 lea -32(%ecx), %ecx
910 lea -5(%eax), %eax
911 xor %edi, %edi
912
913 .p2align 4
914L(sh_5_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800915 movdqa 16(%eax, %edi), %xmm2
916 sub $32, %ecx
917 movdqa 32(%eax, %edi), %xmm3
918 movdqa %xmm3, %xmm4
919 palignr $5, %xmm2, %xmm3
920 palignr $5, %xmm1, %xmm2
921 lea 32(%edi), %edi
922 movdqa %xmm2, -32(%edx, %edi)
923 movdqa %xmm3, -16(%edx, %edi)
924
Jack Renc47703a2012-02-14 12:01:52 +0400925 jb L(sh_5_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800926
927 movdqa 16(%eax, %edi), %xmm2
928 sub $32, %ecx
929 movdqa 32(%eax, %edi), %xmm3
930 movdqa %xmm3, %xmm1
931 palignr $5, %xmm2, %xmm3
932 palignr $5, %xmm4, %xmm2
933 lea 32(%edi), %edi
934 movdqa %xmm2, -32(%edx, %edi)
935 movdqa %xmm3, -16(%edx, %edi)
936
Jack Renc47703a2012-02-14 12:01:52 +0400937 jae L(sh_5_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800938
Jack Renc47703a2012-02-14 12:01:52 +0400939L(sh_5_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800940 lea 32(%ecx), %ecx
941 add %ecx, %edi
942 add %edi, %edx
943 lea 5(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400944 POP (%edi)
945 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800946
/* The previous path ends in POP (%edi) followed by a jump, so the unwind
   description of the saved %edi is set up again for the code below.
   (CFI_PUSH is defined outside this chunk; presumably directives only.)  */
	CFI_PUSH (%edi)

/* L(shl_6): forward copy of %ecx bytes from %eax to %edx for the case where
   %eax - 6 is a valid movaps address (16-byte aligned).  Aligned source
   blocks are loaded and each neighbouring pair is merged with
   "palignr $6" before being stored with aligned stores to %edx.  %xmm1
   carries the most recently loaded block that is not yet fully consumed.  */
	.p2align 4
L(shl_6):
#ifndef USE_AS_MEMMOVE
	movaps	-6(%eax), %xmm1
#else
	/* memmove build: %xmm0 is written to the address read from
	   DEST+4(%esp); presumably the buffer head saved by earlier code
	   outside this chunk -- confirm there.  */
	movl	DEST+4(%esp), %edi
	movaps	-6(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
	/* Counts below the DATA_CACHE_SIZE_HALF / __x86_data_cache_size_half
	   threshold take the loop without prefetches.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_6_no_prefetch)

	lea	-64(%ecx), %ecx		/* bias the count by one 64-byte round */

	/* 64 bytes per round, prefetching 0x1c0 bytes ahead on both sides.  */
	.p2align 4
L(Shl6LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	10(%eax), %xmm2
	movaps	26(%eax), %xmm3
	movaps	42(%eax), %xmm4
	movaps	58(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep the last block for the next round */
	palignr	$6, %xmm4, %xmm5
	palignr	$6, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$6, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$6, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl6LoopStart)

L(Shl6LoopLeave):
	add	$32, %ecx		/* signed test: %ecx may be negative here */
	jle	L(shl_end_0)

	/* One more 32-byte step, then move both pointers past the bytes still
	   counted in %ecx and dispatch on %ecx.  */
	movaps	10(%eax), %xmm2
	movaps	26(%eax), %xmm3
	palignr	$6, %xmm2, %xmm3
	palignr	$6, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* No-prefetch variant: %eax is moved back to the aligned address and
	   %edi becomes a running byte offset applied to both pointers.  */
	.p2align 4
L(sh_6_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-6(%eax), %eax
	xor	%edi, %edi

	/* Two 32-byte steps per trip; the carried block alternates between
	   %xmm1 and %xmm4.  The movdqa/palignr/lea instructions after each
	   "sub" leave its flags intact for the jb/jae that follow.  */
	.p2align 4
L(sh_6_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$6, %xmm2, %xmm3
	palignr	$6, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_6_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$6, %xmm2, %xmm3
	palignr	$6, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_6_no_prefetch_loop)

L(sh_6_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo the bias applied before the loop */
	add	%ecx, %edi
	add	%edi, %edx
	lea	6(%edi, %eax), %eax	/* also adds back the 6-byte offset */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001052
/* The previous path ends in POP (%edi) followed by a jump, so the unwind
   description of the saved %edi is set up again for the code below.
   (CFI_PUSH is defined outside this chunk; presumably directives only.)  */
	CFI_PUSH (%edi)

/* L(shl_7): forward copy of %ecx bytes from %eax to %edx for the case where
   %eax - 7 is a valid movaps address (16-byte aligned).  Aligned source
   blocks are loaded and each neighbouring pair is merged with
   "palignr $7" before being stored with aligned stores to %edx.  %xmm1
   carries the most recently loaded block that is not yet fully consumed.  */
	.p2align 4
L(shl_7):
#ifndef USE_AS_MEMMOVE
	movaps	-7(%eax), %xmm1
#else
	/* memmove build: %xmm0 is written to the address read from
	   DEST+4(%esp); presumably the buffer head saved by earlier code
	   outside this chunk -- confirm there.  */
	movl	DEST+4(%esp), %edi
	movaps	-7(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
	/* Counts below the DATA_CACHE_SIZE_HALF / __x86_data_cache_size_half
	   threshold take the loop without prefetches.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_7_no_prefetch)

	lea	-64(%ecx), %ecx		/* bias the count by one 64-byte round */

	/* 64 bytes per round, prefetching 0x1c0 bytes ahead on both sides.  */
	.p2align 4
L(Shl7LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	9(%eax), %xmm2
	movaps	25(%eax), %xmm3
	movaps	41(%eax), %xmm4
	movaps	57(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep the last block for the next round */
	palignr	$7, %xmm4, %xmm5
	palignr	$7, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$7, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$7, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl7LoopStart)

L(Shl7LoopLeave):
	add	$32, %ecx		/* signed test: %ecx may be negative here */
	jle	L(shl_end_0)

	/* One more 32-byte step, then move both pointers past the bytes still
	   counted in %ecx and dispatch on %ecx.  */
	movaps	9(%eax), %xmm2
	movaps	25(%eax), %xmm3
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* No-prefetch variant: %eax is moved back to the aligned address and
	   %edi becomes a running byte offset applied to both pointers.  */
	.p2align 4
L(sh_7_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-7(%eax), %eax
	xor	%edi, %edi

	/* Two 32-byte steps per trip; the carried block alternates between
	   %xmm1 and %xmm4.  The movdqa/palignr/lea instructions after each
	   "sub" leave its flags intact for the jb/jae that follow.  */
	.p2align 4
L(sh_7_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_7_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_7_no_prefetch_loop)

L(sh_7_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo the bias applied before the loop */
	add	%ecx, %edi
	add	%edi, %edx
	lea	7(%edi, %eax), %eax	/* also adds back the 7-byte offset */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001156
/* The previous path ends in POP (%edi) followed by a jump, so the unwind
   description of the saved %edi is set up again for the code below.
   (CFI_PUSH is defined outside this chunk; presumably directives only.)  */
	CFI_PUSH (%edi)

/* L(shl_8): forward copy of %ecx bytes from %eax to %edx for the case where
   %eax - 8 is a valid movaps address (16-byte aligned).  Aligned source
   blocks are loaded and each neighbouring pair is merged with
   "palignr $8" before being stored with aligned stores to %edx.  %xmm1
   carries the most recently loaded block that is not yet fully consumed.  */
	.p2align 4
L(shl_8):
#ifndef USE_AS_MEMMOVE
	movaps	-8(%eax), %xmm1
#else
	/* memmove build: %xmm0 is written to the address read from
	   DEST+4(%esp); presumably the buffer head saved by earlier code
	   outside this chunk -- confirm there.  */
	movl	DEST+4(%esp), %edi
	movaps	-8(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
	/* Counts below the DATA_CACHE_SIZE_HALF / __x86_data_cache_size_half
	   threshold take the loop without prefetches.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_8_no_prefetch)

	lea	-64(%ecx), %ecx		/* bias the count by one 64-byte round */

	/* 64 bytes per round, prefetching 0x1c0 bytes ahead on both sides.  */
	.p2align 4
L(Shl8LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	8(%eax), %xmm2
	movaps	24(%eax), %xmm3
	movaps	40(%eax), %xmm4
	movaps	56(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep the last block for the next round */
	palignr	$8, %xmm4, %xmm5
	palignr	$8, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$8, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl8LoopStart)

	/* NOTE(review): sibling paths name this label Shl<N>LoopLeave; the
	   name is kept because uses outside this chunk cannot be ruled out.  */
L(LoopLeave8):
	add	$32, %ecx		/* signed test: %ecx may be negative here */
	jle	L(shl_end_0)

	/* One more 32-byte step, then move both pointers past the bytes still
	   counted in %ecx and dispatch on %ecx.  */
	movaps	8(%eax), %xmm2
	movaps	24(%eax), %xmm3
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* No-prefetch variant: %eax is moved back to the aligned address and
	   %edi becomes a running byte offset applied to both pointers.  */
	.p2align 4
L(sh_8_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-8(%eax), %eax
	xor	%edi, %edi

	/* Two 32-byte steps per trip; the carried block alternates between
	   %xmm1 and %xmm4.  The movdqa/palignr/lea instructions after each
	   "sub" leave its flags intact for the jb/jae that follow.  */
	.p2align 4
L(sh_8_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_8_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_8_no_prefetch_loop)

L(sh_8_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo the bias applied before the loop */
	add	%ecx, %edi
	add	%edi, %edx
	lea	8(%edi, %eax), %eax	/* also adds back the 8-byte offset */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001260
/* The previous path ends in POP (%edi) followed by a jump, so the unwind
   description of the saved %edi is set up again for the code below.
   (CFI_PUSH is defined outside this chunk; presumably directives only.)  */
	CFI_PUSH (%edi)

/* L(shl_9): forward copy of %ecx bytes from %eax to %edx for the case where
   %eax - 9 is a valid movaps address (16-byte aligned).  Aligned source
   blocks are loaded and each neighbouring pair is merged with
   "palignr $9" before being stored with aligned stores to %edx.  %xmm1
   carries the most recently loaded block that is not yet fully consumed.  */
	.p2align 4
L(shl_9):
#ifndef USE_AS_MEMMOVE
	movaps	-9(%eax), %xmm1
#else
	/* memmove build: %xmm0 is written to the address read from
	   DEST+4(%esp); presumably the buffer head saved by earlier code
	   outside this chunk -- confirm there.  */
	movl	DEST+4(%esp), %edi
	movaps	-9(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
	/* Counts below the DATA_CACHE_SIZE_HALF / __x86_data_cache_size_half
	   threshold take the loop without prefetches.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_9_no_prefetch)

	lea	-64(%ecx), %ecx		/* bias the count by one 64-byte round */

	/* 64 bytes per round, prefetching 0x1c0 bytes ahead on both sides.  */
	.p2align 4
L(Shl9LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	7(%eax), %xmm2
	movaps	23(%eax), %xmm3
	movaps	39(%eax), %xmm4
	movaps	55(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep the last block for the next round */
	palignr	$9, %xmm4, %xmm5
	palignr	$9, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$9, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$9, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl9LoopStart)

L(Shl9LoopLeave):
	add	$32, %ecx		/* signed test: %ecx may be negative here */
	jle	L(shl_end_0)

	/* One more 32-byte step, then move both pointers past the bytes still
	   counted in %ecx and dispatch on %ecx.  */
	movaps	7(%eax), %xmm2
	movaps	23(%eax), %xmm3
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* No-prefetch variant: %eax is moved back to the aligned address and
	   %edi becomes a running byte offset applied to both pointers.  */
	.p2align 4
L(sh_9_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-9(%eax), %eax
	xor	%edi, %edi

	/* Two 32-byte steps per trip; the carried block alternates between
	   %xmm1 and %xmm4.  The movdqa/palignr/lea instructions after each
	   "sub" leave its flags intact for the jb/jae that follow.  */
	.p2align 4
L(sh_9_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_9_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_9_no_prefetch_loop)

L(sh_9_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo the bias applied before the loop */
	add	%ecx, %edi
	add	%edi, %edx
	lea	9(%edi, %eax), %eax	/* also adds back the 9-byte offset */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001365
/* The previous path ends in POP (%edi) followed by a jump, so the unwind
   description of the saved %edi is set up again for the code below.
   (CFI_PUSH is defined outside this chunk; presumably directives only.)  */
	CFI_PUSH (%edi)

/* L(shl_10): forward copy of %ecx bytes from %eax to %edx for the case where
   %eax - 10 is a valid movaps address (16-byte aligned).  Aligned source
   blocks are loaded and each neighbouring pair is merged with
   "palignr $10" before being stored with aligned stores to %edx.  %xmm1
   carries the most recently loaded block that is not yet fully consumed.  */
	.p2align 4
L(shl_10):
#ifndef USE_AS_MEMMOVE
	movaps	-10(%eax), %xmm1
#else
	/* memmove build: %xmm0 is written to the address read from
	   DEST+4(%esp); presumably the buffer head saved by earlier code
	   outside this chunk -- confirm there.  */
	movl	DEST+4(%esp), %edi
	movaps	-10(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
	/* Counts below the DATA_CACHE_SIZE_HALF / __x86_data_cache_size_half
	   threshold take the loop without prefetches.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_10_no_prefetch)

	lea	-64(%ecx), %ecx		/* bias the count by one 64-byte round */

	/* 64 bytes per round, prefetching 0x1c0 bytes ahead on both sides.  */
	.p2align 4
L(Shl10LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	6(%eax), %xmm2
	movaps	22(%eax), %xmm3
	movaps	38(%eax), %xmm4
	movaps	54(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep the last block for the next round */
	palignr	$10, %xmm4, %xmm5
	palignr	$10, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$10, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$10, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl10LoopStart)

L(Shl10LoopLeave):
	add	$32, %ecx		/* signed test: %ecx may be negative here */
	jle	L(shl_end_0)

	/* One more 32-byte step, then move both pointers past the bytes still
	   counted in %ecx and dispatch on %ecx.  */
	movaps	6(%eax), %xmm2
	movaps	22(%eax), %xmm3
	palignr	$10, %xmm2, %xmm3
	palignr	$10, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* No-prefetch variant: %eax is moved back to the aligned address and
	   %edi becomes a running byte offset applied to both pointers.  */
	.p2align 4
L(sh_10_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-10(%eax), %eax
	xor	%edi, %edi

	/* Two 32-byte steps per trip; the carried block alternates between
	   %xmm1 and %xmm4.  The movdqa/palignr/lea instructions after each
	   "sub" leave its flags intact for the jb/jae that follow.  */
	.p2align 4
L(sh_10_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$10, %xmm2, %xmm3
	palignr	$10, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_10_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$10, %xmm2, %xmm3
	palignr	$10, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_10_no_prefetch_loop)

L(sh_10_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo the bias applied before the loop */
	add	%ecx, %edi
	add	%edi, %edx
	lea	10(%edi, %eax), %eax	/* also adds back the 10-byte offset */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001470
/* The previous path ends in POP (%edi) followed by a jump, so the unwind
   description of the saved %edi is set up again for the code below.
   (CFI_PUSH is defined outside this chunk; presumably directives only.)  */
	CFI_PUSH (%edi)

/* L(shl_11): forward copy of %ecx bytes from %eax to %edx for the case where
   %eax - 11 is a valid movaps address (16-byte aligned).  Aligned source
   blocks are loaded and each neighbouring pair is merged with
   "palignr $11" before being stored with aligned stores to %edx.  %xmm1
   carries the most recently loaded block that is not yet fully consumed.  */
	.p2align 4
L(shl_11):
#ifndef USE_AS_MEMMOVE
	movaps	-11(%eax), %xmm1
#else
	/* memmove build: %xmm0 is written to the address read from
	   DEST+4(%esp); presumably the buffer head saved by earlier code
	   outside this chunk -- confirm there.  */
	movl	DEST+4(%esp), %edi
	movaps	-11(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
	/* Counts below the DATA_CACHE_SIZE_HALF / __x86_data_cache_size_half
	   threshold take the loop without prefetches.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_11_no_prefetch)

	lea	-64(%ecx), %ecx		/* bias the count by one 64-byte round */

	/* 64 bytes per round, prefetching 0x1c0 bytes ahead on both sides.  */
	.p2align 4
L(Shl11LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	5(%eax), %xmm2
	movaps	21(%eax), %xmm3
	movaps	37(%eax), %xmm4
	movaps	53(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep the last block for the next round */
	palignr	$11, %xmm4, %xmm5
	palignr	$11, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$11, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$11, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl11LoopStart)

L(Shl11LoopLeave):
	add	$32, %ecx		/* signed test: %ecx may be negative here */
	jle	L(shl_end_0)

	/* One more 32-byte step, then move both pointers past the bytes still
	   counted in %ecx and dispatch on %ecx.  */
	movaps	5(%eax), %xmm2
	movaps	21(%eax), %xmm3
	palignr	$11, %xmm2, %xmm3
	palignr	$11, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* No-prefetch variant: %eax is moved back to the aligned address and
	   %edi becomes a running byte offset applied to both pointers.  */
	.p2align 4
L(sh_11_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-11(%eax), %eax
	xor	%edi, %edi

	/* Two 32-byte steps per trip; the carried block alternates between
	   %xmm1 and %xmm4.  The movdqa/palignr/lea instructions after each
	   "sub" leave its flags intact for the jb/jae that follow.  */
	.p2align 4
L(sh_11_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$11, %xmm2, %xmm3
	palignr	$11, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_11_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$11, %xmm2, %xmm3
	palignr	$11, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_11_no_prefetch_loop)

L(sh_11_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo the bias applied before the loop */
	add	%ecx, %edi
	add	%edi, %edx
	lea	11(%edi, %eax), %eax	/* also adds back the 11-byte offset */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001575
/* The previous path ends in POP (%edi) followed by a jump, so the unwind
   description of the saved %edi is set up again for the code below.
   (CFI_PUSH is defined outside this chunk; presumably directives only.)  */
	CFI_PUSH (%edi)

/* L(shl_12): forward copy of %ecx bytes from %eax to %edx for the case where
   %eax - 12 is a valid movaps address (16-byte aligned).  Aligned source
   blocks are loaded and each neighbouring pair is merged with
   "palignr $12" before being stored with aligned stores to %edx.  %xmm1
   carries the most recently loaded block that is not yet fully consumed.  */
	.p2align 4
L(shl_12):
#ifndef USE_AS_MEMMOVE
	movaps	-12(%eax), %xmm1
#else
	/* memmove build: %xmm0 is written to the address read from
	   DEST+4(%esp); presumably the buffer head saved by earlier code
	   outside this chunk -- confirm there.  */
	movl	DEST+4(%esp), %edi
	movaps	-12(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
	/* Counts below the DATA_CACHE_SIZE_HALF / __x86_data_cache_size_half
	   threshold take the loop without prefetches.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_12_no_prefetch)

	lea	-64(%ecx), %ecx		/* bias the count by one 64-byte round */

	/* 64 bytes per round, prefetching 0x1c0 bytes ahead on both sides.  */
	.p2align 4
L(Shl12LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	4(%eax), %xmm2
	movaps	20(%eax), %xmm3
	movaps	36(%eax), %xmm4
	movaps	52(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep the last block for the next round */
	palignr	$12, %xmm4, %xmm5
	palignr	$12, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$12, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$12, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl12LoopStart)

L(Shl12LoopLeave):
	add	$32, %ecx		/* signed test: %ecx may be negative here */
	jle	L(shl_end_0)

	/* One more 32-byte step, then move both pointers past the bytes still
	   counted in %ecx and dispatch on %ecx.  */
	movaps	4(%eax), %xmm2
	movaps	20(%eax), %xmm3
	palignr	$12, %xmm2, %xmm3
	palignr	$12, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* No-prefetch variant: %eax is moved back to the aligned address and
	   %edi becomes a running byte offset applied to both pointers.  */
	.p2align 4
L(sh_12_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-12(%eax), %eax
	xor	%edi, %edi

	/* Two 32-byte steps per trip; the carried block alternates between
	   %xmm1 and %xmm4.  The movdqa/palignr/lea instructions after each
	   "sub" leave its flags intact for the jb/jae that follow.  */
	.p2align 4
L(sh_12_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$12, %xmm2, %xmm3
	palignr	$12, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_12_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$12, %xmm2, %xmm3
	palignr	$12, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_12_no_prefetch_loop)

L(sh_12_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo the bias applied before the loop */
	add	%ecx, %edi
	add	%edi, %edx
	lea	12(%edi, %eax), %eax	/* also adds back the 12-byte offset */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001680
/* The previous path ends in POP (%edi) followed by a jump, so the unwind
   description of the saved %edi is set up again for the code below.
   (CFI_PUSH is defined outside this chunk; presumably directives only.)  */
	CFI_PUSH (%edi)

/* L(shl_13): forward copy of %ecx bytes from %eax to %edx for the case where
   %eax - 13 is a valid movaps address (16-byte aligned).  Aligned source
   blocks are loaded and each neighbouring pair is merged with
   "palignr $13" before being stored with aligned stores to %edx.  %xmm1
   carries the most recently loaded block that is not yet fully consumed.  */
	.p2align 4
L(shl_13):
#ifndef USE_AS_MEMMOVE
	movaps	-13(%eax), %xmm1
#else
	/* memmove build: %xmm0 is written to the address read from
	   DEST+4(%esp); presumably the buffer head saved by earlier code
	   outside this chunk -- confirm there.  */
	movl	DEST+4(%esp), %edi
	movaps	-13(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
	/* Counts below the DATA_CACHE_SIZE_HALF / __x86_data_cache_size_half
	   threshold take the loop without prefetches.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_13_no_prefetch)

	lea	-64(%ecx), %ecx		/* bias the count by one 64-byte round */

	/* 64 bytes per round, prefetching 0x1c0 bytes ahead on both sides.  */
	.p2align 4
L(Shl13LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	3(%eax), %xmm2
	movaps	19(%eax), %xmm3
	movaps	35(%eax), %xmm4
	movaps	51(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep the last block for the next round */
	palignr	$13, %xmm4, %xmm5
	palignr	$13, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$13, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$13, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl13LoopStart)

L(Shl13LoopLeave):
	add	$32, %ecx		/* signed test: %ecx may be negative here */
	jle	L(shl_end_0)

	/* One more 32-byte step, then move both pointers past the bytes still
	   counted in %ecx and dispatch on %ecx.  */
	movaps	3(%eax), %xmm2
	movaps	19(%eax), %xmm3
	palignr	$13, %xmm2, %xmm3
	palignr	$13, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* No-prefetch variant: %eax is moved back to the aligned address and
	   %edi becomes a running byte offset applied to both pointers.  */
	.p2align 4
L(sh_13_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-13(%eax), %eax
	xor	%edi, %edi

	/* Two 32-byte steps per trip; the carried block alternates between
	   %xmm1 and %xmm4.  The movdqa/palignr/lea instructions after each
	   "sub" leave its flags intact for the jb/jae that follow.  */
	.p2align 4
L(sh_13_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$13, %xmm2, %xmm3
	palignr	$13, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_13_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$13, %xmm2, %xmm3
	palignr	$13, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_13_no_prefetch_loop)

L(sh_13_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo the bias applied before the loop */
	add	%ecx, %edi
	add	%edi, %edx
	lea	13(%edi, %eax), %eax	/* also adds back the 13-byte offset */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001785
/* The previous path ends in POP (%edi) followed by a jump, so the unwind
   description of the saved %edi is set up again for the code below.
   (CFI_PUSH is defined outside this chunk; presumably directives only.)  */
	CFI_PUSH (%edi)

/* L(shl_14): forward copy of %ecx bytes from %eax to %edx for the case where
   %eax - 14 is a valid movaps address (16-byte aligned).  Aligned source
   blocks are loaded and each neighbouring pair is merged with
   "palignr $14" before being stored with aligned stores to %edx.  %xmm1
   carries the most recently loaded block that is not yet fully consumed.  */
	.p2align 4
L(shl_14):
#ifndef USE_AS_MEMMOVE
	movaps	-14(%eax), %xmm1
#else
	/* memmove build: %xmm0 is written to the address read from
	   DEST+4(%esp); presumably the buffer head saved by earlier code
	   outside this chunk -- confirm there.  */
	movl	DEST+4(%esp), %edi
	movaps	-14(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
	/* Counts below the DATA_CACHE_SIZE_HALF / __x86_data_cache_size_half
	   threshold take the loop without prefetches.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_14_no_prefetch)

	lea	-64(%ecx), %ecx		/* bias the count by one 64-byte round */

	/* 64 bytes per round, prefetching 0x1c0 bytes ahead on both sides.  */
	.p2align 4
L(Shl14LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	2(%eax), %xmm2
	movaps	18(%eax), %xmm3
	movaps	34(%eax), %xmm4
	movaps	50(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep the last block for the next round */
	palignr	$14, %xmm4, %xmm5
	palignr	$14, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$14, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$14, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl14LoopStart)

L(Shl14LoopLeave):
	add	$32, %ecx		/* signed test: %ecx may be negative here */
	jle	L(shl_end_0)

	/* One more 32-byte step, then move both pointers past the bytes still
	   counted in %ecx and dispatch on %ecx.  */
	movaps	2(%eax), %xmm2
	movaps	18(%eax), %xmm3
	palignr	$14, %xmm2, %xmm3
	palignr	$14, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* No-prefetch variant: %eax is moved back to the aligned address and
	   %edi becomes a running byte offset applied to both pointers.  */
	.p2align 4
L(sh_14_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-14(%eax), %eax
	xor	%edi, %edi

	/* Two 32-byte steps per trip; the carried block alternates between
	   %xmm1 and %xmm4.  The movdqa/palignr/lea instructions after each
	   "sub" leave its flags intact for the jb/jae that follow.  */
	.p2align 4
L(sh_14_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$14, %xmm2, %xmm3
	palignr	$14, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_14_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$14, %xmm2, %xmm3
	palignr	$14, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_14_no_prefetch_loop)

L(sh_14_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo the bias applied before the loop */
	add	%ecx, %edi
	add	%edi, %edx
	lea	14(%edi, %eax), %eax	/* also adds back the 14-byte offset */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001890
/* The previous path ends in POP (%edi) followed by a jump, so the unwind
   description of the saved %edi is set up again for the code below.
   (CFI_PUSH is defined outside this chunk; presumably directives only.)  */
	CFI_PUSH (%edi)

/* L(shl_15): forward copy of %ecx bytes from %eax to %edx for the case where
   %eax - 15 is a valid movaps address (16-byte aligned).  Aligned source
   blocks are loaded and each neighbouring pair is merged with
   "palignr $15" before being stored with aligned stores to %edx.  %xmm1
   carries the most recently loaded block that is not yet fully consumed.  */
	.p2align 4
L(shl_15):
#ifndef USE_AS_MEMMOVE
	movaps	-15(%eax), %xmm1
#else
	/* memmove build: %xmm0 is written to the address read from
	   DEST+4(%esp); presumably the buffer head saved by earlier code
	   outside this chunk -- confirm there.  */
	movl	DEST+4(%esp), %edi
	movaps	-15(%eax), %xmm1
	movdqu	%xmm0, (%edi)
#endif
	/* Counts below the DATA_CACHE_SIZE_HALF / __x86_data_cache_size_half
	   threshold take the loop without prefetches.  */
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_15_no_prefetch)

	lea	-64(%ecx), %ecx		/* bias the count by one 64-byte round */

	/* 64 bytes per round, prefetching 0x1c0 bytes ahead on both sides.  */
	.p2align 4
L(Shl15LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	1(%eax), %xmm2
	movaps	17(%eax), %xmm3
	movaps	33(%eax), %xmm4
	movaps	49(%eax), %xmm5
	movaps	%xmm5, %xmm7		/* keep the last block for the next round */
	palignr	$15, %xmm4, %xmm5
	palignr	$15, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$15, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$15, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl15LoopStart)

L(Shl15LoopLeave):
	add	$32, %ecx		/* signed test: %ecx may be negative here */
	jle	L(shl_end_0)

	/* One more 32-byte step, then move both pointers past the bytes still
	   counted in %ecx and dispatch on %ecx.  */
	movaps	1(%eax), %xmm2
	movaps	17(%eax), %xmm3
	palignr	$15, %xmm2, %xmm3
	palignr	$15, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	CFI_PUSH (%edi)

	/* No-prefetch variant: %eax is moved back to the aligned address and
	   %edi becomes a running byte offset applied to both pointers.  */
	.p2align 4
L(sh_15_no_prefetch):
	lea	-32(%ecx), %ecx
	lea	-15(%eax), %eax
	xor	%edi, %edi

	/* Two 32-byte steps per trip; the carried block alternates between
	   %xmm1 and %xmm4.  The movdqa/palignr/lea instructions after each
	   "sub" leave its flags intact for the jb/jae that follow.  */
	.p2align 4
L(sh_15_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$15, %xmm2, %xmm3
	palignr	$15, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_15_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$15, %xmm2, %xmm3
	palignr	$15, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_15_no_prefetch_loop)

L(sh_15_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo the bias applied before the loop */
	add	%ecx, %edi
	add	%edi, %edx
	lea	15(%edi, %eax), %eax	/* also adds back the 15-byte offset */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001995
/* The previous path ends in POP (%edi) followed by a jump, so the unwind
   description of the saved %edi is set up again for the code below.
   (CFI_PUSH is defined outside this chunk; presumably directives only.)  */
	CFI_PUSH (%edi)

/* L(shl_end_0): shared exit of the prefetching shl_<N> loops, taken when
   "add $32, %ecx" at their loop-leave point gave a result <= 0.  A further
   32 is added back to %ecx, both pointers are advanced by that amount,
   %edi is restored and control is dispatched on %ecx through
   L(table_48bytes_fwd) (table defined outside this chunk).  */
	.p2align 4
L(shl_end_0):
	lea	32(%ecx), %ecx
	lea	(%edx, %ecx), %edx
	lea	(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
2005
/* L(fwd_write_44bytes) .. L(fwd_write_4bytes): copy the last 44/36/28/20/12/4
   bytes.  All accesses use negative offsets, so %eax (source) and %edx
   (destination) are expected to point just past the bytes to copy.  Each
   label moves 8 bytes through %xmm0 and falls through to the next smaller
   size; the final 4 bytes go through %ecx.  */
	.p2align 4
L(fwd_write_44bytes):
	movq	-44(%eax), %xmm0
	movq	%xmm0, -44(%edx)
L(fwd_write_36bytes):
	movq	-36(%eax), %xmm0
	movq	%xmm0, -36(%edx)
L(fwd_write_28bytes):
	movq	-28(%eax), %xmm0
	movq	%xmm0, -28(%edx)
L(fwd_write_20bytes):
	movq	-20(%eax), %xmm0
	movq	%xmm0, -20(%edx)
L(fwd_write_12bytes):
	movq	-12(%eax), %xmm0
	movq	%xmm0, -12(%edx)
L(fwd_write_4bytes):
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
	/* Return value: %edx (end of the destination) for the mempcpy build,
	   otherwise the value stored at DEST(%esp).  */
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN
2031
/* L(fwd_write_40bytes) .. L(fwd_write_0bytes): copy the last 40/32/24/16/8/0
   bytes.  All accesses use negative offsets, so %eax (source) and %edx
   (destination) are expected to point just past the bytes to copy.  Each
   label moves 8 bytes through %xmm0 and falls through to the next smaller
   size; L(fwd_write_0bytes) only produces the return value.  */
	.p2align 4
L(fwd_write_40bytes):
	movq	-40(%eax), %xmm0
	movq	%xmm0, -40(%edx)
L(fwd_write_32bytes):
	movq	-32(%eax), %xmm0
	movq	%xmm0, -32(%edx)
L(fwd_write_24bytes):
	movq	-24(%eax), %xmm0
	movq	%xmm0, -24(%edx)
L(fwd_write_16bytes):
	movq	-16(%eax), %xmm0
	movq	%xmm0, -16(%edx)
L(fwd_write_8bytes):
	movq	-8(%eax), %xmm0
	movq	%xmm0, -8(%edx)
L(fwd_write_0bytes):
	/* Return value: %edx (end of the destination) for the mempcpy build,
	   otherwise the value stored at DEST(%esp).  */
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
#else
	movl	DEST(%esp), %eax
#endif
	RETURN
2055
/*
 * Forward tail copy of exactly 5 bytes using two overlapping 4-byte moves
 * (bytes -5..-2 and -4..-1 relative to the end pointers %eax / %edx).
 * Both loads happen before either store, and %eax is reused as scratch for
 * the second word, so the source pointer is dead after the second load.
 */
	.p2align 4
L(fwd_write_5bytes):
	movl	-5(%eax), %ecx
	movl	-4(%eax), %eax
	movl	%ecx, -5(%edx)
	movl	%eax, -4(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy: return end of destination */
#else
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#endif
	RETURN
2068
/*
 * Forward tail copy of 45, 37, 29, 21 or 13 bytes.
 * %eax / %edx point one past the end of source / destination.  Each entry
 * copies 8 bytes through %xmm0 and falls through; the final 5 bytes are
 * written as one 4-byte move (offset -5) plus one byte (offset -1), both
 * through %ecx.
 */
	.p2align 4
L(fwd_write_45bytes):
	movq	-45(%eax), %xmm0
	movq	%xmm0, -45(%edx)
L(fwd_write_37bytes):
	movq	-37(%eax), %xmm0
	movq	%xmm0, -37(%edx)
L(fwd_write_29bytes):
	movq	-29(%eax), %xmm0
	movq	%xmm0, -29(%edx)
L(fwd_write_21bytes):
	movq	-21(%eax), %xmm0
	movq	%xmm0, -21(%edx)
L(fwd_write_13bytes):
	movq	-13(%eax), %xmm0
	movq	%xmm0, -13(%edx)
	movl	-5(%eax), %ecx
	movl	%ecx, -5(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy: return end of destination */
#else
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#endif
	RETURN
2095
/*
 * Forward tail copy of 41, 33, 25, 17, 9 or 1 bytes.
 * %eax / %edx point one past the end of source / destination.  Each entry
 * copies 8 bytes through %xmm0 and falls through; L(fwd_write_1bytes)
 * copies the last byte through %cl.
 */
	.p2align 4
L(fwd_write_41bytes):
	movq	-41(%eax), %xmm0
	movq	%xmm0, -41(%edx)
L(fwd_write_33bytes):
	movq	-33(%eax), %xmm0
	movq	%xmm0, -33(%edx)
L(fwd_write_25bytes):
	movq	-25(%eax), %xmm0
	movq	%xmm0, -25(%edx)
L(fwd_write_17bytes):
	movq	-17(%eax), %xmm0
	movq	%xmm0, -17(%edx)
L(fwd_write_9bytes):
	movq	-9(%eax), %xmm0
	movq	%xmm0, -9(%edx)
L(fwd_write_1bytes):
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy: return end of destination */
#else
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#endif
	RETURN
2121
/*
 * Forward tail copy of 46, 38, 30, 22, 14 or 6 bytes.
 * %eax / %edx point one past the end of source / destination.  Each entry
 * copies 8 bytes through %xmm0 and falls through; L(fwd_write_6bytes)
 * finishes with a 4-byte move (offset -6) and a 2-byte move (offset -2)
 * through %ecx.
 */
	.p2align 4
L(fwd_write_46bytes):
	movq	-46(%eax), %xmm0
	movq	%xmm0, -46(%edx)
L(fwd_write_38bytes):
	movq	-38(%eax), %xmm0
	movq	%xmm0, -38(%edx)
L(fwd_write_30bytes):
	movq	-30(%eax), %xmm0
	movq	%xmm0, -30(%edx)
L(fwd_write_22bytes):
	movq	-22(%eax), %xmm0
	movq	%xmm0, -22(%edx)
L(fwd_write_14bytes):
	movq	-14(%eax), %xmm0
	movq	%xmm0, -14(%edx)
L(fwd_write_6bytes):
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy: return end of destination */
#else
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#endif
	RETURN
2149
/*
 * Forward tail copy of 42, 34, 26, 18, 10 or 2 bytes.
 * %eax / %edx point one past the end of source / destination.  Each entry
 * copies 8 bytes through %xmm0 and falls through; L(fwd_write_2bytes)
 * copies the last 2 bytes through %cx.
 */
	.p2align 4
L(fwd_write_42bytes):
	movq	-42(%eax), %xmm0
	movq	%xmm0, -42(%edx)
L(fwd_write_34bytes):
	movq	-34(%eax), %xmm0
	movq	%xmm0, -34(%edx)
L(fwd_write_26bytes):
	movq	-26(%eax), %xmm0
	movq	%xmm0, -26(%edx)
L(fwd_write_18bytes):
	movq	-18(%eax), %xmm0
	movq	%xmm0, -18(%edx)
L(fwd_write_10bytes):
	movq	-10(%eax), %xmm0
	movq	%xmm0, -10(%edx)
L(fwd_write_2bytes):
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy: return end of destination */
#else
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#endif
	RETURN
2175
/*
 * Forward tail copy of 47, 39, 31, 23, 15 or 7 bytes.
 * %eax / %edx point one past the end of source / destination.  Each entry
 * copies 8 bytes through %xmm0 and falls through; L(fwd_write_7bytes)
 * finishes with 4 bytes (offset -7), 2 bytes (offset -3) and 1 byte
 * (offset -1).  The last byte is loaded into %eax itself, so the source
 * pointer is dead from that instruction on.
 */
	.p2align 4
L(fwd_write_47bytes):
	movq	-47(%eax), %xmm0
	movq	%xmm0, -47(%edx)
L(fwd_write_39bytes):
	movq	-39(%eax), %xmm0
	movq	%xmm0, -39(%edx)
L(fwd_write_31bytes):
	movq	-31(%eax), %xmm0
	movq	%xmm0, -31(%edx)
L(fwd_write_23bytes):
	movq	-23(%eax), %xmm0
	movq	%xmm0, -23(%edx)
L(fwd_write_15bytes):
	movq	-15(%eax), %xmm0
	movq	%xmm0, -15(%edx)
L(fwd_write_7bytes):
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax		/* last read of the source; %eax becomes scratch */
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy: return end of destination */
#else
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#endif
	RETURN
2205
/*
 * Forward tail copy of 43, 35, 27, 19, 11 or 3 bytes.
 * %eax / %edx point one past the end of source / destination.  Each entry
 * copies 8 bytes through %xmm0 and falls through; L(fwd_write_3bytes)
 * finishes with 2 bytes (offset -3) and 1 byte (offset -1).  The last byte
 * is loaded into %eax itself, so the source pointer is dead afterwards.
 */
	.p2align 4
L(fwd_write_43bytes):
	movq	-43(%eax), %xmm0
	movq	%xmm0, -43(%edx)
L(fwd_write_35bytes):
	movq	-35(%eax), %xmm0
	movq	%xmm0, -35(%edx)
L(fwd_write_27bytes):
	movq	-27(%eax), %xmm0
	movq	%xmm0, -27(%edx)
L(fwd_write_19bytes):
	movq	-19(%eax), %xmm0
	movq	%xmm0, -19(%edx)
L(fwd_write_11bytes):
	movq	-11(%eax), %xmm0
	movq	%xmm0, -11(%edx)
L(fwd_write_3bytes):
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax		/* last read of the source; %eax becomes scratch */
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy: return end of destination */
#else
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#endif
	RETURN
2233
/*
 * Aligned forward tail copy of 40, 24, 8 or 0 bytes.
 * %eax / %edx point one past the end of source / destination.  The 16-byte
 * steps use movdqa, which faults on addresses that are not 16-byte
 * aligned (NOTE(review): alignment is presumably guaranteed by the path
 * that dispatches through L(table_48bytes_fwd_align) -- confirm).  The
 * remaining 8 bytes use movq.
 */
	.p2align 4
L(fwd_write_40bytes_align):
	movdqa	-40(%eax), %xmm0
	movdqa	%xmm0, -40(%edx)
L(fwd_write_24bytes_align):
	movdqa	-24(%eax), %xmm0
	movdqa	%xmm0, -24(%edx)
L(fwd_write_8bytes_align):
	movq	-8(%eax), %xmm0
	movq	%xmm0, -8(%edx)
L(fwd_write_0bytes_align):
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy: return end of destination */
#else
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#endif
	RETURN
2251
/*
 * Aligned forward tail copy of 32 or 16 bytes, 16 bytes per movdqa step.
 * %eax / %edx point one past the end of source / destination; movdqa
 * requires both accessed addresses to be 16-byte aligned.
 */
	.p2align 4
L(fwd_write_32bytes_align):
	movdqa	-32(%eax), %xmm0
	movdqa	%xmm0, -32(%edx)
L(fwd_write_16bytes_align):
	movdqa	-16(%eax), %xmm0
	movdqa	%xmm0, -16(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy: return end of destination */
#else
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#endif
	RETURN
2265
/*
 * Forward tail copy of exactly 5 bytes (entry used by the aligned jump
 * table).  Two overlapping 4-byte moves cover bytes -5..-1 relative to the
 * end pointers %eax / %edx; both loads precede both stores and %eax is
 * reused as scratch for the second word.
 */
	.p2align 4
L(fwd_write_5bytes_align):
	movl	-5(%eax), %ecx
	movl	-4(%eax), %eax
	movl	%ecx, -5(%edx)
	movl	%eax, -4(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy: return end of destination */
#else
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#endif
	RETURN
2278
/*
 * Aligned forward tail copy of 45, 29 or 13 bytes.
 * %eax / %edx point one past the end of source / destination.  The two
 * 16-byte steps use movdqa (addresses must be 16-byte aligned); the last
 * 13 bytes are copied as 8 + 4 + 1 bytes.
 */
	.p2align 4
L(fwd_write_45bytes_align):
	movdqa	-45(%eax), %xmm0
	movdqa	%xmm0, -45(%edx)
L(fwd_write_29bytes_align):
	movdqa	-29(%eax), %xmm0
	movdqa	%xmm0, -29(%edx)
L(fwd_write_13bytes_align):
	movq	-13(%eax), %xmm0
	movq	%xmm0, -13(%edx)
	movl	-5(%eax), %ecx
	movl	%ecx, -5(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy: return end of destination */
#else
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#endif
	RETURN
2299
/*
 * Aligned forward tail copy of 37 or 21 bytes.
 * %eax / %edx point one past the end of source / destination.  16-byte
 * steps use movdqa (addresses must be 16-byte aligned); the last 5 bytes
 * are copied as 4 + 1 bytes through %ecx.
 */
	.p2align 4
L(fwd_write_37bytes_align):
	movdqa	-37(%eax), %xmm0
	movdqa	%xmm0, -37(%edx)
L(fwd_write_21bytes_align):
	movdqa	-21(%eax), %xmm0
	movdqa	%xmm0, -21(%edx)
	movl	-5(%eax), %ecx
	movl	%ecx, -5(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy: return end of destination */
#else
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#endif
	RETURN
2317
/*
 * Aligned forward tail copy of 41, 25, 9 or 1 bytes.
 * %eax / %edx point one past the end of source / destination.  16-byte
 * steps use movdqa (addresses must be 16-byte aligned), followed by an
 * 8-byte movq and a final single byte.
 */
	.p2align 4
L(fwd_write_41bytes_align):
	movdqa	-41(%eax), %xmm0
	movdqa	%xmm0, -41(%edx)
L(fwd_write_25bytes_align):
	movdqa	-25(%eax), %xmm0
	movdqa	%xmm0, -25(%edx)
L(fwd_write_9bytes_align):
	movq	-9(%eax), %xmm0
	movq	%xmm0, -9(%edx)
L(fwd_write_1bytes_align):
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy: return end of destination */
#else
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#endif
	RETURN
2337
/*
 * Aligned forward tail copy of 33 or 17 bytes: movdqa steps of 16 bytes
 * (addresses must be 16-byte aligned) plus one trailing byte.
 * %eax / %edx point one past the end of source / destination.
 */
	.p2align 4
L(fwd_write_33bytes_align):
	movdqa	-33(%eax), %xmm0
	movdqa	%xmm0, -33(%edx)
L(fwd_write_17bytes_align):
	movdqa	-17(%eax), %xmm0
	movdqa	%xmm0, -17(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy: return end of destination */
#else
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#endif
	RETURN
2353
/*
 * Aligned forward tail copy of 46, 30, 14 or 6 bytes.
 * %eax / %edx point one past the end of source / destination.  16-byte
 * steps use movdqa (addresses must be 16-byte aligned), then 8 bytes via
 * movq, then 4 + 2 bytes through %ecx.
 */
	.p2align 4
L(fwd_write_46bytes_align):
	movdqa	-46(%eax), %xmm0
	movdqa	%xmm0, -46(%edx)
L(fwd_write_30bytes_align):
	movdqa	-30(%eax), %xmm0
	movdqa	%xmm0, -30(%edx)
L(fwd_write_14bytes_align):
	movq	-14(%eax), %xmm0
	movq	%xmm0, -14(%edx)
L(fwd_write_6bytes_align):
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy: return end of destination */
#else
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#endif
	RETURN
2375
/*
 * Aligned forward tail copy of 38 or 22 bytes: movdqa steps of 16 bytes
 * (addresses must be 16-byte aligned) followed by 4 + 2 bytes through
 * %ecx.  %eax / %edx point one past the end of source / destination.
 */
	.p2align 4
L(fwd_write_38bytes_align):
	movdqa	-38(%eax), %xmm0
	movdqa	%xmm0, -38(%edx)
L(fwd_write_22bytes_align):
	movdqa	-22(%eax), %xmm0
	movdqa	%xmm0, -22(%edx)
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy: return end of destination */
#else
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#endif
	RETURN
2393
/*
 * Aligned forward tail copy of 42, 26, 10 or 2 bytes.
 * %eax / %edx point one past the end of source / destination.  16-byte
 * steps use movdqa (addresses must be 16-byte aligned), then 8 bytes via
 * movq and the last 2 bytes through %cx.
 */
	.p2align 4
L(fwd_write_42bytes_align):
	movdqa	-42(%eax), %xmm0
	movdqa	%xmm0, -42(%edx)
L(fwd_write_26bytes_align):
	movdqa	-26(%eax), %xmm0
	movdqa	%xmm0, -26(%edx)
L(fwd_write_10bytes_align):
	movq	-10(%eax), %xmm0
	movq	%xmm0, -10(%edx)
L(fwd_write_2bytes_align):
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy: return end of destination */
#else
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#endif
	RETURN
2413
/*
 * Aligned forward tail copy of 34 or 18 bytes: movdqa steps of 16 bytes
 * (addresses must be 16-byte aligned) plus a trailing 2-byte move.
 * %eax / %edx point one past the end of source / destination.
 */
	.p2align 4
L(fwd_write_34bytes_align):
	movdqa	-34(%eax), %xmm0
	movdqa	%xmm0, -34(%edx)
L(fwd_write_18bytes_align):
	movdqa	-18(%eax), %xmm0
	movdqa	%xmm0, -18(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy: return end of destination */
#else
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#endif
	RETURN
2429
/*
 * Aligned forward tail copy of 47, 31, 15 or 7 bytes.
 * %eax / %edx point one past the end of source / destination.  16-byte
 * steps use movdqa (addresses must be 16-byte aligned), then 8 bytes via
 * movq, then 4 + 2 + 1 bytes.  The final byte is loaded into %eax itself,
 * so the source pointer is dead from that instruction on.
 */
	.p2align 4
L(fwd_write_47bytes_align):
	movdqa	-47(%eax), %xmm0
	movdqa	%xmm0, -47(%edx)
L(fwd_write_31bytes_align):
	movdqa	-31(%eax), %xmm0
	movdqa	%xmm0, -31(%edx)
L(fwd_write_15bytes_align):
	movq	-15(%eax), %xmm0
	movq	%xmm0, -15(%edx)
L(fwd_write_7bytes_align):
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax		/* last read of the source; %eax becomes scratch */
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy: return end of destination */
#else
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#endif
	RETURN
2453
/*
 * Aligned forward tail copy of 39 or 23 bytes: movdqa steps of 16 bytes
 * (addresses must be 16-byte aligned) followed by 4 + 2 + 1 bytes.
 * %eax / %edx point one past the end of source / destination; the final
 * byte is loaded into %eax itself, ending its use as the source pointer.
 */
	.p2align 4
L(fwd_write_39bytes_align):
	movdqa	-39(%eax), %xmm0
	movdqa	%xmm0, -39(%edx)
L(fwd_write_23bytes_align):
	movdqa	-23(%eax), %xmm0
	movdqa	%xmm0, -23(%edx)
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax		/* last read of the source; %eax becomes scratch */
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy: return end of destination */
#else
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#endif
	RETURN
2473
/*
 * Aligned forward tail copy of 43, 27, 11 or 3 bytes.
 * %eax / %edx point one past the end of source / destination.  16-byte
 * steps use movdqa (addresses must be 16-byte aligned), then 8 bytes via
 * movq, then 2 + 1 bytes.  The final byte is loaded into %eax itself, so
 * the source pointer is dead from that instruction on.
 */
	.p2align 4
L(fwd_write_43bytes_align):
	movdqa	-43(%eax), %xmm0
	movdqa	%xmm0, -43(%edx)
L(fwd_write_27bytes_align):
	movdqa	-27(%eax), %xmm0
	movdqa	%xmm0, -27(%edx)
L(fwd_write_11bytes_align):
	movq	-11(%eax), %xmm0
	movq	%xmm0, -11(%edx)
L(fwd_write_3bytes_align):
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax		/* last read of the source; %eax becomes scratch */
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy: return end of destination */
#else
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#endif
	RETURN
2495
/*
 * Aligned forward tail copy of 35 or 19 bytes: movdqa steps of 16 bytes
 * (addresses must be 16-byte aligned) followed by 2 + 1 bytes.
 * %eax / %edx point one past the end of source / destination; the final
 * byte is loaded into %eax itself, ending its use as the source pointer.
 */
	.p2align 4
L(fwd_write_35bytes_align):
	movdqa	-35(%eax), %xmm0
	movdqa	%xmm0, -35(%edx)
L(fwd_write_19bytes_align):
	movdqa	-19(%eax), %xmm0
	movdqa	%xmm0, -19(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax		/* last read of the source; %eax becomes scratch */
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy: return end of destination */
#else
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#endif
	RETURN
2513
/*
 * Aligned forward tail copy of 44, 28, 12 or 4 bytes.
 * %eax / %edx point one past the end of source / destination.  16-byte
 * steps use movdqa (addresses must be 16-byte aligned), then 8 bytes via
 * movq and the last 4 bytes through %ecx.
 */
	.p2align 4
L(fwd_write_44bytes_align):
	movdqa	-44(%eax), %xmm0
	movdqa	%xmm0, -44(%edx)
L(fwd_write_28bytes_align):
	movdqa	-28(%eax), %xmm0
	movdqa	%xmm0, -28(%edx)
L(fwd_write_12bytes_align):
	movq	-12(%eax), %xmm0
	movq	%xmm0, -12(%edx)
L(fwd_write_4bytes_align):
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy: return end of destination */
#else
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#endif
	RETURN
2533
/*
 * Aligned forward tail copy of 36 or 20 bytes: movdqa steps of 16 bytes
 * (addresses must be 16-byte aligned) plus a trailing 4-byte move.
 * %eax / %edx point one past the end of source / destination.
 * This entry returns through RETURN_END rather than RETURN; both macros
 * are defined elsewhere in the file.
 */
	.p2align 4
L(fwd_write_36bytes_align):
	movdqa	-36(%eax), %xmm0
	movdqa	%xmm0, -36(%edx)
L(fwd_write_20bytes_align):
	movdqa	-20(%eax), %xmm0
	movdqa	%xmm0, -20(%edx)
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
#ifdef USE_AS_MEMPCPY
	movl	%edx, %eax		/* mempcpy: return end of destination */
#else
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#endif
	RETURN_END
Bruce Beare8ff1a272010-03-04 11:03:37 -08002549
/*
 * Large-copy path: streams data with unaligned loads (movdqu) and
 * non-temporal stores (movntdq).
 * Register roles visible in this block: %eax = source cursor, %edx =
 * destination cursor, %ecx = number of bytes still to copy.
 * movntdq requires a 16-byte aligned destination (NOTE(review): presumably
 * arranged by the code that branches here -- confirm).
 * NOTE(review): CFI_PUSH / POP are macros defined elsewhere; presumably
 * %edi is still on the stack when this path is entered, which would also
 * explain the DEST+4 offset below -- confirm.
 */
	CFI_PUSH (%edi)

	.p2align 4
L(large_page):
	movdqu	(%eax), %xmm1
#ifdef USE_AS_MEMMOVE
	/* NOTE(review): %xmm0 presumably still holds data loaded before this
	   path was entered; it is written to the address read from DEST+4. */
	movl	DEST+4(%esp), %edi
	movdqu	%xmm0, (%edi)
#endif
	lea	16(%eax), %eax
	movntdq	%xmm1, (%edx)
	lea	16(%edx), %edx
	/* 0x90 = 0x10 bytes just copied + 0x80 pre-bias for the loop below */
	lea	-0x90(%ecx), %ecx
	POP (%edi)

	.p2align 4
L(large_page_loop):
	/* Copy 128 bytes per iteration: eight loads, then eight streaming stores. */
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	movdqu	0x20(%eax), %xmm2
	movdqu	0x30(%eax), %xmm3
	movdqu	0x40(%eax), %xmm4
	movdqu	0x50(%eax), %xmm5
	movdqu	0x60(%eax), %xmm6
	movdqu	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax

	/* The flags set by this sub survive to the jae: movntdq and lea do
	   not modify flags. */
	sub	$0x80, %ecx
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	movntdq	%xmm2, 0x20(%edx)
	movntdq	%xmm3, 0x30(%edx)
	movntdq	%xmm4, 0x40(%edx)
	movntdq	%xmm5, 0x50(%edx)
	movntdq	%xmm6, 0x60(%edx)
	movntdq	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx
	jae	L(large_page_loop)	/* no borrow: at least 128 more bytes remain */
	/* Here %ecx = remaining - 0x80 (negative).  Compare before undoing
	   the bias; lea restores the true count without touching the flags. */
	cmp	$-0x40, %ecx
	lea	0x80(%ecx), %ecx
	jl	L(large_page_less_64bytes)	/* fewer than 64 bytes remain */

	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	movdqu	0x20(%eax), %xmm2
	movdqu	0x30(%eax), %xmm3
	lea	0x40(%eax), %eax

	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	movntdq	%xmm2, 0x20(%edx)
	movntdq	%xmm3, 0x30(%edx)
	lea	0x40(%edx), %edx
	sub	$0x40, %ecx
L(large_page_less_64bytes):
	cmp	$32, %ecx
	jb	L(large_page_less_32bytes)
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	lea	0x20(%eax), %eax
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	lea	0x20(%edx), %edx
	sub	$0x20, %ecx
L(large_page_less_32bytes):
	/* Fewer than 32 bytes remain.  Turn both cursors into end pointers,
	   as the fwd_write_* entries address the data at negative offsets. */
	add	%ecx, %edx
	add	%ecx, %eax
	sfence				/* order the non-temporal stores before later stores */
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
2619
/*
 * Backward tail copy of 44, 36, 28, 20, 12, 4 or 0 bytes.
 * %eax (source) and %edx (destination) are dereferenced only at
 * non-negative offsets, so both point at the start of the data.  Each
 * entry copies the highest 8 bytes still pending through %xmm0 and falls
 * through to the entry 8 bytes shorter, i.e. the copy runs from high to
 * low addresses; keep that order (NOTE(review): presumably required for
 * overlapping buffers in the memmove build -- confirm).
 * L(bk_write_0bytes) copies nothing and only produces the return value.
 */
	.p2align 4
L(bk_write_44bytes):
	movq	36(%eax), %xmm0
	movq	%xmm0, 36(%edx)
L(bk_write_36bytes):
	movq	28(%eax), %xmm0
	movq	%xmm0, 28(%edx)
L(bk_write_28bytes):
	movq	20(%eax), %xmm0
	movq	%xmm0, 20(%edx)
L(bk_write_20bytes):
	movq	12(%eax), %xmm0
	movq	%xmm0, 12(%edx)
L(bk_write_12bytes):
	movq	4(%eax), %xmm0
	movq	%xmm0, 4(%edx)
L(bk_write_4bytes):
	movl	(%eax), %ecx
	movl	%ecx, (%edx)
L(bk_write_0bytes):
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx		/* mempcpy: return DEST + LEN instead */
	add	%ecx, %eax
#endif
	RETURN
2646
/*
 * Backward tail copy of 40, 32, 24, 16 or 8 bytes.
 * %eax / %edx point at the start of source / destination.  Each entry
 * copies the highest pending 8 bytes through %xmm0 and falls through to
 * the next shorter entry, so data is written from high to low addresses.
 */
	.p2align 4
L(bk_write_40bytes):
	movq	32(%eax), %xmm0
	movq	%xmm0, 32(%edx)
L(bk_write_32bytes):
	movq	24(%eax), %xmm0
	movq	%xmm0, 24(%edx)
L(bk_write_24bytes):
	movq	16(%eax), %xmm0
	movq	%xmm0, 16(%edx)
L(bk_write_16bytes):
	movq	8(%eax), %xmm0
	movq	%xmm0, 8(%edx)
L(bk_write_8bytes):
	movq	(%eax), %xmm0
	movq	%xmm0, (%edx)
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx		/* mempcpy: return DEST + LEN instead */
	add	%ecx, %eax
#endif
	RETURN
2669
/*
 * Backward tail copy of 45, 37, 29, 21, 13, 5 or 1 bytes.
 * %eax / %edx point at the start of source / destination.  Each entry
 * copies the highest pending 8 bytes through %xmm0 and falls through;
 * L(bk_write_5bytes) copies bytes 1..4 and L(bk_write_1bytes) copies
 * byte 0, so data is written from high to low addresses.
 */
	.p2align 4
L(bk_write_45bytes):
	movq	37(%eax), %xmm0
	movq	%xmm0, 37(%edx)
L(bk_write_37bytes):
	movq	29(%eax), %xmm0
	movq	%xmm0, 29(%edx)
L(bk_write_29bytes):
	movq	21(%eax), %xmm0
	movq	%xmm0, 21(%edx)
L(bk_write_21bytes):
	movq	13(%eax), %xmm0
	movq	%xmm0, 13(%edx)
L(bk_write_13bytes):
	movq	5(%eax), %xmm0
	movq	%xmm0, 5(%edx)
L(bk_write_5bytes):
	movl	1(%eax), %ecx
	movl	%ecx, 1(%edx)
L(bk_write_1bytes):
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx		/* mempcpy: return DEST + LEN instead */
	add	%ecx, %eax
#endif
	RETURN
2698
/*
 * Backward tail copy of 41, 33, 25, 17 or 9 bytes.
 * %eax / %edx point at the start of source / destination.  Each entry
 * copies the highest pending 8 bytes through %xmm0 and falls through;
 * the 9-byte entry finishes with bytes 1..8 and then byte 0.
 */
	.p2align 4
L(bk_write_41bytes):
	movq	33(%eax), %xmm0
	movq	%xmm0, 33(%edx)
L(bk_write_33bytes):
	movq	25(%eax), %xmm0
	movq	%xmm0, 25(%edx)
L(bk_write_25bytes):
	movq	17(%eax), %xmm0
	movq	%xmm0, 17(%edx)
L(bk_write_17bytes):
	movq	9(%eax), %xmm0
	movq	%xmm0, 9(%edx)
L(bk_write_9bytes):
	movq	1(%eax), %xmm0
	movq	%xmm0, 1(%edx)
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx		/* mempcpy: return DEST + LEN instead */
	add	%ecx, %eax
#endif
	RETURN
2723
/*
 * Backward tail copy of 46, 38, 30, 22, 14 or 6 bytes.
 * %eax / %edx point at the start of source / destination.  Each entry
 * copies the highest pending 8 bytes through %xmm0 and falls through;
 * L(bk_write_6bytes) finishes with bytes 2..5 and then bytes 0..1.
 */
	.p2align 4
L(bk_write_46bytes):
	movq	38(%eax), %xmm0
	movq	%xmm0, 38(%edx)
L(bk_write_38bytes):
	movq	30(%eax), %xmm0
	movq	%xmm0, 30(%edx)
L(bk_write_30bytes):
	movq	22(%eax), %xmm0
	movq	%xmm0, 22(%edx)
L(bk_write_22bytes):
	movq	14(%eax), %xmm0
	movq	%xmm0, 14(%edx)
L(bk_write_14bytes):
	movq	6(%eax), %xmm0
	movq	%xmm0, 6(%edx)
L(bk_write_6bytes):
	movl	2(%eax), %ecx
	movl	%ecx, 2(%edx)
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx		/* mempcpy: return DEST + LEN instead */
	add	%ecx, %eax
#endif
	RETURN
2751
/*
 * Backward tail copy of 42, 34, 26, 18, 10 or 2 bytes.
 * %eax / %edx point at the start of source / destination.  Each entry
 * copies the highest pending 8 bytes through %xmm0 and falls through;
 * L(bk_write_2bytes) copies bytes 0..1 through %cx.
 */
	.p2align 4
L(bk_write_42bytes):
	movq	34(%eax), %xmm0
	movq	%xmm0, 34(%edx)
L(bk_write_34bytes):
	movq	26(%eax), %xmm0
	movq	%xmm0, 26(%edx)
L(bk_write_26bytes):
	movq	18(%eax), %xmm0
	movq	%xmm0, 18(%edx)
L(bk_write_18bytes):
	movq	10(%eax), %xmm0
	movq	%xmm0, 10(%edx)
L(bk_write_10bytes):
	movq	2(%eax), %xmm0
	movq	%xmm0, 2(%edx)
L(bk_write_2bytes):
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx		/* mempcpy: return DEST + LEN instead */
	add	%ecx, %eax
#endif
	RETURN
2777
/*
 * Backward tail copy of 47, 39, 31, 23, 15 or 7 bytes.
 * %eax / %edx point at the start of source / destination.  Each entry
 * copies the highest pending 8 bytes through %xmm0 and falls through;
 * L(bk_write_7bytes) finishes with bytes 3..6, bytes 1..2 and byte 0.
 * Byte 0 is loaded into %eax itself, which ends its use as the source
 * pointer.
 */
	.p2align 4
L(bk_write_47bytes):
	movq	39(%eax), %xmm0
	movq	%xmm0, 39(%edx)
L(bk_write_39bytes):
	movq	31(%eax), %xmm0
	movq	%xmm0, 31(%edx)
L(bk_write_31bytes):
	movq	23(%eax), %xmm0
	movq	%xmm0, 23(%edx)
L(bk_write_23bytes):
	movq	15(%eax), %xmm0
	movq	%xmm0, 15(%edx)
L(bk_write_15bytes):
	movq	7(%eax), %xmm0
	movq	%xmm0, 7(%edx)
L(bk_write_7bytes):
	movl	3(%eax), %ecx
	movl	%ecx, 3(%edx)
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	movzbl	(%eax), %eax		/* last read of the source; %eax becomes scratch */
	movb	%al, (%edx)
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx		/* mempcpy: return DEST + LEN instead */
	add	%ecx, %eax
#endif
	RETURN
2807
/*
 * Backward tail copy of 43, 35, 27, 19, 11 or 3 bytes.
 * %eax / %edx point at the start of source / destination.  Each entry
 * copies the highest pending 8 bytes through %xmm0 and falls through;
 * L(bk_write_3bytes) finishes with bytes 1..2 and byte 0.  Byte 0 is
 * loaded into %eax itself, which ends its use as the source pointer.
 * This entry returns through RETURN_END rather than RETURN; both macros
 * are defined elsewhere in the file.
 */
	.p2align 4
L(bk_write_43bytes):
	movq	35(%eax), %xmm0
	movq	%xmm0, 35(%edx)
L(bk_write_35bytes):
	movq	27(%eax), %xmm0
	movq	%xmm0, 27(%edx)
L(bk_write_27bytes):
	movq	19(%eax), %xmm0
	movq	%xmm0, 19(%edx)
L(bk_write_19bytes):
	movq	11(%eax), %xmm0
	movq	%xmm0, 11(%edx)
L(bk_write_11bytes):
	movq	3(%eax), %xmm0
	movq	%xmm0, 3(%edx)
L(bk_write_3bytes):
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	movzbl	(%eax), %eax		/* last read of the source; %eax becomes scratch */
	movb	%al, (%edx)
	movl	DEST(%esp), %eax	/* return value taken from the DEST stack slot */
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx		/* mempcpy: return DEST + LEN instead */
	add	%ecx, %eax
#endif
	RETURN_END
2835
2836
2837 .pushsection .rodata.ssse3,"a",@progbits
/*
 * Jump table for the unaligned forward tail copies: entry N (N = 0..47)
 * selects L(fwd_write_Nbytes).  Entries are 4 bytes wide (.int) and the
 * table is 4-byte aligned, matching the scale of 4 passed to
 * BRANCH_TO_JMPTBL_ENTRY where this table is used.
 * NOTE(review): JMPTBL is defined elsewhere; presumably it encodes each
 * target relative to the table base given as its second argument --
 * confirm.
 */
	.p2align 2
L(table_48bytes_fwd):
	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
2888
/* Jump table with one entry per L(fwd_write_Nbytes_align) handler,
   N = 0..47, stored in ascending order of N.  Each entry is produced
   by JMPTBL from the handler label and this table's own label.
   NOTE(review): JMPTBL presumably encodes the handler as an offset
   from the table label - confirm against the macro definition.  */
	.p2align 2
L(table_48bytes_fwd_align):
	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
2939
/* Jump table with one entry per L(shl_N) routine, N = 0..15, stored in
   ascending order of N.  Each entry is produced by JMPTBL from the
   routine label and this table's own label.  */
	.p2align 2
L(shl_table):
	.int	JMPTBL (L(shl_0), L(shl_table))
	.int	JMPTBL (L(shl_1), L(shl_table))
	.int	JMPTBL (L(shl_2), L(shl_table))
	.int	JMPTBL (L(shl_3), L(shl_table))
	.int	JMPTBL (L(shl_4), L(shl_table))
	.int	JMPTBL (L(shl_5), L(shl_table))
	.int	JMPTBL (L(shl_6), L(shl_table))
	.int	JMPTBL (L(shl_7), L(shl_table))
	.int	JMPTBL (L(shl_8), L(shl_table))
	.int	JMPTBL (L(shl_9), L(shl_table))
	.int	JMPTBL (L(shl_10), L(shl_table))
	.int	JMPTBL (L(shl_11), L(shl_table))
	.int	JMPTBL (L(shl_12), L(shl_table))
	.int	JMPTBL (L(shl_13), L(shl_table))
	.int	JMPTBL (L(shl_14), L(shl_table))
	.int	JMPTBL (L(shl_15), L(shl_table))
2958
/* Jump table with one entry per L(bk_write_Nbytes) handler, N = 0..47,
   stored in ascending order of N.  L(bk_write_less32bytes_2) dispatches
   through this table with ECX as the index.  Each entry is produced by
   JMPTBL from the handler label and this table's own label.  */
	.p2align 2
L(table_48_bytes_bwd):
	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))

	/* Leave the section that holds the jump tables.  */
	.popsection
3011
#ifdef USE_AS_MEMMOVE
/* Backward copy path, assembled only when USE_AS_MEMMOVE is defined.
   Data is moved from the end of the buffers toward their start.

   Registers as used by the code below:
     on entry  EAX = source start, EDX = destination start,
               ECX = byte count
               (NOTE(review): set up by code outside this section -
               confirm against the branch that reaches this label)
     in flight EDI = one past the last source byte still to copy,
               EDX = one past the last destination byte still to copy,
               ECX = bytes still to copy
   Clobbers EAX and XMM0-XMM3.  EDI is saved with PUSH on entry and
   restored with POP just before the final table dispatch.  */
	.p2align 4
L(copy_backward):
	PUSH (%edi)
	movl	%eax, %edi
	lea	(%ecx,%edx,1),%edx	/* EDX = destination end.  */
	lea	(%ecx,%edi,1),%edi	/* EDI = source end.  */
	testl	$0x3, %edx
	jnz	L(bk_align)		/* Destination end not 4-byte aligned.  */

L(bk_aligned_4):
	cmp	$64, %ecx
	jae	L(bk_write_more64bytes)

L(bk_write_64bytesless):
	cmp	$32, %ecx
	jb	L(bk_write_less32bytes)

L(bk_write_more32bytes):
	/* Copy 32 bytes as four 8-byte moves, highest address first.  */
	sub	$32, %ecx
	movq	-8(%edi), %xmm0
	movq	%xmm0, -8(%edx)
	movq	-16(%edi), %xmm0
	movq	%xmm0, -16(%edx)
	movq	-24(%edi), %xmm0
	movq	%xmm0, -24(%edx)
	movq	-32(%edi), %xmm0
	movq	%xmm0, -32(%edx)
	sub	$32, %edx
	sub	$32, %edi

L(bk_write_less32bytes):
	/* Rewind both pointers to the start of the remaining ECX bytes:
	   EAX = source, EDX = destination.  Then restore EDI and hand the
	   tail to the L(bk_write_Nbytes) handler selected by ECX.  */
	movl	%edi, %eax
	sub	%ecx, %edx
	sub	%ecx, %eax
	POP (%edi)
L(bk_write_less32bytes_2):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)

	/* L(bk_align) below is entered from L(copy_backward) while EDI is
	   still on the stack.  NOTE(review): CFI_PUSH presumably re-records
	   that unwind state after the POP above - confirm against the macro
	   definition.  */
	CFI_PUSH (%edi)

	.p2align 4
L(bk_align):
	/* With 8 or fewer bytes left, skip alignment and dispatch on ECX.  */
	cmp	$8, %ecx
	jbe	L(bk_write_less32bytes)
	testl	$1, %edx
	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0
	   then (EDX & 2) must be != 0.  */
	jz	L(bk_got2)
	/* Destination end is odd: copy one byte.  */
	sub	$1, %edi
	sub	$1, %ecx
	sub	$1, %edx
	movzbl	(%edi), %eax
	movb	%al, (%edx)

	testl	$2, %edx
	jz	L(bk_aligned_4)

L(bk_got2):
	/* Copy two bytes; EDX is 4-byte aligned afterwards.  */
	sub	$2, %edi
	sub	$2, %ecx
	sub	$2, %edx
	movzwl	(%edi), %eax
	movw	%ax, (%edx)
	jmp	L(bk_aligned_4)

	.p2align 4
L(bk_write_more64bytes):
	/* Check alignment of last byte.  */
	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

/* EDX is aligned 4 bytes, but not 16 bytes.  Copy up to three 4-byte
   words, stopping as soon as EDX becomes 16-byte aligned.  */
L(bk_ssse3_align):
	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	/* Third word: a 4-byte-aligned EDX must be 16-byte aligned after
	   this step, so no further test is needed.  */
	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

L(bk_ssse3_cpy_pre):
	/* Within this section L(bk_write_more64bytes) is reached only with
	   ECX >= 64 and the alignment step removes at most 12 bytes, so
	   ECX >= 52 here and a 32-byte step is still valid.  */
	cmp	$64, %ecx
	jb	L(bk_write_more32bytes)

	.p2align 4
L(bk_ssse3_cpy):
	/* Main loop: 64 bytes per iteration, highest 16 bytes first.
	   Loads are unaligned (movdqu); stores use movdqa because EDX is
	   16-byte aligned here.  */
	sub	$64, %edi
	sub	$64, %ecx
	sub	$64, %edx
	movdqu	0x30(%edi), %xmm3
	movdqa	%xmm3, 0x30(%edx)
	movdqu	0x20(%edi), %xmm2
	movdqa	%xmm2, 0x20(%edx)
	movdqu	0x10(%edi), %xmm1
	movdqa	%xmm1, 0x10(%edx)
	movdqu	(%edi), %xmm0
	movdqa	%xmm0, (%edx)
	cmp	$64, %ecx
	jae	L(bk_ssse3_cpy)
	jmp	L(bk_write_64bytesless)

#endif
3133
END (MEMCPY)

/* NOTE(review): ALIAS_SYMBOL presumably exports memmove as an alias of
   MEMCPY - confirm against the macro definition.  */
ALIAS_SYMBOL(memmove, MEMCPY)