/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
30
#include "cache.h"

/* Build scaffolding in glibc style: each macro below is defined only if the
   including build has not already supplied its own version, so this file can
   be dropped into different build systems unchanged.  */

/* Exported symbol name; USE_AS_BCOPY builds may redefine it.  */
#ifndef MEMMOVE
# define MEMMOVE memmove
#endif

/* Assembler-local label helper: L(foo) -> .Lfoo (not exported).  */
#ifndef L
# define L(label) .L##label
#endif

/* DWARF call-frame-information shims so unwinders can walk through the
   hand-written pushes/pops below.  */
#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg) .cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
#endif

/* Function entry: type/visibility, 16-byte alignment, CFI open.  */
#ifndef ENTRY
# define ENTRY(name) \
	.type name, @function; \
	.globl name; \
	.p2align 4; \
name: \
	cfi_startproc
#endif

/* Function end: CFI close and symbol size.  */
#ifndef END
# define END(name) \
	cfi_endproc; \
	.size name, .-name
#endif

/* Stack offsets of the three arguments.  bcopy(src, dst, n) swaps the first
   two arguments relative to memmove(dst, src, n).  */
#ifdef USE_AS_BCOPY
# define SRC PARMS
# define DEST SRC+4
# define LEN DEST+4
#else
# define DEST PARMS
# define SRC DEST+4
# define LEN SRC+4
#endif

/* Push/pop wrappers that keep the CFI in sync with the stack pointer.  */
#define CFI_PUSH(REG) \
	cfi_adjust_cfa_offset (4); \
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG) \
	cfi_adjust_cfa_offset (-4); \
	cfi_restore (REG)

#define PUSH(REG) pushl REG; CFI_PUSH (REG)
#define POP(REG) popl REG; CFI_POP (REG)

/* Args start 8 bytes up: return address plus the %ebx saved by ENTRANCE.  */
#define PARMS 8  /* Preserve EBX.  */
#define ENTRANCE PUSH (%ebx);
#define RETURN_END POP (%ebx); ret
/* RETURN re-opens the %ebx CFI record so code emitted after a mid-function
   return site still unwinds correctly.  */
#define RETURN RETURN_END; CFI_PUSH (%ebx)
101
102 .section .text.sse2,"ax",@progbits
103ENTRY (MEMMOVE)
104 ENTRANCE
105 movl LEN(%esp), %ecx
106 movl SRC(%esp), %eax
107 movl DEST(%esp), %edx
108
109/* Check whether we should copy backward or forward. */
110 cmp %eax, %edx
111 je L(mm_return)
Varvara Rainchikfce86142014-05-27 12:41:55 +0400112 jg L(mm_len_0_or_more_backward)
Varvara Rainchik5a922842014-04-24 15:41:20 +0400113
114/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128]
115 separately. */
116 cmp $16, %ecx
117 jbe L(mm_len_0_16_bytes_forward)
118
Varvara Rainchikfce86142014-05-27 12:41:55 +0400119 cmpl $32, %ecx
120 ja L(mm_len_32_or_more_forward)
Varvara Rainchik5a922842014-04-24 15:41:20 +0400121
122/* Copy [0..32] and return. */
123 movdqu (%eax), %xmm0
124 movdqu -16(%eax, %ecx), %xmm1
125 movdqu %xmm0, (%edx)
126 movdqu %xmm1, -16(%edx, %ecx)
127 jmp L(mm_return)
128
129L(mm_len_32_or_more_forward):
Varvara Rainchikfce86142014-05-27 12:41:55 +0400130 cmpl $64, %ecx
131 ja L(mm_len_64_or_more_forward)
Varvara Rainchik5a922842014-04-24 15:41:20 +0400132
133/* Copy [0..64] and return. */
134 movdqu (%eax), %xmm0
135 movdqu 16(%eax), %xmm1
136 movdqu -16(%eax, %ecx), %xmm2
137 movdqu -32(%eax, %ecx), %xmm3
138 movdqu %xmm0, (%edx)
139 movdqu %xmm1, 16(%edx)
140 movdqu %xmm2, -16(%edx, %ecx)
141 movdqu %xmm3, -32(%edx, %ecx)
142 jmp L(mm_return)
143
144L(mm_len_64_or_more_forward):
Varvara Rainchikfce86142014-05-27 12:41:55 +0400145 cmpl $128, %ecx
146 ja L(mm_len_128_or_more_forward)
Varvara Rainchik5a922842014-04-24 15:41:20 +0400147
148/* Copy [0..128] and return. */
149 movdqu (%eax), %xmm0
150 movdqu 16(%eax), %xmm1
151 movdqu 32(%eax), %xmm2
152 movdqu 48(%eax), %xmm3
153 movdqu -64(%eax, %ecx), %xmm4
154 movdqu -48(%eax, %ecx), %xmm5
155 movdqu -32(%eax, %ecx), %xmm6
156 movdqu -16(%eax, %ecx), %xmm7
157 movdqu %xmm0, (%edx)
158 movdqu %xmm1, 16(%edx)
159 movdqu %xmm2, 32(%edx)
160 movdqu %xmm3, 48(%edx)
161 movdqu %xmm4, -64(%edx, %ecx)
162 movdqu %xmm5, -48(%edx, %ecx)
163 movdqu %xmm6, -32(%edx, %ecx)
164 movdqu %xmm7, -16(%edx, %ecx)
165 jmp L(mm_return)
166
167L(mm_len_128_or_more_forward):
Varvara Rainchik5a922842014-04-24 15:41:20 +0400168 PUSH (%esi)
169 PUSH (%edi)
Varvara Rainchik5a922842014-04-24 15:41:20 +0400170
171/* Aligning the address of destination. */
Varvara Rainchikfce86142014-05-27 12:41:55 +0400172 movdqu (%eax), %xmm0
173 movdqu 16(%eax), %xmm1
174 movdqu 32(%eax), %xmm2
175 movdqu 48(%eax), %xmm3
Varvara Rainchik5a922842014-04-24 15:41:20 +0400176
Varvara Rainchikfce86142014-05-27 12:41:55 +0400177 leal 64(%edx), %edi
178 andl $-64, %edi
179 subl %edx, %eax
Varvara Rainchik5a922842014-04-24 15:41:20 +0400180
Varvara Rainchikfce86142014-05-27 12:41:55 +0400181 movdqu (%eax, %edi), %xmm4
182 movdqu 16(%eax, %edi), %xmm5
183 movdqu 32(%eax, %edi), %xmm6
184 movdqu 48(%eax, %edi), %xmm7
Varvara Rainchik5a922842014-04-24 15:41:20 +0400185
Varvara Rainchikfce86142014-05-27 12:41:55 +0400186 movdqu %xmm0, (%edx)
187 movdqu %xmm1, 16(%edx)
188 movdqu %xmm2, 32(%edx)
189 movdqu %xmm3, 48(%edx)
190 movdqa %xmm4, (%edi)
191 movaps %xmm5, 16(%edi)
192 movaps %xmm6, 32(%edi)
193 movaps %xmm7, 48(%edi)
194 addl $64, %edi
Varvara Rainchik5a922842014-04-24 15:41:20 +0400195
Varvara Rainchikfce86142014-05-27 12:41:55 +0400196 leal (%edx, %ecx), %ebx
Varvara Rainchik5a922842014-04-24 15:41:20 +0400197 andl $-64, %ebx
Varvara Rainchikfce86142014-05-27 12:41:55 +0400198 cmp %edi, %ebx
Varvara Rainchik5a922842014-04-24 15:41:20 +0400199 jbe L(mm_copy_remaining_forward)
200
Varvara Rainchikfce86142014-05-27 12:41:55 +0400201 cmp $SHARED_CACHE_SIZE_HALF, %ecx
202 jae L(mm_large_page_loop_forward)
203
Varvara Rainchik5a922842014-04-24 15:41:20 +0400204 .p2align 4
205L(mm_main_loop_forward):
206
Varvara Rainchikfce86142014-05-27 12:41:55 +0400207 prefetcht0 128(%eax, %edi)
Varvara Rainchik5a922842014-04-24 15:41:20 +0400208
Varvara Rainchikfce86142014-05-27 12:41:55 +0400209 movdqu (%eax, %edi), %xmm0
210 movdqu 16(%eax, %edi), %xmm1
211 movdqu 32(%eax, %edi), %xmm2
212 movdqu 48(%eax, %edi), %xmm3
213 movdqa %xmm0, (%edi)
214 movaps %xmm1, 16(%edi)
215 movaps %xmm2, 32(%edi)
216 movaps %xmm3, 48(%edi)
217 leal 64(%edi), %edi
218 cmp %edi, %ebx
Varvara Rainchik5a922842014-04-24 15:41:20 +0400219 ja L(mm_main_loop_forward)
220
221L(mm_copy_remaining_forward):
Varvara Rainchikfce86142014-05-27 12:41:55 +0400222 addl %edx, %ecx
223 subl %edi, %ecx
224/* We copied all up till %edi position in the dst.
Varvara Rainchik5a922842014-04-24 15:41:20 +0400225 In %ecx now is how many bytes are left to copy.
226 Now we need to advance %esi. */
Varvara Rainchikfce86142014-05-27 12:41:55 +0400227 leal (%edi, %eax), %esi
Varvara Rainchik5a922842014-04-24 15:41:20 +0400228
229L(mm_remaining_0_64_bytes_forward):
230 cmp $32, %ecx
231 ja L(mm_remaining_33_64_bytes_forward)
232 cmp $16, %ecx
233 ja L(mm_remaining_17_32_bytes_forward)
234 testl %ecx, %ecx
235 .p2align 4,,2
236 je L(mm_return_pop_all)
237
238 cmpb $8, %cl
239 ja L(mm_remaining_9_16_bytes_forward)
240 cmpb $4, %cl
241 .p2align 4,,5
242 ja L(mm_remaining_5_8_bytes_forward)
243 cmpb $2, %cl
244 .p2align 4,,1
245 ja L(mm_remaining_3_4_bytes_forward)
246 movzbl -1(%esi,%ecx), %eax
247 movzbl (%esi), %ebx
Varvara Rainchikfce86142014-05-27 12:41:55 +0400248 movb %al, -1(%edi,%ecx)
249 movb %bl, (%edi)
Varvara Rainchik5a922842014-04-24 15:41:20 +0400250 jmp L(mm_return_pop_all)
251
252L(mm_remaining_33_64_bytes_forward):
253 movdqu (%esi), %xmm0
254 movdqu 16(%esi), %xmm1
255 movdqu -32(%esi, %ecx), %xmm2
256 movdqu -16(%esi, %ecx), %xmm3
Varvara Rainchikfce86142014-05-27 12:41:55 +0400257 movdqu %xmm0, (%edi)
258 movdqu %xmm1, 16(%edi)
259 movdqu %xmm2, -32(%edi, %ecx)
260 movdqu %xmm3, -16(%edi, %ecx)
Varvara Rainchik5a922842014-04-24 15:41:20 +0400261 jmp L(mm_return_pop_all)
262
263L(mm_remaining_17_32_bytes_forward):
264 movdqu (%esi), %xmm0
265 movdqu -16(%esi, %ecx), %xmm1
Varvara Rainchikfce86142014-05-27 12:41:55 +0400266 movdqu %xmm0, (%edi)
267 movdqu %xmm1, -16(%edi, %ecx)
Varvara Rainchik5a922842014-04-24 15:41:20 +0400268 jmp L(mm_return_pop_all)
269
270L(mm_remaining_9_16_bytes_forward):
271 movq (%esi), %xmm0
272 movq -8(%esi, %ecx), %xmm1
Varvara Rainchikfce86142014-05-27 12:41:55 +0400273 movq %xmm0, (%edi)
274 movq %xmm1, -8(%edi, %ecx)
Varvara Rainchik5a922842014-04-24 15:41:20 +0400275 jmp L(mm_return_pop_all)
276
Varvara Rainchikfce86142014-05-27 12:41:55 +0400277L(mm_remaining_5_8_bytes_forward):
278 movl (%esi), %eax
279 movl -4(%esi,%ecx), %ebx
280 movl %eax, (%edi)
281 movl %ebx, -4(%edi,%ecx)
282 jmp L(mm_return_pop_all)
283
284L(mm_remaining_3_4_bytes_forward):
285 movzwl -2(%esi,%ecx), %eax
286 movzwl (%esi), %ebx
287 movw %ax, -2(%edi,%ecx)
288 movw %bx, (%edi)
289 jmp L(mm_return_pop_all)
Varvara Rainchik5a922842014-04-24 15:41:20 +0400290
291L(mm_len_0_16_bytes_forward):
292 testb $24, %cl
293 jne L(mm_len_9_16_bytes_forward)
294 testb $4, %cl
295 .p2align 4,,5
296 jne L(mm_len_5_8_bytes_forward)
297 testl %ecx, %ecx
298 .p2align 4,,2
299 je L(mm_return)
300 testb $2, %cl
301 .p2align 4,,1
302 jne L(mm_len_2_4_bytes_forward)
303 movzbl -1(%eax,%ecx), %ebx
304 movzbl (%eax), %eax
305 movb %bl, -1(%edx,%ecx)
306 movb %al, (%edx)
307 jmp L(mm_return)
308
309L(mm_len_2_4_bytes_forward):
310 movzwl -2(%eax,%ecx), %ebx
311 movzwl (%eax), %eax
312 movw %bx, -2(%edx,%ecx)
313 movw %ax, (%edx)
314 jmp L(mm_return)
315
316L(mm_len_5_8_bytes_forward):
317 movl (%eax), %ebx
318 movl -4(%eax,%ecx), %eax
319 movl %ebx, (%edx)
320 movl %eax, -4(%edx,%ecx)
321 jmp L(mm_return)
322
323L(mm_len_9_16_bytes_forward):
324 movq (%eax), %xmm0
325 movq -8(%eax, %ecx), %xmm1
326 movq %xmm0, (%edx)
327 movq %xmm1, -8(%edx, %ecx)
328 jmp L(mm_return)
329
Christopher Ferris97b6e132016-02-17 19:17:02 -0800330 CFI_POP (%edi)
331 CFI_POP (%esi)
332
Varvara Rainchikfce86142014-05-27 12:41:55 +0400333L(mm_recalc_len):
334/* Compute in %ecx how many bytes are left to copy after
335 the main loop stops. */
336 movl %ebx, %ecx
337 subl %edx, %ecx
Varvara Rainchik5a922842014-04-24 15:41:20 +0400338/* The code for copying backwards. */
339L(mm_len_0_or_more_backward):
340
Varvara Rainchikfce86142014-05-27 12:41:55 +0400341/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
Varvara Rainchik5a922842014-04-24 15:41:20 +0400342 separately. */
343 cmp $16, %ecx
344 jbe L(mm_len_0_16_bytes_backward)
345
Varvara Rainchikfce86142014-05-27 12:41:55 +0400346 cmpl $32, %ecx
Varvara Rainchik5a922842014-04-24 15:41:20 +0400347 jg L(mm_len_32_or_more_backward)
348
349/* Copy [0..32] and return. */
350 movdqu (%eax), %xmm0
351 movdqu -16(%eax, %ecx), %xmm1
352 movdqu %xmm0, (%edx)
353 movdqu %xmm1, -16(%edx, %ecx)
354 jmp L(mm_return)
355
356L(mm_len_32_or_more_backward):
Varvara Rainchikfce86142014-05-27 12:41:55 +0400357 cmpl $64, %ecx
Varvara Rainchik5a922842014-04-24 15:41:20 +0400358 jg L(mm_len_64_or_more_backward)
359
360/* Copy [0..64] and return. */
361 movdqu (%eax), %xmm0
362 movdqu 16(%eax), %xmm1
363 movdqu -16(%eax, %ecx), %xmm2
364 movdqu -32(%eax, %ecx), %xmm3
365 movdqu %xmm0, (%edx)
366 movdqu %xmm1, 16(%edx)
367 movdqu %xmm2, -16(%edx, %ecx)
368 movdqu %xmm3, -32(%edx, %ecx)
369 jmp L(mm_return)
370
371L(mm_len_64_or_more_backward):
Varvara Rainchikfce86142014-05-27 12:41:55 +0400372 cmpl $128, %ecx
Varvara Rainchik5a922842014-04-24 15:41:20 +0400373 jg L(mm_len_128_or_more_backward)
374
375/* Copy [0..128] and return. */
376 movdqu (%eax), %xmm0
377 movdqu 16(%eax), %xmm1
378 movdqu 32(%eax), %xmm2
379 movdqu 48(%eax), %xmm3
380 movdqu -64(%eax, %ecx), %xmm4
381 movdqu -48(%eax, %ecx), %xmm5
382 movdqu -32(%eax, %ecx), %xmm6
383 movdqu -16(%eax, %ecx), %xmm7
384 movdqu %xmm0, (%edx)
385 movdqu %xmm1, 16(%edx)
386 movdqu %xmm2, 32(%edx)
387 movdqu %xmm3, 48(%edx)
388 movdqu %xmm4, -64(%edx, %ecx)
389 movdqu %xmm5, -48(%edx, %ecx)
390 movdqu %xmm6, -32(%edx, %ecx)
391 movdqu %xmm7, -16(%edx, %ecx)
392 jmp L(mm_return)
393
394L(mm_len_128_or_more_backward):
Varvara Rainchik5a922842014-04-24 15:41:20 +0400395 PUSH (%esi)
396 PUSH (%edi)
397
398/* Aligning the address of destination. We need to save
399 16 bits from the source in order not to overwrite them. */
400 movdqu -16(%eax, %ecx), %xmm0
401 movdqu -32(%eax, %ecx), %xmm1
402 movdqu -48(%eax, %ecx), %xmm2
403 movdqu -64(%eax, %ecx), %xmm3
404
405 leal (%edx, %ecx), %edi
406 andl $-64, %edi
407
408 movl %eax, %esi
409 subl %edx, %esi
410
411 movdqu -16(%edi, %esi), %xmm4
412 movdqu -32(%edi, %esi), %xmm5
413 movdqu -48(%edi, %esi), %xmm6
414 movdqu -64(%edi, %esi), %xmm7
415
416 movdqu %xmm0, -16(%edx, %ecx)
417 movdqu %xmm1, -32(%edx, %ecx)
418 movdqu %xmm2, -48(%edx, %ecx)
419 movdqu %xmm3, -64(%edx, %ecx)
420 movdqa %xmm4, -16(%edi)
421 movdqa %xmm5, -32(%edi)
422 movdqa %xmm6, -48(%edi)
423 movdqa %xmm7, -64(%edi)
424 leal -64(%edi), %edi
425
426 leal 64(%edx), %ebx
427 andl $-64, %ebx
428
Varvara Rainchik5a922842014-04-24 15:41:20 +0400429 cmp %edi, %ebx
Varvara Rainchikfce86142014-05-27 12:41:55 +0400430 jae L(mm_main_loop_backward_end)
Varvara Rainchik5a922842014-04-24 15:41:20 +0400431
Varvara Rainchikfce86142014-05-27 12:41:55 +0400432 cmp $SHARED_CACHE_SIZE_HALF, %ecx
433 jae L(mm_large_page_loop_backward)
Varvara Rainchik5a922842014-04-24 15:41:20 +0400434
435 .p2align 4
436L(mm_main_loop_backward):
437
438 prefetcht0 -128(%edi, %esi)
439
440 movdqu -64(%edi, %esi), %xmm0
441 movdqu -48(%edi, %esi), %xmm1
442 movdqu -32(%edi, %esi), %xmm2
443 movdqu -16(%edi, %esi), %xmm3
444 movdqa %xmm0, -64(%edi)
445 movdqa %xmm1, -48(%edi)
446 movdqa %xmm2, -32(%edi)
447 movdqa %xmm3, -16(%edi)
448 leal -64(%edi), %edi
449 cmp %edi, %ebx
450 jb L(mm_main_loop_backward)
Varvara Rainchikfce86142014-05-27 12:41:55 +0400451L(mm_main_loop_backward_end):
Varvara Rainchik5a922842014-04-24 15:41:20 +0400452 POP (%edi)
453 POP (%esi)
Varvara Rainchikfce86142014-05-27 12:41:55 +0400454 jmp L(mm_recalc_len)
Varvara Rainchik5a922842014-04-24 15:41:20 +0400455
456/* Copy [0..16] and return. */
457L(mm_len_0_16_bytes_backward):
458 testb $24, %cl
459 jnz L(mm_len_9_16_bytes_backward)
460 testb $4, %cl
461 .p2align 4,,5
462 jnz L(mm_len_5_8_bytes_backward)
463 testl %ecx, %ecx
464 .p2align 4,,2
465 je L(mm_return)
466 testb $2, %cl
467 .p2align 4,,1
468 jne L(mm_len_3_4_bytes_backward)
469 movzbl -1(%eax,%ecx), %ebx
470 movzbl (%eax), %eax
471 movb %bl, -1(%edx,%ecx)
472 movb %al, (%edx)
473 jmp L(mm_return)
474
475L(mm_len_3_4_bytes_backward):
476 movzwl -2(%eax,%ecx), %ebx
477 movzwl (%eax), %eax
478 movw %bx, -2(%edx,%ecx)
479 movw %ax, (%edx)
480 jmp L(mm_return)
481
482L(mm_len_9_16_bytes_backward):
483 PUSH (%esi)
484 movl -4(%eax,%ecx), %ebx
485 movl -8(%eax,%ecx), %esi
486 movl %ebx, -4(%edx,%ecx)
487 movl %esi, -8(%edx,%ecx)
488 subl $8, %ecx
489 POP (%esi)
490 jmp L(mm_len_0_16_bytes_backward)
491
492L(mm_len_5_8_bytes_backward):
493 movl (%eax), %ebx
494 movl -4(%eax,%ecx), %eax
495 movl %ebx, (%edx)
496 movl %eax, -4(%edx,%ecx)
497
498L(mm_return):
499 movl %edx, %eax
500 RETURN
501
502L(mm_return_pop_all):
Varvara Rainchikfce86142014-05-27 12:41:55 +0400503 movl %edx, %eax
Varvara Rainchik5a922842014-04-24 15:41:20 +0400504 POP (%edi)
505 POP (%esi)
506 RETURN
507
508/* Big length copy forward part. */
509
Varvara Rainchik5a922842014-04-24 15:41:20 +0400510 .p2align 4
511L(mm_large_page_loop_forward):
Varvara Rainchikfce86142014-05-27 12:41:55 +0400512 movdqu (%eax, %edi), %xmm0
513 movdqu 16(%eax, %edi), %xmm1
514 movdqu 32(%eax, %edi), %xmm2
515 movdqu 48(%eax, %edi), %xmm3
516 movntdq %xmm0, (%edi)
517 movntdq %xmm1, 16(%edi)
518 movntdq %xmm2, 32(%edi)
519 movntdq %xmm3, 48(%edi)
520 leal 64(%edi), %edi
521 cmp %edi, %ebx
Varvara Rainchik5a922842014-04-24 15:41:20 +0400522 ja L(mm_large_page_loop_forward)
523 sfence
Varvara Rainchikfce86142014-05-27 12:41:55 +0400524 jmp L(mm_copy_remaining_forward)
Varvara Rainchik5a922842014-04-24 15:41:20 +0400525
526/* Big length copy backward part. */
Varvara Rainchik5a922842014-04-24 15:41:20 +0400527 .p2align 4
528L(mm_large_page_loop_backward):
529 movdqu -64(%edi, %esi), %xmm0
530 movdqu -48(%edi, %esi), %xmm1
531 movdqu -32(%edi, %esi), %xmm2
532 movdqu -16(%edi, %esi), %xmm3
533 movntdq %xmm0, -64(%edi)
534 movntdq %xmm1, -48(%edi)
535 movntdq %xmm2, -32(%edi)
536 movntdq %xmm3, -16(%edi)
537 leal -64(%edi), %edi
538 cmp %edi, %ebx
539 jb L(mm_large_page_loop_backward)
Varvara Rainchikfce86142014-05-27 12:41:55 +0400540 sfence
Varvara Rainchik5a922842014-04-24 15:41:20 +0400541 POP (%edi)
542 POP (%esi)
Varvara Rainchikfce86142014-05-27 12:41:55 +0400543 jmp L(mm_recalc_len)
Varvara Rainchik5a922842014-04-24 15:41:20 +0400544
545END (MEMMOVE)