/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#define FOR_SILVERMONT
#include "cache.h"

#ifndef MEMMOVE
# define MEMMOVE memmove_generic
#endif

#ifndef L
# define L(label) .L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg) .cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name) \
	.type name, @function; \
	.globl name; \
	.p2align 4; \
name: \
	cfi_startproc
#endif

#ifndef END
# define END(name) \
	cfi_endproc; \
	.size name, .-name
#endif

#define DEST	PARMS
#define SRC	DEST+4
#define LEN	SRC+4

#define CFI_PUSH(REG) \
	cfi_adjust_cfa_offset (4); \
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG) \
	cfi_adjust_cfa_offset (-4); \
	cfi_restore (REG)

#define PUSH(REG) pushl REG; CFI_PUSH (REG)
#define POP(REG) popl REG; CFI_POP (REG)

#define PARMS 8 /* Preserve EBX. */
#define ENTRANCE PUSH (%ebx);
#define RETURN_END POP (%ebx); ret
#define RETURN RETURN_END; CFI_PUSH (%ebx)

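/* Overview: memmove must tolerate overlapping src/dst. The dispatch below
   follows the rule shown in this rough C sketch (illustrative only; the
   real code adds small-size special cases, 64-byte destination alignment,
   and non-temporal stores for very large copies):

	#include <stddef.h>

	void *memmove_sketch(void *dstv, const void *srcv, size_t n)
	{
	    unsigned char *dst = dstv;
	    const unsigned char *src = srcv;
	    if (dst == src)
	        return dstv;
	    if (dst < src) {
	        size_t i;
	        for (i = 0; i < n; i++)
	            dst[i] = src[i];
	    } else {
	        while (n-- > 0)
	            dst[n] = src[n];
	    }
	    return dstv;
	}

   Copying forward when dst < src (and backward when dst > src) guarantees
   that no source byte is overwritten before it has been read. */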
	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx

/* Check whether we should copy backward or forward. When the buffers
   overlap and dst > src, a forward copy would overwrite source bytes
   before they are read, so we copy backward in that case. */
	cmp	%eax, %edx
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)

/* Now do checks for lengths. We handle [0..16], [17..32], [33..64] and
   [65..128] bytes separately. */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_forward)

	cmpl	$32, %ecx
	ja	L(mm_len_32_or_more_forward)

/* Copy [17..32] and return. */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)
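/* Note that in the block above both loads complete before either store,
   so the sequence stays correct when the 16-byte chunks overlap each
   other or when src and dst themselves overlap. The larger fixed-size
   blocks below use the same head-and-tail technique. */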

L(mm_len_32_or_more_forward):
	cmpl	$64, %ecx
	ja	L(mm_len_64_or_more_forward)

/* Copy [33..64] and return. */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmpl	$128, %ecx
	ja	L(mm_len_128_or_more_forward)

/* Copy [65..128] and return. */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
	PUSH (%esi)
	PUSH (%edi)

/* Align the address of the destination to 64 bytes. The first 64 source
   bytes are loaded up front and stored unaligned; the main loops then
   use 64-byte-aligned stores. */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3

	leal	64(%edx), %edi
	andl	$-64, %edi
	subl	%edx, %eax
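/* From this point %eax holds (src - dst), so (%eax, %edi) addresses the
   source byte that corresponds to destination position %edi. */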

	movdqu	(%eax, %edi), %xmm4
	movdqu	16(%eax, %edi), %xmm5
	movdqu	32(%eax, %edi), %xmm6
	movdqu	48(%eax, %edi), %xmm7

	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqa	%xmm4, (%edi)
	movaps	%xmm5, 16(%edi)
	movaps	%xmm6, 32(%edi)
	movaps	%xmm7, 48(%edi)
	addl	$64, %edi

	leal	(%edx, %ecx), %ebx
	andl	$-64, %ebx
	cmp	%edi, %ebx
	jbe	L(mm_copy_remaining_forward)

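/* Copies of at least half the shared cache size take the cache-bypassing
   path below; SHARED_CACHE_SIZE_HALF is defined in cache.h. */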
	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(mm_large_page_loop_forward)

	.p2align 4
L(mm_main_loop_forward):

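/* Prefetch the source 128 bytes (two iterations) ahead of the current
   copy position. */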
	prefetcht0 128(%eax, %edi)

	movdqu	(%eax, %edi), %xmm0
	movdqu	16(%eax, %edi), %xmm1
	movdqu	32(%eax, %edi), %xmm2
	movdqu	48(%eax, %edi), %xmm3
	movdqa	%xmm0, (%edi)
	movaps	%xmm1, 16(%edi)
	movaps	%xmm2, 32(%edi)
	movaps	%xmm3, 48(%edi)
	leal	64(%edi), %edi
	cmp	%edi, %ebx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	addl	%edx, %ecx
	subl	%edi, %ecx
/* Everything up to position %edi in dst has been copied; %ecx now holds
   the number of bytes left to copy. Advance %esi to the matching source
   position (%eax still holds src - dst). */
	leal	(%edi, %eax), %esi

L(mm_remaining_0_64_bytes_forward):
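/* Dispatch on the remaining byte count. Every case below loads both the
   head and the tail of the remaining range before storing anything, so
   overlapping chunks are harmless. */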
	cmp	$32, %ecx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %ecx
	ja	L(mm_remaining_17_32_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return_pop_all)

	cmpb	$8, %cl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %cl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %cl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
	movzbl	-1(%esi,%ecx), %eax
	movzbl	(%esi), %ebx
	movb	%al, -1(%edi,%ecx)
	movb	%bl, (%edi)
	jmp	L(mm_return_pop_all)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	16(%esi), %xmm1
	movdqu	-32(%esi, %ecx), %xmm2
	movdqu	-16(%esi, %ecx), %xmm3
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, 16(%edi)
	movdqu	%xmm2, -32(%edi, %ecx)
	movdqu	%xmm3, -16(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	-16(%esi, %ecx), %xmm1
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, -16(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_9_16_bytes_forward):
	movq	(%esi), %xmm0
	movq	-8(%esi, %ecx), %xmm1
	movq	%xmm0, (%edi)
	movq	%xmm1, -8(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_5_8_bytes_forward):
	movl	(%esi), %eax
	movl	-4(%esi,%ecx), %ebx
	movl	%eax, (%edi)
	movl	%ebx, -4(%edi,%ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%esi,%ecx), %eax
	movzwl	(%esi), %ebx
	movw	%ax, -2(%edi,%ecx)
	movw	%bx, (%edi)
	jmp	L(mm_return_pop_all)

L(mm_len_0_16_bytes_forward):
	testb	$24, %cl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %cl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(mm_return)

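/* The two CFI_POPs below emit unwind annotations only (no instructions):
   every path that reaches L(mm_recalc_len) has already popped %edi and
   %esi. */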
	CFI_POP (%edi)
	CFI_POP (%esi)

L(mm_recalc_len):
/* Compute in %ecx how many bytes are left to copy after
   the main loop stops. */
	movl	%ebx, %ecx
	subl	%edx, %ecx
/* The code for copying backward. */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We handle [0..16], [17..32], [33..64] and
   [65..128] bytes separately. */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_backward)

	cmpl	$32, %ecx
	jg	L(mm_len_32_or_more_backward)

/* Copy [17..32] and return. */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmpl	$64, %ecx
	jg	L(mm_len_64_or_more_backward)

/* Copy [33..64] and return. */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmpl	$128, %ecx
	jg	L(mm_len_128_or_more_backward)

/* Copy [65..128] and return. */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
	PUSH (%esi)
	PUSH (%edi)

/* Align the address of the destination. The last 64 bytes of the source
   are loaded up front so that the aligned stores below cannot overwrite
   source data that has not been read yet. */
	movdqu	-16(%eax, %ecx), %xmm0
	movdqu	-32(%eax, %ecx), %xmm1
	movdqu	-48(%eax, %ecx), %xmm2
	movdqu	-64(%eax, %ecx), %xmm3

	leal	(%edx, %ecx), %edi
	andl	$-64, %edi

	movl	%eax, %esi
	subl	%edx, %esi
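/* %esi now holds (src - dst), so (%edi, %esi) addresses the source byte
   that corresponds to destination position %edi (the mirror of the
   forward path). */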

	movdqu	-16(%edi, %esi), %xmm4
	movdqu	-32(%edi, %esi), %xmm5
	movdqu	-48(%edi, %esi), %xmm6
	movdqu	-64(%edi, %esi), %xmm7

	movdqu	%xmm0, -16(%edx, %ecx)
	movdqu	%xmm1, -32(%edx, %ecx)
	movdqu	%xmm2, -48(%edx, %ecx)
	movdqu	%xmm3, -64(%edx, %ecx)
	movdqa	%xmm4, -16(%edi)
	movdqa	%xmm5, -32(%edi)
	movdqa	%xmm6, -48(%edi)
	movdqa	%xmm7, -64(%edi)
	leal	-64(%edi), %edi

	leal	64(%edx), %ebx
	andl	$-64, %ebx
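/* %ebx is the first 64-byte boundary above dst; the aligned backward
   loop stops there, and L(mm_recalc_len) reenters the small-size code
   for the remaining head of the buffer. */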

	cmp	%edi, %ebx
	jae	L(mm_main_loop_backward_end)

	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(mm_large_page_loop_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%edi, %esi)

	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movdqa	%xmm0, -64(%edi)
	movdqa	%xmm1, -48(%edi)
	movdqa	%xmm2, -32(%edi)
	movdqa	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_main_loop_backward)
L(mm_main_loop_backward_end):
	POP (%edi)
	POP (%esi)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return. */
L(mm_len_0_16_bytes_backward):
	testb	$24, %cl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %cl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_backward):
	PUSH (%esi)
	movl	-4(%eax,%ecx), %ebx
	movl	-8(%eax,%ecx), %esi
	movl	%ebx, -4(%edx,%ecx)
	movl	%esi, -8(%edx,%ecx)
	subl	$8, %ecx
	POP (%esi)
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)

L(mm_return):
	movl	%edx, %eax
	RETURN

L(mm_return_pop_all):
	movl	%edx, %eax
	POP (%edi)
	POP (%esi)
	RETURN

/* Big length copy forward part. */

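/* movntdq is a non-temporal (cache-bypassing) store; for copies this
   large it avoids evicting the caller's working set. The sfence after
   each loop orders the weakly-ordered non-temporal stores before any
   later ordinary stores. */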
	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%eax, %edi), %xmm0
	movdqu	16(%eax, %edi), %xmm1
	movdqu	32(%eax, %edi), %xmm2
	movdqu	48(%eax, %edi), %xmm3
	movntdq	%xmm0, (%edi)
	movntdq	%xmm1, 16(%edi)
	movntdq	%xmm2, 32(%edi)
	movntdq	%xmm3, 48(%edi)
	leal	64(%edi), %edi
	cmp	%edi, %ebx
	ja	L(mm_large_page_loop_forward)
	sfence
	jmp	L(mm_copy_remaining_forward)

521/* Big length copy backward part. */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movntdq	%xmm0, -64(%edi)
	movntdq	%xmm1, -48(%edi)
	movntdq	%xmm2, -32(%edi)
	movntdq	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_large_page_loop_backward)
	sfence
	POP (%edi)
	POP (%esi)
	jmp	L(mm_recalc_len)

END (MEMMOVE)