/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef MEMMOVE
# define MEMMOVE	memmove
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)	\
	.type name, @function;	\
	.globl name;	\
	.p2align 4;	\
name:	\
	cfi_startproc
#endif

#ifndef ALIAS_SYMBOL
# define ALIAS_SYMBOL(alias, original)	\
	.globl alias;	\
	.equ alias, original
#endif

#ifndef END
# define END(name)	\
	cfi_endproc;	\
	.size name, .-name
#endif

#define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

#define PUSH(REG)	push REG;
#define POP(REG)	pop REG;

#define ENTRANCE	PUSH (%rbx);
#define RETURN_END	POP (%rbx); ret
#define RETURN		RETURN_END;

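/* memmove(dst = %rdi, src = %rsi, n = %rdx) for x86-64 (SysV calling
   convention); the original dst is returned in %rax.  Small copies
   ([0..128] bytes) are done with overlapping unaligned SSE2 loads and
   stores taken from both ends of the buffer.  Larger copies run a
   64-bytes-per-iteration loop with the destination aligned to 64 bytes,
   forward when dst < src and backward when dst > src, switching to
   non-temporal stores for very large, sufficiently distant copies.
   %rbx is saved on entry (ENTRANCE) because it is used as scratch.  */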
	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
	mov	%rdi, %rax

/* Check whether we should copy backward or forward.  */
	cmp	%rsi, %rdi
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)

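/* Forward copies: here dst < src, so every load is from an address above
   everything stored so far and overlap between the buffers is harmless.  */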
/* Now do checks for lengths.  We do [0..16], [16..32], [32..64], [64..128]
   separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_forward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_forward)

/* Copy [0..32] and return.  */
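/* The length here is in (16..32]; the two 16-byte accesses may overlap in
   the middle, which is fine because both loads are issued before either
   store.  */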
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_forward)

/* Copy [0..64] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_forward)

/* Copy [0..128] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
/* Aligning the address of destination.  */
/* Save the first unaligned 64 bytes.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3

	lea	64(%rdi), %r8
	and	$-64, %r8		/* r8 now aligned to next 64 byte boundary */
	sub	%rdi, %rsi		/* rsi = src - dst = diff */
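/* From here on %rsi holds the constant difference (src - dst), so the
   source for any destination address P is simply P + %rsi; only the
   destination pointer %r8 has to be advanced in the loop.  */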

	movdqu	(%r8, %rsi), %xmm4
	movdqu	16(%r8, %rsi), %xmm5
	movdqu	32(%r8, %rsi), %xmm6
	movdqu	48(%r8, %rsi), %xmm7

	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqa	%xmm4, (%r8)
	movaps	%xmm5, 16(%r8)
	movaps	%xmm6, 32(%r8)
	movaps	%xmm7, 48(%r8)
	add	$64, %r8

	lea	(%rdi, %rdx), %rbx
	and	$-64, %rbx
	cmp	%r8, %rbx
	jbe	L(mm_copy_remaining_forward)

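/* Copies larger than half of the shared cache size are candidates for
   non-temporal stores; L(mm_overlapping_check_forward) falls back to the
   ordinary cached loop when source and destination are too close
   together.  */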
	cmp	__x86_shared_cache_size_half(%rip), %rdx
	ja	L(mm_overlapping_check_forward)

	.p2align 4
L(mm_main_loop_forward):

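/* Copy 64 bytes per iteration: unaligned loads from the source, aligned
   stores to the 64-byte-aligned destination in %r8, prefetching two
   iterations (128 bytes) ahead.  */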
	prefetcht0 128(%r8, %rsi)

	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqa	%xmm0, (%r8)
	movaps	%xmm1, 16(%r8)
	movaps	%xmm2, 32(%r8)
	movaps	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	add	%rdi, %rdx
	sub	%r8, %rdx
/* Everything up to the current %r8 position in the destination has been
   copied; %rdx now holds how many bytes are left.  Compute the matching
   source position in %r9.  */
	lea	(%r8, %rsi), %r9

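/* Dispatch on the remaining length.  Each case copies the head and the
   tail of the remaining range, so the accesses may overlap when the
   length is below the block size.  */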
L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %rdx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %rdx
	ja	L(mm_remaining_17_32_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)

	cmpb	$8, %dl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %dl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %dl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
	movzbl	-1(%r9,%rdx), %esi
	movzbl	(%r9), %ebx
	movb	%sil, -1(%r8,%rdx)
	movb	%bl, (%r8)
	jmp	L(mm_return)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	16(%r9), %xmm1
	movdqu	-32(%r9, %rdx), %xmm2
	movdqu	-16(%r9, %rdx), %xmm3
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, 16(%r8)
	movdqu	%xmm2, -32(%r8, %rdx)
	movdqu	%xmm3, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	-16(%r9, %rdx), %xmm1
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_5_8_bytes_forward):
	movl	(%r9), %esi
	movl	-4(%r9,%rdx), %ebx
	movl	%esi, (%r8)
	movl	%ebx, -4(%r8,%rdx)
	jmp	L(mm_return)

L(mm_remaining_9_16_bytes_forward):
	mov	(%r9), %rsi
	mov	-8(%r9, %rdx), %rbx
	mov	%rsi, (%r8)
	mov	%rbx, -8(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%r9,%rdx), %esi
	movzwl	(%r9), %ebx
	movw	%si, -2(%r8,%rdx)
	movw	%bx, (%r8)
	jmp	L(mm_return)

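/* Lengths [0..16]: dispatch on the low bits of the length in %dl.
   Bit 3 or 4 set means 8..16 bytes, bit 2 means 4..7, bit 1 means 2..3;
   each handler copies the first and last part of the range with possibly
   overlapping accesses.  */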
L(mm_len_0_16_bytes_forward):
	testb	$24, %dl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %dl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %esi
	movb	%bl, -1(%rdi,%rdx)
	movb	%sil, (%rdi)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %esi
	movw	%bx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %esi
	movl	%ebx, (%rdi)
	movl	%esi, -4(%rdi,%rdx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	mov	(%rsi), %rbx
	mov	-8(%rsi, %rdx), %rsi
	mov	%rbx, (%rdi)
	mov	%rsi, -8(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_recalc_len):
/* Compute in %rdx how many bytes are left to copy after
   the main loop stops.  */
	mov	%rbx, %rdx
	sub	%rdi, %rdx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

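/* Reached either directly when dst > src (so a plain forward copy could
   clobber the source tail before it is read), or from L(mm_recalc_len)
   with the number of still-uncopied leading bytes in %rdx.  */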
/* Now do checks for lengths.  We do [0..16], [16..32], [32..64], [64..128]
   separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_backward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_backward)

/* Copy [0..32] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_backward)

/* Copy [0..64] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_backward)

/* Copy [0..128] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
/* Aligning the address of destination.  We need to save the last 64 bytes
   of the source first, so that they are not overwritten before they are
   copied.  */
	movdqu	-16(%rsi, %rdx), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	movdqu	-48(%rsi, %rdx), %xmm2
	movdqu	-64(%rsi, %rdx), %xmm3

	lea	(%rdi, %rdx), %r9
	and	$-64, %r9		/* r9 = aligned dst */

	mov	%rsi, %r8
	sub	%rdi, %r8		/* r8 = src - dst, diff */

	movdqu	-16(%r9, %r8), %xmm4
	movdqu	-32(%r9, %r8), %xmm5
	movdqu	-48(%r9, %r8), %xmm6
	movdqu	-64(%r9, %r8), %xmm7

	movdqu	%xmm0, -16(%rdi, %rdx)
	movdqu	%xmm1, -32(%rdi, %rdx)
	movdqu	%xmm2, -48(%rdi, %rdx)
	movdqu	%xmm3, -64(%rdi, %rdx)
	movdqa	%xmm4, -16(%r9)
	movaps	%xmm5, -32(%r9)
	movaps	%xmm6, -48(%r9)
	movaps	%xmm7, -64(%r9)
	lea	-64(%r9), %r9

	lea	64(%rdi), %rbx
	and	$-64, %rbx

	cmp	%r9, %rbx
	jae	L(mm_recalc_len)

	cmp	__x86_shared_cache_size_half(%rip), %rdx
	ja	L(mm_overlapping_check_backward)

	.p2align 4
L(mm_main_loop_backward):

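/* Copy 64 bytes per iteration, walking downwards from the aligned end of
   the destination in %r9; %rbx marks where the aligned loop must stop.  */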
	prefetcht0 -128(%r9, %r8)

	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movdqa	%xmm0, -64(%r9)
	movaps	%xmm1, -48(%r9)
	movaps	%xmm2, -32(%r9)
	movaps	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_main_loop_backward)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %dl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %dl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %ecx
	movb	%bl, -1(%rdi,%rdx)
	movb	%cl, (%rdi)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %ecx
	movw	%bx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_backward):
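/* Copy the last 8 bytes with two 32-bit moves, then shrink the length by
   8 and re-dispatch to copy the remaining leading bytes.  */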
	movl	-4(%rsi,%rdx), %ebx
	movl	-8(%rsi,%rdx), %ecx
	movl	%ebx, -4(%rdi,%rdx)
	movl	%ecx, -8(%rdi,%rdx)
	sub	$8, %rdx
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %ecx
	movl	%ebx, (%rdi)
	movl	%ecx, -4(%rdi,%rdx)

L(mm_return):
	RETURN

/* Big length copy forward part.  */
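/* Reached for forward copies larger than half of the shared cache.  Fall
   back to the cached main loop when (src - dst) + len still fits in the
   shared cache; otherwise stream 64 bytes per iteration with non-temporal
   stores and order them with sfence before the cached tail copy.  */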

	.p2align 4
L(mm_overlapping_check_forward):
	mov	%rsi, %r9
	add	%rdx, %r9
	cmp	__x86_shared_cache_size(%rip), %r9
	jbe	L(mm_main_loop_forward)

L(mm_large_page_loop_forward):
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movntdq	%xmm0, (%r8)
	movntdq	%xmm1, 16(%r8)
	movntdq	%xmm2, 32(%r8)
	movntdq	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_large_page_loop_forward)
	sfence
	jmp	L(mm_copy_remaining_forward)

/* Big length copy backward part.  */
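/* Same idea as the forward case: use the cached backward loop when
   (dst - src) + len still fits in the shared cache, otherwise use
   non-temporal stores followed by sfence.  */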
	.p2align 4
L(mm_overlapping_check_backward):
	mov	%rdi, %r11
	sub	%rsi, %r11		/* r11 = dst - src, diff */
	add	%rdx, %r11
	cmp	__x86_shared_cache_size(%rip), %r11
	jbe	L(mm_main_loop_backward)

L(mm_large_page_loop_backward):
	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movntdq	%xmm0, -64(%r9)
	movntdq	%xmm1, -48(%r9)
	movntdq	%xmm2, -32(%r9)
	movntdq	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_large_page_loop_backward)
	sfence
	jmp	L(mm_recalc_len)

END (MEMMOVE)

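/* memcpy is aliased to memmove: a correct memmove also satisfies the
   memcpy contract for non-overlapping buffers.  */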
ALIAS_SYMBOL(memcpy, MEMMOVE)