/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif

#ifndef L
# define L(label) .L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg) .cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name) \
	.type name, @function; \
	.globl name; \
	.p2align 4; \
name: \
	cfi_startproc
#endif

#ifndef ALIAS_SYMBOL
# define ALIAS_SYMBOL(alias, original) \
	.globl alias; \
	.equ alias, original
#endif

#ifndef END
# define END(name) \
	cfi_endproc; \
	.size name, .-name
#endif

#define CFI_PUSH(REG) \
	cfi_adjust_cfa_offset (8); \
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG) \
	cfi_adjust_cfa_offset (-8); \
	cfi_restore (REG)

#define PUSH(REG) push REG;
#define POP(REG) pop REG;

#define ENTRANCE \
	PUSH (%rbx); \
	CFI_PUSH (%rbx);
#define RETURN_END \
	POP (%rbx); \
	CFI_POP (%rbx); \
	ret
#define RETURN RETURN_END;

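/* memmove(dst, src, n) for x86-64 with SSE2.  Per the System V AMD64 ABI:
   %rdi = dst, %rsi = src, %rdx = n, and the return value (dst) is produced
   in %rax.  %rbx is callee-saved, which is why ENTRANCE/RETURN push and pop
   it around the body.  */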
	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
	mov	%rdi, %rax

/* Check whether we should copy backward or forward. */
	cmp	%rsi, %rdi
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)
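/* Equal pointers: nothing to copy.  If dst > src the backward variant is
   used so that, when the buffers overlap, source bytes are read before the
   copy overwrites them.  Otherwise fall through to the forward path.  */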

/* Now do checks for lengths.  We handle [0..16], (16..32], (32..64] and
   (64..128] bytes separately; anything larger falls through to the
   aligned big-copy path.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_forward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_forward)

/* Copy [0..32] and return. */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_forward)

/* Copy [0..64] and return. */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_forward)

/* Copy [0..128] and return. */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
/* Align the destination address to a 64-byte boundary. */
/* Load the first 64 source bytes up front, so that the unaligned stores to
   the destination cannot clobber them if the regions overlap. */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3

	lea	64(%rdi), %r8
	and	$-64, %r8 /* r8 now aligned to next 64 byte boundary */
	sub	%rdi, %rsi /* rsi = src - dst = diff */

	movdqu	(%r8, %rsi), %xmm4
	movdqu	16(%r8, %rsi), %xmm5
	movdqu	32(%r8, %rsi), %xmm6
	movdqu	48(%r8, %rsi), %xmm7

	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqa	%xmm4, (%r8)
	movaps	%xmm5, 16(%r8)
	movaps	%xmm6, 32(%r8)
	movaps	%xmm7, 48(%r8)
	add	$64, %r8

	lea	(%rdi, %rdx), %rbx
	and	$-64, %rbx
	cmp	%r8, %rbx
	jbe	L(mm_copy_remaining_forward)

	cmp	__x86_shared_cache_size_half(%rip), %rdx

	ja	L(mm_overlapping_check_forward)

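/* Copies larger than half of the shared cache go through
   L(mm_overlapping_check_forward), which switches to non-temporal stores
   when source and destination are far enough apart, presumably to avoid
   flushing the cache with data that will not be re-read.  Otherwise the
   cached main loop below copies 64 bytes per iteration: unaligned loads,
   64-byte-aligned stores, prefetching the source 128 bytes ahead.  */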
	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%r8, %rsi)

	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqa	%xmm0, (%r8)
	movaps	%xmm1, 16(%r8)
	movaps	%xmm2, 32(%r8)
	movaps	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	add	%rdi, %rdx
	sub	%r8, %rdx
/* Everything up to %r8 in the destination has been copied.  %rdx now holds
   the number of bytes left to copy; compute the matching source position
   in %r9 and handle the tail below.  */
	lea	(%r8, %rsi), %r9

L(mm_remaining_0_64_bytes_forward):
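/* The remaining 0..64 bytes are dispatched by size and copied with a pair
   of loads/stores taken from both ends of the range, so every length in a
   bucket is covered without a byte-by-byte loop.  */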
	cmp	$32, %rdx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %rdx
	ja	L(mm_remaining_17_32_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)

	cmpb	$8, %dl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %dl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %dl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
	movzbl	-1(%r9,%rdx), %esi
	movzbl	(%r9), %ebx
	movb	%sil, -1(%r8,%rdx)
	movb	%bl, (%r8)
	jmp	L(mm_return)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	16(%r9), %xmm1
	movdqu	-32(%r9, %rdx), %xmm2
	movdqu	-16(%r9, %rdx), %xmm3
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, 16(%r8)
	movdqu	%xmm2, -32(%r8, %rdx)
	movdqu	%xmm3, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	-16(%r9, %rdx), %xmm1
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_5_8_bytes_forward):
	movl	(%r9), %esi
	movl	-4(%r9,%rdx), %ebx
	movl	%esi, (%r8)
	movl	%ebx, -4(%r8,%rdx)
	jmp	L(mm_return)

L(mm_remaining_9_16_bytes_forward):
	mov	(%r9), %rsi
	mov	-8(%r9, %rdx), %rbx
	mov	%rsi, (%r8)
	mov	%rbx, -8(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%r9,%rdx), %esi
	movzwl	(%r9), %ebx
	movw	%si, -2(%r8,%rdx)
	movw	%bx, (%r8)
	jmp	L(mm_return)

L(mm_len_0_16_bytes_forward):
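/* Length is at most 16 here, so the bit tests on %dl pick a bucket:
   bit 3 or 4 set means 8..16 bytes, bit 2 means 4..7, bit 1 means 2..3,
   otherwise 0 or 1 byte.  */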
	testb	$24, %dl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %dl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %esi
	movb	%bl, -1(%rdi,%rdx)
	movb	%sil, (%rdi)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %esi
	movw	%bx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %esi
	movl	%ebx, (%rdi)
	movl	%esi, -4(%rdi,%rdx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	mov	(%rsi), %rbx
	mov	-8(%rsi, %rdx), %rsi
	mov	%rbx, (%rdi)
	mov	%rsi, -8(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_recalc_len):
/* Compute in %rdx how many bytes are left to copy after
   the main loop stops. */
	mov	%rbx, %rdx
	sub	%rdi, %rdx
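/* This point is reached only from the backward big-copy paths, where the
   destination from %rbx upward has already been copied; %rdx therefore
   becomes the size of the unaligned head [dst, %rbx) that the backward
   cases which follow still have to handle.  */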
/* The code for copying backwards. */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
   separately. */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_backward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_backward)

/* Copy [0..32] and return. */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_backward)

/* Copy [0..64] and return. */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_backward)

/* Copy [0..128] and return. */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
/* Align the destination address.  The last 64 bytes of the source are
   loaded first so that aligning and storing the destination cannot
   overwrite them when the regions overlap. */
	movdqu	-16(%rsi, %rdx), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	movdqu	-48(%rsi, %rdx), %xmm2
	movdqu	-64(%rsi, %rdx), %xmm3

	lea	(%rdi, %rdx), %r9
	and	$-64, %r9 /* r9 = aligned dst */

	mov	%rsi, %r8
	sub	%rdi, %r8 /* r8 = src - dst, diff */

	movdqu	-16(%r9, %r8), %xmm4
	movdqu	-32(%r9, %r8), %xmm5
	movdqu	-48(%r9, %r8), %xmm6
	movdqu	-64(%r9, %r8), %xmm7

	movdqu	%xmm0, -16(%rdi, %rdx)
	movdqu	%xmm1, -32(%rdi, %rdx)
	movdqu	%xmm2, -48(%rdi, %rdx)
	movdqu	%xmm3, -64(%rdi, %rdx)
	movdqa	%xmm4, -16(%r9)
	movaps	%xmm5, -32(%r9)
	movaps	%xmm6, -48(%r9)
	movaps	%xmm7, -64(%r9)
	lea	-64(%r9), %r9

	lea	64(%rdi), %rbx
	and	$-64, %rbx

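/* Mirror image of the forward setup: %r8 holds src - dst, the destination
   has already been copied from %r9 upward, and %rbx is the lowest 64-byte
   boundary strictly above dst.  Each backward iteration fills the 64 bytes
   below %r9 and moves it down toward %rbx; if nothing aligned is left to
   do, recompute the length and fall back to the small backward cases.  */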
	cmp	%r9, %rbx
	jae	L(mm_recalc_len)

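/* Same size heuristic as in the forward path: copies larger than half of
   the shared cache are candidates for non-temporal stores, subject to the
   distance check at L(mm_overlapping_check_backward).  */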
	cmp	__x86_shared_cache_size_half(%rip), %rdx

	ja	L(mm_overlapping_check_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%r9, %r8)

	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movdqa	%xmm0, -64(%r9)
	movaps	%xmm1, -48(%r9)
	movaps	%xmm2, -32(%r9)
	movaps	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_main_loop_backward)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return. */
L(mm_len_0_16_bytes_backward):
	testb	$24, %dl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %dl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %ecx
	movb	%bl, -1(%rdi,%rdx)
	movb	%cl, (%rdi)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %ecx
	movw	%bx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_backward):
	movl	-4(%rsi,%rdx), %ebx
	movl	-8(%rsi,%rdx), %ecx
	movl	%ebx, -4(%rdi,%rdx)
	movl	%ecx, -8(%rdi,%rdx)
	sub	$8, %rdx
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %ecx
	movl	%ebx, (%rdi)
	movl	%ecx, -4(%rdi,%rdx)

L(mm_return):
	RETURN

/* Big length copy forward part. */

	.p2align 4

L(mm_overlapping_check_forward):
	mov	%rsi, %r9
	add	%rdx, %r9
	cmp	__x86_shared_cache_size(%rip), %r9
	jbe	L(mm_main_loop_forward)

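/* On this path %rsi still holds src - dst, so the check above compares
   (src - dst) + len, i.e. the span from the start of dst to the end of src,
   against the full shared cache size.  If that span fits in the cache the
   regular cached loop is used; otherwise the loop below streams 64 bytes
   per iteration with movntdq, and the sfence afterwards makes the
   non-temporal stores globally visible before the tail is copied with
   ordinary stores.  */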
L(mm_large_page_loop_forward):
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movntdq	%xmm0, (%r8)
	movntdq	%xmm1, 16(%r8)
	movntdq	%xmm2, 32(%r8)
	movntdq	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_large_page_loop_forward)
	sfence
	jmp	L(mm_copy_remaining_forward)

/* Big length copy backward part. */
	.p2align 4

L(mm_overlapping_check_backward):
	mov	%rdi, %r11
	sub	%rsi, %r11 /* r11 = dst - src, diff */
	add	%rdx, %r11
	cmp	__x86_shared_cache_size(%rip), %r11
	jbe	L(mm_main_loop_backward)

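/* Backward counterpart of the check above: (dst - src) + len is the span
   from the start of src to the end of dst.  If it fits in the shared cache
   the cached backward loop is used; otherwise the loop below copies
   downward with non-temporal stores and finishes with an sfence before the
   remaining head is handled via L(mm_recalc_len).  */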
L(mm_large_page_loop_backward):
	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movntdq	%xmm0, -64(%r9)
	movntdq	%xmm1, -48(%r9)
	movntdq	%xmm2, -32(%r9)
	movntdq	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_large_page_loop_backward)
	sfence
	jmp	L(mm_recalc_len)

END (MEMMOVE)

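/* memcpy is exported as an alias of this memmove: a copy routine that
   tolerates overlap also satisfies memcpy's contract, so one overlap-safe
   implementation serves both entry points.  */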
ALIAS_SYMBOL(memcpy, MEMMOVE)