/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

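/* memmove (with memcpy aliased to it at the bottom of this file) for x86-64,
   implemented with SSE2.  Small copies are done with overlapping unaligned
   loads and stores; larger copies align the destination to 64 bytes and copy
   64 bytes per iteration; copies whose footprint exceeds the shared cache
   switch to non-temporal stores. */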

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif

#ifndef L
# define L(label) .L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg) .cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name) \
        .type name, @function; \
        .globl name; \
        .p2align 4; \
name: \
        cfi_startproc
#endif

#ifndef ALIAS_SYMBOL
# define ALIAS_SYMBOL(alias, original) \
        .globl alias; \
        .equ alias, original
#endif

#ifndef END
# define END(name) \
        cfi_endproc; \
        .size name, .-name
#endif

#define CFI_PUSH(REG) \
        cfi_adjust_cfa_offset (8); \
        cfi_rel_offset (REG, 0)

#define CFI_POP(REG) \
        cfi_adjust_cfa_offset (-8); \
        cfi_restore (REG)

#define PUSH(REG) push REG;
#define POP(REG) pop REG;

#define ENTRANCE \
        PUSH (%rbx); \
        CFI_PUSH (%rbx);
#define RETURN_END \
        POP (%rbx); \
        CFI_POP (%rbx); \
        ret
#define RETURN RETURN_END;

        .section .text.sse2,"ax",@progbits
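/* __memcpy_chk(dst, src, count, dst_len) is the FORTIFY checked entry point:
   the destination buffer size arrives in %rcx (fourth argument), and if the
   requested count in %rdx exceeds it we abort via __memcpy_chk_fail. */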
ENTRY (__memcpy_chk)
        cmp %rcx, %rdx
        ja __memcpy_chk_fail
/* Fall through to memcpy/memmove. */
END (__memcpy_chk)
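/* memmove(dst, src, count): %rdi = dst, %rsi = src, %rdx = count.
   The return value (dst) is kept in %rax from here on.  %rbx is callee-saved,
   so ENTRANCE/RETURN push and pop it around its use as a scratch register. */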
ENTRY (MEMMOVE)
        ENTRANCE
        mov %rdi, %rax

/* Check whether we should copy backward or forward. */
        cmp %rsi, %rdi
        je L(mm_return)
        jg L(mm_len_0_or_more_backward)

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
   separately. */
        cmp $16, %rdx
        jbe L(mm_len_0_16_bytes_forward)

        cmp $32, %rdx
        ja L(mm_len_32_or_more_forward)

/* Copy [0..32] and return. */
        movdqu (%rsi), %xmm0
        movdqu -16(%rsi, %rdx), %xmm1
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, -16(%rdi, %rdx)
        jmp L(mm_return)

L(mm_len_32_or_more_forward):
        cmp $64, %rdx
        ja L(mm_len_64_or_more_forward)

/* Copy [0..64] and return. */
        movdqu (%rsi), %xmm0
        movdqu 16(%rsi), %xmm1
        movdqu -16(%rsi, %rdx), %xmm2
        movdqu -32(%rsi, %rdx), %xmm3
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, 16(%rdi)
        movdqu %xmm2, -16(%rdi, %rdx)
        movdqu %xmm3, -32(%rdi, %rdx)
        jmp L(mm_return)

L(mm_len_64_or_more_forward):
        cmp $128, %rdx
        ja L(mm_len_128_or_more_forward)

/* Copy [0..128] and return. */
        movdqu (%rsi), %xmm0
        movdqu 16(%rsi), %xmm1
        movdqu 32(%rsi), %xmm2
        movdqu 48(%rsi), %xmm3
        movdqu -64(%rsi, %rdx), %xmm4
        movdqu -48(%rsi, %rdx), %xmm5
        movdqu -32(%rsi, %rdx), %xmm6
        movdqu -16(%rsi, %rdx), %xmm7
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, 16(%rdi)
        movdqu %xmm2, 32(%rdi)
        movdqu %xmm3, 48(%rdi)
        movdqu %xmm4, -64(%rdi, %rdx)
        movdqu %xmm5, -48(%rdi, %rdx)
        movdqu %xmm6, -32(%rdi, %rdx)
        movdqu %xmm7, -16(%rdi, %rdx)
        jmp L(mm_return)

L(mm_len_128_or_more_forward):
/* Align the destination address to a 64-byte boundary. */
/* First save the (possibly unaligned) first 64 bytes of the copy. */
        movdqu (%rsi), %xmm0
        movdqu 16(%rsi), %xmm1
        movdqu 32(%rsi), %xmm2
        movdqu 48(%rsi), %xmm3

        lea 64(%rdi), %r8
        and $-64, %r8 /* r8 now aligned to next 64 byte boundary */
        sub %rdi, %rsi /* rsi = src - dst = diff */

        movdqu (%r8, %rsi), %xmm4
        movdqu 16(%r8, %rsi), %xmm5
        movdqu 32(%r8, %rsi), %xmm6
        movdqu 48(%r8, %rsi), %xmm7

        movdqu %xmm0, (%rdi)
        movdqu %xmm1, 16(%rdi)
        movdqu %xmm2, 32(%rdi)
        movdqu %xmm3, 48(%rdi)
        movdqa %xmm4, (%r8)
        movaps %xmm5, 16(%r8)
        movaps %xmm6, 32(%r8)
        movaps %xmm7, 48(%r8)
        add $64, %r8

        lea (%rdi, %rdx), %rbx
        and $-64, %rbx
        cmp %r8, %rbx
        jbe L(mm_copy_remaining_forward)

        cmp __x86_shared_cache_size_half(%rip), %rdx

        ja L(mm_overlapping_check_forward)

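/* Main forward loop: %rsi holds (src - dst), so (%r8, %rsi) is the source
   byte that corresponds to the destination pointer %r8.  Each iteration
   prefetches 128 bytes ahead and copies 64 bytes with unaligned loads and
   64-byte-aligned stores, until %r8 reaches %rbx, the last 64-byte-aligned
   address inside the destination. */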
        .p2align 4
L(mm_main_loop_forward):

        prefetcht0 128(%r8, %rsi)

        movdqu (%r8, %rsi), %xmm0
        movdqu 16(%r8, %rsi), %xmm1
        movdqu 32(%r8, %rsi), %xmm2
        movdqu 48(%r8, %rsi), %xmm3
        movdqa %xmm0, (%r8)
        movaps %xmm1, 16(%r8)
        movaps %xmm2, 32(%r8)
        movaps %xmm3, 48(%r8)
        lea 64(%r8), %r8
        cmp %r8, %rbx
        ja L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
        add %rdi, %rdx
        sub %r8, %rdx
/* Everything up to %r8 in the destination has been copied.
   %rdx now holds the number of bytes that are left to copy.
   Compute the matching source position in %r9. */
        lea (%r8, %rsi), %r9

L(mm_remaining_0_64_bytes_forward):
        cmp $32, %rdx
        ja L(mm_remaining_33_64_bytes_forward)
        cmp $16, %rdx
        ja L(mm_remaining_17_32_bytes_forward)
        test %rdx, %rdx
        .p2align 4,,2
        je L(mm_return)

        cmpb $8, %dl
        ja L(mm_remaining_9_16_bytes_forward)
        cmpb $4, %dl
        .p2align 4,,5
        ja L(mm_remaining_5_8_bytes_forward)
        cmpb $2, %dl
        .p2align 4,,1
        ja L(mm_remaining_3_4_bytes_forward)
        movzbl -1(%r9,%rdx), %esi
        movzbl (%r9), %ebx
        movb %sil, -1(%r8,%rdx)
        movb %bl, (%r8)
        jmp L(mm_return)

L(mm_remaining_33_64_bytes_forward):
        movdqu (%r9), %xmm0
        movdqu 16(%r9), %xmm1
        movdqu -32(%r9, %rdx), %xmm2
        movdqu -16(%r9, %rdx), %xmm3
        movdqu %xmm0, (%r8)
        movdqu %xmm1, 16(%r8)
        movdqu %xmm2, -32(%r8, %rdx)
        movdqu %xmm3, -16(%r8, %rdx)
        jmp L(mm_return)

L(mm_remaining_17_32_bytes_forward):
        movdqu (%r9), %xmm0
        movdqu -16(%r9, %rdx), %xmm1
        movdqu %xmm0, (%r8)
        movdqu %xmm1, -16(%r8, %rdx)
        jmp L(mm_return)

L(mm_remaining_5_8_bytes_forward):
        movl (%r9), %esi
        movl -4(%r9,%rdx), %ebx
        movl %esi, (%r8)
        movl %ebx, -4(%r8,%rdx)
        jmp L(mm_return)

L(mm_remaining_9_16_bytes_forward):
        mov (%r9), %rsi
        mov -8(%r9, %rdx), %rbx
        mov %rsi, (%r8)
        mov %rbx, -8(%r8, %rdx)
        jmp L(mm_return)

L(mm_remaining_3_4_bytes_forward):
        movzwl -2(%r9,%rdx), %esi
        movzwl (%r9), %ebx
        movw %si, -2(%r8,%rdx)
        movw %bx, (%r8)
        jmp L(mm_return)

L(mm_len_0_16_bytes_forward):
        testb $24, %dl
        jne L(mm_len_9_16_bytes_forward)
        testb $4, %dl
        .p2align 4,,5
        jne L(mm_len_5_8_bytes_forward)
        test %rdx, %rdx
        .p2align 4,,2
        je L(mm_return)
        testb $2, %dl
        .p2align 4,,1
        jne L(mm_len_2_4_bytes_forward)
        movzbl -1(%rsi,%rdx), %ebx
        movzbl (%rsi), %esi
        movb %bl, -1(%rdi,%rdx)
        movb %sil, (%rdi)
        jmp L(mm_return)

L(mm_len_2_4_bytes_forward):
        movzwl -2(%rsi,%rdx), %ebx
        movzwl (%rsi), %esi
        movw %bx, -2(%rdi,%rdx)
        movw %si, (%rdi)
        jmp L(mm_return)

L(mm_len_5_8_bytes_forward):
        movl (%rsi), %ebx
        movl -4(%rsi,%rdx), %esi
        movl %ebx, (%rdi)
        movl %esi, -4(%rdi,%rdx)
        jmp L(mm_return)

L(mm_len_9_16_bytes_forward):
        mov (%rsi), %rbx
        mov -8(%rsi, %rdx), %rsi
        mov %rbx, (%rdi)
        mov %rsi, -8(%rdi, %rdx)
        jmp L(mm_return)

L(mm_recalc_len):
/* Compute in %rdx how many bytes are left to copy after
   the main loop stops. */
        mov %rbx, %rdx
        sub %rdi, %rdx
/* The code for copying backwards. */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
   separately. */
        cmp $16, %rdx
        jbe L(mm_len_0_16_bytes_backward)

        cmp $32, %rdx
        ja L(mm_len_32_or_more_backward)

/* Copy [0..32] and return. */
        movdqu (%rsi), %xmm0
        movdqu -16(%rsi, %rdx), %xmm1
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, -16(%rdi, %rdx)
        jmp L(mm_return)

L(mm_len_32_or_more_backward):
        cmp $64, %rdx
        ja L(mm_len_64_or_more_backward)

/* Copy [0..64] and return. */
        movdqu (%rsi), %xmm0
        movdqu 16(%rsi), %xmm1
        movdqu -16(%rsi, %rdx), %xmm2
        movdqu -32(%rsi, %rdx), %xmm3
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, 16(%rdi)
        movdqu %xmm2, -16(%rdi, %rdx)
        movdqu %xmm3, -32(%rdi, %rdx)
        jmp L(mm_return)

L(mm_len_64_or_more_backward):
        cmp $128, %rdx
        ja L(mm_len_128_or_more_backward)

/* Copy [0..128] and return. */
        movdqu (%rsi), %xmm0
        movdqu 16(%rsi), %xmm1
        movdqu 32(%rsi), %xmm2
        movdqu 48(%rsi), %xmm3
        movdqu -64(%rsi, %rdx), %xmm4
        movdqu -48(%rsi, %rdx), %xmm5
        movdqu -32(%rsi, %rdx), %xmm6
        movdqu -16(%rsi, %rdx), %xmm7
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, 16(%rdi)
        movdqu %xmm2, 32(%rdi)
        movdqu %xmm3, 48(%rdi)
        movdqu %xmm4, -64(%rdi, %rdx)
        movdqu %xmm5, -48(%rdi, %rdx)
        movdqu %xmm6, -32(%rdi, %rdx)
        movdqu %xmm7, -16(%rdi, %rdx)
        jmp L(mm_return)

L(mm_len_128_or_more_backward):
/* Aligning the address of destination.  Save the last 64 bytes of the
   source so that they are not overwritten by the aligned stores below. */
        movdqu -16(%rsi, %rdx), %xmm0
        movdqu -32(%rsi, %rdx), %xmm1
        movdqu -48(%rsi, %rdx), %xmm2
        movdqu -64(%rsi, %rdx), %xmm3

        lea (%rdi, %rdx), %r9
        and $-64, %r9 /* r9 = aligned dst */

        mov %rsi, %r8
        sub %rdi, %r8 /* r8 = src - dst, diff */

        movdqu -16(%r9, %r8), %xmm4
        movdqu -32(%r9, %r8), %xmm5
        movdqu -48(%r9, %r8), %xmm6
        movdqu -64(%r9, %r8), %xmm7

        movdqu %xmm0, -16(%rdi, %rdx)
        movdqu %xmm1, -32(%rdi, %rdx)
        movdqu %xmm2, -48(%rdi, %rdx)
        movdqu %xmm3, -64(%rdi, %rdx)
        movdqa %xmm4, -16(%r9)
        movaps %xmm5, -32(%r9)
        movaps %xmm6, -48(%r9)
        movaps %xmm7, -64(%r9)
        lea -64(%r9), %r9

        lea 64(%rdi), %rbx
        and $-64, %rbx

        cmp %r9, %rbx
        jae L(mm_recalc_len)

        cmp __x86_shared_cache_size_half(%rip), %rdx

        ja L(mm_overlapping_check_backward)

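/* Main backward loop: %r8 holds (src - dst), so (%r9, %r8) is the source
   byte that corresponds to the destination pointer %r9.  %r9 starts at the
   64-byte-aligned end of the destination and walks down 64 bytes per
   iteration until it reaches %rbx, the first 64-byte boundary past the
   start of the destination. */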
        .p2align 4
L(mm_main_loop_backward):

        prefetcht0 -128(%r9, %r8)

        movdqu -64(%r9, %r8), %xmm0
        movdqu -48(%r9, %r8), %xmm1
        movdqu -32(%r9, %r8), %xmm2
        movdqu -16(%r9, %r8), %xmm3
        movdqa %xmm0, -64(%r9)
        movaps %xmm1, -48(%r9)
        movaps %xmm2, -32(%r9)
        movaps %xmm3, -16(%r9)
        lea -64(%r9), %r9
        cmp %r9, %rbx
        jb L(mm_main_loop_backward)
        jmp L(mm_recalc_len)

/* Copy [0..16] and return. */
L(mm_len_0_16_bytes_backward):
        testb $24, %dl
        jnz L(mm_len_9_16_bytes_backward)
        testb $4, %dl
        .p2align 4,,5
        jnz L(mm_len_5_8_bytes_backward)
        test %rdx, %rdx
        .p2align 4,,2
        je L(mm_return)
        testb $2, %dl
        .p2align 4,,1
        jne L(mm_len_3_4_bytes_backward)
        movzbl -1(%rsi,%rdx), %ebx
        movzbl (%rsi), %ecx
        movb %bl, -1(%rdi,%rdx)
        movb %cl, (%rdi)
        jmp L(mm_return)

L(mm_len_3_4_bytes_backward):
        movzwl -2(%rsi,%rdx), %ebx
        movzwl (%rsi), %ecx
        movw %bx, -2(%rdi,%rdx)
        movw %cx, (%rdi)
        jmp L(mm_return)

L(mm_len_9_16_bytes_backward):
        movl -4(%rsi,%rdx), %ebx
        movl -8(%rsi,%rdx), %ecx
        movl %ebx, -4(%rdi,%rdx)
        movl %ecx, -8(%rdi,%rdx)
        sub $8, %rdx
        jmp L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
        movl (%rsi), %ebx
        movl -4(%rsi,%rdx), %ecx
        movl %ebx, (%rdi)
        movl %ecx, -4(%rdi,%rdx)
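/* Fall through into L(mm_return). */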

L(mm_return):
        RETURN

/* Big length copy forward part. */

        .p2align 4

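/* Streaming decision for the forward copy: %r9 = (src - dst) + count, the
   distance from the start of the destination to the end of the source.  If
   that whole footprint still fits in the shared cache, keep using the
   regular cached loop above; otherwise copy with non-temporal movntdq
   stores, ordered by the sfence below. */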
L(mm_overlapping_check_forward):
        mov %rsi, %r9
        add %rdx, %r9
        cmp __x86_shared_cache_size(%rip), %r9
        jbe L(mm_main_loop_forward)

L(mm_large_page_loop_forward):
        movdqu (%r8, %rsi), %xmm0
        movdqu 16(%r8, %rsi), %xmm1
        movdqu 32(%r8, %rsi), %xmm2
        movdqu 48(%r8, %rsi), %xmm3
        movntdq %xmm0, (%r8)
        movntdq %xmm1, 16(%r8)
        movntdq %xmm2, 32(%r8)
        movntdq %xmm3, 48(%r8)
        lea 64(%r8), %r8
        cmp %r8, %rbx
        ja L(mm_large_page_loop_forward)
        sfence
        jmp L(mm_copy_remaining_forward)

/* Big length copy backward part. */
        .p2align 4

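/* Same streaming decision for the backward copy: %r11 = (dst - src) + count,
   the distance from the start of the source to the end of the destination.
   If it exceeds the shared cache size, use non-temporal stores. */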
L(mm_overlapping_check_backward):
        mov %rdi, %r11
        sub %rsi, %r11 /* r11 = dst - src, diff */
        add %rdx, %r11
        cmp __x86_shared_cache_size(%rip), %r11
        jbe L(mm_main_loop_backward)

L(mm_large_page_loop_backward):
        movdqu -64(%r9, %r8), %xmm0
        movdqu -48(%r9, %r8), %xmm1
        movdqu -32(%r9, %r8), %xmm2
        movdqu -16(%r9, %r8), %xmm3
        movntdq %xmm0, -64(%r9)
        movntdq %xmm1, -48(%r9)
        movntdq %xmm2, -32(%r9)
        movntdq %xmm3, -16(%r9)
        lea -64(%r9), %r9
        cmp %r9, %rbx
        jb L(mm_large_page_loop_backward)
        sfence
        jmp L(mm_recalc_len)

END (MEMMOVE)

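/* memcpy is provided as an alias of memmove: this implementation checks the
   copy direction itself, so it is safe for overlapping buffers as well. */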
ALIAS_SYMBOL(memcpy, MEMMOVE)