/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2022, Arm Limited.
 * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include <private/bionic_asm.h>

#define dstin x0
#define src x1
#define count x2
#define dst x3
#define srcend x4
#define dstend x5
#define A_l x6
#define A_lw w6
#define A_h x7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_l x10
#define C_lw w10
#define C_h x11
#define D_l x12
#define D_h x13
#define E_l x14
#define E_h x15
#define F_l x16
#define F_h x17
#define G_l count
#define G_h dst
#define H_l src
#define H_h srcend
#define tmp1 x14
#define tmp2 x16
#define SMALL_BUFFER_SIZE 48
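/* SMALL_BUFFER_SIZE is in KiB: copies larger than SMALL_BUFFER_SIZE << 10
   bytes (48 KiB) take the non-temporal ldnp/stnp path below. */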

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies. The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

ALIAS_SYMBOL (__memmove_aarch64_nt, __memcpy_aarch64_nt)
ENTRY (__memcpy_aarch64_nt)

	add srcend, src, count
	add dstend, dstin, count
	cmp count, 128
	b.hi L(copy_long)
	cmp count, 32
	b.hi L(copy32_128)

	/* Small copies: 0..32 bytes. */
	cmp count, 16
	b.lo L(copy16)
	ldp A_l, A_h, [src]
	ldp D_l, D_h, [srcend, -16]
	stp A_l, A_h, [dstin]
	stp D_l, D_h, [dstend, -16]
	ret
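
	/* The 16..32-byte case above loads 16 bytes from the start and 16 from
	   the end before storing anything, so the two stores may overlap but
	   always cover the whole range. A minimal C sketch of the same idea,
	   assuming non-overlapping buffers (illustrative only, not part of the
	   build):

		#include <string.h>
		#include <stdint.h>

		static void copy_16_32(uint8_t *dst, const uint8_t *src, size_t n)
		{
			uint64_t head[2], tail[2];
			memcpy(head, src, 16);           // first 16 bytes
			memcpy(tail, src + n - 16, 16);  // last 16 bytes
			memcpy(dst, head, 16);
			memcpy(dst + n - 16, tail, 16);  // overlaps head when n < 32
		}
	*/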

	/* Copy 8-15 bytes. */
L(copy16):
	tbz count, 3, L(copy8)
	ldr A_l, [src]
	ldr A_h, [srcend, -8]
	str A_l, [dstin]
	str A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes. */
L(copy8):
	tbz count, 2, L(copy4)
	ldr A_lw, [src]
	ldr B_lw, [srcend, -4]
	str A_lw, [dstin]
	str B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence. */
L(copy4):
	cbz count, L(copy0)
	lsr tmp1, count, 1
	ldrb A_lw, [src]
	ldrb C_lw, [srcend, -1]
	ldrb B_lw, [src, tmp1]
	strb A_lw, [dstin]
	strb B_lw, [dstin, tmp1]
	strb C_lw, [dstend, -1]
L(copy0):
	ret
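
	/* The branchless 0..3-byte sequence above reads the first byte, the
	   middle byte (offset count/2) and the last byte, then writes them to
	   the same offsets; for count == 1 all three coincide. A minimal C
	   sketch (illustrative only, not part of the build):

		#include <stddef.h>
		#include <stdint.h>

		static void copy_0_3(uint8_t *dst, const uint8_t *src, size_t n)
		{
			if (n == 0)
				return;
			uint8_t a = src[0];       // first byte
			uint8_t b = src[n >> 1];  // middle byte
			uint8_t c = src[n - 1];   // last byte
			dst[0] = a;
			dst[n >> 1] = b;
			dst[n - 1] = c;
		}
	*/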

	.p2align 4
	/* Medium copies: 33..128 bytes. */
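	/* A_l..D_h hold the first 32 and last 32 bytes; for counts up to 64 the
	   two 32-byte stores overlap but still cover the whole range. */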
L(copy32_128):
	ldp A_l, A_h, [src]
	ldp B_l, B_h, [src, 16]
	ldp C_l, C_h, [srcend, -32]
	ldp D_l, D_h, [srcend, -16]
	cmp count, 64
	b.hi L(copy128)
	stp A_l, A_h, [dstin]
	stp B_l, B_h, [dstin, 16]
	stp C_l, C_h, [dstend, -32]
	stp D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes. */
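	/* Copy the first 64 bytes and the last 32 unconditionally; when
	   count > 96, also the 32 bytes preceding the last 32 (G/H), so the
	   overlapping stores always cover the whole range. */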
L(copy128):
	ldp E_l, E_h, [src, 32]
	ldp F_l, F_h, [src, 48]
	cmp count, 96
	b.ls L(copy96)
	ldp G_l, G_h, [srcend, -64]
	ldp H_l, H_h, [srcend, -48]
	stp G_l, G_h, [dstend, -64]
	stp H_l, H_h, [dstend, -48]
L(copy96):
	stp A_l, A_h, [dstin]
	stp B_l, B_h, [dstin, 16]
	stp E_l, E_h, [dstin, 32]
	stp F_l, F_h, [dstin, 48]
	stp C_l, C_h, [dstend, -32]
	stp D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes. */
L(copy_long):
	mov tmp2, #SMALL_BUFFER_SIZE
	cmp count, tmp2, LSL#10
	bgt L(copy_long_nt)
	/* Use backwards copy if there is an overlap. */
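	/* dstin - src, compared unsigned against count, is below count only when
	   the destination starts inside the source buffer; a forward copy would
	   then overwrite source bytes before they are read. */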
	sub tmp1, dstin, src
	cbz tmp1, L(copy0)
	cmp tmp1, count
	b.lo L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment. */

	ldp D_l, D_h, [src]
	and tmp1, dstin, 15
	bic dst, dstin, 15
	sub src, src, tmp1
	add count, count, tmp1	/* Count is now 16 too large. */
	ldp A_l, A_h, [src, 16]
	stp D_l, D_h, [dstin]
	ldp B_l, B_h, [src, 32]
	ldp C_l, C_h, [src, 48]
	ldp D_l, D_h, [src, 64]!
	subs count, count, 128 + 16	/* Test and readjust count. */
	b.ls L(copy64_from_end)

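	/* Software-pipelined copy loop: each iteration stores the 64 bytes
	   loaded on the previous iteration while loading the next 64, hiding
	   load latency. */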
L(loop64):
	stp A_l, A_h, [dst, 16]
	ldp A_l, A_h, [src, 16]
	stp B_l, B_h, [dst, 32]
	ldp B_l, B_h, [src, 32]
	stp C_l, C_h, [dst, 48]
	ldp C_l, C_h, [src, 48]
	stp D_l, D_h, [dst, 64]!
	ldp D_l, D_h, [src, 64]!
	subs count, count, 64
	b.hi L(loop64)

	/* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
	ldp E_l, E_h, [srcend, -64]
	stp A_l, A_h, [dst, 16]
	ldp A_l, A_h, [srcend, -48]
	stp B_l, B_h, [dst, 32]
	ldp B_l, B_h, [srcend, -32]
	stp C_l, C_h, [dst, 48]
	ldp C_l, C_h, [srcend, -16]
	stp D_l, D_h, [dst, 64]
	stp E_l, E_h, [dstend, -64]
	stp A_l, A_h, [dstend, -48]
	stp B_l, B_h, [dstend, -32]
	stp C_l, C_h, [dstend, -16]
	ret
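
	/* The forward large-copy scheme above, as a hedged C sketch (illustrative
	   only, not part of the build; assumes count > 128 and non-overlapping
	   buffers): copy an unaligned 16-byte head, move dst up to the next
	   16-byte boundary, stream 64-byte blocks, then finish with an
	   unconditional 64-byte copy from the end.

		#include <string.h>
		#include <stdint.h>

		static void copy_large_fwd(uint8_t *dstin, const uint8_t *src,
					   size_t count)
		{
			uint8_t *dstend = dstin + count;
			const uint8_t *srcend = src + count;

			memcpy(dstin, src, 16);                // unaligned head
			size_t skew = 16 - ((uintptr_t)dstin & 15);
			uint8_t *dst = dstin + skew;           // 16-byte aligned
			src += skew;

			while (dstend - dst > 64) {            // > 64 bytes left
				memcpy(dst, src, 64);
				dst += 64;
				src += 64;
			}
			memcpy(dstend - 64, srcend - 64, 64);  // tail from the end
		}
	*/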

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dst to 16-byte alignment. */
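	/* The copy runs downwards, so dstend (not dstin) is brought to 16-byte
	   alignment and the tail is finished from the start instead of the end. */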
L(copy_long_backwards):
	ldp D_l, D_h, [srcend, -16]
	and tmp1, dstend, 15
	sub srcend, srcend, tmp1
	sub count, count, tmp1
	ldp A_l, A_h, [srcend, -16]
	stp D_l, D_h, [dstend, -16]
	ldp B_l, B_h, [srcend, -32]
	ldp C_l, C_h, [srcend, -48]
	ldp D_l, D_h, [srcend, -64]!
	sub dstend, dstend, tmp1
	subs count, count, 128
	b.ls L(copy64_from_start)

L(loop64_backwards):
	stp A_l, A_h, [dstend, -16]
	ldp A_l, A_h, [srcend, -16]
	stp B_l, B_h, [dstend, -32]
	ldp B_l, B_h, [srcend, -32]
	stp C_l, C_h, [dstend, -48]
	ldp C_l, C_h, [srcend, -48]
	stp D_l, D_h, [dstend, -64]!
	ldp D_l, D_h, [srcend, -64]!
	subs count, count, 64
	b.hi L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
	ldp G_l, G_h, [src, 48]
	stp A_l, A_h, [dstend, -16]
	ldp A_l, A_h, [src, 32]
	stp B_l, B_h, [dstend, -32]
	ldp B_l, B_h, [src, 16]
	stp C_l, C_h, [dstend, -48]
	ldp C_l, C_h, [src]
	stp D_l, D_h, [dstend, -64]
	stp G_l, G_h, [dstin, 48]
	stp A_l, A_h, [dstin, 32]
	stp B_l, B_h, [dstin, 16]
	stp C_l, C_h, [dstin]
	ret

	.p2align 4
	/* Copy more than 48 KB using ldnp+stnp (non-temporal) instructions. */
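	/* ldnp/stnp hint that the copied data need not stay in the cache. They
	   have no writeback addressing modes, so the pointers are advanced with
	   explicit adds instead of the pre-index forms used above. */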
L(copy_long_nt):
	/* Use backwards copy if there is an overlap. */
	sub tmp1, dstin, src
	cbz tmp1, L(copy0)
	cmp tmp1, count
	b.lo L(copy_long_backwards_nt)

	/* Copy 16 bytes and then align dst to 16-byte alignment. */

	ldnp D_l, D_h, [src]
	and tmp1, dstin, 15
	bic dst, dstin, 15
	sub src, src, tmp1
	add count, count, tmp1	/* Count is now 16 too large. */
	ldnp A_l, A_h, [src, 16]
	stnp D_l, D_h, [dstin]
	ldnp B_l, B_h, [src, 32]
	ldnp C_l, C_h, [src, 48]
	ldnp D_l, D_h, [src, 64]
	add src, src, #64
	subs count, count, 128 + 16	/* Test and readjust count. */
	b.ls L(copy64_from_end_nt)

L(loop64_nt):
	stnp A_l, A_h, [dst, 16]
	ldnp A_l, A_h, [src, 16]
	stnp B_l, B_h, [dst, 32]
	ldnp B_l, B_h, [src, 32]
	stnp C_l, C_h, [dst, 48]
	ldnp C_l, C_h, [src, 48]
	stnp D_l, D_h, [dst, 64]
	add dst, dst, #64
	ldnp D_l, D_h, [src, 64]
	add src, src, #64
	subs count, count, 64
	b.hi L(loop64_nt)

	/* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end_nt):
	ldnp E_l, E_h, [srcend, -64]
	stnp A_l, A_h, [dst, 16]
	ldnp A_l, A_h, [srcend, -48]
	stnp B_l, B_h, [dst, 32]
	ldnp B_l, B_h, [srcend, -32]
	stnp C_l, C_h, [dst, 48]
	ldnp C_l, C_h, [srcend, -16]
	stnp D_l, D_h, [dst, 64]
	stnp E_l, E_h, [dstend, -64]
	stnp A_l, A_h, [dstend, -48]
	stnp B_l, B_h, [dstend, -32]
	stnp C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dst to 16-byte alignment. */
L(copy_long_backwards_nt):
	ldnp D_l, D_h, [srcend, -16]
	and tmp1, dstend, 15
	sub srcend, srcend, tmp1
	sub count, count, tmp1
	ldnp A_l, A_h, [srcend, -16]
	stnp D_l, D_h, [dstend, -16]
	ldnp B_l, B_h, [srcend, -32]
	ldnp C_l, C_h, [srcend, -48]
	ldnp D_l, D_h, [srcend, -64]
	add srcend, srcend, #-64
	sub dstend, dstend, tmp1
	subs count, count, 128
	b.ls L(copy64_from_start_nt)

L(loop64_backwards_nt):
	stnp A_l, A_h, [dstend, -16]
	ldnp A_l, A_h, [srcend, -16]
	stnp B_l, B_h, [dstend, -32]
	ldnp B_l, B_h, [srcend, -32]
	stnp C_l, C_h, [dstend, -48]
	ldnp C_l, C_h, [srcend, -48]
	stnp D_l, D_h, [dstend, -64]
	add dstend, dstend, #-64
	ldnp D_l, D_h, [srcend, -64]
	add srcend, srcend, #-64
	subs count, count, 64
	b.hi L(loop64_backwards_nt)

	/* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start_nt):
	ldnp G_l, G_h, [src, 48]
	stnp A_l, A_h, [dstend, -16]
	ldnp A_l, A_h, [src, 32]
	stnp B_l, B_h, [dstend, -32]
	ldnp B_l, B_h, [src, 16]
	stnp C_l, C_h, [dstend, -48]
	ldnp C_l, C_h, [src]
	stnp D_l, D_h, [dstend, -64]
	stnp G_l, G_h, [dstin, 48]
	stnp A_l, A_h, [dstin, 32]
	stnp B_l, B_h, [dstin, 16]
	stnp C_l, C_h, [dstin]
	ret

END (__memcpy_aarch64_nt)