/*
 * Copyright (c) 2013 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
     (!(defined (__ARM_ARCH_7A__))))

        /* Do nothing here. See memcpy-stub.c in the same directory. */

#else
        /* Prototype: void *memcpy (void *dst, const void *src, size_t count). */

        /* Use the version of memcpy implemented using LDRD and STRD.
           This version is tuned for Cortex-A15.
           It might not be the best for other ARMv7-A CPUs, but there is no
           predefine to distinguish between different CPUs of the same
           architecture, and this version is better than the plain memcpy
           provided in newlib.

           Therefore, we use this version for all ARMv7-A CPUs. */

        /* To make the same code compile for both ARM and Thumb instruction
           sets, switch to unified syntax at the beginning of this function.
           However, by using the same code, we may be missing optimization
           opportunities. For instance, in LDRD/STRD instructions, the first
           destination register must be even and the second consecutive in
           ARM state, but not in Thumb state. */

        .syntax unified

#if defined (__thumb__)
        .thumb
        .thumb_func
#endif

        .global memcpy
        .type memcpy, %function
memcpy:

        /* Assumes that n >= 0 and that dst, src are valid pointers.
           If there are at least 8 bytes to copy, use LDRD/STRD.
           If src and dst are misaligned with different offsets,
           first copy byte by byte until dst is word-aligned,
           and then copy using LDRD/STRD and shift if needed.
           When fewer than 8 bytes are left, copy a word and then
           byte by byte. */
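
        /* Informal sketch of the flow below (labels refer to this file):
             if (n < 4)          -> copy_less_than_4;
             if (dst unaligned)  -> dst_not_word_aligned;
             if (src unaligned)  -> src_not_word_aligned;
             else                -> word_aligned: 64-byte LDRD/STRD loop,
                                    then progressively smaller tails. */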

        /* Save registers (r0 holds the return value):
           an optimized version of push {r0, r4, r5, lr}.
           To try to improve performance, the stack layout is changed,
           i.e., the stack does not look the way users expect
           (highest-numbered register at the highest address). */
        push {r0, lr}
        strd r4, r5, [sp, #-8]!
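        /* Resulting stack layout (ascending addresses from sp):
           [sp] = r4, [sp, #4] = r5, [sp, #8] = r0, [sp, #12] = lr. */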

        /* TODO: Add debug frame directives.
           We don't need exception unwind directives, because the code below
           does not throw any exceptions and does not call any other functions.
           Generally, newlib functions like this lack debug information for
           assembler source. */

        /* Get copying of tiny blocks out of the way first. */
        /* Are there at least 4 bytes to copy? */
        subs r2, r2, #4
        blt copy_less_than_4      /* If n < 4. */

        /* Check word alignment. */
        ands ip, r0, #3           /* ip = last 2 bits of dst. */
        bne dst_not_word_aligned  /* If dst is not word-aligned. */

        /* Get here if dst is word-aligned. */
        ands ip, r1, #3           /* ip = last 2 bits of src. */
        bne src_not_word_aligned  /* If src is not word-aligned. */
word_aligned:
        /* Get here if both source and dst are word-aligned.
           The number of bytes remaining to copy is r2 + 4. */

        /* Are there at least 64 bytes to copy? */
        subs r2, r2, #60
        blt copy_less_than_64     /* If r2 + 4 < 64. */

        /* First, align the destination buffer to 8 bytes,
           to make sure double loads and stores do not cross a cache line
           boundary, as they are then more expensive even if the data is in
           the cache (they require two load/store issue cycles instead of one).
           If only one of the buffers is not 8-byte aligned,
           then it is more important to align dst than src,
           because stores that cross a cache line boundary are penalized
           more than loads.
           This check and realignment are only worth doing
           if there is a lot to copy. */

        /* Get here if dst is word-aligned,
           i.e., the 2 least significant bits are 0.
           If dst is not two-word (8-byte) aligned, i.e., bit 2 of dst is set,
           then copy 1 word (4 bytes) to make it so. */
        ands r3, r0, #4
        beq 11f                   /* If dst is already two-word aligned. */
        ldr r3, [r1], #4
        str r3, [r0], #4
        subs r2, r2, #4
        blt copy_less_than_64
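
        /* Example: if dst == 0x1004 (word-aligned but not 8-byte aligned),
           bit 2 is set, so one word is copied and dst advances to 0x1008,
           which is 8-byte aligned. */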

11:
        /* TODO: Align to cacheline (useful for PLD optimization). */

        /* Every loop iteration copies 64 bytes. */
1:
        .irp offset, #0, #8, #16, #24, #32, #40, #48, #56
        ldrd r4, r5, [r1, \offset]
        strd r4, r5, [r0, \offset]
        .endr
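
        /* The .irp above expands to eight LDRD/STRD pairs that copy the
           bytes at offsets 0, 8, ..., 56; the pointers are only advanced
           once, below. */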

        add r0, r0, #64
        add r1, r1, #64
        subs r2, r2, #64
        bge 1b                    /* If there is more to copy. */

copy_less_than_64:

        /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
           Restore the count if there are more than 7 bytes to copy. */
        adds r2, r2, #56
        blt copy_less_than_8

        /* Copy 8 bytes at a time. */
2:
        ldrd r4, r5, [r1], #8
        strd r4, r5, [r0], #8
        subs r2, r2, #8
        bge 2b                    /* If there is more to copy. */

copy_less_than_8:

        /* Get here if less than 8 bytes to copy, -8 <= r2 < 0.
           Check if there is more to copy. */
        cmn r2, #8
        beq return                /* If r2 + 8 == 0. */

        /* Restore the count if there are more than 3 bytes to copy. */
        adds r2, r2, #4
        blt copy_less_than_4

        /* Copy 4 bytes. */
        ldr r3, [r1], #4
        str r3, [r0], #4

copy_less_than_4:
        /* Get here if less than 4 bytes to copy, -4 <= r2 < 0. */

        /* Restore the count, check if there is more to copy. */
        adds r2, r2, #4
        beq return                /* If r2 == 0. */

        /* Get here with r2 in {1,2,3} = {01,10,11}. */
        /* Logical shift left r2, insert 0s, update flags. */
        lsls r2, r2, #31

        /* Copy byte by byte.
           Condition ne means the last bit of r2 is set,
           i.e., r2 is 1 or 3: copy one byte.
           Condition cs means the second-to-last bit of r2 is set,
           i.e., r2 is 2 or 3: copy two more bytes. */
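
        /* After lsls r2, r2, #31:
             r2 == 1: Z=0 (ne), C=0        -> copy 1 byte;
             r2 == 2: Z=1,      C=1 (cs)   -> copy 2 bytes;
             r2 == 3: Z=0 (ne), C=1 (cs)   -> copy 3 bytes. */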
        itt ne
        ldrbne r3, [r1], #1
        strbne r3, [r0], #1

        itttt cs
        ldrbcs r4, [r1], #1
        ldrbcs r5, [r1]
        strbcs r4, [r0], #1
        strbcs r5, [r0]

return:
        /* Restore registers: optimized pop {r0, r4, r5, pc} */
        ldrd r4, r5, [sp], #8
        pop {r0, pc}              /* This is the only return point of memcpy. */

#ifndef __ARM_FEATURE_UNALIGNED

        /* The following assembly macro implements misaligned copy in software.
           Assumes that dst is word-aligned, src is at offset "pull" bits from
           a word boundary, push = 32 - pull, and the number of bytes that
           remain to copy is r2 + 4, r2 >= 0. */

        /* In the code below, r2 is the number of bytes that remain to be
           written. The number of bytes read is always larger, because we
           have partial words in the shift queue. */
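
        /* Worked example (little-endian, pull=8, i.e., src offset 1):
           each output word is formed as (r5 >> 8) | (next_word << 24),
           where r5 holds the previously loaded word, so whole-word loads
           and stores are used even though src is unaligned. */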

        .macro miscopy pull push shiftleft shiftright

        /* Align src to the previous word boundary. */
        bic r1, r1, #3

        /* Initialize the shift queue. */
        ldr r5, [r1], #4          /* Load a word from source. */

        subs r2, r2, #4
        blt 6f                    /* Go to misaligned copy of less than 8 bytes. */

        /* Get here if there are at least 8 bytes to copy.
           The number of bytes to copy is r2 + 8, r2 >= 0. */

        /* Save registers: push { r6, r7 }.
           We need additional registers for LDRD and STRD, because in ARM state
           the first destination register must be even and the second
           consecutive. */
        strd r6, r7, [sp, #-8]!

        subs r2, r2, #56
        blt 4f                    /* Go to misaligned copy of less than 64 bytes. */

3:
        /* Get here if there are at least 64 bytes to copy.
           The number of bytes to copy is r2 + 64, r2 >= 0. */

        /* Copy 64 bytes in every iteration.
           Use a partial word from the shift queue. */
        .irp offset, #0, #8, #16, #24, #32, #40, #48, #56
        mov r6, r5, \shiftleft #\pull
        ldrd r4, r5, [r1, \offset]
        orr r6, r6, r4, \shiftright #\push
        mov r7, r4, \shiftleft #\pull
        orr r7, r7, r5, \shiftright #\push
        strd r6, r7, [r0, \offset]
        .endr
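
        /* In each unrolled step above, r6 and r7 are built by combining the
           carry word left in r5 from the previous step with the freshly
           loaded pair r4/r5, so every store writes two fully aligned words. */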

        add r1, r1, #64
        add r0, r0, #64
        subs r2, r2, #64
        bge 3b

4:
        /* Get here if there are fewer than 64 bytes to copy (-64 <= r2 < 0)
           and they are misaligned. */

        /* Restore the count if there are more than 7 bytes to copy. */
        adds r2, r2, #56

        /* If less than 8 bytes to copy,
           restore registers saved for this loop: optimized poplt { r6, r7 }. */
        itt lt
        ldrdlt r6, r7, [sp], #8
        blt 6f                    /* Go to misaligned copy of less than 8 bytes. */

5:
        /* Copy 8 bytes at a time.
           Use a partial word from the shift queue. */
        mov r6, r5, \shiftleft #\pull
        ldrd r4, r5, [r1], #8
        orr r6, r6, r4, \shiftright #\push
        mov r7, r4, \shiftleft #\pull
        orr r7, r7, r5, \shiftright #\push
        strd r6, r7, [r0], #8

        subs r2, r2, #8
        bge 5b                    /* If there is more to copy. */

        /* Restore registers saved for this loop: optimized pop { r6, r7 }. */
        ldrd r6, r7, [sp], #8

6:
        /* Get here if there are fewer than 8 bytes to copy (-8 <= r2 < 0)
           and they are misaligned. */

        /* Check if there is more to copy. */
        cmn r2, #8
        beq return

        /* Check if there are fewer than 4 bytes to copy. */
        cmn r2, #4

        itt lt
        /* Restore src offset from word-align. */
        sublt r1, r1, #(\push / 8)
        blt copy_less_than_4

        /* Use a partial word from the shift queue. */
        mov r3, r5, \shiftleft #\pull
        /* Load a word from src, but without writeback
           (this word is not fully written to dst). */
        ldr r5, [r1]

        /* Restore src offset from word-align. */
        add r1, r1, #(\pull / 8)

        /* Shift bytes to create one dst word and store it. */
        orr r3, r3, r5, \shiftright #\push
        str r3, [r0], #4

        /* Use single byte copying of the remaining bytes. */
        b copy_less_than_4

        .endm

#endif /* not __ARM_FEATURE_UNALIGNED */

dst_not_word_aligned:

        /* Get here when dst is not word-aligned and ip holds the last 2 bits
           of dst, i.e., ip is the offset of dst from a word boundary.
           The number of bytes that remain to copy is r2 + 4,
           i.e., there are at least 4 bytes to copy.
           Write a partial word (1 to 3 bytes), such that dst becomes
           word-aligned. */

        /* If dst is at ip bytes offset from a word boundary (with 0 < ip < 4),
           then there are (4 - ip) bytes to fill up to align dst to the next
           word. */
        rsb ip, ip, #4            /* ip = 4 - ip. */
        cmp ip, #2

        /* Copy byte by byte with conditionals. */
        itt gt
        ldrbgt r3, [r1], #1
        strbgt r3, [r0], #1

        itt ge
        ldrbge r4, [r1], #1
        strbge r4, [r0], #1

        ldrb lr, [r1], #1
        strb lr, [r0], #1
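
        /* In total: ip == 1 copies 1 byte (unconditional pair only),
           ip == 2 copies 2 bytes (ge plus unconditional),
           ip == 3 copies 3 bytes (gt, ge, and unconditional). */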

        /* Update the count.
           ip holds the number of bytes we have just copied. */
        subs r2, r2, ip           /* r2 = r2 - ip. */
        blt copy_less_than_4      /* If r2 < ip. */

        /* Get here if there are at least 4 bytes to copy.
           Check if src is aligned. If beforehand src and dst were not word-
           aligned but congruent (same offset), then now they are both
           word-aligned, and we can copy the rest efficiently (without
           shifting). */
        ands ip, r1, #3           /* ip = last 2 bits of src. */
        beq word_aligned          /* If r1 is word-aligned. */

src_not_word_aligned:
        /* Get here when src is not word-aligned, but dst is word-aligned.
           The number of bytes that remain to copy is r2 + 4. */

#ifdef __ARM_FEATURE_UNALIGNED
        /* Copy word by word using LDR when unaligned accesses are handled in
           hardware, i.e., SCTLR.A is clear (strict alignment checking
           disabled), so LDR and STR support unaligned access. */
        subs r2, r2, #60
        blt 8f

7:
        /* Copy 64 bytes in every loop iteration. */
        .irp offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60
        ldr r3, [r1, \offset]
        str r3, [r0, \offset]
        .endr
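
        /* The .irp above expands to sixteen LDR/STR pairs covering offsets
           0 through 60; the unaligned loads are permitted here because
           __ARM_FEATURE_UNALIGNED is defined. */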

        add r0, r0, #64
        add r1, r1, #64
        subs r2, r2, #64
        bge 7b

8:
        /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
           Check if there are more than 3 bytes to copy. */
        adds r2, r2, #60
        blt copy_less_than_4

9:
        /* Get here if there are fewer than 64 but at least 4 bytes to copy,
           where the number of bytes to copy is r2 + 4. */
        ldr r3, [r1], #4
        str r3, [r0], #4
        subs r2, r2, #4
        bge 9b

        b copy_less_than_4

#else /* not __ARM_FEATURE_UNALIGNED */

        /* ip holds the last 2 bits of src,
           i.e., ip is the offset of src from a word boundary, and ip > 0.
           Compute the shifts needed to copy from src to dst. */
        cmp ip, #2
        beq miscopy_16_16         /* If ip == 2. */
        bge miscopy_24_8          /* If ip == 3. */
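
        /* Shift amounts selected above: ip == 1 -> pull=8,  push=24;
           ip == 2 -> pull=16, push=16; ip == 3 -> pull=24, push=8
           (the offset of src within a word, in bits). */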

        /* Get here if ip == 1. */

        /* Endian independent macros for shifting bytes within registers. */

#ifndef __ARMEB__
miscopy_8_24:  miscopy pull=8  push=24 shiftleft=lsr shiftright=lsl
miscopy_16_16: miscopy pull=16 push=16 shiftleft=lsr shiftright=lsl
miscopy_24_8:  miscopy pull=24 push=8  shiftleft=lsr shiftright=lsl
#else  /* not __ARMEB__ */
miscopy_8_24:  miscopy pull=8  push=24 shiftleft=lsl shiftright=lsr
miscopy_16_16: miscopy pull=16 push=16 shiftleft=lsl shiftright=lsr
miscopy_24_8:  miscopy pull=24 push=8  shiftleft=lsl shiftright=lsr
#endif /* not __ARMEB__ */

#endif /* not __ARM_FEATURE_UNALIGNED */

#endif /* memcpy */