/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Syntax: GNU as for 32-bit ARM (pre-UAL mnemonics such as ldrmib),
 * run through the C preprocessor.
 */
#include <machine/cpu-features.h>
#include <machine/asm.h>

#if defined(__ARM_NEON__)

        .text
        .fpu    neon

/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE     64
#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*4)

/*
 * void *memcpy(void *dst, const void *src, size_t n)  -- NEON variant
 *
 * In:      r0 = dst, r1 = src, r2 = n (handled as an unsigned count)
 * Out:     r0 = dst, reloaded from the stack on exit
 * Scratch: r3, ip, lr (lr is saved on entry), d0-d7
 */
ENTRY(memcpy)
        .save       {r0, lr}
        stmfd       sp!, {r0, lr}       /* keep dst for the return value; frees lr as scratch */

        /* start preloading as early as possible */
        pld         [r1, #(CACHE_LINE_SIZE*0)]
        pld         [r1, #(CACHE_LINE_SIZE*1)]

        /* do we have at least 16-bytes to copy (needed for alignment below) */
        cmp         r2, #16
        blo         5f

        /* align destination to 16 bytes (the bulk stores below use :128
         * alignment hints); r3 = number of bytes needed to get there
         */
        rsb         r3, r0, #0
        ands        r3, r3, #0xF
        beq         0f

        /* copy up to 15-bytes (count in r3) */
        sub         r2, r2, r3
        movs        ip, r3, lsl #31     /* N = bit 0 of r3, C = bit 1 */
        ldrmib      lr, [r1], #1
        strmib      lr, [r0], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1
        movs        ip, r3, lsl #29     /* N = bit 2 of r3, C = bit 3 */
        /* Test N only: movs with a shifted operand does not write V, so a
         * signed condition (ge) would depend on the V flag left behind by
         * the "cmp r2, #16" above.
         */
        bpl         1f
        // copies 4 bytes, destination 32-bits aligned
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc         2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
2:

0:      /* preload immediately the next cache line, which we may need */
        pld         [r1, #(CACHE_LINE_SIZE*0)]
        pld         [r1, #(CACHE_LINE_SIZE*1)]

        /* make sure we have at least 64 bytes to copy */
        subs        r2, r2, #64
        blo         2f

        /* preload all the cache lines we need.
         * NOTE: the number of pld below depends on PREFETCH_DISTANCE,
         * ideally we would increase the distance in the main loop to
         * avoid the goofy code below. In practice this doesn't seem to make
         * a big difference.
         */
        pld         [r1, #(CACHE_LINE_SIZE*2)]
        pld         [r1, #(CACHE_LINE_SIZE*3)]
        pld         [r1, #(PREFETCH_DISTANCE)]

1:      /* The main loop copies 64 bytes at a time.
         * r2 is kept biased by -64 here, so "hs" after the subs means
         * at least another 64 bytes remain.
         */
        vld1.8      {d0  - d3},   [r1]!
        vld1.8      {d4  - d7},   [r1]!
        pld         [r1, #(PREFETCH_DISTANCE)]
        subs        r2, r2, #64
        vst1.8      {d0  - d3},   [r0, :128]!
        vst1.8      {d4  - d7},   [r0, :128]!
        bhs         1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add         r2, r2, #64
        subs        r2, r2, #32
        blo         4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8      {d0 - d3},  [r1]!
        subs        r2, r2, #32
        vst1.8      {d0 - d3},  [r0, :128]!
        bhs         3b

4:      /* less than 32 left */
        add         r2, r2, #32
        tst         r2, #0x10
        beq         5f
        // copies 16 bytes, 128-bits aligned
        vld1.8      {d0, d1}, [r1]!
        vst1.8      {d0, d1}, [r0, :128]!

5:      /* copy up to 15-bytes (count in r2); no alignment hints here
         * because this is also the entry point for copies shorter than
         * 16 bytes, whose destination was never aligned
         */
        movs        ip, r2, lsl #29     /* N = bit 2 of r2, C = bit 3 */
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:      bpl         2f                  /* N only; see the note in the head-alignment code */
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2:      movs        ip, r2, lsl #31     /* N = bit 0 of r2, C = bit 1 */
        ldrmib      r3, [r1], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strmib      r3, [r0], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1

        /* restore the original dst into r0 and return */
        ldmfd       sp!, {r0, lr}
        bx          lr
END(memcpy)


#else   /* !defined(__ARM_NEON__) */


/*
 * Optimized memcpy() for ARM.
 *
 * note that memcpy() always returns the destination pointer,
 * so we have to preserve R0.
 *
 * In:      r0 = dst, r1 = src, r2 = byte count
 * Out:     r0 = dst (reloaded from the stack on exit)
 * Scratch: r3, r12. r4 and lr are pushed on entry; r5-r11 are spilled
 *          into the reserved 28-byte area only on the paths that use
 *          them, and reloaded before returning.
 */

ENTRY(memcpy)
        /* The stack must always be 64-bits aligned to be compliant with the
         * ARM ABI. Since we have to save R0, we might as well save R4
         * which we can use for better pipelining of the reads below
         * (12 bytes pushed + 28 bytes reserved = 40 bytes).
         */
        .save       {r0, r4, lr}
        stmfd       sp!, {r0, r4, lr}
        /* Making room for r5-r11 which will be spilled later */
        .pad        #28
        sub         sp, sp, #28

        // preload the destination because we'll align it to a cache line
        // with small writes. Also start the source "pump".
        PLD         (r0, #0)
        PLD         (r1, #0)
        PLD         (r1, #32)

        /* it simplifies things to take care of len<4 early */
        cmp         r2, #4
        blo         copy_last_3_and_return

        /* compute the offset to align the source
         * offset = (4-(src&3))&3 = -src & 3
         */
        rsb         r3, r1, #0
        ands        r3, r3, #3
        beq         src_aligned

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs        r12, r3, lsl #31        /* N = bit 0 of r3, C = bit 1 */
        sub         r2, r2, r3              /* we know that r3 <= r2 because r2 >= 4 */
        ldrmib      r3, [r1], #1
        ldrcsb      r4, [r1], #1
        ldrcsb      r12,[r1], #1
        strmib      r3, [r0], #1
        strcsb      r4, [r0], #1
        strcsb      r12,[r0], #1

src_aligned:

        /* see if src and dst are aligned together (congruent) */
        eor         r12, r0, r1
        tst         r12, #3
        bne         non_congruent

        /* Use post-increment mode for stm to spill r5-r11 to reserved stack
         * frame. Don't update sp.
         */
        stmea       sp, {r5-r11}

        /* align the destination to a cache-line; if that would need more
         * bytes than we have, only copy the whole words that are left
         */
        rsb         r3, r0, #0
        ands        r3, r3, #0x1C
        beq         congruent_aligned32
        cmp         r3, r2
        andhi       r3, r2, #0x1C

        /* conditionally copies 0 to 7 words (length in r3) */
        movs        r12, r3, lsl #28        /* N = bit 3 of r3, C = bit 4 */
        ldmcsia     r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia     r1!, {r8, r9}           /*  8 bytes */
        stmcsia     r0!, {r4, r5, r6, r7}
        stmmiia     r0!, {r8, r9}
        tst         r3, #0x4
        ldrne       r10,[r1], #4            /*  4 bytes */
        strne       r10,[r0], #4
        sub         r2, r2, r3

congruent_aligned32:
        /*
         * here the destination is aligned to 32 bytes, unless the copy was
         * too short to get there (in which case fewer than 4 bytes remain).
         */

cached_aligned32:
        subs        r2, r2, #32
        blo         less_than_32_left

        /*
         * We preload a cache-line up to 64 bytes ahead. On the 926, this will
         * stall only until the requested word is fetched, but the linefill
         * continues in the background.
         * While the linefill is going, we write our previous cache-line
         * into the write-buffer (which should have some free space).
         * When the linefill is done, the writebuffer will
         * start dumping its content into memory
         *
         * While all this is going, we then load a full cache line into
         * 8 registers, this cache line should be in the cache by now
         * (or partly in the cache).
         *
         * This code should work well regardless of the source/dest alignment.
         *
         */

        // Align the preload register to a cache-line because the cpu does
        // "critical word first" (the first word requested is loaded first).
        bic         r12, r1, #0x1F
        add         r12, r12, #64

1:      ldmia       r1!, { r4-r11 }
        PLD         (r12, #64)
        subs        r2, r2, #32

        // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
        // for ARM9 preload will not be safely guarded by the preceding subs.
        // When it is safely guarded the only possibility to have SIGSEGV here
        // is because the caller overstates the length.
        ldrhi       r3, [r12], #32      /* cheap ARM9 preload */
        stmia       r0!, { r4-r11 }
        bhs         1b

        add         r2, r2, #32




less_than_32_left:
        /*
         * less than 32 bytes left at this point (length in r2)
         */

        /* skip all this if there is nothing to do, which should
         * be a common case (if not executed the code below takes
         * about 16 cycles)
         */
        tst         r2, #0x1F
        beq         1f

        /* conditionally copies 0 to 31 bytes */
        movs        r12, r2, lsl #28        /* N = bit 3 of r2, C = bit 4 */
        ldmcsia     r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia     r1!, {r8, r9}           /*  8 bytes */
        stmcsia     r0!, {r4, r5, r6, r7}
        stmmiia     r0!, {r8, r9}
        movs        r12, r2, lsl #30        /* N = bit 1 of r2, C = bit 2 */
        ldrcs       r3, [r1], #4            /*  4 bytes */
        ldrmih      r4, [r1], #2            /*  2 bytes */
        strcs       r3, [r0], #4
        strmih      r4, [r0], #2
        tst         r2, #0x1
        ldrneb      r3, [r1]                /*  last byte  */
        strneb      r3, [r0]

        /* we're done! restore everything and return */
1:      ldmfd       sp!, {r5-r11}
        ldmfd       sp!, {r0, r4, lr}
        bx          lr

        /********************************************************************/

non_congruent:
        /*
         * here source is aligned to 4 bytes
         * but destination is not.
         *
         * in the code below r2 is the number of bytes read
         * (the number of bytes written is always smaller, because we have
         * partial words in the shift queue)
         */
        cmp         r2, #4
        blo         copy_last_3_and_return

        /* Use post-increment mode for stm to spill r5-r11 to reserved stack
         * frame. Don't update sp.
         */
        stmea       sp, {r5-r11}

        /* compute shifts needed to align src to dest */
        rsb         r5, r0, #0
        and         r5, r5, #3          /* r5 = # bytes in partial words */
        mov         r12, r5, lsl #3     /* r12 = right */
        rsb         lr, r12, #32        /* lr = left  */

        /* read the first word */
        ldr         r3, [r1], #4
        sub         r2, r2, #4

        /* write a partial word (0 to 3 bytes), such that destination
         * becomes aligned to 32 bits (r5 = nb of bytes to write for
         * alignment); the bytes not yet written stay queued in r3
         */
        movs        r5, r5, lsl #31     /* N = bit 0 of r5, C = bit 1 */
        strmib      r3, [r0], #1
        movmi       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8

        cmp         r2, #4
        blo         partial_word_tail

        /* Align destination to 32 bytes (cache line boundary) */
1:      tst         r0, #0x1c
        beq         2f
        ldr         r5, [r1], #4
        sub         r2, r2, #4
        orr         r4, r3, r5,     lsl lr
        mov         r3, r5,         lsr r12
        str         r4, [r0], #4
        cmp         r2, #4
        bhs         1b
        blo         partial_word_tail

        /* copy 32 bytes at a time */
2:      subs        r2, r2, #32
        blo         less_than_thirtytwo

        /* Use immediate mode for the shifts, because there is an extra cycle
         * for register shifts, which could account for up to 50% of
         * performance hit.
         */

        cmp         r12, #24
        beq         loop24
        cmp         r12, #8
        beq         loop8

        /* In the three loops below r12 is reused to carry the first word of
         * the next 32-byte group; it is recomputed from lr afterwards.
         */
loop16:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, {   r5,r6,r7,  r8,r9,r10,r11}
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4,     lsl #16
        mov         r4, r4,         lsr #16
        orr         r4, r4, r5,     lsl #16
        mov         r5, r5,         lsr #16
        orr         r5, r5, r6,     lsl #16
        mov         r6, r6,         lsr #16
        orr         r6, r6, r7,     lsl #16
        mov         r7, r7,         lsr #16
        orr         r7, r7, r8,     lsl #16
        mov         r8, r8,         lsr #16
        orr         r8, r8, r9,     lsl #16
        mov         r9, r9,         lsr #16
        orr         r9, r9, r10,    lsl #16
        mov         r10, r10,       lsr #16
        orr         r10, r10, r11,  lsl #16
        stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov         r3, r11,        lsr #16
        bhs         1b
        b           less_than_thirtytwo

loop8:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, {   r5,r6,r7,  r8,r9,r10,r11}
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4,     lsl #24
        mov         r4, r4,         lsr #8
        orr         r4, r4, r5,     lsl #24
        mov         r5, r5,         lsr #8
        orr         r5, r5, r6,     lsl #24
        mov         r6, r6,         lsr #8
        orr         r6, r6, r7,     lsl #24
        mov         r7, r7,         lsr #8
        orr         r7, r7, r8,     lsl #24
        mov         r8, r8,         lsr #8
        orr         r8, r8, r9,     lsl #24
        mov         r9, r9,         lsr #8
        orr         r9, r9, r10,    lsl #24
        mov         r10, r10,       lsr #8
        orr         r10, r10, r11,  lsl #24
        stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov         r3, r11,        lsr #8
        bhs         1b
        b           less_than_thirtytwo

loop24:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, {   r5,r6,r7,  r8,r9,r10,r11}
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4,     lsl #8
        mov         r4, r4,         lsr #24
        orr         r4, r4, r5,     lsl #8
        mov         r5, r5,         lsr #24
        orr         r5, r5, r6,     lsl #8
        mov         r6, r6,         lsr #24
        orr         r6, r6, r7,     lsl #8
        mov         r7, r7,         lsr #24
        orr         r7, r7, r8,     lsl #8
        mov         r8, r8,         lsr #24
        orr         r8, r8, r9,     lsl #8
        mov         r9, r9,         lsr #24
        orr         r9, r9, r10,    lsl #8
        mov         r10, r10,       lsr #24
        orr         r10, r10, r11,  lsl #8
        stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov         r3, r11,        lsr #24
        bhs         1b


less_than_thirtytwo:
        /* copy the last 0 to 31 bytes of the source */
        rsb         r12, lr, #32        /* we corrupted r12, recompute it  */
        add         r2, r2, #32
        cmp         r2, #4
        blo         partial_word_tail

1:      ldr         r5, [r1], #4
        sub         r2, r2, #4
        orr         r4, r3, r5,     lsl lr
        mov         r3, r5,         lsr r12
        str         r4, [r0], #4
        cmp         r2, #4
        bhs         1b

partial_word_tail:
        /* we have a partial word in the input buffer; lr holds the number
         * of queued bits, so after the shift N = bit 3 of lr, C = bit 4
         */
        movs        r5, lr, lsl #(31-3)
        strmib      r3, [r0], #1
        movmi       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8
        strcsb      r3, [r0], #1

        /* Refill spilled registers from the stack. Don't update sp. */
        ldmfd       sp, {r5-r11}

copy_last_3_and_return:
        movs        r2, r2, lsl #31     /* copy remaining 0, 1, 2 or 3 bytes */
        ldrmib      r2, [r1], #1
        ldrcsb      r3, [r1], #1
        ldrcsb      r12,[r1]
        strmib      r2, [r0], #1
        strcsb      r3, [r0], #1
        strcsb      r12,[r0]

        /* we're done! restore sp and spilled registers and return */
        add         sp,  sp, #28
        ldmfd       sp!, {r0, r4, lr}
        bx          lr
END(memcpy)


#endif    /* defined(__ARM_NEON__) */