am 49677dec: Merge "Add optimized version of memcpy for Cortex A9"

* commit '49677deca2c2984ae6c0a3df8fbada7132e82940':
  Add optimized version of memcpy for Cortex A9
diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S
index 8453cc0..0dc86d5 100644
--- a/libc/arch-arm/bionic/memcpy.S
+++ b/libc/arch-arm/bionic/memcpy.S
@@ -37,28 +37,35 @@
 #ifdef HAVE_32_BYTE_CACHE_LINE
 /* a prefetch distance of 2 cache-lines */
 #define CACHE_LINE_SIZE     32
-#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*2)
 #else
 /* a prefetch distance of 4 cache-lines works best experimentally */
 #define CACHE_LINE_SIZE     64
-#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*4)
 #endif
 
 ENTRY(memcpy)
         .save       {r0, lr}
         /* start preloading as early as possible */
-        pld         [r1, #(CACHE_LINE_SIZE*0)]
+        pld         [r1, #(CACHE_LINE_SIZE * 0)]
         stmfd       sp!, {r0, lr}
-        pld         [r1, #(CACHE_LINE_SIZE*1)]
+        pld         [r1, #(CACHE_LINE_SIZE * 1)]
 
+/* If NEON supports unaligned access, the alignment code below is
+ * compiled out, unless a size limit has been specified.
+ */
+#ifndef NEON_UNALIGNED_ACCESS
         /* do we have at least 16-bytes to copy (needed for alignment below) */
         cmp         r2, #16
         blo         5f
 
+        /* Check if src and dst share the same word alignment. If so, run the ARM-only version */
+        eor         r3, r0, r1
+        ands        r3, r3, #0x3
+        beq         11f
+
         /* align destination to cache-line for the write-buffer */
         rsb         r3, r0, #0
         ands        r3, r3, #0xF
-        beq         0f
+        beq         2f
 
         /* copy up to 15-bytes (count in r3) */
         sub         r2, r2, r3
@@ -79,10 +86,9 @@
         vld1.8      {d0}, [r1]!
         vst1.8      {d0}, [r0, :64]!
 2:
-
-0:      /* preload immediately the next cache line, which we may need */
-        pld         [r1, #(CACHE_LINE_SIZE*0)]
-        pld         [r1, #(CACHE_LINE_SIZE*1)]
+        /* preload immediately the next cache line, which we may need */
+        pld         [r1, #(CACHE_LINE_SIZE * 0)]
+        pld         [r1, #(CACHE_LINE_SIZE * 1)]
 
 #ifdef HAVE_32_BYTE_CACHE_LINE
         /* make sure we have at least 32 bytes to copy */
@@ -108,23 +114,22 @@
         subs        r2, r2, #64
         blo         2f
 
-        /* preload all the cache lines we need.
-         * NOTE: the number of pld below depends on PREFETCH_DISTANCE,
-         * ideally would would increase the distance in the main loop to
-         * avoid the goofy code below. In practice this doesn't seem to make
-         * a big difference.
-         */
-        pld         [r1, #(CACHE_LINE_SIZE*2)]
-        pld         [r1, #(CACHE_LINE_SIZE*3)]
-        pld         [r1, #(PREFETCH_DISTANCE)]
+        /* preload all the cache lines we need. */
+        pld         [r1, #(CACHE_LINE_SIZE * 2)]
+        pld         [r1, #(CACHE_LINE_SIZE * 3)]
 
 1:      /* The main loop copies 64 bytes at a time */
-        vld1.8      {d0  - d3},   [r1]!
-        vld1.8      {d4  - d7},   [r1]!
-        pld         [r1, #(PREFETCH_DISTANCE)]
+        vld1.8      {d0 - d3}, [r1]!
+        vld1.8      {d4 - d7}, [r1]!
+#ifdef  HAVE_32_BYTE_CACHE_LINE
+        pld         [r1, #(CACHE_LINE_SIZE * 2)]
+        pld         [r1, #(CACHE_LINE_SIZE * 3)]
+#else
+        pld         [r1, #(CACHE_LINE_SIZE * 3)]
+#endif
         subs        r2, r2, #64
-        vst1.8      {d0  - d3},   [r0, :128]!
-        vst1.8      {d4  - d7},   [r0, :128]!
+        vst1.8      {d0 - d3}, [r0, :128]!
+        vst1.8      {d4 - d7}, [r0, :128]!
         bhs         1b
 
 2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
@@ -133,9 +138,9 @@
         blo         4f
 
 3:      /* 32 bytes at a time. These cache lines were already preloaded */
-        vld1.8      {d0 - d3},  [r1]!
+        vld1.8      {d0 - d3}, [r1]!
         subs        r2, r2, #32
-        vst1.8      {d0 - d3},  [r0, :128]!
+        vst1.8      {d0 - d3}, [r0, :128]!
         bhs         3b
 #endif
 4:      /* less than 32 left */
@@ -145,7 +150,6 @@
         // copies 16 bytes, 128-bits aligned
         vld1.8      {d0, d1}, [r1]!
         vst1.8      {d0, d1}, [r0, :128]!
-
 5:      /* copy up to 15-bytes (count in r2) */
         movs        ip, r2, lsl #29
         bcc         1f
@@ -164,6 +168,164 @@
 
         ldmfd       sp!, {r0, lr}
         bx          lr
+
+#else   /* NEON_UNALIGNED_ACCESS */
+
+        // Check that we have at least 16 bytes to copy, needed for the alignment code.
+        cmp         r2, #16
+        blo         5f
+
+#ifdef NEON_MEMCPY_ALIGNMENT_DIVIDER
+        /* Check the upper size limit for Neon unaligned memory access in memcpy */
+#if NEON_MEMCPY_ALIGNMENT_DIVIDER >= 16
+        cmp         r2, #NEON_MEMCPY_ALIGNMENT_DIVIDER
+        blo         3f
+#endif
+        /* Check if src and dst share the same word alignment. If so, run the ARM-only version */
+        eor         r3, r0, r1
+        ands        r3, r3, #0x3
+        beq         11f
+
+        /* align destination to 16 bytes for the write-buffer */
+        rsb         r3, r0, #0
+        ands        r3, r3, #0xF
+        beq         3f
+
+        /* copy up to 15-bytes (count in r3) */
+        sub         r2, r2, r3
+        movs        ip, r3, lsl #31
+        ldrmib      lr, [r1], #1
+        strmib      lr, [r0], #1
+        ldrcsb      ip, [r1], #1
+        ldrcsb      lr, [r1], #1
+        strcsb      ip, [r0], #1
+        strcsb      lr, [r0], #1
+        movs        ip, r3, lsl #29
+        bge         1f
+        // copies 4 bytes, destination 32-bits aligned
+        vld1.32     {d0[0]}, [r1]!
+        vst1.32     {d0[0]}, [r0, :32]!
+1:      bcc         2f
+        // copies 8 bytes, destination 64-bits aligned
+        vld1.8      {d0}, [r1]!
+        vst1.8      {d0}, [r0, :64]!
+2:
+        /* preload immediately the next cache line, which we may need */
+        pld         [r1, #(CACHE_LINE_SIZE * 0)]
+        pld         [r1, #(CACHE_LINE_SIZE * 1)]
+3:
+#endif
+        /* make sure we have at least 64 bytes to copy */
+        subs        r2, r2, #64
+        blo         2f
+
+        /* preload all the cache lines we need */
+        pld         [r1, #(CACHE_LINE_SIZE * 2)]
+        pld         [r1, #(CACHE_LINE_SIZE * 3)]
+
+1:      /* The main loop copies 64 bytes at a time */
+        vld1.8      {d0 - d3}, [r1]!
+        vld1.8      {d4 - d7}, [r1]!
+#ifdef  HAVE_32_BYTE_CACHE_LINE
+        pld         [r1, #(CACHE_LINE_SIZE * 2)]
+        pld         [r1, #(CACHE_LINE_SIZE * 3)]
+#else
+        pld         [r1, #(CACHE_LINE_SIZE * 3)]
+#endif
+        subs        r2, r2, #64
+        vst1.8      {d0 - d3}, [r0]!
+        vst1.8      {d4 - d7}, [r0]!
+        bhs         1b
+
+2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
+        add         r2, r2, #64
+        subs        r2, r2, #32
+        blo         4f
+
+3:      /* 32 bytes at a time. These cache lines were already preloaded */
+        vld1.8      {d0 - d3}, [r1]!
+        subs        r2, r2, #32
+        vst1.8      {d0 - d3}, [r0]!
+        bhs         3b
+
+4:      /* less than 32 left */
+        add         r2, r2, #32
+        tst         r2, #0x10
+        beq         5f
+        // copies 16 bytes, 128-bits aligned
+        vld1.8      {d0, d1}, [r1]!
+        vst1.8      {d0, d1}, [r0]!
+5:      /* copy up to 15-bytes (count in r2) */
+        movs        ip, r2, lsl #29
+        bcc         1f
+        vld1.8      {d0}, [r1]!
+        vst1.8      {d0}, [r0]!
+1:      bge         2f
+        vld1.32     {d0[0]}, [r1]!
+        vst1.32     {d0[0]}, [r0]!
+2:      movs        ip, r2, lsl #31
+        ldrmib      r3, [r1], #1
+        ldrcsb      ip, [r1], #1
+        ldrcsb      lr, [r1], #1
+        strmib      r3, [r0], #1
+        strcsb      ip, [r0], #1
+        strcsb      lr, [r0], #1
+
+        ldmfd       sp!, {r0, lr}
+        bx          lr
+#endif  /* NEON_UNALIGNED_ACCESS */
+11:
+        /* Simple arm-only copy loop to handle aligned copy operations */
+        stmfd       sp!, {r4, r5, r6, r7, r8}
+        pld         [r1, #(CACHE_LINE_SIZE * 2)]
+
+        /* Check alignment */
+        rsb         r3, r1, #0
+        ands        r3, #3
+        beq         2f
+
+        /* align source to 32 bits. We need to insert 2 instructions between
+         * a ldr[b|h] and str[b|h] because byte and half-word instructions
+         * stall 2 cycles.
+         */
+        movs        r12, r3, lsl #31
+        sub         r2, r2, r3      /* we know that r3 <= r2 because r2 >= 4 */
+        ldrmib      r3, [r1], #1
+        ldrcsb      r4, [r1], #1
+        ldrcsb      r5, [r1], #1
+        strmib      r3, [r0], #1
+        strcsb      r4, [r0], #1
+        strcsb      r5, [r0], #1
+2:
+        subs        r2, #32
+        blt         5f
+        pld         [r1, #(CACHE_LINE_SIZE * 3)]
+3:      /* Main copy loop, copying 32 bytes at a time */
+        pld         [r1, #(CACHE_LINE_SIZE * 4)]
+        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
+        subs        r2, r2, #32
+        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
+        bge         3b
+5:      /* Handle any remaining bytes */
+        adds        r2, #32
+        beq         6f
+
+        movs        r12, r2, lsl #28
+        ldmcsia     r1!, {r3, r4, r5, r6}   /* 16 bytes */
+        ldmmiia     r1!, {r7, r8}           /*  8 bytes */
+        stmcsia     r0!, {r3, r4, r5, r6}
+        stmmiia     r0!, {r7, r8}
+        movs        r12, r2, lsl #30
+        ldrcs       r3, [r1], #4            /*  4 bytes */
+        ldrmih      r4, [r1], #2            /*  2 bytes */
+        strcs       r3, [r0], #4
+        strmih      r4, [r0], #2
+        tst         r2, #0x1
+        ldrneb      r3, [r1]                /*  last byte  */
+        strneb      r3, [r0]
+6:
+        ldmfd       sp!, {r4, r5, r6, r7, r8}
+        ldmfd       sp!, {r0, pc}
 END(memcpy)
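
For readers who want the control flow at a glance, here is a minimal C model of the copy strategy the NEON paths above implement: align the destination, move 64-byte, 32-byte and 16-byte blocks, then finish with a byte tail. This is only an illustrative sketch under that reading of the assembly; it omits the PLD prefetching, the NEON d0-d7 registers and the :128/:64 alignment hints, the NEON_UNALIGNED_ACCESS / NEON_MEMCPY_ALIGNMENT_DIVIDER variants, and the ARM-only LDM/STM path, and the name memcpy_model is made up for illustration.

#include <stddef.h>
#include <stdint.h>

static void *memcpy_model(void *dst, const void *src, size_t n) {
    uint8_t *d = (uint8_t *)dst;
    const uint8_t *s = (const uint8_t *)src;

    if (n >= 16) {
        /* Prologue: copy up to 15 bytes so the destination becomes
         * 16-byte aligned (the ldrmib/strmib + vld1/vst1 head above). */
        size_t head = (size_t)(-(uintptr_t)d & 0xF);
        n -= head;
        while (head--) *d++ = *s++;

        /* Main loop: 64 bytes per iteration (vld1.8/vst1.8 of d0-d7). */
        while (n >= 64) {
            for (int i = 0; i < 64; i++) d[i] = s[i];
            d += 64; s += 64; n -= 64;
        }
        /* Then 32 bytes per iteration. */
        while (n >= 32) {
            for (int i = 0; i < 32; i++) d[i] = s[i];
            d += 32; s += 32; n -= 32;
        }
        /* One 16-byte block, if any. */
        if (n & 0x10) {
            for (int i = 0; i < 16; i++) d[i] = s[i];
            d += 16; s += 16; n -= 16;
        }
    }
    /* Tail: the remaining 0-15 bytes ("copy up to 15-bytes" epilogue). */
    while (n--) *d++ = *s++;
    return dst;
}

In the real routine the 64- and 32-byte blocks are moved with vld1.8/vst1.8 through d0-d7 while the next cache lines are prefetched, and buffers that share the same word alignment are diverted to the LDM/STM loop at label 11 instead.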