Add stack unwinding directives to memcpy.

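Also include some Android-specific header files and replace the hand-written
.global/.type/label sequence with ENTRY(memcpy)/END(memcpy). The
__OPTIMIZE_SIZE__/PREFER_SIZE_OVER_SPEED/__ARM_ARCH_7A__ preprocessor guard
around the implementation is removed.

r6/r7 are now saved once in the prologue and restored at the single return
point, instead of being pushed and popped around the misaligned-copy loop.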

Change-Id: Idbcbd43458ba945ca8c61bfbc04ea15fc0ae4e00
diff --git a/libc/arch-arm/bionic/memcpy.a15.S b/libc/arch-arm/bionic/memcpy.a15.S
index d1bfb7c..516e20c 100644
--- a/libc/arch-arm/bionic/memcpy.a15.S
+++ b/libc/arch-arm/bionic/memcpy.a15.S
@@ -26,12 +26,6 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
-     (!(defined (__ARM_ARCH_7A__))))
-
-        /* Do nothing here. See memcpy-stub.c in the same directory. */
-
-#else
     /* Prototype: void *memcpy (void *dst, const void *src, size_t count).  */
 
         /* Use the version of memcpy implemented using LDRD and STRD.
@@ -50,16 +44,12 @@
        destination register must be even and the second consecutive in
        ARM state, but not in Thumb state.  */
 
+#include <machine/cpu-features.h>
+#include <machine/asm.h>
+
         .syntax         unified
 
-#if defined (__thumb__)
-        .thumb
-        .thumb_func
-#endif
-
-        .global memcpy
-        .type   memcpy, %function
-memcpy:
+ENTRY(memcpy)
 
        /* Assumes that n >= 0, and dst, src are valid pointers.
           If there is at least 8 bytes to copy, use LDRD/STRD.
@@ -69,12 +59,16 @@
           When less than 8 left, copy a word and then byte by byte.  */
 
        /* Save registers (r0 holds the return value):
-          optimized push {r0, r4, r5, lr}.
+          optimized push {r0, r4, r5, r6, r7, lr}.
           To try and improve performance, stack layout changed,
           i.e., not keeping the stack looking like users expect
           (highest numbered register at highest address).  */
-        push {r0, lr}
-        strd r4, r5, [sp, #-8]!
+        .save   {r0, lr}
+        push    {r0, lr}
+        .save   {r4, r5}
+        strd    r4, r5, [sp, #-8]!
+        .save   {r6, r7}
+        strd    r6, r7, [sp, #-8]!
 
        /* TODO: Add debug frame directives.
           We don't need exception unwind directives, because the code below
@@ -194,9 +188,11 @@
         strbcs  r5, [r0]
 
 return:
-        /* Restore registers: optimized pop {r0, r4, r5, pc}   */
+        /* Restore registers: optimized pop {r0, r4, r5, r6, r7, pc}   */
+        /* This is the only return point of memcpy.  */
+        ldrd r6, r7, [sp], #8
         ldrd r4, r5, [sp], #8
-        pop {r0, pc}           /* This is the only return point of memcpy.  */
+        pop {r0, pc}
 
 #ifndef __ARM_FEATURE_UNALIGNED
 
@@ -223,12 +219,6 @@
        /* Get here if there is more than 8 bytes to copy.
           The number of bytes to copy is r2+8, r2 >= 0.  */
 
-       /* Save registers: push { r6, r7 }.
-          We need additional registers for LDRD and STRD, because in ARM state
-          the first destination register must be even and the second
-      consecutive.  */
-       strd     r6, r7, [sp, #-8]!
-
        subs     r2, r2, #56
        blt      4f         /* Go to misaligned copy of less than 64 bytes.  */
 
@@ -259,10 +249,6 @@
        /* Restore the count if there is more than 7 bytes to copy.  */
         adds    r2, r2, #56
 
-       /* If less than 8 bytes to copy,
-          restore registers saved for this loop: optimized poplt { r6, r7 }. */
-        itt     lt
-        ldrdlt  r6, r7, [sp], #8
         blt     6f          /* Go to misaligned copy of less than 8 bytes.  */
 
 5:
@@ -278,9 +264,6 @@
         subs    r2, r2, #8
         bge     5b                        /* If there is more to copy.  */
 
-        /* Restore registers saved for this loop: optimized pop { r6, r7 }.  */
-        ldrd    r6, r7, [sp], #8
-
 6:
         /* Get here if there less than 8 bytes to copy (-8 <= r2 < 0)
            and they are misaligned.  */
@@ -420,4 +403,4 @@
 
 #endif  /* not __ARM_FEATURE_UNALIGNED  */
 
-#endif  /* memcpy */
+END(memcpy)