Add optimized 64-bit strcpy.

Change-Id: I4ac12735a53c3ae9336b148ce694fe6c63613139
diff --git a/libc/arch-arm64/arm64.mk b/libc/arch-arm64/arm64.mk
index e44ee31..7f86a5f 100644
--- a/libc/arch-arm64/arm64.mk
+++ b/libc/arch-arm64/arm64.mk
@@ -15,7 +15,6 @@
     upstream-openbsd/lib/libc/string/stpcpy.c \
     upstream-openbsd/lib/libc/string/stpncpy.c \
     upstream-openbsd/lib/libc/string/strcat.c \
-    upstream-openbsd/lib/libc/string/strcpy.c \
     upstream-openbsd/lib/libc/string/strlcat.c \
     upstream-openbsd/lib/libc/string/strlcpy.c \
     upstream-openbsd/lib/libc/string/strncat.c \
diff --git a/libc/arch-arm64/denver64/denver64.mk b/libc/arch-arm64/denver64/denver64.mk
index 36146bf..99b2c79 100644
--- a/libc/arch-arm64/denver64/denver64.mk
+++ b/libc/arch-arm64/denver64/denver64.mk
@@ -4,6 +4,7 @@
     arch-arm64/generic/bionic/memmove.S \
     arch-arm64/denver64/bionic/memset.S \
     arch-arm64/generic/bionic/strcmp.S \
+    arch-arm64/generic/bionic/strcpy.S \
     arch-arm64/generic/bionic/strlen.S \
     arch-arm64/generic/bionic/strncmp.S \
     arch-arm64/generic/bionic/strnlen.S \
diff --git a/libc/arch-arm64/generic-neon/generic-neon.mk b/libc/arch-arm64/generic-neon/generic-neon.mk
index 2cbe3cf..6a10fde 100644
--- a/libc/arch-arm64/generic-neon/generic-neon.mk
+++ b/libc/arch-arm64/generic-neon/generic-neon.mk
@@ -3,6 +3,7 @@
     arch-arm64/generic/bionic/memmove.S \
     arch-arm64/generic/bionic/memset.S \
     arch-arm64/generic/bionic/strcmp.S \
+    arch-arm64/generic/bionic/strcpy.S \
     arch-arm64/generic/bionic/strlen.S \
     arch-arm64/generic/bionic/strncmp.S \
     arch-arm64/generic/bionic/strnlen.S \
diff --git a/libc/arch-arm64/generic/bionic/strcpy.S b/libc/arch-arm64/generic/bionic/strcpy.S
new file mode 100644
index 0000000..b15e06d
--- /dev/null
+++ b/libc/arch-arm64/generic/bionic/strcpy.S
@@ -0,0 +1,193 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/* Copyright (c) 2014, Linaro Limited
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+       * Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of the Linaro nor the
+         names of its contributors may be used to endorse or promote products
+         derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, little-endian data accesses
+ */
+
+#include <private/bionic_asm.h>
+
+/* Arguments and results.  x0 is never written, so it still holds dstin at every ret.  */
+#define dstin       x0   /* destination pointer as passed by the caller */
+#define src         x1   /* source read cursor; every load post-increments it */
+
+/* Locals and temporaries.  Each *_w name is the 32-bit view of the same register.  */
+#define dst         x2   /* destination write cursor, initialised from dstin */
+#define data1       x3   /* first (or only) loaded chunk of source bytes */
+#define data1_w     w3
+#define data2       x4   /* second doubleword of a 16-byte load; w view also carries a single byte */
+#define data2_w     w4
+#define has_nul1    x5   /* NUL-detection mask computed from data1 */
+#define has_nul1_w  w5
+#define has_nul2    x6   /* NUL-detection mask computed from data2 */
+#define tmp1        x7   /* scratch; also the bit count consumed by the tail stores */
+#define tmp2        x8   /* scratch */
+#define tmp3        x9   /* scratch */
+#define tmp4        x10  /* scratch */
+#define zeroones    x11  /* loaded once with REP8_01 and only read afterwards */
+#define zeroones_w  w11
+#define pos         x12  /* bit offset of the first NUL byte within a chunk */
+
+#define REP8_01 0x0101010101010101  /* 0x01 replicated into all eight bytes */
+#define REP8_7f 0x7f7f7f7f7f7f7f7f  /* 0x7f replicated into all eight bytes */
+#define REP8_80 0x8080808080808080  /* 0x80 replicated; not referenced by the code in this file */
+
+ENTRY(strcpy)  /* In: x0 = dstin, x1 = src.  Copies src through its first NUL to dstin.  */
+    mov     zeroones, #REP8_01  // 0x01 in every byte lane, operand of the NUL test
+    mov     dst, dstin  // all stores go through dst; x0 itself is left unchanged
+    ands    tmp1, src, #15  // only the flags are used: Z set means src is 16-byte aligned
+    b.ne    .Lmisaligned  // otherwise copy a short prefix until src is aligned
+    // NUL detection: (X - 0x01..01) & ~(X | 0x7f..7f) leaves 0x80 marks and
+    // is non-zero iff some byte of X is zero.  The lowest marked byte is
+    // always a real NUL; higher marks can be produced by borrow propagation.
+    // The loop handles two doublewords per iteration: a slightly higher
+    // start-up cost in exchange for more independent work per cycle.
+    // Invariant: src is 16-byte aligned on every entry to .Lloop, so the
+    // ldp never reads across an aligned 16-byte boundary.
+.Lloop:
+    ldp     data1, data2, [src], #16  // fetch 16 source bytes and advance src
+    sub     tmp1, data1, zeroones
+    orr     tmp2, data1, #REP8_7f
+    bic     has_nul1, tmp1, tmp2  // non-zero iff data1 contains a zero byte
+    cbnz    has_nul1, .Lnul_in_data1
+    sub     tmp3, data2, zeroones
+    orr     tmp4, data2, #REP8_7f
+    bic     has_nul2, tmp3, tmp4  // same test for the second doubleword
+    cbnz    has_nul2, .Lnul_in_data2
+    // No NUL in either register, copy it in a single instruction.
+    stp     data1, data2, [dst], #16
+    b       .Lloop
+
+.Lnul_in_data1:  // In: data1 = final chunk, has_nul1 = its mask, dst = where it goes
+    rev     has_nul1, has_nul1  // lowest-addressed byte becomes the top byte (little-endian load)
+    clz     pos, has_nul1  // pos = 8 * index of the first NUL byte
+    add     tmp1, pos, #0x8  // tmp1 = number of bits to store, NUL included
+
+    tbz     tmp1, #6, 1f  // bit 6 is set only for tmp1 == 64 (NUL in byte 7)
+    str     data1, [dst]  // whole doubleword ends the string
+    ret
+1:
+    tbz     tmp1, #5, 1f  // 4-byte piece needed?
+    str     data1_w, [dst], #4  // store the low 4 bytes
+    lsr     data1, data1, #32  // bring the remaining bytes down to the low end
+1:
+    tbz     tmp1, #4, 1f  // 2-byte piece needed?
+    strh    data1_w, [dst], #2
+    lsr     data1, data1, #16
+1:
+    tbz     tmp1, #3, 1f  // 1-byte piece needed?
+    strb    data1_w, [dst]
+1:
+    ret
+
+.Lnul_in_data2:  // data1 is NUL-free; the terminator is somewhere in data2
+    str     data1, [dst], #8  // copy the first doubleword whole
+    rev     has_nul2, has_nul2  // from here on: same tail as .Lnul_in_data1, on data2
+    clz     pos, has_nul2  // pos = 8 * index of the first NUL byte in data2
+    add     tmp1, pos, #0x8  // tmp1 = number of bits to store, NUL included
+
+    tbz     tmp1, #6, 1f  // tmp1 == 64: store all eight bytes
+    str     data2, [dst]
+    ret
+1:
+    tbz     tmp1, #5, 1f  // 4-byte piece needed?
+    str     data2_w, [dst], #4
+    lsr     data2, data2, #32
+1:
+    tbz     tmp1, #4, 1f  // 2-byte piece needed?
+    strh    data2_w, [dst], #2
+    lsr     data2, data2, #16
+1:
+    tbz     tmp1, #3, 1f  // 1-byte piece needed?
+    strb    data2_w, [dst]
+1:
+    ret
+
+.Lmisaligned:  // src is not 16-byte aligned: copy 1, 2, 4, 8 bytes as its low address bits require
+    tbz     src, #0, 1f  // odd address: copy one byte to reach 2-byte alignment
+    ldrb    data1_w, [src], #1
+    strb    data1_w, [dst], #1
+    cbnz    data1_w, 1f  // keep going unless that byte was the NUL
+    ret
+1:
+    tbz     src, #1, 1f  // bit 1 set: copy two bytes to reach 4-byte alignment
+    ldrb    data1_w, [src], #1
+    strb    data1_w, [dst], #1
+    cbz     data1_w, .Ldone  // first of the two bytes was the NUL
+    ldrb    data2_w, [src], #1
+    strb    data2_w, [dst], #1
+    cbnz    data2_w, 1f  // fall into .Ldone when the second byte was the NUL
+.Ldone:
+    ret
+1:
+    tbz     src, #2, 1f  // bit 2 set: copy four bytes to reach 8-byte alignment
+    ldr     data1_w, [src], #4  // src is 4-byte aligned at this point
+    // Check for a zero (32-bit form of the NUL test).
+    sub     has_nul1_w, data1_w, zeroones_w
+    bic     has_nul1_w, has_nul1_w, data1_w  // (X - 0x01010101) & ~X
+    ands    has_nul1_w, has_nul1_w, #0x80808080  // keep the per-byte top bits and set the flags
+    b.ne    .Lnul_in_data1  // 64-bit tail is reusable: the w writes zeroed the upper halves
+    str     data1_w, [dst], #4
+1:
+    tbz     src, #3, .Lloop  // bit 3 clear: src is now 16-byte aligned
+    ldr     data1, [src], #8  // src is 8-byte aligned at this point
+    // Check for a zero (same test as in .Lloop, flag-setting variant).
+    sub     tmp1, data1, zeroones
+    orr     tmp2, data1, #REP8_7f
+    bics    has_nul1, tmp1, tmp2
+    b.ne    .Lnul_in_data1
+    str     data1, [dst], #8
+    b       .Lloop  // src is 16-byte aligned after the 8-byte step
+END(strcpy)  /* closes the function opened by ENTRY(strcpy) */
diff --git a/libc/arch-arm64/generic/generic.mk b/libc/arch-arm64/generic/generic.mk
index e10cf66..0b6e08f 100644
--- a/libc/arch-arm64/generic/generic.mk
+++ b/libc/arch-arm64/generic/generic.mk
@@ -4,6 +4,7 @@
     arch-arm64/generic/bionic/memmove.S \
     arch-arm64/generic/bionic/memset.S \
     arch-arm64/generic/bionic/strcmp.S \
+    arch-arm64/generic/bionic/strcpy.S \
     arch-arm64/generic/bionic/strlen.S \
     arch-arm64/generic/bionic/strncmp.S \
     arch-arm64/generic/bionic/strnlen.S \