libc: ARM: Add 32-bit Kryo memcpy

* Memcpy is based on the Scorpion routine because it is already
  optimized for Qualcomm's 128-byte cache line size.

* PLDOFFS and PLDSIZE are from the ARM64 Kryo memcpy routine.

Below are the results of the benchmark, tested on a OnePlus 3 with MSM8996.

Before:
BM_string_memcpy/8                          1000k          8    0.934 GiB/s
BM_string_memcpy/64                         1000k         11    5.785 GiB/s
BM_string_memcpy/512                        1000k         25   19.918 GiB/s
BM_string_memcpy/1024                         50M         42   23.938 GiB/s
BM_string_memcpy/8Ki                          10M        473   17.291 GiB/s
BM_string_memcpy/16Ki                          5M        565   28.976 GiB/s
BM_string_memcpy/32Ki                       1000k       1105   29.631 GiB/s
BM_string_memcpy/64Ki                       1000k       2194   29.864 GiB/s

After:
BM_string_memcpy/8                          1000k          6    1.145 GiB/s
BM_string_memcpy/64                         1000k          7    8.560 GiB/s
BM_string_memcpy/512                        1000k         18   27.370 GiB/s
BM_string_memcpy/1024                         50M         33   30.340 GiB/s
BM_string_memcpy/8Ki                          10M        266   30.770 GiB/s
BM_string_memcpy/16Ki                          5M        553   29.599 GiB/s
BM_string_memcpy/32Ki                       1000k       1121   29.219 GiB/s
BM_string_memcpy/64Ki                       1000k       2208   29.678 GiB/s

Test: make otapackage
Test: Ran bionic unit tests on a Pixel device.
Test: Verified memcpy wins on a Pixel device.

Change-Id: Id7a9c37ef75a306dd5cf8d374d79d0fe83f8a3ba
diff --git a/libc/Android.bp b/libc/Android.bp
index de270c2..e8175f4 100644
--- a/libc/Android.bp
+++ b/libc/Android.bp
@@ -1018,7 +1018,7 @@
             },
             kryo: {
                 srcs: [
-                    "arch-arm/krait/bionic/memcpy.S",
+                    "arch-arm/kryo/bionic/memcpy.S",
                     "arch-arm/cortex-a7/bionic/memset.S",
                     "arch-arm/krait/bionic/strcmp.S",
                     "arch-arm/krait/bionic/__strcat_chk.S",
diff --git a/libc/NOTICE b/libc/NOTICE
index 3668dda..1c317a9 100644
--- a/libc/NOTICE
+++ b/libc/NOTICE
@@ -998,6 +998,36 @@
 Copyright (C) 2017 The Android Open Source Project
 All rights reserved.
 
+Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in
+   the documentation and/or other materials provided with the
+   distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
+
+-------------------------------------------------------------------
+
+Copyright (C) 2017 The Android Open Source Project
+All rights reserved.
+
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:
diff --git a/libc/arch-arm/kryo/bionic/memcpy.S b/libc/arch-arm/kryo/bionic/memcpy.S
new file mode 100644
index 0000000..7e96f7d
--- /dev/null
+++ b/libc/arch-arm/kryo/bionic/memcpy.S
@@ -0,0 +1,138 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <private/bionic_asm.h>
+
+#define PLDOFFS (16) /* prefetch distance, counted in PLDSIZE-byte blocks ahead of the source pointer */
+#define PLDSIZE (128) /* L2 cache line size */
+
+        .code 32 /* assemble the routines below as 32-bit ARM instructions, not Thumb */
+ENTRY(__memcpy_chk) /* Checked memcpy entry: compares the count in r2 against the limit in r3 (presumably the destination buffer size; confirm against callers). */
+        cmp         r2, r3
+        bls         memcpy /* r2 <= r3 (unsigned): branch straight to memcpy with r0-r2 untouched */
+
+        // Preserve lr for backtrace.
+        push        {lr}
+        .cfi_def_cfa_offset 4
+        .cfi_rel_offset lr, 0
+
+        bl          __memcpy_chk_fail /* NOTE(review): defined outside this file; nothing follows the call, so it presumably never returns - confirm */
+END(__memcpy_chk)
+
+ENTRY(memcpy) /* In: r0 = dst, r1 = src, r2 = byte count. Out: r0 = original dst. Scratch: r3, r12, q0-q3, q8-q11, flags. */
+        push            {r0}                    /* keep dst: it is the return value */
+        .cfi_def_cfa_offset 4
+        .cfi_rel_offset r0, 0
+        cmp             r2, #4
+        blo             .Lneon_lt4              /* unsigned compare: the count is a size_t, so a count >= 2^31 must not look "small" */
+        cmp             r2, #16
+        blo             .Lneon_lt16
+        cmp             r2, #32
+        blo             .Lneon_16
+        cmp             r2, #128
+        blo             .Lneon_copy_32_a
+        /* Copy blocks of 128 bytes (word-aligned) at a time. */
+        /* Code below is optimized for PLDSIZE=128 only. */
+        mov             r12, r2, lsr #7         /* r12 = number of whole 128-byte blocks */
+        cmp             r12, #PLDOFFS
+        ble             .Lneon_copy_128_loop_nopld /* PLDOFFS blocks or fewer: skip the prefetching loop */
+        sub             r12, #PLDOFFS           /* the final PLDOFFS blocks are copied by the no-pld loop */
+        pld             [r1, #(PLDOFFS-1)*PLDSIZE]
+.Lneon_copy_128_loop_outer:
+        pld             [r1, #(PLDOFFS*PLDSIZE)] /* prefetch the block PLDOFFS blocks ahead... */
+        pld             [r1, #(PLDOFFS)*(PLDSIZE)+64] /* ...and its second 64-byte half */
+        vld1.32         {q0, q1}, [r1]!
+        vld1.32         {q2, q3}, [r1]!
+        vld1.32         {q8, q9}, [r1]!
+        vld1.32         {q10, q11}, [r1]!
+        subs            r12, r12, #1            /* flags are consumed by the bne below; NEON stores leave them intact */
+        vst1.32         {q0, q1}, [r0]!
+        vst1.32         {q2, q3}, [r0]!
+        vst1.32         {q8, q9}, [r0]!
+        vst1.32         {q10, q11}, [r0]!
+        bne             .Lneon_copy_128_loop_outer
+        mov             r12, #PLDOFFS           /* PLDOFFS blocks remain; copy them without prefetching */
+.Lneon_copy_128_loop_nopld:
+        vld1.32         {q0, q1}, [r1]!
+        vld1.32         {q2, q3}, [r1]!
+        vld1.32         {q8, q9}, [r1]!
+        vld1.32         {q10, q11}, [r1]!
+        subs            r12, r12, #1
+        vst1.32         {q0, q1}, [r0]!
+        vst1.32         {q2, q3}, [r0]!
+        vst1.32         {q8, q9}, [r0]!
+        vst1.32         {q10, q11}, [r0]!
+        bne             .Lneon_copy_128_loop_nopld
+        ands            r2, r2, #0x7f           /* r2 = bytes left after the 128-byte blocks */
+        beq             .Lneon_exit
+        cmp             r2, #32
+        blt             .Lneon_16               /* r2 < 128 here, so signed and unsigned compares agree */
+        nop                                     /* NOTE(review): presumably padding to align the loop below - confirm */
+        /* Copy blocks of 32 bytes (word-aligned) at a time. */
+.Lneon_copy_32_a:
+        mov             r12, r2, lsr #5         /* r12 = number of whole 32-byte blocks (at least 1 on every path here) */
+.Lneon_copy_32_loop_a:
+        vld1.32         {q0,q1}, [r1]!
+        subs            r12, r12, #1
+        vst1.32         {q0,q1}, [r0]!
+        bne             .Lneon_copy_32_loop_a
+        ands            r2, r2, #0x1f           /* r2 = bytes left after the 32-byte blocks */
+        beq             .Lneon_exit
+.Lneon_16:
+        subs            r2, r2, #16             /* r2 < 32 here; the signed result is intended and bits 0-3 are unchanged */
+        blt             .Lneon_lt16             /* fewer than 16 bytes: the low bits of r2 still drive the tail copy */
+        vld1.32         {q8}, [r1]!
+        vst1.32         {q8}, [r0]!
+        beq             .Lneon_exit             /* flags still come from the subs above: exactly 16 bytes were left */
+.Lneon_lt16:
+        movs            r12, r2, lsl #29        /* C = bit 3 of r2 (copy 8 bytes), N = bit 2 (copy 4 bytes) */
+        bcc             .Lneon_skip8
+        ldr             r3, [r1], #4
+        ldr             r12, [r1], #4
+        str             r3, [r0], #4
+        str             r12, [r0], #4
+.Lneon_skip8:
+        bpl             .Lneon_lt4              /* N is still the one set by the movs above */
+        ldr             r3, [r1], #4
+        str             r3, [r0], #4
+.Lneon_lt4:
+        movs            r2, r2, lsl #31         /* C = bit 1 of r2 (copy 2 bytes), N = bit 0 (copy 1 byte) */
+        bcc             .Lneon_lt2
+        ldrh            r3, [r1], #2
+        strh            r3, [r0], #2
+.Lneon_lt2:
+        bpl             .Lneon_exit
+        ldrb            r12, [r1]
+        strb            r12, [r0]
+.Lneon_exit:
+        pop             {r0}                    /* return the original dst */
+        bx              lr
+
+END(memcpy)