libc: ARM: Add 32-bit Kryo memcpy
* Memcpy is based on Scorpion due to Qualcomm's
128-bit cache line size optimizations.
* PLDOFFSET and PLDSIZE are from the ARM64 Kryo memcpy routine.
Below are the results of the benchmark, tested on a OnePlus 3 with MSM8996.
Before:
BM_string_memcpy/8 1000k 8 0.934 GiB/s
BM_string_memcpy/64 1000k 11 5.785 GiB/s
BM_string_memcpy/512 1000k 25 19.918 GiB/s
BM_string_memcpy/1024 50M 42 23.938 GiB/s
BM_string_memcpy/8Ki 10M 473 17.291 GiB/s
BM_string_memcpy/16Ki 5M 565 28.976 GiB/s
BM_string_memcpy/32Ki 1000k 1105 29.631 GiB/s
BM_string_memcpy/64Ki 1000k 2194 29.864 GiB/s
After:
BM_string_memcpy/8 1000k 6 1.145 GiB/s
BM_string_memcpy/64 1000k 7 8.560 GiB/s
BM_string_memcpy/512 1000k 18 27.370 GiB/s
BM_string_memcpy/1024 50M 33 30.340 GiB/s
BM_string_memcpy/8Ki 10M 266 30.770 GiB/s
BM_string_memcpy/16Ki 5M 553 29.599 GiB/s
BM_string_memcpy/32Ki 1000k 1121 29.219 GiB/s
BM_string_memcpy/64Ki 1000k 2208 29.678 GiB/s
Test: make otapackage
Test: Ran bionic unit tests on Pixel device. Verified memcpy wins on
Test: Pixel device.
Change-Id: Id7a9c37ef75a306dd5cf8d374d79d0fe83f8a3ba
diff --git a/libc/Android.bp b/libc/Android.bp
index de270c2..e8175f4 100644
--- a/libc/Android.bp
+++ b/libc/Android.bp
@@ -1018,7 +1018,7 @@
},
kryo: {
srcs: [
- "arch-arm/krait/bionic/memcpy.S",
+ "arch-arm/kryo/bionic/memcpy.S",
"arch-arm/cortex-a7/bionic/memset.S",
"arch-arm/krait/bionic/strcmp.S",
"arch-arm/krait/bionic/__strcat_chk.S",
diff --git a/libc/NOTICE b/libc/NOTICE
index 3668dda..1c317a9 100644
--- a/libc/NOTICE
+++ b/libc/NOTICE
@@ -998,6 +998,36 @@
Copyright (C) 2017 The Android Open Source Project
All rights reserved.
+Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
+
+-------------------------------------------------------------------
+
+Copyright (C) 2017 The Android Open Source Project
+All rights reserved.
+
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
diff --git a/libc/arch-arm/kryo/bionic/memcpy.S b/libc/arch-arm/kryo/bionic/memcpy.S
new file mode 100644
index 0000000..7e96f7d
--- /dev/null
+++ b/libc/arch-arm/kryo/bionic/memcpy.S
@@ -0,0 +1,138 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <private/bionic_asm.h>
+
+#define PLDOFFS (16)
+#define PLDSIZE (128) /* L2 cache line size */
+
+ .code 32
+ENTRY(__memcpy_chk)
+ cmp r2, r3
+ bls memcpy
+
+ // Preserve lr for backtrace.
+ push {lr}
+ .cfi_def_cfa_offset 4
+ .cfi_rel_offset lr, 0
+
+ bl __memcpy_chk_fail
+END(__memcpy_chk)
+
+ENTRY(memcpy)
+ push {r0}
+ .cfi_def_cfa_offset 4
+ .cfi_rel_offset r0, 0
+ cmp r2, #4
+ blt .Lneon_lt4
+ cmp r2, #16
+ blt .Lneon_lt16
+ cmp r2, #32
+ blt .Lneon_16
+ cmp r2, #128
+ blt .Lneon_copy_32_a
+ /* Copy blocks of 128-bytes (word-aligned) at a time*/
+ /* Code below is optimized for PLDSIZE=128 only */
+ mov r12, r2, lsr #7
+ cmp r12, #PLDOFFS
+ ble .Lneon_copy_128_loop_nopld
+ sub r12, #PLDOFFS
+ pld [r1, #(PLDOFFS-1)*PLDSIZE]
+.Lneon_copy_128_loop_outer:
+ pld [r1, #(PLDOFFS*PLDSIZE)]
+ pld [r1, #(PLDOFFS)*(PLDSIZE)+64]
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]!
+ bne .Lneon_copy_128_loop_outer
+ mov r12, #PLDOFFS
+.Lneon_copy_128_loop_nopld:
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]!
+ bne .Lneon_copy_128_loop_nopld
+ ands r2, r2, #0x7f
+ beq .Lneon_exit
+ cmp r2, #32
+ blt .Lneon_16
+ nop
+ /* Copy blocks of 32-bytes (word aligned) at a time*/
+.Lneon_copy_32_a:
+ mov r12, r2, lsr #5
+.Lneon_copy_32_loop_a:
+ vld1.32 {q0,q1}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0,q1}, [r0]!
+ bne .Lneon_copy_32_loop_a
+ ands r2, r2, #0x1f
+ beq .Lneon_exit
+.Lneon_16:
+ subs r2, r2, #16
+ blt .Lneon_lt16
+ vld1.32 {q8}, [r1]!
+ vst1.32 {q8}, [r0]!
+ beq .Lneon_exit
+.Lneon_lt16:
+ movs r12, r2, lsl #29
+ bcc .Lneon_skip8
+ ldr r3, [r1], #4
+ ldr r12, [r1], #4
+ str r3, [r0], #4
+ str r12, [r0], #4
+.Lneon_skip8:
+ bpl .Lneon_lt4
+ ldr r3, [r1], #4
+ str r3, [r0], #4
+.Lneon_lt4:
+ movs r2, r2, lsl #31
+ bcc .Lneon_lt2
+ ldrh r3, [r1], #2
+ strh r3, [r0], #2
+.Lneon_lt2:
+ bpl .Lneon_exit
+ ldrb r12, [r1]
+ strb r12, [r0]
+.Lneon_exit:
+ pop {r0}
+ bx lr
+
+END(memcpy)