Custom memcpy implementation for Qualcomm Oryon CPU
Submitted on behalf of a third party: Arm Limited
License rights, if any, to the submission are granted solely by the
copyright owner of such submission under its applicable intellectual
property.
Copyright (c) 2012-2022, Arm Limited.
SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
Origin Project URL: https://github.com/ARM-software/optimized-routines
Tag: v24.01
Third Party code includes additions/modifications from Qualcomm Innovation Center, Inc.
Test: All
Change-Id: I0c97398a435e3f8ddf8ad38bc6bd71cc0d78aea5
diff --git a/libc/Android.bp b/libc/Android.bp
index 4020ede..5a75c6b 100644
--- a/libc/Android.bp
+++ b/libc/Android.bp
@@ -1075,6 +1075,7 @@
"arch-arm64/bionic/setjmp.S",
"arch-arm64/bionic/syscall.S",
"arch-arm64/bionic/vfork.S",
+ "arch-arm64/oryon/memcpy-nt.S",
],
},
diff --git a/libc/NOTICE b/libc/NOTICE
index dfd93ff..0279d2a 100644
--- a/libc/NOTICE
+++ b/libc/NOTICE
@@ -5155,3 +5155,11 @@
-------------------------------------------------------------------
+memcpy - copy memory area
+
+Copyright (c) 2012-2022, Arm Limited.
+Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+-------------------------------------------------------------------
+
diff --git a/libc/arch-arm64/dynamic_function_dispatch.cpp b/libc/arch-arm64/dynamic_function_dispatch.cpp
index 450138c..ca7f39f 100644
--- a/libc/arch-arm64/dynamic_function_dispatch.cpp
+++ b/libc/arch-arm64/dynamic_function_dispatch.cpp
@@ -30,6 +30,19 @@
#include <stddef.h>
#include <sys/auxv.h>
+#define MIDR_IMPL_ID_SHIFT 24u
+#define MIDR_IMPL_ID_MASK 0xFF
+#define CPU_VARIANT_SHIFT 20u
+#define CPU_VARIANT_MASK 0xF
+
+/* Macro to identify CPU implementer */
+#define QCOM_IMPL_ID 0x51
+
+/* Macro to identify the Qualcomm CPU variants that support the
+ * __memcpy_aarch64_nt and __memmove_aarch64_nt routines
+ */
+#define QCOM_ORYON_CPU_VARIANTS 0x5
+
extern "C" {
typedef void* memchr_func(const void*, int, size_t);
@@ -49,20 +62,72 @@
typedef void* memcpy_func(void*, const void*, size_t);
DEFINE_IFUNC_FOR(memcpy) {
- if (arg->_hwcap & HWCAP_ASIMD) {
- RETURN_FUNC(memcpy_func, __memcpy_aarch64_simd);
- } else {
+ unsigned long midr;
+ unsigned int impl_id, cpu_variant;
+
+ /* Check if hardware capability CPUID is available */
+ if (arg->_hwcap & HWCAP_CPUID) {
+ /* Read the MIDR register */
+ asm("mrs %0, MIDR_EL1 \n\t" : "=r"(midr));
+
+ /* Extract the CPU Implementer ID */
+ impl_id = (midr >> MIDR_IMPL_ID_SHIFT) & (MIDR_IMPL_ID_MASK);
+
+ /* Check for Qualcomm implementer ID */
+ if (impl_id == QCOM_IMPL_ID) {
+ cpu_variant = (midr >> CPU_VARIANT_SHIFT) & CPU_VARIANT_MASK;
+
+ /* Check for Qualcomm Oryon CPU variants: 0x1, 0x2, 0x3, 0x4, 0x5 */
+ if (cpu_variant <= QCOM_ORYON_CPU_VARIANTS) {
+ RETURN_FUNC(memcpy_func, __memcpy_aarch64_nt);
+ } else {
RETURN_FUNC(memcpy_func, __memcpy_aarch64);
+ }
}
+ }
+  /* Otherwise (CPUID is unavailable or the implementer is not Qualcomm),
+   * choose the implementation based on CPU architecture features.
+   */
+ if (arg->_hwcap & HWCAP_ASIMD) {
+ RETURN_FUNC(memcpy_func, __memcpy_aarch64_simd);
+ } else {
+ RETURN_FUNC(memcpy_func, __memcpy_aarch64);
+ }
}
typedef void* memmove_func(void*, const void*, size_t);
DEFINE_IFUNC_FOR(memmove) {
- if (arg->_hwcap & HWCAP_ASIMD) {
- RETURN_FUNC(memmove_func, __memmove_aarch64_simd);
- } else {
- RETURN_FUNC(memmove_func, __memmove_aarch64);
+ unsigned long midr;
+ unsigned int impl_id, cpu_variant;
+
+ /* Check if hardware capability CPUID is available */
+ if (arg->_hwcap & HWCAP_CPUID) {
+ /* Read the MIDR register */
+ asm("mrs %0, MIDR_EL1 \n\t" : "=r"(midr));
+
+ /* Extract the CPU Implementer ID */
+ impl_id = (midr >> MIDR_IMPL_ID_SHIFT) & (MIDR_IMPL_ID_MASK);
+
+ /* Check for Qualcomm implementer ID */
+ if (impl_id == QCOM_IMPL_ID) {
+ cpu_variant = (midr >> CPU_VARIANT_SHIFT) & CPU_VARIANT_MASK;
+
+ /* Check for Qualcomm Oryon CPU variants: 0x1, 0x2, 0x3, 0x4, 0x5 */
+ if (cpu_variant <= QCOM_ORYON_CPU_VARIANTS) {
+        RETURN_FUNC(memmove_func, __memmove_aarch64_nt);
+      } else {
+        RETURN_FUNC(memmove_func, __memmove_aarch64);
+ }
}
+ }
+  /* Otherwise (CPUID is unavailable or the implementer is not Qualcomm),
+   * choose the implementation based on CPU architecture features.
+   */
+ if (arg->_hwcap & HWCAP_ASIMD) {
+ RETURN_FUNC(memmove_func, __memmove_aarch64_simd);
+ } else {
+ RETURN_FUNC(memmove_func, __memmove_aarch64);
+ }
}
typedef int memrchr_func(const void*, int, size_t);
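For reference, the resolver changes above reduce to the following standalone sketch of the MIDR_EL1 decoding. It is an illustration only, not part of the patch: the file name midr_probe.cpp and the printed strings are made up, and it assumes an AArch64 Linux/Android target where HWCAP_CPUID indicates that EL0 reads of MIDR_EL1 are trapped and emulated by the kernel.

// midr_probe.cpp - hypothetical standalone sketch; not part of this change.
// Decodes MIDR_EL1 the same way the ifunc resolvers above do.
#include <stdint.h>
#include <stdio.h>
#include <sys/auxv.h>      // getauxval()
#include <linux/auxvec.h>  // AT_HWCAP
#include <asm/hwcap.h>     // HWCAP_CPUID

#define MIDR_IMPL_ID_SHIFT 24u
#define MIDR_IMPL_ID_MASK 0xFF
#define CPU_VARIANT_SHIFT 20u
#define CPU_VARIANT_MASK 0xF
#define QCOM_IMPL_ID 0x51
#define QCOM_ORYON_CPU_VARIANTS 0x5

int main() {
  // HWCAP_CPUID means the kernel traps and emulates EL0 reads of MIDR_EL1.
  if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
    puts("MIDR_EL1 not readable; resolver falls back to the HWCAP_ASIMD check");
    return 0;
  }
  uint64_t midr;
  asm("mrs %0, MIDR_EL1" : "=r"(midr));
  unsigned impl_id = (midr >> MIDR_IMPL_ID_SHIFT) & MIDR_IMPL_ID_MASK;
  unsigned variant = (midr >> CPU_VARIANT_SHIFT) & CPU_VARIANT_MASK;
  if (impl_id == QCOM_IMPL_ID && variant <= QCOM_ORYON_CPU_VARIANTS) {
    puts("Oryon: __memcpy_aarch64_nt / __memmove_aarch64_nt would be selected");
  } else {
    puts("Other CPU: the HWCAP_ASIMD-based selection applies");
  }
  return 0;
}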
diff --git a/libc/arch-arm64/oryon/memcpy-nt.S b/libc/arch-arm64/oryon/memcpy-nt.S
new file mode 100644
index 0000000..46f1541
--- /dev/null
+++ b/libc/arch-arm64/oryon/memcpy-nt.S
@@ -0,0 +1,351 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2012-2022, Arm Limited.
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#include <private/bionic_asm.h>
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_l x10
+#define C_lw w10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l x14
+#define E_h x15
+#define F_l x16
+#define F_h x17
+#define G_l count
+#define G_h dst
+#define H_l src
+#define H_h srcend
+#define tmp1 x14
+#define tmp2 x16
+#define SMALL_BUFFER_SIZE 48 /* KiB; copies larger than this take the non-temporal path */
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+ from a single entry point. It uses unaligned accesses and branchless
+ sequences to keep the code small, simple and improve performance.
+
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check is negligible since it is only required for large copies.
+
+ Large copies use a software pipelined loop processing 64 bytes per iteration.
+ The destination pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+ALIAS_SYMBOL (__memmove_aarch64_nt, __memcpy_aarch64_nt)
+ENTRY (__memcpy_aarch64_nt)
+
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 128
+ b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
+ ldp A_l, A_h, [src]
+ ldp D_l, D_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ /* Copy 8-15 bytes. */
+L(copy16):
+ tbz count, 3, L(copy8)
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+
+ .p2align 3
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
+ ldr A_lw, [src]
+ ldr B_lw, [srcend, -4]
+ str A_lw, [dstin]
+ str B_lw, [dstend, -4]
+ ret
+
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb C_lw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
+
+ .p2align 4
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_l, A_h, [src]
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ ldp D_l, D_h, [srcend, -16]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_l, E_h, [src, 32]
+ ldp F_l, F_h, [src, 48]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_l, G_h, [srcend, -64]
+ ldp H_l, H_h, [srcend, -48]
+ stp G_l, G_h, [dstend, -64]
+ stp H_l, H_h, [dstend, -48]
+L(copy96):
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp E_l, E_h, [dstin, 32]
+ stp F_l, F_h, [dstin, 48]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ mov tmp2, #SMALL_BUFFER_SIZE
+ cmp count, tmp2, LSL#10
	b.gt	L(copy_long_nt)
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cbz tmp1, L(copy0)
+ cmp tmp1, count
+ b.lo L(copy_long_backwards)
+
+ /* Copy 16 bytes and then align dst to 16-byte alignment. */
+
+ ldp D_l, D_h, [src]
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(copy64_from_end)
+
+L(loop64):
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp E_l, E_h, [srcend, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
+ ret
+
+ .p2align 4
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align dst to 16-byte alignment. */
+L(copy_long_backwards):
+ ldp D_l, D_h, [srcend, -16]
+ and tmp1, dstend, 15
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp B_l, B_h, [srcend, -32]
+ ldp C_l, C_h, [srcend, -48]
+ ldp D_l, D_h, [srcend, -64]!
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp D_l, D_h, [srcend, -64]!
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp G_l, G_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp G_l, G_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
+ ret
+
+ .p2align 4
+ /* Copy more than 48 KB using ldnp+stnp (non-temporal) instructions. */
+L(copy_long_nt):
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cbz tmp1, L(copy0)
+ cmp tmp1, count
+ b.lo L(copy_long_backwards_nt)
+
+ /* Copy 16 bytes and then align dst to 16-byte alignment. */
+
+ ldnp D_l, D_h, [src]
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldnp A_l, A_h, [src, 16]
+ stnp D_l, D_h, [dstin]
+ ldnp B_l, B_h, [src, 32]
+ ldnp C_l, C_h, [src, 48]
+ ldnp D_l, D_h, [src, 64]
+ add src, src, #64
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(copy64_from_end_nt)
+
+L(loop64_nt):
+ stnp A_l, A_h, [dst, 16]
+ ldnp A_l, A_h, [src, 16]
+ stnp B_l, B_h, [dst, 32]
+ ldnp B_l, B_h, [src, 32]
+ stnp C_l, C_h, [dst, 48]
+ ldnp C_l, C_h, [src, 48]
+ stnp D_l, D_h, [dst, 64]
+ add dst, dst, #64
+ ldnp D_l, D_h, [src, 64]
+ add src, src, #64
+ subs count, count, 64
+ b.hi L(loop64_nt)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end_nt):
+ ldnp E_l, E_h, [srcend, -64]
+ stnp A_l, A_h, [dst, 16]
+ ldnp A_l, A_h, [srcend, -48]
+ stnp B_l, B_h, [dst, 32]
+ ldnp B_l, B_h, [srcend, -32]
+ stnp C_l, C_h, [dst, 48]
+ ldnp C_l, C_h, [srcend, -16]
+ stnp D_l, D_h, [dst, 64]
+ stnp E_l, E_h, [dstend, -64]
+ stnp A_l, A_h, [dstend, -48]
+ stnp B_l, B_h, [dstend, -32]
+ stnp C_l, C_h, [dstend, -16]
+ ret
+
+ .p2align 4
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align dst to 16-byte alignment. */
+L(copy_long_backwards_nt):
+ ldnp D_l, D_h, [srcend, -16]
+ and tmp1, dstend, 15
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldnp A_l, A_h, [srcend, -16]
+ stnp D_l, D_h, [dstend, -16]
+ ldnp B_l, B_h, [srcend, -32]
+ ldnp C_l, C_h, [srcend, -48]
+ ldnp D_l, D_h, [srcend, -64]
+ add srcend, srcend, #-64
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start_nt)
+
+L(loop64_backwards_nt):
+ stnp A_l, A_h, [dstend, -16]
+ ldnp A_l, A_h, [srcend, -16]
+ stnp B_l, B_h, [dstend, -32]
+ ldnp B_l, B_h, [srcend, -32]
+ stnp C_l, C_h, [dstend, -48]
+ ldnp C_l, C_h, [srcend, -48]
+ stnp D_l, D_h, [dstend, -64]
+ add dstend, dstend, #-64
+ ldnp D_l, D_h, [srcend, -64]
+ add srcend, srcend, #-64
+ subs count, count, 64
+ b.hi L(loop64_backwards_nt)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start_nt):
+ ldnp G_l, G_h, [src, 48]
+ stnp A_l, A_h, [dstend, -16]
+ ldnp A_l, A_h, [src, 32]
+ stnp B_l, B_h, [dstend, -32]
+ ldnp B_l, B_h, [src, 16]
+ stnp C_l, C_h, [dstend, -48]
+ ldnp C_l, C_h, [src]
+ stnp D_l, D_h, [dstend, -64]
+ stnp G_l, G_h, [dstin, 48]
+ stnp A_l, A_h, [dstin, 32]
+ stnp B_l, B_h, [dstin, 16]
+ stnp C_l, C_h, [dstin]
+ ret
+
+END (__memcpy_aarch64_nt)
+
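To make the structure of the new assembly easier to follow, here is a rough C++ sketch of the forward large-copy strategy described in the file's header comment: copy the first 16 bytes, align the destination for the main 64-byte loop, and finish by unconditionally copying the last 64 bytes from the end so no byte-granular tail loop is needed. The function name is made up, and the sketch assumes non-overlapping buffers and count > 128; it models neither the software pipelining nor the ldnp/stnp non-temporal path taken above 48 KiB.

// copy_large_sketch.cpp - illustrative only; not the routine itself.
#include <cstring>
#include <cstdint>
#include <cstddef>

// Assumes dst and src do not overlap and n > 128 (the L(copy_long) case).
void copy_large_sketch(unsigned char* dst, const unsigned char* src, std::size_t n) {
  unsigned char* dstend = dst + n;
  const unsigned char* srcend = src + n;

  // Copy the first 16 bytes, then advance both pointers so that dst is
  // 16-byte aligned; a few head bytes may be copied twice, which is fine.
  std::memcpy(dst, src, 16);
  std::size_t adv = 16 - (reinterpret_cast<std::uintptr_t>(dst) & 15);
  dst += adv;
  src += adv;

  // Main loop: 64 bytes per iteration while more than 64 bytes remain.
  while (static_cast<std::size_t>(dstend - dst) > 64) {
    std::memcpy(dst, src, 64);
    dst += 64;
    src += 64;
  }

  // Tail: always copy the last 64 bytes from the end. This may overlap the
  // final loop iteration, which is harmless when the buffers do not overlap.
  std::memcpy(dstend - 64, srcend - 64, 64);
}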