Custom memcpy implementation for Qualcomm Oryon CPU

Submitted on behalf of a third-party: Arm Limited

License rights, if any, to the submission are granted solely by the
copyright owner of such submission under its applicable intellectual
property.

Copyright (c) 2012-2022, Arm Limited.
SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception

Origin Project URL: https://github.com/ARM-software/optimized-routines
Tag: v24.01

Third Party code includes additions/modifications from Qualcomm Innovation Center, Inc.
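
The ifunc resolvers read MIDR_EL1 (available to userspace when the kernel
sets HWCAP_CPUID) and select __memcpy_aarch64_nt/__memmove_aarch64_nt when
the implementer field is Qualcomm (0x51) and the CPU variant field is 0x5
or below; all other CPUs keep the existing ASIMD/generic selection. Within
the new routine, copies larger than 48 KB use non-temporal ldnp/stnp
accesses.

For reference, a minimal standalone sketch of the MIDR decode used by the
resolvers (the helper name is illustrative only and not part of this patch):

    #include <stdint.h>

    /* Sketch: decode the MIDR_EL1 implementer and variant fields. */
    static inline int is_oryon_nt_target(uint64_t midr) {
      unsigned impl_id = (midr >> 24) & 0xff; /* implementer field */
      unsigned variant = (midr >> 20) & 0xf;  /* variant field */
      return impl_id == 0x51 && variant <= 0x5;
    }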

Test: All
Change-Id: I0c97398a435e3f8ddf8ad38bc6bd71cc0d78aea5
diff --git a/libc/Android.bp b/libc/Android.bp
index 4020ede..5a75c6b 100644
--- a/libc/Android.bp
+++ b/libc/Android.bp
@@ -1075,6 +1075,7 @@
                 "arch-arm64/bionic/setjmp.S",
                 "arch-arm64/bionic/syscall.S",
                 "arch-arm64/bionic/vfork.S",
+                "arch-arm64/oryon/memcpy-nt.S",
             ],
         },
 
diff --git a/libc/NOTICE b/libc/NOTICE
index dfd93ff..0279d2a 100644
--- a/libc/NOTICE
+++ b/libc/NOTICE
@@ -5155,3 +5155,11 @@
 
 -------------------------------------------------------------------
 
+memcpy - copy memory area
+
+Copyright (c) 2012-2022, Arm Limited.
+Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+-------------------------------------------------------------------
+
diff --git a/libc/arch-arm64/dynamic_function_dispatch.cpp b/libc/arch-arm64/dynamic_function_dispatch.cpp
index 450138c..ca7f39f 100644
--- a/libc/arch-arm64/dynamic_function_dispatch.cpp
+++ b/libc/arch-arm64/dynamic_function_dispatch.cpp
@@ -30,6 +30,23 @@
 #include <stddef.h>
 #include <sys/auxv.h>
 
+#define MIDR_IMPL_ID_SHIFT 24u
+#define MIDR_IMPL_ID_MASK 0xFF
+#define CPU_VARIANT_SHIFT 20u
+#define CPU_VARIANT_MASK 0xF
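+
+/* With HWCAP_CPUID set, the kernel emulates EL0 reads of ID registers
+ * such as MIDR_EL1, so the resolvers below can read MIDR directly.
+ */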
+
+/* Macro to identify CPU implementer */
+#define QCOM_IMPL_ID 0x51
+
+/* Macro to identify Qualcomm CPU variants that support the
+ * __memcpy_aarch64_nt routine
+ */
+#define QCOM_ORYON_CPU_VARIANTS 0x5
+
 extern "C" {
 
 typedef void* memchr_func(const void*, int, size_t);
@@ -49,20 +62,72 @@
 
 typedef void* memcpy_func(void*, const void*, size_t);
 DEFINE_IFUNC_FOR(memcpy) {
-    if (arg->_hwcap & HWCAP_ASIMD) {
-        RETURN_FUNC(memcpy_func, __memcpy_aarch64_simd);
-    } else {
+  unsigned long midr;
+  unsigned int impl_id, cpu_variant;
+
+  /* Check if hardware capability CPUID is available */
+  if (arg->_hwcap & HWCAP_CPUID) {
+    /* Read the MIDR register */
+    asm("mrs %0, MIDR_EL1 \n\t" : "=r"(midr));
+
+    /* Extract the CPU Implementer ID */
+    impl_id = (midr >> MIDR_IMPL_ID_SHIFT) & (MIDR_IMPL_ID_MASK);
+
+    /* Check for Qualcomm implementer ID */
+    if (impl_id == QCOM_IMPL_ID) {
+      cpu_variant = (midr >> CPU_VARIANT_SHIFT) & CPU_VARIANT_MASK;
+
+      /* Check for Qualcomm Oryon CPU variants: 0x1, 0x2, 0x3, 0x4, 0x5 */
+      if (cpu_variant <= QCOM_ORYON_CPU_VARIANTS) {
+        RETURN_FUNC(memcpy_func, __memcpy_aarch64_nt);
+      } else {
         RETURN_FUNC(memcpy_func, __memcpy_aarch64);
+      }
     }
+  }
+  /* If MIDR is unavailable or the CPU is not a supported Qualcomm part,
+   * select the implementation based on the CPU architecture features.
+   */
+  if (arg->_hwcap & HWCAP_ASIMD) {
+    RETURN_FUNC(memcpy_func, __memcpy_aarch64_simd);
+  } else {
+    RETURN_FUNC(memcpy_func, __memcpy_aarch64);
+  }
 }
 
 typedef void* memmove_func(void*, const void*, size_t);
 DEFINE_IFUNC_FOR(memmove) {
-    if (arg->_hwcap & HWCAP_ASIMD) {
-        RETURN_FUNC(memmove_func, __memmove_aarch64_simd);
-    } else {
-        RETURN_FUNC(memmove_func, __memmove_aarch64);
+  unsigned long midr;
+  unsigned int impl_id, cpu_variant;
+
+  /* Check if hardware capability CPUID is available */
+  if (arg->_hwcap & HWCAP_CPUID) {
+    /* Read the MIDR register */
+    asm("mrs %0, MIDR_EL1 \n\t" : "=r"(midr));
+
+    /* Extract the CPU Implementer ID */
+    impl_id = (midr >> MIDR_IMPL_ID_SHIFT) & (MIDR_IMPL_ID_MASK);
+
+    /* Check for Qualcomm implementer ID */
+    if (impl_id == QCOM_IMPL_ID) {
+      cpu_variant = (midr >> CPU_VARIANT_SHIFT) & CPU_VARIANT_MASK;
+
+      /* Check for Qualcomm Oryon CPU variants: 0x1, 0x2, 0x3, 0x4, 0x5 */
+      if (cpu_variant <= QCOM_ORYON_CPU_VARIANTS) {
+        RETURN_FUNC(memmove_func, __memmove_aarch64_nt);
+      } else {
+        RETURN_FUNC(memmove_func, __memmove_aarch64);
+      }
     }
+  }
+  /* If MIDR is unavailable or the CPU is not a supported Qualcomm part,
+   * select the implementation based on the CPU architecture features.
+   */
+  if (arg->_hwcap & HWCAP_ASIMD) {
+    RETURN_FUNC(memmove_func, __memmove_aarch64_simd);
+  } else {
+    RETURN_FUNC(memmove_func, __memmove_aarch64);
+  }
 }
 
 typedef int memrchr_func(const void*, int, size_t);
diff --git a/libc/arch-arm64/oryon/memcpy-nt.S b/libc/arch-arm64/oryon/memcpy-nt.S
new file mode 100644
index 0000000..46f1541
--- /dev/null
+++ b/libc/arch-arm64/oryon/memcpy-nt.S
@@ -0,0 +1,361 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2012-2022, Arm Limited.
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#include <private/bionic_asm.h>
+
+#define dstin     x0
+#define src       x1
+#define count     x2
+#define dst       x3
+#define srcend    x4
+#define dstend    x5
+#define A_l       x6
+#define A_lw      w6
+#define A_h       x7
+#define B_l       x8
+#define B_lw      w8
+#define B_h       x9
+#define C_l       x10
+#define C_lw      w10
+#define C_h       x11
+#define D_l       x12
+#define D_h       x13
+#define E_l       x14
+#define E_h       x15
+#define F_l       x16
+#define F_h       x17
+#define G_l       count
+#define G_h       dst
+#define H_l       src
+#define H_h       srcend
+#define tmp1      x14
+#define tmp2      x16
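+/* Threshold in KB for switching to the non-temporal copy path.  */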
+#define SMALL_BUFFER_SIZE    48
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+   from a single entry point.  It uses unaligned accesses and branchless
+   sequences to keep the code small, simple and improve performance.
+
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check is negligible since it is only required for large copies.
+
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The destination pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
+
+   Copies larger than SMALL_BUFFER_SIZE KB (48 KB) take a separate path using
+   the non-temporal hint instructions ldnp/stnp, since data from such large
+   copies is unlikely to benefit from staying in the caches.  */
+
+ALIAS_SYMBOL (__memmove_aarch64_nt, __memcpy_aarch64_nt)
+ENTRY (__memcpy_aarch64_nt)
+
+    add    srcend, src, count
+    add    dstend, dstin, count
+    cmp    count, 128
+    b.hi    L(copy_long)
+    cmp    count, 32
+    b.hi    L(copy32_128)
+
+    /* Small copies: 0..32 bytes.  */
+    cmp    count, 16
+    b.lo    L(copy16)
+    ldp    A_l, A_h, [src]
+    ldp    D_l, D_h, [srcend, -16]
+    stp    A_l, A_h, [dstin]
+    stp    D_l, D_h, [dstend, -16]
+    ret
+
+    /* Copy 8-15 bytes.  */
+L(copy16):
+    tbz    count, 3, L(copy8)
+    ldr    A_l, [src]
+    ldr    A_h, [srcend, -8]
+    str    A_l, [dstin]
+    str    A_h, [dstend, -8]
+    ret
+
+    .p2align 3
+    /* Copy 4-7 bytes.  */
+L(copy8):
+    tbz    count, 2, L(copy4)
+    ldr    A_lw, [src]
+    ldr    B_lw, [srcend, -4]
+    str    A_lw, [dstin]
+    str    B_lw, [dstend, -4]
+    ret
+
+    /* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+    cbz    count, L(copy0)
+    lsr    tmp1, count, 1
+    ldrb    A_lw, [src]
+    ldrb    C_lw, [srcend, -1]
+    ldrb    B_lw, [src, tmp1]
+    strb    A_lw, [dstin]
+    strb    B_lw, [dstin, tmp1]
+    strb    C_lw, [dstend, -1]
+L(copy0):
+    ret
+
+    .p2align 4
+    /* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+    ldp    A_l, A_h, [src]
+    ldp    B_l, B_h, [src, 16]
+    ldp    C_l, C_h, [srcend, -32]
+    ldp    D_l, D_h, [srcend, -16]
+    cmp    count, 64
+    b.hi    L(copy128)
+    stp    A_l, A_h, [dstin]
+    stp    B_l, B_h, [dstin, 16]
+    stp    C_l, C_h, [dstend, -32]
+    stp    D_l, D_h, [dstend, -16]
+    ret
+
+    .p2align 4
+    /* Copy 65..128 bytes.  */
+L(copy128):
+    ldp    E_l, E_h, [src, 32]
+    ldp    F_l, F_h, [src, 48]
+    cmp    count, 96
+    b.ls    L(copy96)
+    ldp    G_l, G_h, [srcend, -64]
+    ldp    H_l, H_h, [srcend, -48]
+    stp    G_l, G_h, [dstend, -64]
+    stp    H_l, H_h, [dstend, -48]
+L(copy96):
+    stp    A_l, A_h, [dstin]
+    stp    B_l, B_h, [dstin, 16]
+    stp    E_l, E_h, [dstin, 32]
+    stp    F_l, F_h, [dstin, 48]
+    stp    C_l, C_h, [dstend, -32]
+    stp    D_l, D_h, [dstend, -16]
+    ret
+
+    .p2align 4
+    /* Copy more than 128 bytes.  */
+L(copy_long):
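+    /* Copies above SMALL_BUFFER_SIZE KB use the non-temporal path;
+       the LSL #10 converts the threshold from KB to bytes.  */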
+    mov    tmp2, #SMALL_BUFFER_SIZE
+    cmp    count, tmp2, LSL #10
+    b.gt   L(copy_long_nt)
+    /* Use backwards copy if there is an overlap.  */
+    sub    tmp1, dstin, src
+    cbz    tmp1, L(copy0)
+    cmp    tmp1, count
+    b.lo    L(copy_long_backwards)
+
+    /* Copy 16 bytes and then align dst to 16-byte alignment.  */
+
+    ldp    D_l, D_h, [src]
+    and    tmp1, dstin, 15
+    bic    dst, dstin, 15
+    sub    src, src, tmp1
+    add    count, count, tmp1    /* Count is now 16 too large.  */
+    ldp    A_l, A_h, [src, 16]
+    stp    D_l, D_h, [dstin]
+    ldp    B_l, B_h, [src, 32]
+    ldp    C_l, C_h, [src, 48]
+    ldp    D_l, D_h, [src, 64]!
+    subs    count, count, 128 + 16    /* Test and readjust count.  */
+    b.ls    L(copy64_from_end)
+
+L(loop64):
+    stp    A_l, A_h, [dst, 16]
+    ldp    A_l, A_h, [src, 16]
+    stp    B_l, B_h, [dst, 32]
+    ldp    B_l, B_h, [src, 32]
+    stp    C_l, C_h, [dst, 48]
+    ldp    C_l, C_h, [src, 48]
+    stp    D_l, D_h, [dst, 64]!
+    ldp    D_l, D_h, [src, 64]!
+    subs    count, count, 64
+    b.hi    L(loop64)
+
+    /* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end):
+    ldp    E_l, E_h, [srcend, -64]
+    stp    A_l, A_h, [dst, 16]
+    ldp    A_l, A_h, [srcend, -48]
+    stp    B_l, B_h, [dst, 32]
+    ldp    B_l, B_h, [srcend, -32]
+    stp    C_l, C_h, [dst, 48]
+    ldp    C_l, C_h, [srcend, -16]
+    stp    D_l, D_h, [dst, 64]
+    stp    E_l, E_h, [dstend, -64]
+    stp    A_l, A_h, [dstend, -48]
+    stp    B_l, B_h, [dstend, -32]
+    stp    C_l, C_h, [dstend, -16]
+    ret
+
+    .p2align 4
+
+    /* Large backwards copy for overlapping copies.
+       Copy 16 bytes and then align dst to 16-byte alignment.  */
+L(copy_long_backwards):
+    ldp    D_l, D_h, [srcend, -16]
+    and    tmp1, dstend, 15
+    sub    srcend, srcend, tmp1
+    sub    count, count, tmp1
+    ldp    A_l, A_h, [srcend, -16]
+    stp    D_l, D_h, [dstend, -16]
+    ldp    B_l, B_h, [srcend, -32]
+    ldp    C_l, C_h, [srcend, -48]
+    ldp    D_l, D_h, [srcend, -64]!
+    sub    dstend, dstend, tmp1
+    subs    count, count, 128
+    b.ls    L(copy64_from_start)
+
+L(loop64_backwards):
+    stp    A_l, A_h, [dstend, -16]
+    ldp    A_l, A_h, [srcend, -16]
+    stp    B_l, B_h, [dstend, -32]
+    ldp    B_l, B_h, [srcend, -32]
+    stp    C_l, C_h, [dstend, -48]
+    ldp    C_l, C_h, [srcend, -48]
+    stp    D_l, D_h, [dstend, -64]!
+    ldp    D_l, D_h, [srcend, -64]!
+    subs    count, count, 64
+    b.hi    L(loop64_backwards)
+
+    /* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start):
+    ldp    G_l, G_h, [src, 48]
+    stp    A_l, A_h, [dstend, -16]
+    ldp    A_l, A_h, [src, 32]
+    stp    B_l, B_h, [dstend, -32]
+    ldp    B_l, B_h, [src, 16]
+    stp    C_l, C_h, [dstend, -48]
+    ldp    C_l, C_h, [src]
+    stp    D_l, D_h, [dstend, -64]
+    stp    G_l, G_h, [dstin, 48]
+    stp    A_l, A_h, [dstin, 32]
+    stp    B_l, B_h, [dstin, 16]
+    stp    C_l, C_h, [dstin]
+    ret
+
+    .p2align 4
+    /* Copy more than 48 KB using ldnp+stnp (non-temporal) instructions.  */
+L(copy_long_nt):
+    /* Use backwards copy if there is an overlap.  */
+    sub    tmp1, dstin, src
+    cbz    tmp1, L(copy0)
+    cmp    tmp1, count
+    b.lo    L(copy_long_backwards_nt)
+
+    /* Copy 16 bytes and then align dst to 16-byte alignment.  */
+
+    ldnp    D_l, D_h, [src]
+    and    tmp1, dstin, 15
+    bic    dst, dstin, 15
+    sub    src, src, tmp1
+    add    count, count, tmp1    /* Count is now 16 too large.  */
+    ldnp    A_l, A_h, [src, 16]
+    stnp    D_l, D_h, [dstin]
+    ldnp    B_l, B_h, [src, 32]
+    ldnp    C_l, C_h, [src, 48]
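+    /* ldnp/stnp have no writeback addressing mode, so the pointers in
+       this path are advanced with explicit adds.  */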
+    ldnp    D_l, D_h, [src, 64]
+    add     src, src, #64
+    subs    count, count, 128 + 16    /* Test and readjust count.  */
+    b.ls    L(copy64_from_end_nt)
+
+L(loop64_nt):
+    stnp    A_l, A_h, [dst, 16]
+    ldnp    A_l, A_h, [src, 16]
+    stnp    B_l, B_h, [dst, 32]
+    ldnp    B_l, B_h, [src, 32]
+    stnp    C_l, C_h, [dst, 48]
+    ldnp    C_l, C_h, [src, 48]
+    stnp    D_l, D_h, [dst, 64]
+    add     dst, dst, #64
+    ldnp    D_l, D_h, [src, 64]
+    add     src, src, #64
+    subs    count, count, 64
+    b.hi    L(loop64_nt)
+
+    /* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end_nt):
+    ldnp    E_l, E_h, [srcend, -64]
+    stnp    A_l, A_h, [dst, 16]
+    ldnp    A_l, A_h, [srcend, -48]
+    stnp    B_l, B_h, [dst, 32]
+    ldnp    B_l, B_h, [srcend, -32]
+    stnp    C_l, C_h, [dst, 48]
+    ldnp    C_l, C_h, [srcend, -16]
+    stnp    D_l, D_h, [dst, 64]
+    stnp    E_l, E_h, [dstend, -64]
+    stnp    A_l, A_h, [dstend, -48]
+    stnp    B_l, B_h, [dstend, -32]
+    stnp    C_l, C_h, [dstend, -16]
+    ret
+
+    .p2align 4
+
+    /* Large backwards copy for overlapping copies.
+       Copy 16 bytes and then align dst to 16-byte alignment.  */
+L(copy_long_backwards_nt):
+    ldnp    D_l, D_h, [srcend, -16]
+    and    tmp1, dstend, 15
+    sub    srcend, srcend, tmp1
+    sub    count, count, tmp1
+    ldnp    A_l, A_h, [srcend, -16]
+    stnp    D_l, D_h, [dstend, -16]
+    ldnp    B_l, B_h, [srcend, -32]
+    ldnp    C_l, C_h, [srcend, -48]
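+    /* As in the forward path, srcend and dstend are advanced explicitly
+       because ldnp/stnp lack writeback addressing.  */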
+    ldnp    D_l, D_h, [srcend, -64]
+    add     srcend, srcend, #-64
+    sub    dstend, dstend, tmp1
+    subs    count, count, 128
+    b.ls    L(copy64_from_start_nt)
+
+L(loop64_backwards_nt):
+    stnp    A_l, A_h, [dstend, -16]
+    ldnp    A_l, A_h, [srcend, -16]
+    stnp    B_l, B_h, [dstend, -32]
+    ldnp    B_l, B_h, [srcend, -32]
+    stnp    C_l, C_h, [dstend, -48]
+    ldnp    C_l, C_h, [srcend, -48]
+    stnp    D_l, D_h, [dstend, -64]
+    add     dstend, dstend, #-64
+    ldnp    D_l, D_h, [srcend, -64]
+    add     srcend, srcend, #-64
+    subs    count, count, 64
+    b.hi    L(loop64_backwards_nt)
+
+    /* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start_nt):
+    ldnp    G_l, G_h, [src, 48]
+    stnp    A_l, A_h, [dstend, -16]
+    ldnp    A_l, A_h, [src, 32]
+    stnp    B_l, B_h, [dstend, -32]
+    ldnp    B_l, B_h, [src, 16]
+    stnp    C_l, C_h, [dstend, -48]
+    ldnp    C_l, C_h, [src]
+    stnp    D_l, D_h, [dstend, -64]
+    stnp    G_l, G_h, [dstin, 48]
+    stnp    A_l, A_h, [dstin, 32]
+    stnp    B_l, B_h, [dstin, 16]
+    stnp    C_l, C_h, [dstin]
+    ret
+
+END (__memcpy_aarch64_nt)
+