Use cortex-a53/bionic/memmove.S by default for arm64
cortex-a53/bionic/memmove.S looks like a more optimized version. It
should be used in most cases. It delegates small (<= 96 bytes) moves
to memcpy.
The only exception is denver64. It is using its own memcpy, which
doesn't allow overlap for < 96 bytes copies. Only for this variant we
need generic/bionic/memmove.S.
Benchmark result looks pretty close through (on marlin)
Before: using generic/bionic/memmove.S
-------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------
BM_string_memcpy/8/0/0 6 ns 6 ns 108872005 1.15787GB/s
BM_string_memcpy/64/0/0 7 ns 7 ns 107387438 9.14365GB/s
BM_string_memcpy/512/0/0 21 ns 20 ns 34165353 23.2734GB/s
BM_string_memcpy/1024/0/0 40 ns 39 ns 17766657 24.2346GB/s
BM_string_memcpy/8192/0/0 311 ns 310 ns 2259904 24.6339GB/s
BM_string_memcpy/16384/0/0 616 ns 613 ns 1143027 24.8852GB/s
BM_string_memcpy/32768/0/0 1322 ns 1316 ns 530799 23.1835GB/s
BM_string_memcpy/65536/0/0 2672 ns 2661 ns 229638 22.937GB/s
BM_string_memcpy/131072/0/0 5379 ns 5357 ns 128316 22.788GB/s
After: using cortex-a53/bionic/memmove.S
-------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------
BM_string_memcpy/8/0/0 6 ns 6 ns 116610749 1.24646GB/s
BM_string_memcpy/64/0/0 6 ns 6 ns 115634093 9.84708GB/s
BM_string_memcpy/512/0/0 21 ns 21 ns 34167322 22.8938GB/s
BM_string_memcpy/1024/0/0 39 ns 39 ns 17859445 24.3312GB/s
BM_string_memcpy/8192/0/0 311 ns 310 ns 2260192 24.6325GB/s
BM_string_memcpy/16384/0/0 610 ns 608 ns 1151889 25.0987GB/s
BM_string_memcpy/32768/0/0 1488 ns 1482 ns 532508 20.5988GB/s
BM_string_memcpy/65536/0/0 2421 ns 2411 ns 290502 25.3146GB/s
BM_string_memcpy/131072/0/0 5278 ns 5256 ns 132710 23.2234GB/s
Test: Build and benchmark on marlin
Bug: http://b/63992911
Change-Id: Id85961aca18ba841bcbcfe0d8b162843eab30584
diff --git a/libc/Android.bp b/libc/Android.bp
index 00686ac..f5e2285 100644
--- a/libc/Android.bp
+++ b/libc/Android.bp
@@ -1042,45 +1042,15 @@
denver64: {
srcs: [
"arch-arm64/denver64/bionic/memcpy.S",
+ "arch-arm64/denver64/bionic/memmove.S",
"arch-arm64/denver64/bionic/memset.S",
],
exclude_srcs: [
"arch-arm64/generic/bionic/memcpy.S",
+ "arch-arm64/generic/bionic/memmove.S",
"arch-arm64/generic/bionic/memset.S",
],
},
- cortex_a53: {
- srcs: [
- "arch-arm64/cortex-a53/bionic/memmove.S",
- ],
- exclude_srcs: [
- "arch-arm64/generic/bionic/memmove.S",
- ],
- },
- cortex_a55: {
- srcs: [
- "arch-arm64/cortex-a53/bionic/memmove.S",
- ],
- exclude_srcs: [
- "arch-arm64/generic/bionic/memmove.S",
- ],
- },
- cortex_a73: {
- srcs: [
- "arch-arm64/cortex-a53/bionic/memmove.S",
- ],
- exclude_srcs: [
- "arch-arm64/generic/bionic/memmove.S",
- ],
- },
- cortex_a75: {
- srcs: [
- "arch-arm64/cortex-a53/bionic/memmove.S",
- ],
- exclude_srcs: [
- "arch-arm64/generic/bionic/memmove.S",
- ],
- },
},
mips: {
diff --git a/libc/arch-arm64/cortex-a53/bionic/memmove.S b/libc/arch-arm64/cortex-a53/bionic/memmove.S
deleted file mode 100644
index c50112d..0000000
--- a/libc/arch-arm64/cortex-a53/bionic/memmove.S
+++ /dev/null
@@ -1,153 +0,0 @@
-/* Copyright (c) 2013, Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
-/*
- * Copyright (c) 2015 ARM Ltd
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- * products derived from this software without specific prior written
- * permission.
- *
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, unaligned accesses, wchar_t is 4 bytes
- */
-
-#include <private/bionic_asm.h>
-
-/* Parameters and result. */
-#define dstin x0
-#define src x1
-#define count x2
-#define srcend x3
-#define dstend x4
-#define tmp1 x5
-#define A_l x6
-#define A_h x7
-#define B_l x8
-#define B_h x9
-#define C_l x10
-#define C_h x11
-#define D_l x12
-#define D_h x13
-#define E_l count
-#define E_h tmp1
-
-/* All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
- Larger backwards copies are also handled by memcpy. The only remaining
- case is forward large copies. The destination is aligned, and an
- unrolled loop processes 64 bytes per iteration.
-*/
-
-#if defined(WMEMMOVE)
-ENTRY(wmemmove)
- lsl count, count, #2
-#else
-ENTRY(memmove)
-#endif
- sub tmp1, dstin, src
- cmp count, 96
- ccmp tmp1, count, 2, hi
- b.hs memcpy
-
- cbz tmp1, 3f
- add dstend, dstin, count
- add srcend, src, count
-
- /* Align dstend to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 96 bytes
- to copy, so copy 16 bytes unaligned and then align. The loop
- copies 64 bytes per iteration and prefetches one iteration ahead. */
-
- and tmp1, dstend, 15
- ldp D_l, D_h, [srcend, -16]
- sub srcend, srcend, tmp1
- sub count, count, tmp1
- ldp A_l, A_h, [srcend, -16]
- stp D_l, D_h, [dstend, -16]
- ldp B_l, B_h, [srcend, -32]
- ldp C_l, C_h, [srcend, -48]
- ldp D_l, D_h, [srcend, -64]!
- sub dstend, dstend, tmp1
- subs count, count, 128
- b.ls 2f
- nop
-1:
- stp A_l, A_h, [dstend, -16]
- ldp A_l, A_h, [srcend, -16]
- stp B_l, B_h, [dstend, -32]
- ldp B_l, B_h, [srcend, -32]
- stp C_l, C_h, [dstend, -48]
- ldp C_l, C_h, [srcend, -48]
- stp D_l, D_h, [dstend, -64]!
- ldp D_l, D_h, [srcend, -64]!
- subs count, count, 64
- b.hi 1b
-
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the start even if
- there is just 1 byte left. */
-2:
- ldp E_l, E_h, [src, 48]
- stp A_l, A_h, [dstend, -16]
- ldp A_l, A_h, [src, 32]
- stp B_l, B_h, [dstend, -32]
- ldp B_l, B_h, [src, 16]
- stp C_l, C_h, [dstend, -48]
- ldp C_l, C_h, [src]
- stp D_l, D_h, [dstend, -64]
- stp E_l, E_h, [dstin, 48]
- stp A_l, A_h, [dstin, 32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstin]
-3: ret
-
-#if defined(WMEMMOVE)
-END(wmemmove)
-#else
-END(memmove)
-#endif
diff --git a/libc/arch-arm64/denver64/bionic/memmove.S b/libc/arch-arm64/denver64/bionic/memmove.S
new file mode 100644
index 0000000..739ce49
--- /dev/null
+++ b/libc/arch-arm64/denver64/bionic/memmove.S
@@ -0,0 +1,329 @@
+/* Copyright (c) 2014, Linaro Limited
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the Linaro nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Unaligned accesses
+ * wchar_t is 4 bytes
+ */
+
+#include <private/bionic_asm.h>
+
+/* Parameters and result. */
+#define dstin x0
+#define src x1
+#define count x2
+#define tmp1 x3
+#define tmp1w w3
+#define tmp2 x4
+#define tmp2w w4
+#define tmp3 x5
+#define tmp3w w5
+#define dst x6
+
+#define A_l x7
+#define A_h x8
+#define B_l x9
+#define B_h x10
+#define C_l x11
+#define C_h x12
+#define D_l x13
+#define D_h x14
+
+#if defined(WMEMMOVE)
+ENTRY(wmemmove)
+ lsl count, count, #2
+#else
+ENTRY(memmove)
+#endif
+ cmp dstin, src
+ b.lo .Ldownwards
+ add tmp1, src, count
+ cmp dstin, tmp1
+ b.hs memcpy /* No overlap. */
+
+ /* Upwards move with potential overlap.
+ * Need to move from the tail backwards. SRC and DST point one
+ * byte beyond the remaining data to move. */
+ add dst, dstin, count
+ add src, src, count
+ cmp count, #64
+ b.ge .Lmov_not_short_up
+
+ /* Deal with small moves quickly by dropping straight into the
+ * exit block. */
+.Ltail63up:
+ /* Move up to 48 bytes of data. At this point we only need the
+ * bottom 6 bits of count to be accurate. */
+ ands tmp1, count, #0x30
+ b.eq .Ltail15up
+ sub dst, dst, tmp1
+ sub src, src, tmp1
+ cmp tmp1w, #0x20
+ b.eq 1f
+ b.lt 2f
+ ldp A_l, A_h, [src, #32]
+ stp A_l, A_h, [dst, #32]
+1:
+ ldp A_l, A_h, [src, #16]
+ stp A_l, A_h, [dst, #16]
+2:
+ ldp A_l, A_h, [src]
+ stp A_l, A_h, [dst]
+.Ltail15up:
+ /* Move up to 15 bytes of data. Does not assume additional data
+ * being moved. */
+ tbz count, #3, 1f
+ ldr tmp1, [src, #-8]!
+ str tmp1, [dst, #-8]!
+1:
+ tbz count, #2, 1f
+ ldr tmp1w, [src, #-4]!
+ str tmp1w, [dst, #-4]!
+1:
+ tbz count, #1, 1f
+ ldrh tmp1w, [src, #-2]!
+ strh tmp1w, [dst, #-2]!
+1:
+ tbz count, #0, 1f
+ ldrb tmp1w, [src, #-1]
+ strb tmp1w, [dst, #-1]
+1:
+ ret
+
+.Lmov_not_short_up:
+ /* We don't much care about the alignment of DST, but we want SRC
+ * to be 128-bit (16 byte) aligned so that we don't cross cache line
+ * boundaries on both loads and stores. */
+ ands tmp2, src, #15 /* Bytes to reach alignment. */
+ b.eq 2f
+ sub count, count, tmp2
+ /* Move enough data to reach alignment; unlike memcpy, we have to
+ * be aware of the overlap, which means we can't move data twice. */
+ tbz tmp2, #3, 1f
+ ldr tmp1, [src, #-8]!
+ str tmp1, [dst, #-8]!
+1:
+ tbz tmp2, #2, 1f
+ ldr tmp1w, [src, #-4]!
+ str tmp1w, [dst, #-4]!
+1:
+ tbz tmp2, #1, 1f
+ ldrh tmp1w, [src, #-2]!
+ strh tmp1w, [dst, #-2]!
+1:
+ tbz tmp2, #0, 1f
+ ldrb tmp1w, [src, #-1]!
+ strb tmp1w, [dst, #-1]!
+1:
+
+ /* There may be less than 63 bytes to go now. */
+ cmp count, #63
+ b.le .Ltail63up
+2:
+ subs count, count, #128
+ b.ge .Lmov_body_large_up
+ /* Less than 128 bytes to move, so handle 64 here and then jump
+ * to the tail. */
+ ldp A_l, A_h, [src, #-64]!
+ ldp B_l, B_h, [src, #16]
+ ldp C_l, C_h, [src, #32]
+ ldp D_l, D_h, [src, #48]
+ stp A_l, A_h, [dst, #-64]!
+ stp B_l, B_h, [dst, #16]
+ stp C_l, C_h, [dst, #32]
+ stp D_l, D_h, [dst, #48]
+ tst count, #0x3f
+ b.ne .Ltail63up
+ ret
+
+ /* Critical loop. Start at a new Icache line boundary. Assuming
+ * 64 bytes per line this ensures the entire loop is in one line. */
+ .p2align 6
+.Lmov_body_large_up:
+ /* There are at least 128 bytes to move. */
+ ldp A_l, A_h, [src, #-16]
+ ldp B_l, B_h, [src, #-32]
+ ldp C_l, C_h, [src, #-48]
+ ldp D_l, D_h, [src, #-64]!
+1:
+ stp A_l, A_h, [dst, #-16]
+ ldp A_l, A_h, [src, #-16]
+ stp B_l, B_h, [dst, #-32]
+ ldp B_l, B_h, [src, #-32]
+ stp C_l, C_h, [dst, #-48]
+ ldp C_l, C_h, [src, #-48]
+ stp D_l, D_h, [dst, #-64]!
+ ldp D_l, D_h, [src, #-64]!
+ subs count, count, #64
+ b.ge 1b
+ stp A_l, A_h, [dst, #-16]
+ stp B_l, B_h, [dst, #-32]
+ stp C_l, C_h, [dst, #-48]
+ stp D_l, D_h, [dst, #-64]!
+ tst count, #0x3f
+ b.ne .Ltail63up
+ ret
+
+
+.Ldownwards:
+ /* For a downwards move we can safely use memcpy provided that
+ * DST is more than 16 bytes away from SRC. */
+ sub tmp1, src, #16
+ cmp dstin, tmp1
+ b.ls memcpy /* May overlap, but not critically. */
+
+ mov dst, dstin /* Preserve DSTIN for return value. */
+ cmp count, #64
+ b.ge .Lmov_not_short_down
+
+ /* Deal with small moves quickly by dropping straight into the
+ * exit block. */
+.Ltail63down:
+ /* Move up to 48 bytes of data. At this point we only need the
+ * bottom 6 bits of count to be accurate. */
+ ands tmp1, count, #0x30
+ b.eq .Ltail15down
+ add dst, dst, tmp1
+ add src, src, tmp1
+ cmp tmp1w, #0x20
+ b.eq 1f
+ b.lt 2f
+ ldp A_l, A_h, [src, #-48]
+ stp A_l, A_h, [dst, #-48]
+1:
+ ldp A_l, A_h, [src, #-32]
+ stp A_l, A_h, [dst, #-32]
+2:
+ ldp A_l, A_h, [src, #-16]
+ stp A_l, A_h, [dst, #-16]
+.Ltail15down:
+ /* Move up to 15 bytes of data. Does not assume additional data
+ being moved. */
+ tbz count, #3, 1f
+ ldr tmp1, [src], #8
+ str tmp1, [dst], #8
+1:
+ tbz count, #2, 1f
+ ldr tmp1w, [src], #4
+ str tmp1w, [dst], #4
+1:
+ tbz count, #1, 1f
+ ldrh tmp1w, [src], #2
+ strh tmp1w, [dst], #2
+1:
+ tbz count, #0, 1f
+ ldrb tmp1w, [src]
+ strb tmp1w, [dst]
+1:
+ ret
+
+.Lmov_not_short_down:
+ /* We don't much care about the alignment of DST, but we want SRC
+ * to be 128-bit (16 byte) aligned so that we don't cross cache line
+ * boundaries on both loads and stores. */
+ neg tmp2, src
+ ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
+ b.eq 2f
+ sub count, count, tmp2
+ /* Move enough data to reach alignment; unlike memcpy, we have to
+ * be aware of the overlap, which means we can't move data twice. */
+ tbz tmp2, #3, 1f
+ ldr tmp1, [src], #8
+ str tmp1, [dst], #8
+1:
+ tbz tmp2, #2, 1f
+ ldr tmp1w, [src], #4
+ str tmp1w, [dst], #4
+1:
+ tbz tmp2, #1, 1f
+ ldrh tmp1w, [src], #2
+ strh tmp1w, [dst], #2
+1:
+ tbz tmp2, #0, 1f
+ ldrb tmp1w, [src], #1
+ strb tmp1w, [dst], #1
+1:
+
+ /* There may be less than 63 bytes to go now. */
+ cmp count, #63
+ b.le .Ltail63down
+2:
+ subs count, count, #128
+ b.ge .Lmov_body_large_down
+ /* Less than 128 bytes to move, so handle 64 here and then jump
+ * to the tail. */
+ ldp A_l, A_h, [src]
+ ldp B_l, B_h, [src, #16]
+ ldp C_l, C_h, [src, #32]
+ ldp D_l, D_h, [src, #48]
+ stp A_l, A_h, [dst]
+ stp B_l, B_h, [dst, #16]
+ stp C_l, C_h, [dst, #32]
+ stp D_l, D_h, [dst, #48]
+ tst count, #0x3f
+ add src, src, #64
+ add dst, dst, #64
+ b.ne .Ltail63down
+ ret
+
+ /* Critical loop. Start at a new cache line boundary. Assuming
+ * 64 bytes per line this ensures the entire loop is in one line. */
+ .p2align 6
+.Lmov_body_large_down:
+ /* There are at least 128 bytes to move. */
+ ldp A_l, A_h, [src, #0]
+ sub dst, dst, #16 /* Pre-bias. */
+ ldp B_l, B_h, [src, #16]
+ ldp C_l, C_h, [src, #32]
+ ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */
+1:
+ stp A_l, A_h, [dst, #16]
+ ldp A_l, A_h, [src, #16]
+ stp B_l, B_h, [dst, #32]
+ ldp B_l, B_h, [src, #32]
+ stp C_l, C_h, [dst, #48]
+ ldp C_l, C_h, [src, #48]
+ stp D_l, D_h, [dst, #64]!
+ ldp D_l, D_h, [src, #64]!
+ subs count, count, #64
+ b.ge 1b
+ stp A_l, A_h, [dst, #16]
+ stp B_l, B_h, [dst, #32]
+ stp C_l, C_h, [dst, #48]
+ stp D_l, D_h, [dst, #64]
+ add src, src, #16
+ add dst, dst, #64 + 16
+ tst count, #0x3f
+ b.ne .Ltail63down
+ ret
+#if defined(WMEMMOVE)
+END(wmemmove)
+#else
+END(memmove)
+#endif
diff --git a/libc/arch-arm64/generic/bionic/memmove.S b/libc/arch-arm64/generic/bionic/memmove.S
index 739ce49..c50112d 100644
--- a/libc/arch-arm64/generic/bionic/memmove.S
+++ b/libc/arch-arm64/generic/bionic/memmove.S
@@ -1,4 +1,4 @@
-/* Copyright (c) 2014, Linaro Limited
+/* Copyright (c) 2013, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -22,14 +22,39 @@
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/*
+ * Copyright (c) 2015 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
/* Assumptions:
*
- * ARMv8-a, AArch64
- * Unaligned accesses
- * wchar_t is 4 bytes
+ * ARMv8-a, AArch64, unaligned accesses, wchar_t is 4 bytes
*/
#include <private/bionic_asm.h>
@@ -38,22 +63,25 @@
#define dstin x0
#define src x1
#define count x2
-#define tmp1 x3
-#define tmp1w w3
-#define tmp2 x4
-#define tmp2w w4
-#define tmp3 x5
-#define tmp3w w5
-#define dst x6
+#define srcend x3
+#define dstend x4
+#define tmp1 x5
+#define A_l x6
+#define A_h x7
+#define B_l x8
+#define B_h x9
+#define C_l x10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l count
+#define E_h tmp1
-#define A_l x7
-#define A_h x8
-#define B_l x9
-#define B_h x10
-#define C_l x11
-#define C_h x12
-#define D_l x13
-#define D_h x14
+/* All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
+ Larger backwards copies are also handled by memcpy. The only remaining
+ case is forward large copies. The destination is aligned, and an
+ unrolled loop processes 64 bytes per iteration.
+*/
#if defined(WMEMMOVE)
ENTRY(wmemmove)
@@ -61,267 +89,63 @@
#else
ENTRY(memmove)
#endif
- cmp dstin, src
- b.lo .Ldownwards
- add tmp1, src, count
- cmp dstin, tmp1
- b.hs memcpy /* No overlap. */
+ sub tmp1, dstin, src
+ cmp count, 96
+ ccmp tmp1, count, 2, hi
+ b.hs memcpy
- /* Upwards move with potential overlap.
- * Need to move from the tail backwards. SRC and DST point one
- * byte beyond the remaining data to move. */
- add dst, dstin, count
- add src, src, count
- cmp count, #64
- b.ge .Lmov_not_short_up
+ cbz tmp1, 3f
+ add dstend, dstin, count
+ add srcend, src, count
- /* Deal with small moves quickly by dropping straight into the
- * exit block. */
-.Ltail63up:
- /* Move up to 48 bytes of data. At this point we only need the
- * bottom 6 bits of count to be accurate. */
- ands tmp1, count, #0x30
- b.eq .Ltail15up
- sub dst, dst, tmp1
- sub src, src, tmp1
- cmp tmp1w, #0x20
- b.eq 1f
- b.lt 2f
- ldp A_l, A_h, [src, #32]
- stp A_l, A_h, [dst, #32]
+ /* Align dstend to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration and prefetches one iteration ahead. */
+
+ and tmp1, dstend, 15
+ ldp D_l, D_h, [srcend, -16]
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp B_l, B_h, [srcend, -32]
+ ldp C_l, C_h, [srcend, -48]
+ ldp D_l, D_h, [srcend, -64]!
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls 2f
+ nop
1:
- ldp A_l, A_h, [src, #16]
- stp A_l, A_h, [dst, #16]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp D_l, D_h, [srcend, -64]!
+ subs count, count, 64
+ b.hi 1b
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the start even if
+ there is just 1 byte left. */
2:
- ldp A_l, A_h, [src]
- stp A_l, A_h, [dst]
-.Ltail15up:
- /* Move up to 15 bytes of data. Does not assume additional data
- * being moved. */
- tbz count, #3, 1f
- ldr tmp1, [src, #-8]!
- str tmp1, [dst, #-8]!
-1:
- tbz count, #2, 1f
- ldr tmp1w, [src, #-4]!
- str tmp1w, [dst, #-4]!
-1:
- tbz count, #1, 1f
- ldrh tmp1w, [src, #-2]!
- strh tmp1w, [dst, #-2]!
-1:
- tbz count, #0, 1f
- ldrb tmp1w, [src, #-1]
- strb tmp1w, [dst, #-1]
-1:
- ret
+ ldp E_l, E_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp E_l, E_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
+3: ret
-.Lmov_not_short_up:
- /* We don't much care about the alignment of DST, but we want SRC
- * to be 128-bit (16 byte) aligned so that we don't cross cache line
- * boundaries on both loads and stores. */
- ands tmp2, src, #15 /* Bytes to reach alignment. */
- b.eq 2f
- sub count, count, tmp2
- /* Move enough data to reach alignment; unlike memcpy, we have to
- * be aware of the overlap, which means we can't move data twice. */
- tbz tmp2, #3, 1f
- ldr tmp1, [src, #-8]!
- str tmp1, [dst, #-8]!
-1:
- tbz tmp2, #2, 1f
- ldr tmp1w, [src, #-4]!
- str tmp1w, [dst, #-4]!
-1:
- tbz tmp2, #1, 1f
- ldrh tmp1w, [src, #-2]!
- strh tmp1w, [dst, #-2]!
-1:
- tbz tmp2, #0, 1f
- ldrb tmp1w, [src, #-1]!
- strb tmp1w, [dst, #-1]!
-1:
-
- /* There may be less than 63 bytes to go now. */
- cmp count, #63
- b.le .Ltail63up
-2:
- subs count, count, #128
- b.ge .Lmov_body_large_up
- /* Less than 128 bytes to move, so handle 64 here and then jump
- * to the tail. */
- ldp A_l, A_h, [src, #-64]!
- ldp B_l, B_h, [src, #16]
- ldp C_l, C_h, [src, #32]
- ldp D_l, D_h, [src, #48]
- stp A_l, A_h, [dst, #-64]!
- stp B_l, B_h, [dst, #16]
- stp C_l, C_h, [dst, #32]
- stp D_l, D_h, [dst, #48]
- tst count, #0x3f
- b.ne .Ltail63up
- ret
-
- /* Critical loop. Start at a new Icache line boundary. Assuming
- * 64 bytes per line this ensures the entire loop is in one line. */
- .p2align 6
-.Lmov_body_large_up:
- /* There are at least 128 bytes to move. */
- ldp A_l, A_h, [src, #-16]
- ldp B_l, B_h, [src, #-32]
- ldp C_l, C_h, [src, #-48]
- ldp D_l, D_h, [src, #-64]!
-1:
- stp A_l, A_h, [dst, #-16]
- ldp A_l, A_h, [src, #-16]
- stp B_l, B_h, [dst, #-32]
- ldp B_l, B_h, [src, #-32]
- stp C_l, C_h, [dst, #-48]
- ldp C_l, C_h, [src, #-48]
- stp D_l, D_h, [dst, #-64]!
- ldp D_l, D_h, [src, #-64]!
- subs count, count, #64
- b.ge 1b
- stp A_l, A_h, [dst, #-16]
- stp B_l, B_h, [dst, #-32]
- stp C_l, C_h, [dst, #-48]
- stp D_l, D_h, [dst, #-64]!
- tst count, #0x3f
- b.ne .Ltail63up
- ret
-
-
-.Ldownwards:
- /* For a downwards move we can safely use memcpy provided that
- * DST is more than 16 bytes away from SRC. */
- sub tmp1, src, #16
- cmp dstin, tmp1
- b.ls memcpy /* May overlap, but not critically. */
-
- mov dst, dstin /* Preserve DSTIN for return value. */
- cmp count, #64
- b.ge .Lmov_not_short_down
-
- /* Deal with small moves quickly by dropping straight into the
- * exit block. */
-.Ltail63down:
- /* Move up to 48 bytes of data. At this point we only need the
- * bottom 6 bits of count to be accurate. */
- ands tmp1, count, #0x30
- b.eq .Ltail15down
- add dst, dst, tmp1
- add src, src, tmp1
- cmp tmp1w, #0x20
- b.eq 1f
- b.lt 2f
- ldp A_l, A_h, [src, #-48]
- stp A_l, A_h, [dst, #-48]
-1:
- ldp A_l, A_h, [src, #-32]
- stp A_l, A_h, [dst, #-32]
-2:
- ldp A_l, A_h, [src, #-16]
- stp A_l, A_h, [dst, #-16]
-.Ltail15down:
- /* Move up to 15 bytes of data. Does not assume additional data
- being moved. */
- tbz count, #3, 1f
- ldr tmp1, [src], #8
- str tmp1, [dst], #8
-1:
- tbz count, #2, 1f
- ldr tmp1w, [src], #4
- str tmp1w, [dst], #4
-1:
- tbz count, #1, 1f
- ldrh tmp1w, [src], #2
- strh tmp1w, [dst], #2
-1:
- tbz count, #0, 1f
- ldrb tmp1w, [src]
- strb tmp1w, [dst]
-1:
- ret
-
-.Lmov_not_short_down:
- /* We don't much care about the alignment of DST, but we want SRC
- * to be 128-bit (16 byte) aligned so that we don't cross cache line
- * boundaries on both loads and stores. */
- neg tmp2, src
- ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
- b.eq 2f
- sub count, count, tmp2
- /* Move enough data to reach alignment; unlike memcpy, we have to
- * be aware of the overlap, which means we can't move data twice. */
- tbz tmp2, #3, 1f
- ldr tmp1, [src], #8
- str tmp1, [dst], #8
-1:
- tbz tmp2, #2, 1f
- ldr tmp1w, [src], #4
- str tmp1w, [dst], #4
-1:
- tbz tmp2, #1, 1f
- ldrh tmp1w, [src], #2
- strh tmp1w, [dst], #2
-1:
- tbz tmp2, #0, 1f
- ldrb tmp1w, [src], #1
- strb tmp1w, [dst], #1
-1:
-
- /* There may be less than 63 bytes to go now. */
- cmp count, #63
- b.le .Ltail63down
-2:
- subs count, count, #128
- b.ge .Lmov_body_large_down
- /* Less than 128 bytes to move, so handle 64 here and then jump
- * to the tail. */
- ldp A_l, A_h, [src]
- ldp B_l, B_h, [src, #16]
- ldp C_l, C_h, [src, #32]
- ldp D_l, D_h, [src, #48]
- stp A_l, A_h, [dst]
- stp B_l, B_h, [dst, #16]
- stp C_l, C_h, [dst, #32]
- stp D_l, D_h, [dst, #48]
- tst count, #0x3f
- add src, src, #64
- add dst, dst, #64
- b.ne .Ltail63down
- ret
-
- /* Critical loop. Start at a new cache line boundary. Assuming
- * 64 bytes per line this ensures the entire loop is in one line. */
- .p2align 6
-.Lmov_body_large_down:
- /* There are at least 128 bytes to move. */
- ldp A_l, A_h, [src, #0]
- sub dst, dst, #16 /* Pre-bias. */
- ldp B_l, B_h, [src, #16]
- ldp C_l, C_h, [src, #32]
- ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */
-1:
- stp A_l, A_h, [dst, #16]
- ldp A_l, A_h, [src, #16]
- stp B_l, B_h, [dst, #32]
- ldp B_l, B_h, [src, #32]
- stp C_l, C_h, [dst, #48]
- ldp C_l, C_h, [src, #48]
- stp D_l, D_h, [dst, #64]!
- ldp D_l, D_h, [src, #64]!
- subs count, count, #64
- b.ge 1b
- stp A_l, A_h, [dst, #16]
- stp B_l, B_h, [dst, #32]
- stp C_l, C_h, [dst, #48]
- stp D_l, D_h, [dst, #64]
- add src, src, #16
- add dst, dst, #64 + 16
- tst count, #0x3f
- b.ne .Ltail63down
- ret
#if defined(WMEMMOVE)
END(wmemmove)
#else