[AArch64] Optimize memcmp for medium to large sizes
This patch was originally written by Siddhesh Poyarekar and pushed on
cortex-strings [1]. This improved memcmp provides a fast path for
compares up to 16 bytes and then compares 16 bytes at a time, thus
optimizing loads from both sources.
Comparison on the default bionic and proposed optimized routines
shows the following performance improvements on A72 (using the
new proposed memcmp input data from test_memcmp.xml):
Benchmark Time CPU Time Old Time New CPU Old CPU New
--------------------------------------------------------------------------------------------------------------------
BM_string_memcmp/1/0/0 -0.2074 -0.2074 15 12 15 12
BM_string_memcmp/2/0/0 -0.5193 -0.5193 31 15 31 15
BM_string_memcmp/3/0/0 -0.1291 -0.1291 19 17 19 17
BM_string_memcmp/4/0/0 -0.2889 -0.2889 17 12 17 12
BM_string_memcmp/5/0/0 -0.2606 -0.2606 15 11 15 11
BM_string_memcmp/6/0/0 -0.1656 -0.1655 17 14 17 14
BM_string_memcmp/7/0/0 -0.1721 -0.1721 19 15 19 15
BM_string_memcmp/8/0/0 -0.3048 -0.3048 15 10 15 10
BM_string_memcmp/9/0/0 -0.3041 -0.3041 15 10 15 10
BM_string_memcmp/10/0/0 -0.3040 -0.3040 15 10 15 10
BM_string_memcmp/11/0/0 -0.3048 -0.3048 15 10 15 10
BM_string_memcmp/12/0/0 -0.3041 -0.3041 15 10 15 10
BM_string_memcmp/13/0/0 -0.3040 -0.3040 15 10 15 10
BM_string_memcmp/14/0/0 -0.3048 -0.3048 15 10 15 10
BM_string_memcmp/15/0/0 -0.3040 -0.3040 15 10 15 10
BM_string_memcmp/16/0/0 -0.3041 -0.3041 15 10 15 10
BM_string_memcmp/24/0/0 -0.1209 -0.1209 15 13 15 13
BM_string_memcmp/32/0/0 -0.3228 -0.3228 20 13 20 13
BM_string_memcmp/40/0/0 -0.2937 -0.2937 22 15 22 15
BM_string_memcmp/48/0/0 -0.3299 -0.3299 23 15 23 15
BM_string_memcmp/56/0/0 -0.1845 -0.1845 24 20 24 20
BM_string_memcmp/64/0/0 -0.2247 -0.2247 26 20 26 20
BM_string_memcmp/72/0/0 -0.1947 -0.1947 27 22 27 22
BM_string_memcmp/80/0/0 -0.2275 -0.2275 28 22 28 22
BM_string_memcmp/88/0/0 -0.2360 -0.2360 29 22 29 22
BM_string_memcmp/96/0/0 -0.2675 -0.2675 31 22 31 22
BM_string_memcmp/104/0/0 -0.2559 -0.2559 32 24 32 24
BM_string_memcmp/112/0/0 -0.2787 -0.2786 33 24 33 24
BM_string_memcmp/120/0/0 -0.2599 -0.2599 34 25 34 25
BM_string_memcmp/128/0/0 -0.2860 -0.2860 35 25 35 25
BM_string_memcmp/136/0/0 -0.4708 -0.4708 53 28 53 28
BM_string_memcmp/144/0/0 -0.4719 -0.4719 53 28 53 28
BM_string_memcmp/160/0/0 -0.4680 -0.4680 56 30 56 30
BM_string_memcmp/176/0/0 -0.4645 -0.4645 60 32 60 32
BM_string_memcmp/192/0/0 -0.4641 -0.4641 63 34 63 34
BM_string_memcmp/208/0/0 -0.4555 -0.4555 66 36 66 36
BM_string_memcmp/224/0/0 -0.4558 -0.4557 69 38 69 38
BM_string_memcmp/240/0/0 -0.4534 -0.4534 72 40 72 40
BM_string_memcmp/256/0/0 -0.4463 -0.4463 75 42 75 42
BM_string_memcmp/512/0/0 -0.3077 -0.3077 126 88 126 88
BM_string_memcmp/1024/0/0 -0.3493 -0.3493 229 149 229 149
BM_string_memcmp/8192/0/0 -0.4173 -0.4173 1729 1007 1729 1007
BM_string_memcmp/16384/0/0 -0.3855 -0.3855 3377 2076 3377 2075
BM_string_memcmp/32768/0/0 -0.2968 -0.2968 6847 4815 6847 4814
BM_string_memcmp/65536/0/0 -0.2496 -0.2496 13715 10292 13714 10291
BM_string_memcmp/131072/0/0 -0.2676 -0.2676 27354 20033 27351 20031
BM_string_memcmp/262144/0/0 -0.2319 -0.2319 54604 41943 54598 41939
BM_string_memcmp/524288/0/0 -0.2359 -0.2359 109225 83460 109212 83449
BM_string_memcmp/1048576/0/0 -0.0439 -0.0439 423367 404791 423251 404686
BM_string_memcmp/2097152/0/0 -0.0023 -0.0024 762470 760701 761956 760122
BM_string_memcmp/512/4/4 -0.2853 -0.2853 125 89 125 89
BM_string_memcmp/1024/4/4 -0.3377 -0.3377 228 151 227 151
BM_string_memcmp/8192/4/4 -0.4083 -0.4083 1706 1009 1706 1009
BM_string_memcmp/16384/4/4 -0.3853 -0.3853 3376 2075 3376 2075
BM_string_memcmp/32768/4/4 -0.2974 -0.2974 6846 4810 6845 4810
BM_string_memcmp/65536/4/4 -0.2485 -0.2485 13619 10235 13618 10234
BM_string_memcmp/131072/4/4 -0.2387 -0.2387 27056 20597 27054 20595
BM_string_memcmp/512/4/0 -0.2898 -0.2898 123 88 123 88
BM_string_memcmp/1024/4/0 -0.3401 -0.3401 225 149 225 149
BM_string_memcmp/8192/4/0 -0.4167 -0.4167 1727 1007 1727 1007
BM_string_memcmp/16384/4/0 -0.3820 -0.3820 3384 2092 3384 2091
BM_string_memcmp/32768/4/0 -0.2535 -0.2535 6886 5141 6886 5140
BM_string_memcmp/65536/4/0 -0.1897 -0.1897 13850 11223 13849 11223
BM_string_memcmp/131072/4/0 -0.1972 -0.1972 27536 22106 27533 22104
BM_string_memcmp/512/0/4 -0.2854 -0.2854 125 89 125 89
BM_string_memcmp/1024/0/4 -0.3332 -0.3333 226 151 226 151
BM_string_memcmp/8192/0/4 -0.4199 -0.4199 1740 1009 1740 1009
BM_string_memcmp/16384/0/4 -0.3811 -0.3811 3383 2094 3383 2094
BM_string_memcmp/32768/0/4 -0.2409 -0.2409 6900 5238 6899 5237
BM_string_memcmp/65536/0/4 -0.1920 -0.1920 13922 11250 13921 11248
BM_string_memcmp/131072/0/4 -0.2029 -0.2029 27699 22079 27697 22077
I see similar improvements on A54 as well:
Benchmark Time CPU Time Old Time New CPU Old CPU New
--------------------------------------------------------------------------------------------------------------------
BM_string_memcmp/1/0/0 -0.2074 -0.2074 15 12 15 12
BM_string_memcmp/2/0/0 -0.5193 -0.5193 31 15 31 15
BM_string_memcmp/3/0/0 -0.1291 -0.1291 19 17 19 17
BM_string_memcmp/4/0/0 -0.2889 -0.2889 17 12 17 12
BM_string_memcmp/5/0/0 -0.2606 -0.2606 15 11 15 11
BM_string_memcmp/6/0/0 -0.1656 -0.1655 17 14 17 14
BM_string_memcmp/7/0/0 -0.1721 -0.1721 19 15 19 15
BM_string_memcmp/8/0/0 -0.3048 -0.3048 15 10 15 10
BM_string_memcmp/9/0/0 -0.3041 -0.3041 15 10 15 10
BM_string_memcmp/10/0/0 -0.3040 -0.3040 15 10 15 10
BM_string_memcmp/11/0/0 -0.3048 -0.3048 15 10 15 10
BM_string_memcmp/12/0/0 -0.3041 -0.3041 15 10 15 10
BM_string_memcmp/13/0/0 -0.3040 -0.3040 15 10 15 10
BM_string_memcmp/14/0/0 -0.3048 -0.3048 15 10 15 10
BM_string_memcmp/15/0/0 -0.3040 -0.3040 15 10 15 10
BM_string_memcmp/16/0/0 -0.3041 -0.3041 15 10 15 10
BM_string_memcmp/24/0/0 -0.1209 -0.1209 15 13 15 13
BM_string_memcmp/32/0/0 -0.3228 -0.3228 20 13 20 13
BM_string_memcmp/40/0/0 -0.2937 -0.2937 22 15 22 15
BM_string_memcmp/48/0/0 -0.3299 -0.3299 23 15 23 15
BM_string_memcmp/56/0/0 -0.1845 -0.1845 24 20 24 20
BM_string_memcmp/64/0/0 -0.2247 -0.2247 26 20 26 20
BM_string_memcmp/72/0/0 -0.1947 -0.1947 27 22 27 22
BM_string_memcmp/80/0/0 -0.2275 -0.2275 28 22 28 22
BM_string_memcmp/88/0/0 -0.2360 -0.2360 29 22 29 22
BM_string_memcmp/96/0/0 -0.2675 -0.2675 31 22 31 22
BM_string_memcmp/104/0/0 -0.2559 -0.2559 32 24 32 24
BM_string_memcmp/112/0/0 -0.2787 -0.2786 33 24 33 24
BM_string_memcmp/120/0/0 -0.2599 -0.2599 34 25 34 25
BM_string_memcmp/128/0/0 -0.2860 -0.2860 35 25 35 25
BM_string_memcmp/136/0/0 -0.4708 -0.4708 53 28 53 28
BM_string_memcmp/144/0/0 -0.4719 -0.4719 53 28 53 28
BM_string_memcmp/160/0/0 -0.4680 -0.4680 56 30 56 30
BM_string_memcmp/176/0/0 -0.4645 -0.4645 60 32 60 32
BM_string_memcmp/192/0/0 -0.4641 -0.4641 63 34 63 34
BM_string_memcmp/208/0/0 -0.4555 -0.4555 66 36 66 36
BM_string_memcmp/224/0/0 -0.4558 -0.4557 69 38 69 38
BM_string_memcmp/240/0/0 -0.4534 -0.4534 72 40 72 40
BM_string_memcmp/256/0/0 -0.4463 -0.4463 75 42 75 42
BM_string_memcmp/512/0/0 -0.3077 -0.3077 126 88 126 88
BM_string_memcmp/1024/0/0 -0.3493 -0.3493 229 149 229 149
BM_string_memcmp/8192/0/0 -0.4173 -0.4173 1729 1007 1729 1007
BM_string_memcmp/16384/0/0 -0.3855 -0.3855 3377 2076 3377 2075
BM_string_memcmp/32768/0/0 -0.2968 -0.2968 6847 4815 6847 4814
BM_string_memcmp/65536/0/0 -0.2496 -0.2496 13715 10292 13714 10291
BM_string_memcmp/131072/0/0 -0.2676 -0.2676 27354 20033 27351 20031
BM_string_memcmp/262144/0/0 -0.2319 -0.2319 54604 41943 54598 41939
BM_string_memcmp/524288/0/0 -0.2359 -0.2359 109225 83460 109212 83449
BM_string_memcmp/1048576/0/0 -0.0439 -0.0439 423367 404791 423251 404686
BM_string_memcmp/2097152/0/0 -0.0023 -0.0024 762470 760701 761956 760122
BM_string_memcmp/512/4/4 -0.2853 -0.2853 125 89 125 89
BM_string_memcmp/1024/4/4 -0.3377 -0.3377 228 151 227 151
BM_string_memcmp/8192/4/4 -0.4083 -0.4083 1706 1009 1706 1009
BM_string_memcmp/16384/4/4 -0.3853 -0.3853 3376 2075 3376 2075
BM_string_memcmp/32768/4/4 -0.2974 -0.2974 6846 4810 6845 4810
BM_string_memcmp/65536/4/4 -0.2485 -0.2485 13619 10235 13618 10234
BM_string_memcmp/131072/4/4 -0.2387 -0.2387 27056 20597 27054 20595
BM_string_memcmp/512/4/0 -0.2898 -0.2898 123 88 123 88
BM_string_memcmp/1024/4/0 -0.3401 -0.3401 225 149 225 149
BM_string_memcmp/8192/4/0 -0.4167 -0.4167 1727 1007 1727 1007
BM_string_memcmp/16384/4/0 -0.3820 -0.3820 3384 2092 3384 2091
BM_string_memcmp/32768/4/0 -0.2535 -0.2535 6886 5141 6886 5140
BM_string_memcmp/65536/4/0 -0.1897 -0.1897 13850 11223 13849 11223
BM_string_memcmp/131072/4/0 -0.1972 -0.1972 27536 22106 27533 22104
BM_string_memcmp/512/0/4 -0.2854 -0.2854 125 89 125 89
BM_string_memcmp/1024/0/4 -0.3332 -0.3333 226 151 226 151
BM_string_memcmp/8192/0/4 -0.4199 -0.4199 1740 1009 1740 1009
BM_string_memcmp/16384/0/4 -0.3811 -0.3811 3383 2094 3383 2094
BM_string_memcmp/32768/0/4 -0.2409 -0.2409 6900 5238 6899 5237
BM_string_memcmp/65536/0/4 -0.1920 -0.1920 13922 11250 13921 11248
BM_string_memcmp/131072/0/4 -0.2029 -0.2029 27699 22079 27697 22077
[1] Commit id: f77e4c932b4fd65177b57dd5e220bd17fb4037d6
Test: bionic tests and benchmarks on aarch64.
Change-Id: I2791e2b20d1c0ad429e8e5a41d3e47b1ac02c921
diff --git a/libc/arch-arm64/generic/bionic/memcmp.S b/libc/arch-arm64/generic/bionic/memcmp.S
index 3a138bf..bff54ae 100644
--- a/libc/arch-arm64/generic/bionic/memcmp.S
+++ b/libc/arch-arm64/generic/bionic/memcmp.S
@@ -33,6 +33,8 @@
#include <private/bionic_asm.h>
+#define L(l) .L ## l
+
/* Parameters and result. */
#define src1 x0
#define src2 x1
@@ -42,88 +44,124 @@
/* Internal variables. */
#define data1 x3
#define data1w w3
-#define data2 x4
-#define data2w w4
-#define tmp1 x5
+#define data1h x4
+#define data2 x5
+#define data2w w5
+#define data2h x6
+#define tmp1 x7
+#define tmp2 x8
/* Small inputs of less than 8 bytes are handled separately. This allows the
- main code to be sped up using unaligned loads since there are now at least
+ main code to be sped up using unaligned loads since there are now at least
8 bytes to be compared. If the first 8 bytes are equal, align src1.
This ensures each iteration does at most one unaligned access even if both
src1 and src2 are unaligned, and mutually aligned inputs behave as if
- aligned. After the main loop, process the last 8 bytes using unaligned
+ aligned. After the main loop, process the last 16 bytes using unaligned
accesses. */
-.p2align 6
ENTRY(memcmp)
+.p2align 6
subs limit, limit, 8
- b.lo .Lless8
+ b.lo L(less8)
/* Limit >= 8, so check first 8 bytes using unaligned loads. */
ldr data1, [src1], 8
ldr data2, [src2], 8
- and tmp1, src1, 7
- add limit, limit, tmp1
cmp data1, data2
- bne .Lreturn
+ b.ne L(return)
+
+ subs limit, limit, 8
+ b.gt L(more16)
+
+ ldr data1, [src1, limit]
+ ldr data2, [src2, limit]
+ b L(return)
+
+L(more16):
+ ldr data1, [src1], 8
+ ldr data2, [src2], 8
+ cmp data1, data2
+ bne L(return)
+
+ /* Jump directly to comparing the last 16 bytes for strings of
+ 32 bytes or fewer. */
+ subs limit, limit, 16
+ b.ls L(last_bytes)
+
+ /* We overlap loads between 0-32 bytes at either side of SRC1 when we
+ try to align, so limit it only to strings larger than 128 bytes. */
+ cmp limit, 96
+ b.ls L(loop16)
/* Align src1 and adjust src2 with bytes not yet done. */
+ and tmp1, src1, 15
+ add limit, limit, tmp1
sub src1, src1, tmp1
sub src2, src2, tmp1
- subs limit, limit, 8
- b.ls .Llast_bytes
-
- /* Loop performing 8 bytes per iteration using aligned src1.
- Limit is pre-decremented by 8 and must be larger than zero.
- Exit if <= 8 bytes left to do or if the data is not equal. */
+ /* Loop performing 16 bytes per iteration using aligned src1.
+ Limit is pre-decremented by 16 and must be larger than zero.
+ Exit if <= 16 bytes left to do or if the data is not equal. */
.p2align 4
-.Lloop8:
- ldr data1, [src1], 8
- ldr data2, [src2], 8
- subs limit, limit, 8
- ccmp data1, data2, 0, hi /* NZCV = 0b0000. */
- b.eq .Lloop8
+L(loop16):
+ ldp data1, data1h, [src1], 16
+ ldp data2, data2h, [src2], 16
+ subs limit, limit, 16
+ ccmp data1, data2, 0, hi
+ ccmp data1h, data2h, 0, eq
+ b.eq L(loop16)
cmp data1, data2
- bne .Lreturn
+ bne L(return)
+ mov data1, data1h
+ mov data2, data2h
+ cmp data1, data2
+ bne L(return)
- /* Compare last 1-8 bytes using unaligned access. */
-.Llast_bytes:
- ldr data1, [src1, limit]
- ldr data2, [src2, limit]
+ /* Compare last 1-16 bytes using unaligned access. */
+L(last_bytes):
+ add src1, src1, limit
+ add src2, src2, limit
+ ldp data1, data1h, [src1]
+ ldp data2, data2h, [src2]
+ cmp data1, data2
+ bne L(return)
+ mov data1, data1h
+ mov data2, data2h
+ cmp data1, data2
/* Compare data bytes and set return value to 0, -1 or 1. */
-.Lreturn:
+L(return):
#ifndef __AARCH64EB__
rev data1, data1
rev data2, data2
#endif
cmp data1, data2
-.Lret_eq:
+L(ret_eq):
cset result, ne
cneg result, result, lo
- ret
+ ret
.p2align 4
/* Compare up to 8 bytes. Limit is [-8..-1]. */
-.Lless8:
+L(less8):
adds limit, limit, 4
- b.lo .Lless4
+ b.lo L(less4)
ldr data1w, [src1], 4
ldr data2w, [src2], 4
cmp data1w, data2w
- b.ne .Lreturn
+ b.ne L(return)
sub limit, limit, 4
-.Lless4:
+L(less4):
adds limit, limit, 4
- beq .Lret_eq
-.Lbyte_loop:
+ beq L(ret_eq)
+L(byte_loop):
ldrb data1w, [src1], 1
ldrb data2w, [src2], 1
subs limit, limit, 1
ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
- b.eq .Lbyte_loop
+ b.eq L(byte_loop)
sub result, data1w, data2w
ret
+
END(memcmp)