[AArch64] Improve strcmp performance for misaligned strings
This patch was originally written by Siddhesh Poyarekar and pushed on
cortex-strings [1]. Replace the simple byte-wise compare in the
misaligned case with a dword compare with page boundary checks in
place. For simplicity its uses a 4K page boundary so that it does not
have to query the actual page size on the system.
Comparison on the default bionic and proposed optimized routines
shows the following performance improvements on A64 (using the
new proposed memcmp input data from test_strcmp.xml):
- Small improvement for aligned arguments with sizes up to 56 bytes
(from 10% to 20%).
- Large improvements for unaligned arguments for small sizes (from
3 to 256 bytes).
Benchmark Time CPU Time Old Time New CPU Old CPU New
-------------------------------------------------------------------------------------------------------------------
BM_string_strcmp/1/0/0 +0.0034 +0.0034 11 11 11 11
BM_string_strcmp/2/0/0 +0.0000 +0.0000 11 11 11 11
BM_string_strcmp/3/0/0 -0.1726 -0.1726 11 9 11 9
BM_string_strcmp/4/0/0 -0.1726 -0.1726 11 9 11 9
BM_string_strcmp/5/0/0 -0.1726 -0.1726 11 9 11 9
BM_string_strcmp/6/0/0 -0.1719 -0.1719 11 9 11 9
BM_string_strcmp/7/0/0 -0.1724 -0.1724 11 9 11 9
BM_string_strcmp/8/0/0 -0.1718 -0.1718 11 9 11 9
BM_string_strcmp/9/0/0 -0.2008 -0.2008 16 13 16 13
BM_string_strcmp/10/0/0 -0.2008 -0.2008 16 13 16 13
BM_string_strcmp/11/0/0 -0.2040 -0.2040 16 13 16 13
BM_string_strcmp/12/0/0 -0.1991 -0.1991 16 13 16 13
BM_string_strcmp/13/0/0 -0.1997 -0.1997 16 13 16 13
BM_string_strcmp/14/0/0 -0.1988 -0.1989 16 13 16 13
BM_string_strcmp/15/0/0 -0.2006 -0.2006 16 13 16 13
BM_string_strcmp/16/0/0 -0.2043 -0.2043 16 13 16 13
BM_string_strcmp/24/0/0 -0.1927 -0.1927 18 15 18 15
BM_string_strcmp/32/0/0 -0.1743 -0.1743 20 17 20 17
BM_string_strcmp/40/0/0 -0.1427 -0.1427 22 19 22 19
BM_string_strcmp/48/0/0 -0.1053 -0.1053 24 22 24 22
BM_string_strcmp/56/0/0 -0.0805 -0.0805 26 24 26 24
BM_string_strcmp/64/0/0 -0.0454 -0.0454 28 27 28 27
BM_string_strcmp/72/0/0 -0.0303 -0.0303 30 29 30 29
BM_string_strcmp/80/0/0 -0.0111 -0.0111 32 32 32 32
BM_string_strcmp/88/0/0 -0.0004 -0.0004 34 34 34 34
BM_string_strcmp/96/0/0 -0.0058 -0.0058 37 37 37 37
BM_string_strcmp/104/0/0 +0.0000 +0.0000 40 40 40 40
BM_string_strcmp/112/0/0 -0.0457 -0.0457 61 58 61 58
BM_string_strcmp/120/0/0 -0.0486 -0.0487 61 58 61 58
BM_string_strcmp/128/0/0 -0.0499 -0.0499 64 61 64 61
BM_string_strcmp/136/0/0 -0.0529 -0.0529 66 63 66 63
BM_string_strcmp/144/0/0 -0.0492 -0.0492 69 66 69 66
BM_string_strcmp/160/0/0 -0.0459 -0.0459 74 71 74 71
BM_string_strcmp/176/0/0 -0.0400 -0.0401 79 76 79 76
BM_string_strcmp/192/0/0 -0.0378 -0.0378 85 81 85 81
BM_string_strcmp/208/0/0 -0.0009 -0.0009 89 89 89 89
BM_string_strcmp/224/0/0 -0.0003 -0.0003 95 95 95 95
BM_string_strcmp/240/0/0 -0.0320 -0.0320 100 96 100 96
BM_string_strcmp/256/0/0 -0.0303 -0.0304 105 102 105 102
BM_string_strcmp/512/0/0 -0.0171 -0.0171 187 183 187 183
BM_string_strcmp/1024/0/0 -0.0091 -0.0091 350 347 350 347
BM_string_strcmp/8192/0/0 -0.0030 -0.0031 2668 2660 2668 2660
BM_string_strcmp/16384/0/0 +0.0007 +0.0007 5449 5452 5448 5452
BM_string_strcmp/32768/0/0 +0.0635 +0.0635 10868 11558 10867 11557
BM_string_strcmp/65536/0/0 -0.0017 -0.0017 21824 21786 21822 21784
BM_string_strcmp/131072/0/0 +0.0012 +0.0012 43485 43536 43480 43532
BM_string_strcmp/1/4/0 +0.7630 +0.7630 7 12 7 12
BM_string_strcmp/2/4/0 +0.9265 +0.9265 12 23 12 23
BM_string_strcmp/3/4/0 -0.0000 -0.0000 14 14 14 14
BM_string_strcmp/4/4/0 +0.0372 +0.0372 19 19 19 19
BM_string_strcmp/6/4/0 -0.0921 -0.0921 20 19 20 19
BM_string_strcmp/7/4/0 -0.0291 -0.0291 19 19 19 19
BM_string_strcmp/8/4/0 +0.0648 +0.0648 20 22 20 22
BM_string_strcmp/9/4/0 +0.0001 -0.0055 22 22 22 22
BM_string_strcmp/10/4/0 -0.1924 -0.1924 23 19 23 19
BM_string_strcmp/11/4/0 -0.2347 -0.2347 24 19 24 19
BM_string_strcmp/12/4/0 -0.2738 -0.2739 26 19 26 19
BM_string_strcmp/13/4/0 -0.3804 -0.3804 42 26 42 26
BM_string_strcmp/14/4/0 -0.3581 -0.3582 41 26 41 26
BM_string_strcmp/15/4/0 -0.3905 -0.3905 43 26 43 26
BM_string_strcmp/16/4/0 -0.4068 -0.4068 44 26 44 26
BM_string_strcmp/24/4/0 -0.4917 -0.4917 57 29 57 29
BM_string_strcmp/32/4/0 -0.5607 -0.5607 70 31 70 31
BM_string_strcmp/40/4/0 -0.5940 -0.5940 82 33 82 33
BM_string_strcmp/48/4/0 -0.5303 -0.5302 95 45 95 45
BM_string_strcmp/56/4/0 -0.4975 -0.4975 108 54 108 54
BM_string_strcmp/64/4/0 -0.5167 -0.5167 121 58 121 58
BM_string_strcmp/72/4/0 -0.5325 -0.5325 133 62 133 62
BM_string_strcmp/80/4/0 -0.5523 -0.5523 146 65 146 65
BM_string_strcmp/88/4/0 -0.5686 -0.5686 159 69 159 69
BM_string_strcmp/96/4/0 -0.5815 -0.5815 172 72 172 72
BM_string_strcmp/104/4/0 -0.5931 -0.5931 185 75 185 75
BM_string_strcmp/112/4/0 -0.6046 -0.6046 197 78 197 78
BM_string_strcmp/120/4/0 -0.6113 -0.6113 210 82 210 82
BM_string_strcmp/128/4/0 -0.6186 -0.6186 223 85 223 85
BM_string_strcmp/136/4/0 -0.6278 -0.6278 237 88 237 88
BM_string_strcmp/144/4/0 -0.6410 -0.6410 253 91 253 91
BM_string_strcmp/160/4/0 -0.6506 -0.6506 280 98 280 98
BM_string_strcmp/176/4/0 -0.6593 -0.6593 304 104 304 104
BM_string_strcmp/192/4/0 -0.6647 -0.6647 330 111 330 111
BM_string_strcmp/208/4/0 -0.6741 -0.6741 357 116 357 116
BM_string_strcmp/224/4/0 -0.6761 -0.6761 381 123 381 123
BM_string_strcmp/240/4/0 -0.6824 -0.6824 406 129 406 129
BM_string_strcmp/256/4/0 -0.6846 -0.6846 432 136 432 136
BM_string_strcmp/1/0/4 +1.0024 +1.0024 7 14 7 14
BM_string_strcmp/2/0/4 +0.1591 +0.1591 12 14 12 14
BM_string_strcmp/3/0/4 -0.0015 -0.0015 14 14 14 14
BM_string_strcmp/4/0/4 -0.0809 -0.0809 15 14 15 14
BM_string_strcmp/5/0/4 -0.1535 -0.1536 17 14 17 14
BM_string_strcmp/6/0/4 -0.2111 -0.2111 18 14 18 14
BM_string_strcmp/7/0/4 -0.2650 -0.2650 19 14 19 14
BM_string_strcmp/8/0/4 -0.3118 -0.3118 20 14 20 14
BM_string_strcmp/9/0/4 -0.1741 -0.1740 22 18 22 18
BM_string_strcmp/10/0/4 -0.2201 -0.2201 23 18 23 18
BM_string_strcmp/11/0/4 -0.2610 -0.2610 24 18 24 18
BM_string_strcmp/12/0/4 -0.2987 -0.2987 26 18 26 18
BM_string_strcmp/13/0/4 -0.5748 -0.5748 42 18 42 18
BM_string_strcmp/14/0/4 -0.5796 -0.5796 43 18 43 18
BM_string_strcmp/15/0/4 -0.6167 -0.6167 47 18 47 18
BM_string_strcmp/16/0/4 -0.6303 -0.6303 49 18 49 18
BM_string_strcmp/24/0/4 -0.6557 -0.6557 61 21 61 21
BM_string_strcmp/32/0/4 -0.6612 -0.6612 70 24 70 24
BM_string_strcmp/40/0/4 -0.6812 -0.6813 82 26 82 26
BM_string_strcmp/48/0/4 -0.6974 -0.6974 95 29 95 29
BM_string_strcmp/56/0/4 -0.7151 -0.7151 108 31 108 31
BM_string_strcmp/64/0/4 -0.5717 -0.5717 121 52 121 52
BM_string_strcmp/72/0/4 -0.5927 -0.5927 134 54 134 54
BM_string_strcmp/80/0/4 -0.6004 -0.6004 146 58 146 58
BM_string_strcmp/88/0/4 -0.6145 -0.6145 159 61 159 61
BM_string_strcmp/96/0/4 -0.6287 -0.6287 172 64 172 64
BM_string_strcmp/104/0/4 -0.6351 -0.6351 185 67 185 67
BM_string_strcmp/112/0/4 -0.6423 -0.6423 197 71 197 71
BM_string_strcmp/120/0/4 -0.6489 -0.6489 210 74 210 74
BM_string_strcmp/128/0/4 -0.6578 -0.6578 223 76 223 76
BM_string_strcmp/136/0/4 -0.6597 -0.6597 236 80 236 80
BM_string_strcmp/144/0/4 -0.6674 -0.6674 250 83 250 83
BM_string_strcmp/160/0/4 -0.6751 -0.6751 274 89 274 89
BM_string_strcmp/176/0/4 -0.6798 -0.6798 300 96 300 96
BM_string_strcmp/192/0/4 -0.6873 -0.6855 327 102 325 102
BM_string_strcmp/208/0/4 -0.6903 -0.6903 351 109 351 109
BM_string_strcmp/224/0/4 -0.6907 -0.6907 376 116 376 116
BM_string_strcmp/240/0/4 -0.6897 -0.6897 402 125 402 125
BM_string_strcmp/256/0/4 -0.6937 -0.6937 427 131 427 131
BM_string_strcmp/1/4/4 +0.0009 +0.0009 14 14 14 14
BM_string_strcmp/2/4/4 -0.2229 -0.2229 14 11 14 11
BM_string_strcmp/3/4/4 -0.2256 -0.2256 14 11 14 11
BM_string_strcmp/4/4/4 -0.2241 -0.2240 14 11 14 11
BM_string_strcmp/5/4/4 -0.2220 -0.2220 20 15 20 15
BM_string_strcmp/6/4/4 -0.2267 -0.2267 20 15 20 15
BM_string_strcmp/7/4/4 -0.2228 -0.2227 20 15 20 15
BM_string_strcmp/8/4/4 -0.2219 -0.2219 20 15 20 15
BM_string_strcmp/9/4/4 -0.2220 -0.2220 20 15 20 15
BM_string_strcmp/10/4/4 -0.2227 -0.2227 20 15 20 15
BM_string_strcmp/11/4/4 -0.2210 -0.2210 20 15 20 15
BM_string_strcmp/12/4/4 -0.2224 -0.2224 20 15 20 15
BM_string_strcmp/13/4/4 -0.1778 -0.1778 21 17 21 17
BM_string_strcmp/14/4/4 -0.1863 -0.1863 21 17 21 17
BM_string_strcmp/15/4/4 -0.1780 -0.1780 21 17 21 17
BM_string_strcmp/16/4/4 +0.0031 +0.0031 21 21 21 21
BM_string_strcmp/24/4/4 +0.0041 +0.0041 24 24 24 24
BM_string_strcmp/32/4/4 -0.0001 -0.0000 25 25 25 25
BM_string_strcmp/40/4/4 +0.0016 +0.0016 26 26 26 26
BM_string_strcmp/48/4/4 +0.0001 +0.0001 28 28 28 28
BM_string_strcmp/56/4/4 -0.0001 -0.0001 30 30 30 30
BM_string_strcmp/64/4/4 -0.0342 -0.0342 32 31 32 31
BM_string_strcmp/72/4/4 -0.0186 -0.0186 34 34 34 34
BM_string_strcmp/80/4/4 +0.0004 +0.0004 36 36 36 36
BM_string_strcmp/88/4/4 -0.0000 -0.0000 39 39 39 39
BM_string_strcmp/96/4/4 -0.0510 -0.0510 62 59 62 59
BM_string_strcmp/104/4/4 -0.0502 -0.0502 63 60 63 60
BM_string_strcmp/112/4/4 -0.0490 -0.0490 65 62 65 62
BM_string_strcmp/120/4/4 -0.0387 -0.0387 67 65 67 65
BM_string_strcmp/128/4/4 -0.0426 -0.0426 70 67 70 67
BM_string_strcmp/136/4/4 -0.0408 -0.0408 73 70 73 70
BM_string_strcmp/144/4/4 -0.0194 -0.0194 75 74 75 74
BM_string_strcmp/160/4/4 -0.0035 -0.0035 81 81 81 81
BM_string_strcmp/176/4/4 -0.0001 -0.0001 86 86 86 86
BM_string_strcmp/192/4/4 -0.0002 -0.0002 91 91 91 91
BM_string_strcmp/208/4/4 -0.0335 -0.0335 96 93 96 93
BM_string_strcmp/224/4/4 -0.0314 -0.0314 101 98 101 98
BM_string_strcmp/240/4/4 -0.0303 -0.0303 106 103 106 103
BM_string_strcmp/256/4/4 -0.0288 -0.0288 111 108 111 108
[1] Commit id: f98f2a6780d686ca3d44f8011c7823d42d9b083a
Test: bionic tests and benchmarks on aarch64.
Change-Id: I75f8948782b8bd459d21f15e75e1d420905f5e5a
diff --git a/libc/arch-arm64/generic/bionic/strcmp.S b/libc/arch-arm64/generic/bionic/strcmp.S
index 271452d..fbc215e 100644
--- a/libc/arch-arm64/generic/bionic/strcmp.S
+++ b/libc/arch-arm64/generic/bionic/strcmp.S
@@ -32,6 +32,8 @@
#include <private/bionic_asm.h>
+#define L(label) .L ## label
+
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
@@ -61,24 +63,25 @@
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
- b.ne .Lmisaligned8
+ b.ne L(misaligned8)
ands tmp1, src1, #7
- b.ne .Lmutual_align
+ b.ne L(mutual_align)
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
-.Lloop_aligned:
+L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
-.Lstart_realigned:
+L(start_realigned):
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
eor diff, data1, data2 /* Non-zero if differences found. */
bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
orr syndrome, diff, has_nul
- cbz syndrome, .Lloop_aligned
+ cbz syndrome, L(loop_aligned)
/* End of performance-critical section -- one 64B cache line. */
+L(end):
#ifndef __AARCH64EB__
rev syndrome, syndrome
rev data1, data1
@@ -129,7 +132,7 @@
ret
#endif
-.Lmutual_align:
+L(mutual_align):
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
the bytes that preceed the start point. */
@@ -149,15 +152,41 @@
#endif
orr data1, data1, tmp2
orr data2, data2, tmp2
- b .Lstart_realigned
+ b L(start_realigned)
-.Lmisaligned8:
- /* We can do better than this. */
+L(misaligned8):
+ /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
+ checking to make sure that we don't access beyond page boundary in
+ SRC2. */
+ tst src1, #7
+ b.eq L(loop_misaligned)
+L(do_misaligned):
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
cmp data1w, #1
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq .Lmisaligned8
+ b.ne L(done)
+ tst src1, #7
+ b.ne L(do_misaligned)
+
+L(loop_misaligned):
+ /* Test if we are within the last dword of the end of a 4K page. If
+ yes then jump back to the misaligned loop to copy a byte at a time. */
+ and tmp1, src2, #0xff8
+ eor tmp1, tmp1, #0xff8
+ cbz tmp1, L(do_misaligned)
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ orr syndrome, diff, has_nul
+ cbz syndrome, L(loop_misaligned)
+ b L(end)
+
+L(done):
sub result, data1, data2
ret
END(strcmp)