[AArch64] Improve strncmp for mutually misaligned inputs
This patch was originally written by Siddhesh Poyarekar and pushed on
cortex-strings [1]. The mutually misaligned inputs on aarch64 are
compared with a simple byte copy, which is not very efficient.
This patch enhances the comparison similar to strcmp by loading a
double-word at a time.
Comparison on the default bionic and proposed optimized routines
shows the following performance improvements on A54 (using the
new proposed memcmp input data from test_strncmp.xml):
- No noticeable change on aligned inputs or with same alignment.
- Large improvements on unaligned inputs from sizes larger than
16 bytes.
Benchmark Time CPU Time Old Time New CPU Old CPU New
--------------------------------------------------------------------------------------------------------------------
BM_string_strncmp/1/0/0 -0.0954 -0.0954 19 17 19 17
BM_string_strncmp/2/0/0 -0.0344 -0.0344 19 18 19 18
BM_string_strncmp/3/0/0 +0.1768 +0.1768 15 18 15 18
BM_string_strncmp/4/0/0 -0.0344 -0.0344 19 18 19 18
BM_string_strncmp/5/0/0 -0.0344 -0.0344 19 18 19 18
BM_string_strncmp/6/0/0 +0.1589 +0.1589 15 18 15 18
BM_string_strncmp/7/0/0 -0.0344 -0.0344 19 18 19 18
BM_string_strncmp/8/0/0 -0.0998 -0.0998 19 17 19 17
BM_string_strncmp/9/0/0 -0.0277 -0.0277 23 22 23 22
BM_string_strncmp/10/0/0 -0.0270 -0.0270 23 22 23 22
BM_string_strncmp/11/0/0 -0.0331 -0.0331 23 22 23 22
BM_string_strncmp/12/0/0 -0.0270 -0.0270 23 22 23 22
BM_string_strncmp/13/0/0 -0.0284 -0.0284 23 22 23 22
BM_string_strncmp/14/0/0 +0.1042 +0.1042 20 22 20 22
BM_string_strncmp/15/0/0 -0.0277 -0.0277 23 22 23 22
BM_string_strncmp/16/0/0 +0.0214 +0.0215 22 22 22 22
BM_string_strncmp/24/0/0 -0.1291 -0.1291 24 21 24 21
BM_string_strncmp/32/0/0 -0.0470 -0.0470 27 26 27 26
BM_string_strncmp/40/0/0 -0.0433 -0.0433 29 28 29 28
BM_string_strncmp/48/0/0 -0.0301 -0.0301 31 30 31 30
BM_string_strncmp/56/0/0 -0.0800 -0.0800 33 31 33 31
BM_string_strncmp/64/0/0 +0.0188 +0.0188 34 34 34 34
BM_string_strncmp/72/0/0 -0.0334 -0.0334 38 37 38 37
BM_string_strncmp/80/0/0 -0.0000 -0.0000 40 40 40 40
BM_string_strncmp/88/0/0 +0.0413 +0.0413 61 64 61 64
BM_string_strncmp/96/0/0 -0.0215 -0.0216 69 67 69 67
BM_string_strncmp/104/0/0 -0.0208 -0.0208 72 70 72 70
BM_string_strncmp/112/0/0 -0.0173 -0.0173 75 74 75 74
BM_string_strncmp/120/0/0 -0.0166 -0.0166 78 77 78 77
BM_string_strncmp/128/0/0 -0.0158 -0.0158 81 80 81 80
BM_string_strncmp/136/0/0 -0.0149 -0.0149 84 83 84 83
BM_string_strncmp/144/0/0 -0.0201 -0.0201 88 86 88 86
BM_string_strncmp/160/0/0 -0.0136 -0.0136 94 93 94 93
BM_string_strncmp/176/0/0 +0.0224 +0.0224 96 98 96 98
BM_string_strncmp/192/0/0 +0.0289 +0.0289 102 105 102 105
BM_string_strncmp/208/0/0 +0.0101 +0.0101 111 112 111 112
BM_string_strncmp/224/0/0 -0.0107 -0.0107 119 118 119 118
BM_string_strncmp/240/0/0 -0.0088 -0.0088 126 125 126 125
BM_string_strncmp/256/0/0 -0.0101 -0.0101 132 131 132 131
BM_string_strncmp/512/0/0 -0.0056 -0.0056 235 233 235 233
BM_string_strncmp/1024/0/0 -0.0030 -0.0030 439 437 439 437
BM_string_strncmp/8192/0/0 -0.0431 -0.0431 3799 3635 3799 3635
BM_string_strncmp/16384/0/0 -0.0069 -0.0069 6778 6732 6779 6732
BM_string_strncmp/32768/0/0 -0.0001 -0.0002 13405 13403 13405 13403
BM_string_strncmp/65536/0/0 +0.0005 +0.0005 26968 26981 26968 26981
BM_string_strncmp/131072/0/0 -0.0057 -0.0057 53959 53650 53958 53650
BM_string_strncmp/1/4/0 -0.1352 -0.1352 12 10 12 10
BM_string_strncmp/2/4/0 +0.0020 +0.0020 15 15 15 15
BM_string_strncmp/3/4/0 -0.1560 -0.1560 20 17 20 17
BM_string_strncmp/4/4/0 +0.0296 +0.0296 22 22 22 22
BM_string_strncmp/5/4/0 +0.0573 +0.0573 22 23 22 23
BM_string_strncmp/6/4/0 -0.0340 -0.0340 25 24 25 24
BM_string_strncmp/7/4/0 +0.0185 +0.0185 26 26 26 26
BM_string_strncmp/8/4/0 -0.0050 -0.0050 27 27 27 27
BM_string_strncmp/9/4/0 -0.1294 -0.1294 28 24 28 24
BM_string_strncmp/10/4/0 +0.0109 +0.0109 29 29 29 29
BM_string_strncmp/11/4/0 -0.0000 -0.0001 30 30 30 30
BM_string_strncmp/12/4/0 +0.0055 +0.0055 50 50 50 50
BM_string_strncmp/13/4/0 -0.0249 -0.0249 51 50 51 50
BM_string_strncmp/14/4/0 -0.0289 -0.0289 53 52 53 52
BM_string_strncmp/15/4/0 -0.0205 -0.0205 55 54 55 54
BM_string_strncmp/16/4/0 -0.4616 -0.4616 57 31 57 31
BM_string_strncmp/24/4/0 -0.4871 -0.4871 72 37 72 37
BM_string_strncmp/32/4/0 -0.5549 -0.5549 87 39 87 39
BM_string_strncmp/40/4/0 -0.5964 -0.5964 103 42 103 42
BM_string_strncmp/48/4/0 -0.6647 -0.6647 118 40 118 40
BM_string_strncmp/56/4/0 -0.6551 -0.6551 134 46 134 46
BM_string_strncmp/64/4/0 -0.6609 -0.6609 145 49 145 49
BM_string_strncmp/72/4/0 -0.5709 -0.5710 164 70 164 70
BM_string_strncmp/80/4/0 -0.5929 -0.5929 180 73 180 73
BM_string_strncmp/88/4/0 -0.6051 -0.6051 195 77 195 77
BM_string_strncmp/96/4/0 -0.6160 -0.6160 210 81 210 81
BM_string_strncmp/104/4/0 -0.6199 -0.6199 223 85 223 85
BM_string_strncmp/112/4/0 -0.6293 -0.6293 240 89 240 89
BM_string_strncmp/120/4/0 -0.6439 -0.6439 255 91 255 91
BM_string_strncmp/128/4/0 -0.6493 -0.6493 271 95 271 95
BM_string_strncmp/136/4/0 -0.6704 -0.6704 287 95 287 95
BM_string_strncmp/144/4/0 -0.6744 -0.6744 302 98 302 98
BM_string_strncmp/160/4/0 -0.6700 -0.6700 333 110 333 110
BM_string_strncmp/176/4/0 -0.6821 -0.6821 364 116 364 116
BM_string_strncmp/192/4/0 -0.6887 -0.6887 394 123 394 123
BM_string_strncmp/208/4/0 -0.6949 -0.6949 425 130 425 130
BM_string_strncmp/224/4/0 -0.7069 -0.7069 456 134 456 134
BM_string_strncmp/240/4/0 -0.7042 -0.7042 486 144 486 144
BM_string_strncmp/256/4/0 -0.7043 -0.7043 514 152 514 152
BM_string_strncmp/1/0/4 +0.0227 +0.0227 14 14 14 14
BM_string_strncmp/2/0/4 +0.0442 +0.0442 15 16 15 16
BM_string_strncmp/3/0/4 +0.5829 +0.5829 17 27 17 27
BM_string_strncmp/4/0/4 -0.1593 -0.1593 22 19 22 19
BM_string_strncmp/5/0/4 -0.0516 -0.0516 23 22 23 22
BM_string_strncmp/6/0/4 -0.1684 -0.1684 25 20 25 20
BM_string_strncmp/7/0/4 +0.0170 +0.0170 26 26 26 26
BM_string_strncmp/8/0/4 +0.0006 +0.0006 27 27 27 27
BM_string_strncmp/9/0/4 +0.1272 +0.1272 25 28 25 28
BM_string_strncmp/10/0/4 +0.0108 +0.0108 29 29 29 29
BM_string_strncmp/11/0/4 -0.0001 -0.0001 30 30 30 30
BM_string_strncmp/12/0/4 -0.3557 -0.3557 50 32 50 32
BM_string_strncmp/13/0/4 -0.3370 -0.3370 51 34 51 34
BM_string_strncmp/14/0/4 -0.3444 -0.3444 53 35 53 35
BM_string_strncmp/15/0/4 +0.0946 +0.0946 51 56 51 56
BM_string_strncmp/16/0/4 -0.5203 -0.5203 53 25 53 25
BM_string_strncmp/24/0/4 -0.6109 -0.6109 72 28 72 28
BM_string_strncmp/32/0/4 -0.6934 -0.6934 88 27 88 27
BM_string_strncmp/40/0/4 -0.6833 -0.6833 103 33 103 33
BM_string_strncmp/48/0/4 -0.6973 -0.6973 118 36 118 36
BM_string_strncmp/56/0/4 -0.7116 -0.7116 134 39 134 39
BM_string_strncmp/64/0/4 -0.6017 -0.6018 149 59 149 59
BM_string_strncmp/72/0/4 -0.6268 -0.6268 164 61 164 61
BM_string_strncmp/80/0/4 -0.6409 -0.6409 179 64 179 64
BM_string_strncmp/88/0/4 -0.6465 -0.6465 195 69 195 69
BM_string_strncmp/96/0/4 -0.6551 -0.6551 210 72 210 72
BM_string_strncmp/104/0/4 -0.6662 -0.6662 227 76 227 76
BM_string_strncmp/112/0/4 -0.6700 -0.6700 240 79 240 79
BM_string_strncmp/120/0/4 -0.6740 -0.6740 256 83 256 83
BM_string_strncmp/128/0/4 -0.6862 -0.6862 271 85 271 85
BM_string_strncmp/136/0/4 -0.6883 -0.6883 287 89 287 89
BM_string_strncmp/144/0/4 -0.7031 -0.7031 297 88 297 88
BM_string_strncmp/160/0/4 -0.6985 -0.6985 333 100 333 100
BM_string_strncmp/176/0/4 -0.7082 -0.7082 364 106 364 106
BM_string_strncmp/192/0/4 -0.7223 -0.7223 396 110 396 110
BM_string_strncmp/208/0/4 -0.7135 -0.7135 421 121 421 121
BM_string_strncmp/224/0/4 -0.7194 -0.7194 455 128 455 128
BM_string_strncmp/240/0/4 -0.7233 -0.7233 487 135 487 135
BM_string_strncmp/256/0/4 -0.7239 -0.7239 516 143 516 143
BM_string_strncmp/1/4/4 +0.0224 +0.0225 21 22 21 22
BM_string_strncmp/2/4/4 -0.0001 -0.0001 22 22 22 22
BM_string_strncmp/3/4/4 -0.0001 -0.0001 22 22 22 22
BM_string_strncmp/4/4/4 -0.0435 -0.0435 22 21 22 21
BM_string_strncmp/5/4/4 -0.0118 -0.0118 27 27 27 27
BM_string_strncmp/6/4/4 -0.0118 -0.0118 27 27 27 27
BM_string_strncmp/7/4/4 -0.0117 -0.0117 27 27 27 27
BM_string_strncmp/8/4/4 -0.0118 -0.0118 27 27 27 27
BM_string_strncmp/9/4/4 -0.0117 -0.0117 27 27 27 27
BM_string_strncmp/10/4/4 +0.1447 +0.1447 23 27 23 27
BM_string_strncmp/11/4/4 -0.0062 -0.0062 27 27 27 27
BM_string_strncmp/12/4/4 -0.0454 -0.0454 28 27 28 27
BM_string_strncmp/13/4/4 -0.1507 -0.1507 29 24 29 24
BM_string_strncmp/14/4/4 -0.0003 -0.0003 29 29 29 29
BM_string_strncmp/15/4/4 -0.0002 -0.0003 29 29 29 29
BM_string_strncmp/16/4/4 +0.0047 +0.0047 29 29 29 29
BM_string_strncmp/24/4/4 -0.0104 -0.0104 31 30 31 30
BM_string_strncmp/32/4/4 -0.0290 -0.0290 33 32 33 32
BM_string_strncmp/40/4/4 -0.0189 -0.0189 34 33 34 33
BM_string_strncmp/48/4/4 -0.0059 -0.0059 36 36 36 36
BM_string_strncmp/56/4/4 +0.0000 +0.0000 39 39 39 39
BM_string_strncmp/64/4/4 +0.0000 +0.0000 42 42 42 42
BM_string_strncmp/72/4/4 +0.0000 +0.0000 45 45 45 45
BM_string_strncmp/80/4/4 +0.0391 +0.0392 65 68 65 68
BM_string_strncmp/88/4/4 -0.0090 -0.0090 71 70 71 70
BM_string_strncmp/96/4/4 -0.0034 -0.0034 74 74 74 74
BM_string_strncmp/104/4/4 -0.0482 -0.0482 77 73 77 73
BM_string_strncmp/112/4/4 +0.0387 +0.0387 77 80 77 80
BM_string_strncmp/120/4/4 -0.0072 -0.0073 84 83 84 83
BM_string_strncmp/128/4/4 -0.0071 -0.0071 87 86 87 86
BM_string_strncmp/136/4/4 +0.0366 +0.0366 86 89 86 89
BM_string_strncmp/144/4/4 -0.0068 -0.0068 93 93 93 93
BM_string_strncmp/160/4/4 -0.0064 -0.0064 100 99 100 99
BM_string_strncmp/176/4/4 -0.0063 -0.0063 106 105 106 105
BM_string_strncmp/192/4/4 -0.0012 -0.0012 112 112 112 112
BM_string_strncmp/208/4/4 -0.0098 -0.0098 119 118 119 118
BM_string_strncmp/224/4/4 -0.0050 -0.0050 125 125 125 125
BM_string_strncmp/240/4/4 -0.0060 -0.0060 132 131 132 131
BM_string_strncmp/256/4/4 -0.0046 -0.0046 138 137 138 137
[1] Commit id: 26cc4faec37a55529e5d0a39949f7b6ec81008f9
Test: bionic tests and benchmarks on aarch64.
Change-Id: Ied579d2044b4092fc95fad486af6541d1eb71dc3
diff --git a/libc/arch-arm64/generic/bionic/strncmp.S b/libc/arch-arm64/generic/bionic/strncmp.S
index 267f663..b81f43a 100644
--- a/libc/arch-arm64/generic/bionic/strncmp.S
+++ b/libc/arch-arm64/generic/bionic/strncmp.S
@@ -58,6 +58,7 @@
#define limit_wd x13
#define mask x14
#define endloop x15
+#define count mask
.text
.p2align 6
@@ -69,9 +70,9 @@
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
+ and count, src1, #7
b.ne .Lmisaligned8
- ands tmp1, src1, #7
- b.ne .Lmutual_align
+ cbnz count, .Lmutual_align
/* Calculate the number of full and partial words -1. */
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
@@ -176,42 +177,104 @@
bic src1, src1, #7
bic src2, src2, #7
ldr data1, [src1], #8
- neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */
+ neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
ldr data2, [src2], #8
mov tmp2, #~0
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
#ifdef __AARCH64EB__
/* Big-endian. Early bytes are at MSB. */
- lsl tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */
+ lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
#else
/* Little-endian. Early bytes are at LSB. */
- lsr tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */
+ lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
#endif
and tmp3, limit_wd, #7
lsr limit_wd, limit_wd, #3
/* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
- add limit, limit, tmp1
- add tmp3, tmp3, tmp1
+ add limit, limit, count
+ add tmp3, tmp3, count
orr data1, data1, tmp2
orr data2, data2, tmp2
add limit_wd, limit_wd, tmp3, lsr #3
b .Lstart_realigned
-.Lret0:
- mov result, #0
- ret
-
.p2align 6
+ /* Don't bother with dwords for up to 16 bytes. */
.Lmisaligned8:
- sub limit, limit, #1
-1:
+ cmp limit, #16
+ b.hs .Ltry_misaligned_words
+
+.Lbyte_loop:
/* Perhaps we can do better than this. */
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
subs limit, limit, #1
- ccmp data1w, #1, #0, cs /* NZCV = 0b0000. */
+ ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq 1b
+ b.eq .Lbyte_loop
+.Ldone:
sub result, data1, data2
ret
+ /* Align the SRC1 to a dword by doing a bytewise compare and then do
+ the dword loop. */
+.Ltry_misaligned_words:
+ lsr limit_wd, limit, #3
+ cbz count, .Ldo_misaligned
+
+ neg count, count
+ and count, count, #7
+ sub limit, limit, count
+ lsr limit_wd, limit, #3
+
+.Lpage_end_loop:
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ cmp data1w, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.ne .Ldone
+ subs count, count, #1
+ b.hi .Lpage_end_loop
+
+.Ldo_misaligned:
+ /* Prepare ourselves for the next page crossing. Unlike the aligned
+ loop, we fetch 1 less dword because we risk crossing bounds on
+ SRC2. */
+ mov count, #8
+ subs limit_wd, limit_wd, #1
+ b.lo .Ldone_loop
+.Lloop_misaligned:
+ and tmp2, src2, #0xff8
+ eor tmp2, tmp2, #0xff8
+ cbz tmp2, .Lpage_end_loop
+
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp diff, #0, #0, eq
+ b.ne .Lnot_limit
+ subs limit_wd, limit_wd, #1
+ b.pl .Lloop_misaligned
+
+.Ldone_loop:
+ /* We found a difference or a NULL before the limit was reached. */
+ and limit, limit, #7
+ cbz limit, .Lnot_limit
+ /* Read the last word. */
+ sub src1, src1, 8
+ sub src2, src2, 8
+ ldr data1, [src1, limit]
+ ldr data2, [src2, limit]
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp diff, #0, #0, eq
+ b.ne .Lnot_limit
+
+.Lret0:
+ mov result, #0
+ ret
END(strncmp)