[AArch64] Optimize memcmp for medium to large sizes

This patch was originally written by Siddhesh Poyarekar and pushed on
cortex-strings [1]. This improved memcmp provides a fast path for
compares up to 16 bytes and then compares 16 bytes at a time, thus
optimizing loads from both sources.

Comparison on the default bionic and proposed optimized routines
shows the following performance improvements on A72 (using the
new proposed memcmp input data from test_memcmp.xml):

Benchmark                               Time             CPU      Time Old      Time New       CPU Old       CPU New
--------------------------------------------------------------------------------------------------------------------
BM_string_memcmp/1/0/0               -0.2074         -0.2074            15            12            15            12
BM_string_memcmp/2/0/0               -0.5193         -0.5193            31            15            31            15
BM_string_memcmp/3/0/0               -0.1291         -0.1291            19            17            19            17
BM_string_memcmp/4/0/0               -0.2889         -0.2889            17            12            17            12
BM_string_memcmp/5/0/0               -0.2606         -0.2606            15            11            15            11
BM_string_memcmp/6/0/0               -0.1656         -0.1655            17            14            17            14
BM_string_memcmp/7/0/0               -0.1721         -0.1721            19            15            19            15
BM_string_memcmp/8/0/0               -0.3048         -0.3048            15            10            15            10
BM_string_memcmp/9/0/0               -0.3041         -0.3041            15            10            15            10
BM_string_memcmp/10/0/0              -0.3040         -0.3040            15            10            15            10
BM_string_memcmp/11/0/0              -0.3048         -0.3048            15            10            15            10
BM_string_memcmp/12/0/0              -0.3041         -0.3041            15            10            15            10
BM_string_memcmp/13/0/0              -0.3040         -0.3040            15            10            15            10
BM_string_memcmp/14/0/0              -0.3048         -0.3048            15            10            15            10
BM_string_memcmp/15/0/0              -0.3040         -0.3040            15            10            15            10
BM_string_memcmp/16/0/0              -0.3041         -0.3041            15            10            15            10
BM_string_memcmp/24/0/0              -0.1209         -0.1209            15            13            15            13
BM_string_memcmp/32/0/0              -0.3228         -0.3228            20            13            20            13
BM_string_memcmp/40/0/0              -0.2937         -0.2937            22            15            22            15
BM_string_memcmp/48/0/0              -0.3299         -0.3299            23            15            23            15
BM_string_memcmp/56/0/0              -0.1845         -0.1845            24            20            24            20
BM_string_memcmp/64/0/0              -0.2247         -0.2247            26            20            26            20
BM_string_memcmp/72/0/0              -0.1947         -0.1947            27            22            27            22
BM_string_memcmp/80/0/0              -0.2275         -0.2275            28            22            28            22
BM_string_memcmp/88/0/0              -0.2360         -0.2360            29            22            29            22
BM_string_memcmp/96/0/0              -0.2675         -0.2675            31            22            31            22
BM_string_memcmp/104/0/0             -0.2559         -0.2559            32            24            32            24
BM_string_memcmp/112/0/0             -0.2787         -0.2786            33            24            33            24
BM_string_memcmp/120/0/0             -0.2599         -0.2599            34            25            34            25
BM_string_memcmp/128/0/0             -0.2860         -0.2860            35            25            35            25
BM_string_memcmp/136/0/0             -0.4708         -0.4708            53            28            53            28
BM_string_memcmp/144/0/0             -0.4719         -0.4719            53            28            53            28
BM_string_memcmp/160/0/0             -0.4680         -0.4680            56            30            56            30
BM_string_memcmp/176/0/0             -0.4645         -0.4645            60            32            60            32
BM_string_memcmp/192/0/0             -0.4641         -0.4641            63            34            63            34
BM_string_memcmp/208/0/0             -0.4555         -0.4555            66            36            66            36
BM_string_memcmp/224/0/0             -0.4558         -0.4557            69            38            69            38
BM_string_memcmp/240/0/0             -0.4534         -0.4534            72            40            72            40
BM_string_memcmp/256/0/0             -0.4463         -0.4463            75            42            75            42
BM_string_memcmp/512/0/0             -0.3077         -0.3077           126            88           126            88
BM_string_memcmp/1024/0/0            -0.3493         -0.3493           229           149           229           149
BM_string_memcmp/8192/0/0            -0.4173         -0.4173          1729          1007          1729          1007
BM_string_memcmp/16384/0/0           -0.3855         -0.3855          3377          2076          3377          2075
BM_string_memcmp/32768/0/0           -0.2968         -0.2968          6847          4815          6847          4814
BM_string_memcmp/65536/0/0           -0.2496         -0.2496         13715         10292         13714         10291
BM_string_memcmp/131072/0/0          -0.2676         -0.2676         27354         20033         27351         20031
BM_string_memcmp/262144/0/0          -0.2319         -0.2319         54604         41943         54598         41939
BM_string_memcmp/524288/0/0          -0.2359         -0.2359        109225         83460        109212         83449
BM_string_memcmp/1048576/0/0         -0.0439         -0.0439        423367        404791        423251        404686
BM_string_memcmp/2097152/0/0         -0.0023         -0.0024        762470        760701        761956        760122
BM_string_memcmp/512/4/4             -0.2853         -0.2853           125            89           125            89
BM_string_memcmp/1024/4/4            -0.3377         -0.3377           228           151           227           151
BM_string_memcmp/8192/4/4            -0.4083         -0.4083          1706          1009          1706          1009
BM_string_memcmp/16384/4/4           -0.3853         -0.3853          3376          2075          3376          2075
BM_string_memcmp/32768/4/4           -0.2974         -0.2974          6846          4810          6845          4810
BM_string_memcmp/65536/4/4           -0.2485         -0.2485         13619         10235         13618         10234
BM_string_memcmp/131072/4/4          -0.2387         -0.2387         27056         20597         27054         20595
BM_string_memcmp/512/4/0             -0.2898         -0.2898           123            88           123            88
BM_string_memcmp/1024/4/0            -0.3401         -0.3401           225           149           225           149
BM_string_memcmp/8192/4/0            -0.4167         -0.4167          1727          1007          1727          1007
BM_string_memcmp/16384/4/0           -0.3820         -0.3820          3384          2092          3384          2091
BM_string_memcmp/32768/4/0           -0.2535         -0.2535          6886          5141          6886          5140
BM_string_memcmp/65536/4/0           -0.1897         -0.1897         13850         11223         13849         11223
BM_string_memcmp/131072/4/0          -0.1972         -0.1972         27536         22106         27533         22104
BM_string_memcmp/512/0/4             -0.2854         -0.2854           125            89           125            89
BM_string_memcmp/1024/0/4            -0.3332         -0.3333           226           151           226           151
BM_string_memcmp/8192/0/4            -0.4199         -0.4199          1740          1009          1740          1009
BM_string_memcmp/16384/0/4           -0.3811         -0.3811          3383          2094          3383          2094
BM_string_memcmp/32768/0/4           -0.2409         -0.2409          6900          5238          6899          5237
BM_string_memcmp/65536/0/4           -0.1920         -0.1920         13922         11250         13921         11248
BM_string_memcmp/131072/0/4          -0.2029         -0.2029         27699         22079         27697         22077

I see similar improvements on A54 as well:

Benchmark                               Time             CPU      Time Old      Time New       CPU Old       CPU New
--------------------------------------------------------------------------------------------------------------------
BM_string_memcmp/1/0/0               -0.2074         -0.2074            15            12            15            12
BM_string_memcmp/2/0/0               -0.5193         -0.5193            31            15            31            15
BM_string_memcmp/3/0/0               -0.1291         -0.1291            19            17            19            17
BM_string_memcmp/4/0/0               -0.2889         -0.2889            17            12            17            12
BM_string_memcmp/5/0/0               -0.2606         -0.2606            15            11            15            11
BM_string_memcmp/6/0/0               -0.1656         -0.1655            17            14            17            14
BM_string_memcmp/7/0/0               -0.1721         -0.1721            19            15            19            15
BM_string_memcmp/8/0/0               -0.3048         -0.3048            15            10            15            10
BM_string_memcmp/9/0/0               -0.3041         -0.3041            15            10            15            10
BM_string_memcmp/10/0/0              -0.3040         -0.3040            15            10            15            10
BM_string_memcmp/11/0/0              -0.3048         -0.3048            15            10            15            10
BM_string_memcmp/12/0/0              -0.3041         -0.3041            15            10            15            10
BM_string_memcmp/13/0/0              -0.3040         -0.3040            15            10            15            10
BM_string_memcmp/14/0/0              -0.3048         -0.3048            15            10            15            10
BM_string_memcmp/15/0/0              -0.3040         -0.3040            15            10            15            10
BM_string_memcmp/16/0/0              -0.3041         -0.3041            15            10            15            10
BM_string_memcmp/24/0/0              -0.1209         -0.1209            15            13            15            13
BM_string_memcmp/32/0/0              -0.3228         -0.3228            20            13            20            13
BM_string_memcmp/40/0/0              -0.2937         -0.2937            22            15            22            15
BM_string_memcmp/48/0/0              -0.3299         -0.3299            23            15            23            15
BM_string_memcmp/56/0/0              -0.1845         -0.1845            24            20            24            20
BM_string_memcmp/64/0/0              -0.2247         -0.2247            26            20            26            20
BM_string_memcmp/72/0/0              -0.1947         -0.1947            27            22            27            22
BM_string_memcmp/80/0/0              -0.2275         -0.2275            28            22            28            22
BM_string_memcmp/88/0/0              -0.2360         -0.2360            29            22            29            22
BM_string_memcmp/96/0/0              -0.2675         -0.2675            31            22            31            22
BM_string_memcmp/104/0/0             -0.2559         -0.2559            32            24            32            24
BM_string_memcmp/112/0/0             -0.2787         -0.2786            33            24            33            24
BM_string_memcmp/120/0/0             -0.2599         -0.2599            34            25            34            25
BM_string_memcmp/128/0/0             -0.2860         -0.2860            35            25            35            25
BM_string_memcmp/136/0/0             -0.4708         -0.4708            53            28            53            28
BM_string_memcmp/144/0/0             -0.4719         -0.4719            53            28            53            28
BM_string_memcmp/160/0/0             -0.4680         -0.4680            56            30            56            30
BM_string_memcmp/176/0/0             -0.4645         -0.4645            60            32            60            32
BM_string_memcmp/192/0/0             -0.4641         -0.4641            63            34            63            34
BM_string_memcmp/208/0/0             -0.4555         -0.4555            66            36            66            36
BM_string_memcmp/224/0/0             -0.4558         -0.4557            69            38            69            38
BM_string_memcmp/240/0/0             -0.4534         -0.4534            72            40            72            40
BM_string_memcmp/256/0/0             -0.4463         -0.4463            75            42            75            42
BM_string_memcmp/512/0/0             -0.3077         -0.3077           126            88           126            88
BM_string_memcmp/1024/0/0            -0.3493         -0.3493           229           149           229           149
BM_string_memcmp/8192/0/0            -0.4173         -0.4173          1729          1007          1729          1007
BM_string_memcmp/16384/0/0           -0.3855         -0.3855          3377          2076          3377          2075
BM_string_memcmp/32768/0/0           -0.2968         -0.2968          6847          4815          6847          4814
BM_string_memcmp/65536/0/0           -0.2496         -0.2496         13715         10292         13714         10291
BM_string_memcmp/131072/0/0          -0.2676         -0.2676         27354         20033         27351         20031
BM_string_memcmp/262144/0/0          -0.2319         -0.2319         54604         41943         54598         41939
BM_string_memcmp/524288/0/0          -0.2359         -0.2359        109225         83460        109212         83449
BM_string_memcmp/1048576/0/0         -0.0439         -0.0439        423367        404791        423251        404686
BM_string_memcmp/2097152/0/0         -0.0023         -0.0024        762470        760701        761956        760122
BM_string_memcmp/512/4/4             -0.2853         -0.2853           125            89           125            89
BM_string_memcmp/1024/4/4            -0.3377         -0.3377           228           151           227           151
BM_string_memcmp/8192/4/4            -0.4083         -0.4083          1706          1009          1706          1009
BM_string_memcmp/16384/4/4           -0.3853         -0.3853          3376          2075          3376          2075
BM_string_memcmp/32768/4/4           -0.2974         -0.2974          6846          4810          6845          4810
BM_string_memcmp/65536/4/4           -0.2485         -0.2485         13619         10235         13618         10234
BM_string_memcmp/131072/4/4          -0.2387         -0.2387         27056         20597         27054         20595
BM_string_memcmp/512/4/0             -0.2898         -0.2898           123            88           123            88
BM_string_memcmp/1024/4/0            -0.3401         -0.3401           225           149           225           149
BM_string_memcmp/8192/4/0            -0.4167         -0.4167          1727          1007          1727          1007
BM_string_memcmp/16384/4/0           -0.3820         -0.3820          3384          2092          3384          2091
BM_string_memcmp/32768/4/0           -0.2535         -0.2535          6886          5141          6886          5140
BM_string_memcmp/65536/4/0           -0.1897         -0.1897         13850         11223         13849         11223
BM_string_memcmp/131072/4/0          -0.1972         -0.1972         27536         22106         27533         22104
BM_string_memcmp/512/0/4             -0.2854         -0.2854           125            89           125            89
BM_string_memcmp/1024/0/4            -0.3332         -0.3333           226           151           226           151
BM_string_memcmp/8192/0/4            -0.4199         -0.4199          1740          1009          1740          1009
BM_string_memcmp/16384/0/4           -0.3811         -0.3811          3383          2094          3383          2094
BM_string_memcmp/32768/0/4           -0.2409         -0.2409          6900          5238          6899          5237
BM_string_memcmp/65536/0/4           -0.1920         -0.1920         13922         11250         13921         11248
BM_string_memcmp/131072/0/4          -0.2029         -0.2029         27699         22079         27697         22077

[1] Commit id: f77e4c932b4fd65177b57dd5e220bd17fb4037d6

Test: bionic tests and benchmarks on aarch64.
Change-Id: I2791e2b20d1c0ad429e8e5a41d3e47b1ac02c921
diff --git a/libc/arch-arm64/generic/bionic/memcmp.S b/libc/arch-arm64/generic/bionic/memcmp.S
index 3a138bf..bff54ae 100644
--- a/libc/arch-arm64/generic/bionic/memcmp.S
+++ b/libc/arch-arm64/generic/bionic/memcmp.S
@@ -33,6 +33,8 @@
 
 #include <private/bionic_asm.h>
 
+#define L(l) .L ## l
+
 /* Parameters and result.  */
 #define src1		x0
 #define src2		x1
@@ -42,88 +44,124 @@
 /* Internal variables.  */
 #define data1		x3
 #define data1w		w3
-#define data2		x4
-#define data2w		w4
-#define tmp1		x5
+#define data1h		x4
+#define data2		x5
+#define data2w		w5
+#define data2h		x6
+#define tmp1		x7
+#define tmp2		x8
 
 /* Small inputs of less than 8 bytes are handled separately.  This allows the
-   main code to be sped up using unaligned loads since there are now at least
+   main code to be speed up using unaligned loads since there are now at least
    8 bytes to be compared.  If the first 8 bytes are equal, align src1.
    This ensures each iteration does at most one unaligned access even if both
    src1 and src2 are unaligned, and mutually aligned inputs behave as if
-   aligned.  After the main loop, process the last 8 bytes using unaligned
+   aligned.  After the main loop, process the last 16 bytes using unaligned
    accesses.  */
 
-.p2align 6
 ENTRY(memcmp)
+.p2align 6
 	subs	limit, limit, 8
-	b.lo	.Lless8
+	b.lo	L(less8)
 
 	/* Limit >= 8, so check first 8 bytes using unaligned loads.  */
 	ldr	data1, [src1], 8
 	ldr	data2, [src2], 8
-	and	tmp1, src1, 7
-	add	limit, limit, tmp1
 	cmp	data1, data2
-	bne	.Lreturn
+	b.ne	L(return)
+
+	subs	limit, limit, 8
+	b.gt	L(more16)
+
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+	b	L(return)
+
+L(more16):
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	cmp	data1, data2
+	bne	L(return)
+
+	/* Jump directly to comparing the last 16 bytes for 32 byte (or less)
+	   strings.  */
+	subs	limit, limit, 16
+	b.ls	L(last_bytes)
+
+	/* We overlap loads between 0-32 bytes at either side of SRC1 when we
+	   try to align, so limit it only to strings larger than 128 bytes.  */
+	cmp	limit, 96
+	b.ls	L(loop16)
 
 	/* Align src1 and adjust src2 with bytes not yet done.  */
+	and	tmp1, src1, 15
+	add	limit, limit, tmp1
 	sub	src1, src1, tmp1
 	sub	src2, src2, tmp1
 
-	subs	limit, limit, 8
-	b.ls	.Llast_bytes
-
-	/* Loop performing 8 bytes per iteration using aligned src1.
-	   Limit is pre-decremented by 8 and must be larger than zero.
-	   Exit if <= 8 bytes left to do or if the data is not equal.  */
+	/* Loop performing 16 bytes per iteration using aligned src1.
+	   Limit is pre-decremented by 16 and must be larger than zero.
+	   Exit if <= 16 bytes left to do or if the data is not equal.  */
 	.p2align 4
-.Lloop8:
-	ldr	data1, [src1], 8
-	ldr	data2, [src2], 8
-	subs	limit, limit, 8
-	ccmp	data1, data2, 0, hi  /* NZCV = 0b0000.  */
-	b.eq	.Lloop8
+L(loop16):
+	ldp	data1, data1h, [src1], 16
+	ldp	data2, data2h, [src2], 16
+	subs	limit, limit, 16
+	ccmp	data1, data2, 0, hi
+	ccmp	data1h, data2h, 0, eq
+	b.eq	L(loop16)
 
 	cmp	data1, data2
-	bne	.Lreturn
+	bne	L(return)
+	mov	data1, data1h
+	mov	data2, data2h
+	cmp	data1, data2
+	bne	L(return)
 
-	/* Compare last 1-8 bytes using unaligned access.  */
-.Llast_bytes:
-	ldr	data1, [src1, limit]
-	ldr	data2, [src2, limit]
+	/* Compare last 1-16 bytes using unaligned access.  */
+L(last_bytes):
+	add	src1, src1, limit
+	add	src2, src2, limit
+	ldp	data1, data1h, [src1]
+	ldp	data2, data2h, [src2]
+	cmp     data1, data2
+	bne	L(return)
+	mov	data1, data1h
+	mov	data2, data2h
+	cmp	data1, data2
 
 	/* Compare data bytes and set return value to 0, -1 or 1.  */
-.Lreturn:
+L(return):
 #ifndef __AARCH64EB__
 	rev	data1, data1
 	rev	data2, data2
 #endif
 	cmp     data1, data2
-.Lret_eq:
+L(ret_eq):
 	cset	result, ne
 	cneg	result, result, lo
-        ret
+	ret
 
 	.p2align 4
 	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
-.Lless8:
+L(less8):
 	adds	limit, limit, 4
-	b.lo	.Lless4
+	b.lo	L(less4)
 	ldr	data1w, [src1], 4
 	ldr	data2w, [src2], 4
 	cmp	data1w, data2w
-	b.ne	.Lreturn
+	b.ne	L(return)
 	sub	limit, limit, 4
-.Lless4:
+L(less4):
 	adds	limit, limit, 4
-	beq	.Lret_eq
-.Lbyte_loop:
+	beq	L(ret_eq)
+L(byte_loop):
 	ldrb	data1w, [src1], 1
 	ldrb	data2w, [src2], 1
 	subs	limit, limit, 1
 	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
-	b.eq	.Lbyte_loop
+	b.eq	L(byte_loop)
 	sub	result, data1w, data2w
 	ret
+
 END(memcmp)