Add libc optimizations to bionic for MIPS
Test: Ran the bionic tests under bionic/tests on mips32r1, mips32r2 and
mips64r6 emulators.
Change-Id: I589415ddc496df3f6067ae34cb33ca58b3a1f276
Signed-off-by: Prashant Patil <prashant.patil@imgtec.com>
diff --git a/libc/arch-mips/string/strcmp.S b/libc/arch-mips/string/strcmp.S
index 2b67f5a..e1faf2d 100644
--- a/libc/arch-mips/string/strcmp.S
+++ b/libc/arch-mips/string/strcmp.S
@@ -1,30 +1,33 @@
/*
- * Copyright (c) 2014
- * Imagination Technologies Limited.
+ * Copyright (c) 2017 Imagination Technologies.
+ *
+ * All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
*
- * THIS SOFTWARE IS PROVIDED BY IMAGINATION TECHNOLOGIES LIMITED ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL IMAGINATION TECHNOLOGIES LIMITED BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with
+ * the distribution.
+ * * Neither the name of Imagination Technologies nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef __ANDROID__
@@ -41,6 +44,22 @@
# include <sys/asm.h>
#endif
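+/* Select the doubleword (mips64) or word (mips32) forms of the load, shift,
+   bit-extract and subtract instructions, and the matching word size NSIZE,
+   so the same source builds for both ABIs. */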
+#if __mips64
+# define NSIZE 8
+# define LW ld
+# define EXT dext
+# define SRL dsrl
+# define SLL dsll
+# define SUBU dsubu
+#else
+# define NSIZE 4
+# define LW lw
+# define EXT ext
+# define SRL srl
+# define SLL sll
+# define SUBU subu
+#endif
+
/* Technically strcmp should not read past the end of the strings being
compared. We will read a full word that may contain excess bits beyond
the NULL string terminator but unless ENABLE_READAHEAD is set, we will not
@@ -77,6 +96,23 @@
# endif
#endif
+/* It might seem better to do the 'beq' instruction between the two 'lbu'
+ instructions so that the nop is not needed but testing showed that this
+ code is actually faster (based on glibc strcmp test). */
+#define BYTECMP01(OFFSET) \
+ lbu v0, OFFSET(a0); \
+ lbu v1, OFFSET(a1); \
+ beq v0, zero, L(bexit01); \
+ nop; \
+ bne v0, v1, L(bexit01)
+
+#define BYTECMP89(OFFSET) \
+ lbu t8, OFFSET(a0); \
+ lbu t9, OFFSET(a1); \
+ beq t8, zero, L(bexit89); \
+ nop; \
+ bne t8, t9, L(bexit89)
+
/* Allow the routine to be named something else if desired. */
#ifndef STRCMP_NAME
# define STRCMP_NAME strcmp
@@ -87,170 +123,236 @@
#else
LEAF(STRCMP_NAME)
#endif
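+/* Strategy: compare byte by byte until a1 is word aligned, then compare a
+   word at a time: L(wordloop) when a0 is aligned as well, or the shifting
+   loop at L(uwordloop) when it is not. */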
- .set nomips16
- .set noreorder
+ .set nomips16
+ .set noreorder
- or t0, a0, a1
- andi t0,0x3
- bne t0, zero, L(byteloop)
+ andi t1, a1, (NSIZE - 1)
+ beqz t1, L(exitalign)
+ or t0, zero, NSIZE
+ SUBU t1, t0, t1 #process at most (NSIZE - 1) bytes
-/* Both strings are 4 byte aligned at this point. */
+L(alignloop): #compare byte by byte until a1 is aligned
+ BYTECMP01(0)
+ SUBU t1, t1, 0x1
+ PTR_ADDIU a0, a0, 0x1
+ bnez t1, L(alignloop)
+ PTR_ADDIU a1, a1, 0x1
- lui t8, 0x0101
- ori t8, t8, 0x0101
- lui t9, 0x7f7f
- ori t9, 0x7f7f
+L(exitalign):
-#define STRCMP32(OFFSET) \
- lw v0, OFFSET(a0); \
- lw v1, OFFSET(a1); \
- subu t0, v0, t8; \
- bne v0, v1, L(worddiff); \
- nor t1, v0, t9; \
- and t0, t0, t1; \
- bne t0, zero, L(returnzero)
+/* String a1 is now NSIZE-byte aligned. */
+
+ lui t8, 0x0101
+ ori t8, 0x0101
+ lui t9, 0x7f7f
+ ori t9, 0x7f7f
+#if __mips64
+ dsll t1, t8, 32
+ or t8, t1
+ dsll t1, t9, 32
+ or t9, t1
+#endif
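+/* A word w contains a zero byte iff ((w - t8) & ~w & ~t9) is non-zero,
+   with t8 = 0x01...01 and t9 = 0x7f...7f.  The word loops below use this
+   to spot the terminating NUL without testing each byte separately. */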
+
+ andi t2, a0, (NSIZE - 1) #check whether a0 is aligned
+ SUBU t3, t0, t2 #t3 will be used as the shift amount
+ bnez t2, L(uloopenter)
+ SUBU a2, a0, t2 #a2 = a0 rounded down to an aligned address
+
+#define STRCMPW(OFFSET) \
+ LW v0, OFFSET(a0); \
+ LW v1, OFFSET(a1); \
+ SUBU t0, v0, t8; \
+ bne v0, v1, L(worddiff); \
+ nor t1, v0, t9; \
+ and t0, t0, t1; \
+ bne t0, zero, L(returnzero);\
L(wordloop):
- STRCMP32(0)
- DELAY_READ
- STRCMP32(4)
- DELAY_READ
- STRCMP32(8)
- DELAY_READ
- STRCMP32(12)
- DELAY_READ
- STRCMP32(16)
- DELAY_READ
- STRCMP32(20)
- DELAY_READ
- STRCMP32(24)
- DELAY_READ
- STRCMP32(28)
- PTR_ADDIU a0, a0, 32
- b L(wordloop)
- PTR_ADDIU a1, a1, 32
+ STRCMPW(0 * NSIZE)
+ DELAY_READ
+ STRCMPW(1 * NSIZE)
+ DELAY_READ
+ STRCMPW(2 * NSIZE)
+ DELAY_READ
+ STRCMPW(3 * NSIZE)
+ DELAY_READ
+ STRCMPW(4 * NSIZE)
+ DELAY_READ
+ STRCMPW(5 * NSIZE)
+ DELAY_READ
+ STRCMPW(6 * NSIZE)
+ DELAY_READ
+ STRCMPW(7 * NSIZE)
+ PTR_ADDIU a0, a0, (8 * NSIZE)
+ b L(wordloop)
+ PTR_ADDIU a1, a1, (8 * NSIZE)
+
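+/* a0 is not word aligned, so read aligned words from a2 (a0 rounded down)
+   and merge each pair of consecutive words, shifted by t2 and t3 bits,
+   to reconstruct the bytes of string a0 while keeping every load aligned. */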
+#define USTRCMPW(OFFSET) \
+ LW v1, OFFSET(a1); \
+ SUBU t0, v0, t8; \
+ nor t1, v0, t9; \
+ and t0, t0, t1; \
+ bne t0, zero, L(worddiff); \
+ SRL v0, t2; \
+ LW a3, (OFFSET + NSIZE)(a2); \
+ SUBU t0, v1, t8; \
+ SLL t1, a3, t3; \
+ or v0, v0, t1; \
+ bne v0, v1, L(worddiff); \
+ nor t1, v1, t9; \
+ and t0, t0, t1; \
+ bne t0, zero, L(returnzero); \
+ move v0, a3;\
+
+L(uloopenter):
+ LW v0, 0(a2)
+ SLL t2, 3 #multiply by 8
+ SLL t3, 3 #multiply by 8
+ li a3, -1 #all 1s
+ SRL a3, t3
+ or v0, a3 #fill the unintended bytes with 1s so a stray NUL there is ignored
+
+L(uwordloop):
+ USTRCMPW(0 * NSIZE)
+ USTRCMPW(1 * NSIZE)
+ USTRCMPW(2 * NSIZE)
+ USTRCMPW(3 * NSIZE)
+ USTRCMPW(4 * NSIZE)
+ USTRCMPW(5 * NSIZE)
+ USTRCMPW(6 * NSIZE)
+ USTRCMPW(7 * NSIZE)
+ PTR_ADDIU a2, a2, (8 * NSIZE)
+ b L(uwordloop)
+ PTR_ADDIU a1, a1, (8 * NSIZE)
L(returnzero):
- j ra
- move v0, zero
+ j ra
+ move v0, zero
+
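+/* Compare the byte at bit position POS of v0 and v1, leaving through
+   L(wexit01)/L(wexit89) on a NUL or a mismatch.  ISA release 2 and later
+   can use the bit-field extract instruction; older ISAs shift and mask. */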
+#if __mips_isa_rev > 1
+#define EXT_COMPARE01(POS) \
+ EXT t0, v0, POS, 8; \
+ beq t0, zero, L(wexit01); \
+ EXT t1, v1, POS, 8; \
+ bne t0, t1, L(wexit01)
+#define EXT_COMPARE89(POS) \
+ EXT t8, v0, POS, 8; \
+ beq t8, zero, L(wexit89); \
+ EXT t9, v1, POS, 8; \
+ bne t8, t9, L(wexit89)
+#else
+#define EXT_COMPARE01(POS) \
+ SRL t0, v0, POS; \
+ SRL t1, v1, POS; \
+ andi t0, t0, 0xff; \
+ beq t0, zero, L(wexit01); \
+ andi t1, t1, 0xff; \
+ bne t0, t1, L(wexit01)
+#define EXT_COMPARE89(POS) \
+ SRL t8, v0, POS; \
+ SRL t9, v1, POS; \
+ andi t8, t8, 0xff; \
+ beq t8, zero, L(wexit89); \
+ andi t9, t9, 0xff; \
+ bne t8, t9, L(wexit89)
+#endif
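+/* The words differ or one of them holds the terminating NUL: find the
+   first byte (in string order) that differs or ends the string and return
+   the signed difference of that byte pair. */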
L(worddiff):
#ifdef USE_CLZ
- subu t0, v0, t8
- nor t1, v0, t9
- and t1, t0, t1
- xor t0, v0, v1
- or t0, t0, t1
+ SUBU t0, v0, t8
+ nor t1, v0, t9
+ and t1, t0, t1
+ xor t0, v0, v1
+ or t0, t0, t1
# if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- wsbh t0, t0
- rotr t0, t0, 16
+ wsbh t0, t0
+ rotr t0, t0, 16
# endif
- clz t1, t0
- and t1, 0xf8
+ clz t1, t0
+ and t1, 0xf8
# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- neg t1
- addu t1, 24
+ neg t1
+ addu t1, 24
# endif
- rotrv v0, v0, t1
- rotrv v1, v1, t1
- and v0, v0, 0xff
- and v1, v1, 0xff
- j ra
- subu v0, v0, v1
+ rotrv v0, v0, t1
+ rotrv v1, v1, t1
+ and v0, v0, 0xff
+ and v1, v1, 0xff
+ j ra
+ SUBU v0, v0, v1
#else /* USE_CLZ */
# if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- andi t0, v0, 0xff
- beq t0, zero, L(wexit01)
- andi t1, v1, 0xff
- bne t0, t1, L(wexit01)
+ andi t0, v0, 0xff
+ beq t0, zero, L(wexit01)
+ andi t1, v1, 0xff
+ bne t0, t1, L(wexit01)
+ EXT_COMPARE89(8)
+ EXT_COMPARE01(16)
+#ifndef __mips64
+ SRL t8, v0, 24
+ SRL t9, v1, 24
+#else
+ EXT_COMPARE89(24)
+ EXT_COMPARE01(32)
+ EXT_COMPARE89(40)
+ EXT_COMPARE01(48)
+ SRL t8, v0, 56
+ SRL t9, v1, 56
+#endif
- srl t8, v0, 8
- srl t9, v1, 8
- andi t8, t8, 0xff
- beq t8, zero, L(wexit89)
- andi t9, t9, 0xff
- bne t8, t9, L(wexit89)
-
- srl t0, v0, 16
- srl t1, v1, 16
- andi t0, t0, 0xff
- beq t0, zero, L(wexit01)
- andi t1, t1, 0xff
- bne t0, t1, L(wexit01)
-
- srl t8, v0, 24
- srl t9, v1, 24
# else /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */
- srl t0, v0, 24
- beq t0, zero, L(wexit01)
- srl t1, v1, 24
- bne t0, t1, L(wexit01)
+#ifdef __mips64
+ SRL t0, v0, 56
+ beq t0, zero, L(wexit01)
+ SRL t1, v1, 56
+ bne t0, t1, L(wexit01)
+ EXT_COMPARE89(48)
+ EXT_COMPARE01(40)
+ EXT_COMPARE89(32)
+ EXT_COMPARE01(24)
+#else
+ SRL t0, v0, 24
+ beq t0, zero, L(wexit01)
+ SRL t1, v1, 24
+ bne t0, t1, L(wexit01)
+#endif
+ EXT_COMPARE89(16)
+ EXT_COMPARE01(8)
- srl t8, v0, 16
- srl t9, v1, 16
- andi t8, t8, 0xff
- beq t8, zero, L(wexit89)
- andi t9, t9, 0xff
- bne t8, t9, L(wexit89)
-
- srl t0, v0, 8
- srl t1, v1, 8
- andi t0, t0, 0xff
- beq t0, zero, L(wexit01)
- andi t1, t1, 0xff
- bne t0, t1, L(wexit01)
-
- andi t8, v0, 0xff
- andi t9, v1, 0xff
+ andi t8, v0, 0xff
+ andi t9, v1, 0xff
# endif /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */
L(wexit89):
- j ra
- subu v0, t8, t9
+ j ra
+ SUBU v0, t8, t9
L(wexit01):
- j ra
- subu v0, t0, t1
+ j ra
+ SUBU v0, t0, t1
#endif /* USE_CLZ */
-/* It might seem better to do the 'beq' instruction between the two 'lbu'
- instructions so that the nop is not needed but testing showed that this
- code is actually faster (based on glibc strcmp test). */
-#define BYTECMP01(OFFSET) \
- lbu v0, OFFSET(a0); \
- lbu v1, OFFSET(a1); \
- beq v0, zero, L(bexit01); \
- nop; \
- bne v0, v1, L(bexit01)
-
-#define BYTECMP89(OFFSET) \
- lbu t8, OFFSET(a0); \
- lbu t9, OFFSET(a1); \
- beq t8, zero, L(bexit89); \
- nop; \
- bne t8, t9, L(bexit89)
-
L(byteloop):
- BYTECMP01(0)
- BYTECMP89(1)
- BYTECMP01(2)
- BYTECMP89(3)
- BYTECMP01(4)
- BYTECMP89(5)
- BYTECMP01(6)
- BYTECMP89(7)
- PTR_ADDIU a0, a0, 8
- b L(byteloop)
- PTR_ADDIU a1, a1, 8
+ BYTECMP01(0)
+ BYTECMP89(1)
+ BYTECMP01(2)
+ BYTECMP89(3)
+ BYTECMP01(4)
+ BYTECMP89(5)
+ BYTECMP01(6)
+ BYTECMP89(7)
+ PTR_ADDIU a0, a0, 8
+ b L(byteloop)
+ PTR_ADDIU a1, a1, 8
L(bexit01):
- j ra
- subu v0, v0, v1
+ j ra
+ SUBU v0, v0, v1
L(bexit89):
- j ra
- subu v0, t8, t9
+ j ra
+ SUBU v0, t8, t9
- .set at
- .set reorder
+ .set at
+ .set reorder
END(STRCMP_NAME)
#ifndef __ANDROID__