Move the generic ARM memcmp.S into the generic directory.
Change-Id: I48e4d14a0dcddbb246edbac6d0329619574ab44d
diff --git a/libc/arch-arm/generic/bionic/memcmp.S b/libc/arch-arm/generic/bionic/memcmp.S
new file mode 100644
index 0000000..70a2a58
--- /dev/null
+++ b/libc/arch-arm/generic/bionic/memcmp.S
@@ -0,0 +1,343 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/cpu-features.h>
+#include <private/bionic_asm.h>
+
+
+#ifdef HAVE_32_BYTE_CACHE_LINE
+#define CACHE_LINE_SIZE 32
+#else
+#define CACHE_LINE_SIZE 64
+#endif
+
+/*
+ * Optimized memcmp() for Cortex-A9.
+ */
+
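+/*
+ * For reference only (not part of the build): the routine below matches the
+ * behavior of the usual byte-wise C model of memcmp(), adding word-sized and
+ * NEON fast paths where alignment and remaining length allow:
+ *
+ *   #include <stddef.h>
+ *
+ *   int memcmp(const void* lhs, const void* rhs, size_t n) {
+ *       const unsigned char* p = lhs;
+ *       const unsigned char* q = rhs;
+ *       while (n--) {
+ *           int d = *p++ - *q++;   // difference of the first mismatching bytes
+ *           if (d != 0) return d;
+ *       }
+ *       return 0;
+ *   }
+ */
+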
+ENTRY(memcmp)
+ pld [r0, #(CACHE_LINE_SIZE * 0)]
+ pld [r0, #(CACHE_LINE_SIZE * 1)]
+
+ /* take care of the case where the buffers are the same */
+ cmp r0, r1
+ moveq r0, #0
+ bxeq lr
+
+ pld [r1, #(CACHE_LINE_SIZE * 0)]
+ pld [r1, #(CACHE_LINE_SIZE * 1)]
+
+ /* make sure we have at least 8+4 bytes; this simplifies things below
+ * and avoids some overhead for small blocks
+ */
+ cmp r2, #(8+4)
+ bmi 10f
+/*
+ * Neon optimization
+ * Comparing 32 bytes at a time
+ */
+#if defined(__ARM_NEON__) && defined(NEON_UNALIGNED_ACCESS)
+ subs r2, r2, #32
+ blo 3f
+
+ /* preload all the cache lines we need. */
+ pld [r0, #(CACHE_LINE_SIZE * 2)]
+ pld [r1, #(CACHE_LINE_SIZE * 2)]
+
+1: /* The main loop compares 32 bytes at a time */
+ vld1.8 {d0 - d3}, [r0]!
+ pld [r0, #(CACHE_LINE_SIZE * 2)]
+ vld1.8 {d4 - d7}, [r1]!
+ pld [r1, #(CACHE_LINE_SIZE * 2)]
+
+ /* Start subtracting the values and merge results */
+ vsub.i8 q0, q2
+ vsub.i8 q1, q3
+ vorr q2, q0, q1
+ vorr d4, d5
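+ /* d4 is now non-zero iff any of the 32 byte pairs differed */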
+ vmov r3, ip, d4
+ /* Check if there are any differences among the 32 bytes */
+ orrs r3, ip
+ bne 2f
+ subs r2, r2, #32
+ bhs 1b
+ b 3f
+2:
+ /* Check if the difference was in the first or last 16 bytes */
+ sub r0, #32
+ vorr d0, d1
+ sub r1, #32
+ vmov r3, ip, d0
+ orrs r3, ip
+ /* if the first 16 bytes are equal, we only have to rewind 16 bytes */
+ ittt eq
+ subeq r2, #16
+ addeq r0, #16
+ addeq r1, #16
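+ /* fall through to the word/byte comparison below to locate the differing byte */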
+
+3: /* fix-up the remaining count */
+ add r2, r2, #32
+
+ cmp r2, #(8+4)
+ bmi 10f
+#endif
+
+ /* save registers */
+ stmfd sp!, {r4, lr}
+ .cfi_def_cfa_offset 8
+ .cfi_rel_offset r4, 0
+ .cfi_rel_offset lr, 4
+
+ /* since r0 holds the result, move the first source
+ * pointer somewhere else
+ */
+ mov r4, r0
+
+ /* align first pointer to word boundary
+ * offset = -src & 3
+ */
+ rsb r3, r4, #0
+ ands r3, r3, #3
+ beq 0f
+
+ /* align first pointer */
+ sub r2, r2, r3
+1: ldrb r0, [r4], #1
+ ldrb ip, [r1], #1
+ subs r0, r0, ip
+ bne 9f
+ subs r3, r3, #1
+ bne 1b
+
+
+0: /* here the first pointer is aligned, and we have at least 4 bytes
+ * to process.
+ */
+
+ /* see if the pointers are congruent */
+ eor r0, r4, r1
+ ands r0, r0, #3
+ bne 5f
+
+ /* congruent case, 32 bytes per iteration
+ * We need to make sure there are at least 32+4 bytes left
+ * because we effectively read ahead one word, and we could
+ * read past the buffer (and segfault) if we're not careful.
+ */
+
+ ldr ip, [r1]
+ subs r2, r2, #(32 + 4)
+ bmi 1f
+
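+ /* words from [r1] alternate between ip and lr, each loaded one step
+ * ahead of the eors that consumes it (the one-word read-ahead noted
+ * above)
+ */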
+0: pld [r4, #(CACHE_LINE_SIZE * 2)]
+ pld [r1, #(CACHE_LINE_SIZE * 2)]
+ ldr r0, [r4], #4
+ ldr lr, [r1, #4]!
+ eors r0, r0, ip
+ ldreq r0, [r4], #4
+ ldreq ip, [r1, #4]!
+ eoreqs r0, r0, lr
+ ldreq r0, [r4], #4
+ ldreq lr, [r1, #4]!
+ eoreqs r0, r0, ip
+ ldreq r0, [r4], #4
+ ldreq ip, [r1, #4]!
+ eoreqs r0, r0, lr
+ ldreq r0, [r4], #4
+ ldreq lr, [r1, #4]!
+ eoreqs r0, r0, ip
+ ldreq r0, [r4], #4
+ ldreq ip, [r1, #4]!
+ eoreqs r0, r0, lr
+ ldreq r0, [r4], #4
+ ldreq lr, [r1, #4]!
+ eoreqs r0, r0, ip
+ ldreq r0, [r4], #4
+ ldreq ip, [r1, #4]!
+ eoreqs r0, r0, lr
+ bne 2f
+ subs r2, r2, #32
+ bhs 0b
+
+ /* do we have at least 4 bytes left? */
+1: adds r2, r2, #(32 - 4 + 4)
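+ /* r2 now holds (remaining - 4); r4 and r1 point at the next uncompared bytes */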
+ bmi 4f
+
+ /* finish off 4 bytes at a time */
+3: ldr r0, [r4], #4
+ ldr ip, [r1], #4
+ eors r0, r0, ip
+ bne 2f
+ subs r2, r2, #4
+ bhs 3b
+
+ /* are we done? */
+4: adds r2, r2, #4
+ moveq r0, #0
+ beq 9f
+
+ /* finish off the remaining bytes */
+ b 8f
+
+2: /* the last 4 bytes compared differ; rewind and redo them byte by byte */
+ sub r4, r4, #4
+ sub r1, r1, #4
+ mov r2, #4
+
+ /* process the last few bytes */
+8: ldrb r0, [r4], #1
+ ldrb ip, [r1], #1
+ // stall
+ subs r0, r0, ip
+ bne 9f
+ subs r2, r2, #1
+ bne 8b
+
+9: /* restore registers and return */
+ ldmfd sp!, {r4, lr}
+ bx lr
+
+10: /* process less than 12 bytes */
+ cmp r2, #0
+ moveq r0, #0
+ bxeq lr
+ mov r3, r0
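+ /* r0 is the return value, so walk the first buffer through r3 */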
+11:
+ ldrb r0, [r3], #1
+ ldrb ip, [r1], #1
+ subs r0, ip
+ bxne lr
+ subs r2, r2, #1
+ bne 11b
+ bx lr
+
+5: /*************** non-congruent case ***************/
+ and r0, r1, #3
+ cmp r0, #2
+ bne 4f
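+ /* r4 is word-aligned here, so r1 & 3 is the rhs offset within a word */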
+
+ /* here, offset is 2 (16-bit aligned, special-cased) */
+
+ /* make sure we have at least 16 bytes to process */
+ subs r2, r2, #16
+ addmi r2, r2, #16
+ bmi 8b
+
+ /* align the unaligned pointer */
+ bic r1, r1, #3
+ ldr lr, [r1], #4
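+ /* each logical rhs word is then rebuilt as (lr >> 16) | (next aligned word << 16) */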
+
+6: pld [r1, #(CACHE_LINE_SIZE * 2)]
+ pld [r4, #(CACHE_LINE_SIZE * 2)]
+ mov ip, lr, lsr #16
+ ldr lr, [r1], #4
+ ldr r0, [r4], #4
+ orr ip, ip, lr, lsl #16
+ eors r0, r0, ip
+ moveq ip, lr, lsr #16
+ ldreq lr, [r1], #4
+ ldreq r0, [r4], #4
+ orreq ip, ip, lr, lsl #16
+ eoreqs r0, r0, ip
+ moveq ip, lr, lsr #16
+ ldreq lr, [r1], #4
+ ldreq r0, [r4], #4
+ orreq ip, ip, lr, lsl #16
+ eoreqs r0, r0, ip
+ moveq ip, lr, lsr #16
+ ldreq lr, [r1], #4
+ ldreq r0, [r4], #4
+ orreq ip, ip, lr, lsl #16
+ eoreqs r0, r0, ip
+ bne 7f
+ subs r2, r2, #16
+ bhs 6b
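+ /* the next unread rhs byte sits in the high half of the last word loaded, i.e. at r1 - 2 */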
+ sub r1, r1, #2
+ /* are we done? */
+ adds r2, r2, #16
+ moveq r0, #0
+ beq 9b
+ /* finish off the remaining bytes */
+ b 8b
+
+7: /* fix up the 2 pointers and fall through... */
+ sub r1, r1, #(4+2)
+ sub r4, r4, #4
+ mov r2, #4
+ b 8b
+
+
+4: /*************** offset is 1 or 3 (less optimized) ***************/
+
+ stmfd sp!, {r5, r6, r7}
+
+ // r5 = right-shift amount (offset * 8)
+ // r6 = left-shift amount (32 - r5)
+ // r7 = scratch, holds the current aligned word loaded from r1
+
+ mov r5, r0, lsl #3 /* r5 = right shift */
+ rsb r6, r5, #32 /* r6 = left shift */
+
+ /* align the unaligned pointer */
+ bic r1, r1, #3
+ ldr r7, [r1], #4
+ sub r2, r2, #8
+
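+ /* rebuild each logical rhs word as (previous aligned word >> r5) | (next aligned word << r6) */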
+6: mov ip, r7, lsr r5
+ ldr r7, [r1], #4
+ ldr r0, [r4], #4
+ orr ip, ip, r7, lsl r6
+ eors r0, r0, ip
+ moveq ip, r7, lsr r5
+ ldreq r7, [r1], #4
+ ldreq r0, [r4], #4
+ orreq ip, ip, r7, lsl r6
+ eoreqs r0, r0, ip
+ bne 7f
+ subs r2, r2, #8
+ bhs 6b
+
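+ /* step r1 back to the next unread rhs byte: (r1 - 4) + offset */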
+ sub r1, r1, r6, lsr #3
+ ldmfd sp!, {r5, r6, r7}
+
+ /* are we done? */
+ adds r2, r2, #8
+ moveq r0, #0
+ beq 9b
+
+ /* finish off the remaining bytes */
+ b 8b
+
+7: /* fix up the 2 pointers and fall through... */
+ sub r1, r1, #4
+ sub r1, r1, r6, lsr #3
+ sub r4, r4, #4
+ mov r2, #4
+ ldmfd sp!, {r5, r6, r7}
+ b 8b
+END(memcmp)
diff --git a/libc/arch-arm/generic/generic.mk b/libc/arch-arm/generic/generic.mk
index 95be867..96ed949 100644
--- a/libc/arch-arm/generic/generic.mk
+++ b/libc/arch-arm/generic/generic.mk
@@ -1,4 +1,5 @@
libc_bionic_src_files_arm += \
+ arch-arm/generic/bionic/memcmp.S \
arch-arm/generic/bionic/memcpy.S \
arch-arm/generic/bionic/memset.S \
arch-arm/generic/bionic/strcmp.S \