Move the generic ARM memcmp.S into the generic directory.

Change-Id: I48e4d14a0dcddbb246edbac6d0329619574ab44d
diff --git a/libc/arch-arm/generic/bionic/memcmp.S b/libc/arch-arm/generic/bionic/memcmp.S
new file mode 100644
index 0000000..70a2a58
--- /dev/null
+++ b/libc/arch-arm/generic/bionic/memcmp.S
@@ -0,0 +1,343 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/cpu-features.h>
+#include <private/bionic_asm.h>
+
+
+#ifdef HAVE_32_BYTE_CACHE_LINE
+#define CACHE_LINE_SIZE     32
+#else
+#define CACHE_LINE_SIZE     64
+#endif
+
+/*
+ * Optimized memcmp() for Cortex-A9.
+ */
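+/*
+ * Structure of the code below:
+ *  - a NEON path (when __ARM_NEON__ and NEON_UNALIGNED_ACCESS are set)
+ *    that compares 32 bytes per iteration,
+ *  - a word-by-word path for congruent pointers, 32 bytes per iteration
+ *    with one word of read-ahead,
+ *  - non-congruent paths that rebuild unaligned words with shifts
+ *    (the halfword-aligned case is special-cased),
+ *  - a byte-by-byte loop for short blocks and trailing bytes.
+ */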
+
+ENTRY(memcmp)
+        pld         [r0, #(CACHE_LINE_SIZE * 0)]
+        pld         [r0, #(CACHE_LINE_SIZE * 1)]
+
+        /* take care of the trivial case where the two buffers are the same
+         * (a zero length is handled by the short-block path below)
+         */
+        cmp         r0, r1
+        moveq       r0, #0
+        bxeq        lr
+
+        pld         [r1, #(CACHE_LINE_SIZE * 0)]
+        pld         [r1, #(CACHE_LINE_SIZE * 1)]
+
+        /* make sure we have at least 8+4 bytes; this simplifies things below
+         * and avoids some overhead for small blocks
+         */
+        cmp        r2, #(8+4)
+        bmi        10f
+/*
+ * NEON optimization:
+ * compare 32 bytes at a time
+ */
+#if defined(__ARM_NEON__) && defined(NEON_UNALIGNED_ACCESS)
+        subs        r2, r2, #32
+        blo         3f
+
+        /* preload all the cache lines we need. */
+        pld         [r0, #(CACHE_LINE_SIZE * 2)]
+        pld         [r1, #(CACHE_LINE_SIZE * 2)]
+
+1:      /* The main loop compares 32 bytes at a time */
+        vld1.8      {d0 - d3}, [r0]!
+        pld         [r0, #(CACHE_LINE_SIZE * 2)]
+        vld1.8      {d4 - d7}, [r1]!
+        pld         [r1, #(CACHE_LINE_SIZE * 2)]
+
+        /* Subtract the values byte-wise and merge the results */
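+        /* A byte of q0/q1 is zero iff the corresponding source bytes are
+         * equal; OR everything down into d4 and then into r3/ip so a
+         * single flag-setting OR detects a mismatch anywhere in the
+         * 32-byte chunk.
+         */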
+        vsub.i8     q0, q2
+        vsub.i8     q1, q3
+        vorr        q2, q0, q1
+        vorr        d4, d5
+        vmov        r3, ip, d4
+        /* Check if there are any differences among the 32 bytes */
+        orrs        r3, ip
+        bne         2f
+        subs        r2, r2, #32
+        bhs         1b
+        b           3f
+2:
+        /* Check if the difference was in the first or last 16 bytes */
+        sub         r0, #32
+        vorr        d0, d1
+        sub         r1, #32
+        vmov        r3, ip, d0
+        orrs        r3, ip
+        /* if the first 16 bytes are equal, we only have to rewind 16 bytes */
+        ittt        eq
+        subeq       r2, #16
+        addeq       r0, #16
+        addeq       r1, #16
+
+3:      /* fix-up the remaining count */
+        add         r2, r2, #32
+
+        cmp        r2, #(8+4)
+        bmi        10f
+#endif
+
+        /* save registers */
+        stmfd       sp!, {r4, lr}
+        .cfi_def_cfa_offset 8
+        .cfi_rel_offset r4, 0
+        .cfi_rel_offset lr, 4
+
+        /* since r0 holds the result, move the first source
+         * pointer somewhere else
+         */
+        mov         r4, r0
+
+        /* align first pointer to word boundary
+         * offset = -src & 3
+         */
+        rsb         r3, r4, #0
+        ands        r3, r3, #3
+        beq         0f
+
+        /* align first pointer  */
+        sub         r2, r2, r3
+1:      ldrb        r0, [r4], #1
+        ldrb        ip, [r1], #1
+        subs        r0, r0, ip
+        bne         9f
+        subs        r3, r3, #1
+        bne         1b
+
+
+0:      /* here the first pointer is aligned, and we have at least 4 bytes
+         * to process.
+         */
+
+        /* see if the pointers are congruent */
+        eor         r0, r4, r1
+        ands        r0, r0, #3
+        bne         5f
+
+        /* congruent case, 32 bytes per iteration
+         * We need to make sure there are at least 32+4 bytes left
+         * because we effectively read ahead one word, and we could
+         * read past the buffer (and segfault) if we're not careful.
+         */
+
+        ldr         ip, [r1]
+        subs        r2, r2, #(32 + 4)
+        bmi         1f
+
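+        /* Unrolled 8 times: ip and lr alternate as the one-word
+         * read-ahead of the second buffer, so each first-buffer word is
+         * compared against the second-buffer word loaded on the previous
+         * step.
+         */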
+0:      pld         [r4, #(CACHE_LINE_SIZE * 2)]
+        pld         [r1, #(CACHE_LINE_SIZE * 2)]
+        ldr         r0, [r4], #4
+        ldr         lr, [r1, #4]!
+        eors        r0, r0, ip
+        ldreq       r0, [r4], #4
+        ldreq       ip, [r1, #4]!
+        eoreqs      r0, r0, lr
+        ldreq       r0, [r4], #4
+        ldreq       lr, [r1, #4]!
+        eoreqs      r0, r0, ip
+        ldreq       r0, [r4], #4
+        ldreq       ip, [r1, #4]!
+        eoreqs      r0, r0, lr
+        ldreq       r0, [r4], #4
+        ldreq       lr, [r1, #4]!
+        eoreqs      r0, r0, ip
+        ldreq       r0, [r4], #4
+        ldreq       ip, [r1, #4]!
+        eoreqs      r0, r0, lr
+        ldreq       r0, [r4], #4
+        ldreq       lr, [r1, #4]!
+        eoreqs      r0, r0, ip
+        ldreq       r0, [r4], #4
+        ldreq       ip, [r1, #4]!
+        eoreqs      r0, r0, lr
+        bne         2f
+        subs        r2, r2, #32
+        bhs         0b
+
+        /* do we have at least 4 bytes left? */
+1:      adds        r2, r2, #(32 - 4 + 4)
+        bmi         4f
+
+        /* finish off 4 bytes at a time */
+3:      ldr         r0, [r4], #4
+        ldr         ip, [r1], #4
+        eors        r0, r0, ip
+        bne         2f
+        subs        r2, r2, #4
+        bhs         3b
+
+        /* are we done? */
+4:      adds        r2, r2, #4
+        moveq       r0, #0
+        beq         9f
+
+        /* finish off the remaining bytes */
+        b           8f
+
+2:      /* the last 4 bytes compared differ, redo them byte by byte */
+        sub         r4, r4, #4
+        sub         r1, r1, #4
+        mov         r2, #4
+
+        /* process the last few bytes */
+8:      ldrb        r0, [r4], #1
+        ldrb        ip, [r1], #1
+        // stall
+        subs        r0, r0, ip
+        bne         9f
+        subs        r2, r2, #1
+        bne         8b
+
+9:      /* restore registers and return */
+        ldmfd       sp!, {r4, lr}
+        bx          lr
+
+10:     /* process fewer than 12 bytes */
+        cmp         r2, #0
+        moveq       r0, #0
+        bxeq        lr
+        mov         r3, r0
+11:
+        ldrb        r0, [r3], #1
+        ldrb        ip, [r1], #1
+        subs        r0, ip
+        bxne        lr
+        subs        r2, r2, #1
+        bne         11b
+        bx          lr
+
+5:      /*************** non-congruent case ***************/
+        and         r0, r1, #3
+        cmp         r0, #2
+        bne         4f
+
+        /* here, the offset is 2 (16-bit aligned, special-cased) */
+
+        /* make sure we have at least 16 bytes to process */
+        subs        r2, r2, #16
+        addmi       r2, r2, #16
+        bmi         8b
+
+        /* align the unaligned pointer */
+        bic         r1, r1, #3
+        ldr         lr, [r1], #4
+
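+        /* r1 was aligned down by 2, so (little-endian) the top halfword
+         * of each aligned load pairs with the bottom halfword of the next
+         * one to rebuild the unaligned 32-bit value compared in ip.
+         */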
+6:      pld         [r1, #(CACHE_LINE_SIZE * 2)]
+        pld         [r4, #(CACHE_LINE_SIZE * 2)]
+        mov         ip, lr, lsr #16
+        ldr         lr, [r1], #4
+        ldr         r0, [r4], #4
+        orr         ip, ip, lr, lsl #16
+        eors        r0, r0, ip
+        moveq       ip, lr, lsr #16
+        ldreq       lr, [r1], #4
+        ldreq       r0, [r4], #4
+        orreq       ip, ip, lr, lsl #16
+        eoreqs      r0, r0, ip
+        moveq       ip, lr, lsr #16
+        ldreq       lr, [r1], #4
+        ldreq       r0, [r4], #4
+        orreq       ip, ip, lr, lsl #16
+        eoreqs      r0, r0, ip
+        moveq       ip, lr, lsr #16
+        ldreq       lr, [r1], #4
+        ldreq       r0, [r4], #4
+        orreq       ip, ip, lr, lsl #16
+        eoreqs      r0, r0, ip
+        bne         7f
+        subs        r2, r2, #16
+        bhs         6b
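+        /* r1 ran 2 bytes past the logical position of the second buffer
+         * because of the aligned read-ahead; bring it back before the
+         * byte-by-byte tail.
+         */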
+        sub         r1, r1, #2
+        /* are we done? */
+        adds        r2, r2, #16
+        moveq       r0, #0
+        beq         9b
+        /* finish off the remaining bytes */
+        b           8b
+
+7:      /* fix up the two pointers and fall through... */
+        sub         r1, r1, #(4+2)
+        sub         r4, r4, #4
+        mov         r2, #4
+        b           8b
+
+
+4:      /*************** offset is 1 or 3 (less optimized) ***************/
+
+        stmfd       sp!, {r5, r6, r7}
+
+        // r5 = right shift, in bits
+        // r6 = left shift, in bits
+        // r7 = scratch (last second-buffer word loaded)
+
+        mov         r5, r0, lsl #3      /* r5 = right shift */
+        rsb         r6, r5, #32         /* r6 = left shift */
+
+        /* align the unaligned pointer */
+        bic         r1, r1, #3
+        ldr         r7, [r1], #4
+        sub         r2, r2, #8
+
+6:      mov         ip, r7, lsr r5
+        ldr         r7, [r1], #4
+        ldr         r0, [r4], #4
+        orr         ip, ip, r7, lsl r6
+        eors        r0, r0, ip
+        moveq       ip, r7, lsr r5
+        ldreq       r7, [r1], #4
+        ldreq       r0, [r4], #4
+        orreq       ip, ip, r7, lsl r6
+        eoreqs      r0, r0, ip
+        bne         7f
+        subs        r2, r2, #8
+        bhs         6b
+
+        sub         r1, r1, r6, lsr #3
+        ldmfd       sp!, {r5, r6, r7}
+
+        /* are we done? */
+        adds        r2, r2, #8
+        moveq       r0, #0
+        beq         9b
+
+        /* finish off the remaining bytes */
+        b           8b
+
+7:      /* fix up the two pointers and fall through... */
+        sub         r1, r1, #4
+        sub         r1, r1, r6, lsr #3
+        sub         r4, r4, #4
+        mov         r2, #4
+        ldmfd       sp!, {r5, r6, r7}
+        b           8b
+END(memcmp)
diff --git a/libc/arch-arm/generic/generic.mk b/libc/arch-arm/generic/generic.mk
index 95be867..96ed949 100644
--- a/libc/arch-arm/generic/generic.mk
+++ b/libc/arch-arm/generic/generic.mk
@@ -1,4 +1,5 @@
 libc_bionic_src_files_arm += \
+    arch-arm/generic/bionic/memcmp.S \
     arch-arm/generic/bionic/memcpy.S \
     arch-arm/generic/bionic/memset.S \
     arch-arm/generic/bionic/strcmp.S \