|  | /* | 
|  | * Copyright (C) 2008 The Android Open Source Project | 
|  | * All rights reserved. | 
|  | * | 
|  | * Redistribution and use in source and binary forms, with or without | 
|  | * modification, are permitted provided that the following conditions | 
|  | * are met: | 
|  | *  * Redistributions of source code must retain the above copyright | 
|  | *    notice, this list of conditions and the following disclaimer. | 
|  | *  * Redistributions in binary form must reproduce the above copyright | 
|  | *    notice, this list of conditions and the following disclaimer in | 
|  | *    the documentation and/or other materials provided with the | 
|  | *    distribution. | 
|  | * | 
|  | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | 
|  | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | 
|  | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS | 
|  | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE | 
|  | * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, | 
|  | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, | 
|  | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS | 
|  | * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED | 
|  | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | 
|  | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT | 
|  | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | 
|  | * SUCH DAMAGE. | 
|  | */ | 
|  |  | 
|  | #include <machine/cpu-features.h> | 
|  | #include <machine/asm.h> | 
|  |  | 
/*
 * Optimized memcmp() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD will be needed.
 * The 2 major optimizations here are
 * (1) The main (word-congruent) loop compares 32 bytes per iteration
 * (2) The loads are scheduled in a way they won't stall
 */
|  |  | 
/*
 * int memcmp(const void *lhs, const void *rhs, size_t n)
 *
 * In:   r0 = lhs, r1 = rhs, r2 = n (number of bytes to compare)
 * Out:  r0 = 0 when lhs == rhs, when n == 0, or when all n bytes match;
 *            otherwise (lhs byte - rhs byte) for the first differing
 *            position, both bytes being zero-extended by ldrb.
 *
 * Register roles inside the routine:
 *   r4      first-buffer cursor (r0 is needed for the result)
 *   r1      second-buffer cursor
 *   r2      bytes remaining (biased by each loop's look-ahead, see below)
 *   r3, ip  scratch
 *   lr      used as a data register in the word loops, hence saved
 *   r5-r7   shift amounts / previous word, offset 1 or 3 path only
 *
 * The word loops only detect *that* a word differs; they then rewind both
 * cursors by one word and let the byte loop at 8: compute the result.
 *
 * The shift-and-merge realignment in the non-congruent paths (lsr on the
 * older word, lsl on the newer one) matches little-endian byte order.
 */
ENTRY(memcmp)
        PLD         (r0, #0)
        PLD         (r1, #0)

        /* take care of the case where length is 0 or the buffers are the same */
        cmp         r0, r1
        cmpne       r2, #0
        moveq       r0, #0
        bxeq        lr

        .save {r4, lr}
        /* save registers (lr is reused as a scratch data register below) */
        stmfd       sp!, {r4, lr}

        PLD         (r0, #32)
        PLD         (r1, #32)

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */

        mov         r4, r0

        /* make sure we have at least 8+4 bytes, this simplifies things below
         * and avoids some overhead for small blocks
         */
        cmp         r2, #(8+4)
        bmi         8f

        /* align first pointer to word boundary
         * offset = -src & 3
         */
        rsb         r3, r4, #0
        ands        r3, r3, #3
        beq         0f

        /* align first pointer: compare r3 (1..3) leading bytes one at a time */
        sub         r2, r2, r3
1:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        subs        r0, r0, ip
        bne         9f                  /* r0 already holds the byte difference */
        subs        r3, r3, #1
        bne         1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent (same alignment modulo 4) */
        eor         r0, r4, r1
        ands        r0, r0, #3
        bne         5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr         ip, [r1]            /* prime the look-ahead word */
        subs        r2, r2, #(32 + 4)
        bmi         1f

        /* Each step loads the next rhs word (pre-indexed, alternating between
         * lr and ip) before comparing the previously loaded one, so r1 always
         * points at the look-ahead word while r4 points past the compared word.
         * The conditional (eq) forms stop doing work after the first mismatch.
         */
0:      PLD         (r4, #64)
        PLD         (r1, #64)
        ldr         r0, [r4], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f
        subs        r2, r2, #32
        bhs         0b

        /* do we have at least 4 bytes left?
         * (r2 was biased by -(32+4); adding 32 leaves it at remaining - 4)
         */
1:      adds        r2, r2, #(32 - 4 + 4)
        bmi         4f

        /* finish off 4 bytes at a time */
3:      ldr         r0, [r4], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #4
        bhs         3b

        /* are we done? (undo the -4 bias; zero means nothing is left) */
4:      adds        r2, r2, #4
        moveq       r0, #0
        beq         9f

        /* finish off the remaining bytes */
        b           8f

2:      /* the last 4 bytes are different, restart them byte by byte */
        sub         r4, r4, #4
        sub         r1, r1, #4
        mov         r2, #4

        /* process the last few bytes; r2 is non-zero on entry */
8:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        /* stall */
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return (r0 = result) */
        ldmfd       sp!, {r4, lr}
        bx          lr


5:      /*************** non-congruent case ***************/
        /* r4 is word aligned here, so r0 = rhs misalignment: 1, 2 or 3 */
        and         r0, r1, #3
        cmp         r0, #2
        bne         4f

        /* here, offset is 2 (16-bits aligned, special cased) */

        /* make sure we have at least 16 bytes to process;
         * addmi does not touch the flags, so bmi still sees the subs result
         */
        subs        r2, r2, #16
        addmi       r2, r2, #16
        bmi         8b

        /* align the unaligned pointer and load the first aligned word */
        bic         r1, r1, #3
        ldr         lr, [r1], #4

        /* 16 bytes per iteration: each rhs word is rebuilt from the upper
         * half of the previous aligned word and the lower half of the next
         */
6:      PLD         (r1, #64)
        PLD         (r4, #64)
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #16
        bhs         6b
        /* r1 ran 2 bytes ahead of the real rhs position; undo that */
        sub         r1, r1, #2
        /* are we done? */
        adds        r2, r2, #16
        moveq       r0, #0
        beq         9b
        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers (back one word, plus the 2-byte look-ahead
         * on r1) and redo the differing word in the byte loop
         */
        sub         r1, r1, #(4+2)
        sub         r4, r4, #4
        mov         r2, #4
        b           8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd       sp!, {r5, r6, r7}

        /* r5 = right-shift amount in bits (8 * rhs misalignment)
         * r6 = left-shift amount in bits (32 - r5)
         * r7 = most recently loaded aligned rhs word
         */

        mov         r5, r0, lsl #3      /* r5 = right shift */
        rsb         r6, r5, #32         /* r6 = left shift */

        /* align the unaligned pointer and load the first aligned word */
        bic         r1, r1, #3
        ldr         r7, [r1], #4
        sub         r2, r2, #8          /* bias: loop handles 8 bytes per pass */

6:      mov         ip, r7, lsr r5
        ldr         r7, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, r7, lsl r6
        eors        r0, r0, ip
        moveq       ip, r7, lsr r5
        ldreq       r7, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, r7, lsl r6
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b

        /* r1 is (r6 / 8) bytes ahead of the real rhs position; undo that */
        sub         r1, r1, r6, lsr #3
        ldmfd       sp!, {r5, r6, r7}

        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b

        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers (back one word, plus the look-ahead on r1),
         * pop the extra registers and redo the differing word in the byte loop
         */
        sub         r1, r1, #4
        sub         r1, r1, r6, lsr #3
        sub         r4, r4, #4
        mov         r2, #4
        ldmfd       sp!, {r5, r6, r7}
        b           8b

        /* END is placed after the last instruction so that the non-congruent
         * paths above, which are part of memcmp, fall inside the region this
         * macro closes (presumably the symbol size / unwind region; see
         * <machine/asm.h> for what it actually emits).
         */
END(memcmp)