/***************************************************************************
Copyright (c) 2009-2013 The Linux Foundation. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of The Linux Foundation nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***************************************************************************/

/* Assumes neon instructions and a cache line size of 64 bytes. */

/*
 * Tuning constants for the copy routine in this file.
 * The routine counts work in 64-byte blocks (byte count >> 6).
 */
#define PLDSIZE   (64)         /* byte step used for prefetch offsets: PLDOFFS*PLDSIZE */
#define PLDOFFS   (10)         /* prefetch distance, in PLDSIZE units ahead of the source pointer */
#define PLDTHRESH (PLDOFFS)    /* copies of at most this many blocks take the loop without prefetch */
#define BBTHRESH  (4096/64)    /* copies of at most this many blocks use the fixed PLDOFFS distance */

/* Build-time sanity checks on the constants above. */
#if PLDOFFS < 1
#error Routine does not support offsets less than 1
#endif

#if PLDTHRESH < PLDOFFS
#error PLD threshold must be greater than or equal to the PLD offset
#endif

	.text
	.syntax	unified
	.fpu	neon

/*
 * To avoid warning about deprecated instructions, add an explicit
 * arch. The code generated is exactly the same.
 */
	.arch	armv7-a

/*
 * .L_memcpy_base - NEON forward copy body.
 *
 * Only local (.L) labels are defined here, so this code is presumably
 * reached from an enclosing function defined elsewhere (confirm there).
 * The exit is "pop {r0, pc}": whoever enters must already have pushed
 * the value to be returned in r0 and the return address, in that order.
 * lr is used as scratch below, which relies on that saved copy.
 *
 * Register use, as read from the code:
 *   r0        destination pointer, post-incremented by every store
 *   r1        source pointer, post-incremented by every load
 *   r2        byte count (unsigned)
 *   r3        scratch: flag tests, touch loads, 1-2 byte tail
 *   r12       number of 64-byte blocks left for the current loop
 *   lr        prefetch distance in bytes; lr >> 6 blocks are left for
 *             the final loop that runs without prefetch
 *   r9        iteration count of the first prefetching loop
 *   r10       64-byte-aligned address that is prefetched / touched
 *   q0-q3, q8-q11 (d0-d3 for the tail)  data being copied
 * r9 and r10 are pushed and popped here; everything else is clobbered.
 */
.L_memcpy_base:
	/*
	 * Dispatch on size. The compares are unsigned (blo): a count with
	 * bit 31 set is a large length, not a negative one.
	 */
	cmp	r2, #4
	blo	.L_neon_lt4
	cmp	r2, #16
	blo	.L_neon_lt16
	cmp	r2, #32
	blo	.L_neon_16		// 16 <= r2 < 32: N is set by this cmp, see .L_neon_16
	cmp	r2, #64
	blo	.L_neon_copy_32_a

	mov	r12, r2, lsr #6		// r12 = whole 64-byte blocks to copy
	cmp	r12, #PLDTHRESH
	ble	.L_neon_copy_64_loop_nopld	// small copy: no prefetch, r9/r10 untouched

	push	{r9, r10}
	.cfi_adjust_cfa_offset 8
	.cfi_rel_offset r9, 0
	.cfi_rel_offset r10, 4

	cmp	r12, #BBTHRESH
	ble	.L_neon_prime_pump	// at most BBTHRESH blocks: fixed prefetch distance

	/*
	 * Compute a prefetch distance from the low bits of both pointers:
	 *   lr = ((r0 + 0x400 - (r1 + PLDOFFS*PLDSIZE)) & 0x7FF) + PLDOFFS*PLDSIZE
	 * The lsl/lsr pair keeps the low 11 bits of the difference.
	 */
	add	lr, r0, #0x400
	add	r9, r1, #(PLDOFFS*PLDSIZE)
	sub	lr, lr, r9
	lsl	lr, lr, #21
	lsr	lr, lr, #21
	add	lr, lr, #(PLDOFFS*PLDSIZE)
	cmp	r12, lr, lsr #6
	ble	.L_neon_prime_pump	// copy not longer than the distance: use fixed distance

	/* r9 = (lr >> 6) - PLDOFFS; fall back to the fixed distance if that is not positive. */
	itt	gt
	movgt	r9, #(PLDOFFS)
	rsbsgt	r9, r9, lr, lsr #6
	ble	.L_neon_prime_pump

	add	r10, r1, lr		// r10 = source + distance ...
	bic	r10, #0x3F		// ... rounded down to a 64-byte boundary

	sub	r12, r12, lr, lsr #6	// hold back lr >> 6 blocks for the no-prefetch loop

	/* Split the rest: r9 = min(r9, r12) blocks for the first loop, r12 = what is left. */
	cmp	r9, r12
	itee	le
	suble	r12, r12, r9
	movgt	r9, r12
	movgt	r12, #0

	pld	[r1, #((PLDOFFS-1)*PLDSIZE)]

	/* First loop (r9 blocks): pld near the source pointer plus a touch load at r10. */
.L_neon_copy_64_loop_outer_doublepld:
	pld	[r1, #((PLDOFFS)*PLDSIZE)]
	vld1.32	{q0, q1}, [r1]!
	vld1.32	{q2, q3}, [r1]!
	ldr	r3, [r10]		// value unused; the load itself pulls the line in
	subs	r9, r9, #1
	vst1.32	{q0, q1}, [r0]!
	vst1.32	{q2, q3}, [r0]!
	add	r10, #64		// add without S: flags from subs survive for bne
	bne	.L_neon_copy_64_loop_outer_doublepld
	cmp	r12, #0
	beq	.L_neon_pop_before_nopld

	cmp	r12, #(512*1024/64)	// fewer than 512 KiB worth of blocks left?
	blt	.L_neon_copy_64_loop_outer

	/* Remaining r12 blocks, large case: prefetch at r10 with pld instead of a load. */
.L_neon_copy_64_loop_ddr:
	vld1.32	{q0, q1}, [r1]!
	vld1.32	{q2, q3}, [r1]!
	pld	[r10]
	subs	r12, r12, #1
	vst1.32	{q0, q1}, [r0]!
	vst1.32	{q2, q3}, [r0]!
	add	r10, #64
	bne	.L_neon_copy_64_loop_ddr
	b	.L_neon_pop_before_nopld

	/*
	 * Fixed-distance setup: distance = PLDOFFS*PLDSIZE bytes, so the last
	 * PLDOFFS blocks are left for the no-prefetch loop. r12 > PLDTHRESH
	 * here, so at least one block remains for the loop below.
	 */
.L_neon_prime_pump:
	mov	lr, #(PLDOFFS*PLDSIZE)
	add	r10, r1, #(PLDOFFS*PLDSIZE)
	bic	r10, #0x3F
	sub	r12, r12, #PLDOFFS
	ldr	r3, [r10, #(-1*PLDSIZE)]	// touch the line just before r10

	/* Copy r12 blocks, touching one line at r10 per block. */
.L_neon_copy_64_loop_outer:
	vld1.32	{q0, q1}, [r1]!
	vld1.32	{q2, q3}, [r1]!
	ldr	r3, [r10]
	subs	r12, r12, #1
	vst1.32	{q0, q1}, [r0]!
	vst1.32	{q2, q3}, [r0]!
	add	r10, #64
	bne	.L_neon_copy_64_loop_outer

.L_neon_pop_before_nopld:
	mov	r12, lr, lsr #6		// blocks held back above
	pop	{r9, r10}
	.cfi_adjust_cfa_offset -8
	.cfi_restore r9
	.cfi_restore r10

	/* Copy r12 (>= 1) blocks of 64 bytes without any prefetch. */
.L_neon_copy_64_loop_nopld:
	vld1.32	{q8, q9}, [r1]!
	vld1.32	{q10, q11}, [r1]!
	subs	r12, r12, #1
	vst1.32	{q8, q9}, [r0]!
	vst1.32	{q10, q11}, [r0]!
	bne	.L_neon_copy_64_loop_nopld
	ands	r2, r2, #0x3f		// r2 = bytes left after the 64-byte blocks
	beq	.L_neon_exit

	/* Tail, 0-63 bytes: each movs shifts two count bits into C and N. */
.L_neon_copy_32_a:
	movs	r3, r2, lsl #27		// C = bit 5 (32 bytes), N = bit 4 (16 bytes)
	bcc	.L_neon_16
	vld1.32	{q0,q1}, [r1]!
	vst1.32	{q0,q1}, [r0]!

	/*
	 * Entered with N set when 16 bytes are due: from the movs above, or
	 * from "cmp r2, #32" with 16 <= r2 < 32 (negative result).
	 */
.L_neon_16:
	bpl	.L_neon_lt16
	vld1.32	{q8}, [r1]!
	vst1.32	{q8}, [r0]!
	ands	r2, r2, #0x0f
	beq	.L_neon_exit

.L_neon_lt16:
	movs	r3, r2, lsl #29		// C = bit 3 (8 bytes), N = bit 2 (4 bytes)
	bcc	1f
	vld1.8	{d0}, [r1]!
	vst1.8	{d0}, [r0]!
1:
	/*
	 * NEON loads/stores leave the flags alone, so N is still bit 2.
	 * bpl tests N only; movs does not write V, so a test involving V
	 * (bge) would depend on whatever earlier instruction last set it.
	 */
	bpl	.L_neon_lt4
	vld4.8	{d0[0], d1[0], d2[0], d3[0]}, [r1]!	// four single bytes
	vst4.8	{d0[0], d1[0], d2[0], d3[0]}, [r0]!

.L_neon_lt4:
	movs	r2, r2, lsl #31		// C = bit 1 (2 bytes), N = bit 0 (1 byte)
	itt	cs
	ldrhcs	r3, [r1], #2
	strhcs	r3, [r0], #2
	itt	mi
	ldrbmi	r3, [r1]
	strbmi	r3, [r0]

.L_neon_exit:
	pop	{r0, pc}		// restore saved r0 and return (see header note)