/***************************************************************************
 Copyright (c) 2009-2013 The Linux Foundation. All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
     * Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
     * Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
     * Neither the name of The Linux Foundation nor the names of its contributors may
       be used to endorse or promote products derived from this software
       without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 ***************************************************************************/

/* Assumes neon instructions and a cache line size of 64 bytes. */

/*
 * Preload tuning. Block counts are in units of PLDSIZE (64) bytes.
 *   PLDOFFS   - number of blocks ahead of the read pointer that are preloaded;
 *               PLDOFFS*PLDSIZE is the base preload distance in bytes.
 *   PLDTHRESH - copies of at most this many whole blocks go straight to the
 *               loop that issues no preloads.
 *   BBTHRESH  - copies of at most this many whole blocks (4096 bytes) use the
 *               fixed PLDOFFS distance and skip the address-based adjustment.
 *   PLDSIZE   - bytes per block (one 64-byte cache line, per the assumption
 *               stated above).
 */
#define PLDOFFS (10)
#define PLDTHRESH (PLDOFFS)
#define BBTHRESH (4096/64)
#define PLDSIZE (64)

#if (PLDOFFS < 1)
#error Routine does not support offsets less than 1
#endif

#if (PLDTHRESH < PLDOFFS)
#error PLD threshold must be greater than or equal to the PLD offset
#endif

        .text
        .syntax unified
        .fpu neon

        // To avoid warning about deprecated instructions, add an explicit
        // arch. The code generated is exactly the same.
        .arch armv7-a

/*
 * .L_memcpy_base - forward copy of r2 bytes from [r1] to [r0] using NEON.
 *
 * This is a code fragment rather than a standalone function: it finishes
 * with "pop {r0, pc}", so the code that enters here must already have pushed
 * two words whose second is the return address.
 * NOTE(review): the entry sequence is not visible here; presumably it is
 * "push {r0, lr}" in the including file - confirm. lr is used below as a
 * scratch register, which relies on that.
 *
 * In:      r0 = destination, r1 = source, r2 = byte count
 * Scratch: r3, r12, lr, flags, q0-q3, q8-q11
 * Saved:   r9, r10 (pushed/popped around the preloading loops only)
 *
 * Layout:
 *   1. Counts below 64 jump directly into the tail code.
 *   2. Otherwise r12 = count / 64 whole blocks are copied 64 bytes at a time.
 *      Up to PLDTHRESH blocks use the plain loop. Larger copies run one or
 *      more preloading loops first and leave the last (lr >> 6) blocks to
 *      the plain loop.
 *   3. The remaining count & 63 bytes are copied by testing bits 5..0 of r2,
 *      moving each bit into C or N with a flag-setting shift.
 */
.L_memcpy_base:
        /*
         * Size dispatch. NOTE(review): blt is a signed comparison, so a
         * count with bit 31 set takes the first branch to .L_neon_lt4.
         */
        cmp         r2, #4
        blt         .L_neon_lt4
        cmp         r2, #16
        blt         .L_neon_lt16
        cmp         r2, #32
        /* Taken with N set (r2 - 32 < 0), so the bpl at .L_neon_16 falls through. */
        blt         .L_neon_16
        cmp         r2, #64
        blt         .L_neon_copy_32_a

        /* r12 = number of whole 64-byte blocks (at least 1 here). */
        mov         r12, r2, lsr #6
        cmp         r12, #PLDTHRESH
        ble         .L_neon_copy_64_loop_nopld

        push        {r9, r10}
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset r9, 0
        .cfi_rel_offset r10, 4

        cmp         r12, #BBTHRESH
        ble         .L_neon_prime_pump

        /*
         * Preload distance in bytes, derived from the relative position of
         * the two buffers:
         *   lr = (((r0 + 0x400) - (r1 + PLDOFFS*PLDSIZE)) & 0x7ff)
         *        + PLDOFFS*PLDSIZE
         * The lsl/lsr pair keeps only the low 11 bits of the difference.
         */
        add         lr, r0, #0x400
        add         r9, r1, #(PLDOFFS*PLDSIZE)
        sub         lr, lr, r9
        lsl         lr, lr, #21
        lsr         lr, lr, #21
        add         lr, lr, #(PLDOFFS*PLDSIZE)
        /* Not more blocks than the distance covers: use the fixed distance. */
        cmp         r12, lr, lsr #6
        ble         .L_neon_prime_pump

        /*
         * The ble above was not taken, so gt holds and both instructions
         * execute: r9 = (lr >> 6) - PLDOFFS, with flags set from the result.
         */
        itt         gt
        movgt       r9, #(PLDOFFS)
        rsbsgt      r9, r9, lr, lsr #6
        ble         .L_neon_prime_pump

        /* r10 = preload address: (source + distance) rounded down to 64. */
        add         r10, r1, lr
        bic         r10, #0x3F

        /*
         * The final (lr >> 6) blocks are left for the loop without preloads;
         * r12 = blocks to be copied by the preloading loops.
         */
        sub         r12, r12, lr, lsr #6

        /*
         * Split r12 between the double-preload loop (r9 iterations) and the
         * loops that follow it:
         *   r9 <= r12:  r12 -= r9
         *   r9 >  r12:  r9 = r12, r12 = 0
         */
        cmp         r9, r12
        itee        le
        suble       r12, r12, r9
        movgt       r9, r12
        movgt       r12, #0

        /* Preload the line just before the first one the loop asks for. */
        pld         [r1, #((PLDOFFS-1)*PLDSIZE)]
.L_neon_copy_64_loop_outer_doublepld:
        /* Per block: pld at the fixed distance plus a load at r10. */
        pld         [r1, #((PLDOFFS)*PLDSIZE)]
        vld1.32     {q0, q1}, [r1]!
        vld1.32     {q2, q3}, [r1]!
        /* The loaded value in r3 is never read; only the access matters. */
        ldr         r3, [r10]
        subs        r9, r9, #1
        vst1.32     {q0, q1}, [r0]!
        vst1.32     {q2, q3}, [r0]!
        add         r10, #64
        /* add (without s) and the NEON transfers keep the flags from subs. */
        bne         .L_neon_copy_64_loop_outer_doublepld
        cmp         r12, #0
        beq         .L_neon_pop_before_nopld

        /* Fewer than 512 KiB worth of blocks left: use the ldr-based loop. */
        cmp         r12, #(512*1024/64)
        blt         .L_neon_copy_64_loop_outer

.L_neon_copy_64_loop_ddr:
        /* Same copy step, but r10 is touched with pld instead of ldr. */
        vld1.32     {q0, q1}, [r1]!
        vld1.32     {q2, q3}, [r1]!
        pld         [r10]
        subs        r12, r12, #1
        vst1.32     {q0, q1}, [r0]!
        vst1.32     {q2, q3}, [r0]!
        add         r10, #64
        bne         .L_neon_copy_64_loop_ddr
        b           .L_neon_pop_before_nopld

.L_neon_prime_pump:
        /*
         * Fixed preload distance of PLDOFFS blocks: lr = distance in bytes,
         * r10 = (source + distance) rounded down to 64, and the last PLDOFFS
         * blocks are left for the loop without preloads. r12 > PLDTHRESH >=
         * PLDOFFS on every path here, so r12 stays at least 1.
         */
        mov         lr, #(PLDOFFS*PLDSIZE)
        add         r10, r1, #(PLDOFFS*PLDSIZE)
        bic         r10, #0x3F
        sub         r12, r12, #PLDOFFS
        /* Touch the line before r10 first; the loop starts at r10 itself. */
        ldr         r3, [r10, #(-1*PLDSIZE)]

.L_neon_copy_64_loop_outer:
        /* Copy one block and load from r10; the value in r3 is never read. */
        vld1.32     {q0, q1}, [r1]!
        vld1.32     {q2, q3}, [r1]!
        ldr         r3, [r10]
        subs        r12, r12, #1
        vst1.32     {q0, q1}, [r0]!
        vst1.32     {q2, q3}, [r0]!
        add         r10, #64
        bne         .L_neon_copy_64_loop_outer

.L_neon_pop_before_nopld:
        /* r12 = blocks that were held back: the preload distance in blocks. */
        mov         r12, lr, lsr #6
        pop         {r9, r10}
        .cfi_adjust_cfa_offset -8
        .cfi_restore r9
        .cfi_restore r10

.L_neon_copy_64_loop_nopld:
        /* Plain 64-byte copy loop, r12 iterations, no preloads. */
        vld1.32     {q8, q9}, [r1]!
        vld1.32     {q10, q11}, [r1]!
        subs        r12, r12, #1
        vst1.32     {q8, q9}, [r0]!
        vst1.32     {q10, q11}, [r0]!
        bne         .L_neon_copy_64_loop_nopld
        /* r2 = bytes left after the whole blocks (0..63). */
        ands        r2, r2, #0x3f
        beq         .L_neon_exit

.L_neon_copy_32_a:
        /* lsl #27: C = bit 5 of r2 (32 bytes), N = bit 4 of r2 (16 bytes). */
        movs        r3, r2, lsl #27
        bcc         .L_neon_16
        vld1.32     {q0,q1}, [r1]!
        vst1.32     {q0,q1}, [r0]!

.L_neon_16:
        /*
         * N comes either from the movs above or from "cmp r2, #32" in the
         * size dispatch; N clear means there is no 16-byte piece to copy.
         */
        bpl         .L_neon_lt16
        vld1.32     {q8}, [r1]!
        vst1.32     {q8}, [r0]!
        ands        r2, r2, #0x0f
        beq         .L_neon_exit

.L_neon_lt16:
        /* lsl #29: C = bit 3 of r2 (8 bytes), N = bit 2 of r2 (4 bytes). */
        movs        r3, r2, lsl #29
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:
        /*
         * Test N only. movs with a shifted operand does not write V, so a
         * condition that involves V would depend on whatever instruction
         * last set it. This also matches the test used at .L_neon_16.
         */
        bpl         .L_neon_lt4
        /* Single-lane vld4/vst4: moves 4 consecutive bytes via d0..d3 lane 0. */
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!

.L_neon_lt4:
        /* lsl #31: C = bit 1 of r2 (2 bytes), N = bit 0 of r2 (1 byte). */
        movs        r2, r2, lsl #31
        itt         cs
        ldrhcs      r3, [r1], #2
        strhcs      r3, [r0], #2
        itt         mi
        ldrbmi      r3, [r1]
        strbmi      r3, [r0]

.L_neon_exit:
        /* Pops the two words pushed before entry: r0 and the return address. */
        pop         {r0, pc}