/***************************************************************************
 Copyright (c) 2009-2013 The Linux Foundation. All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
     * Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
     * Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
     * Neither the name of The Linux Foundation nor the names of its contributors may
       be used to endorse or promote products derived from this software
       without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 ***************************************************************************/

/* Assumes neon instructions and a cache line size of 64 bytes. */

/*
 * Preload tuning constants for the copy routine in this file.
 * Block counts are in units of 64 bytes (the routine derives them with
 * "lsr #6").
 */

/* Look-ahead distance, in PLDSIZE-byte lines, between the current source
 * position and the address that is preloaded / touched. */
#define PLDOFFS         (10)

/* A copy of at most this many 64-byte blocks goes straight to
 * .L_neon_copy_64_loop_nopld and issues no preloads. */
#define PLDTHRESH       (PLDOFFS)

/* A copy of at most this many 64-byte blocks (4096 bytes) always uses the
 * fixed look-ahead set up at .L_neon_prime_pump; only longer copies compute
 * a look-ahead from the source and destination addresses. */
#define BBTHRESH        (4096/64)

/* Bytes covered by one preload step; equals the 64-byte cache line size
 * this file assumes. */
#define PLDSIZE         (64)

/* The routine reads at [r10, #(-1*PLDSIZE)] and preloads at
 * (PLDOFFS-1)*PLDSIZE, so a look-ahead below one line is rejected. */
#if (PLDOFFS < 1)
#error Routine does not support offsets less than 1
#endif

/* .L_neon_prime_pump subtracts PLDOFFS from a block count that is only known
 * to exceed PLDTHRESH; a smaller threshold would let that count reach zero
 * or below. */
#if (PLDTHRESH < PLDOFFS)
#error PLD threshold must be greater than or equal to the PLD offset
#endif

        .text
        .fpu    neon

/*
 * .L_memcpy_base -- forward copy of r2 bytes from [r1] to [r0] using NEON
 * loads and stores, with source preloading for larger sizes.
 *
 * Register use, as read from the code below:
 *   r0        destination pointer, post-incremented by every store
 *   r1        source pointer, post-incremented by every load
 *   r2        byte count; reduced to the sub-64 / sub-16 remainder on the way
 *   r3        scratch (shift results, read-ahead loads, halfword/byte moves)
 *   r12       number of 64-byte blocks left for the loop currently running
 *   lr        scratch: number of bytes held back for the no-preload loop
 *   r9, r10   pushed before use and popped afterwards;
 *             r9  = iteration count of the double-preload loop,
 *             r10 = 64-byte-aligned read-ahead address in the source
 *   q0-q3, q8-q11  data registers
 *
 * Exit is "pop {r0, pc}", so two words must already be on top of the stack
 * when this code is entered -- presumably the caller's original r0 and the
 * return address pushed by the including entry point (TODO confirm). lr is
 * overwritten as scratch, so it cannot be used to return.
 *
 * NOTE(review): the size tests use signed conditions (blt/ble); a count with
 * bit 31 set takes the .L_neon_lt4 path.
 */
.L_memcpy_base:
        cmp     r2, #4
        blt     .L_neon_lt4
        cmp     r2, #16
        blt     .L_neon_lt16
        cmp     r2, #32
        blt     .L_neon_16              /* N is set here, so the bpl at .L_neon_16 falls through */
        cmp     r2, #64
        blt     .L_neon_copy_32_a

        mov     r12, r2, lsr #6         /* r12 = number of whole 64-byte blocks (>= 1) */
        cmp     r12, #PLDTHRESH
        ble     .L_neon_copy_64_loop_nopld

        push    {r9, r10}
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset r9, 0
        .cfi_rel_offset r10, 4

        cmp     r12, #BBTHRESH
        ble     .L_neon_prime_pump

        /*
         * lr = ((dst + 0x400 - (src + PLDOFFS*PLDSIZE)) mod 2048)
         *      + PLDOFFS*PLDSIZE
         * The lsl/lsr #21 pair keeps the low 11 bits. lr is used below as the
         * read-ahead distance in bytes and as the amount held back for the
         * no-preload loop. Presumably this places the read-ahead address
         * relative to the destination address -- intent not stated in the
         * source, confirm.
         */
        add     lr, r0, #0x400
        add     r9, r1, #(PLDOFFS*PLDSIZE)
        sub     lr, lr, r9
        lsl     lr, lr, #21
        lsr     lr, lr, #21
        add     lr, lr, #(PLDOFFS*PLDSIZE)
        cmp     r12, lr, lsr #6
        ble     .L_neon_prime_pump      /* copy is not longer than the distance: fixed look-ahead */

        /* GT holds here (the ble above fell through), so both run:
         * r9 = lr/64 - PLDOFFS, with flags set for the test that follows. */
        itt     gt
        movgt   r9, #(PLDOFFS)
        rsbsgt  r9, r9, lr, lsr #6
        ble     .L_neon_prime_pump

        add     r10, r1, lr
        bic     r10, #0x3F              /* r10 = (src + lr) rounded down to a 64-byte boundary */

        sub     r12, r12, lr, lsr #6    /* hold lr/64 blocks back for the no-preload loop */

        /* Split the remaining blocks: the first loop gets min(r9, r12)
         * iterations, r12 keeps whatever is left for the second loop. */
        cmp     r9, r12
        itee    le
        suble   r12, r12, r9
        movgt   r9, r12
        movgt   r12, #0

        pld     [r1, #((PLDOFFS-1)*PLDSIZE)]
.L_neon_copy_64_loop_outer_doublepld:
        /* Per 64-byte block: a pld at the fixed offset from r1 plus a load
         * from the read-ahead address r10 (the loaded value is never used). */
        pld     [r1, #((PLDOFFS)*PLDSIZE)]
        vld1.32 {q0, q1}, [r1]!
        vld1.32 {q2, q3}, [r1]!
        ldr     r3, [r10]
        subs    r9, r9, #1              /* sets the flags for the bne; nothing in between changes them */
        vst1.32 {q0, q1}, [r0]!
        vst1.32 {q2, q3}, [r0]!
        add     r10, #64
        bne     .L_neon_copy_64_loop_outer_doublepld
        cmp     r12, #0
        beq     .L_neon_pop_before_nopld

        /* Fewer than 512 KiB worth of blocks left: use the loop that reads
         * from r10; otherwise use the loop that issues pld on r10. */
        cmp     r12, #(512*1024/64)
        blt     .L_neon_copy_64_loop_outer

.L_neon_copy_64_loop_ddr:
        vld1.32 {q0, q1}, [r1]!
        vld1.32 {q2, q3}, [r1]!
        pld     [r10]
        subs    r12, r12, #1
        vst1.32 {q0, q1}, [r0]!
        vst1.32 {q2, q3}, [r0]!
        add     r10, #64
        bne     .L_neon_copy_64_loop_ddr
        b       .L_neon_pop_before_nopld

.L_neon_prime_pump:
        /* Fixed look-ahead: hold PLDOFFS blocks back for the no-preload loop
         * and read ahead PLDOFFS*PLDSIZE bytes from the source. */
        mov     lr, #(PLDOFFS*PLDSIZE)
        add     r10, r1, #(PLDOFFS*PLDSIZE)
        bic     r10, #0x3F
        sub     r12, r12, #PLDOFFS      /* r12 > PLDTHRESH >= PLDOFFS on every path here, so >= 1 remains */
        ldr     r3, [r10, #(-1*PLDSIZE)]        /* touch the line just before r10; value unused */

.L_neon_copy_64_loop_outer:
        vld1.32 {q0, q1}, [r1]!
        vld1.32 {q2, q3}, [r1]!
        ldr     r3, [r10]               /* read-ahead load; value unused */
        subs    r12, r12, #1
        vst1.32 {q0, q1}, [r0]!
        vst1.32 {q2, q3}, [r0]!
        add     r10, #64
        bne     .L_neon_copy_64_loop_outer

.L_neon_pop_before_nopld:
        mov     r12, lr, lsr #6         /* r12 = number of blocks that were held back */
        pop     {r9, r10}
        .cfi_adjust_cfa_offset -8
        .cfi_restore r9
        .cfi_restore r10

.L_neon_copy_64_loop_nopld:
        /* Plain 64-byte block copy, no preloads; r12 >= 1 on entry. */
        vld1.32 {q8, q9}, [r1]!
        vld1.32 {q10, q11}, [r1]!
        subs    r12, r12, #1
        vst1.32 {q8, q9}, [r0]!
        vst1.32 {q10, q11}, [r0]!
        bne     .L_neon_copy_64_loop_nopld
        ands    r2, r2, #0x3f           /* r2 = bytes left after the whole blocks */
        beq     .L_neon_exit

.L_neon_copy_32_a:
        movs    r3, r2, lsl #27         /* C = bit 5 of r2 (32), N = bit 4 of r2 (16) */
        bcc     .L_neon_16
        vld1.32 {q0,q1}, [r1]!
        vst1.32 {q0,q1}, [r0]!

.L_neon_16:
        /* N comes from the movs above, or from "cmp r2, #32" when entered
         * from the top; N clear means there is no 16-byte piece to copy. */
        bpl     .L_neon_lt16
        vld1.32 {q8}, [r1]!
        vst1.32 {q8}, [r0]!
        ands    r2, r2, #0x0f
        beq     .L_neon_exit

.L_neon_lt16:
        movs    r3, r2, lsl #29         /* C = bit 3 of r2 (8), N = bit 2 of r2 (4) */
        bcc     1f
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0]!
1:
        /* bge is taken when N == V. The movs above does not write V, so this
         * relies on V being clear from the earlier cmp/subs; with V clear it
         * skips the 4-byte copy when bit 2 of r2 is clear. */
        bge     .L_neon_lt4
        /* 4 bytes: one byte into lane 0 of each of d0-d3, stored back in the same order. */
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0]!

.L_neon_lt4:
        movs    r2, r2, lsl #31         /* C = bit 1 of r2 (2), N = bit 0 of r2 (1) */
        itt     cs
        ldrhcs  r3, [r1], #2
        strhcs  r3, [r0], #2
        itt     mi
        ldrbmi  r3, [r1]
        strbmi  r3, [r0]

.L_neon_exit:
        /* Reload r0 and return using the two words that were on the stack at entry. */
        pop     {r0, pc}