blob: 7368e63ce5c7346d41dae2081aac4a80736a89e1 [file] [log] [blame]
/***************************************************************************
 Copyright (c) 2009-2013 The Linux Foundation. All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
     * Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
     * Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
     * Neither the name of The Linux Foundation nor the names of its contributors may
       be used to endorse or promote products derived from this software
       without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 ***************************************************************************/

/* Assumes neon instructions and a cache line size of 64 bytes. */

#include <machine/asm.h>

/*
 * Look-ahead tuning for the 64-byte block loops in .L_memcpy_base.
 * Block counts below are in units of 64 bytes (r2 >> 6).
 */
#define PLDOFFS     (10)        /* look-ahead distance, in PLDSIZE-byte steps */
#define PLDTHRESH   (PLDOFFS)   /* block counts <= this use the loop with no look-ahead */
#define BBTHRESH    (4096/64)   /* block counts <= this use the fixed PLDOFFS*PLDSIZE look-ahead */
#define PLDSIZE     (64)        /* bytes per look-ahead step */

/* The look-ahead code issues an access at (PLDOFFS-1)*PLDSIZE, so 0 is not usable. */
#if (PLDOFFS < 1)
#error Routine does not support offsets less than 1
#endif

/* The look-ahead paths subtract PLDOFFS from the block count and need a positive result. */
#if (PLDTHRESH < PLDOFFS)
#error PLD threshold must be greater than or equal to the PLD offset
#endif

        .text
        .fpu    neon

/*
 * .L_memcpy_base - NEON forward copy body.
 *
 * Register use, as read from the code below:
 *   r0  = destination pointer (advanced by post-incrementing stores)
 *   r1  = source pointer      (advanced by post-incrementing loads)
 *   r2  = number of bytes to copy
 *   r12 = count of 64-byte blocks still to copy in the current loop
 *   lr  = look-ahead distance in bytes (scratch)
 *   r9  = iteration count of the first look-ahead loop (saved/restored)
 *   r10 = 64-byte aligned look-ahead address        (saved/restored)
 *   r3  = scratch; q0-q3 and q8-q11 carry the data
 *
 * NOTE(review): the exit is "pop {r0, pc}" and lr is used as scratch, so the
 * code that reaches this label is presumably expected to have pushed
 * {r0, lr} beforehand - confirm against the including file.
 *
 * NOTE(review): the size dispatch uses signed conditions (blt/ble) on r2 and
 * r12. A byte count with bit 31 set would be routed to the short-copy path.
 * Left unchanged here; confirm whether such counts can reach this code.
 */
.L_memcpy_base:
        /* Fewer than 64 bytes: jump straight to the matching tail stage. */
        cmp         r2, #4
        blt         .L_neon_lt4
        cmp         r2, #16
        blt         .L_neon_lt16
        cmp         r2, #32
        blt         .L_neon_16
        cmp         r2, #64
        blt         .L_neon_copy_32_a

        /* r12 = number of whole 64-byte blocks; few blocks skip look-ahead. */
        mov         r12, r2, lsr #6
        cmp         r12, #PLDTHRESH
        ble         .L_neon_copy_64_loop_nopld

        /* r9/r10 are only needed by the look-ahead loops. */
        push        {r9, r10}
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset r9, 0
        .cfi_rel_offset r10, 4

        cmp         r12, #BBTHRESH
        ble         .L_neon_prime_pump

        /*
         * lr = (((dst + 0x400) - (src + PLDOFFS*PLDSIZE)) & 0x7ff)
         *      + PLDOFFS*PLDSIZE
         * i.e. a look-ahead distance derived from the low 11 bits of the
         * dst/src address difference, never less than PLDOFFS*PLDSIZE.
         */
        add         lr, r0, #0x400
        add         r9, r1, #(PLDOFFS*PLDSIZE)
        sub         lr, lr, r9
        lsl         lr, lr, #21
        lsr         lr, lr, #21
        add         lr, lr, #(PLDOFFS*PLDSIZE)
        /* Not more blocks than the look-ahead spans: use the fixed distance. */
        cmp         r12, lr, lsr #6
        ble         .L_neon_prime_pump

        /* r9 = lr/64 - PLDOFFS; if that is not positive use the fixed distance. */
        itt         gt
        movgt       r9, #(PLDOFFS)
        rsbsgt      r9, r9, lr, lsr #6
        ble         .L_neon_prime_pump

        /* r10 = look-ahead address, rounded down to a 64-byte boundary. */
        add         r10, r1, lr
        bic         r10, #0x3F

        /* The last lr/64 blocks are left for the loop without look-ahead. */
        sub         r12, r12, lr, lsr #6

        /* r9 = min(r9, r12); r12 = blocks left for the following loops. */
        cmp         r9, r12
        itee        le
        suble       r12, r12, r9
        movgt       r9, r12
        movgt       r12, #0

        pld         [r1, #((PLDOFFS-1)*PLDSIZE)]
        /*
         * First look-ahead loop (r9 blocks): each pass preloads at the fixed
         * distance from r1 and also loads one word at r10 (value in r3 is
         * discarded), then copies 64 bytes.
         */
.L_neon_copy_64_loop_outer_doublepld:
        pld         [r1, #((PLDOFFS)*PLDSIZE)]
        vld1.32     {q0, q1}, [r1]!
        vld1.32     {q2, q3}, [r1]!
        ldr         r3, [r10]
        subs        r9, r9, #1
        vst1.32     {q0, q1}, [r0]!
        vst1.32     {q2, q3}, [r0]!
        add         r10, #64
        bne         .L_neon_copy_64_loop_outer_doublepld
        cmp         r12, #0
        beq         .L_neon_pop_before_nopld

        /* Fewer than 512 KiB worth of blocks left: use the ldr-based loop. */
        cmp         r12, #(512*1024/64)
        blt         .L_neon_copy_64_loop_outer

        /* Same copy loop, but looking ahead with pld instead of a load. */
.L_neon_copy_64_loop_ddr:
        vld1.32     {q0, q1}, [r1]!
        vld1.32     {q2, q3}, [r1]!
        pld         [r10]
        subs        r12, r12, #1
        vst1.32     {q0, q1}, [r0]!
        vst1.32     {q2, q3}, [r0]!
        add         r10, #64
        bne         .L_neon_copy_64_loop_ddr
        b           .L_neon_pop_before_nopld

        /*
         * Fixed look-ahead of PLDOFFS*PLDSIZE bytes: the final PLDOFFS blocks
         * are left for the loop without look-ahead, and the line just before
         * the first look-ahead address is loaded up front.
         */
.L_neon_prime_pump:
        mov         lr, #(PLDOFFS*PLDSIZE)
        add         r10, r1, #(PLDOFFS*PLDSIZE)
        bic         r10, #0x3F
        sub         r12, r12, #PLDOFFS
        ldr         r3, [r10, #(-1*PLDSIZE)]

        /* Copy r12 blocks, loading one word at r10 per block (r3 discarded). */
.L_neon_copy_64_loop_outer:
        vld1.32     {q0, q1}, [r1]!
        vld1.32     {q2, q3}, [r1]!
        ldr         r3, [r10]
        subs        r12, r12, #1
        vst1.32     {q0, q1}, [r0]!
        vst1.32     {q2, q3}, [r0]!
        add         r10, #64
        bne         .L_neon_copy_64_loop_outer

        /* r12 = lr/64 blocks that were held back from the look-ahead loops. */
.L_neon_pop_before_nopld:
        mov         r12, lr, lsr #6
        pop         {r9, r10}
        .cfi_adjust_cfa_offset -8
        .cfi_restore r9
        .cfi_restore r10

        /* Copy r12 blocks of 64 bytes with no look-ahead. */
.L_neon_copy_64_loop_nopld:
        vld1.32     {q8, q9}, [r1]!
        vld1.32     {q10, q11}, [r1]!
        subs        r12, r12, #1
        vst1.32     {q8, q9}, [r0]!
        vst1.32     {q10, q11}, [r0]!
        bne         .L_neon_copy_64_loop_nopld
        /* r2 = bytes left over after the 64-byte blocks. */
        ands        r2, r2, #0x3f
        beq         .L_neon_exit

        /* Shift puts bit 5 of r2 in C and bit 4 in N: copy 32 bytes if C. */
.L_neon_copy_32_a:
        movs        r3, r2, lsl #27
        bcc         .L_neon_16
        vld1.32     {q0,q1}, [r1]!
        vst1.32     {q0,q1}, [r0]!

        /*
         * Copy 16 bytes if N is set. N is bit 4 of r2 on the fall-through
         * path, or the sign of (r2 - 32) when entered from the size dispatch.
         */
.L_neon_16:
        bpl         .L_neon_lt16
        vld1.32     {q8}, [r1]!
        vst1.32     {q8}, [r0]!
        ands        r2, r2, #0x0f
        beq         .L_neon_exit

        /* Shift puts bit 3 of r2 in C and bit 2 in N: copy 8 bytes if C. */
.L_neon_lt16:
        movs        r3, r2, lsl #29
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:
        /*
         * Copy 4 bytes when N (bit 2 of r2) is set.
         * NOTE(review): bge tests N == V and the movs above does not write V,
         * so this relies on V being clear from the earlier cmp/subs - confirm.
         */
        bge         .L_neon_lt4
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!

        /* Shift puts bit 1 of r2 in C and bit 0 in N: halfword if C, byte if N. */
.L_neon_lt4:
        movs        r2, r2, lsl #31
        itt         cs
        ldrhcs      r3, [r1], #2
        strhcs      r3, [r0], #2
        itt         mi
        ldrbmi      r3, [r1]
        strbmi      r3, [r0]

        /* Restore r0 and return through the saved return address on the stack. */
.L_neon_exit:
        pop         {r0, pc}