/* Copyright (c) 2012, Linaro Limited
   All rights reserved.
   Copyright (c) 2014, NVIDIA Corporation.  All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * denver, ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#include <private/bionic_asm.h>

/* By default we assume that the DC instruction can be used to zero
   data blocks more efficiently.  In some circumstances this might be
   unsafe, for example in an asymmetric multiprocessor environment with
   different DC clear lengths (neither the upper nor lower lengths are
   safe to use).  The feature can be disabled by defining DONT_USE_DC.

   If code may be run in a virtualized environment, then define
   MAYBE_VIRT.  This will cause the code to cache the system register
   values rather than re-reading them each call.  */
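
/* A rough C-style sketch of the DC ZVA discovery described above (purely
   illustrative; read_dczid_el0() here just stands in for an MRS of
   DCZID_EL0):

       uint64_t dczid = read_dczid_el0();
       if (dczid & (1u << 4)) {
           // DZP bit set: DC ZVA is prohibited, use ordinary stores.
       } else {
           unsigned zva_len = 4u << (dczid & 0xf);  // block size in bytes, commonly 64
           // Align the destination to zva_len, then issue one DC ZVA per block.
       }
 */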

#define dstin           x0
#define val             w1
#define count           x2
#define dst_count       x3      /* for __memset_chk */
#define tmp1            x3
#define tmp1w           w3
#define tmp2            x4
#define tmp2w           w4
#define zva_len_x       x5
#define zva_len         w5
#define zva_bits_x      x6

#define A_l             x7
#define A_lw            w7
#define dst             x8
#define tmp3w           w9

#define QA_l            q0
ENTRY(__memset_chk)
        cmp     count, dst_count
        bls     memset

        // Preserve for accurate backtrace.
        stp     x29, x30, [sp, -16]!
        .cfi_def_cfa_offset 16
        .cfi_rel_offset x29, 0
        .cfi_rel_offset x30, 8

        bl      __memset_chk_fail
END(__memset_chk)

ENTRY(memset)

        mov     dst, dstin              /* Preserve return value.  */
        ands    A_lw, val, #255
#ifndef DONT_USE_DC
#       b.eq    .Lzero_mem
#endif
        orr     A_lw, A_lw, A_lw, lsl #8
        orr     A_lw, A_lw, A_lw, lsl #16
        orr     A_l, A_l, A_l, lsl #32
.Ltail_maybe_long:
        cmp     count, #256
        b.ge    .Lnot_short
.Ltail_maybe_tiny:
        cmp     count, #15
        b.le    .Ltail15tiny
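
        /* The .Ltail* chain below stores a residue of up to 255 bytes with
         * no loop: bits 7:6 of count select 192, 128, 64 or 0 bytes of
         * paired q-register stores, .Ltail63 handles bits 5:4 with paired
         * x-register stores, and .Ltail15 finishes the last 0-15 bytes with
         * a single overlapping 16-byte store.  The overlap is safe because
         * every path into .Ltail15 has already filled the 16 bytes just
         * below dst; .Ltail15tiny exists for the case where nothing has
         * been stored yet.  */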
.Ltail255:
        ands    tmp1, count, #0xC0
        b.eq    .Ltail63
        dup     v0.4s, A_lw
        cmp     tmp1w, #0x80
        b.eq    1f
        b.lt    2f
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
1:
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
2:
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
.Ltail63:
        ands    tmp1, count, #0x30
        b.eq    .Ltail15
        add     dst, dst, tmp1
        cmp     tmp1w, #0x20
        b.eq    1f
        b.lt    2f
        stp     A_l, A_l, [dst, #-48]
1:
        stp     A_l, A_l, [dst, #-32]
2:
        stp     A_l, A_l, [dst, #-16]

.Ltail15:
        and     count, count, #15
        add     dst, dst, count
        stp     A_l, A_l, [dst, #-16]   /* Repeat some/all of last store.  */
        ret

.Ltail15tiny:
        /* Set up to 15 bytes.  Does not assume earlier memory
           being set.  */
        tbz     count, #3, 1f
        str     A_l, [dst], #8
1:
        tbz     count, #2, 1f
        str     A_lw, [dst], #4
1:
        tbz     count, #1, 1f
        strh    A_lw, [dst], #2
1:
        tbz     count, #0, 1f
        strb    A_lw, [dst]
1:
        ret

        /* Critical loop.  Start at a new cache line boundary.  Assuming
         * 64 bytes per line, this ensures the entire loop is in one line.  */
        .p2align 6
.Lnot_short:
        dup     v0.4s, A_lw
        neg     tmp2, dst
        ands    tmp2, tmp2, #15
        b.eq    2f
        /* Bring DST to 128-bit (16-byte) alignment.  We know that there's
         * more than that to set, so we simply store 16 bytes and advance by
         * the amount required to reach alignment.  */
        sub     count, count, tmp2
        stp     A_l, A_l, [dst]
        add     dst, dst, tmp2
        /* There may be fewer than 256 bytes to go now.  */
        cmp     count, #255
        b.le    .Ltail255
2:
        cmp     count, #2097152
        b.gt    3f
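        /* Fills up to 2 MiB use the cacheable loop below.  It subtracts 256
         * once up front and again at the bottom of the loop, so on exit
         * count has gone negative and its low byte holds the 0..255 byte
         * residue that .Ltail255 then finishes.  */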
1:
        sub     count, count, #256
2:
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        subs    count, count, #256
        b.ge    2b
        tst     count, #0xff
        b.ne    .Ltail255
        ret
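
        /* Fills larger than 2 MiB (presumably chosen to match the size of
         * the last-level cache on this core) take the path below: stnp is a
         * store with a non-temporal hint, so the written lines need not be
         * allocated in the cache, and the loop works in 64-byte chunks.  */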
3:
        sub     count, count, #64
4:
        subs    count, count, #64
        stnp    QA_l, QA_l, [dst]
        stnp    QA_l, QA_l, [dst, #32]
        add     dst, dst, #64
        b.ge    4b
        tst     count, #0x3f
        b.ne    .Ltail63
        ret

#ifndef DONT_USE_DC
        /* For zeroing memory, check to see if we can use the ZVA feature to
         * zero entire 'cache' lines.  */
.Lzero_mem:
        mov     A_l, #0
        cmp     count, #63
        b.le    .Ltail_maybe_tiny
        neg     tmp2, dst
        ands    tmp2, tmp2, #15
        b.eq    1f
        sub     count, count, tmp2
        stp     A_l, A_l, [dst]
        add     dst, dst, tmp2
        cmp     count, #63
        b.le    .Ltail63
1:
        /* For zeroing small amounts of memory, it's not worth setting up
         * the line-clear code.  */
        cmp     count, #128
        b.lt    .Lnot_short
#ifdef MAYBE_VIRT
        /* For efficiency when virtualized, we cache the ZVA capability.  */
        adrp    tmp2, .Lcache_clear
        ldr     zva_len, [tmp2, #:lo12:.Lcache_clear]
        tbnz    zva_len, #31, .Lnot_short
        cbnz    zva_len, .Lzero_by_line
        mrs     tmp1, dczid_el0
        tbz     tmp1, #4, 1f
        /* ZVA not available.  Remember this for next time.  */
        mov     zva_len, #~0
        str     zva_len, [tmp2, #:lo12:.Lcache_clear]
        b       .Lnot_short
1:
        mov     tmp3w, #4
        and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
        lsl     zva_len, tmp3w, zva_len
        str     zva_len, [tmp2, #:lo12:.Lcache_clear]
#else
        mrs     tmp1, dczid_el0
        tbnz    tmp1, #4, .Lnot_short
        mov     tmp3w, #4
        and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
        lsl     zva_len, tmp3w, zva_len
#endif
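
        /* Either way zva_len now holds the DC ZVA block size in bytes:
         * DCZID_EL0[3:0] is log2 of the block size in 4-byte words, so the
         * mov/lsl pair above computes 4 << DCZID_EL0[3:0] (a field value of
         * 4 gives the common 64-byte block).  */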

.Lzero_by_line:
        /* Compute how far we need to go to become suitably aligned.  We're
         * already at quad-word alignment.  */
        cmp     count, zva_len_x
        b.lt    .Lnot_short             /* Not enough to reach alignment.  */
        sub     zva_bits_x, zva_len_x, #1
        neg     tmp2, dst
        ands    tmp2, tmp2, zva_bits_x
        b.eq    1f                      /* Already aligned.  */
        /* Not aligned, check that there's enough to zero after alignment.  */
        sub     tmp1, count, tmp2
        cmp     tmp1, #64
        ccmp    tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */
        b.lt    .Lnot_short
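        /* The cmp/ccmp/b.lt sequence above bails out to .Lnot_short unless
         * tmp1 >= 64 and tmp1 >= zva_len: ccmp performs the second compare
         * only when the first one found tmp1 >= 64, and otherwise forces
         * NZCV to 0b1000 (N set) so that the b.lt is taken.  */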
        /* We know that there's at least 64 bytes to zero and that it's safe
         * to overrun by 64 bytes.  */
        mov     count, tmp1
2:
        stp     A_l, A_l, [dst]
        stp     A_l, A_l, [dst, #16]
        stp     A_l, A_l, [dst, #32]
        subs    tmp2, tmp2, #64
        stp     A_l, A_l, [dst, #48]
        add     dst, dst, #64
        b.ge    2b
        /* We've overrun a bit, so adjust dst downwards.  */
        add     dst, dst, tmp2
1:
        sub     count, count, zva_len_x
3:
        dc      zva, dst
        add     dst, dst, zva_len_x
        subs    count, count, zva_len_x
        b.ge    3b
        ands    count, count, zva_bits_x
        b.ne    .Ltail_maybe_long
        ret
END(memset)

#ifdef MAYBE_VIRT
        .bss
        .p2align 2
.Lcache_clear:
        .space 4
#endif
#endif /* DONT_USE_DC */