/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#include <private/bionic_asm.h>

/* By default we assume that the DC instruction can be used to zero
   data blocks more efficiently.  In some circumstances this might be
   unsafe, for example in an asymmetric multiprocessor environment with
   different DC clear lengths (neither the upper nor lower lengths are
   safe to use).

   If code may be run in a virtualized environment, then define
   MAYBE_VIRT.  This will cause the code to cache the system register
   values rather than re-reading them each call.  */
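
/* A note on discovery, based on the architectural layout of DCZID_EL0:
   bit 4 (DZP) is set when use of DC ZVA is prohibited, and bits [3:0] give
   log2 of the block size in words, so the code below computes the block
   size as zva_len = 4 << DCZID_EL0[3:0] bytes (typically 64).  */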

#define dstin       x0
#define val         w1
#define count       x2
#define dst_count   x3 /* for __memset_chk */
#define tmp1        x3
#define tmp1w       w3
#define tmp2        x4
#define tmp2w       w4
#define zva_len_x   x5
#define zva_len     w5
#define zva_bits_x  x6

#define A_l         x7
#define A_lw        w7
#define dst         x8
#define tmp3w       w9

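/* __memset_chk is the FORTIFY entry point.  It takes the same arguments as
   memset plus the real size of the destination buffer in dst_count (x3):
   if the write fits (count <= dst_count, unsigned) it tail-calls memset,
   otherwise it reports the overflow via __memset_chk_fail.  */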
ENTRY(__memset_chk)
  cmp count, dst_count
  bls memset

  // Preserve for accurate backtrace.
  stp x29, x30, [sp, -16]!
  .cfi_def_cfa_offset 16
  .cfi_rel_offset x29, 0
  .cfi_rel_offset x30, 8

  bl __memset_chk_fail
END(__memset_chk)

ENTRY(memset)

        mov     dst, dstin              /* Preserve return value.  */
        ands    A_lw, val, #255
        b.eq    .Lzero_mem
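        /* Replicate the low byte of val across all 64 bits of A_l
         * (e.g. 0x2a becomes 0x2a2a2a2a2a2a2a2a), so every stp below
         * stores 16 copies of the fill byte at once.  */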
        orr     A_lw, A_lw, A_lw, lsl #8
        orr     A_lw, A_lw, A_lw, lsl #16
        orr     A_l, A_l, A_l, lsl #32
.Ltail_maybe_long:
        cmp     count, #64
        b.ge    .Lnot_short
.Ltail_maybe_tiny:
        cmp     count, #15
        b.le    .Ltail15tiny
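        /* At most 63 bytes remain.  Bits 5:4 of count select how many whole
         * 16-byte stores are still needed (0, 16, 32 or 48 bytes); .Ltail15
         * then covers the final 0..15 bytes with one 16-byte store that may
         * overlap bytes which have already been set.  */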
.Ltail63:
        ands    tmp1, count, #0x30
        b.eq    .Ltail15
        add     dst, dst, tmp1
        cmp     tmp1w, #0x20
        b.eq    1f
        b.lt    2f
        stp     A_l, A_l, [dst, #-48]
1:
        stp     A_l, A_l, [dst, #-32]
2:
        stp     A_l, A_l, [dst, #-16]

.Ltail15:
        and     count, count, #15
        add     dst, dst, count
        stp     A_l, A_l, [dst, #-16]   /* Repeat some/all of last store.  */
        ret

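        /* 0..15 bytes, and nothing has necessarily been written yet, so the
         * overlapping-store trick cannot be used.  Test each bit of count
         * and emit an 8/4/2/1-byte store for every bit that is set.  */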
.Ltail15tiny:
        /* Set up to 15 bytes.  Does not assume earlier memory
           being set.  */
        tbz     count, #3, 1f
        str     A_l, [dst], #8
1:
        tbz     count, #2, 1f
        str     A_lw, [dst], #4
1:
        tbz     count, #1, 1f
        strh    A_lw, [dst], #2
1:
        tbz     count, #0, 1f
        strb    A_lw, [dst]
1:
        ret

        /* Critical loop.  Start at a new cache line boundary.  Assuming
         * 64 bytes per line, this ensures the entire loop is in one line.  */
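        /* The body first brings dst up to 16-byte alignment with a single
         * (possibly overlapping) 16-byte store, then pre-biases dst by -16 so
         * the unrolled loop can use the write-back form of its last store
         * while still writing 64 bytes per iteration.  */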
        .p2align 6
.Lnot_short:
        neg     tmp2, dst
        ands    tmp2, tmp2, #15
        b.eq    2f
        /* Bring DST to 128-bit (16-byte) alignment.  We know that there's
         * more than that to set, so we simply store 16 bytes and advance by
         * the amount required to reach alignment.  */
        sub     count, count, tmp2
        stp     A_l, A_l, [dst]
        add     dst, dst, tmp2
        /* There may be less than 63 bytes to go now.  */
        cmp     count, #63
        b.le    .Ltail63
2:
        sub     dst, dst, #16           /* Pre-bias.  */
        sub     count, count, #64
1:
        stp     A_l, A_l, [dst, #16]
        stp     A_l, A_l, [dst, #32]
        stp     A_l, A_l, [dst, #48]
        stp     A_l, A_l, [dst, #64]!
        subs    count, count, #64
        b.ge    1b
        tst     count, #0x3f
        add     dst, dst, #16
        b.ne    .Ltail63
        ret

        /* For zeroing memory, check to see if we can use the ZVA feature to
         * zero entire 'cache' lines.  */
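        /* Small zero fills still use the ordinary store loops above; only
         * when at least 128 bytes remain after 16-byte alignment is it worth
         * reading DCZID_EL0 and aligning to the DC ZVA block size.  */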
.Lzero_mem:
        mov     A_l, #0
        cmp     count, #63
        b.le    .Ltail_maybe_tiny
        neg     tmp2, dst
        ands    tmp2, tmp2, #15
        b.eq    1f
        sub     count, count, tmp2
        stp     A_l, A_l, [dst]
        add     dst, dst, tmp2
        cmp     count, #63
        b.le    .Ltail63
1:
        /* For zeroing small amounts of memory, it's not worth setting up
         * the line-clear code.  */
        cmp     count, #128
        b.lt    .Lnot_short
#ifdef MAYBE_VIRT
        /* For efficiency when virtualized, we cache the ZVA capability.  */
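        /* The cached word encodes three states: 0 means the length has not
         * been probed yet, a value with bit 31 set means DC ZVA must not be
         * used, and anything else is the block length in bytes.  */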
        adrp    tmp2, .Lcache_clear
        ldr     zva_len, [tmp2, #:lo12:.Lcache_clear]
        tbnz    zva_len, #31, .Lnot_short
        cbnz    zva_len, .Lzero_by_line
        mrs     tmp1, dczid_el0
        tbz     tmp1, #4, 1f
        /* ZVA not available.  Remember this for next time.  */
        mov     zva_len, #~0
        str     zva_len, [tmp2, #:lo12:.Lcache_clear]
        b       .Lnot_short
1:
        mov     tmp3w, #4
        and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
        lsl     zva_len, tmp3w, zva_len
        str     zva_len, [tmp2, #:lo12:.Lcache_clear]
#else
        mrs     tmp1, dczid_el0
        tbnz    tmp1, #4, .Lnot_short
        mov     tmp3w, #4
        and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
        lsl     zva_len, tmp3w, zva_len
#endif

.Lzero_by_line:
        /* Compute how far we need to go to become suitably aligned.  We're
         * already at quad-word alignment.  */
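        /* DC ZVA is only used when count covers at least one full block and,
         * if dst is not yet block-aligned, when at least 64 bytes and one
         * full block remain after the alignment stores (the ccmp below);
         * otherwise we fall back to the plain store loop.  */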
        cmp     count, zva_len_x
        b.lt    .Lnot_short             /* Not enough to reach alignment.  */
        sub     zva_bits_x, zva_len_x, #1
        neg     tmp2, dst
        ands    tmp2, tmp2, zva_bits_x
        b.eq    1f                      /* Already aligned.  */
        /* Not aligned, check that there's enough to copy after alignment.  */
        sub     tmp1, count, tmp2
        cmp     tmp1, #64
        ccmp    tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */
        b.lt    .Lnot_short
        /* We know that there's at least 64 bytes to zero and that it's safe
         * to overrun by 64 bytes.  */
        mov     count, tmp1
2:
        stp     A_l, A_l, [dst]
        stp     A_l, A_l, [dst, #16]
        stp     A_l, A_l, [dst, #32]
        subs    tmp2, tmp2, #64
        stp     A_l, A_l, [dst, #48]
        add     dst, dst, #64
        b.ge    2b
        /* We've overrun a bit, so adjust dst downwards.  */
        add     dst, dst, tmp2
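        /* dst is now block-aligned.  count is pre-decremented by one block so
         * the "b.ge" loop zeroes a whole block per DC ZVA while at least one
         * block remains; whatever is left afterwards (count & zva_bits_x) is
         * finished by the ordinary store tails via .Ltail_maybe_long.  */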
1:
        sub     count, count, zva_len_x
3:
        dc      zva, dst
        add     dst, dst, zva_len_x
        subs    count, count, zva_len_x
        b.ge    3b
        ands    count, count, zva_bits_x
        b.ne    .Ltail_maybe_long
        ret
END(memset)

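/* Backing store for the cached ZVA length used above: one 32-bit word in
   .bss, which is zero-initialized at program start so the first call probes
   DCZID_EL0.  */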
#ifdef MAYBE_VIRT
        .bss
        .p2align 2
.Lcache_clear:
        .space 4
#endif