/*
 * Copyright (C) 2013 The Android Open Source Project
 * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <private/bionic_asm.h>

        /*
         * Optimized memset() for ARM.
         *
         * memset() returns its first argument.
         *
         * This file also provides __memset_chk and bzero, both of which
         * end up in the memset body.
         */

        // Target selection for the assembler. The code in this file uses
        // pldw, NEON stores (vdup/vst1/vstm) and IT blocks written in
        // unified syntax, so all three directives are required.
        .cpu        cortex-a15
        .fpu        neon
        .syntax     unified

/*
 * __memset_chk: bounds-checked entry point for memset.
 *
 * In:  r0, r1, r2 = the memset arguments (passed through untouched)
 *      r3         = limit that the byte count in r2 is checked against
 *                   (presumably the destination object size supplied by
 *                   the caller -- confirm against the C prototype)
 *
 * If r2 <= r3 (unsigned) control transfers to .L_done, which sits at the
 * end of bzero and falls through into memset. Otherwise
 * __memset_chk_fail is called.
 */
ENTRY(__memset_chk)
        cmp         r2, r3
        bls         .L_done                 // count fits: continue as plain memset

        // Preserve lr for backtrace.
        push        {lr}
        .cfi_def_cfa_offset 4
        .cfi_rel_offset lr, 0

        // No instructions follow this call, so __memset_chk_fail is
        // expected not to return.
        bl          __memset_chk_fail
END(__memset_chk)

/*
 * bzero: zero a region by turning the call into memset(dst, 0, n).
 *
 * In:  r0 = destination, r1 = byte count
 *
 * The count is moved into r2 and the fill value 0 into r1, giving the
 * register layout memset expects. There is no branch or return here:
 * execution runs off the end of this function into memset, so
 * ENTRY(memset) must stay directly below END(bzero).
 *
 * .L_done is also the branch target used by __memset_chk once its size
 * check has passed; at that point r0-r2 already hold memset's arguments.
 */
ENTRY(bzero)
        mov         r2, r1                  // r2 = byte count
        mov         r1, #0                  // r1 = fill value
.L_done:
        // Fall through to memset...
END(bzero)

/*
 * memset: fill r2 bytes starting at r0 with the low byte of r1.
 *
 * In:    r0 = destination
 *        r1 = fill value (only the low 8 bits are used)
 *        r2 = byte count
 * Out:   r0 = destination (r0 is never written; r3 is the working pointer)
 * Uses:  r1, r2, r3, ip, q0-q3, condition flags
 *
 * Strategy:
 *   count < 16   -> .L_less_than_16: plain str/strb, no alignment work.
 *   count >= 16  -> align r3 to 16 bytes, then 128-byte NEON loop while
 *                   at least 128 bytes remain, then .L_less_than_128
 *                   peels off 64/32/16/8/4/2/1 bytes by testing the bits
 *                   of the remaining count.
 *
 * Several sequences below set the flags once with movs/tst and then
 * consume C, N and Z across multiple stores and branches. Stores do not
 * change the flags, so the instruction order in those sequences matters.
 */
ENTRY(memset)
        pldw        [r0]
        mov         r3, r0

        // Duplicate the low byte of r1 into all four bytes of r1.
        mov         r1, r1, lsl #24
        orr         r1, r1, r1, lsr #8
        orr         r1, r1, r1, lsr #16

        cmp         r2, #16
        blo         .L_less_than_16

        // This section handles regions 16 bytes or larger
        //
        // Use aligned vst1.8 and vstm when possible. Register values will be:
        //   ip is scratch
        //   q0 and r1 contain the memset value (q1, and later q2/q3, are
        //     copied from q0 just before they are first used)
        //   r2 is the number of bytes to set
        //   r3 is the advancing destination pointer
        vdup.32     q0, r1

        ands        ip, r3, #0xF
        beq         .L_memset_aligned

        // Align dest pointer to 16-byte boundary.
        pldw        [r0, #64]
        rsb         ip, ip, #16             // ip = bytes to next 16-byte boundary (1..15)

        // Pre-adjust the byte count to reflect post-alignment value. Expecting
        // 8-byte alignment to be rather common so we special case that one.
        // r2 >= 16 and ip <= 15 here, so r2 cannot underflow.
        sub         r2, r2, ip

        /* set 1 byte */
        tst         ip, #1
        it          ne
        strbne      r1, [r3], #1
        /* set 2 bytes */
        tst         ip, #2
        it          ne
        strhne      r1, [r3], #2
        /* set 4 bytes */
        movs        ip, ip, lsl #29         // N = bit 2 of ip, C = bit 3 of ip
        it          mi
        strmi       r1, [r3], #4
        /* set 8 bytes */
        itt         cs
        strcs       r1, [r3], #4
        strcs       r1, [r3], #4

.L_memset_aligned:
        // Destination is now 16-byte aligned. Determine how to handle
        // remaining bytes.
        vmov        q1, q0
        cmp         r2, #128
        blo         .L_less_than_128

        // We need to set a larger block of memory. Use four Q regs to
        // set a full cache line in one instruction. Pre-decrement
        // r2 to simplify end-of-loop detection
        vmov        q2, q0
        vmov        q3, q0
        pldw        [r0, #128]
        sub         r2, r2, #128
        .align 4
.L_memset_loop_128:
        // Each iteration stores 2 x 64 = 128 bytes.
        pldw        [r3, #192]
        vstm        r3!, {q0, q1, q2, q3}
        vstm        r3!, {q0, q1, q2, q3}
        subs        r2, r2, #128
        bhs         .L_memset_loop_128      // loop while the subtract did not borrow

        // Un-bias r2 so it contains the number of bytes left. Early
        // exit if we are done.
        adds        r2, r2, #128
        beq         2f

        .align 4
.L_less_than_128:
        // Here r2 < 128 and r3 is 16-byte aligned.
        // set 64 bytes
        movs        ip, r2, lsl #26         // C = bit 6, N = bit 5, Z = (r2 & 63) == 0
        bcc         1f
        vst1.8      {q0, q1}, [r3, :128]!
        vst1.8      {q0, q1}, [r3, :128]!
        beq         2f                      // nothing below 64 left
1:
        // set 32 bytes
        bpl         1f
        vst1.8      {q0, q1}, [r3, :128]!
1:
        // set 16 bytes
        movs        ip, r2, lsl #28         // C = bit 4, N = bit 3, Z = (r2 & 15) == 0
        bcc         1f
        vst1.8      {q0}, [r3, :128]!
        beq         2f                      // nothing below 16 left
1:
        // set 8 bytes
        bpl         1f
        vst1.8      {d0}, [r3, :64]!
1:
        // set 4 bytes
        tst         r2, #4
        it          ne
        strne       r1, [r3], #4
        // set 2 bytes
        movs        ip, r2, lsl #31         // C = bit 1, N = bit 0
        it          cs
        strhcs      r1, [r3], #2
        // set 1 byte
        it          mi
        strbmi      r1, [r3]
2:
        bx          lr

.L_less_than_16:
        // Store up to 15 bytes without worrying about byte alignment
        movs        ip, r2, lsl #29         // C = bit 3, N = bit 2, Z = (r2 & 7) == 0
        bcc         1f
        str         r1, [r3], #4
        str         r1, [r3], #4
        beq         2f                      // count was exactly 8
1:
        it          mi
        strmi       r1, [r3], #4
        movs        ip, r2, lsl #31         // C = bit 1, N = bit 0
        it          mi
        strbmi      r1, [r3], #1
        itt         cs
        strbcs      r1, [r3], #1
        strbcs      r1, [r3]
2:
        bx          lr
END(memset)