blob: ec692e524f194b4bafa5d75cb6aedc491f25b7ef [file] [log] [blame]
Christopher Ferris31dea252013-03-08 16:50:31 -08001/*
2 * Copyright (c) 2013 ARM Ltd
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. The name of the company may not be used to endorse or promote
14 * products derived from this software without specific prior written
15 * permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
18 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
19 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
22 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
Elliott Hughes851e68a2014-02-19 16:53:20 -080029#include <private/bionic_asm.h>
Christopher Ferris31dea252013-03-08 16:50:31 -080030
31#ifdef __ARMEB__
32#define S2LOMEM lsl
33#define S2LOMEMEQ lsleq
34#define S2HIMEM lsr
35#define MSB 0x000000ff
36#define LSB 0xff000000
37#define BYTE0_OFFSET 24
38#define BYTE1_OFFSET 16
39#define BYTE2_OFFSET 8
40#define BYTE3_OFFSET 0
41#else /* not __ARMEB__ */
42#define S2LOMEM lsr
43#define S2LOMEMEQ lsreq
44#define S2HIMEM lsl
45#define BYTE0_OFFSET 0
46#define BYTE1_OFFSET 8
47#define BYTE2_OFFSET 16
48#define BYTE3_OFFSET 24
49#define MSB 0xff000000
50#define LSB 0x000000ff
51#endif /* not __ARMEB__ */
52
53.syntax unified
54
Haibo Huangea9957a2018-11-19 11:00:32 -080055// To avoid warning about deprecated instructions, add an explicit
56// arch. The code generated is exactly the same.
57.arch armv7-a
58
Christopher Ferris31dea252013-03-08 16:50:31 -080059#if defined (__thumb__)
60 .thumb
61 .thumb_func
62#endif
63
Haibo Huangea9957a2018-11-19 11:00:32 -080064ENTRY(strcmp_krait)
Christopher Ferris31dea252013-03-08 16:50:31 -080065 /* Use LDRD whenever possible. */
66
67/* The main thing to look out for when comparing large blocks is that
68 the loads do not cross a page boundary when loading past the index
69 of the byte with the first difference or the first string-terminator.
70
71 For example, if the strings are identical and the string-terminator
72 is at index k, byte by byte comparison will not load beyond address
73 s1+k and s2+k; word by word comparison may load up to 3 bytes beyond
74 k; double word - up to 7 bytes. If the load of these bytes crosses
75 a page boundary, it might cause a memory fault (if the page is not mapped)
76 that would not have happened in byte by byte comparison.
77
78 If an address is (double) word aligned, then a load of a (double) word
79 from that address will not cross a page boundary.
80 Therefore, the algorithm below considers word and double-word alignment
81 of strings separately. */
82
83/* High-level description of the algorithm.
84
85 * The fast path: if both strings are double-word aligned,
86 use LDRD to load two words from each string in every loop iteration.
87 * If the strings have the same offset from a word boundary,
88 use LDRB to load and compare byte by byte until
89 the first string is aligned to a word boundary (at most 3 bytes).
90 This is optimized for quick return on short unaligned strings.
91 * If the strings have the same offset from a double-word boundary,
92 use LDRD to load two words from each string in every loop iteration, as in the fast path.
93 * If the strings do not have the same offset from a double-word boundary,
94 load a word from the second string before the loop to initialize the queue.
95 Use LDRD to load two words from every string in every loop iteration.
96 Inside the loop, load the second word from the second string only after comparing
97 the first word, using the queued value, to guarantee safety across page boundaries.
98 * If the strings do not have the same offset from a word boundary,
99 use LDR and a shift queue. Order of loads and comparisons matters,
100 similarly to the previous case.
101
102 * Use UADD8 and SEL to compare words, and use REV and CLZ to compute the return value.
103 * The only difference between ARM and Thumb modes is the use of CBZ instruction.
104 * The only difference between big and little endian is the use of REV in little endian
105 to compute the return value, instead of MOV.
106*/
107
108 .macro m_cbz reg label
109#ifdef __thumb2__
110 cbz \reg, \label
111#else /* not defined __thumb2__ */
112 cmp \reg, #0
113 beq \label
114#endif /* not defined __thumb2__ */
115 .endm /* m_cbz */
116
117 .macro m_cbnz reg label
118#ifdef __thumb2__
119 cbnz \reg, \label
120#else /* not defined __thumb2__ */
121 cmp \reg, #0
122 bne \label
123#endif /* not defined __thumb2__ */
124 .endm /* m_cbnz */
125
126 .macro init
127 /* Macro to save temporary registers and prepare magic values. */
128 subs sp, sp, #16
Christopher Ferrisbd7fe1d2013-08-20 11:20:48 -0700129 .cfi_def_cfa_offset 16
Christopher Ferris31dea252013-03-08 16:50:31 -0800130 strd r4, r5, [sp, #8]
Christopher Ferrisbd7fe1d2013-08-20 11:20:48 -0700131 .cfi_rel_offset r4, 0
132 .cfi_rel_offset r5, 4
Christopher Ferris31dea252013-03-08 16:50:31 -0800133 strd r6, r7, [sp]
Christopher Ferrisbd7fe1d2013-08-20 11:20:48 -0700134 .cfi_rel_offset r6, 8
135 .cfi_rel_offset r7, 12
Christopher Ferris31dea252013-03-08 16:50:31 -0800136 mvn r6, #0 /* all F */
137 mov r7, #0 /* all 0 */
138 .endm /* init */
139
140 .macro magic_compare_and_branch w1 w2 label
141 /* Macro to compare registers w1 and w2 and conditionally branch to label. */
142 cmp \w1, \w2 /* Are w1 and w2 the same? */
143 magic_find_zero_bytes \w1
144 it eq
145 cmpeq ip, #0 /* Is there a zero byte in w1? */
146 bne \label
147 .endm /* magic_compare_and_branch */
148
149 .macro magic_find_zero_bytes w1
150 /* Macro to find all-zero bytes in w1, result is in ip. */
Christopher Ferris31dea252013-03-08 16:50:31 -0800151 uadd8 ip, \w1, r6
152 sel ip, r7, r6
Christopher Ferris31dea252013-03-08 16:50:31 -0800153 .endm /* magic_find_zero_bytes */
154
155 .macro setup_return w1 w2
156#ifdef __ARMEB__
157 mov r1, \w1
158 mov r2, \w2
159#else /* not __ARMEB__ */
160 rev r1, \w1
161 rev r2, \w2
162#endif /* not __ARMEB__ */
163 .endm /* setup_return */
164
165 pld [r0, #0]
166 pld [r1, #0]
167
168 /* Are both strings double-word aligned? */
169 orr ip, r0, r1
170 tst ip, #7
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700171 bne .L_do_align
Christopher Ferris31dea252013-03-08 16:50:31 -0800172
173 /* Fast path. */
174 init
175
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700176.L_doubleword_aligned:
Christopher Ferris31dea252013-03-08 16:50:31 -0800177
178 /* Get here when the strings to compare are double-word aligned. */
179 /* Compare two words in every iteration. */
180 .p2align 2
1812:
182 pld [r0, #16]
183 pld [r1, #16]
184
185 /* Load the next double-word from each string. */
186 ldrd r2, r3, [r0], #8
187 ldrd r4, r5, [r1], #8
188
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700189 magic_compare_and_branch w1=r2, w2=r4, label=.L_return_24
190 magic_compare_and_branch w1=r3, w2=r5, label=.L_return_35
Christopher Ferris31dea252013-03-08 16:50:31 -0800191 b 2b
192
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700193.L_do_align:
Christopher Ferris31dea252013-03-08 16:50:31 -0800194 /* Is the first string word-aligned? */
195 ands ip, r0, #3
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700196 beq .L_word_aligned_r0
Christopher Ferris31dea252013-03-08 16:50:31 -0800197
198 /* Fast compare byte by byte until the first string is word-aligned. */
199 /* The offset of r0 from a word boundary is in ip. Thus, the number of bytes
200 to read until the next word boundary is 4-ip. */
201 bic r0, r0, #3
202 ldr r2, [r0], #4
203 lsls ip, ip, #31
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700204 beq .L_byte2
205 bcs .L_byte3
Christopher Ferris31dea252013-03-08 16:50:31 -0800206
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700207.L_byte1:
Christopher Ferris31dea252013-03-08 16:50:31 -0800208 ldrb ip, [r1], #1
209 uxtb r3, r2, ror #BYTE1_OFFSET
210 subs ip, r3, ip
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700211 bne .L_fast_return
212 m_cbz reg=r3, label=.L_fast_return
Christopher Ferris31dea252013-03-08 16:50:31 -0800213
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700214.L_byte2:
Christopher Ferris31dea252013-03-08 16:50:31 -0800215 ldrb ip, [r1], #1
216 uxtb r3, r2, ror #BYTE2_OFFSET
217 subs ip, r3, ip
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700218 bne .L_fast_return
219 m_cbz reg=r3, label=.L_fast_return
Christopher Ferris31dea252013-03-08 16:50:31 -0800220
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700221.L_byte3:
Christopher Ferris31dea252013-03-08 16:50:31 -0800222 ldrb ip, [r1], #1
223 uxtb r3, r2, ror #BYTE3_OFFSET
224 subs ip, r3, ip
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700225 bne .L_fast_return
226 m_cbnz reg=r3, label=.L_word_aligned_r0
Christopher Ferris31dea252013-03-08 16:50:31 -0800227
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700228.L_fast_return:
Christopher Ferris31dea252013-03-08 16:50:31 -0800229 mov r0, ip
230 bx lr
231
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700232.L_word_aligned_r0:
Christopher Ferris31dea252013-03-08 16:50:31 -0800233 init
234 /* The first string is word-aligned. */
235 /* Is the second string word-aligned? */
236 ands ip, r1, #3
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700237 bne .L_strcmp_unaligned
Christopher Ferris31dea252013-03-08 16:50:31 -0800238
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700239.L_word_aligned:
Christopher Ferris31dea252013-03-08 16:50:31 -0800240 /* The strings are word-aligned. */
241 /* Is the first string double-word aligned? */
242 tst r0, #4
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700243 beq .L_doubleword_aligned_r0
Christopher Ferris31dea252013-03-08 16:50:31 -0800244
245 /* If r0 is not double-word aligned yet, align it by loading
246 and comparing the next word from each string. */
247 ldr r2, [r0], #4
248 ldr r4, [r1], #4
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700249 magic_compare_and_branch w1=r2 w2=r4 label=.L_return_24
Christopher Ferris31dea252013-03-08 16:50:31 -0800250
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700251.L_doubleword_aligned_r0:
Christopher Ferris31dea252013-03-08 16:50:31 -0800252 /* Get here when r0 is double-word aligned. */
253 /* Is r1 doubleword_aligned? */
254 tst r1, #4
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700255 beq .L_doubleword_aligned
Christopher Ferris31dea252013-03-08 16:50:31 -0800256
257 /* Get here when the strings to compare are word-aligned,
258 r0 is double-word aligned, but r1 is not double-word aligned. */
259
260 /* Initialize the queue. */
261 ldr r5, [r1], #4
262
263 /* Compare two words in every iteration. */
264 .p2align 2
2653:
266 pld [r0, #16]
267 pld [r1, #16]
268
269 /* Load the next double-word from each string and compare. */
270 ldrd r2, r3, [r0], #8
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700271 magic_compare_and_branch w1=r2 w2=r5 label=.L_return_25
Christopher Ferris31dea252013-03-08 16:50:31 -0800272 ldrd r4, r5, [r1], #8
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700273 magic_compare_and_branch w1=r3 w2=r4 label=.L_return_34
Christopher Ferris31dea252013-03-08 16:50:31 -0800274 b 3b
275
276 .macro miscmp_word offsetlo offsethi
277 /* Macro to compare misaligned strings. */
278 /* r0, r1 are word-aligned, and at least one of the strings
279 is not double-word aligned. */
280 /* Compare one word in every loop iteration. */
281 /* OFFSETLO is the original bit-offset of r1 from a word-boundary,
282 OFFSETHI is 32 - OFFSETLO (i.e., offset from the next word). */
283
284 /* Initialize the shift queue. */
285 ldr r5, [r1], #4
286
287 /* Compare one word from each string in every loop iteration. */
288 .p2align 2
2897:
290 ldr r3, [r0], #4
291 S2LOMEM r5, r5, #\offsetlo
292 magic_find_zero_bytes w1=r3
293 cmp r7, ip, S2HIMEM #\offsetlo
294 and r2, r3, r6, S2LOMEM #\offsetlo
295 it eq
296 cmpeq r2, r5
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700297 bne .L_return_25
Christopher Ferris31dea252013-03-08 16:50:31 -0800298 ldr r5, [r1], #4
299 cmp ip, #0
300 eor r3, r2, r3
301 S2HIMEM r2, r5, #\offsethi
302 it eq
303 cmpeq r3, r2
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700304 bne .L_return_32
Christopher Ferris31dea252013-03-08 16:50:31 -0800305 b 7b
306 .endm /* miscmp_word */
307
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700308.L_strcmp_unaligned:
Christopher Ferris31dea252013-03-08 16:50:31 -0800309 /* r0 is word-aligned, r1 is at offset ip from a word. */
310 /* Align r1 to the (previous) word-boundary. */
311 bic r1, r1, #3
312
313 /* Unaligned comparison word by word using LDRs. */
314 cmp ip, #2
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700315 beq .L_miscmp_word_16 /* If ip == 2. */
316 bge .L_miscmp_word_24 /* If ip == 3. */
Christopher Ferris31dea252013-03-08 16:50:31 -0800317 miscmp_word offsetlo=8 offsethi=24 /* If ip == 1. */
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700318.L_miscmp_word_24: miscmp_word offsetlo=24 offsethi=8
Christopher Ferris31dea252013-03-08 16:50:31 -0800319
320
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700321.L_return_32:
Christopher Ferris31dea252013-03-08 16:50:31 -0800322 setup_return w1=r3, w2=r2
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700323 b .L_do_return
324.L_return_34:
Christopher Ferris31dea252013-03-08 16:50:31 -0800325 setup_return w1=r3, w2=r4
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700326 b .L_do_return
327.L_return_25:
Christopher Ferris31dea252013-03-08 16:50:31 -0800328 setup_return w1=r2, w2=r5
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700329 b .L_do_return
330.L_return_35:
Christopher Ferris31dea252013-03-08 16:50:31 -0800331 setup_return w1=r3, w2=r5
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700332 b .L_do_return
333.L_return_24:
Christopher Ferris31dea252013-03-08 16:50:31 -0800334 setup_return w1=r2, w2=r4
335
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700336.L_do_return:
Christopher Ferris31dea252013-03-08 16:50:31 -0800337
338#ifdef __ARMEB__
339 mov r0, ip
340#else /* not __ARMEB__ */
341 rev r0, ip
342#endif /* not __ARMEB__ */
343
344 /* Restore temporaries early, before computing the return value. */
345 ldrd r6, r7, [sp]
346 ldrd r4, r5, [sp, #8]
347 adds sp, sp, #16
Christopher Ferrisbd7fe1d2013-08-20 11:20:48 -0700348 .cfi_def_cfa_offset 0
349 .cfi_restore r4
350 .cfi_restore r5
351 .cfi_restore r6
352 .cfi_restore r7
Christopher Ferris31dea252013-03-08 16:50:31 -0800353
354 /* There is a zero or a different byte between r1 and r2. */
355 /* r0 contains a mask of all-zero bytes in r1. */
356 /* Using r0 and not ip here because cbz requires low register. */
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700357 m_cbz reg=r0, label=.L_compute_return_value
Christopher Ferris31dea252013-03-08 16:50:31 -0800358 clz r0, r0
359 /* r0 contains the number of bits on the left of the first all-zero byte in r1. */
360 rsb r0, r0, #24
361 /* Here, r0 contains the number of bits on the right of the first all-zero byte in r1. */
362 lsr r1, r1, r0
363 lsr r2, r2, r0
364
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700365.L_compute_return_value:
Christopher Ferris31dea252013-03-08 16:50:31 -0800366 movs r0, #1
367 cmp r1, r2
368 /* The return value is computed as follows.
369 If r1>r2 then (C==1 and Z==0) and LS doesn't hold and r0 is #1 at return.
370 If r1<r2 then (C==0 and Z==0) and we execute SBC with carry_in=0,
371 which means r0:=r0-r0-1 and r0 is #-1 at return.
372 If r1=r2 then (C==1 and Z==1) and we execute SBC with carry_in=1,
373 which means r0:=r0-r0 and r0 is #0 at return.
374 (C==0 and Z==1) cannot happen because the carry bit is "not borrow". */
375 it ls
376 sbcls r0, r0, r0
377 bx lr
378
379 /* The code from the previous version of strcmp.S handles this
380 * particular case (the second string is 2 bytes off a word alignment)
381 * faster than any current version. In this very specific case, use the
382 * previous version. See bionic/libc/arch-arm/cortex-a15/bionic/strcmp.S
383 * for the unedited version of this code.
384 */
Christopher Ferrisa57c9c02013-08-21 09:41:12 -0700385.L_miscmp_word_16:
Christopher Ferris31dea252013-03-08 16:50:31 -0800386 wp1 .req r0
387 wp2 .req r1
388 b1 .req r2
389 w1 .req r4
390 w2 .req r5
391 t1 .req ip
392 @ r3 is scratch
393
394 /* At this point, wp1 (r0) has already been word-aligned. */
3952:
396 mov b1, #1
397 orr b1, b1, b1, lsl #8
398 orr b1, b1, b1, lsl #16
399
400 and t1, wp2, #3
401 bic wp2, wp2, #3
402 ldr w1, [wp1], #4
403 ldr w2, [wp2], #4
404
405 /* Critical inner Loop: Block with 2 bytes initial overlap */
406 .p2align 2
4072:
408 S2HIMEM t1, w1, #16
409 sub r3, w1, b1
410 S2LOMEM t1, t1, #16
411 bic r3, r3, w1
412 cmp t1, w2, S2LOMEM #16
413 bne 4f
414 ands r3, r3, b1, lsl #7
415 it eq
416 ldreq w2, [wp2], #4
417 bne 5f
418 eor t1, t1, w1
419 cmp t1, w2, S2HIMEM #16
420 bne 6f
421 ldr w1, [wp1], #4
422 b 2b
423
4245:
425#ifdef __ARMEB__
426 /* The syndrome value may contain false ones if the string ends
427 * with the bytes 0x01 0x00
428 */
429 tst w1, #0xff000000
430 it ne
431 tstne w1, #0x00ff0000
432 beq 7f
433#else
434 lsls r3, r3, #16
435 bne 7f
436#endif
437 ldrh w2, [wp2]
438 S2LOMEM t1, w1, #16
439#ifdef __ARMEB__
440 lsl w2, w2, #16
441#endif
442 b 8f
443
4446:
445 S2HIMEM w2, w2, #16
446 S2LOMEM t1, w1, #16
4474:
448 S2LOMEM w2, w2, #16
449 b 8f
450
4517:
452 mov r0, #0
453
454 /* Restore registers and stack. */
455 ldrd r6, r7, [sp]
456 ldrd r4, r5, [sp, #8]
457 adds sp, sp, #16
Christopher Ferrisbd7fe1d2013-08-20 11:20:48 -0700458 .cfi_def_cfa_offset 0
459 .cfi_restore r4
460 .cfi_restore r5
461 .cfi_restore r6
462 .cfi_restore r7
Christopher Ferris31dea252013-03-08 16:50:31 -0800463
464 bx lr
465
4668:
467 and r2, t1, #LSB
468 and r0, w2, #LSB
469 cmp r0, #1
470 it cs
471 cmpcs r0, r2
472 itt eq
473 S2LOMEMEQ t1, t1, #8
474 S2LOMEMEQ w2, w2, #8
475 beq 8b
476 sub r0, r2, r0
477
478 /* Restore registers and stack. */
479 ldrd r6, r7, [sp]
480 ldrd r4, r5, [sp, #8]
481 adds sp, sp, #16
Christopher Ferrisbd7fe1d2013-08-20 11:20:48 -0700482 .cfi_def_cfa_offset 0
483 .cfi_restore r4
484 .cfi_restore r5
485 .cfi_restore r6
486 .cfi_restore r7
Christopher Ferris31dea252013-03-08 16:50:31 -0800487
488 bx lr
Haibo Huangea9957a2018-11-19 11:00:32 -0800489END(strcmp_krait)