Add arm64 string.h function implementations for use with hardware supporting MTE.
As it turns out, our "generic" arm64 implementations of certain string.h
functions are not actually generic, since they will eagerly read memory
possibly outside of the bounds of an MTE granule, which may lead to a segfault
on MTE-enabled hardware. Therefore, move the implementations into a "default"
directory and use ifuncs to select between them and a new set of "mte"
implementations, conditional on whether the hardware and kernel support MTE.
The MTE implementations are currently naive implementations written in C
but will later be replaced with a set of optimized assembly implementations.
Bug: 135772972
Change-Id: Ife37c4e0e6fd60ff20a34594cc09c541af4d1dd7
diff --git a/libc/arch-arm64/default/bionic/strnlen.S b/libc/arch-arm64/default/bionic/strnlen.S
new file mode 100644
index 0000000..1694532
--- /dev/null
+++ b/libc/arch-arm64/default/bionic/strnlen.S
@@ -0,0 +1,174 @@
+/* Copyright (c) 2014, Linaro Limited
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the Linaro nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+#include <private/bionic_asm.h>
+
+/* Arguments and results. */
+#define srcin x0
+#define len x0
+#define limit x1
+
+/* Locals and temporaries. */
+#define src x2
+#define data1 x3
+#define data2 x4
+#define data2a x5
+#define has_nul1 x6
+#define has_nul2 x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define tmp4 x11
+#define zeroones x12
+#define pos x13
+#define limit_wd x14
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+ .text
+ .p2align 6
+.Lstart:
+ /* Pre-pad to ensure critical loop begins an icache line. */
+ .rep 7
+ nop
+ .endr
+ /* Put this code here to avoid wasting more space with pre-padding. */
+.Lhit_limit:
+ mov len, limit
+ ret
+
+ENTRY(strnlen_default)
+ cbz limit, .Lhit_limit
+ mov zeroones, #REP8_01
+ bic src, srcin, #15
+ ands tmp1, srcin, #15
+ b.ne .Lmisaligned
+ /* Calculate the number of full and partial words -1. */
+ sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */
+ lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+ /* The inner loop deals with two Dwords at a time. This has a
+ slightly higher start-up cost, but we should win quite quickly,
+ especially on cores with a high number of issue slots per
+ cycle, as we get much better parallelism out of the operations. */
+
+ /* Start of critial section -- keep to one 64Byte cache line. */
+.Lloop:
+ ldp data1, data2, [src], #16
+.Lrealigned:
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ subs limit_wd, limit_wd, #1
+ orr tmp1, has_nul1, has_nul2
+ ccmp tmp1, #0, #0, pl /* NZCV = 0000 */
+ b.eq .Lloop
+ /* End of critical section -- keep to one 64Byte cache line. */
+
+ orr tmp1, has_nul1, has_nul2
+ cbz tmp1, .Lhit_limit /* No null in final Qword. */
+
+ /* We know there's a null in the final Qword. The easiest thing
+ to do now is work out the length of the string and return
+ MIN (len, limit). */
+
+ sub len, src, srcin
+ cbz has_nul1, .Lnul_in_data2
+#ifdef __AARCH64EB__
+ mov data2, data1
+#endif
+ sub len, len, #8
+ mov has_nul2, has_nul1
+.Lnul_in_data2:
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul directly. The
+ easiest way to get the correct byte is to byte-swap the data
+ and calculate the syndrome a second time. */
+ rev data2, data2
+ sub tmp1, data2, zeroones
+ orr tmp2, data2, #REP8_7f
+ bic has_nul2, tmp1, tmp2
+#endif
+ sub len, len, #8
+ rev has_nul2, has_nul2
+ clz pos, has_nul2
+ add len, len, pos, lsr #3 /* Bits to bytes. */
+ cmp len, limit
+ csel len, len, limit, ls /* Return the lower value. */
+ ret
+
+.Lmisaligned:
+ /* Deal with a partial first word.
+ We're doing two things in parallel here;
+ 1) Calculate the number of words (but avoiding overflow if
+ limit is near ULONG_MAX) - to do this we need to work out
+ limit + tmp1 - 1 as a 65-bit value before shifting it;
+ 2) Load and mask the initial data words - we force the bytes
+ before the ones we are interested in to 0xff - this ensures
+ early bytes will not hit any zero detection. */
+ sub limit_wd, limit, #1
+ neg tmp4, tmp1
+ cmp tmp1, #8
+
+ and tmp3, limit_wd, #15
+ lsr limit_wd, limit_wd, #4
+ mov tmp2, #~0
+
+ ldp data1, data2, [src], #16
+ lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */
+ add tmp3, tmp3, tmp1
+
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
+#else
+ /* Little-endian. Early bytes are at LSB. */
+ lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
+#endif
+ add limit_wd, limit_wd, tmp3, lsr #4
+
+ orr data1, data1, tmp2
+ orr data2a, data2, tmp2
+
+ csinv data1, data1, xzr, le
+ csel data2, data2, data2a, le
+ b .Lrealigned
+END(strnlen_default)