Merge "Relax check on number of segments in a .so file"
diff --git a/libc/Android.bp b/libc/Android.bp
index 00686ac..a6df757 100644
--- a/libc/Android.bp
+++ b/libc/Android.bp
@@ -102,7 +102,10 @@
             srcs: ["arch-arm64/bionic/__set_tls.c"],
         },
         x86: {
-            srcs: ["arch-x86/bionic/__set_tls.cpp"],
+            srcs: [
+                "arch-x86/bionic/__libc_init_sysinfo.cpp",
+                "arch-x86/bionic/__set_tls.cpp",
+            ],
         },
         x86_64: {
             srcs: ["arch-x86_64/bionic/__set_tls.c"],
@@ -1042,45 +1045,15 @@
             denver64: {
                 srcs: [
                     "arch-arm64/denver64/bionic/memcpy.S",
+                    "arch-arm64/denver64/bionic/memmove.S",
                     "arch-arm64/denver64/bionic/memset.S",
                 ],
                 exclude_srcs: [
                     "arch-arm64/generic/bionic/memcpy.S",
+                    "arch-arm64/generic/bionic/memmove.S",
                     "arch-arm64/generic/bionic/memset.S",
                 ],
             },
-            cortex_a53: {
-                srcs: [
-                    "arch-arm64/cortex-a53/bionic/memmove.S",
-                ],
-                exclude_srcs: [
-                    "arch-arm64/generic/bionic/memmove.S",
-                ],
-            },
-            cortex_a55: {
-                srcs: [
-                    "arch-arm64/cortex-a53/bionic/memmove.S",
-                ],
-                exclude_srcs: [
-                    "arch-arm64/generic/bionic/memmove.S",
-                ],
-            },
-            cortex_a73: {
-                srcs: [
-                    "arch-arm64/cortex-a53/bionic/memmove.S",
-                ],
-                exclude_srcs: [
-                    "arch-arm64/generic/bionic/memmove.S",
-                ],
-            },
-            cortex_a75: {
-                srcs: [
-                    "arch-arm64/cortex-a53/bionic/memmove.S",
-                ],
-                exclude_srcs: [
-                    "arch-arm64/generic/bionic/memmove.S",
-                ],
-            },
         },
 
         mips: {
diff --git a/libc/arch-arm64/cortex-a53/bionic/memmove.S b/libc/arch-arm64/cortex-a53/bionic/memmove.S
deleted file mode 100644
index c50112d..0000000
--- a/libc/arch-arm64/cortex-a53/bionic/memmove.S
+++ /dev/null
@@ -1,153 +0,0 @@
-/* Copyright (c) 2013, Linaro Limited
-   All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are met:
-       * Redistributions of source code must retain the above copyright
-         notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above copyright
-         notice, this list of conditions and the following disclaimer in the
-         documentation and/or other materials provided with the distribution.
-       * Neither the name of the Linaro nor the
-         names of its contributors may be used to endorse or promote products
-         derived from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
-/*
- * Copyright (c) 2015 ARM Ltd
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- *    products derived from this software without specific prior written
- *    permission.
- *
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, unaligned accesses, wchar_t is 4 bytes
- */
-
-#include <private/bionic_asm.h>
-
-/* Parameters and result.  */
-#define dstin	x0
-#define src	x1
-#define count	x2
-#define srcend	x3
-#define dstend	x4
-#define tmp1	x5
-#define A_l	x6
-#define A_h	x7
-#define B_l	x8
-#define B_h	x9
-#define C_l	x10
-#define C_h	x11
-#define D_l	x12
-#define D_h	x13
-#define E_l	count
-#define E_h	tmp1
-
-/* All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
-   Larger backwards copies are also handled by memcpy. The only remaining
-   case is forward large copies.  The destination is aligned, and an
-   unrolled loop processes 64 bytes per iteration.
-*/
-
-#if defined(WMEMMOVE)
-ENTRY(wmemmove)
-	lsl	count, count, #2
-#else
-ENTRY(memmove)
-#endif
-	sub	tmp1, dstin, src
-	cmp	count, 96
-	ccmp	tmp1, count, 2, hi
-	b.hs	memcpy
-
-	cbz	tmp1, 3f
-	add	dstend, dstin, count
-	add	srcend, src, count
-
-	/* Align dstend to 16 byte alignment so that we don't cross cache line
-	   boundaries on both loads and stores.	 There are at least 96 bytes
-	   to copy, so copy 16 bytes unaligned and then align.	The loop
-	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
-
-	and	tmp1, dstend, 15
-	ldp	D_l, D_h, [srcend, -16]
-	sub	srcend, srcend, tmp1
-	sub	count, count, tmp1
-	ldp	A_l, A_h, [srcend, -16]
-	stp	D_l, D_h, [dstend, -16]
-	ldp	B_l, B_h, [srcend, -32]
-	ldp	C_l, C_h, [srcend, -48]
-	ldp	D_l, D_h, [srcend, -64]!
-	sub	dstend, dstend, tmp1
-	subs	count, count, 128
-	b.ls	2f
-	nop
-1:
-	stp	A_l, A_h, [dstend, -16]
-	ldp	A_l, A_h, [srcend, -16]
-	stp	B_l, B_h, [dstend, -32]
-	ldp	B_l, B_h, [srcend, -32]
-	stp	C_l, C_h, [dstend, -48]
-	ldp	C_l, C_h, [srcend, -48]
-	stp	D_l, D_h, [dstend, -64]!
-	ldp	D_l, D_h, [srcend, -64]!
-	subs	count, count, 64
-	b.hi	1b
-
-	/* Write the last full set of 64 bytes.	 The remainder is at most 64
-	   bytes, so it is safe to always copy 64 bytes from the start even if
-	   there is just 1 byte left.  */
-2:
-	ldp	E_l, E_h, [src, 48]
-	stp	A_l, A_h, [dstend, -16]
-	ldp	A_l, A_h, [src, 32]
-	stp	B_l, B_h, [dstend, -32]
-	ldp	B_l, B_h, [src, 16]
-	stp	C_l, C_h, [dstend, -48]
-	ldp	C_l, C_h, [src]
-	stp	D_l, D_h, [dstend, -64]
-	stp	E_l, E_h, [dstin, 48]
-	stp	A_l, A_h, [dstin, 32]
-	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstin]
-3:	ret
-
-#if defined(WMEMMOVE)
-END(wmemmove)
-#else
-END(memmove)
-#endif
diff --git a/libc/arch-arm64/denver64/bionic/memmove.S b/libc/arch-arm64/denver64/bionic/memmove.S
new file mode 100644
index 0000000..739ce49
--- /dev/null
+++ b/libc/arch-arm64/denver64/bionic/memmove.S
@@ -0,0 +1,329 @@
+/* Copyright (c) 2014, Linaro Limited
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+       * Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of the Linaro nor the
+         names of its contributors may be used to endorse or promote products
+         derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Unaligned accesses
+ * wchar_t is 4 bytes
+ */
+
+#include <private/bionic_asm.h>
+
+/* Parameters and result.  */
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define tmp1	x3
+#define tmp1w	w3
+#define tmp2	x4
+#define tmp2w	w4
+#define tmp3	x5
+#define tmp3w	w5
+#define dst	x6
+
+#define A_l	x7
+#define A_h	x8
+#define B_l	x9
+#define B_h	x10
+#define C_l	x11
+#define C_h	x12
+#define D_l	x13
+#define D_h	x14
+
+#if defined(WMEMMOVE)
+ENTRY(wmemmove)
+	lsl	count, count, #2
+#else
+ENTRY(memmove)
+#endif
+	cmp	dstin, src
+	b.lo	.Ldownwards
+	add	tmp1, src, count
+	cmp	dstin, tmp1
+	b.hs	memcpy		/* No overlap.  */
+
+	/* Upwards move with potential overlap.
+	 * Need to move from the tail backwards.  SRC and DST point one
+	 * byte beyond the remaining data to move.  */
+	add	dst, dstin, count
+	add	src, src, count
+	cmp	count, #64
+	b.ge	.Lmov_not_short_up
+
+	/* Deal with small moves quickly by dropping straight into the
+	 * exit block.  */
+.Ltail63up:
+	/* Move up to 48 bytes of data.  At this point we only need the
+	 * bottom 6 bits of count to be accurate.  */
+	ands	tmp1, count, #0x30
+	b.eq	.Ltail15up
+	sub	dst, dst, tmp1
+	sub	src, src, tmp1
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	ldp	A_l, A_h, [src, #32]
+	stp	A_l, A_h, [dst, #32]
+1:
+	ldp	A_l, A_h, [src, #16]
+	stp	A_l, A_h, [dst, #16]
+2:
+	ldp	A_l, A_h, [src]
+	stp	A_l, A_h, [dst]
+.Ltail15up:
+	/* Move up to 15 bytes of data.  Does not assume additional data
+	 * being moved.  */
+	tbz	count, #3, 1f
+	ldr	tmp1, [src, #-8]!
+	str	tmp1, [dst, #-8]!
+1:
+	tbz	count, #2, 1f
+	ldr	tmp1w, [src, #-4]!
+	str	tmp1w, [dst, #-4]!
+1:
+	tbz	count, #1, 1f
+	ldrh	tmp1w, [src, #-2]!
+	strh	tmp1w, [dst, #-2]!
+1:
+	tbz	count, #0, 1f
+	ldrb	tmp1w, [src, #-1]
+	strb	tmp1w, [dst, #-1]
+1:
+	ret
+
+.Lmov_not_short_up:
+	/* We don't much care about the alignment of DST, but we want SRC
+	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
+	 * boundaries on both loads and stores.  */
+	ands	tmp2, src, #15		/* Bytes to reach alignment.  */
+	b.eq	2f
+	sub	count, count, tmp2
+	/* Move enough data to reach alignment; unlike memcpy, we have to
+	 * be aware of the overlap, which means we can't move data twice.  */
+	tbz	tmp2, #3, 1f
+	ldr	tmp1, [src, #-8]!
+	str	tmp1, [dst, #-8]!
+1:
+	tbz	tmp2, #2, 1f
+	ldr	tmp1w, [src, #-4]!
+	str	tmp1w, [dst, #-4]!
+1:
+	tbz	tmp2, #1, 1f
+	ldrh	tmp1w, [src, #-2]!
+	strh	tmp1w, [dst, #-2]!
+1:
+	tbz	tmp2, #0, 1f
+	ldrb	tmp1w, [src, #-1]!
+	strb	tmp1w, [dst, #-1]!
+1:
+
+	/* There may be less than 63 bytes to go now.  */
+	cmp	count, #63
+	b.le	.Ltail63up
+2:
+	subs	count, count, #128
+	b.ge	.Lmov_body_large_up
+	/* Less than 128 bytes to move, so handle 64 here and then jump
+	 * to the tail.  */
+	ldp	A_l, A_h, [src, #-64]!
+	ldp	B_l, B_h, [src, #16]
+	ldp	C_l, C_h, [src, #32]
+	ldp	D_l, D_h, [src, #48]
+	stp	A_l, A_h, [dst, #-64]!
+	stp	B_l, B_h, [dst, #16]
+	stp	C_l, C_h, [dst, #32]
+	stp	D_l, D_h, [dst, #48]
+	tst	count, #0x3f
+	b.ne	.Ltail63up
+	ret
+
+	/* Critical loop.  Start at a new Icache line boundary.  Assuming
+	 * 64 bytes per line this ensures the entire loop is in one line.  */
+	.p2align 6
+.Lmov_body_large_up:
+	/* There are at least 128 bytes to move.  */
+	ldp	A_l, A_h, [src, #-16]
+	ldp	B_l, B_h, [src, #-32]
+	ldp	C_l, C_h, [src, #-48]
+	ldp	D_l, D_h, [src, #-64]!
+1:
+	stp	A_l, A_h, [dst, #-16]
+	ldp	A_l, A_h, [src, #-16]
+	stp	B_l, B_h, [dst, #-32]
+	ldp	B_l, B_h, [src, #-32]
+	stp	C_l, C_h, [dst, #-48]
+	ldp	C_l, C_h, [src, #-48]
+	stp	D_l, D_h, [dst, #-64]!
+	ldp	D_l, D_h, [src, #-64]!
+	subs	count, count, #64
+	b.ge	1b
+	stp	A_l, A_h, [dst, #-16]
+	stp	B_l, B_h, [dst, #-32]
+	stp	C_l, C_h, [dst, #-48]
+	stp	D_l, D_h, [dst, #-64]!
+	tst	count, #0x3f
+	b.ne	.Ltail63up
+	ret
+
+
+.Ldownwards:
+	/* For a downwards move we can safely use memcpy provided that
+	 * DST is more than 16 bytes away from SRC.  */
+	sub	tmp1, src, #16
+	cmp	dstin, tmp1
+	b.ls	memcpy		/* May overlap, but not critically.  */
+
+	mov	dst, dstin	/* Preserve DSTIN for return value.  */
+	cmp	count, #64
+	b.ge	.Lmov_not_short_down
+
+	/* Deal with small moves quickly by dropping straight into the
+	 * exit block.  */
+.Ltail63down:
+	/* Move up to 48 bytes of data.  At this point we only need the
+	 * bottom 6 bits of count to be accurate.  */
+	ands	tmp1, count, #0x30
+	b.eq	.Ltail15down
+	add	dst, dst, tmp1
+	add	src, src, tmp1
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	ldp	A_l, A_h, [src, #-48]
+	stp	A_l, A_h, [dst, #-48]
+1:
+	ldp	A_l, A_h, [src, #-32]
+	stp	A_l, A_h, [dst, #-32]
+2:
+	ldp	A_l, A_h, [src, #-16]
+	stp	A_l, A_h, [dst, #-16]
+.Ltail15down:
+	/* Move up to 15 bytes of data.  Does not assume additional data
+	   being moved.  */
+	tbz	count, #3, 1f
+	ldr	tmp1, [src], #8
+	str	tmp1, [dst], #8
+1:
+	tbz	count, #2, 1f
+	ldr	tmp1w, [src], #4
+	str	tmp1w, [dst], #4
+1:
+	tbz	count, #1, 1f
+	ldrh	tmp1w, [src], #2
+	strh	tmp1w, [dst], #2
+1:
+	tbz	count, #0, 1f
+	ldrb	tmp1w, [src]
+	strb	tmp1w, [dst]
+1:
+	ret
+
+.Lmov_not_short_down:
+	/* We don't much care about the alignment of DST, but we want SRC
+	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
+	 * boundaries on both loads and stores.  */
+	neg	tmp2, src
+	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
+	b.eq	2f
+	sub	count, count, tmp2
+	/* Move enough data to reach alignment; unlike memcpy, we have to
+	 * be aware of the overlap, which means we can't move data twice.  */
+	tbz	tmp2, #3, 1f
+	ldr	tmp1, [src], #8
+	str	tmp1, [dst], #8
+1:
+	tbz	tmp2, #2, 1f
+	ldr	tmp1w, [src], #4
+	str	tmp1w, [dst], #4
+1:
+	tbz	tmp2, #1, 1f
+	ldrh	tmp1w, [src], #2
+	strh	tmp1w, [dst], #2
+1:
+	tbz	tmp2, #0, 1f
+	ldrb	tmp1w, [src], #1
+	strb	tmp1w, [dst], #1
+1:
+
+	/* There may be less than 63 bytes to go now.  */
+	cmp	count, #63
+	b.le	.Ltail63down
+2:
+	subs	count, count, #128
+	b.ge	.Lmov_body_large_down
+	/* Less than 128 bytes to move, so handle 64 here and then jump
+	 * to the tail.  */
+	ldp	A_l, A_h, [src]
+	ldp	B_l, B_h, [src, #16]
+	ldp	C_l, C_h, [src, #32]
+	ldp	D_l, D_h, [src, #48]
+	stp	A_l, A_h, [dst]
+	stp	B_l, B_h, [dst, #16]
+	stp	C_l, C_h, [dst, #32]
+	stp	D_l, D_h, [dst, #48]
+	tst	count, #0x3f
+	add	src, src, #64
+	add	dst, dst, #64
+	b.ne	.Ltail63down
+	ret
+
+	/* Critical loop.  Start at a new cache line boundary.  Assuming
+	 * 64 bytes per line this ensures the entire loop is in one line.  */
+	.p2align 6
+.Lmov_body_large_down:
+	/* There are at least 128 bytes to move.  */
+	ldp	A_l, A_h, [src, #0]
+	sub	dst, dst, #16		/* Pre-bias.  */
+	ldp	B_l, B_h, [src, #16]
+	ldp	C_l, C_h, [src, #32]
+	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
+1:
+	stp	A_l, A_h, [dst, #16]
+	ldp	A_l, A_h, [src, #16]
+	stp	B_l, B_h, [dst, #32]
+	ldp	B_l, B_h, [src, #32]
+	stp	C_l, C_h, [dst, #48]
+	ldp	C_l, C_h, [src, #48]
+	stp	D_l, D_h, [dst, #64]!
+	ldp	D_l, D_h, [src, #64]!
+	subs	count, count, #64
+	b.ge	1b
+	stp	A_l, A_h, [dst, #16]
+	stp	B_l, B_h, [dst, #32]
+	stp	C_l, C_h, [dst, #48]
+	stp	D_l, D_h, [dst, #64]
+	add	src, src, #16
+	add	dst, dst, #64 + 16
+	tst	count, #0x3f
+	b.ne	.Ltail63down
+	ret
+#if defined(WMEMMOVE)
+END(wmemmove)
+#else
+END(memmove)
+#endif
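
Denver64 keeps the previous generic implementation (the same blob, 739ce49, moved here verbatim), whose strategy the comments above spell out: fall through to memcpy whenever the copy cannot step on unread source bytes, and hand-copy only the truly overlapping cases, forwards for downwards moves and from the tail backwards for upwards moves. A minimal C++ sketch of that dispatch, with the 64-byte unrolled ldp/stp loops reduced to byte loops and all names illustrative:

    #include <cstddef>
    #include <cstring>

    // Sketch only: the real memmove is the assembly above; this just
    // restates its overlap analysis.
    void* memmove_sketch(void* dstin, const void* srcin, size_t count) {
      unsigned char* dst = static_cast<unsigned char*>(dstin);
      const unsigned char* src = static_cast<const unsigned char*>(srcin);
      if (dst < src) {
        // Downwards move: memcpy is safe when dst is at least 16 bytes
        // below src, since a 16-byte-chunk copy never reads bytes it has
        // already written ("may overlap, but not critically").
        if (static_cast<size_t>(src - dst) >= 16) return memcpy(dstin, srcin, count);
        while (count--) *dst++ = *src++;       // forward, overlap-aware
      } else if (static_cast<size_t>(dst - src) >= count) {
        return memcpy(dstin, srcin, count);    // no overlap at all
      } else {
        // Upwards move with overlap: move from the tail backwards.
        while (count--) dst[count] = src[count];
      }
      return dstin;
    }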
diff --git a/libc/arch-arm64/generic/bionic/memmove.S b/libc/arch-arm64/generic/bionic/memmove.S
index 739ce49..c50112d 100644
--- a/libc/arch-arm64/generic/bionic/memmove.S
+++ b/libc/arch-arm64/generic/bionic/memmove.S
@@ -1,4 +1,4 @@
-/* Copyright (c) 2014, Linaro Limited
+/* Copyright (c) 2013, Linaro Limited
    All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
@@ -22,14 +22,39 @@
    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/*
+ * Copyright (c) 2015 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
- * Unaligned accesses
- * wchar_t is 4 bytes
+ * ARMv8-a, AArch64, unaligned accesses, wchar_t is 4 bytes
  */
 
 #include <private/bionic_asm.h>
@@ -38,22 +63,25 @@
 #define dstin	x0
 #define src	x1
 #define count	x2
-#define tmp1	x3
-#define tmp1w	w3
-#define tmp2	x4
-#define tmp2w	w4
-#define tmp3	x5
-#define tmp3w	w5
-#define dst	x6
+#define srcend	x3
+#define dstend	x4
+#define tmp1	x5
+#define A_l	x6
+#define A_h	x7
+#define B_l	x8
+#define B_h	x9
+#define C_l	x10
+#define C_h	x11
+#define D_l	x12
+#define D_h	x13
+#define E_l	count
+#define E_h	tmp1
 
-#define A_l	x7
-#define A_h	x8
-#define B_l	x9
-#define B_h	x10
-#define C_l	x11
-#define C_h	x12
-#define D_l	x13
-#define D_h	x14
+/* All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
+   Larger backwards copies are also handled by memcpy. The only remaining
+   case is forward large copies.  The destination is aligned, and an
+   unrolled loop processes 64 bytes per iteration.
+*/
 
 #if defined(WMEMMOVE)
 ENTRY(wmemmove)
@@ -61,267 +89,63 @@
 #else
 ENTRY(memmove)
 #endif
-	cmp	dstin, src
-	b.lo	.Ldownwards
-	add	tmp1, src, count
-	cmp	dstin, tmp1
-	b.hs	memcpy		/* No overlap.  */
+	sub	tmp1, dstin, src
+	cmp	count, 96
+	ccmp	tmp1, count, 2, hi
+	b.hs	memcpy
 
-	/* Upwards move with potential overlap.
-	 * Need to move from the tail backwards.  SRC and DST point one
-	 * byte beyond the remaining data to move.  */
-	add	dst, dstin, count
-	add	src, src, count
-	cmp	count, #64
-	b.ge	.Lmov_not_short_up
+	cbz	tmp1, 3f
+	add	dstend, dstin, count
+	add	srcend, src, count
 
-	/* Deal with small moves quickly by dropping straight into the
-	 * exit block.  */
-.Ltail63up:
-	/* Move up to 48 bytes of data.  At this point we only need the
-	 * bottom 6 bits of count to be accurate.  */
-	ands	tmp1, count, #0x30
-	b.eq	.Ltail15up
-	sub	dst, dst, tmp1
-	sub	src, src, tmp1
-	cmp	tmp1w, #0x20
-	b.eq	1f
-	b.lt	2f
-	ldp	A_l, A_h, [src, #32]
-	stp	A_l, A_h, [dst, #32]
+	/* Align dstend to 16 byte alignment so that we don't cross cache line
+	   boundaries on both loads and stores.	 There are at least 96 bytes
+	   to copy, so copy 16 bytes unaligned and then align.	The loop
+	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
+
+	and	tmp1, dstend, 15
+	ldp	D_l, D_h, [srcend, -16]
+	sub	srcend, srcend, tmp1
+	sub	count, count, tmp1
+	ldp	A_l, A_h, [srcend, -16]
+	stp	D_l, D_h, [dstend, -16]
+	ldp	B_l, B_h, [srcend, -32]
+	ldp	C_l, C_h, [srcend, -48]
+	ldp	D_l, D_h, [srcend, -64]!
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	2f
+	nop
 1:
-	ldp	A_l, A_h, [src, #16]
-	stp	A_l, A_h, [dst, #16]
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [srcend, -16]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [srcend, -48]
+	stp	D_l, D_h, [dstend, -64]!
+	ldp	D_l, D_h, [srcend, -64]!
+	subs	count, count, 64
+	b.hi	1b
+
+	/* Write the last full set of 64 bytes.	 The remainder is at most 64
+	   bytes, so it is safe to always copy 64 bytes from the start even if
+	   there is just 1 byte left.  */
 2:
-	ldp	A_l, A_h, [src]
-	stp	A_l, A_h, [dst]
-.Ltail15up:
-	/* Move up to 15 bytes of data.  Does not assume additional data
-	 * being moved.  */
-	tbz	count, #3, 1f
-	ldr	tmp1, [src, #-8]!
-	str	tmp1, [dst, #-8]!
-1:
-	tbz	count, #2, 1f
-	ldr	tmp1w, [src, #-4]!
-	str	tmp1w, [dst, #-4]!
-1:
-	tbz	count, #1, 1f
-	ldrh	tmp1w, [src, #-2]!
-	strh	tmp1w, [dst, #-2]!
-1:
-	tbz	count, #0, 1f
-	ldrb	tmp1w, [src, #-1]
-	strb	tmp1w, [dst, #-1]
-1:
-	ret
+	ldp	E_l, E_h, [src, 48]
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [src, 32]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [src, 16]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [src]
+	stp	D_l, D_h, [dstend, -64]
+	stp	E_l, E_h, [dstin, 48]
+	stp	A_l, A_h, [dstin, 32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin]
+3:	ret
 
-.Lmov_not_short_up:
-	/* We don't much care about the alignment of DST, but we want SRC
-	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
-	 * boundaries on both loads and stores.  */
-	ands	tmp2, src, #15		/* Bytes to reach alignment.  */
-	b.eq	2f
-	sub	count, count, tmp2
-	/* Move enough data to reach alignment; unlike memcpy, we have to
-	 * be aware of the overlap, which means we can't move data twice.  */
-	tbz	tmp2, #3, 1f
-	ldr	tmp1, [src, #-8]!
-	str	tmp1, [dst, #-8]!
-1:
-	tbz	tmp2, #2, 1f
-	ldr	tmp1w, [src, #-4]!
-	str	tmp1w, [dst, #-4]!
-1:
-	tbz	tmp2, #1, 1f
-	ldrh	tmp1w, [src, #-2]!
-	strh	tmp1w, [dst, #-2]!
-1:
-	tbz	tmp2, #0, 1f
-	ldrb	tmp1w, [src, #-1]!
-	strb	tmp1w, [dst, #-1]!
-1:
-
-	/* There may be less than 63 bytes to go now.  */
-	cmp	count, #63
-	b.le	.Ltail63up
-2:
-	subs	count, count, #128
-	b.ge	.Lmov_body_large_up
-	/* Less than 128 bytes to move, so handle 64 here and then jump
-	 * to the tail.  */
-	ldp	A_l, A_h, [src, #-64]!
-	ldp	B_l, B_h, [src, #16]
-	ldp	C_l, C_h, [src, #32]
-	ldp	D_l, D_h, [src, #48]
-	stp	A_l, A_h, [dst, #-64]!
-	stp	B_l, B_h, [dst, #16]
-	stp	C_l, C_h, [dst, #32]
-	stp	D_l, D_h, [dst, #48]
-	tst	count, #0x3f
-	b.ne	.Ltail63up
-	ret
-
-	/* Critical loop.  Start at a new Icache line boundary.  Assuming
-	 * 64 bytes per line this ensures the entire loop is in one line.  */
-	.p2align 6
-.Lmov_body_large_up:
-	/* There are at least 128 bytes to move.  */
-	ldp	A_l, A_h, [src, #-16]
-	ldp	B_l, B_h, [src, #-32]
-	ldp	C_l, C_h, [src, #-48]
-	ldp	D_l, D_h, [src, #-64]!
-1:
-	stp	A_l, A_h, [dst, #-16]
-	ldp	A_l, A_h, [src, #-16]
-	stp	B_l, B_h, [dst, #-32]
-	ldp	B_l, B_h, [src, #-32]
-	stp	C_l, C_h, [dst, #-48]
-	ldp	C_l, C_h, [src, #-48]
-	stp	D_l, D_h, [dst, #-64]!
-	ldp	D_l, D_h, [src, #-64]!
-	subs	count, count, #64
-	b.ge	1b
-	stp	A_l, A_h, [dst, #-16]
-	stp	B_l, B_h, [dst, #-32]
-	stp	C_l, C_h, [dst, #-48]
-	stp	D_l, D_h, [dst, #-64]!
-	tst	count, #0x3f
-	b.ne	.Ltail63up
-	ret
-
-
-.Ldownwards:
-	/* For a downwards move we can safely use memcpy provided that
-	 * DST is more than 16 bytes away from SRC.  */
-	sub	tmp1, src, #16
-	cmp	dstin, tmp1
-	b.ls	memcpy		/* May overlap, but not critically.  */
-
-	mov	dst, dstin	/* Preserve DSTIN for return value.  */
-	cmp	count, #64
-	b.ge	.Lmov_not_short_down
-
-	/* Deal with small moves quickly by dropping straight into the
-	 * exit block.  */
-.Ltail63down:
-	/* Move up to 48 bytes of data.  At this point we only need the
-	 * bottom 6 bits of count to be accurate.  */
-	ands	tmp1, count, #0x30
-	b.eq	.Ltail15down
-	add	dst, dst, tmp1
-	add	src, src, tmp1
-	cmp	tmp1w, #0x20
-	b.eq	1f
-	b.lt	2f
-	ldp	A_l, A_h, [src, #-48]
-	stp	A_l, A_h, [dst, #-48]
-1:
-	ldp	A_l, A_h, [src, #-32]
-	stp	A_l, A_h, [dst, #-32]
-2:
-	ldp	A_l, A_h, [src, #-16]
-	stp	A_l, A_h, [dst, #-16]
-.Ltail15down:
-	/* Move up to 15 bytes of data.  Does not assume additional data
-	   being moved.  */
-	tbz	count, #3, 1f
-	ldr	tmp1, [src], #8
-	str	tmp1, [dst], #8
-1:
-	tbz	count, #2, 1f
-	ldr	tmp1w, [src], #4
-	str	tmp1w, [dst], #4
-1:
-	tbz	count, #1, 1f
-	ldrh	tmp1w, [src], #2
-	strh	tmp1w, [dst], #2
-1:
-	tbz	count, #0, 1f
-	ldrb	tmp1w, [src]
-	strb	tmp1w, [dst]
-1:
-	ret
-
-.Lmov_not_short_down:
-	/* We don't much care about the alignment of DST, but we want SRC
-	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
-	 * boundaries on both loads and stores.  */
-	neg	tmp2, src
-	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
-	b.eq	2f
-	sub	count, count, tmp2
-	/* Move enough data to reach alignment; unlike memcpy, we have to
-	 * be aware of the overlap, which means we can't move data twice.  */
-	tbz	tmp2, #3, 1f
-	ldr	tmp1, [src], #8
-	str	tmp1, [dst], #8
-1:
-	tbz	tmp2, #2, 1f
-	ldr	tmp1w, [src], #4
-	str	tmp1w, [dst], #4
-1:
-	tbz	tmp2, #1, 1f
-	ldrh	tmp1w, [src], #2
-	strh	tmp1w, [dst], #2
-1:
-	tbz	tmp2, #0, 1f
-	ldrb	tmp1w, [src], #1
-	strb	tmp1w, [dst], #1
-1:
-
-	/* There may be less than 63 bytes to go now.  */
-	cmp	count, #63
-	b.le	.Ltail63down
-2:
-	subs	count, count, #128
-	b.ge	.Lmov_body_large_down
-	/* Less than 128 bytes to move, so handle 64 here and then jump
-	 * to the tail.  */
-	ldp	A_l, A_h, [src]
-	ldp	B_l, B_h, [src, #16]
-	ldp	C_l, C_h, [src, #32]
-	ldp	D_l, D_h, [src, #48]
-	stp	A_l, A_h, [dst]
-	stp	B_l, B_h, [dst, #16]
-	stp	C_l, C_h, [dst, #32]
-	stp	D_l, D_h, [dst, #48]
-	tst	count, #0x3f
-	add	src, src, #64
-	add	dst, dst, #64
-	b.ne	.Ltail63down
-	ret
-
-	/* Critical loop.  Start at a new cache line boundary.  Assuming
-	 * 64 bytes per line this ensures the entire loop is in one line.  */
-	.p2align 6
-.Lmov_body_large_down:
-	/* There are at least 128 bytes to move.  */
-	ldp	A_l, A_h, [src, #0]
-	sub	dst, dst, #16		/* Pre-bias.  */
-	ldp	B_l, B_h, [src, #16]
-	ldp	C_l, C_h, [src, #32]
-	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
-1:
-	stp	A_l, A_h, [dst, #16]
-	ldp	A_l, A_h, [src, #16]
-	stp	B_l, B_h, [dst, #32]
-	ldp	B_l, B_h, [src, #32]
-	stp	C_l, C_h, [dst, #48]
-	ldp	C_l, C_h, [src, #48]
-	stp	D_l, D_h, [dst, #64]!
-	ldp	D_l, D_h, [src, #64]!
-	subs	count, count, #64
-	b.ge	1b
-	stp	A_l, A_h, [dst, #16]
-	stp	B_l, B_h, [dst, #32]
-	stp	C_l, C_h, [dst, #48]
-	stp	D_l, D_h, [dst, #64]
-	add	src, src, #16
-	add	dst, dst, #64 + 16
-	tst	count, #0x3f
-	b.ne	.Ltail63down
-	ret
 #if defined(WMEMMOVE)
 END(wmemmove)
 #else
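
The generic memmove, in turn, becomes the implementation previously built only for Cortex-A53/A55/A73/A75 (blob c50112d, the file deleted above), which is why those per-CPU variants disappear from Android.bp. It replaces the up/down case analysis with a single test: the sub/ccmp/b.hs sequence branches to memcpy when count is at most 96, or when the unsigned difference dstin - src is at least count; the unsigned wraparound makes one comparison cover both backwards and fully disjoint moves. A hedged C++ rendering of the dispatch, faithful only next to a memcpy that, per the header comment, supports overlap in the delegated cases:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Sketch only; illustrative names. Delegating small and backwards
    // moves to memcpy is safe here only because this memcpy tolerates
    // the overlap in those cases, as the header comment above states.
    void* memmove_dispatch_sketch(void* dst, const void* src, size_t count) {
      uintptr_t diff = reinterpret_cast<uintptr_t>(dst) -
                       reinterpret_cast<uintptr_t>(src);  // wraps when dst < src
      if (count <= 96 || diff >= count) {
        return memcpy(dst, src, count);  // small, backwards, or disjoint
      }
      if (diff == 0) return dst;  // dst == src: the cbz tmp1, 3f early exit
      // Remaining case: dst lies inside (src, src + count), a large forward
      // overlapping copy. The assembly 16-byte-aligns dstend and copies
      // backwards in 64-byte unrolled chunks; a byte loop stands in here.
      unsigned char* d = static_cast<unsigned char*>(dst);
      const unsigned char* s = static_cast<const unsigned char*>(src);
      while (count--) d[count] = s[count];
      return dst;
    }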
diff --git a/libc/arch-x86/bionic/__libc_init_sysinfo.cpp b/libc/arch-x86/bionic/__libc_init_sysinfo.cpp
new file mode 100644
index 0000000..f7c8c23
--- /dev/null
+++ b/libc/arch-x86/bionic/__libc_init_sysinfo.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "private/KernelArgumentBlock.h"
+#include "private/bionic_globals.h"
+
+// This file is compiled without stack protection, because it runs before TLS
+// has been set up.
+
+__LIBC_HIDDEN__ __attribute__((__naked__)) void __libc_int0x80() {
+  __asm__ volatile("int $0x80; ret");
+}
+
+__LIBC_HIDDEN__ void __libc_init_sysinfo(KernelArgumentBlock& args) {
+  // Running under valgrind, AT_SYSINFO won't be set. http://b/77856586.
+  void* at_sysinfo = reinterpret_cast<void*>(args.getauxval(AT_SYSINFO));
+  __libc_sysinfo = (at_sysinfo != nullptr) ? at_sysinfo :
+      reinterpret_cast<void*>(__libc_int0x80);
+}
+
+// TODO: lose this function and just access __libc_sysinfo directly.
+__LIBC_HIDDEN__ extern "C" void* __kernel_syscall() {
+  return __libc_sysinfo;
+}
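
On 32-bit x86, __libc_sysinfo is the process-wide syscall entry: either the kernel's AT_SYSINFO (vsyscall) entry or, failing that, the int $0x80 stub defined above. A rough sketch of how a one-argument syscall would go through it, assuming the usual i386 convention (eax holds the syscall number, ebx the first argument, and ecx/edx treated as clobbered by the kernel entry); bionic's real stubs are generated assembly, so this is illustrative only:

    extern void* __libc_sysinfo;  // real declaration: private/bionic_globals.h

    // i386-only sketch; not a bionic API.
    static long syscall1_sketch(long nr, long arg1) {
      long ret;
      __asm__ volatile("call *%1"
                       : "=a"(ret)
                       : "m"(__libc_sysinfo), "a"(nr), "b"(arg1)
                       : "ecx", "edx", "cc", "memory");
      return ret;
    }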
diff --git a/libc/bionic/__libc_init_main_thread.cpp b/libc/bionic/__libc_init_main_thread.cpp
index efa7dee..8645df2 100644
--- a/libc/bionic/__libc_init_main_thread.cpp
+++ b/libc/bionic/__libc_init_main_thread.cpp
@@ -42,10 +42,6 @@
 // Declared in "private/bionic_ssp.h".
 uintptr_t __stack_chk_guard = 0;
 
-void __libc_init_global_stack_chk_guard(KernelArgumentBlock& args) {
-  __libc_safe_arc4random_buf(&__stack_chk_guard, sizeof(__stack_chk_guard), args);
-}
-
 // Setup for the main thread. For dynamic executables, this is called by the
 // linker _before_ libc is mapped in memory. This means that all writes to
 // globals from this function will apply to linker-private copies and will not
@@ -81,8 +77,7 @@
 
   // We don't want to free the main thread's stack even when the main thread exits
   // because things like environment variables with global scope live on it.
-  // We also can't free the pthread_internal_t itself, since that lives on the main
-  // thread's stack rather than on the heap.
+  // We also can't free the pthread_internal_t itself, since it is a static variable.
   // The main thread has no mmap allocated space for stack or pthread_internal_t.
   main_thread.mmap_size = 0;
 
@@ -97,14 +92,10 @@
   // The TLS stack guard is set from the global, so ensure that we've initialized the global
   // before we initialize the TLS. Dynamic executables will initialize their copy of the global
   // stack protector from the one in the main thread's TLS.
-  __libc_init_global_stack_chk_guard(args);
+  __libc_safe_arc4random_buf(&__stack_chk_guard, sizeof(__stack_chk_guard), args);
   __init_thread_stack_guard(&main_thread);
 
   __init_thread(&main_thread);
 
-  // Store a pointer to the kernel argument block in a TLS slot to be
-  // picked up by the libc constructor.
-  main_thread.tls[TLS_SLOT_BIONIC_PREINIT] = &args;
-
   __init_alternate_signal_stack(&main_thread);
 }

diff --git a/libc/bionic/libc_init_common.cpp b/libc/bionic/libc_init_common.cpp
index 2396c36..1ff6603 100644
--- a/libc/bionic/libc_init_common.cpp
+++ b/libc/bionic/libc_init_common.cpp
@@ -60,25 +60,6 @@
 // Not public, but well-known in the BSDs.
 const char* __progname;
 
-#if defined(__i386__)
-__attribute__((__naked__)) static void __libc_int0x80() {
-  __asm__ volatile("int $0x80; ret");
-}
-
-__LIBC_HIDDEN__ void* __libc_sysinfo = reinterpret_cast<void*>(__libc_int0x80);
-
-__LIBC_HIDDEN__ void __libc_init_sysinfo(KernelArgumentBlock& args) {
-  // Running under valgrind, AT_SYSINFO won't be set.
-  void* at_sysinfo = reinterpret_cast<void*>(args.getauxval(AT_SYSINFO));
-  if (at_sysinfo != nullptr) __libc_sysinfo = at_sysinfo;
-}
-
-// TODO: lose this function and just access __libc_sysinfo directly.
-__LIBC_HIDDEN__ extern "C" void* __kernel_syscall() {
-  return __libc_sysinfo;
-}
-#endif
-
 void __libc_init_globals(KernelArgumentBlock& args) {
 #if defined(__i386__)
   __libc_init_sysinfo(args);
diff --git a/libc/bionic/libc_init_dynamic.cpp b/libc/bionic/libc_init_dynamic.cpp
index 0b68280..6b8f57f 100644
--- a/libc/bionic/libc_init_dynamic.cpp
+++ b/libc/bionic/libc_init_dynamic.cpp
@@ -62,6 +62,12 @@
   extern int __cxa_atexit(void (*)(void *), void *, void *);
 };
 
+// Use an initializer so __libc_sysinfo will have a fallback implementation
+// while .preinit_array constructors run.
+#if defined(__i386__)
+__LIBC_HIDDEN__ void* __libc_sysinfo = reinterpret_cast<void*>(__libc_int0x80);
+#endif
+
 // We need a helper function for __libc_preinit because compiling with LTO may
 // inline functions requiring a stack protector check, but __stack_chk_guard is
 // not initialized at the start of __libc_preinit. __libc_preinit_impl will run
@@ -85,14 +91,11 @@
 // to run before any others (such as the jemalloc constructor), and lower
 // is better (http://b/68046352).
 __attribute__((constructor(1))) static void __libc_preinit() {
-  // Read the kernel argument block pointer from TLS.
+  // Read the kernel argument block pointer from TLS, then clear the slot so no
+  // other initializer sees its value.
   void** tls = __get_tls();
-  KernelArgumentBlock** args_slot = &reinterpret_cast<KernelArgumentBlock**>(tls)[TLS_SLOT_BIONIC_PREINIT];
-  KernelArgumentBlock* args = *args_slot;
-
-  // Clear the slot so no other initializer sees its value.
-  // __libc_init_common() will change the TLS area so the old one won't be accessible anyway.
-  *args_slot = NULL;
+  KernelArgumentBlock* args = static_cast<KernelArgumentBlock*>(tls[TLS_SLOT_BIONIC_PREINIT]);
+  tls[TLS_SLOT_BIONIC_PREINIT] = nullptr;
 
   // The linker has initialized its copy of the global stack_chk_guard, and filled in the main
   // thread's TLS slot with that value. Initialize the local global stack guard with its value.
@@ -102,9 +105,8 @@
 }
 
 // This function is called from the executable's _start entry point
-// (see arch-$ARCH/bionic/crtbegin_dynamic.S), which is itself
-// called by the dynamic linker after it has loaded all shared
-// libraries the executable depends on.
+// (see arch-$ARCH/bionic/crtbegin.c), which is itself called by the dynamic
+// linker after it has loaded all shared libraries the executable depends on.
 //
 // Note that the dynamic linker has also run all constructors in the
 // executable at this point.
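
The TLS slot read here is the consuming half of a handshake whose producing half moves into linker_main.cpp below: while the linker and the not-yet-initialized libc still share the main thread's initial TLS, the linker publishes &args in TLS_SLOT_BIONIC_PREINIT, and __libc_preinit consumes and clears it. A condensed sketch of the two halves (the accessor signature and the slot index are stand-ins, not bionic's real definitions; see private/bionic_tls.h):

    // Stand-ins for bionic internals.
    extern "C" void** __get_tls();
    constexpr int TLS_SLOT_BIONIC_PREINIT = 3;  // hypothetical index
    class KernelArgumentBlock;

    // Linker side: runs first, before any constructor in libc.so.
    void publish_args(KernelArgumentBlock* args) {
      __get_tls()[TLS_SLOT_BIONIC_PREINIT] = args;
    }

    // libc side (__libc_preinit above): consume and clear the slot so no
    // later initializer sees the pointer.
    KernelArgumentBlock* consume_args() {
      void** tls = __get_tls();
      auto* args = static_cast<KernelArgumentBlock*>(tls[TLS_SLOT_BIONIC_PREINIT]);
      tls[TLS_SLOT_BIONIC_PREINIT] = nullptr;
      return args;
    }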
diff --git a/libc/bionic/libc_init_static.cpp b/libc/bionic/libc_init_static.cpp
index d19dd28..038f6a7 100644
--- a/libc/bionic/libc_init_static.cpp
+++ b/libc/bionic/libc_init_static.cpp
@@ -45,6 +45,13 @@
 #include "private/bionic_tls.h"
 #include "private/KernelArgumentBlock.h"
 
+// Leave the variable uninitialized for the sake of the dynamic loader, which
+// links in this file. The loader will initialize this variable before
+// relocating itself.
+#if defined(__i386__)
+__LIBC_HIDDEN__ void* __libc_sysinfo;
+#endif
+
 extern "C" int __cxa_atexit(void (*)(void *), void *, void *);
 
 static void call_array(void(**list)()) {
@@ -86,10 +93,9 @@
   BIONIC_STOP_UNWIND;
 
   KernelArgumentBlock args(raw_args);
-  __libc_init_main_thread(args);
 
   // Initializing the globals requires TLS to be available for errno.
-  __init_thread_stack_guard(__get_thread());
+  __libc_init_main_thread(args);
   __libc_init_globals(args);
 
   __libc_init_AT_SECURE(args);
diff --git a/libc/libc.arm.map b/libc/libc.arm.map
index ee05b16..159d4a0 100644
--- a/libc/libc.arm.map
+++ b/libc/libc.arm.map
@@ -746,7 +746,7 @@
     pthread_cond_signal;
     pthread_cond_timedwait;
     pthread_cond_timedwait_monotonic; # arm x86 mips
-    pthread_cond_timedwait_monotonic_np; # introduced-arm=16 introduced-x86=16 introduced-mips=16 introduced-arm64=28 introduced-x86_64=28 introduced-mips64=28
+    pthread_cond_timedwait_monotonic_np; # introduced-arm=9 introduced-x86=9 introduced-mips=9 introduced-arm64=28 introduced-x86_64=28 introduced-mips64=28
     pthread_cond_timedwait_relative_np; # arm x86 mips
     pthread_cond_timeout_np; # arm x86 mips
     pthread_cond_wait;
diff --git a/libc/libc.arm64.map b/libc/libc.arm64.map
index 5e6f991..7702f9e 100644
--- a/libc/libc.arm64.map
+++ b/libc/libc.arm64.map
@@ -692,7 +692,7 @@
     pthread_cond_init;
     pthread_cond_signal;
     pthread_cond_timedwait;
-    pthread_cond_timedwait_monotonic_np; # introduced-arm=16 introduced-x86=16 introduced-mips=16 introduced-arm64=28 introduced-x86_64=28 introduced-mips64=28
+    pthread_cond_timedwait_monotonic_np; # introduced-arm=9 introduced-x86=9 introduced-mips=9 introduced-arm64=28 introduced-x86_64=28 introduced-mips64=28
     pthread_cond_wait;
     pthread_condattr_destroy;
     pthread_condattr_getclock; # introduced=21
diff --git a/libc/libc.map.txt b/libc/libc.map.txt
index 6bbffba..ebe2098 100644
--- a/libc/libc.map.txt
+++ b/libc/libc.map.txt
@@ -771,7 +771,7 @@
     pthread_cond_signal;
     pthread_cond_timedwait;
     pthread_cond_timedwait_monotonic; # arm x86 mips
-    pthread_cond_timedwait_monotonic_np; # introduced-arm=16 introduced-x86=16 introduced-mips=16 introduced-arm64=28 introduced-x86_64=28 introduced-mips64=28
+    pthread_cond_timedwait_monotonic_np; # introduced-arm=9 introduced-x86=9 introduced-mips=9 introduced-arm64=28 introduced-x86_64=28 introduced-mips64=28
     pthread_cond_timedwait_relative_np; # arm x86 mips
     pthread_cond_timeout_np; # arm x86 mips
     pthread_cond_wait;
diff --git a/libc/libc.mips.map b/libc/libc.mips.map
index cdba570..d04cb4f 100644
--- a/libc/libc.mips.map
+++ b/libc/libc.mips.map
@@ -744,7 +744,7 @@
     pthread_cond_signal;
     pthread_cond_timedwait;
     pthread_cond_timedwait_monotonic; # arm x86 mips
-    pthread_cond_timedwait_monotonic_np; # introduced-arm=16 introduced-x86=16 introduced-mips=16 introduced-arm64=28 introduced-x86_64=28 introduced-mips64=28
+    pthread_cond_timedwait_monotonic_np; # introduced-arm=9 introduced-x86=9 introduced-mips=9 introduced-arm64=28 introduced-x86_64=28 introduced-mips64=28
     pthread_cond_timedwait_relative_np; # arm x86 mips
     pthread_cond_timeout_np; # arm x86 mips
     pthread_cond_wait;
diff --git a/libc/libc.mips64.map b/libc/libc.mips64.map
index 5e6f991..7702f9e 100644
--- a/libc/libc.mips64.map
+++ b/libc/libc.mips64.map
@@ -692,7 +692,7 @@
     pthread_cond_init;
     pthread_cond_signal;
     pthread_cond_timedwait;
-    pthread_cond_timedwait_monotonic_np; # introduced-arm=16 introduced-x86=16 introduced-mips=16 introduced-arm64=28 introduced-x86_64=28 introduced-mips64=28
+    pthread_cond_timedwait_monotonic_np; # introduced-arm=9 introduced-x86=9 introduced-mips=9 introduced-arm64=28 introduced-x86_64=28 introduced-mips64=28
     pthread_cond_wait;
     pthread_condattr_destroy;
     pthread_condattr_getclock; # introduced=21
diff --git a/libc/libc.x86.map b/libc/libc.x86.map
index 5cb0cf4..2e10bbb 100644
--- a/libc/libc.x86.map
+++ b/libc/libc.x86.map
@@ -742,7 +742,7 @@
     pthread_cond_signal;
     pthread_cond_timedwait;
     pthread_cond_timedwait_monotonic; # arm x86 mips
-    pthread_cond_timedwait_monotonic_np; # introduced-arm=16 introduced-x86=16 introduced-mips=16 introduced-arm64=28 introduced-x86_64=28 introduced-mips64=28
+    pthread_cond_timedwait_monotonic_np; # introduced-arm=9 introduced-x86=9 introduced-mips=9 introduced-arm64=28 introduced-x86_64=28 introduced-mips64=28
     pthread_cond_timedwait_relative_np; # arm x86 mips
     pthread_cond_timeout_np; # arm x86 mips
     pthread_cond_wait;
diff --git a/libc/libc.x86_64.map b/libc/libc.x86_64.map
index 5e6f991..7702f9e 100644
--- a/libc/libc.x86_64.map
+++ b/libc/libc.x86_64.map
@@ -692,7 +692,7 @@
     pthread_cond_init;
     pthread_cond_signal;
     pthread_cond_timedwait;
-    pthread_cond_timedwait_monotonic_np; # introduced-arm=16 introduced-x86=16 introduced-mips=16 introduced-arm64=28 introduced-x86_64=28 introduced-mips64=28
+    pthread_cond_timedwait_monotonic_np; # introduced-arm=9 introduced-x86=9 introduced-mips=9 introduced-arm64=28 introduced-x86_64=28 introduced-mips64=28
     pthread_cond_wait;
     pthread_condattr_destroy;
     pthread_condattr_getclock; # introduced=21
diff --git a/libc/malloc_debug/README.md b/libc/malloc_debug/README.md
index 18d22a4..c427d5f 100644
--- a/libc/malloc_debug/README.md
+++ b/libc/malloc_debug/README.md
@@ -610,6 +610,11 @@
 
     adb shell setprop wrap.<APP> '"LIBC_DEBUG_MALLOC_OPTIONS=backtrace logwrapper"'
 
+If you need to enable multiple options using this method, separate them
+with escaped spaces:
+
+    adb shell setprop wrap.<APP> '"LIBC_DEBUG_MALLOC_OPTIONS=backtrace\ leak_track\ fill logwrapper"'
+
 For example, to enable malloc debug for the google search box (Android O or later):
 
     adb shell setprop wrap.com.google.android.googlequicksearchbox '"LIBC_DEBUG_MALLOC_OPTIONS=backtrace logwrapper"'
diff --git a/libc/malloc_hooks/README.md b/libc/malloc_hooks/README.md
index 85b2769..b418e1e 100644
--- a/libc/malloc_hooks/README.md
+++ b/libc/malloc_hooks/README.md
@@ -89,13 +89,13 @@
 Enable the hooks for all processes:
 
     adb shell stop
-    adb shell setprop libc.debug.malloc.hooks 1
+    adb shell setprop libc.debug.hooks.enable 1
     adb shell start
 
-Enable malloc debug using an environment variable:
+Enable malloc hooks using an environment variable:
 
     adb shell
-    # export LIBC_HOOK_ENABLE=1
+    # export LIBC_HOOKS_ENABLE=1
     # ls
 
 Any process spawned from this shell will run with malloc hooks enabled.
diff --git a/libc/private/bionic_globals.h b/libc/private/bionic_globals.h
index 94dd7e8..ad0244c 100644
--- a/libc/private/bionic_globals.h
+++ b/libc/private/bionic_globals.h
@@ -50,6 +50,7 @@
 
 #if defined(__i386__)
 __LIBC_HIDDEN__ extern void* __libc_sysinfo;
+__LIBC_HIDDEN__ void __libc_int0x80();
 __LIBC_HIDDEN__ void __libc_init_sysinfo(KernelArgumentBlock& args);
 #endif
 
diff --git a/linker/linker.cpp b/linker/linker.cpp
index 12d3a78..9503a49 100644
--- a/linker/linker.cpp
+++ b/linker/linker.cpp
@@ -189,7 +189,6 @@
     "libgui.so",
     "libmedia.so",
     "libnativehelper.so",
-    "libskia.so",
     "libssl.so",
     "libstagefright.so",
     "libsqlite.so",
diff --git a/linker/linker_main.cpp b/linker/linker_main.cpp
index 00c72d0..1a0d164 100644
--- a/linker/linker_main.cpp
+++ b/linker/linker_main.cpp
@@ -228,12 +228,7 @@
   _exit(EXIT_FAILURE);
 }
 
-/*
- * This code is called after the linker has linked itself and
- * fixed it's own GOT. It is safe to make references to externs
- * and other non-local data at this point.
- */
-static ElfW(Addr) __linker_init_post_relocation(KernelArgumentBlock& args) {
+static ElfW(Addr) linker_main(KernelArgumentBlock& args) {
   ProtectedDataGuard guard;
 
 #if TIMING
@@ -414,6 +409,10 @@
 
   if (!get_cfi_shadow()->InitialLinkDone(solist)) __linker_cannot_link(g_argv[0]);
 
+  // Store a pointer to the kernel argument block in a TLS slot to be
+  // picked up by the libc constructor.
+  __get_tls()[TLS_SLOT_BIONIC_PREINIT] = &args;
+
   si->call_pre_init_constructors();
 
   /* After the prelink_image, the si->load_bias is initialized.
@@ -494,6 +493,11 @@
   return 0;
 }
 
+static ElfW(Addr) __attribute__((noinline))
+__linker_init_post_relocation(KernelArgumentBlock& args,
+                              ElfW(Addr) linker_addr,
+                              soinfo& linker_so);
+
 /*
  * This is the entry point for the linker, called from begin.S. This
  * method is responsible for fixing the linker's own relocations, and
@@ -506,6 +510,10 @@
 extern "C" ElfW(Addr) __linker_init(void* raw_args) {
   KernelArgumentBlock args(raw_args);
 
+#if defined(__i386__)
+  __libc_init_sysinfo(args);
+#endif
+
  // AT_BASE is set to 0 in the case when the linker is run by itself,
  // so in order to link the linker it needs to calculate AT_BASE
   // using information at hand. The trick below takes advantage
@@ -523,7 +531,6 @@
   linker_addr += reinterpret_cast<uintptr_t>(raw_args);
 #endif
 
-  ElfW(Addr) entry_point = args.getauxval(AT_ENTRY);
   ElfW(Ehdr)* elf_hdr = reinterpret_cast<ElfW(Ehdr)*>(linker_addr);
   ElfW(Phdr)* phdr = reinterpret_cast<ElfW(Phdr)*>(linker_addr + elf_hdr->e_phoff);
 
@@ -548,15 +555,19 @@
   // functions at this point.
   if (!linker_so.link_image(g_empty_list, g_empty_list, nullptr)) __linker_cannot_link(args.argv[0]);
 
-#if defined(__i386__)
-  // On x86, we can't make system calls before this point.
-  // We can't move this up because this needs to assign to a global.
-  // Note that until we call __libc_init_main_thread below we have
-  // no TLS, so you shouldn't make a system call that can fail, because
-  // it will SEGV when it tries to set errno.
-  __libc_init_sysinfo(args);
-#endif
+  return __linker_init_post_relocation(args, linker_addr, linker_so);
+}
 
+/*
+ * This code is called after the linker has linked itself and fixed its own
+ * GOT. It is safe to make references to externs and other non-local data at
+ * this point. The compiler sometimes moves GOT references earlier in a
+ * function, so avoid inlining this function (http://b/80503879).
+ */
+static ElfW(Addr) __attribute__((noinline))
+__linker_init_post_relocation(KernelArgumentBlock& args,
+                              ElfW(Addr) linker_addr,
+                              soinfo& linker_so) {
   // Initialize the main thread (including TLS, so system calls really work).
   __libc_init_main_thread(args);
 
@@ -581,6 +592,7 @@
   //
   // This happens when user tries to run 'adb shell /system/bin/linker'
   // see also https://code.google.com/p/android/issues/detail?id=63174
+  ElfW(Addr) entry_point = args.getauxval(AT_ENTRY);
   if (reinterpret_cast<ElfW(Addr)>(&_start) == entry_point) {
     async_safe_format_fd(STDOUT_FILENO,
                      "This is %s, the helper program for dynamic executables.\n",
@@ -596,10 +608,8 @@
   sonext = solist = get_libdl_info(kLinkerPath, linker_so, linker_link_map);
   g_default_namespace.add_soinfo(solist);
 
-  // We have successfully fixed our own relocations. It's safe to run
-  // the main part of the linker now.
   args.abort_message_ptr = &g_abort_message;
-  ElfW(Addr) start_address = __linker_init_post_relocation(args);
+  ElfW(Addr) start_address = linker_main(args);
 
   INFO("[ Jumping to _start (%p)... ]", reinterpret_cast<void*>(start_address));
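
The noinline split carries the whole fix: until link_image() returns, the linker's own GOT entries are unrelocated, and with everything in one function the compiler may hoist a GOT load above that point (http://b/80503879). Moving all post-relocation work behind an out-of-line call makes the ordering something the optimizer cannot undo. The shape of the pattern in miniature, with hypothetical names:

    int g_reached_via_got;            // any global the compiler loads via the GOT

    static void relocate_self() { /* stand-in for link_image() */ }

    // The noinline attribute is the barrier: the GOT load of
    // g_reached_via_got cannot be hoisted above the call in entry().
    static int __attribute__((noinline)) after_relocation() {
      return g_reached_via_got;
    }

    extern "C" int entry() {
      relocate_self();                // no GOT references before this point
      return after_relocation();      // safe: relocations are done
    }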