Merge "Use builtins for the x86/x86-64 lrint family." into main
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 6b6c448..319db25 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -189,6 +189,5 @@
 
 Some devices have a `perf-setup.sh` script that locks CPU and GPU frequencies. Some TradeFed
 benchmarks appear to be using the script. For more information:
- * run `get_build_var BOARD_PERFSETUP_SCRIPT`
- * run `m perf-setup` to install the script into `${OUT}/data/local/tmp/perf-setup.sh`
+ * run `adb shell perf-setup.sh` to execute the script; it is installed on the device by default for eng and userdebug builds
  * see: https://android.googlesource.com/platform/platform_testing/+/refs/heads/main/scripts/perf-setup/
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/string/cache.h
deleted file mode 100644
index 33719a0..0000000
--- a/libc/arch-x86/string/cache.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#ifdef FOR_ATOM
-#define SHARED_CACHE_SIZE (512 * 1024) /* Atom L2 Cache */
-#endif
-#ifdef FOR_SILVERMONT
-#define SHARED_CACHE_SIZE (1024 * 1024) /* Silvermont L2 Cache */
-#endif
-
-#define DATA_CACHE_SIZE (24 * 1024) /* Atom and Silvermont L1 Data Cache */
-
-#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2)
diff --git a/libc/arch-x86/string/sse2-memmove-slm.S b/libc/arch-x86/string/sse2-memmove-slm.S
index 79b5d1b..7f42374 100644
--- a/libc/arch-x86/string/sse2-memmove-slm.S
+++ b/libc/arch-x86/string/sse2-memmove-slm.S
@@ -29,7 +29,6 @@
 */
 
 #define FOR_SILVERMONT
-#include "cache.h"
 
 #ifndef MEMMOVE
 # define MEMMOVE	memmove_generic
@@ -94,6 +93,8 @@
 #define RETURN_END	POP (%ebx); ret
 #define RETURN		RETURN_END; CFI_PUSH (%ebx)
 
+#define SETUP_PIC_REG(x)	call	__x86.get_pc_thunk.x
+
 	.section .text.sse2,"ax",@progbits
 ENTRY (MEMMOVE)
 	ENTRANCE
@@ -193,7 +194,13 @@
 	cmp	%edi, %ebx
 	jbe	L(mm_copy_remaining_forward)
 
-	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
+	PUSH(%ebx)
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+	/* Restore ebx. The pop can be placed before the jump because it doesn't affect any flags. */
+	POP(%ebx)
+
 	jae	L(mm_large_page_loop_forward)
 
 	.p2align 4
@@ -424,7 +431,13 @@
 	cmp	%edi, %ebx
 	jae	L(mm_main_loop_backward_end)
 
-	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
+	PUSH(%ebx)
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+	/* Restore ebx. The pop can be placed before the jump because it doesn't affect any flags. */
+	POP(%ebx)
+
 	jae	L(mm_large_page_loop_backward)
 
 	.p2align 4
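
With the cache sizes now runtime globals instead of assemble-time constants, the 32-bit code can no longer encode them as immediates. Each compare therefore materializes the GOT base in %ebx (saved and restored around the compare) and addresses the variable with @GOTOFF; the memset files below add the same SETUP_PIC_REG sequence. A minimal C illustration of why that shape appears, assuming the cache-size globals are hidden within libc.so (the function name here is made up, and the exact thunk register a compiler picks may differ):

```c
#include <stddef.h>

// Declared hidden so that -fPIC code can use the @GOTOFF form rather than a
// full GOT load; the variable itself is defined in libc_init_common.cpp.
extern size_t __x86_shared_cache_size_half __attribute__((visibility("hidden")));

// Compiling this with `-m32 -fPIC -O2 -S` produces essentially the same
// pattern as the hand-written assembly above (register choice may differ):
//   call  __x86.get_pc_thunk.bx
//   addl  $_GLOBAL_OFFSET_TABLE_, %ebx
//   movl  __x86_shared_cache_size_half@GOTOFF(%ebx), %eax
size_t half_shared_cache_size(void) {
  return __x86_shared_cache_size_half;
}
```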
diff --git a/libc/arch-x86/string/sse2-memset-atom.S b/libc/arch-x86/string/sse2-memset-atom.S
index 320afec..e43ead0 100644
--- a/libc/arch-x86/string/sse2-memset-atom.S
+++ b/libc/arch-x86/string/sse2-memset-atom.S
@@ -31,7 +31,6 @@
 #include <private/bionic_asm.h>
 
 #define FOR_ATOM
-#include "cache.h"
 
 #ifndef L
 # define L(label)	.L##label
@@ -64,6 +63,8 @@
 #define RETURN		RETURN_END; CFI_PUSH(%ebx)
 #define JMPTBL(I, B)	I - B
 
+#define SETUP_PIC_REG(x)	call	__x86.get_pc_thunk.x
+
 /* Load an entry in a jump table into EBX and branch to it.  TABLE is a
    jump table with relative offsets.   */
 # define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
@@ -256,14 +257,20 @@
 	ALIGN(4)
 L(128bytesormore):
 	PUSH(%ebx)
-	mov	$SHARED_CACHE_SIZE, %ebx
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
 	cmp	%ebx, %ecx
 	jae	L(128bytesormore_nt_start)
 
 
 	POP(%ebx)
 # define RESTORE_EBX_STATE CFI_PUSH(%ebx)
-	cmp	$DATA_CACHE_SIZE, %ecx
+	PUSH(%ebx)
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
+	POP(%ebx)
 
 	jae	L(128bytes_L2_normal)
 	subl	$128, %ecx
diff --git a/libc/arch-x86/string/sse2-memset-slm.S b/libc/arch-x86/string/sse2-memset-slm.S
index 5cff141..e4c8fa1 100644
--- a/libc/arch-x86/string/sse2-memset-slm.S
+++ b/libc/arch-x86/string/sse2-memset-slm.S
@@ -31,7 +31,6 @@
 #include <private/bionic_asm.h>
 
 #define FOR_SILVERMONT
-#include "cache.h"
 
 #ifndef L
 # define L(label)	.L##label
@@ -64,6 +63,8 @@
 # define RETURN		RETURN_END; CFI_PUSH(%ebx)
 # define JMPTBL(I, B)	I - B
 
+#define SETUP_PIC_REG(x)	call	__x86.get_pc_thunk.x
+
 /* Load an entry in a jump table into EBX and branch to it.  TABLE is a
    jump table with relative offsets.   */
 # define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
@@ -177,14 +178,18 @@
 	ALIGN(4)
 L(128bytesormore):
 	PUSH(%ebx)
-	mov	$SHARED_CACHE_SIZE, %ebx
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
 	cmp	%ebx, %ecx
 	jae	L(128bytesormore_nt_start)
 
 	POP(%ebx)
 
 	PUSH(%ebx)
-	mov	$DATA_CACHE_SIZE, %ebx
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	mov	__x86_data_cache_size@GOTOFF(%ebx), %ebx
 
 	cmp	%ebx, %ecx
 	jae	L(128bytes_L2_normal)
diff --git a/libc/arch-x86/string/ssse3-memcpy-atom.S b/libc/arch-x86/string/ssse3-memcpy-atom.S
index fe3082e..83e1985 100644
--- a/libc/arch-x86/string/ssse3-memcpy-atom.S
+++ b/libc/arch-x86/string/ssse3-memcpy-atom.S
@@ -29,7 +29,6 @@
 */
 
 #define FOR_ATOM
-#include "cache.h"
 
 #ifndef MEMCPY
 # define MEMCPY	memcpy_atom
diff --git a/libc/arch-x86_64/string/avx2-memset-kbl.S b/libc/arch-x86_64/string/avx2-memset-kbl.S
index ca62a9f..35d682a 100644
--- a/libc/arch-x86_64/string/avx2-memset-kbl.S
+++ b/libc/arch-x86_64/string/avx2-memset-kbl.S
@@ -30,7 +30,6 @@
 
 #include <private/bionic_asm.h>
 
-#include "cache.h"
 
 #ifndef L
 # define L(label)	.L##label
@@ -117,11 +116,8 @@
 	cmpq	%rcx, %rdx
 	je	L(done)
 
-#ifdef SHARED_CACHE_SIZE
-	cmp	$SHARED_CACHE_SIZE, %r8
-#else
-	cmp	__x86_64_shared_cache_size(%rip), %r8
-#endif
+	cmp	__x86_shared_cache_size(%rip), %r8
+
 	ja	L(non_temporal_loop)
 
 	ALIGN (4)
diff --git a/libc/arch-x86_64/string/cache.h b/libc/arch-x86_64/string/cache.h
deleted file mode 100644
index 4131509..0000000
--- a/libc/arch-x86_64/string/cache.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
-Copyright (c) 2014, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-/* Values are optimized for Core Architecture */
-#define SHARED_CACHE_SIZE (4096*1024)  /* Core Architecture L2 Cache */
-#define DATA_CACHE_SIZE   (24*1024)    /* Core Architecture L1 Data Cache */
-
-#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF   (DATA_CACHE_SIZE / 2)
diff --git a/libc/arch-x86_64/string/sse2-memmove-slm.S b/libc/arch-x86_64/string/sse2-memmove-slm.S
index 7395028..8b32680 100644
--- a/libc/arch-x86_64/string/sse2-memmove-slm.S
+++ b/libc/arch-x86_64/string/sse2-memmove-slm.S
@@ -28,7 +28,6 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#include "cache.h"
 
 #ifndef MEMMOVE
 # define MEMMOVE		memmove
@@ -189,8 +188,9 @@
 	cmp	%r8, %rbx
 	jbe	L(mm_copy_remaining_forward)
 
-	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
-	jae	L(mm_large_page_loop_forward)
+	cmp	__x86_shared_cache_size_half(%rip), %rdx
+
+	ja      L(mm_overlapping_check_forward)
 
 	.p2align 4
 L(mm_main_loop_forward):
@@ -414,8 +414,9 @@
 	cmp	%r9, %rbx
 	jae	L(mm_recalc_len)
 
-	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
-	jae	L(mm_large_page_loop_backward)
+	cmp	__x86_shared_cache_size_half(%rip), %rdx
+
+	ja	L(mm_overlapping_check_backward)
 
 	.p2align 4
 L(mm_main_loop_backward):
@@ -481,6 +482,13 @@
 /* Big length copy forward part.  */
 
 	.p2align 4
+
+L(mm_overlapping_check_forward):
+	mov	%rsi, %r9
+	add	%rdx, %r9
+	cmp	__x86_shared_cache_size(%rip), %r9
+	jbe	L(mm_main_loop_forward)
+
 L(mm_large_page_loop_forward):
 	movdqu	(%r8, %rsi), %xmm0
 	movdqu	16(%r8, %rsi), %xmm1
@@ -498,6 +506,14 @@
 
 /* Big length copy backward part.  */
 	.p2align 4
+
+L(mm_overlapping_check_backward):
+	mov	%rdi, %r11
+	sub	%rsi, %r11 /* r11 = dst - src, diff */
+	add	%rdx, %r11
+	cmp	__x86_shared_cache_size(%rip), %r11
+	jbe	L(mm_main_loop_backward)
+
 L(mm_large_page_loop_backward):
 	movdqu	-64(%r9, %r8), %xmm0
 	movdqu	-48(%r9, %r8), %xmm1
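
Beyond switching to the runtime global, the x86-64 memmove change also gates the non-temporal path on overlap: even when the length exceeds half the shared cache, the new mm_overlapping_check_* blocks fall back to the ordinary copy loop if the span between source and destination still fits in the shared cache. A hedged C sketch of the backward-path test, following the diff's own "r11 = dst - src, diff" comment and assuming %rdx still holds the remaining length at that point (the helper name is invented):

```c
#include <stddef.h>
#include <stdint.h>

// Runtime value initialized in libc_init_common.cpp.
extern size_t __x86_shared_cache_size;

// Mirrors L(mm_overlapping_check_backward): span = (dst - src) + len, and the
// `jbe` keeps the regular loop whenever that span still fits in the shared
// cache, so non-temporal stores are only used when it does not.
static int use_non_temporal_backward(const char* dst, const char* src, size_t len) {
  uintptr_t span = (uintptr_t)(dst - src) + len;
  return span > __x86_shared_cache_size;
}
```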
diff --git a/libc/arch-x86_64/string/sse2-memset-slm.S b/libc/arch-x86_64/string/sse2-memset-slm.S
index cceadd2..84ab327 100644
--- a/libc/arch-x86_64/string/sse2-memset-slm.S
+++ b/libc/arch-x86_64/string/sse2-memset-slm.S
@@ -30,7 +30,6 @@
 
 #include <private/bionic_asm.h>
 
-#include "cache.h"
 
 #ifndef L
 # define L(label)	.L##label
@@ -116,11 +115,8 @@
 	cmpq	%rcx, %rdx
 	je	L(return)
 
-#ifdef SHARED_CACHE_SIZE
-	cmp	$SHARED_CACHE_SIZE, %r8
-#else
-	cmp	__x86_64_shared_cache_size(%rip), %r8
-#endif
+	cmp	__x86_shared_cache_size(%rip), %r8
+
 	ja	L(128bytesmore_nt)
 
 	ALIGN (4)
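
In the 64-bit memset variants the change is simpler: the old #ifdef SHARED_CACHE_SIZE / __x86_64_shared_cache_size split collapses into a single rip-relative load of the unified __x86_shared_cache_size global, and the `ja` keeps the existing policy of using non-temporal stores only for fills larger than the shared cache. A small sketch of that threshold, with a hypothetical helper name and assuming %r8 holds the remaining fill size as before:

```c
#include <stddef.h>

extern size_t __x86_shared_cache_size;

// Matches the `ja L(non_temporal_loop)` / `ja L(128bytesmore_nt)` tests: only
// a fill strictly larger than the shared cache bypasses the cache.
static int fill_should_bypass_cache(size_t fill_bytes) {
  return fill_bytes > __x86_shared_cache_size;
}
```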
diff --git a/libc/arch-x86_64/string/sse4-memcmp-slm.S b/libc/arch-x86_64/string/sse4-memcmp-slm.S
index 8a8b180..46ad78d 100644
--- a/libc/arch-x86_64/string/sse4-memcmp-slm.S
+++ b/libc/arch-x86_64/string/sse4-memcmp-slm.S
@@ -28,7 +28,6 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#include "cache.h"
 
 #ifndef MEMCMP
 # define MEMCMP		memcmp
@@ -353,11 +352,7 @@
 
 	ALIGN (4)
 L(512bytesormore):
-#ifdef DATA_CACHE_SIZE_HALF
-	mov	$DATA_CACHE_SIZE_HALF, %r8
-#else
-	mov	__x86_64_data_cache_size_half(%rip), %r8
-#endif
+	mov	__x86_data_cache_size_half(%rip), %r8
 	mov	%r8, %r9
 	shr	$1, %r8
 	add	%r9, %r8
@@ -669,11 +664,7 @@
 
 	ALIGN (4)
 L(512bytesormorein2aligned):
-#ifdef DATA_CACHE_SIZE_HALF
-	mov	$DATA_CACHE_SIZE_HALF, %r8
-#else
-	mov	__x86_64_data_cache_size_half(%rip), %r8
-#endif
+	mov	__x86_data_cache_size_half(%rip), %r8
 	mov	%r8, %r9
 	shr	$1, %r8
 	add	%r9, %r8
diff --git a/libc/bionic/libc_init_common.cpp b/libc/bionic/libc_init_common.cpp
index 939e4e1..8c7a49b 100644
--- a/libc/bionic/libc_init_common.cpp
+++ b/libc/bionic/libc_init_common.cpp
@@ -64,6 +64,28 @@
 __BIONIC_WEAK_VARIABLE_FOR_NATIVE_BRIDGE
 const char* __progname;
 
+#if defined(__i386__) || defined(__x86_64__)
+// Default sizes based on the old hard-coded values for Atom/Silvermont (x86) and Core 2 (x86-64)...
+size_t __x86_data_cache_size = 24 * 1024;
+size_t __x86_data_cache_size_half = __x86_data_cache_size / 2;
+size_t __x86_shared_cache_size = sizeof(long) == 8 ? 4096 * 1024 : 1024 * 1024;
+size_t __x86_shared_cache_size_half = __x86_shared_cache_size / 2;
+// ...overwritten at runtime based on the cpu's reported cache sizes.
+static void __libc_init_x86_cache_info() {
+  // Handle the case where during early boot /sys fs may not yet be ready,
+  // resulting in sysconf() returning 0, leading to crashes.
+  // In that case (basically just init), we keep the defaults.
+  if (sysconf(_SC_LEVEL1_DCACHE_SIZE) != 0) {
+    __x86_data_cache_size = sysconf(_SC_LEVEL1_DCACHE_SIZE);
+    __x86_data_cache_size_half = __x86_data_cache_size / 2;
+  }
+  if (sysconf(_SC_LEVEL2_CACHE_SIZE) != 0) {
+    __x86_shared_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE);
+    __x86_shared_cache_size_half = __x86_shared_cache_size / 2;
+  }
+}
+#endif
+
 void __libc_init_globals() {
   // Initialize libc globals that are needed in both the linker and in libc.
   // In dynamic binaries, this is run at least twice for different copies of the
@@ -78,8 +100,10 @@
 #if !defined(__LP64__)
 static void __check_max_thread_id() {
   if (gettid() > 65535) {
-    async_safe_fatal("Limited by the size of pthread_mutex_t, 32 bit bionic libc only accepts "
-                     "pid <= 65535, but current pid is %d", gettid());
+    async_safe_fatal("32-bit pthread_mutex_t only supports pids <= 65535; "
+                     "current pid is %d; "
+                     "run `echo 65535 > /proc/sys/kernel/pid_max` as root",
+                     gettid());
   }
 }
 #endif
@@ -173,6 +197,10 @@
   __system_properties_init(); // Requires 'environ'.
   __libc_init_fdsan(); // Requires system properties (for debug.fdsan).
   __libc_init_fdtrack();
+
+#if defined(__i386__) || defined(__x86_64__)
+  __libc_init_x86_cache_info();
+#endif
 }
 
 void __libc_init_fork_handler() {
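
The runtime values behind all of the loads above come from the new __libc_init_x86_cache_info(), which queries sysconf() once /sys is available. A standalone sketch of the same queries (not bionic's code) shows what the initializer reads and why a 0 or -1 result means the compile-time defaults are kept:

```c
#include <stdio.h>
#include <unistd.h>

int main(void) {
  // The same sysconf names the initializer uses; on Linux they report the
  // L1 data cache and L2 cache sizes in bytes, or 0/-1 when the kernel
  // doesn't expose the information (e.g. very early in boot).
  long l1d = sysconf(_SC_LEVEL1_DCACHE_SIZE);
  long l2 = sysconf(_SC_LEVEL2_CACHE_SIZE);
  printf("L1d: %ld bytes, L2 (shared): %ld bytes\n", l1d, l2);
  return 0;
}
```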
diff --git a/libm/Android.bp b/libm/Android.bp
index f44a326..9fd79f8 100644
--- a/libm/Android.bp
+++ b/libm/Android.bp
@@ -183,9 +183,9 @@
         "upstream-freebsd/lib/msun/src/w_drem.c",
         "upstream-freebsd/lib/msun/src/w_dremf.c",
 
-        // The FreeBSD complex functions appear to be better, but they're incomplete.
-        // We take the FreeBSD implementations when they exist, but fill out the rest
-        // of <complex.h> from NetBSD...
+        // The FreeBSD complex function implementations appear to be better
+        // than the other BSDs', but they're incomplete. We take the FreeBSD
+        // implementations when they exist, but fill out the rest from NetBSD...
         "upstream-netbsd/lib/libm/complex/ccoshl.c",
         "upstream-netbsd/lib/libm/complex/ccosl.c",
         "upstream-netbsd/lib/libm/complex/cephes_subrl.c",
@@ -386,7 +386,6 @@
     ],
 
     cflags: [
-        "-D_BSD_SOURCE",
         "-include freebsd-compat.h",
         "-fno-builtin",
         "-fno-math-errno",
diff --git a/libm/freebsd-compat.h b/libm/freebsd-compat.h
index f1985d2..9555ff9 100644
--- a/libm/freebsd-compat.h
+++ b/libm/freebsd-compat.h
@@ -16,6 +16,10 @@
 
 #pragma once
 
+// Since we're implementing all the extensions,
+// we need to make sure we get all their declarations when we include <math.h>.
+#define _BSD_SOURCE
+
 // Some FreeBSD source includes <complex.h> and assumes <math.h> from that.
 #include <math.h>
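
Moving the macro out of the Android.bp cflags and into freebsd-compat.h preserves the effect, because the header is still force-included ahead of everything else via `-include freebsd-compat.h`, so _BSD_SOURCE is defined before the first <math.h>. The ordering requirement is sketched below; which declarations the macro actually unlocks depends on the libc, so the gated function used here is an assumption:

```c
// Illustrative only, not bionic's build. Feature-test macros must be defined
// before the first include of the header they affect; defining _BSD_SOURCE
// after <math.h> has already been seen would be too late for this
// translation unit.
#define _BSD_SOURCE
#include <math.h>

// significand() is one of the BSD extensions libm provides; assuming its
// declaration is gated behind _BSD_SOURCE, it is visible here only because
// the define precedes the include.
double mantissa_of(double x) {
  return significand(x);
}
```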