Obtain x86 cache info from the CPU

The cache info is currently hardcoded in cache.h and may not be
optimal across different uarchs/SKUs. Use bionic's sysconf instead to
query the underlying CPU's cache sizes at runtime.
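
A rough sketch of the idea (the variable names below merely stand in
for the __x86_* symbols referenced in the assembly, the exact bionic
initialization code differs, and the _SC_LEVEL*_CACHE_SIZE names are
the standard sysconf(3) parameters assumed here):

  #include <unistd.h>

  /* Placeholders for __x86_shared_cache_size / __x86_data_cache_size. */
  static long shared_cache_size;
  static long data_cache_size;

  static void init_cache_sizes(void) {
    /* Treat L3 as the "shared" cache if present, else fall back to L2. */
    long l3 = sysconf(_SC_LEVEL3_CACHE_SIZE);
    long l2 = sysconf(_SC_LEVEL2_CACHE_SIZE);
    shared_cache_size = (l3 > 0) ? l3 : l2;
    data_cache_size = sysconf(_SC_LEVEL1_DCACHE_SIZE);
  }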

Improvements seen on RPL (Raptor Lake) for various copy sizes:
memmove_non_overlapping
1.25M - 31%
1.5M - 30%
1.75M - 28%

memcpy
1.25M - 31%
1.5M - 31%
1.75M - 30%

The bionic benchmarks (which only go up to 128KiB) show no change, as
you'd expect.

Test: bionic/tests/run-on-host.sh 64 && bionic/tests/run-on-host.sh 32
Bug: 202102347
Change-Id: I4bbad51794758873744149d0f58b86bb92ee307f
Signed-off-by: Vinay Prasad Kompella <vinay.kompella@intel.com>
Signed-off-by: Soni, Ravi Kumar <ravi.kumar.soni@intel.com>
diff --git a/libc/arch-x86_64/string/avx2-memset-kbl.S b/libc/arch-x86_64/string/avx2-memset-kbl.S
index ca62a9f..35d682a 100644
--- a/libc/arch-x86_64/string/avx2-memset-kbl.S
+++ b/libc/arch-x86_64/string/avx2-memset-kbl.S
@@ -30,7 +30,6 @@
 
 #include <private/bionic_asm.h>
 
-#include "cache.h"
 
 #ifndef L
 # define L(label)	.L##label
@@ -117,11 +116,8 @@
 	cmpq	%rcx, %rdx
 	je	L(done)
 
-#ifdef SHARED_CACHE_SIZE
-	cmp	$SHARED_CACHE_SIZE, %r8
-#else
-	cmp	__x86_64_shared_cache_size(%rip), %r8
-#endif
+	cmp	__x86_shared_cache_size(%rip), %r8
+
 	ja	L(non_temporal_loop)
 
 	ALIGN (4)
diff --git a/libc/arch-x86_64/string/cache.h b/libc/arch-x86_64/string/cache.h
deleted file mode 100644
index 4131509..0000000
--- a/libc/arch-x86_64/string/cache.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
-Copyright (c) 2014, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-/* Values are optimized for Core Architecture */
-#define SHARED_CACHE_SIZE (4096*1024)  /* Core Architecture L2 Cache */
-#define DATA_CACHE_SIZE   (24*1024)    /* Core Architecture L1 Data Cache */
-
-#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF   (DATA_CACHE_SIZE / 2)
diff --git a/libc/arch-x86_64/string/sse2-memmove-slm.S b/libc/arch-x86_64/string/sse2-memmove-slm.S
index 7395028..8b32680 100644
--- a/libc/arch-x86_64/string/sse2-memmove-slm.S
+++ b/libc/arch-x86_64/string/sse2-memmove-slm.S
@@ -28,7 +28,6 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#include "cache.h"
 
 #ifndef MEMMOVE
 # define MEMMOVE		memmove
@@ -189,8 +188,9 @@
 	cmp	%r8, %rbx
 	jbe	L(mm_copy_remaining_forward)
 
-	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
-	jae	L(mm_large_page_loop_forward)
+	cmp	__x86_shared_cache_size_half(%rip), %rdx
+
+	ja      L(mm_overlapping_check_forward)
 
 	.p2align 4
 L(mm_main_loop_forward):
@@ -414,8 +414,9 @@
 	cmp	%r9, %rbx
 	jae	L(mm_recalc_len)
 
-	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
-	jae	L(mm_large_page_loop_backward)
+	cmp	__x86_shared_cache_size_half(%rip), %rdx
+
+	ja	L(mm_overlapping_check_backward)
 
 	.p2align 4
 L(mm_main_loop_backward):
@@ -481,6 +482,13 @@
 /* Big length copy forward part.  */
 
 	.p2align 4
+
+L(mm_overlapping_check_forward):
+	mov	%rsi, %r9
+	add	%rdx, %r9
+	cmp	__x86_shared_cache_size(%rip), %r9
+	jbe	L(mm_main_loop_forward)
+
 L(mm_large_page_loop_forward):
 	movdqu	(%r8, %rsi), %xmm0
 	movdqu	16(%r8, %rsi), %xmm1
@@ -498,6 +506,14 @@
 
 /* Big length copy backward part.  */
 	.p2align 4
+
+L(mm_overlapping_check_backward):
+	mov	%rdi, %r11
+	sub	%rsi, %r11 /* r11 = dst - src, diff */
+	add	%rdx, %r11
+	cmp	__x86_shared_cache_size(%rip), %r11
+	jbe	L(mm_main_loop_backward)
+
 L(mm_large_page_loop_backward):
 	movdqu	-64(%r9, %r8), %xmm0
 	movdqu	-48(%r9, %r8), %xmm1
diff --git a/libc/arch-x86_64/string/sse2-memset-slm.S b/libc/arch-x86_64/string/sse2-memset-slm.S
index cceadd2..84ab327 100644
--- a/libc/arch-x86_64/string/sse2-memset-slm.S
+++ b/libc/arch-x86_64/string/sse2-memset-slm.S
@@ -30,7 +30,6 @@
 
 #include <private/bionic_asm.h>
 
-#include "cache.h"
 
 #ifndef L
 # define L(label)	.L##label
@@ -116,11 +115,8 @@
 	cmpq	%rcx, %rdx
 	je	L(return)
 
-#ifdef SHARED_CACHE_SIZE
-	cmp	$SHARED_CACHE_SIZE, %r8
-#else
-	cmp	__x86_64_shared_cache_size(%rip), %r8
-#endif
+	cmp	__x86_shared_cache_size(%rip), %r8
+
 	ja	L(128bytesmore_nt)
 
 	ALIGN (4)
diff --git a/libc/arch-x86_64/string/sse4-memcmp-slm.S b/libc/arch-x86_64/string/sse4-memcmp-slm.S
index 8a8b180..46ad78d 100644
--- a/libc/arch-x86_64/string/sse4-memcmp-slm.S
+++ b/libc/arch-x86_64/string/sse4-memcmp-slm.S
@@ -28,7 +28,6 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#include "cache.h"
 
 #ifndef MEMCMP
 # define MEMCMP		memcmp
@@ -353,11 +352,7 @@
 
 	ALIGN (4)
 L(512bytesormore):
-#ifdef DATA_CACHE_SIZE_HALF
-	mov	$DATA_CACHE_SIZE_HALF, %r8
-#else
-	mov	__x86_64_data_cache_size_half(%rip), %r8
-#endif
+	mov	__x86_data_cache_size_half(%rip), %r8
 	mov	%r8, %r9
 	shr	$1, %r8
 	add	%r9, %r8
@@ -669,11 +664,7 @@
 
 	ALIGN (4)
 L(512bytesormorein2aligned):
-#ifdef DATA_CACHE_SIZE_HALF
-	mov	$DATA_CACHE_SIZE_HALF, %r8
-#else
-	mov	__x86_64_data_cache_size_half(%rip), %r8
-#endif
+	mov	__x86_data_cache_size_half(%rip), %r8
 	mov	%r8, %r9
 	shr	$1, %r8
 	add	%r9, %r8