Obtain x86 cache info from CPU

The cache info is currently hardcoded in cache.h, which may not be
optimal across various uarchs/SKUs. Instead, leverage bionic's sysconf
to obtain the underlying cache sizes at runtime.
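
In practice this means reading the glibc-style cache sysconf values
that bionic exposes. A minimal sketch of the idea, assuming the
_SC_LEVEL*_CACHE_SIZE constants (query_shared_cache_size is a
hypothetical helper, not part of this patch):

  #include <unistd.h>

  /* Prefer the last-level cache; fall back to L2 when L3 is absent. */
  static long query_shared_cache_size(void) {
      long size = sysconf(_SC_LEVEL3_CACHE_SIZE);
      if (size <= 0)
          size = sysconf(_SC_LEVEL2_CACHE_SIZE);
      return size;
  }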

Improvements seen on RPL (Raptor Lake) at various copy sizes:
memmove_non_overlapping
1.25M - 31%
1.5M - 30%
1.75M - 28%

memcpy
1.25M - 31%
1.5M - 31%
1.75M - 30%

The bionic benchmarks (which only go up to 128KiB) show no change, as
you'd expect.
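
The diff below replaces the compile-time SHARED_CACHE_SIZE_HALF
threshold with the runtime __x86_shared_cache_size_half value, and adds
an overlap check: the non-temporal large-page loop is only taken when
the distance between src and dst plus the copy length exceeds the
shared cache size. Roughly, in C terms (a hedged sketch; the extern
names mirror the assembly symbols, use_large_page_loop is a
hypothetical helper):

  #include <stddef.h>

  extern size_t __x86_shared_cache_size;
  extern size_t __x86_shared_cache_size_half;

  /* diff = |dst - src|; len = number of bytes to copy. */
  static int use_large_page_loop(size_t diff, size_t len) {
      if (len <= __x86_shared_cache_size_half)
          return 0;   /* small copy: cached main loop */
      if (diff + len <= __x86_shared_cache_size)
          return 0;   /* working set still fits in the shared cache */
      return 1;       /* large, far apart: non-temporal copy */
  }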

Test: bionic/tests/run-on-host.sh 64 && bionic/tests/run-on-host.sh 32
Bug: 202102347
Change-Id: I4bbad51794758873744149d0f58b86bb92ee307f
Signed-off-by: Vinay Prasad Kompella <vinay.kompella@intel.com>
Signed-off-by: Soni, Ravi Kumar <ravi.kumar.soni@intel.com>
diff --git a/libc/arch-x86_64/string/sse2-memmove-slm.S b/libc/arch-x86_64/string/sse2-memmove-slm.S
index 7395028..8b32680 100644
--- a/libc/arch-x86_64/string/sse2-memmove-slm.S
+++ b/libc/arch-x86_64/string/sse2-memmove-slm.S
@@ -28,7 +28,6 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#include "cache.h"
 
 #ifndef MEMMOVE
 # define MEMMOVE		memmove
@@ -189,8 +188,9 @@
 	cmp	%r8, %rbx
 	jbe	L(mm_copy_remaining_forward)
 
-	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
-	jae	L(mm_large_page_loop_forward)
+	cmp	__x86_shared_cache_size_half(%rip), %rdx
+
+	ja	L(mm_overlapping_check_forward)
 
 	.p2align 4
 L(mm_main_loop_forward):
@@ -414,8 +414,9 @@
 	cmp	%r9, %rbx
 	jae	L(mm_recalc_len)
 
-	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
-	jae	L(mm_large_page_loop_backward)
+	cmp	__x86_shared_cache_size_half(%rip), %rdx
+
+	ja	L(mm_overlapping_check_backward)
 
 	.p2align 4
 L(mm_main_loop_backward):
@@ -481,6 +482,13 @@
 /* Big length copy forward part.  */
 
 	.p2align 4
+
+L(mm_overlapping_check_forward):
+	mov	%rsi, %r9	/* rsi = src - dst (diff) at this point */
+	add	%rdx, %r9	/* r9 = diff + len */
+	cmp	__x86_shared_cache_size(%rip), %r9
+	jbe	L(mm_main_loop_forward)	/* working set fits in cache */
+
 L(mm_large_page_loop_forward):
 	movdqu	(%r8, %rsi), %xmm0
 	movdqu	16(%r8, %rsi), %xmm1
@@ -498,6 +506,14 @@
 
 /* Big length copy backward part.  */
 	.p2align 4
+
+L(mm_overlapping_check_backward):
+	mov	%rdi, %r11
+	sub	%rsi, %r11 /* r11 = dst - src, diff */
+	add	%rdx, %r11 /* r11 = diff + len */
+	cmp	__x86_shared_cache_size(%rip), %r11
+	jbe	L(mm_main_loop_backward) /* working set fits in cache */
+
 L(mm_large_page_loop_backward):
 	movdqu	-64(%r9, %r8), %xmm0
 	movdqu	-48(%r9, %r8), %xmm1