Obtain x86 cache info from CPU
The cache info is currently hardcoded in cache.h, which may not be
optimal across different uarchs/SKUs. Leverage bionic's sysconf to
query the underlying CPU's cache sizes at runtime instead.
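For reference, a minimal sketch of the kind of runtime query this
relies on, assuming the _SC_LEVEL*_CACHE_SIZE sysconf names that
bionic exposes (the actual plumbing into the string routines lives in
the rest of this change):

    #include <stdio.h>
    #include <unistd.h>

    int main(void) {
      /* Query cache sizes at runtime instead of relying on the
         compile-time constants from cache.h. */
      long l2 = sysconf(_SC_LEVEL2_CACHE_SIZE);
      long l3 = sysconf(_SC_LEVEL3_CACHE_SIZE);

      /* 0 or -1 means the level is absent or unreported. */
      long shared = (l3 > 0) ? l3 : l2;
      printf("shared cache: %ld bytes, half: %ld\n",
             shared, shared / 2);
      return 0;
    }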
Improvements seen on RPL (Raptor Lake) for various copy sizes:
memmove_non_overlapping:
  1.25M - 31%
  1.5M  - 30%
  1.75M - 28%

memcpy:
  1.25M - 31%
  1.5M  - 31%
  1.75M - 30%
The bionic benchmarks (which only go up to 128KiB) show no change, as
you'd expect.
Test: bionic/tests/run-on-host.sh 64 && bionic/tests/run-on-host.sh 32
Bug: 202102347
Change-Id: I4bbad51794758873744149d0f58b86bb92ee307f
Signed-off-by: Vinay Prasad Kompella <vinay.kompella@intel.com>
Signed-off-by: Soni, Ravi Kumar <ravi.kumar.soni@intel.com>
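The diff below also gates the non-temporal large-page loops behind a
new overlap check. A rough C sketch of that decision, inferred from
the backward path (where the diff comments the register meaning); the
helper name is hypothetical, the globals mirror the assembly symbols,
and the stand-in values are made up:

    #include <stddef.h>

    /* Stand-ins for the globals the assembly reads via %rip; in
       bionic these are filled in from sysconf at startup (the values
       here are made up for illustration). */
    static long __x86_shared_cache_size = 8 * 1024 * 1024;
    static long __x86_shared_cache_size_half = 4 * 1024 * 1024;

    /* Sketch only: for copies larger than half the shared cache,
       stay on the regular cached loop when the overlap distance plus
       the length still fits in the shared cache (the data is likely
       cache-resident), and fall back to the non-temporal large-page
       loop otherwise. */
    static int use_large_page_loop(const char *dst, const char *src,
                                   size_t len) {
      if (len <= (size_t)__x86_shared_cache_size_half)
        return 0;                         /* main loop */
      size_t diff = (size_t)(dst - src);  /* dst - src, as in the diff */
      if (diff + len <= (size_t)__x86_shared_cache_size)
        return 0;                         /* main loop */
      return 1;                           /* large-page loop */
    }

The assembly performs the same comparisons with cmp against
__x86_shared_cache_size_half(%rip) and __x86_shared_cache_size(%rip)
before falling through into L(mm_large_page_loop_*).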
diff --git a/libc/arch-x86_64/string/sse2-memmove-slm.S b/libc/arch-x86_64/string/sse2-memmove-slm.S
index 7395028..8b32680 100644
--- a/libc/arch-x86_64/string/sse2-memmove-slm.S
+++ b/libc/arch-x86_64/string/sse2-memmove-slm.S
@@ -28,7 +28,6 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#include "cache.h"
#ifndef MEMMOVE
# define MEMMOVE memmove
@@ -189,8 +188,9 @@
cmp %r8, %rbx
jbe L(mm_copy_remaining_forward)
- cmp $SHARED_CACHE_SIZE_HALF, %rdx
- jae L(mm_large_page_loop_forward)
+ cmp __x86_shared_cache_size_half(%rip), %rdx
+
+ ja L(mm_overlapping_check_forward)
.p2align 4
L(mm_main_loop_forward):
@@ -414,8 +414,9 @@
cmp %r9, %rbx
jae L(mm_recalc_len)
- cmp $SHARED_CACHE_SIZE_HALF, %rdx
- jae L(mm_large_page_loop_backward)
+ cmp __x86_shared_cache_size_half(%rip), %rdx
+
+ ja L(mm_overlapping_check_backward)
.p2align 4
L(mm_main_loop_backward):
@@ -481,6 +482,13 @@
/* Big length copy forward part. */
.p2align 4
+
+L(mm_overlapping_check_forward):
+ mov %rsi, %r9
+ add %rdx, %r9
+ cmp __x86_shared_cache_size(%rip), %r9
+ jbe L(mm_main_loop_forward)
+
L(mm_large_page_loop_forward):
movdqu (%r8, %rsi), %xmm0
movdqu 16(%r8, %rsi), %xmm1
@@ -498,6 +506,14 @@
/* Big length copy backward part. */
.p2align 4
+
+L(mm_overlapping_check_backward):
+ mov %rdi, %r11
+ sub %rsi, %r11 /* r11 = dst - src, diff */
+ add %rdx, %r11
+ cmp __x86_shared_cache_size(%rip), %r11
+ jbe L(mm_main_loop_backward)
+
L(mm_large_page_loop_backward):
movdqu -64(%r9, %r8), %xmm0
movdqu -48(%r9, %r8), %xmm1