Merge "riscv64: pass a null argument to ifunc resolvers." into main
diff --git a/libc/arch-x86_64/string/avx2-memset-kbl.S b/libc/arch-x86_64/string/avx2-memset-kbl.S
index 09dd07d..ca62a9f 100644
--- a/libc/arch-x86_64/string/avx2-memset-kbl.S
+++ b/libc/arch-x86_64/string/avx2-memset-kbl.S
@@ -63,10 +63,9 @@
testb $2, %dl
jnz L(2_3bytes)
testb $1, %dl
- jz L(return)
+ jz 1f
movb %cl, (%rdi)
-L(return):
- ret
+1: ret
L(8_15bytes):
movq %rcx, (%rdi)
@@ -90,59 +89,54 @@
movdqu %xmm0, (%rdi)
movdqu %xmm0, -16(%rdi, %rdx)
cmpq $32, %rdx
- jbe L(32bytesless)
+ jbe L(done)
movdqu %xmm0, 16(%rdi)
movdqu %xmm0, -32(%rdi, %rdx)
cmpq $64, %rdx
- jbe L(64bytesless)
+ jbe L(done)
movdqu %xmm0, 32(%rdi)
movdqu %xmm0, 48(%rdi)
movdqu %xmm0, -64(%rdi, %rdx)
movdqu %xmm0, -48(%rdi, %rdx)
cmpq $128, %rdx
- jbe L(128bytesless)
- vpbroadcastb %xmm0, %ymm0
+ jbe L(done)
+ vpbroadcastb %xmm0, %ymm0
vmovdqu %ymm0, 64(%rdi)
vmovdqu %ymm0, 96(%rdi)
vmovdqu %ymm0, -128(%rdi, %rdx)
vmovdqu %ymm0, -96(%rdi, %rdx)
cmpq $256, %rdx
- ja L(256bytesmore)
-L(32bytesless):
-L(64bytesless):
-L(128bytesless):
- ret
+ jbe L(done)
ALIGN (4)
-L(256bytesmore):
leaq 128(%rdi), %rcx
andq $-128, %rcx
movq %rdx, %r8
addq %rdi, %rdx
andq $-128, %rdx
cmpq %rcx, %rdx
- je L(return)
+ je L(done)
#ifdef SHARED_CACHE_SIZE
cmp $SHARED_CACHE_SIZE, %r8
#else
cmp __x86_64_shared_cache_size(%rip), %r8
#endif
- ja L(256bytesmore_nt)
+ ja L(non_temporal_loop)
ALIGN (4)
-L(256bytesmore_normal):
+L(normal_loop):
vmovdqa %ymm0, (%rcx)
vmovdqa %ymm0, 32(%rcx)
vmovdqa %ymm0, 64(%rcx)
vmovdqa %ymm0, 96(%rcx)
addq $128, %rcx
cmpq %rcx, %rdx
- jne L(256bytesmore_normal)
- ret
+ jne L(normal_loop)
+ jmp L(done)
ALIGN (4)
-L(256bytesmore_nt):
+L(non_temporal_loop):
movntdq %xmm0, (%rcx)
movntdq %xmm0, 16(%rcx)
movntdq %xmm0, 32(%rcx)
@@ -153,8 +147,14 @@
movntdq %xmm0, 112(%rcx)
leaq 128(%rcx), %rcx
cmpq %rcx, %rdx
- jne L(256bytesmore_nt)
+ jne L(non_temporal_loop)
+	# We used non-temporal stores, which bypass the cache, so an sfence
+	# is required to make them globally visible before we return.
sfence
+
+L(done):
+	# We used the ymm registers; vzeroupper clears their upper halves to
+	# avoid the AVX-to-SSE transition penalty in subsequent SSE code.
+ vzeroupper
ret
END(memset_avx2)
diff --git a/tests/execinfo_test.cpp b/tests/execinfo_test.cpp
index b8e1325..1a0c51b 100644
--- a/tests/execinfo_test.cpp
+++ b/tests/execinfo_test.cpp
@@ -79,9 +79,13 @@
}
static size_t FindFunction(std::vector<void*>& frames, uintptr_t func_addr) {
+ Dl_info func_info;
+ if (!dladdr(reinterpret_cast<void*>(func_addr), &func_info)) {
+ return 0;
+ }
for (size_t i = 0; i < frames.size(); i++) {
- uintptr_t frame_addr = reinterpret_cast<uintptr_t>(frames[i]);
- if (frame_addr >= func_addr && frame_addr <= func_addr + 0x100) {
+ Dl_info frame_info;
+ if (dladdr(frames[i], &frame_info) && func_info.dli_saddr == frame_info.dli_saddr) {
return i + 1;
}
}