avx2 memset: add missing `vzeroupper`.

Without a trailing `vzeroupper`, the ymm stores here leave the upper
vector state dirty, so later SSE code pays an AVX-SSE transition
penalty. Also clean up a few labels. This actually improves performance
slightly, and removes the weird behavior I was seeing around 512 bytes
in the "before" numbers.
Before:
```
BM_string_memset/8/0          2.12 ns      2.12 ns   332002763 bytes_per_second=3.52172G/s
BM_string_memset/16/0         2.36 ns      2.36 ns   297459840 bytes_per_second=6.31618G/s
BM_string_memset/32/0         2.36 ns      2.36 ns   296996995 bytes_per_second=12.6321G/s
BM_string_memset/64/0         2.37 ns      2.36 ns   296196644 bytes_per_second=25.2097G/s
BM_string_memset/512/0        65.9 ns      65.8 ns    10609200 bytes_per_second=7.24172G/s
BM_string_memset/1024/0       69.5 ns      69.5 ns    10079176 bytes_per_second=13.7312G/s
BM_string_memset/8192/0        123 ns       123 ns     5726682 bytes_per_second=62.2494G/s
BM_string_memset/16384/0       183 ns       183 ns     3832127 bytes_per_second=83.5219G/s
BM_string_memset/32768/0       306 ns       306 ns     2292654 bytes_per_second=99.8293G/s
BM_string_memset/65536/0       570 ns       569 ns     1224926 bytes_per_second=107.185G/s
BM_string_memset/131072/0     1067 ns      1067 ns      654098 bytes_per_second=114.395G/s
```
After:
```
BM_string_memset/8/0          2.34 ns      2.34 ns   299919615 bytes_per_second=3.18993G/s
BM_string_memset/16/0         2.58 ns      2.58 ns   271170449 bytes_per_second=5.76711G/s
BM_string_memset/32/0         2.61 ns      2.61 ns   266003840 bytes_per_second=11.4245G/s
BM_string_memset/64/0         2.62 ns      2.62 ns   269191710 bytes_per_second=22.784G/s
BM_string_memset/128/0        2.84 ns      2.84 ns   244486639 bytes_per_second=41.994G/s
BM_string_memset/256/0        4.23 ns      4.23 ns   165575532 bytes_per_second=56.4047G/s
BM_string_memset/512/0        7.12 ns      7.12 ns    99398933 bytes_per_second=67.0164G/s
BM_string_memset/1024/0       10.9 ns      10.9 ns    64108888 bytes_per_second=87.2884G/s
BM_string_memset/8192/0       63.6 ns      63.6 ns    11012138 bytes_per_second=119.989G/s
BM_string_memset/16384/0       127 ns       127 ns     5506888 bytes_per_second=120.065G/s
BM_string_memset/32768/0       252 ns       251 ns     2783524 bytes_per_second=121.346G/s
BM_string_memset/65536/0       515 ns       515 ns     1357500 bytes_per_second=118.587G/s
BM_string_memset/131072/0     1013 ns      1012 ns      691605 bytes_per_second=120.587G/s
```
Bug: http://b/292281479
Test: treehugger
Change-Id: I45bfffedbdf0ec55a1b1341ffbab0af6d240d3a3
diff --git a/libc/arch-x86_64/string/avx2-memset-kbl.S b/libc/arch-x86_64/string/avx2-memset-kbl.S
index 09dd07d..ca62a9f 100644
--- a/libc/arch-x86_64/string/avx2-memset-kbl.S
+++ b/libc/arch-x86_64/string/avx2-memset-kbl.S
@@ -63,10 +63,9 @@
testb $2, %dl
jnz L(2_3bytes)
testb $1, %dl
- jz L(return)
+ jz 1f
movb %cl, (%rdi)
-L(return):
- ret
+1: ret

L(8_15bytes):
movq %rcx, (%rdi)
@@ -90,59 +89,54 @@
movdqu %xmm0, (%rdi)
movdqu %xmm0, -16(%rdi, %rdx)
cmpq $32, %rdx
- jbe L(32bytesless)
+ jbe L(done)
movdqu %xmm0, 16(%rdi)
movdqu %xmm0, -32(%rdi, %rdx)
cmpq $64, %rdx
- jbe L(64bytesless)
+ jbe L(done)
movdqu %xmm0, 32(%rdi)
movdqu %xmm0, 48(%rdi)
movdqu %xmm0, -64(%rdi, %rdx)
movdqu %xmm0, -48(%rdi, %rdx)
cmpq $128, %rdx
- jbe L(128bytesless)
- vpbroadcastb %xmm0, %ymm0
+ jbe L(done)
+ vpbroadcastb %xmm0, %ymm0
vmovdqu %ymm0, 64(%rdi)
vmovdqu %ymm0, 96(%rdi)
vmovdqu %ymm0, -128(%rdi, %rdx)
vmovdqu %ymm0, -96(%rdi, %rdx)
cmpq $256, %rdx
- ja L(256bytesmore)
-L(32bytesless):
-L(64bytesless):
-L(128bytesless):
- ret
+ jbe L(done)
ALIGN (4)
-L(256bytesmore):
leaq 128(%rdi), %rcx
andq $-128, %rcx
movq %rdx, %r8
addq %rdi, %rdx
andq $-128, %rdx
cmpq %rcx, %rdx
- je L(return)
+ je L(done)
#ifdef SHARED_CACHE_SIZE
cmp $SHARED_CACHE_SIZE, %r8
#else
cmp __x86_64_shared_cache_size(%rip), %r8
#endif
- ja L(256bytesmore_nt)
+ ja L(non_temporal_loop)
ALIGN (4)
-L(256bytesmore_normal):
+L(normal_loop):
vmovdqa %ymm0, (%rcx)
vmovdqa %ymm0, 32(%rcx)
vmovdqa %ymm0, 64(%rcx)
vmovdqa %ymm0, 96(%rcx)
addq $128, %rcx
cmpq %rcx, %rdx
- jne L(256bytesmore_normal)
- ret
+ jne L(normal_loop)
+ jmp L(done)
ALIGN (4)
-L(256bytesmore_nt):
+L(non_temporal_loop):
movntdq %xmm0, (%rcx)
movntdq %xmm0, 16(%rcx)
movntdq %xmm0, 32(%rcx)
@@ -153,8 +147,14 @@
movntdq %xmm0, 112(%rcx)
leaq 128(%rcx), %rcx
cmpq %rcx, %rdx
- jne L(256bytesmore_nt)
+ jne L(non_temporal_loop)
+ # We used non-temporal stores, so we need a fence here.
sfence
+
+L(done):
+ # We used the ymm registers; without vzeroupper, subsequent SSE2
+ # code would pay an AVX-SSE transition penalty.
+ vzeroupper
ret

END(memset_avx2)