avx2 memset: add missing `vzeroupper`.

Also improve a few labels. This actually improves performance slightly,
and removes the weird behavior I was seeing around 512 bytes in the
"before" numbers: without `vzeroupper`, we returned with the upper
halves of the ymm registers dirty, so later SSE code (including the
benchmark loop itself) paid the AVX-to-SSE transition penalty.

Before:
```
BM_string_memset/8/0            2.12 ns         2.12 ns    332002763 bytes_per_second=3.52172G/s
BM_string_memset/16/0           2.36 ns         2.36 ns    297459840 bytes_per_second=6.31618G/s
BM_string_memset/32/0           2.36 ns         2.36 ns    296996995 bytes_per_second=12.6321G/s
BM_string_memset/64/0           2.37 ns         2.36 ns    296196644 bytes_per_second=25.2097G/s
BM_string_memset/512/0          65.9 ns         65.8 ns     10609200 bytes_per_second=7.24172G/s
BM_string_memset/1024/0         69.5 ns         69.5 ns     10079176 bytes_per_second=13.7312G/s
BM_string_memset/8192/0          123 ns          123 ns      5726682 bytes_per_second=62.2494G/s
BM_string_memset/16384/0         183 ns          183 ns      3832127 bytes_per_second=83.5219G/s
BM_string_memset/32768/0         306 ns          306 ns      2292654 bytes_per_second=99.8293G/s
BM_string_memset/65536/0         570 ns          569 ns      1224926 bytes_per_second=107.185G/s
BM_string_memset/131072/0       1067 ns         1067 ns       654098 bytes_per_second=114.395G/s
```

After:
```
BM_string_memset/8/0            2.34 ns         2.34 ns    299919615 bytes_per_second=3.18993G/s
BM_string_memset/16/0           2.58 ns         2.58 ns    271170449 bytes_per_second=5.76711G/s
BM_string_memset/32/0           2.61 ns         2.61 ns    266003840 bytes_per_second=11.4245G/s
BM_string_memset/64/0           2.62 ns         2.62 ns    269191710 bytes_per_second=22.784G/s
BM_string_memset/128/0          2.84 ns         2.84 ns    244486639 bytes_per_second=41.994G/s
BM_string_memset/256/0          4.23 ns         4.23 ns    165575532 bytes_per_second=56.4047G/s
BM_string_memset/512/0          7.12 ns         7.12 ns     99398933 bytes_per_second=67.0164G/s
BM_string_memset/1024/0         10.9 ns         10.9 ns     64108888 bytes_per_second=87.2884G/s
BM_string_memset/8192/0         63.6 ns         63.6 ns     11012138 bytes_per_second=119.989G/s
BM_string_memset/16384/0         127 ns          127 ns      5506888 bytes_per_second=120.065G/s
BM_string_memset/32768/0         252 ns          251 ns      2783524 bytes_per_second=121.346G/s
BM_string_memset/65536/0         515 ns          515 ns      1357500 bytes_per_second=118.587G/s
BM_string_memset/131072/0       1013 ns         1012 ns       691605 bytes_per_second=120.587G/s
```

Bug: http://b/292281479
Test: treehugger
Change-Id: I45bfffedbdf0ec55a1b1341ffbab0af6d240d3a3
diff --git a/libc/arch-x86_64/string/avx2-memset-kbl.S b/libc/arch-x86_64/string/avx2-memset-kbl.S
index 09dd07d..ca62a9f 100644
--- a/libc/arch-x86_64/string/avx2-memset-kbl.S
+++ b/libc/arch-x86_64/string/avx2-memset-kbl.S
@@ -63,10 +63,9 @@
 	testb	$2, %dl
 	jnz	L(2_3bytes)
 	testb	$1, %dl
-	jz	L(return)
+	jz	1f
 	movb	%cl, (%rdi)
-L(return):
-	ret
+1:	ret
 
 L(8_15bytes):
 	movq	%rcx, (%rdi)
@@ -90,59 +89,54 @@
 	movdqu	%xmm0, (%rdi)
 	movdqu	%xmm0, -16(%rdi, %rdx)
 	cmpq	$32, %rdx
-	jbe	L(32bytesless)
+	jbe	L(done)
 	movdqu	%xmm0, 16(%rdi)
 	movdqu	%xmm0, -32(%rdi, %rdx)
 	cmpq	$64, %rdx
-	jbe	L(64bytesless)
+	jbe	L(done)
 	movdqu	%xmm0, 32(%rdi)
 	movdqu	%xmm0, 48(%rdi)
 	movdqu	%xmm0, -64(%rdi, %rdx)
 	movdqu	%xmm0, -48(%rdi, %rdx)
 	cmpq	$128, %rdx
-	jbe	L(128bytesless)
-        vpbroadcastb %xmm0, %ymm0
+	jbe	L(done)
+	vpbroadcastb %xmm0, %ymm0
 	vmovdqu	%ymm0, 64(%rdi)
 	vmovdqu	%ymm0, 96(%rdi)
 	vmovdqu	%ymm0, -128(%rdi, %rdx)
 	vmovdqu	%ymm0, -96(%rdi, %rdx)
 	cmpq	$256, %rdx
-        ja      L(256bytesmore)
-L(32bytesless):
-L(64bytesless):
-L(128bytesless):
-	ret
+	jbe	L(done)
 
 	ALIGN (4)
-L(256bytesmore):
 	leaq	128(%rdi), %rcx
 	andq	$-128, %rcx
 	movq	%rdx, %r8
 	addq	%rdi, %rdx
 	andq	$-128, %rdx
 	cmpq	%rcx, %rdx
-	je	L(return)
+	je	L(done)
 
 #ifdef SHARED_CACHE_SIZE
 	cmp	$SHARED_CACHE_SIZE, %r8
 #else
 	cmp	__x86_64_shared_cache_size(%rip), %r8
 #endif
-	ja	L(256bytesmore_nt)
+	ja	L(non_temporal_loop)
 
 	ALIGN (4)
-L(256bytesmore_normal):
+L(normal_loop):
 	vmovdqa	%ymm0, (%rcx)
 	vmovdqa	%ymm0, 32(%rcx)
 	vmovdqa	%ymm0, 64(%rcx)
 	vmovdqa	%ymm0, 96(%rcx)
 	addq	$128, %rcx
 	cmpq	%rcx, %rdx
-	jne	L(256bytesmore_normal)
-	ret
+	jne	L(normal_loop)
+	jmp	L(done)
 
 	ALIGN (4)
-L(256bytesmore_nt):
+L(non_temporal_loop):
 	movntdq	 %xmm0, (%rcx)
 	movntdq	 %xmm0, 16(%rcx)
 	movntdq	 %xmm0, 32(%rcx)
@@ -153,8 +147,14 @@
 	movntdq	 %xmm0, 112(%rcx)
 	leaq	128(%rcx), %rcx
 	cmpq	%rcx, %rdx
-	jne	L(256bytesmore_nt)
+	jne	L(non_temporal_loop)
+	# We used non-temporal stores, so we need a fence here.
 	sfence
+
+L(done):
+	# We used the ymm registers, and that can break SSE2 performance
+	# unless you do this.
+	vzeroupper
 	ret
 
 END(memset_avx2)
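
The new comment above `sfence` records the matching rule for the
non-temporal path. As a minimal standalone sketch (illustrative only,
not the actual memset_avx2 code), `movntdq` stores are weakly ordered,
so they must be fenced before the function returns:

```
	# Sketch: non-temporal stores bypass the cache and are weakly
	# ordered, so an sfence is needed before callers can rely on them.
	movntdq	%xmm0, (%rcx)		# non-temporal 16-byte store
	movntdq	%xmm0, 16(%rcx)
	sfence				# make the non-temporal stores
					# globally visible, in order
	ret
```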