Add avx2 version of wmemset in bionic

Test: ./tests/run-on-host.sh 64
Change-Id: Id2f696cc60a10c01846ca3fe0d3a5d513020afe3
Signed-off-by: Shalini Salomi Bodapati <shalini.salomi.bodapati@intel.com>
diff --git a/libc/arch-x86/dynamic_function_dispatch.cpp b/libc/arch-x86/dynamic_function_dispatch.cpp
index 70f4b3e..370b372 100644
--- a/libc/arch-x86/dynamic_function_dispatch.cpp
+++ b/libc/arch-x86/dynamic_function_dispatch.cpp
@@ -107,6 +107,13 @@
     RETURN_FUNC(wmemcmp_func, wmemcmp_freebsd);
 }
 
+typedef wchar_t* wmemset_func(wchar_t* __dst, wchar_t __ch, size_t __n);  // Mirrors wmemset(3).
+DEFINE_IFUNC_FOR(wmemset) {  // Picks the wmemset implementation for the running CPU.
+    __builtin_cpu_init();  // Must precede __builtin_cpu_supports().
+    if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wmemset_func, wmemset_avx2);
+    RETURN_FUNC(wmemset_func, wmemset_freebsd);  // Fallback when AVX2 is unavailable.
+}
+
 typedef int strcmp_func(const char* __lhs, const char* __rhs);
 DEFINE_IFUNC_FOR(strcmp) {
     __builtin_cpu_init();
diff --git a/libc/arch-x86/generic/string/wmemset.c b/libc/arch-x86/generic/string/wmemset.c
new file mode 100644
index 0000000..6247bfd
--- /dev/null
+++ b/libc/arch-x86/generic/string/wmemset.c
@@ -0,0 +1,17 @@
+// Copyright (C) 2019 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#define wmemset wmemset_freebsd
+// With the macro above, the upstream definition below is compiled under the name wmemset_freebsd.
+#include <upstream-freebsd/lib/libc/string/wmemset.c>
diff --git a/libc/arch-x86/kabylake/string/avx2-wmemset-kbl.S b/libc/arch-x86/kabylake/string/avx2-wmemset-kbl.S
new file mode 100644
index 0000000..69b66c7
--- /dev/null
+++ b/libc/arch-x86/kabylake/string/avx2-wmemset-kbl.S
@@ -0,0 +1,148 @@
+/*
+Copyright (C) 2019 The Android Open Source Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in
+   the documentation and/or other materials provided with the
+   distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
+*/
+
+#include <private/bionic_asm.h>
+
+#ifndef WMEMSET
+ #define WMEMSET wmemset_avx2
+#endif
+
+ENTRY(WMEMSET)
+# BB#0: fill n 32-bit elements at dst with the value c and return dst - arguments are read from the stack
+	pushl	%ebp                    # NOTE(review): the pushes carry no .cfi annotations here - confirm unwind info is acceptable
+	pushl	%ebx
+	pushl	%edi
+	pushl	%esi
+	pushl	%eax                    # reserves the 4-byte spill slot at (%esp) - the pushed value is never read
+	movl	32(%esp), %ecx          # ecx = n, third argument (offsets account for the five pushes above)
+	movl	24(%esp), %eax          # eax = dst, first argument and also the return value
+	testl	%ecx, %ecx
+	je	.LBB0_12                # n == 0: nothing to store
+# BB#1: load the fill value and check for a small count
+	movl	28(%esp), %edx          # edx = c, second argument (32-bit fill value)
+	xorl	%edi, %edi              # edi = 0 elements written so far, consumed at .LBB0_10
+	movl	%eax, %esi              # esi = write cursor for the scalar loop
+	cmpl	$32, %ecx
+	jb	.LBB0_10                # fewer than 32 elements: scalar loop only
+# BB#2: vector setup
+	movl	%ecx, %eax
+	andl	$-32, %eax              # eax = n rounded down to a multiple of 32 elements
+	vmovd	%edx, %xmm0
+	vpbroadcastd	%xmm0, %ymm0    # ymm0 = c replicated into all eight 32-bit lanes
+	movl	%eax, (%esp)            # 4-byte Spill (element count covered by vector stores)
+	leal	-32(%eax), %esi
+	movl	%esi, %eax
+	shrl	$5, %eax                # eax = number of 32-element groups minus one
+	leal	1(%eax), %edi
+	andl	$7, %edi                # edi = group count modulo 8, handled by .LBB0_7
+	xorl	%ebx, %ebx              # ebx = element index reached by the unrolled loop
+	cmpl	$224, %esi
+	jb	.LBB0_5                 # fewer than 8 groups: skip the 1024-byte unrolled loop
+# BB#3: prepare the unrolled loop
+	movl	24(%esp), %esi
+	leal	992(%esi), %ebp         # ebp = dst + 992 so the 32 stores below use offsets -992..0
+	leal	-1(%edi), %esi
+	subl	%eax, %esi              # esi = -(group count rounded down to a multiple of 8)
+	xorl	%ebx, %ebx
+	.p2align	4, 0x90
+.LBB0_4:                                # =>This Inner Loop Header: Depth=1 - 32 stores of 32 bytes, 256 elements per pass
+	vmovdqu	%ymm0, -992(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -960(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -928(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -896(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -864(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -832(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -800(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -768(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -736(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -704(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -672(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -640(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -608(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -576(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -544(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -512(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -480(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -448(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -416(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -384(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -352(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -320(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -288(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -256(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -224(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -192(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -160(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -128(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -96(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -64(%ebp,%ebx,4)
+	vmovdqu	%ymm0, -32(%ebp,%ebx,4)
+	vmovdqu	%ymm0, (%ebp,%ebx,4)
+	addl	$256, %ebx              # imm = 0x100
+	addl	$8, %esi                # eight groups done - esi reaches 0 when the unrolled part is finished
+	jne	.LBB0_4
+.LBB0_5:
+	testl	%edi, %edi
+	movl	24(%esp), %eax          # reload dst - eax stays dst until return (movl leaves the flags intact)
+	je	.LBB0_8                 # no leftover 32-element groups
+# BB#6: leftover groups (1 to 7 of them)
+	leal	(%eax,%ebx,4), %esi
+	addl	$96, %esi               # esi = dst + 4*ebx + 96 so the 4 stores below use offsets -96..0
+	negl	%edi                    # count up from -edi to 0
+	.p2align	4, 0x90
+.LBB0_7:                                # =>This Inner Loop Header: Depth=1 - 4 stores of 32 bytes, 32 elements per pass
+	vmovdqu	%ymm0, -96(%esi)
+	vmovdqu	%ymm0, -64(%esi)
+	vmovdqu	%ymm0, -32(%esi)
+	vmovdqu	%ymm0, (%esi)
+	subl	$-128, %esi             # advance the cursor by 128 bytes
+	addl	$1, %edi
+	jne	.LBB0_7
+.LBB0_8:
+	movl	(%esp), %edi            # 4-byte Reload (element count covered by vector stores)
+	cmpl	%ecx, %edi
+	je	.LBB0_12                # n was a multiple of 32: no scalar tail
+# BB#9:
+	leal	(%eax,%edi,4), %esi     # esi = first element not covered by the vector stores
+.LBB0_10:
+	subl	%edi, %ecx              # ecx = elements still to write
+	.p2align	4, 0x90
+.LBB0_11:                               # =>This Inner Loop Header: Depth=1 - one element per pass
+	movl	%edx, (%esi)
+	addl	$4, %esi
+	addl	$-1, %ecx
+	jne	.LBB0_11
+.LBB0_12:
+	addl	$4, %esp                # drop the spill slot
+	popl	%esi
+	popl	%edi
+	popl	%ebx
+	popl	%ebp
+	vzeroupper                      # zero the upper halves of the ymm registers (runs on every return path)
+	retl
+END(WMEMSET)
diff --git a/libc/arch-x86/static_function_dispatch.S b/libc/arch-x86/static_function_dispatch.S
index 7e8e63d..1560c04 100644
--- a/libc/arch-x86/static_function_dispatch.S
+++ b/libc/arch-x86/static_function_dispatch.S
@@ -45,6 +45,7 @@
 FUNCTION_DELEGATE(strncmp, strncmp_generic)
 FUNCTION_DELEGATE(strcat, strcat_generic)
 FUNCTION_DELEGATE(wmemcmp, wmemcmp_freebsd)
+FUNCTION_DELEGATE(wmemset, wmemset_freebsd)
 FUNCTION_DELEGATE(wcscat, wcscat_freebsd)
 FUNCTION_DELEGATE(strncat, strncat_openbsd)
 FUNCTION_DELEGATE(strlcat, strlcat_openbsd)